// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "allocator.h"

#include "gpu.h"
#include "pipeline.h"

#if __ANDROID_API__ >= 26
#include <android/hardware_buffer.h>
#endif // __ANDROID_API__ >= 26

namespace ncnn {
25
// Out-of-line virtual destructor anchor for the abstract Allocator interface.
Allocator::~Allocator()
{
}
29
// Private state of PoolAllocator (pimpl).
// Free chunks live in budgets; chunks currently handed out live in payouts.
class PoolAllocatorPrivate
{
public:
    Mutex budgets_lock; // guards budgets
    Mutex payouts_lock; // guards payouts
    unsigned int size_compare_ratio; // 0~256, reuse threshold scaled by 256
    std::list<std::pair<size_t, void*> > budgets; // (chunk size, chunk ptr) free for reuse
    std::list<std::pair<size_t, void*> > payouts; // (chunk size, chunk ptr) in use by callers
};
39
PoolAllocator::PoolAllocator()
    : Allocator(), d(new PoolAllocatorPrivate)
{
    // default: reuse a cached chunk when requested size is at least 75% of it
    d->size_compare_ratio = 192; // 0.75f * 256
}
45
// Frees all cached chunks. Chunks still paid out indicate a lifetime bug in
// the caller; they are reported but intentionally not freed here.
PoolAllocator::~PoolAllocator()
{
    clear();

    if (!d->payouts.empty())
    {
        NCNN_LOGE("FATAL ERROR! pool allocator destroyed too early");
#if NCNN_STDIO
        std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
        for (; it != d->payouts.end(); ++it)
        {
            void* ptr = it->second;
            NCNN_LOGE("%p still in use", ptr);
        }
#endif
    }

    delete d;
}
65
// Copying is not supported; the private copy constructor only exists to
// prevent accidental copies and leaves d null on purpose.
PoolAllocator::PoolAllocator(const PoolAllocator&)
    : d(0)
{
}
70
// Assignment is not supported; this private no-op exists only to block it.
PoolAllocator& PoolAllocator::operator=(const PoolAllocator&)
{
    return *this;
}
75
clear()76 void PoolAllocator::clear()
77 {
78 d->budgets_lock.lock();
79
80 std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
81 for (; it != d->budgets.end(); ++it)
82 {
83 void* ptr = it->second;
84 ncnn::fastFree(ptr);
85 }
86 d->budgets.clear();
87
88 d->budgets_lock.unlock();
89 }
90
set_size_compare_ratio(float scr)91 void PoolAllocator::set_size_compare_ratio(float scr)
92 {
93 if (scr < 0.f || scr > 1.f)
94 {
95 NCNN_LOGE("invalid size compare ratio %f", scr);
96 return;
97 }
98
99 d->size_compare_ratio = (unsigned int)(scr * 256);
100 }
101
// Hand out a chunk of at least `size` bytes, preferring a cached chunk whose
// size is close enough (size <= chunk <= size / ratio); otherwise allocate new.
void* PoolAllocator::fastMalloc(size_t size)
{
    d->budgets_lock.lock();

    // find free budget
    std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
    for (; it != d->budgets.end(); ++it)
    {
        size_t bs = it->first;

        // size_compare_ratio ~ 100%
        // reuse only if the chunk is big enough but not wastefully larger:
        // bs * ratio / 256 <= size <= bs
        if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
        {
            void* ptr = it->second;

            d->budgets.erase(it);

            // drop budgets_lock before taking payouts_lock so the two
            // locks are never held simultaneously
            d->budgets_lock.unlock();

            d->payouts_lock.lock();

            d->payouts.push_back(std::make_pair(bs, ptr));

            d->payouts_lock.unlock();

            return ptr;
        }
    }

    d->budgets_lock.unlock();

    // new
    // no reusable chunk - allocate fresh and record it as paid out
    void* ptr = ncnn::fastMalloc(size);

    d->payouts_lock.lock();

    d->payouts.push_back(std::make_pair(size, ptr));

    d->payouts_lock.unlock();

    return ptr;
}
144
// Return a previously paid-out chunk to the free list for reuse.
// A pointer not found in payouts is a caller bug; it is logged and freed
// directly so the memory is not leaked.
void PoolAllocator::fastFree(void* ptr)
{
    d->payouts_lock.lock();

    // return to budgets
    std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
    for (; it != d->payouts.end(); ++it)
    {
        if (it->second == ptr)
        {
            size_t size = it->first;

            d->payouts.erase(it);

            // release payouts_lock before taking budgets_lock to avoid
            // ever holding both locks at once
            d->payouts_lock.unlock();

            d->budgets_lock.lock();

            d->budgets.push_back(std::make_pair(size, ptr));

            d->budgets_lock.unlock();

            return;
        }
    }

    d->payouts_lock.unlock();

    NCNN_LOGE("FATAL ERROR! pool allocator get wild %p", ptr);
    ncnn::fastFree(ptr);
}
176
// Private state of UnlockedPoolAllocator (pimpl).
// Same layout as PoolAllocatorPrivate but without mutexes: this variant is
// intended for single-threaded use.
class UnlockedPoolAllocatorPrivate
{
public:
    unsigned int size_compare_ratio; // 0~256, reuse threshold scaled by 256
    std::list<std::pair<size_t, void*> > budgets; // (chunk size, chunk ptr) free for reuse
    std::list<std::pair<size_t, void*> > payouts; // (chunk size, chunk ptr) in use by callers
};
184
UnlockedPoolAllocator::UnlockedPoolAllocator()
    : Allocator(), d(new UnlockedPoolAllocatorPrivate)
{
    // default: reuse a cached chunk when requested size is at least 75% of it
    d->size_compare_ratio = 192; // 0.75f * 256
}
190
// Frees all cached chunks. Outstanding payouts indicate a caller lifetime
// bug; they are reported but intentionally not freed here.
UnlockedPoolAllocator::~UnlockedPoolAllocator()
{
    clear();

    if (!d->payouts.empty())
    {
        NCNN_LOGE("FATAL ERROR! unlocked pool allocator destroyed too early");
#if NCNN_STDIO
        std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
        for (; it != d->payouts.end(); ++it)
        {
            void* ptr = it->second;
            NCNN_LOGE("%p still in use", ptr);
        }
#endif
    }

    delete d;
}
210
// Copying is not supported; private copy constructor blocks it and leaves d null.
UnlockedPoolAllocator::UnlockedPoolAllocator(const UnlockedPoolAllocator&)
    : d(0)
{
}
215
// Assignment is not supported; this private no-op exists only to block it.
UnlockedPoolAllocator& UnlockedPoolAllocator::operator=(const UnlockedPoolAllocator&)
{
    return *this;
}
220
clear()221 void UnlockedPoolAllocator::clear()
222 {
223 std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
224 for (; it != d->budgets.end(); ++it)
225 {
226 void* ptr = it->second;
227 ncnn::fastFree(ptr);
228 }
229 d->budgets.clear();
230 }
231
set_size_compare_ratio(float scr)232 void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
233 {
234 if (scr < 0.f || scr > 1.f)
235 {
236 NCNN_LOGE("invalid size compare ratio %f", scr);
237 return;
238 }
239
240 d->size_compare_ratio = (unsigned int)(scr * 256);
241 }
242
fastMalloc(size_t size)243 void* UnlockedPoolAllocator::fastMalloc(size_t size)
244 {
245 // find free budget
246 std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
247 for (; it != d->budgets.end(); ++it)
248 {
249 size_t bs = it->first;
250
251 // size_compare_ratio ~ 100%
252 if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
253 {
254 void* ptr = it->second;
255
256 d->budgets.erase(it);
257
258 d->payouts.push_back(std::make_pair(bs, ptr));
259
260 return ptr;
261 }
262 }
263
264 // new
265 void* ptr = ncnn::fastMalloc(size);
266
267 d->payouts.push_back(std::make_pair(size, ptr));
268
269 return ptr;
270 }
271
// Return a previously paid-out chunk to the free list for reuse.
// A pointer not found in payouts is a caller bug; it is logged and freed
// directly so the memory is not leaked.
void UnlockedPoolAllocator::fastFree(void* ptr)
{
    // return to budgets
    std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
    for (; it != d->payouts.end(); ++it)
    {
        if (it->second == ptr)
        {
            size_t size = it->first;

            d->payouts.erase(it);

            d->budgets.push_back(std::make_pair(size, ptr));

            return;
        }
    }

    NCNN_LOGE("FATAL ERROR! unlocked pool allocator get wild %p", ptr);
    ncnn::fastFree(ptr);
}
293
294 #if NCNN_VULKAN
VkAllocator::VkAllocator(const VulkanDevice* _vkdev)
    : vkdev(_vkdev)
{
    // memory type indices are resolved lazily on first allocation;
    // (uint32_t)-1 marks "not yet determined"
    buffer_memory_type_index = (uint32_t)-1;
    image_memory_type_index = (uint32_t)-1;
    reserved_type_index = (uint32_t)-1;
    mappable = false;
    coherent = false;
}
304
// Base destructor drains any cached resources via the virtual clear().
VkAllocator::~VkAllocator()
{
    clear();
}
309
// Default clear() has nothing to release; subclasses override it.
void VkAllocator::clear()
{
}
313
// Round n up to the nearest multiple of `multiple` (multiple must be > 0).
static inline size_t round_up(size_t n, size_t multiple)
{
    const size_t padded = n + multiple - 1;
    return padded - padded % multiple;
}
318
// Round n down to the nearest multiple of `multiple` (multiple must be > 0).
static inline size_t round_down(size_t n, size_t multiple)
{
    return n - n % multiple;
}
323
// Make host writes in ptr's mapped range visible to the device.
// No-op for coherent memory. Returns 0 on success, -1 on failure.
int VkAllocator::flush(VkBufferMemory* ptr)
{
    if (coherent)
        return 0;

    VkMappedMemoryRange mappedMemoryRange;
    mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    mappedMemoryRange.pNext = 0;
    mappedMemoryRange.memory = ptr->memory;
    // offset/size must be aligned to nonCoherentAtomSize, so widen the range
    // outward to the containing aligned span
    mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
    mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;

    VkResult ret = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkFlushMappedMemoryRanges failed %d", ret);
        return -1;
    }

    return 0;
}
345
// Make device writes in ptr's mapped range visible to the host.
// No-op for coherent memory. Returns 0 on success, -1 on failure.
int VkAllocator::invalidate(VkBufferMemory* ptr)
{
    if (coherent)
        return 0;

    VkMappedMemoryRange mappedMemoryRange;
    mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    mappedMemoryRange.pNext = 0;
    mappedMemoryRange.memory = ptr->memory;
    // offset/size must be aligned to nonCoherentAtomSize, so widen the range
    // outward to the containing aligned span
    mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
    mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;

    VkResult ret = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkInvalidateMappedMemoryRanges failed %d", ret);
        return -1;
    }

    return 0;
}
367
// Create an exclusive-sharing VkBuffer of `size` bytes with the given usage.
// Returns 0 (VK_NULL_HANDLE) on failure.
VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage)
{
    VkBufferCreateInfo bufferCreateInfo;
    bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bufferCreateInfo.pNext = 0;
    bufferCreateInfo.flags = 0;
    bufferCreateInfo.size = size;
    bufferCreateInfo.usage = usage;
    bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    bufferCreateInfo.queueFamilyIndexCount = 0;
    bufferCreateInfo.pQueueFamilyIndices = 0;

    VkBuffer buffer = 0;
    VkResult ret = vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateBuffer failed %d", ret);
        return 0;
    }

    return buffer;
}
390
// Allocate `size` bytes of device memory from the given memory type.
// Returns 0 (VK_NULL_HANDLE) on failure.
VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index)
{
    VkMemoryAllocateInfo memoryAllocateInfo;
    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    memoryAllocateInfo.pNext = 0;
    memoryAllocateInfo.allocationSize = size;
    memoryAllocateInfo.memoryTypeIndex = memory_type_index;

    VkDeviceMemory memory = 0;
    VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkAllocateMemory failed %d", ret);
        return 0;
    }

    return memory;
}
409
// Allocate device memory dedicated to the given image or buffer via
// VK_KHR_dedicated_allocation. Exactly one of image/buffer should be a valid
// handle per the extension contract. Returns 0 (VK_NULL_HANDLE) on failure.
VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer)
{
    VkMemoryAllocateInfo memoryAllocateInfo;
    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    memoryAllocateInfo.pNext = 0;
    memoryAllocateInfo.allocationSize = size;
    memoryAllocateInfo.memoryTypeIndex = memory_type_index;

    // chain the dedicated-allocation request into the allocate info
    VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo;
    memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR;
    memoryDedicatedAllocateInfo.pNext = 0;
    memoryDedicatedAllocateInfo.image = image;
    memoryDedicatedAllocateInfo.buffer = buffer;
    memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;

    VkDeviceMemory memory = 0;
    VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkAllocateMemory failed %d", ret);
        return 0;
    }

    return memory;
}
435
create_image(int width,int height,int depth,VkFormat format,VkImageTiling tiling,VkImageUsageFlags usage)436 VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage)
437 {
438 VkImageCreateInfo imageCreateInfo;
439 imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
440 imageCreateInfo.pNext = 0;
441 imageCreateInfo.flags = 0;
442 imageCreateInfo.imageType = VK_IMAGE_TYPE_3D;
443 imageCreateInfo.format = format;
444 imageCreateInfo.extent.width = width;
445 imageCreateInfo.extent.height = height;
446 imageCreateInfo.extent.depth = depth;
447 imageCreateInfo.mipLevels = 1;
448 imageCreateInfo.arrayLayers = 1;
449 imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
450 imageCreateInfo.tiling = tiling;
451 imageCreateInfo.usage = usage;
452 imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
453 imageCreateInfo.queueFamilyIndexCount = 0;
454 imageCreateInfo.pQueueFamilyIndices = 0;
455 imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
456
457 VkImage image;
458 VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
459 if (ret != VK_SUCCESS)
460 {
461 NCNN_LOGE("vkCreateImage failed %d %d %d %d %d %d %d", ret, width, height, depth, format, tiling, usage);
462 return 0;
463 }
464
465 return image;
466 }
467
// Create a 3D image view covering the whole image (single mip, single layer,
// identity swizzle, color aspect). Returns 0 (VK_NULL_HANDLE) on failure.
VkImageView VkAllocator::create_imageview(VkImage image, VkFormat format)
{
    VkImageViewCreateInfo imageViewCreateInfo;
    imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    imageViewCreateInfo.pNext = 0;
    imageViewCreateInfo.flags = 0;
    imageViewCreateInfo.image = image;
    imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_3D;
    imageViewCreateInfo.format = format;
    imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
    imageViewCreateInfo.subresourceRange.levelCount = 1;
    imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
    imageViewCreateInfo.subresourceRange.layerCount = 1;

    VkImageView imageview;
    VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImageView failed %d", ret);
        return 0;
    }

    return imageview;
}
497
// Least common multiple of two positive alignments.
// Uses Euclid's algorithm (lcm = a / gcd(a, b) * b) instead of the previous
// repeated-addition search, which took O(lcm / b) iterations for coprime
// inputs. Dividing before multiplying keeps intermediate values small.
static inline size_t least_common_multiple(size_t a, size_t b)
{
    if (a == b)
        return a;

    // gcd via Euclid
    size_t x = a;
    size_t y = b;
    while (y != 0)
    {
        size_t r = x % y;
        x = y;
        y = r;
    }

    return a / x * b;
}
514
// Private state of VkBlobAllocator (pimpl).
// Buffers and image memory are carved out of large blocks; each block has a
// parallel free list of (offset, size) ranges in its budgets list.
class VkBlobAllocatorPrivate
{
public:
    size_t block_size;                   // preferred size of a newly created block
    size_t buffer_offset_alignment;      // alignment for sub-buffer offsets
    size_t bind_memory_offset_alignment; // alignment for image bind offsets (bufferImageGranularity)
    std::vector<std::list<std::pair<size_t, size_t> > > buffer_budgets; // per-block free (offset, size) ranges
    std::vector<VkBufferMemory*> buffer_blocks;                         // owned buffer blocks
    std::vector<std::list<std::pair<size_t, size_t> > > image_memory_budgets; // per-block free (offset, size) ranges
    std::vector<VkDeviceMemory> image_memory_blocks;                          // owned image memory blocks
};
526
// Construct a blob allocator; the effective block size is the preferred size
// rounded up to the buffer offset alignment.
VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
    : VkAllocator(_vkdev), d(new VkBlobAllocatorPrivate)
{
    d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();

    if (vkdev->info.type() == 1)
    {
        // on integrated gpu, there may be device local only memory too, eg. AMD APU
        // assuming larger alignment always keeps us safe :)

        // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
    }

    d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
}
545
// Destroys all pooled buffer blocks and image memory blocks.
VkBlobAllocator::~VkBlobAllocator()
{
    clear();

    delete d;
}
552
// Copying is not supported; private copy constructor blocks it and leaves d null.
VkBlobAllocator::VkBlobAllocator(const VkBlobAllocator&)
    : VkAllocator(0), d(0)
{
}
557
// Assignment is not supported; this private no-op exists only to block it.
VkBlobAllocator& VkBlobAllocator::operator=(const VkBlobAllocator&)
{
    return *this;
}
562
// Unmap, destroy and free every pooled buffer block and image memory block,
// and drop the associated free-range bookkeeping.
void VkBlobAllocator::clear()
{
    // NCNN_LOGE("VkBlobAllocator %lu", buffer_blocks.size());

    for (size_t i = 0; i < d->buffer_blocks.size(); i++)
    {
        VkBufferMemory* ptr = d->buffer_blocks[i];

        //         std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
        //         while (it != buffer_budgets[i].end())
        //         {
        //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second);
        //             it++;
        //         }

        // blocks were mapped at creation time when the memory type is mappable
        if (mappable)
            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);

        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->buffer_blocks.clear();

    d->buffer_budgets.clear();

    for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
    {
        VkDeviceMemory memory = d->image_memory_blocks[i];

        //         std::list< std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
        //         while (it != d->image_memory_budgets[i].end())
        //         {
        //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second);
        //             it++;
        //         }

        vkFreeMemory(vkdev->vkdevice(), memory, 0);
    }
    d->image_memory_blocks.clear();

    d->image_memory_budgets.clear();
}
607
// Sub-allocate an aligned buffer range from the pooled buffer blocks,
// creating a new block when no existing free range fits. The returned
// VkBufferMemory is owned by the caller (freed via fastFree).
VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
{
    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[i].begin();
        while (it != d->buffer_budgets[i].end())
        {
            size_t budget_size = it->second;
            if (budget_size < aligned_size)
            {
                it++;
                continue;
            }

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = it->first;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            // adjust buffer_budgets
            // consume the free range: remove it on exact fit, otherwise
            // shrink it from the front
            if (budget_size == aligned_size)
            {
                d->buffer_budgets[i].erase(it);
            }
            else
            {
                it->first += aligned_size;
                it->second -= aligned_size;
            }

            // NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

            return ptr;
        }
    }

    // no free range fits; a request larger than block_size gets a block of its own
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    // TODO respect VK_KHR_dedicated_allocation ?

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment
    // resolved once on first allocation, then reused for all later blocks
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    block->mapped_ptr = 0;
    if (mappable)
    {
        // keep the whole block persistently mapped; sub buffers share this pointer
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
    }

    d->buffer_blocks.push_back(block);

    // return sub buffer
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    // adjust buffer_budgets
    // the remainder of the new block (if any) becomes its free range
    std::list<std::pair<size_t, size_t> > budget;
    if (new_block_size > aligned_size)
    {
        budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
    }
    d->buffer_budgets.push_back(budget);

    // NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    return ptr;
}
722
// Return a sub buffer to its owning block's free list, coalescing with
// adjacent free ranges where possible. A pointer from an unknown block is
// logged and its wrapper deleted.
void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
{
    // NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    const int buffer_block_count = d->buffer_blocks.size();

    // locate the owning block by buffer + memory handle
    int block_index = -1;
    for (int i = 0; i < buffer_block_count; i++)
    {
        if (d->buffer_blocks[i]->buffer == ptr->buffer && d->buffer_blocks[i]->memory == ptr->memory)
        {
            block_index = i;
            break;
        }
    }

    if (block_index == -1)
    {
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->buffer);

        delete ptr;

        return;
    }

    // merge
    // find free ranges immediately before and after the returned range
    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->buffer_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->buffer_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[block_index].begin();
    for (; it != d->buffer_budgets[block_index].end(); it++)
    {
        if (it->first + it->second == ptr->offset)
        {
            it_merge_left = it;
        }
        else if (ptr->offset + ptr->capacity == it->first)
        {
            it_merge_right = it;
        }
    }

    if (it_merge_left != d->buffer_budgets[block_index].end() && it_merge_right != d->buffer_budgets[block_index].end())
    {
        // bridge: left range absorbs the freed range and the right range
        it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
        d->buffer_budgets[block_index].erase(it_merge_right);
    }
    else if (it_merge_left != d->buffer_budgets[block_index].end())
    {
        // extend the left neighbour forward over the freed range
        it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first;
    }
    else if (it_merge_right != d->buffer_budgets[block_index].end())
    {
        // extend the right neighbour backward over the freed range
        it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset;
        it_merge_right->first = ptr->offset;
    }
    else
    {
        if (ptr->offset == 0)
        {
            // chain leading block
            d->buffer_budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity));
        }
        else
        {
            d->buffer_budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity));
        }
    }

    delete ptr;
}
793
// Allocate a 3D storage image of w x h x c with the given element size and
// packing, binding it into pooled image memory (new block if none fits).
// Supports fp32/fp16 with elempack 1/4/8; elempack 8 doubles the image width.
// Returns 0 on unsupported elempack or oversized dimensions.
VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
{
    if (elempack != 1 && elempack != 4 && elempack != 8)
    {
        NCNN_LOGE("elempack must be 1 4 8");
        return 0;
    }

    // resolve format
    // from per-scalar byte size (elemsize / elempack): 4 -> fp32, 2 -> fp16
    VkFormat format = VK_FORMAT_UNDEFINED;

    if (elemsize / elempack == 4)
    {
        // fp32
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
    }
    if (elemsize / elempack == 2)
    {
        // fp16
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }

    // resolve image width height depth
    int width = w;
    int height = h;
    int depth = c;

    // large elempack spills on image w
    // elempack 8 uses a 4-channel texel, so two texels hold one element
    if (elempack == 8) width *= 2;

    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
    {
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);

    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;
    ptr->format = format;

    // TODO respect VK_KHR_dedicated_allocation ?
    VkMemoryRequirements memoryRequirements;
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

    const size_t size = memoryRequirements.size;
    // honour both the image's own alignment and bufferImageGranularity
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);

    size_t aligned_size = alignSize(size, alignment);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // find first spare space in image_memory_blocks
    for (int i = 0; i < image_memory_block_count; i++)
    {
        std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
        while (it != d->image_memory_budgets[i].end())
        {
            // we cannot use it->first directly for base offset alignment
            size_t bind_base_offset = it->first;
            size_t bind_offset = alignSize(bind_base_offset, alignment);
            size_t budget_size = it->second;
            // the range must also cover the alignment padding before bind_offset
            if (budget_size < aligned_size + (bind_offset - bind_base_offset))
            {
                it++;
                continue;
            }

            // bind at memory offset
            ptr->memory = d->image_memory_blocks[i];
            ptr->bind_offset = bind_offset;
            ptr->bind_capacity = aligned_size;

            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            if (bind_base_offset != bind_offset)
            {
                // NOTE there is small offset inside bind_base_offset and bind_offset
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
                // so that memory management could be easier
                aligned_size += (bind_offset - bind_base_offset);

                ptr->bind_offset = bind_base_offset;
                ptr->bind_capacity = aligned_size;
            }

            // adjust image_memory_budgets
            // consume the free range: remove on exact fit, else shrink from the front
            if (budget_size == aligned_size)
            {
                d->image_memory_budgets[i].erase(it);
            }
            else
            {
                it->first += aligned_size;
                it->second -= aligned_size;
            }

            // NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

            return ptr;
        }
    }

    // setup memory type and alignment
    // resolved once on first image allocation, then reused for all later blocks
    if (image_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
        }
        else
        {
            // discrete gpu, device local
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(image_memory_type_index);
        coherent = vkdev->is_coherent(image_memory_type_index);
    }

    // create new block
    // a request larger than block_size gets a block of its own
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // bind at memory offset
    ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
    ptr->bind_offset = 0;
    ptr->bind_capacity = aligned_size;

    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

    // do not allow host access to optimal tiling image
    ptr->mapped_ptr = 0;

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    // adjust image_memory_budgets
    // the remainder of the new block (if any) becomes its free range
    d->image_memory_blocks.push_back(ptr->memory);

    std::list<std::pair<size_t, size_t> > budget;
    if (new_block_size > aligned_size)
    {
        budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
    }
    d->image_memory_budgets.push_back(budget);

    // NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

    return ptr;
}
968
// Return an image's memory range to its owning block's free list, coalescing
// with adjacent free ranges. The image and view are destroyed only when no
// recorded command still references them (command_refcount == 0).
void VkBlobAllocator::fastFree(VkImageMemory* ptr)
{
    // NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // locate the owning memory block
    int block_index = -1;
    for (int i = 0; i < image_memory_block_count; i++)
    {
        if (d->image_memory_blocks[i] == ptr->memory)
        {
            block_index = i;
            break;
        }
    }

    if (block_index == -1)
    {
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->memory);

        if (!ptr->command_refcount)
        {
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }

        return;
    }

    // merge
    // find free ranges immediately before and after the returned range
    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->image_memory_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->image_memory_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[block_index].begin();
    for (; it != d->image_memory_budgets[block_index].end(); it++)
    {
        if (it->first + it->second == ptr->bind_offset)
        {
            it_merge_left = it;
        }
        else if (ptr->bind_offset + ptr->bind_capacity == it->first)
        {
            it_merge_right = it;
        }
    }

    if (it_merge_left != d->image_memory_budgets[block_index].end() && it_merge_right != d->image_memory_budgets[block_index].end())
    {
        // bridge: left range absorbs the freed range and the right range
        it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
        d->image_memory_budgets[block_index].erase(it_merge_right);
    }
    else if (it_merge_left != d->image_memory_budgets[block_index].end())
    {
        // extend the left neighbour forward over the freed range
        it_merge_left->second = ptr->bind_offset + ptr->bind_capacity - it_merge_left->first;
    }
    else if (it_merge_right != d->image_memory_budgets[block_index].end())
    {
        // extend the right neighbour backward over the freed range
        it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->bind_offset;
        it_merge_right->first = ptr->bind_offset;
    }
    else
    {
        if (ptr->bind_offset == 0)
        {
            // chain leading block
            d->image_memory_budgets[block_index].push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
        else
        {
            d->image_memory_budgets[block_index].push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
    }

    if (!ptr->command_refcount)
    {
        vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

        delete ptr;
    }
}
1051
// Pimpl state for VkWeightAllocator.
// Weight memory is carved from large blocks; per-block free space always
// lives at the block tail (see fastMalloc), so a single size per block is
// enough bookkeeping.
class VkWeightAllocatorPrivate
{
public:
    size_t block_size;                   // preferred block size, aligned to buffer_offset_alignment
    size_t buffer_offset_alignment;      // alignment for sub-buffer offsets inside a block
    size_t bind_memory_offset_alignment; // alignment for image bind offsets (bufferImageGranularity)
    std::vector<size_t> buffer_block_free_spaces;         // remaining tail bytes of each shared buffer block
    std::vector<VkBufferMemory*> buffer_blocks;           // shared buffer blocks sub-divided by fastMalloc
    std::vector<VkBufferMemory*> dedicated_buffer_blocks; // blocks created via dedicated allocation, never sub-divided
    std::vector<size_t> image_memory_block_free_spaces;   // remaining tail bytes of each shared image memory block
    std::vector<VkDeviceMemory> image_memory_blocks;           // shared device memory blocks for images
    std::vector<VkDeviceMemory> dedicated_image_memory_blocks; // dedicated device memory, one per image
};
1065
VkWeightAllocator(const VulkanDevice * _vkdev,size_t preferred_block_size)1066 VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
1067 : VkAllocator(_vkdev), d(new VkWeightAllocatorPrivate)
1068 {
1069 d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
1070 d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();
1071
1072 if (vkdev->info.type() == 1)
1073 {
1074 // on integrated gpu, there may be device local only memory too, eg. AMD APU
1075 // assuming larger alignment always keeps us safe :)
1076
1077 // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
1078 d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
1079 d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
1080 }
1081
1082 d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
1083 }
1084
// Releases every block (shared and dedicated) via clear(), then the pimpl.
VkWeightAllocator::~VkWeightAllocator()
{
    clear();

    delete d;
}
1091
// Copying is blocked by convention: this stub constructs an unusable object
// (null device, null pimpl) and is never meant to be invoked.
VkWeightAllocator::VkWeightAllocator(const VkWeightAllocator&)
    : VkAllocator(0), d(0)
{
}
1096
// Copy assignment is likewise a deliberate no-op stub.
VkWeightAllocator& VkWeightAllocator::operator=(const VkWeightAllocator&)
{
    return *this;
}
1101
clear()1102 void VkWeightAllocator::clear()
1103 {
1104 // NCNN_LOGE("VkWeightAllocator %lu %lu", d->buffer_blocks.size(), d->dedicated_buffer_blocks.size());
1105
1106 d->buffer_block_free_spaces.clear();
1107
1108 for (size_t i = 0; i < d->buffer_blocks.size(); i++)
1109 {
1110 VkBufferMemory* ptr = d->buffer_blocks[i];
1111
1112 if (mappable)
1113 vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1114
1115 vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1116 vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1117
1118 delete ptr;
1119 }
1120 d->buffer_blocks.clear();
1121
1122 for (size_t i = 0; i < d->dedicated_buffer_blocks.size(); i++)
1123 {
1124 VkBufferMemory* ptr = d->dedicated_buffer_blocks[i];
1125
1126 if (mappable)
1127 vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1128
1129 vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1130 vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1131
1132 delete ptr;
1133 }
1134 d->dedicated_buffer_blocks.clear();
1135
1136 d->image_memory_block_free_spaces.clear();
1137
1138 for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
1139 {
1140 VkDeviceMemory memory = d->image_memory_blocks[i];
1141
1142 vkFreeMemory(vkdev->vkdevice(), memory, 0);
1143 }
1144 d->image_memory_blocks.clear();
1145
1146 for (size_t i = 0; i < d->dedicated_image_memory_blocks.size(); i++)
1147 {
1148 VkDeviceMemory memory = d->dedicated_image_memory_blocks[i];
1149
1150 vkFreeMemory(vkdev->vkdevice(), memory, 0);
1151 }
1152 d->dedicated_image_memory_blocks.clear();
1153 }
1154
// Allocate a weight buffer of at least `size` bytes.
// Placement strategy:
//   1. reuse tail space of an existing shared block when it fits
//   2. otherwise create a new block (at least block_size bytes); when the
//      driver requires/prefers a dedicated allocation for it, the whole
//      block is handed out as one sub-buffer and tracked separately
// Sub-buffers are never reclaimed individually - see clear().
VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
    // NCNN_LOGE("VkWeightAllocator fastMalloc %lu", size);

    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        size_t free_size = d->buffer_block_free_spaces[i];
        if (free_size >= aligned_size)
        {
            // free space is always at the block tail; an oversized block is
            // created with zero free space, so block_size is a valid base here
            size_t block_offset = d->block_size - free_size;

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = block_offset;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            d->buffer_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    // an oversized request gets a block of exactly its own size
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    // ask the driver whether this buffer wants a dedicated allocation
    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2;
        bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR;
        bufferMemoryRequirementsInfo2.pNext = 0;
        bufferMemoryRequirementsInfo2.buffer = block->buffer;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            if (buffer_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
                }
                else
                {
                    // discrete gpu, device local
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(buffer_memory_type_index);
                coherent = vkdev->is_coherent(buffer_memory_type_index);
            }

            block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer);

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

            block->mapped_ptr = 0;
            if (mappable)
            {
                vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
            }

            // dedicated blocks are never sub-divided; the caller gets it all
            d->dedicated_buffer_blocks.push_back(block);

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = block->buffer;
            ptr->offset = 0;
            ptr->memory = block->memory;
            ptr->capacity = new_block_size;
            ptr->mapped_ptr = block->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    // NCNN_LOGE("VkWeightAllocator M %p", block->buffer);

    block->mapped_ptr = 0;
    if (mappable)
    {
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
    }

    d->buffer_blocks.push_back(block);

    // remaining tail space stays available for later requests
    d->buffer_block_free_spaces.push_back(new_block_size - aligned_size);

    // return sub buffer
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
}
1315
// Weight sub-buffers are never returned to their block; only the bookkeeping
// struct is deleted here. The underlying memory is reclaimed by clear().
void VkWeightAllocator::fastFree(VkBufferMemory* ptr)
{
    // NCNN_LOGE("VkWeightAllocator F %p", ptr->buffer);

    delete ptr;
}
1322
// Allocate an optimal-tiling 3D sampled image for weight data.
// elempack > 4 is stored by splitting the packed element across multiple
// rgba texels along the image width. Memory placement mirrors the buffer
// path: dedicated allocation when the driver asks for it, otherwise tail
// placement inside a shared image memory block.
// Returns 0 on unsupported elempack or oversized dimensions.
VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
{
    if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64)
    {
        NCNN_LOGE("elempack must be 1 4 8 16 32 64");
        return 0;
    }

    // resolve format from per-scalar size (elemsize / elempack)
    VkFormat format = VK_FORMAT_UNDEFINED;

    if (elemsize / elempack == 4)
    {
        // fp32
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT;
    }
    if (elemsize / elempack == 2)
    {
        // fp16
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }

    // resolve image width height depth
    int width = w;
    int height = h;
    int depth = c;

    // large elempack spills on image w
    if (elempack == 8) width *= 2;
    if (elempack == 16) width *= 4;
    if (elempack == 32) width *= 8;
    if (elempack == 64) width *= 16;

    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
    {
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);

    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;
    ptr->format = format;

    // ask the driver whether this image wants a dedicated allocation
    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2;
        imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR;
        imageMemoryRequirementsInfo2.pNext = 0;
        imageMemoryRequirementsInfo2.image = ptr->image;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            if (image_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
                }
                else
                {
                    // discrete gpu, device local
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(image_memory_type_index);
                coherent = vkdev->is_coherent(image_memory_type_index);
            }

            // bind memory
            ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, image_memory_type_index, ptr->image, 0);
            ptr->bind_offset = 0;
            ptr->bind_capacity = memoryRequirements2.memoryRequirements.size;

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            d->dedicated_image_memory_blocks.push_back(ptr->memory);

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

    const size_t size = memoryRequirements.size;
    // honor both the image's own alignment and bufferImageGranularity
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);

    size_t aligned_size = alignSize(size, alignment);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < image_memory_block_count; i++)
    {
        // we cannot use image_memory_block_free_spaces[i] directly for base offset alignment
        // NOTE(review): this assumes the block was created with exactly
        // block_size bytes; oversized blocks are pushed with zero free space,
        // so the subtraction stays valid - confirm if block sizing changes
        size_t bind_base_offset = d->block_size - d->image_memory_block_free_spaces[i];
        size_t bind_offset = alignSize(bind_base_offset, alignment);
        if (d->image_memory_block_free_spaces[i] >= aligned_size + (bind_offset - bind_base_offset))
        {
            // bind at memory offset
            ptr->memory = d->image_memory_blocks[i];
            ptr->bind_offset = bind_offset;
            ptr->bind_capacity = aligned_size;

            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            if (bind_base_offset != bind_offset)
            {
                // NOTE there is small offset inside bind_base_offset and bind_offset
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
                // so that memory management could be easier
                aligned_size += (bind_offset - bind_base_offset);

                ptr->bind_offset = bind_base_offset;
                ptr->bind_capacity = aligned_size;
            }

            d->image_memory_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    // setup memory type and alignment
    if (image_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
        }
        else
        {
            // discrete gpu, device local
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(image_memory_type_index);
        coherent = vkdev->is_coherent(image_memory_type_index);
    }

    // create new block
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // bind at memory offset
    ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
    ptr->bind_offset = 0;
    ptr->bind_capacity = aligned_size;

    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

    // do not allow host access to optimal tiling image
    ptr->mapped_ptr = 0;

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    d->image_memory_blocks.push_back(ptr->memory);
    d->image_memory_block_free_spaces.push_back(new_block_size - aligned_size);

    return ptr;
}
1541
fastFree(VkImageMemory * ptr)1542 void VkWeightAllocator::fastFree(VkImageMemory* ptr)
1543 {
1544 // NCNN_LOGE("VkWeightAllocator F %p", ptr->memory);
1545
1546 if (!ptr->command_refcount)
1547 {
1548 vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
1549 vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
1550
1551 delete ptr;
1552 }
1553 }
1554
// Pimpl state for VkStagingAllocator.
class VkStagingAllocatorPrivate
{
public:
    // reuse threshold in 1/256 units: a pooled buffer of capacity c is
    // recycled for a request of size s when s <= c and s >= c * ratio / 256
    unsigned int size_compare_ratio; // 0~256
    // pool of idle host-visible buffers returned by fastFree
    std::list<VkBufferMemory*> buffer_budgets;
};
1561
// Staging allocator: pools host-visible buffers for upload/download traffic.
VkStagingAllocator::VkStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkStagingAllocatorPrivate)
{
    // staging memory is always allocated host visible and coherent (see fastMalloc)
    mappable = true;
    coherent = true;

    d->size_compare_ratio = 192; // 0.75f * 256
}
1570
// Destroys all pooled staging buffers via clear(), then the pimpl.
VkStagingAllocator::~VkStagingAllocator()
{
    clear();

    delete d;
}
1577
// Copying is blocked by convention: this stub constructs an unusable object
// (null device, null pimpl) and is never meant to be invoked.
VkStagingAllocator::VkStagingAllocator(const VkStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
1582
// Copy assignment is likewise a deliberate no-op stub.
VkStagingAllocator& VkStagingAllocator::operator=(const VkStagingAllocator&)
{
    return *this;
}
1587
set_size_compare_ratio(float scr)1588 void VkStagingAllocator::set_size_compare_ratio(float scr)
1589 {
1590 if (scr < 0.f || scr > 1.f)
1591 {
1592 NCNN_LOGE("invalid size compare ratio %f", scr);
1593 return;
1594 }
1595
1596 d->size_compare_ratio = (unsigned int)(scr * 256);
1597 }
1598
clear()1599 void VkStagingAllocator::clear()
1600 {
1601 // NCNN_LOGE("VkStagingAllocator %lu", buffer_budgets.size());
1602
1603 for (std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin(); it != d->buffer_budgets.end(); it++)
1604 {
1605 VkBufferMemory* ptr = *it;
1606
1607 // NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);
1608
1609 vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1610 vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1611 vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1612
1613 delete ptr;
1614 }
1615 d->buffer_budgets.clear();
1616 }
1617
fastMalloc(size_t size)1618 VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size)
1619 {
1620 // find free budget
1621 std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin();
1622 for (; it != d->buffer_budgets.end(); it++)
1623 {
1624 VkBufferMemory* ptr = *it;
1625
1626 size_t capacity = ptr->capacity;
1627
1628 // size_compare_ratio ~ 100%
1629 if (capacity >= size && ((capacity * d->size_compare_ratio) >> 8) <= size)
1630 {
1631 d->buffer_budgets.erase(it);
1632
1633 // NCNN_LOGE("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity);
1634
1635 return ptr;
1636 }
1637 }
1638
1639 VkBufferMemory* ptr = new VkBufferMemory;
1640
1641 ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
1642 ptr->offset = 0;
1643
1644 VkMemoryRequirements memoryRequirements;
1645 vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);
1646
1647 // setup memory type
1648 if (buffer_memory_type_index == (uint32_t)-1)
1649 {
1650 buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
1651 }
1652
1653 ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
1654
1655 // ignore memoryRequirements.alignment as we always bind at zero offset
1656 vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);
1657
1658 ptr->capacity = size;
1659
1660 vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);
1661
1662 ptr->access_flags = 0;
1663 ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1664
1665 // NCNN_LOGE("VkStagingAllocator M %p %lu", ptr->buffer, size);
1666
1667 return ptr;
1668 }
1669
// Return the buffer to the pool for later reuse; nothing is destroyed here.
// Pooled buffers are only destroyed in clear() / the destructor.
// NOTE(review): the pool is not guarded by a lock - confirm callers serialize access
void VkStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    // NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);

    // return to buffer_budgets
    d->buffer_budgets.push_back(ptr);
}
1677
fastMalloc(int w,int h,int c,size_t elemsize,int)1678 VkImageMemory* VkStagingAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int /* elempack */)
1679 {
1680 // staging image is mainly used for storing small piece of dynamic parameters
1681 // we allocate host memory as a fake image, it's simple and good
1682
1683 const size_t size = w * h * c * elemsize;
1684
1685 VkImageMemory* ptr = new VkImageMemory;
1686
1687 ptr->image = 0;
1688 ptr->width = w;
1689 ptr->height = h;
1690 ptr->depth = c;
1691 ptr->format = VK_FORMAT_UNDEFINED;
1692 ptr->memory = 0;
1693 ptr->bind_offset = 0;
1694 ptr->bind_capacity = size;
1695
1696 ptr->mapped_ptr = malloc(size);
1697
1698 ptr->imageview = 0;
1699
1700 ptr->access_flags = 0;
1701 ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
1702 ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
1703 ptr->command_refcount = 0;
1704
1705 // NCNN_LOGE("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format);
1706
1707 return ptr;
1708 }
1709
// Free a fake staging image: release the host memory and the struct.
// There is no VkImage/VkDeviceMemory to destroy (see the image fastMalloc).
void VkStagingAllocator::fastFree(VkImageMemory* ptr)
{
    // NCNN_LOGE("VkStagingAllocator F %p", ptr->image);

    free(ptr->mapped_ptr);

    delete ptr;
}
1718
// Placeholder pimpl for VkWeightStagingAllocator; currently holds no state.
class VkWeightStagingAllocatorPrivate
{
public:
};
1723
// Weight staging allocator: one-shot host-visible upload buffers for weights.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkWeightStagingAllocatorPrivate)
{
    // buffers are allocated from host visible coherent memory (see fastMalloc)
    mappable = true;
    coherent = true;
}
1730
// No pooled resources to release; buffers are destroyed eagerly in fastFree.
VkWeightStagingAllocator::~VkWeightStagingAllocator()
{
    delete d;
}
1735
// Copying is blocked by convention: this stub constructs an unusable object
// (null device, null pimpl) and is never meant to be invoked.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VkWeightStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
1740
// Copy assignment is likewise a deliberate no-op stub.
VkWeightStagingAllocator& VkWeightStagingAllocator::operator=(const VkWeightStagingAllocator&)
{
    return *this;
}
1745
fastMalloc(size_t size)1746 VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size)
1747 {
1748 VkBufferMemory* ptr = new VkBufferMemory;
1749
1750 ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
1751 ptr->offset = 0;
1752
1753 VkMemoryRequirements memoryRequirements;
1754 vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);
1755
1756 // setup memory type
1757 if (buffer_memory_type_index == (uint32_t)-1)
1758 {
1759 buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
1760 }
1761
1762 ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
1763
1764 // ignore memoryRequirements.alignment as we always bind at zero offset
1765 vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);
1766
1767 ptr->capacity = size;
1768
1769 vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);
1770
1771 ptr->access_flags = 0;
1772 ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1773
1774 // NCNN_LOGE("VkWeightStagingAllocator M %p %lu", ptr->buffer, size);
1775
1776 return ptr;
1777 }
1778
// Destroy an upload buffer immediately: unmap, destroy the VkBuffer, free
// its memory, then delete the bookkeeping struct.
void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    // NCNN_LOGE("VkWeightStagingAllocator F %p", ptr->buffer);

    vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
    vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
    vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

    delete ptr;
}
1789
// Image allocation is not supported by the weight staging allocator.
VkImageMemory* VkWeightStagingAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    return 0;
}
1794
// No-op: this allocator never hands out images.
void VkWeightStagingAllocator::fastFree(VkImageMemory* /*ptr*/)
{
}
1798
1799 #if __ANDROID_API__ >= 26
// Wraps an AHardwareBuffer so it can be imported and sampled as a Vulkan image.
// init() performs the remaining setup (body not visible here); the sampler
// ycbcr conversion handle starts out null and is destroyed in the destructor.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb)
    : VkAllocator(_vkdev), hb(_hb)
{
    samplerYcbcrConversion = 0;

    init();
}
1807
// Destroy the sampler ycbcr conversion if one was created.
VkAndroidHardwareBufferImageAllocator::~VkAndroidHardwareBufferImageAllocator()
{
    if (samplerYcbcrConversion)
    {
        vkdev->vkDestroySamplerYcbcrConversionKHR(vkdev->vkdevice(), samplerYcbcrConversion, 0);
        samplerYcbcrConversion = 0;
    }
}
1816
// Copying is blocked by convention: this stub constructs an unusable object
// (null device) and is never meant to be invoked.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&)
    : VkAllocator(0)
{
}
1821
// Copy assignment is likewise a deliberate no-op stub.
VkAndroidHardwareBufferImageAllocator& VkAndroidHardwareBufferImageAllocator::operator=(const VkAndroidHardwareBufferImageAllocator&)
{
    return *this;
}
1826
// Buffer allocation is not supported; this allocator only imports images.
VkBufferMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(size_t /*size*/)
{
    return 0;
}
1831
// No-op: this allocator never hands out buffers.
void VkAndroidHardwareBufferImageAllocator::fastFree(VkBufferMemory* /*ptr*/)
{
}
1835
fastMalloc(int,int,int,size_t,int)1836 VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
1837 {
1838 VkResult ret;
1839
1840 VkExternalFormatANDROID externalFormat;
1841 externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
1842 externalFormat.pNext = 0;
1843 externalFormat.externalFormat = bufferFormatProperties.externalFormat;
1844
1845 VkExternalMemoryImageCreateInfo externalMemoryImageCreateInfo;
1846 externalMemoryImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO,
1847 externalMemoryImageCreateInfo.pNext = &externalFormat,
1848 externalMemoryImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID;
1849
1850 VkImageCreateInfo imageCreateInfo;
1851 imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
1852 imageCreateInfo.pNext = &externalMemoryImageCreateInfo;
1853 imageCreateInfo.flags = 0;
1854 imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
1855 imageCreateInfo.format = VK_FORMAT_UNDEFINED;
1856 imageCreateInfo.extent.width = bufferDesc.width;
1857 imageCreateInfo.extent.height = bufferDesc.height;
1858 imageCreateInfo.extent.depth = 1;
1859 imageCreateInfo.mipLevels = 1;
1860 imageCreateInfo.arrayLayers = 1;
1861 imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
1862 imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
1863 imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT;
1864 imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
1865 imageCreateInfo.queueFamilyIndexCount = 0;
1866 imageCreateInfo.pQueueFamilyIndices = 0;
1867 imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
1868
1869 VkImage image = 0;
1870 ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
1871 if (ret != VK_SUCCESS)
1872 {
1873 NCNN_LOGE("vkCreateImage failed %d", ret);
1874 return 0;
1875 }
1876
1877 // setup memory type
1878 if (image_memory_type_index == (uint32_t)-1)
1879 {
1880 image_memory_type_index = vkdev->find_memory_index(bufferProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
1881 }
1882
1883 VkImportAndroidHardwareBufferInfoANDROID importAndroidHardwareBufferInfo;
1884 importAndroidHardwareBufferInfo.sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID;
1885 importAndroidHardwareBufferInfo.pNext = 0;
1886 importAndroidHardwareBufferInfo.buffer = hb;
1887
1888 VkMemoryDedicatedAllocateInfo memoryDedicatedAllocateInfo;
1889 memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
1890 memoryDedicatedAllocateInfo.pNext = &importAndroidHardwareBufferInfo;
1891 memoryDedicatedAllocateInfo.image = image;
1892 memoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE;
1893
1894 VkMemoryAllocateInfo memoryAllocateInfo;
1895 memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
1896 memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;
1897 memoryAllocateInfo.allocationSize = bufferProperties.allocationSize;
1898 memoryAllocateInfo.memoryTypeIndex = image_memory_type_index;
1899
1900 VkDeviceMemory memory = 0;
1901 ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
1902 if (ret != VK_SUCCESS)
1903 {
1904 NCNN_LOGE("vkAllocateMemory failed %d", ret);
1905 return 0;
1906 }
1907
1908 VkBindImageMemoryInfo bindImageMemoryInfo;
1909 bindImageMemoryInfo.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
1910 bindImageMemoryInfo.pNext = 0;
1911 bindImageMemoryInfo.image = image;
1912 bindImageMemoryInfo.memory = memory;
1913 bindImageMemoryInfo.memoryOffset = 0;
1914 ret = vkdev->vkBindImageMemory2KHR(vkdev->vkdevice(), 1, &bindImageMemoryInfo);
1915 if (ret != VK_SUCCESS)
1916 {
1917 NCNN_LOGE("vkBindImageMemory2KHR failed %d", ret);
1918 vkDestroyImage(vkdev->vkdevice(), image, 0);
1919 return 0;
1920 }
1921
1922 VkSamplerYcbcrConversionInfoKHR samplerYcbcrConversionInfo;
1923 samplerYcbcrConversionInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO_KHR;
1924 samplerYcbcrConversionInfo.pNext = &externalFormat;
1925 samplerYcbcrConversionInfo.conversion = samplerYcbcrConversion;
1926
1927 VkImageViewCreateInfo imageViewCreateInfo;
1928 imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
1929 imageViewCreateInfo.pNext = &samplerYcbcrConversionInfo;
1930 imageViewCreateInfo.flags = 0;
1931 imageViewCreateInfo.image = image;
1932 imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
1933 imageViewCreateInfo.format = VK_FORMAT_UNDEFINED;
1934 imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
1935 imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
1936 imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
1937 imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
1938 imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
1939 imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
1940 imageViewCreateInfo.subresourceRange.levelCount = 1;
1941 imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
1942 imageViewCreateInfo.subresourceRange.layerCount = 1;
1943
1944 VkImageView imageview = 0;
1945 ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
1946 if (ret != VK_SUCCESS)
1947 {
1948 NCNN_LOGE("vkCreateImageView failed %d", ret);
1949 vkDestroyImage(vkdev->vkdevice(), image, 0);
1950 vkFreeMemory(vkdev->vkdevice(), memory, 0);
1951 return 0;
1952 }
1953
1954 VkImageMemory* ptr = new VkImageMemory;
1955 ptr->image = image;
1956 ptr->memory = memory;
1957 ptr->imageview = imageview;
1958 ptr->access_flags = 0;
1959 ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
1960 ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1961
1962 return ptr;
1963 }
1964
fastFree(VkImageMemory * ptr)1965 void VkAndroidHardwareBufferImageAllocator::fastFree(VkImageMemory* ptr)
1966 {
1967 vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
1968 vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
1969 vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1970
1971 delete ptr;
1972 }
1973
init()1974 int VkAndroidHardwareBufferImageAllocator::init()
1975 {
1976 AHardwareBuffer_describe(hb, &bufferDesc);
1977
1978 VkResult ret;
1979
1980 // resolve externalFormat
1981 bufferFormatProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID;
1982 bufferFormatProperties.pNext = 0;
1983
1984 bufferProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID;
1985 bufferProperties.pNext = &bufferFormatProperties;
1986
1987 ret = vkGetAndroidHardwareBufferPropertiesANDROID(vkdev->vkdevice(), hb, &bufferProperties);
1988 if (ret != VK_SUCCESS)
1989 {
1990 NCNN_LOGE("vkGetAndroidHardwareBufferPropertiesANDROID failed %d", ret);
1991 return -1;
1992 }
1993
1994 // setup samplerYcbcrConversion
1995 VkExternalFormatANDROID externalFormat;
1996 externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
1997 externalFormat.pNext = 0;
1998 externalFormat.externalFormat = bufferFormatProperties.externalFormat;
1999
2000 VkSamplerYcbcrConversionCreateInfoKHR samplerYcbcrConversionCreateInfo;
2001 samplerYcbcrConversionCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO_KHR;
2002 samplerYcbcrConversionCreateInfo.pNext = &externalFormat;
2003 samplerYcbcrConversionCreateInfo.format = VK_FORMAT_UNDEFINED;
2004 samplerYcbcrConversionCreateInfo.ycbcrModel = bufferFormatProperties.suggestedYcbcrModel;
2005 samplerYcbcrConversionCreateInfo.ycbcrRange = bufferFormatProperties.suggestedYcbcrRange;
2006 samplerYcbcrConversionCreateInfo.components = bufferFormatProperties.samplerYcbcrConversionComponents;
2007 samplerYcbcrConversionCreateInfo.xChromaOffset = bufferFormatProperties.suggestedXChromaOffset;
2008 samplerYcbcrConversionCreateInfo.yChromaOffset = bufferFormatProperties.suggestedYChromaOffset;
2009 samplerYcbcrConversionCreateInfo.chromaFilter = VK_FILTER_NEAREST;
2010 samplerYcbcrConversionCreateInfo.forceExplicitReconstruction = VK_FALSE;
2011
2012 ret = vkdev->vkCreateSamplerYcbcrConversionKHR(vkdev->vkdevice(), &samplerYcbcrConversionCreateInfo, 0, &samplerYcbcrConversion);
2013 if (ret != VK_SUCCESS)
2014 {
2015 NCNN_LOGE("vkCreateSamplerYcbcrConversionKHR failed %d", ret);
2016 return -1;
2017 }
2018
2019 return 0;
2020 }
2021
width() const2022 int VkAndroidHardwareBufferImageAllocator::width() const
2023 {
2024 return bufferDesc.width;
2025 }
2026
height() const2027 int VkAndroidHardwareBufferImageAllocator::height() const
2028 {
2029 return bufferDesc.height;
2030 }
2031
external_format() const2032 uint64_t VkAndroidHardwareBufferImageAllocator::external_format() const
2033 {
2034 return bufferFormatProperties.externalFormat;
2035 }
2036 #endif // __ANDROID_API__ >= 26
2037
2038 #endif // NCNN_VULKAN
2039
2040 } // namespace ncnn
2041