1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "allocator.h"
16 
17 #include "gpu.h"
18 #include "pipeline.h"
19 
20 #if __ANDROID_API__ >= 26
21 #include <android/hardware_buffer.h>
22 #endif // __ANDROID_API__ >= 26
23 
24 namespace ncnn {
25 
// Base-class destructor; the Allocator interface itself owns no resources.
Allocator::~Allocator()
{
}
29 
// Pimpl state for PoolAllocator.
// budgets holds free blocks available for reuse, payouts holds blocks
// currently handed out to callers; each entry is (capacity, pointer).
class PoolAllocatorPrivate
{
public:
    Mutex budgets_lock; // guards budgets
    Mutex payouts_lock; // guards payouts
    unsigned int size_compare_ratio; // 0~256, minimum reuse ratio scaled by 256
    std::list<std::pair<size_t, void*> > budgets;
    std::list<std::pair<size_t, void*> > payouts;
};
39 
// By default a cached block is reused only when the requested size is at
// least 75% of the block capacity (ratio stored as a fixed-point /256 value).
PoolAllocator::PoolAllocator()
    : Allocator(), d(new PoolAllocatorPrivate)
{
    d->size_compare_ratio = 192; // 0.75f * 256
}
45 
~PoolAllocator()46 PoolAllocator::~PoolAllocator()
47 {
48     clear();
49 
50     if (!d->payouts.empty())
51     {
52         NCNN_LOGE("FATAL ERROR! pool allocator destroyed too early");
53 #if NCNN_STDIO
54         std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
55         for (; it != d->payouts.end(); ++it)
56         {
57             void* ptr = it->second;
58             NCNN_LOGE("%p still in use", ptr);
59         }
60 #endif
61     }
62 
63     delete d;
64 }
65 
// Copying is not supported; this private copy constructor exists only to
// forbid it and intentionally leaves d null.
PoolAllocator::PoolAllocator(const PoolAllocator&)
    : d(0)
{
}
70 
// Assignment is not supported; this stub exists only to forbid it.
PoolAllocator& PoolAllocator::operator=(const PoolAllocator&)
{
    return *this;
}
75 
clear()76 void PoolAllocator::clear()
77 {
78     d->budgets_lock.lock();
79 
80     std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
81     for (; it != d->budgets.end(); ++it)
82     {
83         void* ptr = it->second;
84         ncnn::fastFree(ptr);
85     }
86     d->budgets.clear();
87 
88     d->budgets_lock.unlock();
89 }
90 
set_size_compare_ratio(float scr)91 void PoolAllocator::set_size_compare_ratio(float scr)
92 {
93     if (scr < 0.f || scr > 1.f)
94     {
95         NCNN_LOGE("invalid size compare ratio %f", scr);
96         return;
97     }
98 
99     d->size_compare_ratio = (unsigned int)(scr * 256);
100 }
101 
fastMalloc(size_t size)102 void* PoolAllocator::fastMalloc(size_t size)
103 {
104     d->budgets_lock.lock();
105 
106     // find free budget
107     std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
108     for (; it != d->budgets.end(); ++it)
109     {
110         size_t bs = it->first;
111 
112         // size_compare_ratio ~ 100%
113         if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
114         {
115             void* ptr = it->second;
116 
117             d->budgets.erase(it);
118 
119             d->budgets_lock.unlock();
120 
121             d->payouts_lock.lock();
122 
123             d->payouts.push_back(std::make_pair(bs, ptr));
124 
125             d->payouts_lock.unlock();
126 
127             return ptr;
128         }
129     }
130 
131     d->budgets_lock.unlock();
132 
133     // new
134     void* ptr = ncnn::fastMalloc(size);
135 
136     d->payouts_lock.lock();
137 
138     d->payouts.push_back(std::make_pair(size, ptr));
139 
140     d->payouts_lock.unlock();
141 
142     return ptr;
143 }
144 
fastFree(void * ptr)145 void PoolAllocator::fastFree(void* ptr)
146 {
147     d->payouts_lock.lock();
148 
149     // return to budgets
150     std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
151     for (; it != d->payouts.end(); ++it)
152     {
153         if (it->second == ptr)
154         {
155             size_t size = it->first;
156 
157             d->payouts.erase(it);
158 
159             d->payouts_lock.unlock();
160 
161             d->budgets_lock.lock();
162 
163             d->budgets.push_back(std::make_pair(size, ptr));
164 
165             d->budgets_lock.unlock();
166 
167             return;
168         }
169     }
170 
171     d->payouts_lock.unlock();
172 
173     NCNN_LOGE("FATAL ERROR! pool allocator get wild %p", ptr);
174     ncnn::fastFree(ptr);
175 }
176 
// Pimpl state for UnlockedPoolAllocator — same free/handed-out lists as
// PoolAllocatorPrivate, but without mutexes (caller guarantees single-thread use).
class UnlockedPoolAllocatorPrivate
{
public:
    unsigned int size_compare_ratio; // 0~256, minimum reuse ratio scaled by 256
    std::list<std::pair<size_t, void*> > budgets; // free blocks: (capacity, pointer)
    std::list<std::pair<size_t, void*> > payouts; // handed-out blocks: (capacity, pointer)
};
184 
// Same default reuse policy as PoolAllocator: a cached block is reused when
// the requested size is at least 75% of the block capacity.
UnlockedPoolAllocator::UnlockedPoolAllocator()
    : Allocator(), d(new UnlockedPoolAllocatorPrivate)
{
    d->size_compare_ratio = 192; // 0.75f * 256
}
190 
~UnlockedPoolAllocator()191 UnlockedPoolAllocator::~UnlockedPoolAllocator()
192 {
193     clear();
194 
195     if (!d->payouts.empty())
196     {
197         NCNN_LOGE("FATAL ERROR! unlocked pool allocator destroyed too early");
198 #if NCNN_STDIO
199         std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
200         for (; it != d->payouts.end(); ++it)
201         {
202             void* ptr = it->second;
203             NCNN_LOGE("%p still in use", ptr);
204         }
205 #endif
206     }
207 
208     delete d;
209 }
210 
// Copying is not supported; this private copy constructor exists only to
// forbid it and intentionally leaves d null.
UnlockedPoolAllocator::UnlockedPoolAllocator(const UnlockedPoolAllocator&)
    : d(0)
{
}
215 
// Assignment is not supported; this stub exists only to forbid it.
UnlockedPoolAllocator& UnlockedPoolAllocator::operator=(const UnlockedPoolAllocator&)
{
    return *this;
}
220 
clear()221 void UnlockedPoolAllocator::clear()
222 {
223     std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
224     for (; it != d->budgets.end(); ++it)
225     {
226         void* ptr = it->second;
227         ncnn::fastFree(ptr);
228     }
229     d->budgets.clear();
230 }
231 
set_size_compare_ratio(float scr)232 void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
233 {
234     if (scr < 0.f || scr > 1.f)
235     {
236         NCNN_LOGE("invalid size compare ratio %f", scr);
237         return;
238     }
239 
240     d->size_compare_ratio = (unsigned int)(scr * 256);
241 }
242 
fastMalloc(size_t size)243 void* UnlockedPoolAllocator::fastMalloc(size_t size)
244 {
245     // find free budget
246     std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin();
247     for (; it != d->budgets.end(); ++it)
248     {
249         size_t bs = it->first;
250 
251         // size_compare_ratio ~ 100%
252         if (bs >= size && ((bs * d->size_compare_ratio) >> 8) <= size)
253         {
254             void* ptr = it->second;
255 
256             d->budgets.erase(it);
257 
258             d->payouts.push_back(std::make_pair(bs, ptr));
259 
260             return ptr;
261         }
262     }
263 
264     // new
265     void* ptr = ncnn::fastMalloc(size);
266 
267     d->payouts.push_back(std::make_pair(size, ptr));
268 
269     return ptr;
270 }
271 
fastFree(void * ptr)272 void UnlockedPoolAllocator::fastFree(void* ptr)
273 {
274     // return to budgets
275     std::list<std::pair<size_t, void*> >::iterator it = d->payouts.begin();
276     for (; it != d->payouts.end(); ++it)
277     {
278         if (it->second == ptr)
279         {
280             size_t size = it->first;
281 
282             d->payouts.erase(it);
283 
284             d->budgets.push_back(std::make_pair(size, ptr));
285 
286             return;
287         }
288     }
289 
290     NCNN_LOGE("FATAL ERROR! unlocked pool allocator get wild %p", ptr);
291     ncnn::fastFree(ptr);
292 }
293 
294 #if NCNN_VULKAN
// Base Vulkan allocator; memory type indices start invalid ((uint32_t)-1)
// and are resolved lazily by subclasses on first allocation.
VkAllocator::VkAllocator(const VulkanDevice* _vkdev)
    : vkdev(_vkdev)
{
    buffer_memory_type_index = (uint32_t)-1; // resolved on first buffer allocation
    image_memory_type_index = (uint32_t)-1;  // resolved on first image allocation
    reserved_type_index = (uint32_t)-1;
    mappable = false; // set once a host-visible memory type is chosen
    coherent = false; // set once a host-coherent memory type is chosen
}
304 
// Destructor delegates cleanup to clear(), which subclasses override.
VkAllocator::~VkAllocator()
{
    clear();
}
309 
// Base implementation has nothing to release; subclasses free their caches here.
void VkAllocator::clear()
{
}
313 
// Round n up to the nearest multiple of `multiple` (multiple must be > 0).
static inline size_t round_up(size_t n, size_t multiple)
{
    const size_t remainder = n % multiple;
    return remainder == 0 ? n : n - remainder + multiple;
}
318 
// Round n down to the nearest multiple of `multiple` (multiple must be > 0).
static inline size_t round_down(size_t n, size_t multiple)
{
    return n - (n % multiple);
}
323 
flush(VkBufferMemory * ptr)324 int VkAllocator::flush(VkBufferMemory* ptr)
325 {
326     if (coherent)
327         return 0;
328 
329     VkMappedMemoryRange mappedMemoryRange;
330     mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
331     mappedMemoryRange.pNext = 0;
332     mappedMemoryRange.memory = ptr->memory;
333     mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
334     mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;
335 
336     VkResult ret = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
337     if (ret != VK_SUCCESS)
338     {
339         NCNN_LOGE("vkFlushMappedMemoryRanges failed %d", ret);
340         return -1;
341     }
342 
343     return 0;
344 }
345 
invalidate(VkBufferMemory * ptr)346 int VkAllocator::invalidate(VkBufferMemory* ptr)
347 {
348     if (coherent)
349         return 0;
350 
351     VkMappedMemoryRange mappedMemoryRange;
352     mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
353     mappedMemoryRange.pNext = 0;
354     mappedMemoryRange.memory = ptr->memory;
355     mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size());
356     mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size()) - mappedMemoryRange.offset;
357 
358     VkResult ret = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange);
359     if (ret != VK_SUCCESS)
360     {
361         NCNN_LOGE("vkInvalidateMappedMemoryRanges failed %d", ret);
362         return -1;
363     }
364 
365     return 0;
366 }
367 
create_buffer(size_t size,VkBufferUsageFlags usage)368 VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage)
369 {
370     VkBufferCreateInfo bufferCreateInfo;
371     bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
372     bufferCreateInfo.pNext = 0;
373     bufferCreateInfo.flags = 0;
374     bufferCreateInfo.size = size;
375     bufferCreateInfo.usage = usage;
376     bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
377     bufferCreateInfo.queueFamilyIndexCount = 0;
378     bufferCreateInfo.pQueueFamilyIndices = 0;
379 
380     VkBuffer buffer = 0;
381     VkResult ret = vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer);
382     if (ret != VK_SUCCESS)
383     {
384         NCNN_LOGE("vkCreateBuffer failed %d", ret);
385         return 0;
386     }
387 
388     return buffer;
389 }
390 
allocate_memory(size_t size,uint32_t memory_type_index)391 VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index)
392 {
393     VkMemoryAllocateInfo memoryAllocateInfo;
394     memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
395     memoryAllocateInfo.pNext = 0;
396     memoryAllocateInfo.allocationSize = size;
397     memoryAllocateInfo.memoryTypeIndex = memory_type_index;
398 
399     VkDeviceMemory memory = 0;
400     VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
401     if (ret != VK_SUCCESS)
402     {
403         NCNN_LOGE("vkAllocateMemory failed %d", ret);
404         return 0;
405     }
406 
407     return memory;
408 }
409 
allocate_dedicated_memory(size_t size,uint32_t memory_type_index,VkImage image,VkBuffer buffer)410 VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer)
411 {
412     VkMemoryAllocateInfo memoryAllocateInfo;
413     memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
414     memoryAllocateInfo.pNext = 0;
415     memoryAllocateInfo.allocationSize = size;
416     memoryAllocateInfo.memoryTypeIndex = memory_type_index;
417 
418     VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo;
419     memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR;
420     memoryDedicatedAllocateInfo.pNext = 0;
421     memoryDedicatedAllocateInfo.image = image;
422     memoryDedicatedAllocateInfo.buffer = buffer;
423     memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;
424 
425     VkDeviceMemory memory = 0;
426     VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
427     if (ret != VK_SUCCESS)
428     {
429         NCNN_LOGE("vkAllocateMemory failed %d", ret);
430         return 0;
431     }
432 
433     return memory;
434 }
435 
create_image(int width,int height,int depth,VkFormat format,VkImageTiling tiling,VkImageUsageFlags usage)436 VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage)
437 {
438     VkImageCreateInfo imageCreateInfo;
439     imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
440     imageCreateInfo.pNext = 0;
441     imageCreateInfo.flags = 0;
442     imageCreateInfo.imageType = VK_IMAGE_TYPE_3D;
443     imageCreateInfo.format = format;
444     imageCreateInfo.extent.width = width;
445     imageCreateInfo.extent.height = height;
446     imageCreateInfo.extent.depth = depth;
447     imageCreateInfo.mipLevels = 1;
448     imageCreateInfo.arrayLayers = 1;
449     imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
450     imageCreateInfo.tiling = tiling;
451     imageCreateInfo.usage = usage;
452     imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
453     imageCreateInfo.queueFamilyIndexCount = 0;
454     imageCreateInfo.pQueueFamilyIndices = 0;
455     imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
456 
457     VkImage image;
458     VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
459     if (ret != VK_SUCCESS)
460     {
461         NCNN_LOGE("vkCreateImage failed %d %d %d %d %d %d %d", ret, width, height, depth, format, tiling, usage);
462         return 0;
463     }
464 
465     return image;
466 }
467 
create_imageview(VkImage image,VkFormat format)468 VkImageView VkAllocator::create_imageview(VkImage image, VkFormat format)
469 {
470     VkImageViewCreateInfo imageViewCreateInfo;
471     imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
472     imageViewCreateInfo.pNext = 0;
473     imageViewCreateInfo.flags = 0;
474     imageViewCreateInfo.image = image;
475     imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_3D;
476     imageViewCreateInfo.format = format;
477     imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
478     imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
479     imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
480     imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
481     imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
482     imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
483     imageViewCreateInfo.subresourceRange.levelCount = 1;
484     imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
485     imageViewCreateInfo.subresourceRange.layerCount = 1;
486 
487     VkImageView imageview;
488     VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
489     if (ret != VK_SUCCESS)
490     {
491         NCNN_LOGE("vkCreateImageView failed %d", ret);
492         return 0;
493     }
494 
495     return imageview;
496 }
497 
// Least common multiple of two alignments via Euclid's algorithm:
// lcm(a, b) = a / gcd(a, b) * b (divide first to avoid overflow).
// Replaces the previous linear search, which performed O(b / gcd(a, b))
// additions and divided by zero when a == 0. Results are identical for
// the positive Vulkan alignment values this is called with.
static inline size_t least_common_multiple(size_t a, size_t b)
{
    // iterative gcd
    size_t x = a;
    size_t y = b;
    while (y != 0)
    {
        const size_t t = x % y;
        x = y;
        y = t;
    }

    // x == gcd(a, b); gcd is 0 only when both inputs are 0
    return x == 0 ? 0 : a / x * b;
}
514 
// Pimpl state for VkBlobAllocator.
// Each block has a parallel budget list of free (offset, size) ranges
// within that block; blocks and budgets are indexed together.
class VkBlobAllocatorPrivate
{
public:
    size_t block_size;                   // capacity of each newly created block
    size_t buffer_offset_alignment;      // alignment for sub-buffer offsets
    size_t bind_memory_offset_alignment; // alignment for image bind offsets (bufferImageGranularity)
    std::vector<std::list<std::pair<size_t, size_t> > > buffer_budgets; // free ranges per buffer block
    std::vector<VkBufferMemory*> buffer_blocks;
    std::vector<std::list<std::pair<size_t, size_t> > > image_memory_budgets; // free ranges per image memory block
    std::vector<VkDeviceMemory> image_memory_blocks;
};
526 
// Blob allocator: sub-allocates many small tensors from large device blocks
// of roughly preferred_block_size bytes, reducing vkAllocateMemory calls.
VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
    : VkAllocator(_vkdev), d(new VkBlobAllocatorPrivate)
{
    d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();

    if (vkdev->info.type() == 1)
    {
        // on integrated gpu, there may be device local only memory too, eg. AMD APU
        // assuming larger alignment always keeps us safe :)

        // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
    }

    // block size itself is kept aligned so sub-offsets always stay aligned
    d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
}
545 
// Release all blocks, then the pimpl state.
VkBlobAllocator::~VkBlobAllocator()
{
    clear();

    delete d;
}
552 
// Copying is not supported; this private copy constructor exists only to
// forbid it and intentionally leaves vkdev and d null.
VkBlobAllocator::VkBlobAllocator(const VkBlobAllocator&)
    : VkAllocator(0), d(0)
{
}
557 
// Assignment is not supported; this stub exists only to forbid it.
VkBlobAllocator& VkBlobAllocator::operator=(const VkBlobAllocator&)
{
    return *this;
}
562 
clear()563 void VkBlobAllocator::clear()
564 {
565     //     NCNN_LOGE("VkBlobAllocator %lu", buffer_blocks.size());
566 
567     for (size_t i = 0; i < d->buffer_blocks.size(); i++)
568     {
569         VkBufferMemory* ptr = d->buffer_blocks[i];
570 
571         //         std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
572         //         while (it != buffer_budgets[i].end())
573         //         {
574         //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second);
575         //             it++;
576         //         }
577 
578         if (mappable)
579             vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
580 
581         vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
582         vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
583 
584         delete ptr;
585     }
586     d->buffer_blocks.clear();
587 
588     d->buffer_budgets.clear();
589 
590     for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
591     {
592         VkDeviceMemory memory = d->image_memory_blocks[i];
593 
594         //         std::list< std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
595         //         while (it != d->image_memory_budgets[i].end())
596         //         {
597         //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second);
598         //             it++;
599         //         }
600 
601         vkFreeMemory(vkdev->vkdevice(), memory, 0);
602     }
603     d->image_memory_blocks.clear();
604 
605     d->image_memory_budgets.clear();
606 }
607 
// Sub-allocate `size` bytes from an existing buffer block (first-fit over the
// per-block free lists), or create a new block when nothing fits.
// The returned VkBufferMemory aliases the block's VkBuffer/VkDeviceMemory at
// the chosen offset; the caller returns it via fastFree.
VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
{
    // keep every sub-allocation aligned so offsets are valid bind offsets
    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        // each budget entry is a free (offset, size) range inside block i
        std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[i].begin();
        while (it != d->buffer_budgets[i].end())
        {
            size_t budget_size = it->second;
            if (budget_size < aligned_size)
            {
                it++;
                continue;
            }

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = it->first;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            // adjust buffer_budgets: consume the range entirely, or shrink
            // it from the front by the allocated amount
            if (budget_size == aligned_size)
            {
                d->buffer_budgets[i].erase(it);
            }
            else
            {
                it->first += aligned_size;
                it->second -= aligned_size;
            }

            //             NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

            return ptr;
        }
    }

    // a request larger than block_size gets a block of its own size
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    // TODO respect VK_KHR_dedicated_allocation ?

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment (resolved once, on the first block)
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    // map the whole block once; sub-allocations share the mapping
    block->mapped_ptr = 0;
    if (mappable)
    {
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
    }

    d->buffer_blocks.push_back(block);

    // return sub buffer at the start of the fresh block
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    // adjust buffer_budgets: the remainder of the new block is free
    std::list<std::pair<size_t, size_t> > budget;
    if (new_block_size > aligned_size)
    {
        budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
    }
    d->buffer_budgets.push_back(budget);

    //     NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    return ptr;
}
722 
// Return a sub-allocation to its owning block's free list, coalescing with
// adjacent free ranges on either side to limit fragmentation.
void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    const int buffer_block_count = d->buffer_blocks.size();

    // locate the block this sub-allocation came from
    int block_index = -1;
    for (int i = 0; i < buffer_block_count; i++)
    {
        if (d->buffer_blocks[i]->buffer == ptr->buffer && d->buffer_blocks[i]->memory == ptr->memory)
        {
            block_index = i;
            break;
        }
    }

    if (block_index == -1)
    {
        // not one of ours — report and drop the record
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->buffer);

        delete ptr;

        return;
    }

    // merge: find free ranges immediately before and/or after [offset, offset+capacity)
    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->buffer_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->buffer_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[block_index].begin();
    for (; it != d->buffer_budgets[block_index].end(); it++)
    {
        if (it->first + it->second == ptr->offset)
        {
            it_merge_left = it;
        }
        else if (ptr->offset + ptr->capacity == it->first)
        {
            it_merge_right = it;
        }
    }

    if (it_merge_left != d->buffer_budgets[block_index].end() && it_merge_right != d->buffer_budgets[block_index].end())
    {
        // free range on both sides: fuse all three into the left entry
        it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
        d->buffer_budgets[block_index].erase(it_merge_right);
    }
    else if (it_merge_left != d->buffer_budgets[block_index].end())
    {
        // extend the left neighbor to cover the freed range
        it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first;
    }
    else if (it_merge_right != d->buffer_budgets[block_index].end())
    {
        // grow the right neighbor backwards over the freed range
        it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset;
        it_merge_right->first = ptr->offset;
    }
    else
    {
        if (ptr->offset == 0)
        {
            // chain leading block
            d->buffer_budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity));
        }
        else
        {
            d->buffer_budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity));
        }
    }

    delete ptr;
}
793 
fastMalloc(int w,int h,int c,size_t elemsize,int elempack)794 VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
795 {
796     if (elempack != 1 && elempack != 4 && elempack != 8)
797     {
798         NCNN_LOGE("elempack must be 1 4 8");
799         return 0;
800     }
801 
802     // resolve format
803     VkFormat format = VK_FORMAT_UNDEFINED;
804 
805     if (elemsize / elempack == 4)
806     {
807         // fp32
808         if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
809         if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
810         if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
811     }
812     if (elemsize / elempack == 2)
813     {
814         // fp16
815         if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
816         if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
817         if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
818     }
819 
820     // resolve image width height depth
821     int width = w;
822     int height = h;
823     int depth = c;
824 
825     // large elempack spills on image w
826     if (elempack == 8) width *= 2;
827 
828     if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
829     {
830         NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
831         return 0;
832     }
833 
834     VkImageMemory* ptr = new VkImageMemory;
835 
836     ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
837 
838     ptr->width = width;
839     ptr->height = height;
840     ptr->depth = depth;
841     ptr->format = format;
842 
843     // TODO respect VK_KHR_dedicated_allocation ?
844     VkMemoryRequirements memoryRequirements;
845     vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);
846 
847     const size_t size = memoryRequirements.size;
848     const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);
849 
850     size_t aligned_size = alignSize(size, alignment);
851 
852     const int image_memory_block_count = d->image_memory_blocks.size();
853 
854     // find first spare space in image_memory_blocks
855     for (int i = 0; i < image_memory_block_count; i++)
856     {
857         std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
858         while (it != d->image_memory_budgets[i].end())
859         {
860             // we cannot use it->first directly for base offset alignment
861             size_t bind_base_offset = it->first;
862             size_t bind_offset = alignSize(bind_base_offset, alignment);
863             size_t budget_size = it->second;
864             if (budget_size < aligned_size + (bind_offset - bind_base_offset))
865             {
866                 it++;
867                 continue;
868             }
869 
870             // bind at memory offset
871             ptr->memory = d->image_memory_blocks[i];
872             ptr->bind_offset = bind_offset;
873             ptr->bind_capacity = aligned_size;
874 
875             vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
876 
877             // do not allow host access to optimal tiling image
878             ptr->mapped_ptr = 0;
879 
880             ptr->imageview = create_imageview(ptr->image, format);
881 
882             ptr->access_flags = 0;
883             ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
884             ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
885             ptr->command_refcount = 0;
886 
887             if (bind_base_offset != bind_offset)
888             {
889                 // NOTE there is small offset inside bind_base_offset and bind_offset
890                 // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
891                 // so that memory management could be easier
892                 aligned_size += (bind_offset - bind_base_offset);
893 
894                 ptr->bind_offset = bind_base_offset;
895                 ptr->bind_capacity = aligned_size;
896             }
897 
898             // adjust image_memory_budgets
899             if (budget_size == aligned_size)
900             {
901                 d->image_memory_budgets[i].erase(it);
902             }
903             else
904             {
905                 it->first += aligned_size;
906                 it->second -= aligned_size;
907             }
908 
909             //             NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
910 
911             return ptr;
912         }
913     }
914 
915     // setup memory type and alignment
916     if (image_memory_type_index == (uint32_t)-1)
917     {
918         if (vkdev->info.type() == 1)
919         {
920             // integrated gpu, prefer unified memory
921             image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
922         }
923         else
924         {
925             // discrete gpu, device local
926             image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
927         }
928 
929         mappable = vkdev->is_mappable(image_memory_type_index);
930         coherent = vkdev->is_coherent(image_memory_type_index);
931     }
932 
933     // create new block
934     size_t new_block_size = std::max(d->block_size, aligned_size);
935 
936     // bind at memory offset
937     ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
938     ptr->bind_offset = 0;
939     ptr->bind_capacity = aligned_size;
940 
941     // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
942     vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);
943 
944     // do not allow host access to optimal tiling image
945     ptr->mapped_ptr = 0;
946 
947     ptr->imageview = create_imageview(ptr->image, format);
948 
949     ptr->access_flags = 0;
950     ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
951     ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
952     ptr->command_refcount = 0;
953 
954     // adjust image_memory_budgets
955     d->image_memory_blocks.push_back(ptr->memory);
956 
957     std::list<std::pair<size_t, size_t> > budget;
958     if (new_block_size > aligned_size)
959     {
960         budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
961     }
962     d->image_memory_budgets.push_back(budget);
963 
964     //     NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);
965 
966     return ptr;
967 }
968 
// Return an image sub-allocation to the owning memory block's free list.
// The freed range [bind_offset, bind_offset + bind_capacity) is coalesced
// with any adjacent free ranges of the same block. The VkImageView/VkImage
// are destroyed here unless a command buffer still references them
// (command_refcount != 0) — in that case the referencing command path is
// presumably responsible for the final destroy; confirm against command.cpp.
void VkBlobAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // locate the memory block this allocation was bound into
    int block_index = -1;
    for (int i = 0; i < image_memory_block_count; i++)
    {
        if (d->image_memory_blocks[i] == ptr->memory)
        {
            block_index = i;
            break;
        }
    }

    if (block_index == -1)
    {
        // memory not owned by this allocator — log, destroy what we can, bail out
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->memory);

        if (!ptr->command_refcount)
        {
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }

        return;
    }

    // merge with neighboring free ranges
    // it_merge_left  : free range that ends exactly where this one starts
    // it_merge_right : free range that starts exactly where this one ends
    // each budget entry is (offset, size)
    std::list<std::pair<size_t, size_t> >::iterator it_merge_left = d->image_memory_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it_merge_right = d->image_memory_budgets[block_index].end();
    std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[block_index].begin();
    for (; it != d->image_memory_budgets[block_index].end(); it++)
    {
        if (it->first + it->second == ptr->bind_offset)
        {
            it_merge_left = it;
        }
        else if (ptr->bind_offset + ptr->bind_capacity == it->first)
        {
            it_merge_right = it;
        }
    }

    if (it_merge_left != d->image_memory_budgets[block_index].end() && it_merge_right != d->image_memory_budgets[block_index].end())
    {
        // bridge: left range absorbs the freed range and the right range
        it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first;
        d->image_memory_budgets[block_index].erase(it_merge_right);
    }
    else if (it_merge_left != d->image_memory_budgets[block_index].end())
    {
        // extend the left neighbor to cover the freed range
        it_merge_left->second = ptr->bind_offset + ptr->bind_capacity - it_merge_left->first;
    }
    else if (it_merge_right != d->image_memory_budgets[block_index].end())
    {
        // grow the right neighbor backwards to the freed range's start
        it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->bind_offset;
        it_merge_right->first = ptr->bind_offset;
    }
    else
    {
        // no adjacent free range — insert a standalone entry
        if (ptr->bind_offset == 0)
        {
            // chain leading block
            d->image_memory_budgets[block_index].push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
        else
        {
            d->image_memory_budgets[block_index].push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
    }

    if (!ptr->command_refcount)
    {
        vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

        delete ptr;
    }
}
1051 
// Internal state of VkWeightAllocator (pimpl).
// Weight memory is bump-allocated from the tail of fixed-size blocks and is
// never recycled individually — see VkWeightAllocator::fastFree/clear.
class VkWeightAllocatorPrivate
{
public:
    size_t block_size;                   // granularity of newly created blocks, aligned in the ctor
    size_t buffer_offset_alignment;      // alignment for sub-buffer offsets
    size_t bind_memory_offset_alignment; // alignment for image bind offsets (bufferImageGranularity)
    std::vector<size_t> buffer_block_free_spaces; // bytes still free at the tail of each buffer block
    std::vector<VkBufferMemory*> buffer_blocks;   // shared blocks carved into sub-buffers
    std::vector<VkBufferMemory*> dedicated_buffer_blocks; // one block per dedicated allocation
    std::vector<size_t> image_memory_block_free_spaces;   // bytes still free at the tail of each image memory block
    std::vector<VkDeviceMemory> image_memory_blocks;      // shared device memory for optimal-tiling images
    std::vector<VkDeviceMemory> dedicated_image_memory_blocks; // dedicated image allocations
};
1065 
// Construct a weight allocator on the given device.
// preferred_block_size is rounded up to the resolved buffer offset alignment
// and used as the minimum size of newly created memory blocks.
VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
    : VkAllocator(_vkdev), d(new VkWeightAllocatorPrivate)
{
    d->buffer_offset_alignment = vkdev->info.buffer_offset_alignment();
    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();

    if (vkdev->info.type() == 1)
    {
        // on integrated gpu, there may be device local only memory too, eg. AMD APU
        // assuming larger alignment always keeps us safe :)

        // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.memory_map_alignment());
        d->buffer_offset_alignment = least_common_multiple(d->buffer_offset_alignment, vkdev->info.non_coherent_atom_size());
    }

    d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
}
1084 
// Destroy the allocator, releasing all buffer/image memory blocks via clear().
VkWeightAllocator::~VkWeightAllocator()
{
    clear();

    delete d;
}
1091 
// Copying is not supported; this stub exists only to disable the copy
// constructor (pre-C++11 idiom, d is intentionally left null).
VkWeightAllocator::VkWeightAllocator(const VkWeightAllocator&)
    : VkAllocator(0), d(0)
{
}
1096 
// Assignment is not supported; intentionally a no-op to disable copying.
VkWeightAllocator& VkWeightAllocator::operator=(const VkWeightAllocator&)
{
    return *this;
}
1101 
clear()1102 void VkWeightAllocator::clear()
1103 {
1104     //     NCNN_LOGE("VkWeightAllocator %lu %lu", d->buffer_blocks.size(), d->dedicated_buffer_blocks.size());
1105 
1106     d->buffer_block_free_spaces.clear();
1107 
1108     for (size_t i = 0; i < d->buffer_blocks.size(); i++)
1109     {
1110         VkBufferMemory* ptr = d->buffer_blocks[i];
1111 
1112         if (mappable)
1113             vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1114 
1115         vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1116         vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1117 
1118         delete ptr;
1119     }
1120     d->buffer_blocks.clear();
1121 
1122     for (size_t i = 0; i < d->dedicated_buffer_blocks.size(); i++)
1123     {
1124         VkBufferMemory* ptr = d->dedicated_buffer_blocks[i];
1125 
1126         if (mappable)
1127             vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1128 
1129         vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1130         vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1131 
1132         delete ptr;
1133     }
1134     d->dedicated_buffer_blocks.clear();
1135 
1136     d->image_memory_block_free_spaces.clear();
1137 
1138     for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
1139     {
1140         VkDeviceMemory memory = d->image_memory_blocks[i];
1141 
1142         vkFreeMemory(vkdev->vkdevice(), memory, 0);
1143     }
1144     d->image_memory_blocks.clear();
1145 
1146     for (size_t i = 0; i < d->dedicated_image_memory_blocks.size(); i++)
1147     {
1148         VkDeviceMemory memory = d->dedicated_image_memory_blocks[i];
1149 
1150         vkFreeMemory(vkdev->vkdevice(), memory, 0);
1151     }
1152     d->dedicated_image_memory_blocks.clear();
1153 }
1154 
// Allocate a device buffer of at least `size` bytes for weight storage.
// Strategy: bump-allocate from the tail of an existing block when one has
// room; otherwise create a new block. A new block goes through the
// VK_KHR_dedicated_allocation path when the driver requires/prefers it,
// else it is backed by a plain vkAllocateMemory and registered for reuse.
// The returned VkBufferMemory is a sub-buffer view; the underlying
// VkBuffer/VkDeviceMemory stay owned by the allocator until clear().
VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
    //     NCNN_LOGE("VkWeightAllocator fastMalloc %lu", size);

    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        size_t free_size = d->buffer_block_free_spaces[i];
        if (free_size >= aligned_size)
        {
            // free space lives at the tail of the block
            size_t block_offset = d->block_size - free_size;

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = block_offset;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            d->buffer_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        // query whether the driver wants a dedicated allocation for this buffer
        VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2;
        bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR;
        bufferMemoryRequirementsInfo2.pNext = 0;
        bufferMemoryRequirementsInfo2.buffer = block->buffer;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            if (buffer_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
                }
                else
                {
                    // discrete gpu, device local
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(buffer_memory_type_index);
                coherent = vkdev->is_coherent(buffer_memory_type_index);
            }

            block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer);

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

            block->mapped_ptr = 0;
            if (mappable)
            {
                vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
            }

            // a dedicated block is consumed whole and never offers spare space
            d->dedicated_buffer_blocks.push_back(block);

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = block->buffer;
            ptr->offset = 0;
            ptr->memory = block->memory;
            ptr->capacity = new_block_size;
            ptr->mapped_ptr = block->mapped_ptr;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    //     NCNN_LOGE("VkWeightAllocator M %p", block->buffer);

    block->mapped_ptr = 0;
    if (mappable)
    {
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
    }

    // register the block so later fastMalloc calls can reuse its tail space
    d->buffer_blocks.push_back(block);

    d->buffer_block_free_spaces.push_back(new_block_size - aligned_size);

    // return sub buffer
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
}
1315 
// Release a sub-buffer handle returned by fastMalloc.
// Only the wrapper object is deleted; the freed range is NOT returned to the
// block's free space — weight memory is reclaimed wholesale in clear().
void VkWeightAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkWeightAllocator F %p", ptr->buffer);

    delete ptr;
}
1322 
// Allocate an optimal-tiling 3D sampled image of w x h x c elements for
// weight storage. elempack selects the texel format (fp32/fp16, scalar or
// RGBA); packs > 4 spill extra components along the image width. Memory is
// either a dedicated allocation (when the driver requires/prefers it) or
// bump-allocated from the tail of a shared image memory block. Returns 0 on
// invalid elempack or when a dimension exceeds maxImageDimension3D.
VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
{
    if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64)
    {
        NCNN_LOGE("elempack must be 1 4 8 16 32 64");
        return 0;
    }

    // resolve format
    VkFormat format = VK_FORMAT_UNDEFINED;

    if (elemsize / elempack == 4)
    {
        // fp32
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT;
    }
    if (elemsize / elempack == 2)
    {
        // fp16
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }

    // resolve image width height depth
    int width = w;
    int height = h;
    int depth = c;

    // large elempack spills on image w
    if (elempack == 8) width *= 2;
    if (elempack == 16) width *= 4;
    if (elempack == 32) width *= 8;
    if (elempack == 64) width *= 16;

    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
    {
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);

    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;
    ptr->format = format;

    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation())
    {
        // query whether the driver wants a dedicated allocation for this image
        VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2;
        imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR;
        imageMemoryRequirementsInfo2.pNext = 0;
        imageMemoryRequirementsInfo2.image = ptr->image;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            if (image_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
                }
                else
                {
                    // discrete gpu, device local
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }

                mappable = vkdev->is_mappable(image_memory_type_index);
                coherent = vkdev->is_coherent(image_memory_type_index);
            }

            // bind memory
            ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, image_memory_type_index, ptr->image, 0);
            ptr->bind_offset = 0;
            ptr->bind_capacity = memoryRequirements2.memoryRequirements.size;

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            d->dedicated_image_memory_blocks.push_back(ptr->memory);

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

    const size_t size = memoryRequirements.size;
    // image bind offsets must also respect bufferImageGranularity
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);

    size_t aligned_size = alignSize(size, alignment);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < image_memory_block_count; i++)
    {
        // we cannot use image_memory_block_free_spaces[i] directly for base offset alignment
        size_t bind_base_offset = d->block_size - d->image_memory_block_free_spaces[i];
        size_t bind_offset = alignSize(bind_base_offset, alignment);
        if (d->image_memory_block_free_spaces[i] >= aligned_size + (bind_offset - bind_base_offset))
        {
            // bind at memory offset
            ptr->memory = d->image_memory_blocks[i];
            ptr->bind_offset = bind_offset;
            ptr->bind_capacity = aligned_size;

            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            if (bind_base_offset != bind_offset)
            {
                // NOTE there is small offset inside bind_base_offset and bind_offset
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
                // so that memory management could be easier
                aligned_size += (bind_offset - bind_base_offset);

                ptr->bind_offset = bind_base_offset;
                ptr->bind_capacity = aligned_size;
            }

            d->image_memory_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    // setup memory type and alignment
    if (image_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
        }
        else
        {
            // discrete gpu, device local
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(image_memory_type_index);
        coherent = vkdev->is_coherent(image_memory_type_index);
    }

    // create new block
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // bind at memory offset
    ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
    ptr->bind_offset = 0;
    ptr->bind_capacity = aligned_size;

    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

    // do not allow host access to optimal tiling image
    ptr->mapped_ptr = 0;

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    // register the new block so later allocations can reuse its tail space
    d->image_memory_blocks.push_back(ptr->memory);
    d->image_memory_block_free_spaces.push_back(new_block_size - aligned_size);

    return ptr;
}
1541 
// Release an image sub-allocation handle. The bound memory range is NOT
// returned to the block's free space — it is reclaimed wholesale in clear().
// When command_refcount != 0 the image is still referenced by a command
// buffer; the referencing command path presumably destroys it later —
// confirm against command.cpp.
void VkWeightAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkWeightAllocator F %p", ptr->memory);

    if (!ptr->command_refcount)
    {
        vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

        delete ptr;
    }
}
1554 
// Internal state of VkStagingAllocator (pimpl).
class VkStagingAllocatorPrivate
{
public:
    unsigned int size_compare_ratio; // 0~256, fixed-point fraction used to decide budget reuse
    std::list<VkBufferMemory*> buffer_budgets; // freed staging buffers kept for reuse
};
1561 
// Construct a staging allocator. Staging memory is always allocated from a
// host-visible + host-coherent memory type (see fastMalloc), so mappable and
// coherent are fixed to true up front.
VkStagingAllocator::VkStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkStagingAllocatorPrivate)
{
    mappable = true;
    coherent = true;

    d->size_compare_ratio = 192; // 0.75f * 256
}
1570 
// Destroy the allocator, releasing all recycled staging buffers via clear().
VkStagingAllocator::~VkStagingAllocator()
{
    clear();

    delete d;
}
1577 
// Copying is not supported; this stub only disables the copy constructor
// (pre-C++11 idiom, d is intentionally left null).
VkStagingAllocator::VkStagingAllocator(const VkStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
1582 
// Assignment is not supported; intentionally a no-op to disable copying.
VkStagingAllocator& VkStagingAllocator::operator=(const VkStagingAllocator&)
{
    return *this;
}
1587 
set_size_compare_ratio(float scr)1588 void VkStagingAllocator::set_size_compare_ratio(float scr)
1589 {
1590     if (scr < 0.f || scr > 1.f)
1591     {
1592         NCNN_LOGE("invalid size compare ratio %f", scr);
1593         return;
1594     }
1595 
1596     d->size_compare_ratio = (unsigned int)(scr * 256);
1597 }
1598 
clear()1599 void VkStagingAllocator::clear()
1600 {
1601     //     NCNN_LOGE("VkStagingAllocator %lu", buffer_budgets.size());
1602 
1603     for (std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin(); it != d->buffer_budgets.end(); it++)
1604     {
1605         VkBufferMemory* ptr = *it;
1606 
1607         //         NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);
1608 
1609         vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1610         vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1611         vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1612 
1613         delete ptr;
1614     }
1615     d->buffer_budgets.clear();
1616 }
1617 
// Allocate a host-visible staging buffer of `size` bytes.
// First tries to reuse a recycled buffer from the budget list: a candidate is
// accepted when it is large enough but not excessively larger than requested
// (capacity * size_compare_ratio / 256 <= size). Otherwise a fresh buffer is
// created, bound, and persistently mapped. Ownership transfers to the caller;
// fastFree returns it to the budget list.
VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size)
{
    // find free budget
    std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin();
    for (; it != d->buffer_budgets.end(); it++)
    {
        VkBufferMemory* ptr = *it;

        size_t capacity = ptr->capacity;

        // size_compare_ratio ~ 100%
        if (capacity >= size && ((capacity * d->size_compare_ratio) >> 8) <= size)
        {
            d->buffer_budgets.erase(it);

            //             NCNN_LOGE("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity);

            return ptr;
        }
    }

    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    ptr->offset = 0;

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);

    // setup memory type
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        // require host-visible + coherent; prefer host-cached, avoid device-local
        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    }

    ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);

    ptr->capacity = size;

    // persistently mapped until clear() destroys the buffer
    vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    //     NCNN_LOGE("VkStagingAllocator M %p %lu", ptr->buffer, size);

    return ptr;
}
1669 
// Recycle a staging buffer: ownership returns to the budget list so a later
// fastMalloc of a suitable size can reuse it. Nothing is destroyed here;
// accumulated budgets are released in clear().
void VkStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);

    // return to buffer_budgets
    d->buffer_budgets.push_back(ptr);
}
1677 
fastMalloc(int w,int h,int c,size_t elemsize,int)1678 VkImageMemory* VkStagingAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int /* elempack */)
1679 {
1680     // staging image is mainly used for storing small piece of dynamic parameters
1681     // we allocate host memory as a fake image, it's simple and good
1682 
1683     const size_t size = w * h * c * elemsize;
1684 
1685     VkImageMemory* ptr = new VkImageMemory;
1686 
1687     ptr->image = 0;
1688     ptr->width = w;
1689     ptr->height = h;
1690     ptr->depth = c;
1691     ptr->format = VK_FORMAT_UNDEFINED;
1692     ptr->memory = 0;
1693     ptr->bind_offset = 0;
1694     ptr->bind_capacity = size;
1695 
1696     ptr->mapped_ptr = malloc(size);
1697 
1698     ptr->imageview = 0;
1699 
1700     ptr->access_flags = 0;
1701     ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
1702     ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
1703     ptr->command_refcount = 0;
1704 
1705     //     NCNN_LOGE("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format);
1706 
1707     return ptr;
1708 }
1709 
fastFree(VkImageMemory * ptr)1710 void VkStagingAllocator::fastFree(VkImageMemory* ptr)
1711 {
1712     //     NCNN_LOGE("VkStagingAllocator F %p", ptr->image);
1713 
1714     free(ptr->mapped_ptr);
1715 
1716     delete ptr;
1717 }
1718 
// Private implementation data for VkWeightStagingAllocator.
// Currently empty; kept so the public class layout stays stable if
// bookkeeping state is added later (pimpl idiom, matching the other
// allocator classes in this file).
class VkWeightStagingAllocatorPrivate
{
public:
};
1723 
VkWeightStagingAllocator(const VulkanDevice * _vkdev)1724 VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev)
1725     : VkAllocator(_vkdev), d(new VkWeightStagingAllocatorPrivate)
1726 {
1727     mappable = true;
1728     coherent = true;
1729 }
1730 
// Destroy the allocator and its private data.
// NOTE(review): unlike PoolAllocator there is no in-use diagnostic here —
// presumably all staging buffers are freed before the allocator; confirm.
VkWeightStagingAllocator::~VkWeightStagingAllocator()
{
    delete d;
}
1735 
// Copy construction is not meaningfully supported: the stub leaves the
// object with a null device and null private data, so a copy must never
// actually be used.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VkWeightStagingAllocator&)
    : VkAllocator(0), d(0)
{
}
1740 
// Copy assignment is not meaningfully supported: the stub performs no copy
// at all and simply returns *this unchanged.
VkWeightStagingAllocator& VkWeightStagingAllocator::operator=(const VkWeightStagingAllocator&)
{
    return *this;
}
1745 
fastMalloc(size_t size)1746 VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size)
1747 {
1748     VkBufferMemory* ptr = new VkBufferMemory;
1749 
1750     ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
1751     ptr->offset = 0;
1752 
1753     VkMemoryRequirements memoryRequirements;
1754     vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);
1755 
1756     // setup memory type
1757     if (buffer_memory_type_index == (uint32_t)-1)
1758     {
1759         buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
1760     }
1761 
1762     ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
1763 
1764     // ignore memoryRequirements.alignment as we always bind at zero offset
1765     vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);
1766 
1767     ptr->capacity = size;
1768 
1769     vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);
1770 
1771     ptr->access_flags = 0;
1772     ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1773 
1774     //     NCNN_LOGE("VkWeightStagingAllocator M %p %lu", ptr->buffer, size);
1775 
1776     return ptr;
1777 }
1778 
fastFree(VkBufferMemory * ptr)1779 void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr)
1780 {
1781     //     NCNN_LOGE("VkWeightStagingAllocator F %p", ptr->buffer);
1782 
1783     vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
1784     vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
1785     vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1786 
1787     delete ptr;
1788 }
1789 
// Image allocation is not supported by the weight staging allocator; weight
// upload always goes through buffers.  Returns null unconditionally.
VkImageMemory* VkWeightStagingAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    return 0;
}
1794 
// No-op counterpart of the unsupported image fastMalloc above.
void VkWeightStagingAllocator::fastFree(VkImageMemory* /*ptr*/)
{
}
1798 
1799 #if __ANDROID_API__ >= 26
// Wrap an AHardwareBuffer so it can be imported and sampled as a vulkan
// image.  init() queries the buffer's properties and creates the sampler
// ycbcr conversion for its external format.
// NOTE(review): the return value of init() is ignored, so a failed
// initialization leaves samplerYcbcrConversion at 0 — confirm callers
// check object state before use.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb)
    : VkAllocator(_vkdev), hb(_hb)
{
    samplerYcbcrConversion = 0;

    init();
}
1807 
~VkAndroidHardwareBufferImageAllocator()1808 VkAndroidHardwareBufferImageAllocator::~VkAndroidHardwareBufferImageAllocator()
1809 {
1810     if (samplerYcbcrConversion)
1811     {
1812         vkdev->vkDestroySamplerYcbcrConversionKHR(vkdev->vkdevice(), samplerYcbcrConversion, 0);
1813         samplerYcbcrConversion = 0;
1814     }
1815 }
1816 
// Copy construction is not meaningfully supported: the stub leaves the
// device null and hb / samplerYcbcrConversion / buffer descriptors
// uninitialized, so a copy must never actually be used.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&)
    : VkAllocator(0)
{
}
1821 
// Copy assignment is not meaningfully supported: the stub performs no copy
// at all and simply returns *this unchanged.
VkAndroidHardwareBufferImageAllocator& VkAndroidHardwareBufferImageAllocator::operator=(const VkAndroidHardwareBufferImageAllocator&)
{
    return *this;
}
1826 
// Buffer allocation is not supported by this image-import allocator.
// Returns null unconditionally.
VkBufferMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(size_t /*size*/)
{
    return 0;
}
1831 
// No-op counterpart of the unsupported buffer fastMalloc above.
void VkAndroidHardwareBufferImageAllocator::fastFree(VkBufferMemory* /*ptr*/)
{
}
1835 
fastMalloc(int,int,int,size_t,int)1836 VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
1837 {
1838     VkResult ret;
1839 
1840     VkExternalFormatANDROID externalFormat;
1841     externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
1842     externalFormat.pNext = 0;
1843     externalFormat.externalFormat = bufferFormatProperties.externalFormat;
1844 
1845     VkExternalMemoryImageCreateInfo externalMemoryImageCreateInfo;
1846     externalMemoryImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO,
1847     externalMemoryImageCreateInfo.pNext = &externalFormat,
1848     externalMemoryImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID;
1849 
1850     VkImageCreateInfo imageCreateInfo;
1851     imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
1852     imageCreateInfo.pNext = &externalMemoryImageCreateInfo;
1853     imageCreateInfo.flags = 0;
1854     imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
1855     imageCreateInfo.format = VK_FORMAT_UNDEFINED;
1856     imageCreateInfo.extent.width = bufferDesc.width;
1857     imageCreateInfo.extent.height = bufferDesc.height;
1858     imageCreateInfo.extent.depth = 1;
1859     imageCreateInfo.mipLevels = 1;
1860     imageCreateInfo.arrayLayers = 1;
1861     imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
1862     imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
1863     imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT;
1864     imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
1865     imageCreateInfo.queueFamilyIndexCount = 0;
1866     imageCreateInfo.pQueueFamilyIndices = 0;
1867     imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
1868 
1869     VkImage image = 0;
1870     ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
1871     if (ret != VK_SUCCESS)
1872     {
1873         NCNN_LOGE("vkCreateImage failed %d", ret);
1874         return 0;
1875     }
1876 
1877     // setup memory type
1878     if (image_memory_type_index == (uint32_t)-1)
1879     {
1880         image_memory_type_index = vkdev->find_memory_index(bufferProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
1881     }
1882 
1883     VkImportAndroidHardwareBufferInfoANDROID importAndroidHardwareBufferInfo;
1884     importAndroidHardwareBufferInfo.sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID;
1885     importAndroidHardwareBufferInfo.pNext = 0;
1886     importAndroidHardwareBufferInfo.buffer = hb;
1887 
1888     VkMemoryDedicatedAllocateInfo memoryDedicatedAllocateInfo;
1889     memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
1890     memoryDedicatedAllocateInfo.pNext = &importAndroidHardwareBufferInfo;
1891     memoryDedicatedAllocateInfo.image = image;
1892     memoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE;
1893 
1894     VkMemoryAllocateInfo memoryAllocateInfo;
1895     memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
1896     memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;
1897     memoryAllocateInfo.allocationSize = bufferProperties.allocationSize;
1898     memoryAllocateInfo.memoryTypeIndex = image_memory_type_index;
1899 
1900     VkDeviceMemory memory = 0;
1901     ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
1902     if (ret != VK_SUCCESS)
1903     {
1904         NCNN_LOGE("vkAllocateMemory failed %d", ret);
1905         return 0;
1906     }
1907 
1908     VkBindImageMemoryInfo bindImageMemoryInfo;
1909     bindImageMemoryInfo.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
1910     bindImageMemoryInfo.pNext = 0;
1911     bindImageMemoryInfo.image = image;
1912     bindImageMemoryInfo.memory = memory;
1913     bindImageMemoryInfo.memoryOffset = 0;
1914     ret = vkdev->vkBindImageMemory2KHR(vkdev->vkdevice(), 1, &bindImageMemoryInfo);
1915     if (ret != VK_SUCCESS)
1916     {
1917         NCNN_LOGE("vkBindImageMemory2KHR failed %d", ret);
1918         vkDestroyImage(vkdev->vkdevice(), image, 0);
1919         return 0;
1920     }
1921 
1922     VkSamplerYcbcrConversionInfoKHR samplerYcbcrConversionInfo;
1923     samplerYcbcrConversionInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO_KHR;
1924     samplerYcbcrConversionInfo.pNext = &externalFormat;
1925     samplerYcbcrConversionInfo.conversion = samplerYcbcrConversion;
1926 
1927     VkImageViewCreateInfo imageViewCreateInfo;
1928     imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
1929     imageViewCreateInfo.pNext = &samplerYcbcrConversionInfo;
1930     imageViewCreateInfo.flags = 0;
1931     imageViewCreateInfo.image = image;
1932     imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
1933     imageViewCreateInfo.format = VK_FORMAT_UNDEFINED;
1934     imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
1935     imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
1936     imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
1937     imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
1938     imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
1939     imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
1940     imageViewCreateInfo.subresourceRange.levelCount = 1;
1941     imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
1942     imageViewCreateInfo.subresourceRange.layerCount = 1;
1943 
1944     VkImageView imageview = 0;
1945     ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
1946     if (ret != VK_SUCCESS)
1947     {
1948         NCNN_LOGE("vkCreateImageView failed %d", ret);
1949         vkDestroyImage(vkdev->vkdevice(), image, 0);
1950         vkFreeMemory(vkdev->vkdevice(), memory, 0);
1951         return 0;
1952     }
1953 
1954     VkImageMemory* ptr = new VkImageMemory;
1955     ptr->image = image;
1956     ptr->memory = memory;
1957     ptr->imageview = imageview;
1958     ptr->access_flags = 0;
1959     ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
1960     ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1961 
1962     return ptr;
1963 }
1964 
fastFree(VkImageMemory * ptr)1965 void VkAndroidHardwareBufferImageAllocator::fastFree(VkImageMemory* ptr)
1966 {
1967     vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
1968     vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
1969     vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
1970 
1971     delete ptr;
1972 }
1973 
// Query the hardware buffer's description and vulkan properties, then
// create the sampler ycbcr conversion needed to sample its
// implementation-defined external format.
// Fills bufferDesc, bufferProperties, bufferFormatProperties and
// samplerYcbcrConversion as side effects.
// @return 0 on success, -1 on any vulkan failure.
int VkAndroidHardwareBufferImageAllocator::init()
{
    AHardwareBuffer_describe(hb, &bufferDesc);

    VkResult ret;

    // resolve externalFormat
    bufferFormatProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID;
    bufferFormatProperties.pNext = 0;

    // chain the format-properties struct so one query fills both
    bufferProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID;
    bufferProperties.pNext = &bufferFormatProperties;

    ret = vkGetAndroidHardwareBufferPropertiesANDROID(vkdev->vkdevice(), hb, &bufferProperties);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkGetAndroidHardwareBufferPropertiesANDROID failed %d", ret);
        return -1;
    }

    // setup samplerYcbcrConversion
    VkExternalFormatANDROID externalFormat;
    externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
    externalFormat.pNext = 0;
    externalFormat.externalFormat = bufferFormatProperties.externalFormat;

    // use the driver-suggested model/range/offsets reported for this buffer
    VkSamplerYcbcrConversionCreateInfoKHR samplerYcbcrConversionCreateInfo;
    samplerYcbcrConversionCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO_KHR;
    samplerYcbcrConversionCreateInfo.pNext = &externalFormat;
    samplerYcbcrConversionCreateInfo.format = VK_FORMAT_UNDEFINED;
    samplerYcbcrConversionCreateInfo.ycbcrModel = bufferFormatProperties.suggestedYcbcrModel;
    samplerYcbcrConversionCreateInfo.ycbcrRange = bufferFormatProperties.suggestedYcbcrRange;
    samplerYcbcrConversionCreateInfo.components = bufferFormatProperties.samplerYcbcrConversionComponents;
    samplerYcbcrConversionCreateInfo.xChromaOffset = bufferFormatProperties.suggestedXChromaOffset;
    samplerYcbcrConversionCreateInfo.yChromaOffset = bufferFormatProperties.suggestedYChromaOffset;
    samplerYcbcrConversionCreateInfo.chromaFilter = VK_FILTER_NEAREST;
    samplerYcbcrConversionCreateInfo.forceExplicitReconstruction = VK_FALSE;

    ret = vkdev->vkCreateSamplerYcbcrConversionKHR(vkdev->vkdevice(), &samplerYcbcrConversionCreateInfo, 0, &samplerYcbcrConversion);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateSamplerYcbcrConversionKHR failed %d", ret);
        return -1;
    }

    return 0;
}
2021 
width() const2022 int VkAndroidHardwareBufferImageAllocator::width() const
2023 {
2024     return bufferDesc.width;
2025 }
2026 
height() const2027 int VkAndroidHardwareBufferImageAllocator::height() const
2028 {
2029     return bufferDesc.height;
2030 }
2031 
// Implementation-defined external format of the hardware buffer, resolved
// by vkGetAndroidHardwareBufferPropertiesANDROID in init().
uint64_t VkAndroidHardwareBufferImageAllocator::external_format() const
{
    return bufferFormatProperties.externalFormat;
}
2036 #endif // __ANDROID_API__ >= 26
2037 
2038 #endif // NCNN_VULKAN
2039 
2040 } // namespace ncnn
2041