// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "gpu.h"

#if NCNN_VULKAN

#include <math.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "glslang/SPIRV/GlslangToSpv.h"
#include "glslang/Public/ShaderLang.h"

#include "command.h"
#include "layer.h"
#include "layer/vulkan/packing_vulkan.h"
#include "layer_type.h"
#include "mat.h"
#include "pipelinecache.h"

// There is a known issue where vkDestroyDebugUtilsMessengerEXT crashes on exit when the vulkan validation layer is enabled
// upstream fix https://github.com/KhronosGroup/Vulkan-Loader/pull/539
#define ENABLE_VALIDATION_LAYER 0

namespace ncnn {

// global
static Mutex g_instance_lock;

class __ncnn_vulkan_instance_holder
{
public:
    __ncnn_vulkan_instance_holder()
    {
        instance = 0;
#if ENABLE_VALIDATION_LAYER
        callback = 0;
#endif
    }

    ~__ncnn_vulkan_instance_holder()
    {
        destroy_gpu_instance();
    }

    operator VkInstance()
    {
        return instance;
    }

    VkInstance instance;
#if ENABLE_VALIDATION_LAYER
    VkDebugUtilsMessengerEXT callback;
#endif
};
static __ncnn_vulkan_instance_holder g_instance;
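
// NOTE the holder above ties the VkInstance lifetime to static destruction,
// so destroy_gpu_instance() runs automatically at program exit even if the
// user never calls it explicitly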

static int g_gpu_count = 0;
static int g_default_gpu_index = -1;

// NOTE 8 should be large enough in practice ...
#define NCNN_MAX_GPU_COUNT 8
static GpuInfo* g_gpu_infos[NCNN_MAX_GPU_COUNT] = {0};

// default vulkan device
static Mutex g_default_vkdev_lock;
static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0};

struct layer_shader_registry_entry
{
    const char* comp_data;
    int comp_data_size;
};

#include "layer_shader_spv_data.h"

static const layer_shader_registry_entry layer_shader_registry[] = {
#include "layer_shader_registry.h"
};

static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
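
// A minimal lookup sketch (illustrative only; the real consumers live
// elsewhere in ncnn, and shader_type_index is a hypothetical name): each
// generated shader index selects one embedded blob, e.g.
//   const layer_shader_registry_entry& entry = layer_shader_registry[shader_type_index];
//   // entry.comp_data points to entry.comp_data_size bytes of shader data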

int support_VK_KHR_external_memory_capabilities = 0;
int support_VK_KHR_get_physical_device_properties2 = 0;
int support_VK_KHR_get_surface_capabilities2 = 0;
int support_VK_KHR_surface = 0;
int support_VK_EXT_debug_utils = 0;
#if __ANDROID_API__ >= 26
int support_VK_KHR_android_surface = 0;
#endif // __ANDROID_API__ >= 26

// VK_KHR_external_memory_capabilities
PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0;

// VK_KHR_get_physical_device_properties2
PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0;
PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0;
PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0;
PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0;
PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0;
PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0;
PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR = 0;

// VK_KHR_get_surface_capabilities2
PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0;
PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0;

// VK_KHR_surface
PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0;
PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0;
PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0;
PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0;
PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0;

#if __ANDROID_API__ >= 26
// VK_KHR_android_surface
PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0;
#endif // __ANDROID_API__ >= 26

class GpuInfoPrivate
{
public:
    // vulkan physical device
    VkPhysicalDevice physical_device;

    // memory properties
    VkPhysicalDeviceMemoryProperties physical_device_memory_properties;

    // info
    uint32_t api_version;
    uint32_t driver_version;
    uint32_t vendor_id;
    uint32_t device_id;
    char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
    uint8_t pipeline_cache_uuid[VK_UUID_SIZE];

    // 0 = discrete gpu
    // 1 = integrated gpu
    // 2 = virtual gpu
    // 3 = cpu
    int type;

    // hardware limit
    uint32_t max_shared_memory_size;
    uint32_t max_workgroup_count_x;
    uint32_t max_workgroup_count_y;
    uint32_t max_workgroup_count_z;
    uint32_t max_workgroup_invocations;
    uint32_t max_workgroup_size_x;
    uint32_t max_workgroup_size_y;
    uint32_t max_workgroup_size_z;
    size_t memory_map_alignment;
    size_t buffer_offset_alignment;
    size_t non_coherent_atom_size;
    size_t buffer_image_granularity;
    uint32_t max_image_dimension_1d;
    uint32_t max_image_dimension_2d;
    uint32_t max_image_dimension_3d;
    float timestamp_period;

    // runtime
    uint32_t compute_queue_family_index;
    uint32_t graphics_queue_family_index;
    uint32_t transfer_queue_family_index;

    uint32_t compute_queue_count;
    uint32_t graphics_queue_count;
    uint32_t transfer_queue_count;

    // property
    bool unified_compute_transfer_queue;

    // subgroup
    uint32_t subgroup_size;
    bool support_subgroup_basic;
    bool support_subgroup_vote;
    bool support_subgroup_ballot;
    bool support_subgroup_shuffle;

    // bug is not feature
    bool bug_storage_buffer_no_l1;
    bool bug_corrupted_online_pipeline_cache;
    bool bug_buffer_image_load_zero;

    // but sometimes bug is a feature
    bool bug_implicit_fp16_arithmetic;

    // fp16 and int8 feature
    bool support_fp16_packed;
    bool support_fp16_storage;
    bool support_fp16_arithmetic;
    bool support_int8_packed;
    bool support_int8_storage;
    bool support_int8_arithmetic;

    // ycbcr conversion feature
    bool support_ycbcr_conversion;

    bool support_reserved_0;

    // extension capability
    int support_VK_KHR_8bit_storage;
    int support_VK_KHR_16bit_storage;
    int support_VK_KHR_bind_memory2;
    int support_VK_KHR_create_renderpass2;
    int support_VK_KHR_dedicated_allocation;
    int support_VK_KHR_descriptor_update_template;
    int support_VK_KHR_external_memory;
    int support_VK_KHR_get_memory_requirements2;
    int support_VK_KHR_maintenance1;
    int support_VK_KHR_maintenance2;
    int support_VK_KHR_maintenance3;
    int support_VK_KHR_multiview;
    int support_VK_KHR_push_descriptor;
    int support_VK_KHR_sampler_ycbcr_conversion;
    int support_VK_KHR_shader_float16_int8;
    int support_VK_KHR_shader_float_controls;
    int support_VK_KHR_storage_buffer_storage_class;
    int support_VK_KHR_swapchain;
    int support_VK_EXT_descriptor_indexing;
    int support_VK_EXT_memory_budget;
    int support_VK_EXT_queue_family_foreign;
#if __ANDROID_API__ >= 26
    int support_VK_ANDROID_external_memory_android_hardware_buffer;
#endif // __ANDROID_API__ >= 26
};

GpuInfo::GpuInfo()
    : d(new GpuInfoPrivate)
{
}

GpuInfo::~GpuInfo()
{
    delete d;
}

GpuInfo::GpuInfo(const GpuInfo&)
    : d(0)
{
}

GpuInfo& GpuInfo::operator=(const GpuInfo&)
{
    return *this;
}
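
// NOTE GpuInfo uses the pimpl idiom; the copy constructor and assignment
// operator above are deliberately inert (the private data is never shared
// or duplicated), so a GpuInfo cannot be meaningfully copied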

VkPhysicalDevice GpuInfo::physical_device() const
{
    return d->physical_device;
}

const VkPhysicalDeviceMemoryProperties& GpuInfo::physical_device_memory_properties() const
{
    return d->physical_device_memory_properties;
}

uint32_t GpuInfo::api_version() const
{
    return d->api_version;
}

uint32_t GpuInfo::driver_version() const
{
    return d->driver_version;
}

uint32_t GpuInfo::vendor_id() const
{
    return d->vendor_id;
}

uint32_t GpuInfo::device_id() const
{
    return d->device_id;
}

const char* GpuInfo::device_name() const
{
    return d->device_name;
}

uint8_t* GpuInfo::pipeline_cache_uuid() const
{
    return d->pipeline_cache_uuid;
}

int GpuInfo::type() const
{
    return d->type;
}

uint32_t GpuInfo::max_shared_memory_size() const
{
    return d->max_shared_memory_size;
}

uint32_t GpuInfo::max_workgroup_count_x() const
{
    return d->max_workgroup_count_x;
}

uint32_t GpuInfo::max_workgroup_count_y() const
{
    return d->max_workgroup_count_y;
}

uint32_t GpuInfo::max_workgroup_count_z() const
{
    return d->max_workgroup_count_z;
}

uint32_t GpuInfo::max_workgroup_invocations() const
{
    return d->max_workgroup_invocations;
}

uint32_t GpuInfo::max_workgroup_size_x() const
{
    return d->max_workgroup_size_x;
}

uint32_t GpuInfo::max_workgroup_size_y() const
{
    return d->max_workgroup_size_y;
}

uint32_t GpuInfo::max_workgroup_size_z() const
{
    return d->max_workgroup_size_z;
}

size_t GpuInfo::memory_map_alignment() const
{
    return d->memory_map_alignment;
}

size_t GpuInfo::buffer_offset_alignment() const
{
    return d->buffer_offset_alignment;
}

size_t GpuInfo::non_coherent_atom_size() const
{
    return d->non_coherent_atom_size;
}

size_t GpuInfo::buffer_image_granularity() const
{
    return d->buffer_image_granularity;
}

uint32_t GpuInfo::max_image_dimension_1d() const
{
    return d->max_image_dimension_1d;
}

uint32_t GpuInfo::max_image_dimension_2d() const
{
    return d->max_image_dimension_2d;
}

uint32_t GpuInfo::max_image_dimension_3d() const
{
    return d->max_image_dimension_3d;
}

float GpuInfo::timestamp_period() const
{
    return d->timestamp_period;
}

uint32_t GpuInfo::compute_queue_family_index() const
{
    return d->compute_queue_family_index;
}

uint32_t GpuInfo::graphics_queue_family_index() const
{
    return d->graphics_queue_family_index;
}

uint32_t GpuInfo::transfer_queue_family_index() const
{
    return d->transfer_queue_family_index;
}

uint32_t GpuInfo::compute_queue_count() const
{
    return d->compute_queue_count;
}

uint32_t GpuInfo::graphics_queue_count() const
{
    return d->graphics_queue_count;
}

uint32_t GpuInfo::transfer_queue_count() const
{
    return d->transfer_queue_count;
}

bool GpuInfo::unified_compute_transfer_queue() const
{
    return d->unified_compute_transfer_queue;
}

uint32_t GpuInfo::subgroup_size() const
{
    return d->subgroup_size;
}

bool GpuInfo::support_subgroup_basic() const
{
    return d->support_subgroup_basic;
}

bool GpuInfo::support_subgroup_vote() const
{
    return d->support_subgroup_vote;
}

bool GpuInfo::support_subgroup_ballot() const
{
    return d->support_subgroup_ballot;
}

bool GpuInfo::support_subgroup_shuffle() const
{
    return d->support_subgroup_shuffle;
}

bool GpuInfo::bug_storage_buffer_no_l1() const
{
    return d->bug_storage_buffer_no_l1;
}

bool GpuInfo::bug_corrupted_online_pipeline_cache() const
{
    return d->bug_corrupted_online_pipeline_cache;
}

bool GpuInfo::bug_buffer_image_load_zero() const
{
    return d->bug_buffer_image_load_zero;
}

bool GpuInfo::bug_implicit_fp16_arithmetic() const
{
    return d->bug_implicit_fp16_arithmetic;
}

bool GpuInfo::support_fp16_packed() const
{
    return d->support_fp16_packed;
}

bool GpuInfo::support_fp16_storage() const
{
    return d->support_fp16_storage;
}

bool GpuInfo::support_fp16_arithmetic() const
{
    return d->support_fp16_arithmetic;
}

bool GpuInfo::support_int8_packed() const
{
    return d->support_int8_packed;
}

bool GpuInfo::support_int8_storage() const
{
    return d->support_int8_storage;
}

bool GpuInfo::support_int8_arithmetic() const
{
    return d->support_int8_arithmetic;
}

bool GpuInfo::support_ycbcr_conversion() const
{
    return d->support_ycbcr_conversion;
}

int GpuInfo::support_VK_KHR_8bit_storage() const
{
    return d->support_VK_KHR_8bit_storage;
}

int GpuInfo::support_VK_KHR_16bit_storage() const
{
    return d->support_VK_KHR_16bit_storage;
}

int GpuInfo::support_VK_KHR_bind_memory2() const
{
    return d->support_VK_KHR_bind_memory2;
}

int GpuInfo::support_VK_KHR_create_renderpass2() const
{
    return d->support_VK_KHR_create_renderpass2;
}

int GpuInfo::support_VK_KHR_dedicated_allocation() const
{
    return d->support_VK_KHR_dedicated_allocation;
}

int GpuInfo::support_VK_KHR_descriptor_update_template() const
{
    return d->support_VK_KHR_descriptor_update_template;
}

int GpuInfo::support_VK_KHR_external_memory() const
{
    return d->support_VK_KHR_external_memory;
}

int GpuInfo::support_VK_KHR_get_memory_requirements2() const
{
    return d->support_VK_KHR_get_memory_requirements2;
}

int GpuInfo::support_VK_KHR_maintenance1() const
{
    return d->support_VK_KHR_maintenance1;
}

int GpuInfo::support_VK_KHR_maintenance2() const
{
    return d->support_VK_KHR_maintenance2;
}

int GpuInfo::support_VK_KHR_maintenance3() const
{
    return d->support_VK_KHR_maintenance3;
}

int GpuInfo::support_VK_KHR_multiview() const
{
    return d->support_VK_KHR_multiview;
}

int GpuInfo::support_VK_KHR_push_descriptor() const
{
    return d->support_VK_KHR_push_descriptor;
}

int GpuInfo::support_VK_KHR_sampler_ycbcr_conversion() const
{
    return d->support_VK_KHR_sampler_ycbcr_conversion;
}

int GpuInfo::support_VK_KHR_shader_float16_int8() const
{
    return d->support_VK_KHR_shader_float16_int8;
}

int GpuInfo::support_VK_KHR_shader_float_controls() const
{
    return d->support_VK_KHR_shader_float_controls;
}

int GpuInfo::support_VK_KHR_storage_buffer_storage_class() const
{
    return d->support_VK_KHR_storage_buffer_storage_class;
}

int GpuInfo::support_VK_KHR_swapchain() const
{
    return d->support_VK_KHR_swapchain;
}

int GpuInfo::support_VK_EXT_descriptor_indexing() const
{
    return d->support_VK_EXT_descriptor_indexing;
}

int GpuInfo::support_VK_EXT_memory_budget() const
{
    return d->support_VK_EXT_memory_budget;
}

int GpuInfo::support_VK_EXT_queue_family_foreign() const
{
    return d->support_VK_EXT_queue_family_foreign;
}

#if __ANDROID_API__ >= 26
int GpuInfo::support_VK_ANDROID_external_memory_android_hardware_buffer() const
{
    return d->support_VK_ANDROID_external_memory_android_hardware_buffer;
}
#endif // __ANDROID_API__ >= 26

static int init_instance_extension()
{
    if (support_VK_KHR_external_memory_capabilities)
    {
        vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR");
    }

    if (support_VK_KHR_get_physical_device_properties2)
    {
        vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR");
        vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR");
        vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR");
        vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR");
        vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR");
        vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR");
        vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR");
    }

    if (support_VK_KHR_get_surface_capabilities2)
    {
        vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR");
        vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR");
    }

    if (support_VK_KHR_surface)
    {
        vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR");
        vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR");
        vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR");
        vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR");
        vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR");
    }

#if __ANDROID_API__ >= 26
    if (support_VK_KHR_android_surface)
    {
        vkCreateAndroidSurfaceKHR = (PFN_vkCreateAndroidSurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkCreateAndroidSurfaceKHR");
    }
#endif // __ANDROID_API__ >= 26

    return 0;
}

#if ENABLE_VALIDATION_LAYER
static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
    VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/,
    VkDebugUtilsMessageTypeFlagsEXT /*messageType*/,
    const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData,
    void* /*pUserData*/)
{
    NCNN_LOGE("validation layer: %s", pCallbackData->pMessage);

    return VK_FALSE;
}

static VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback)
{
    PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT");
    if (func)
        return func(instance, pCreateInfo, pAllocator, pCallback);

    return VK_ERROR_EXTENSION_NOT_PRESENT;
}

static void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator)
{
    PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT");
    if (func)
        func(instance, callback, pAllocator);
}
#endif // ENABLE_VALIDATION_LAYER

static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
{
    // first try, compute only queue
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
    }

    // second try, any queue with compute and graphics
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
    }

    // third try, any queue with compute
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
        {
            return i;
        }
    }

    //     NCNN_LOGE("no compute queue");
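    // NOTE the return type is uint32_t, so this -1 wraps to (uint32_t)-1,
    // the sentinel value callers compare against (the graphics and transfer
    // finders below behave the same way)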
    return -1;
}

static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
{
    // first try, graphics only queue
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT))
        {
            return i;
        }
    }

    // second try, any queue with graphics and compute
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
                && (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT))
        {
            return i;
        }
    }

    // third try, any queue with graphics
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
        {
            return i;
        }
    }

    //     NCNN_LOGE("no graphics queue");
    return -1;
}

static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
{
    // first try, transfer only queue
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
    }

    // second try, any queue with transfer
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if (queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
        {
            return i;
        }
    }

    // third try, use compute queue
    uint32_t compute_queue_index = find_device_compute_queue(queueFamilyProperties);
    if (compute_queue_index != (uint32_t)-1)
    {
        return compute_queue_index;
    }

    // fourth try, use graphics queue
    uint32_t graphics_queue_index = find_device_graphics_queue(queueFamilyProperties);
    if (graphics_queue_index != (uint32_t)-1)
    {
        return graphics_queue_index;
    }

    //     NCNN_LOGE("no transfer queue");
    return -1;
}

static int find_default_vulkan_device_index()
{
    // first try, discrete gpu
    for (int i = 0; i < g_gpu_count; i++)
    {
        if (g_gpu_infos[i]->type() == 0)
            return i;
    }

    // second try, integrated gpu
    for (int i = 0; i < g_gpu_count; i++)
    {
        if (g_gpu_infos[i]->type() == 1)
            return i;
    }

    // third try, any probed device
    if (g_gpu_count > 0)
        return 0;

    NCNN_LOGE("no vulkan device");
    return -1;
}
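
// A minimal usage sketch (assuming the public helpers declared in gpu.h,
// e.g. get_default_gpu_index() and get_gpu_device()):
//   ncnn::create_gpu_instance();
//   ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(ncnn::get_default_gpu_index());
//   // ... record and submit vulkan compute work via vkdev ...
//   ncnn::destroy_gpu_instance();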

int create_gpu_instance()
{
    MutexLockGuard lock(g_instance_lock);

    if ((VkInstance)g_instance != 0)
        return 0;

    // NCNN_LOGE("create_gpu_instance");

    VkResult ret;

    std::vector<const char*> enabledLayers;

#if ENABLE_VALIDATION_LAYER
    uint32_t instanceLayerPropertyCount;
    ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
        return -1;
    }

    std::vector<VkLayerProperties> instanceLayerProperties(instanceLayerPropertyCount);
    ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data());
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
        return -1;
    }

    for (uint32_t i = 0; i < instanceLayerPropertyCount; i++)
    {
        const VkLayerProperties& lp = instanceLayerProperties[i];
        //         NCNN_LOGE("instance layer %s = %u", lp.layerName, lp.implementationVersion);

        if (strcmp(lp.layerName, "VK_LAYER_LUNARG_standard_validation") == 0)
        {
            enabledLayers.push_back("VK_LAYER_LUNARG_standard_validation");
        }
        if (strcmp(lp.layerName, "VK_LAYER_LUNARG_parameter_validation") == 0)
        {
            enabledLayers.push_back("VK_LAYER_LUNARG_parameter_validation");
        }
        if (strcmp(lp.layerName, "VK_LAYER_KHRONOS_validation") == 0)
        {
            enabledLayers.push_back("VK_LAYER_KHRONOS_validation");
        }
    }
#endif // ENABLE_VALIDATION_LAYER

    std::vector<const char*> enabledExtensions;

    uint32_t instanceExtensionPropertyCount;
    ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
        return -1;
    }

    std::vector<VkExtensionProperties> instanceExtensionProperties(instanceExtensionPropertyCount);
    ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data());
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
        return -1;
    }

    support_VK_KHR_get_physical_device_properties2 = 0;
    support_VK_KHR_get_surface_capabilities2 = 0;
    support_VK_KHR_surface = 0;
    support_VK_EXT_debug_utils = 0;
#if __ANDROID_API__ >= 26
    support_VK_KHR_android_surface = 0;
#endif // __ANDROID_API__ >= 26
    for (uint32_t j = 0; j < instanceExtensionPropertyCount; j++)
    {
        const VkExtensionProperties& exp = instanceExtensionProperties[j];
        //         NCNN_LOGE("instance extension %s = %u", exp.extensionName, exp.specVersion);

        if (strcmp(exp.extensionName, "VK_KHR_external_memory_capabilities") == 0)
            support_VK_KHR_external_memory_capabilities = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_KHR_get_physical_device_properties2") == 0)
            support_VK_KHR_get_physical_device_properties2 = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_KHR_get_surface_capabilities2") == 0)
            support_VK_KHR_get_surface_capabilities2 = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_KHR_surface") == 0)
            support_VK_KHR_surface = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_EXT_debug_utils") == 0)
            support_VK_EXT_debug_utils = exp.specVersion;
#if __ANDROID_API__ >= 26
        else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0)
            support_VK_KHR_android_surface = exp.specVersion;
#endif // __ANDROID_API__ >= 26
    }

    if (support_VK_KHR_external_memory_capabilities)
        enabledExtensions.push_back("VK_KHR_external_memory_capabilities");
    if (support_VK_KHR_get_physical_device_properties2)
        enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
    if (support_VK_KHR_get_surface_capabilities2)
        enabledExtensions.push_back("VK_KHR_get_surface_capabilities2");
    if (support_VK_KHR_surface)
        enabledExtensions.push_back("VK_KHR_surface");
#if ENABLE_VALIDATION_LAYER
    if (support_VK_EXT_debug_utils)
        enabledExtensions.push_back("VK_EXT_debug_utils");
#endif // ENABLE_VALIDATION_LAYER
#if __ANDROID_API__ >= 26
    if (support_VK_KHR_android_surface)
        enabledExtensions.push_back("VK_KHR_android_surface");
#endif // __ANDROID_API__ >= 26

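    // vkEnumerateInstanceVersion only exists on Vulkan 1.1+ loaders, so it is
    // resolved dynamically below; on a 1.0 loader the lookup yields null and
    // the instance api version stays at 1.0.0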
    uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
    typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
    PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
    if (vkEnumerateInstanceVersion)
    {
        ret = vkEnumerateInstanceVersion(&instance_api_version);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkEnumerateInstanceVersion failed %d", ret);
            return -1;
        }
    }

    // NCNN_LOGE("instance apiVersion = %u.%u.%u", VK_VERSION_MAJOR(instance_api_version), VK_VERSION_MINOR(instance_api_version), VK_VERSION_PATCH(instance_api_version));

    VkApplicationInfo applicationInfo;
    applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    applicationInfo.pNext = 0;
    applicationInfo.pApplicationName = "ncnn";
    applicationInfo.applicationVersion = 0;
    applicationInfo.pEngineName = "ncnn";
    applicationInfo.engineVersion = 20201010;
    applicationInfo.apiVersion = instance_api_version;

    VkInstanceCreateInfo instanceCreateInfo;
    instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    instanceCreateInfo.pNext = 0;
    instanceCreateInfo.flags = 0;
    instanceCreateInfo.pApplicationInfo = &applicationInfo;
    instanceCreateInfo.enabledLayerCount = enabledLayers.size();
    instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data();
    instanceCreateInfo.enabledExtensionCount = enabledExtensions.size();
    instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();

    VkInstance instance = 0;
    ret = vkCreateInstance(&instanceCreateInfo, 0, &instance);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateInstance failed %d", ret);
        return -1;
    }

    g_instance.instance = instance;

#if ENABLE_VALIDATION_LAYER
    if (support_VK_EXT_debug_utils)
    {
        VkDebugUtilsMessengerCreateInfoEXT createInfo = {};
        createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
        createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
        createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
        createInfo.pfnUserCallback = debugCallback;
        createInfo.pUserData = 0;
        ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &g_instance.callback);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("CreateDebugUtilsMessengerEXT failed %d", ret);
            return -1;
        }
    }
#endif // ENABLE_VALIDATION_LAYER

    init_instance_extension();

    uint32_t physicalDeviceCount = 0;
    ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
        return -1;
    }

    if (physicalDeviceCount > NCNN_MAX_GPU_COUNT)
        physicalDeviceCount = NCNN_MAX_GPU_COUNT;

    std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);

    ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data());
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
        return -1;
    }

    // find proper device and queue
    int gpu_info_index = 0;
    for (uint32_t i = 0; i < physicalDeviceCount; i++)
    {
        const VkPhysicalDevice& physicalDevice = physicalDevices[i];
        delete g_gpu_infos[gpu_info_index];
        g_gpu_infos[gpu_info_index] = new GpuInfo;
        GpuInfoPrivate& gpu_info = *(g_gpu_infos[gpu_info_index]->d);

        // device type
        VkPhysicalDeviceProperties physicalDeviceProperties;
        vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties);

        //         NCNN_LOGE("[%u] apiVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion),
        //             VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion));
        //         NCNN_LOGE("[%u] driverVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion),
        //             VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion));
        //         NCNN_LOGE("[%u] vendorID = %x", i, physicalDeviceProperties.vendorID);
        //         NCNN_LOGE("[%u] deviceID = %x", i, physicalDeviceProperties.deviceID);
        //         NCNN_LOGE("[%u] deviceType = %x", i, physicalDeviceProperties.deviceType);
        //         NCNN_LOGE("[%u] deviceName = %s", i, physicalDeviceProperties.deviceName);
        //         NCNN_LOGE("[%u] pipelineCacheUUID = %u", i, physicalDeviceProperties.pipelineCacheUUID);

        // mali
        // t760 = 0x13b5 0x7500001
        // t860 = 0x13b5 0x8602000
        // t880 = 0x13b5 0x8800020
        // g51  = 0x13b5 0x70901010
        // g52  = 0x13b5 0x74021000
        // g71  = 0x13b5 0x60a00002
        // g72  = 0x13b5 0x62210001
        // g76  = 0x13b5 0x72110000
        // g77  = 0x13b5 0x90800011

        // adreno
        // 506 = 0x5143 0x5000600
        // 510 = 0x5143 0x5010000
        // 512 = 0x5143 0x5010200
        // 530 = 0x5143 0x5030004
        // 540 = 0x5143 0x5040001
        // 616 = 0x5143 0x6010600
        // 630 = 0x5143 0x6030001
        // 640 = 0x5143 0x6040001
        // 650 = 0x5143 0x6050002

        gpu_info.bug_storage_buffer_no_l1 = false;
        gpu_info.bug_corrupted_online_pipeline_cache = false;
        gpu_info.bug_implicit_fp16_arithmetic = false;

        if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 66))
        {
            // old buggy qcom adreno drivers cannot share created pipelines properly
            gpu_info.bug_corrupted_online_pipeline_cache = true;
        }

        if (physicalDeviceProperties.vendorID == 0x5143 && !(physicalDeviceProperties.deviceID == 0x6040001 || physicalDeviceProperties.deviceID == 0x6050002))
        {
            // qcom adreno reads storage buffers without the L1 cache
            // NOTE qcom855/qcom855plus/qcom865 are known exceptions
            gpu_info.bug_storage_buffer_no_l1 = true;
        }

        if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 1, 87))
        {
            // HACK a buffer2image copy before an image-read dependency does not work properly
            // on old adreno drivers, even when promised with a full image memory barrier
            // TODO figure out a proper workaround without hurting speed too much
            // TODO only apply this on old drivers
            gpu_info.bug_buffer_image_load_zero = true;
        }

        if (physicalDeviceProperties.vendorID == 0x13b5
                && (physicalDeviceProperties.deviceID == 0x7500001
                    || physicalDeviceProperties.deviceID == 0x8602000
                    || physicalDeviceProperties.deviceID == 0x8800020
                    || physicalDeviceProperties.deviceID == 0x70901010
                    || physicalDeviceProperties.deviceID == 0x74021000
                    || physicalDeviceProperties.deviceID == 0x60a00002
                    || physicalDeviceProperties.deviceID == 0x62210001))
        {
            // NOTE rk3288/rk3399/t880/g51/g52/g71/g72
            // however, g76/g77 have explicit fp16 arithmetic
            // the arm mali driver accepts spirv with fp16 arithmetic
            gpu_info.bug_implicit_fp16_arithmetic = true;
        }

        if (physicalDeviceProperties.vendorID == 0x5143
                && (physicalDeviceProperties.deviceID == 0x6030001
                    || physicalDeviceProperties.deviceID == 0x6040001
                    || physicalDeviceProperties.deviceID == 0x6050002))
        {
            // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
            // the qcom adreno driver accepts spirv with fp16 arithmetic
            gpu_info.bug_implicit_fp16_arithmetic = true;
        }

        gpu_info.physical_device = physicalDevice;

        // info
        gpu_info.api_version = physicalDeviceProperties.apiVersion;
        gpu_info.driver_version = physicalDeviceProperties.driverVersion;
        gpu_info.vendor_id = physicalDeviceProperties.vendorID;
        gpu_info.device_id = physicalDeviceProperties.deviceID;
        memcpy(gpu_info.device_name, physicalDeviceProperties.deviceName, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
        memcpy(gpu_info.pipeline_cache_uuid, physicalDeviceProperties.pipelineCacheUUID, VK_UUID_SIZE);

        if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)
            gpu_info.type = 0;
        else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
            gpu_info.type = 1;
        else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU)
            gpu_info.type = 2;
        else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU)
            gpu_info.type = 3;
        else
            gpu_info.type = -1;

        // device capability
        gpu_info.max_shared_memory_size = physicalDeviceProperties.limits.maxComputeSharedMemorySize;

        gpu_info.max_workgroup_count_x = physicalDeviceProperties.limits.maxComputeWorkGroupCount[0];
        gpu_info.max_workgroup_count_y = physicalDeviceProperties.limits.maxComputeWorkGroupCount[1];
        gpu_info.max_workgroup_count_z = physicalDeviceProperties.limits.maxComputeWorkGroupCount[2];

        gpu_info.max_workgroup_invocations = physicalDeviceProperties.limits.maxComputeWorkGroupInvocations;

        gpu_info.max_workgroup_size_x = physicalDeviceProperties.limits.maxComputeWorkGroupSize[0];
        gpu_info.max_workgroup_size_y = physicalDeviceProperties.limits.maxComputeWorkGroupSize[1];
        gpu_info.max_workgroup_size_z = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2];

        gpu_info.memory_map_alignment = physicalDeviceProperties.limits.minMemoryMapAlignment;
        gpu_info.buffer_offset_alignment = physicalDeviceProperties.limits.minStorageBufferOffsetAlignment;
        gpu_info.non_coherent_atom_size = physicalDeviceProperties.limits.nonCoherentAtomSize;
        gpu_info.buffer_image_granularity = physicalDeviceProperties.limits.bufferImageGranularity;
        gpu_info.max_image_dimension_1d = physicalDeviceProperties.limits.maxImageDimension1D;
        gpu_info.max_image_dimension_2d = physicalDeviceProperties.limits.maxImageDimension2D;
        gpu_info.max_image_dimension_3d = physicalDeviceProperties.limits.maxImageDimension3D;

        gpu_info.timestamp_period = physicalDeviceProperties.limits.timestampPeriod;

        //         NCNN_LOGE("[%u] max_shared_memory_size = %u", i, gpu_info.max_shared_memory_size);
        //         NCNN_LOGE("[%u] max_workgroup_count = %u %u %u", i, gpu_info.max_workgroup_count[0], gpu_info.max_workgroup_count[1], gpu_info.max_workgroup_count[2]);
        //         NCNN_LOGE("[%u] max_workgroup_invocations = %u", i, gpu_info.max_workgroup_invocations);
        //         NCNN_LOGE("[%u] max_workgroup_size = %u %u %u", i, gpu_info.max_workgroup_size[0], gpu_info.max_workgroup_size[1], gpu_info.max_workgroup_size[2]);
        //         NCNN_LOGE("[%u] memory_map_alignment = %lu", i, gpu_info.memory_map_alignment);
        //         NCNN_LOGE("[%u] buffer_offset_alignment = %lu", i, gpu_info.buffer_offset_alignment);

        // find compute queue
        uint32_t queueFamilyPropertiesCount;
        vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0);

        std::vector<VkQueueFamilyProperties> queueFamilyProperties(queueFamilyPropertiesCount);
        vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data());

        gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties);
        gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties);
        gpu_info.transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties);

        gpu_info.compute_queue_count = queueFamilyProperties[gpu_info.compute_queue_family_index].queueCount;
        gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount;
        gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount;

        gpu_info.unified_compute_transfer_queue = gpu_info.compute_queue_family_index == gpu_info.transfer_queue_family_index;

        // additional device properties
        gpu_info.subgroup_size = 64;
        gpu_info.support_subgroup_basic = false;
        gpu_info.support_subgroup_vote = false;
        gpu_info.support_subgroup_ballot = false;
        gpu_info.support_subgroup_shuffle = false;
        if (support_VK_KHR_get_physical_device_properties2)
        {
            void* queryDeviceProperties = 0;

            // query subgroup
            VkPhysicalDeviceSubgroupProperties physicalDeviceSubgroupProperties;
            physicalDeviceSubgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
            physicalDeviceSubgroupProperties.pNext = queryDeviceProperties;
            if (VK_VERSION_MAJOR(instance_api_version) >= 1 && VK_VERSION_MINOR(instance_api_version) >= 1)
            {
                queryDeviceProperties = &physicalDeviceSubgroupProperties;
            }

            VkPhysicalDeviceProperties2KHR queryProperties;
            queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
            queryProperties.pNext = queryDeviceProperties;

            vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);

            if (VK_VERSION_MAJOR(instance_api_version) >= 1 && VK_VERSION_MINOR(instance_api_version) >= 1)
            {
                gpu_info.subgroup_size = physicalDeviceSubgroupProperties.subgroupSize;
                if (physicalDeviceSubgroupProperties.supportedStages & VK_SHADER_STAGE_COMPUTE_BIT)
                {
                    gpu_info.support_subgroup_basic = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT;
                    gpu_info.support_subgroup_vote = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_VOTE_BIT;
                    gpu_info.support_subgroup_ballot = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT;
                    gpu_info.support_subgroup_shuffle = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_SHUFFLE_BIT;
                }
            }
            else
            {
                if (physicalDeviceProperties.vendorID == 0x5143) // qcom adreno prefers very large workgroups :P
                    gpu_info.subgroup_size = 128;
                if (physicalDeviceProperties.vendorID == 0x13b5) // arm mali
                    gpu_info.subgroup_size = 16;
                if (physicalDeviceProperties.vendorID == 0x1010) // imgtec powervr
                    gpu_info.subgroup_size = 32;
                if (physicalDeviceProperties.vendorID == 0x1002) // amd
                    gpu_info.subgroup_size = 64;
                if (physicalDeviceProperties.vendorID == 0x10de) // nvidia
                    gpu_info.subgroup_size = 32;
                if (physicalDeviceProperties.vendorID == 0x8086) // intel
                    gpu_info.subgroup_size = 32;
            }
1246         }
1247 
1248         // cache memory properties
1249         vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physical_device_memory_properties);
1250 
1251         // get device extension
1252         uint32_t deviceExtensionPropertyCount = 0;
1253         ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL);
1254         if (ret != VK_SUCCESS)
1255         {
1256             NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret);
1257             return -1;
1258         }
1259 
1260         std::vector<VkExtensionProperties> deviceExtensionProperties(deviceExtensionPropertyCount);
1261         ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data());
1262         if (ret != VK_SUCCESS)
1263         {
1264             NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret);
1265             return -1;
1266         }
1267 
1268         // extension capability
1269         gpu_info.support_VK_KHR_8bit_storage = 0;
1270         gpu_info.support_VK_KHR_16bit_storage = 0;
1271         gpu_info.support_VK_KHR_bind_memory2 = 0;
1272         gpu_info.support_VK_KHR_create_renderpass2 = 0;
        gpu_info.support_VK_KHR_dedicated_allocation = 0;
        gpu_info.support_VK_KHR_descriptor_update_template = 0;
        gpu_info.support_VK_KHR_external_memory = 0;
        gpu_info.support_VK_KHR_get_memory_requirements2 = 0;
        gpu_info.support_VK_KHR_maintenance1 = 0;
        gpu_info.support_VK_KHR_maintenance2 = 0;
        gpu_info.support_VK_KHR_maintenance3 = 0;
        gpu_info.support_VK_KHR_multiview = 0;
        gpu_info.support_VK_KHR_push_descriptor = 0;
        gpu_info.support_VK_KHR_sampler_ycbcr_conversion = 0;
        gpu_info.support_VK_KHR_shader_float16_int8 = 0;
        gpu_info.support_VK_KHR_shader_float_controls = 0;
        gpu_info.support_VK_KHR_storage_buffer_storage_class = 0;
        gpu_info.support_VK_KHR_swapchain = 0;
        gpu_info.support_VK_EXT_descriptor_indexing = 0;
        gpu_info.support_VK_EXT_memory_budget = 0;
        gpu_info.support_VK_EXT_queue_family_foreign = 0;
#if __ANDROID_API__ >= 26
        gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0;
#endif // __ANDROID_API__ >= 26
        for (uint32_t j = 0; j < deviceExtensionPropertyCount; j++)
        {
            const VkExtensionProperties& exp = deviceExtensionProperties[j];
            // NCNN_LOGE("device extension %s = %u", exp.extensionName, exp.specVersion);

            if (strcmp(exp.extensionName, "VK_KHR_8bit_storage") == 0)
                gpu_info.support_VK_KHR_8bit_storage = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_16bit_storage") == 0)
                gpu_info.support_VK_KHR_16bit_storage = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_bind_memory2") == 0)
                gpu_info.support_VK_KHR_bind_memory2 = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_create_renderpass2") == 0)
                gpu_info.support_VK_KHR_create_renderpass2 = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_dedicated_allocation") == 0)
                gpu_info.support_VK_KHR_dedicated_allocation = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_descriptor_update_template") == 0)
                gpu_info.support_VK_KHR_descriptor_update_template = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_external_memory") == 0)
                gpu_info.support_VK_KHR_external_memory = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_get_memory_requirements2") == 0)
                gpu_info.support_VK_KHR_get_memory_requirements2 = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_maintenance1") == 0)
                gpu_info.support_VK_KHR_maintenance1 = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_maintenance2") == 0)
                gpu_info.support_VK_KHR_maintenance2 = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_maintenance3") == 0)
                gpu_info.support_VK_KHR_maintenance3 = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_multiview") == 0)
                gpu_info.support_VK_KHR_multiview = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_push_descriptor") == 0)
                gpu_info.support_VK_KHR_push_descriptor = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_sampler_ycbcr_conversion") == 0)
                gpu_info.support_VK_KHR_sampler_ycbcr_conversion = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_shader_float16_int8") == 0)
                gpu_info.support_VK_KHR_shader_float16_int8 = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_shader_float_controls") == 0)
                gpu_info.support_VK_KHR_shader_float_controls = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_storage_buffer_storage_class") == 0)
                gpu_info.support_VK_KHR_storage_buffer_storage_class = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_KHR_swapchain") == 0)
                gpu_info.support_VK_KHR_swapchain = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_EXT_descriptor_indexing") == 0)
                gpu_info.support_VK_EXT_descriptor_indexing = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_EXT_memory_budget") == 0)
                gpu_info.support_VK_EXT_memory_budget = exp.specVersion;
            else if (strcmp(exp.extensionName, "VK_EXT_queue_family_foreign") == 0)
                gpu_info.support_VK_EXT_queue_family_foreign = exp.specVersion;
#if __ANDROID_API__ >= 26
            else if (strcmp(exp.extensionName, "VK_ANDROID_external_memory_android_hardware_buffer") == 0)
                gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = exp.specVersion;
#endif // __ANDROID_API__ >= 26
        }

        // check features
        gpu_info.support_fp16_packed = true;
        gpu_info.support_fp16_storage = false;
        gpu_info.support_fp16_arithmetic = false;
        gpu_info.support_int8_packed = true;
        gpu_info.support_int8_storage = false;
        gpu_info.support_int8_arithmetic = false;
        gpu_info.support_ycbcr_conversion = false;
        if (support_VK_KHR_get_physical_device_properties2)
        {
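            // The queries below use the Vulkan pNext-chaining idiom: each
            // VkPhysicalDevice*FeaturesKHR struct is linked into a singly linked
            // list whose head ends up in VkPhysicalDeviceFeatures2KHR::pNext, and
            // one vkGetPhysicalDeviceFeatures2KHR call fills them all in.
            // Illustrative sketch of the pattern (names are placeholders):
            //
            //   VkSomeFeaturesKHR a;  a.sType = ...; a.pNext = 0;
            //   VkSomeFeaturesKHR b;  b.sType = ...; b.pNext = &a;
            //   VkPhysicalDeviceFeatures2KHR f2;  f2.sType = ...; f2.pNext = &b;
            //   vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &f2);
            //
            // A struct is only linked in when the matching extension is present,
            // since the driver would not recognize its sType otherwise.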
            void* queryExtensionFeatures = 0;

            // query int8 storage
            VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures;
            query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
            query8BitStorageFeatures.pNext = 0;
            if (gpu_info.support_VK_KHR_8bit_storage)
            {
                query8BitStorageFeatures.pNext = queryExtensionFeatures;
                queryExtensionFeatures = &query8BitStorageFeatures;
            }

            // query fp16/int16 storage
            VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures;
            query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
            query16BitStorageFeatures.pNext = 0;
            if (gpu_info.support_VK_KHR_16bit_storage)
            {
                query16BitStorageFeatures.pNext = queryExtensionFeatures;
                queryExtensionFeatures = &query16BitStorageFeatures;
            }

            // query fp16/int8 arithmetic
            VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features;
            queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
            queryFloat16Int8Features.pNext = 0;
            if (gpu_info.support_VK_KHR_shader_float16_int8)
            {
                queryFloat16Int8Features.pNext = queryExtensionFeatures;
                queryExtensionFeatures = &queryFloat16Int8Features;
            }

            // query ycbcr_conversion
            VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
            querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR;
            querySamplerYcbcrConversionFeatures.pNext = 0;
            if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
            {
                querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures;
                queryExtensionFeatures = &querySamplerYcbcrConversionFeatures;
            }

            VkPhysicalDeviceFeatures2KHR queryFeatures;
            queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR;
            queryFeatures.pNext = queryExtensionFeatures;

            vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures);

            if (gpu_info.support_VK_KHR_8bit_storage)
            {
                gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess;
            }
            if (gpu_info.support_VK_KHR_16bit_storage && queryFeatures.features.shaderStorageImageExtendedFormats)
            {
                // shaderStorageImageExtendedFormats enables r16f format in storage image
                gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess;
            }
            if (gpu_info.support_VK_KHR_shader_float16_int8)
            {
                gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
                gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
            }
            if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
            {
                gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion;
            }
        }
        else
        {
            //             // TODO
            //             VkPhysicalDeviceFeatures features;
            //             vkGetPhysicalDeviceFeatures(physicalDevice, &features);
        }

        if (physicalDeviceProperties.vendorID == 0x13b5 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 82))
        {
            // the 16bit_storage implementation of arm mali driver is buggy :[
            gpu_info.support_fp16_storage = false;
        }

        if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82))
        {
            // the 16bit_storage implementation of vivante gc1700 driver is buggy :[
            gpu_info.support_fp16_storage = false;
        }
        if (gpu_info.bug_implicit_fp16_arithmetic)
        {
            // force the capability on as long as the driver accepts SPIR-V with fp16 arithmetic :D
            gpu_info.support_fp16_arithmetic = true;
        }

        NCNN_LOGE("[%u %s]  queueC=%u[%u]  queueG=%u[%u]  queueT=%u[%u]", i, physicalDeviceProperties.deviceName,
                  gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
                  gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count,
                  gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count);

        NCNN_LOGE("[%u %s]  bugsbn1=%d  bugbilz=%d  bugcopc=%d  bugihfa=%d", i, physicalDeviceProperties.deviceName,
                  gpu_info.bug_storage_buffer_no_l1, gpu_info.bug_buffer_image_load_zero, gpu_info.bug_corrupted_online_pipeline_cache, gpu_info.bug_implicit_fp16_arithmetic);

        NCNN_LOGE("[%u %s]  fp16-p/s/a=%d/%d/%d  int8-p/s/a=%d/%d/%d", i, physicalDeviceProperties.deviceName,
                  gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
                  gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);

        NCNN_LOGE("[%u %s]  subgroup=%u  basic=%d  vote=%d  ballot=%d  shuffle=%d", i, physicalDeviceProperties.deviceName,
                  gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
                  gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);

        gpu_info_index++;
    }

    g_gpu_count = gpu_info_index;

    // the default gpu device
    g_default_gpu_index = find_default_vulkan_device_index();

    glslang::InitializeProcess();

    return 0;
}

void destroy_gpu_instance()
{
    MutexLockGuard lock(g_instance_lock);

    if ((VkInstance)g_instance == 0)
        return;

    // NCNN_LOGE("destroy_gpu_instance");

    glslang::FinalizeProcess();

    for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++)
    {
        delete g_default_vkdev[i];
        g_default_vkdev[i] = 0;

        delete g_gpu_infos[i];
        g_gpu_infos[i] = 0;
    }

#if ENABLE_VALIDATION_LAYER
    if (support_VK_EXT_debug_utils)
    {
        DestroyDebugUtilsMessengerEXT(g_instance, g_instance.callback, NULL);
        g_instance.callback = 0;
    }
#endif // ENABLE_VALIDATION_LAYER

    vkDestroyInstance(g_instance, 0);

    g_instance.instance = 0;
}

static bool is_gpu_instance_ready()
{
    MutexLockGuard lock(g_instance_lock);

    return (VkInstance)g_instance != 0;
}

static void try_create_gpu_instance()
{
    if (!is_gpu_instance_ready())
        create_gpu_instance();
}

int get_gpu_count()
{
    try_create_gpu_instance();

    return g_gpu_count;
}

int get_default_gpu_index()
{
    try_create_gpu_instance();

    return g_default_gpu_index;
}

const GpuInfo& get_gpu_info(int device_index)
{
    try_create_gpu_instance();

    return *g_gpu_infos[device_index];
}

class VkDummyAllocator : public VkBlobAllocator
{
public:
    // NOTE 16k is large enough I think ...
    VkDummyAllocator(const VulkanDevice* _vkdev)
        : VkBlobAllocator(_vkdev, 16 * 1024)
    {
    }
};

class VkDummyCompute : public VkCompute
{
public:
    VkDummyCompute(const VulkanDevice* _vkdev)
        : VkCompute(_vkdev)
    {
    }

    void record_dummy(const VkMat& buffer)
    {
        barrier_readwrite(buffer);
    }

    void record_dummy(const VkImageMat& image)
    {
        barrier_readwrite(image);
    }

    void record_dummy_readonly(const VkImageMat& image)
    {
        barrier_readonly(image);
    }
};

class VulkanDevicePrivate
{
public:
    VulkanDevicePrivate(VulkanDevice* _vkdev)
        : vkdev(_vkdev)
    {
    }
    VulkanDevice* const vkdev;

    // dummy buffer and image
    int create_dummy_buffer_image();
    void destroy_dummy_buffer_image();

    // utility operator
    const ncnn::Packing_vulkan* get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const;
    void destroy_utility_operator();

    VkDevice device;

    // hardware queue
    mutable std::vector<VkQueue> compute_queues;
    mutable std::vector<VkQueue> graphics_queues;
    mutable std::vector<VkQueue> transfer_queues;
    mutable int free_compute_queue_count;
    mutable int free_graphics_queue_count;
    mutable int free_transfer_queue_count;
    mutable Mutex compute_queue_lock;
    mutable Mutex graphics_queue_lock;
    mutable Mutex transfer_queue_lock;
    mutable ConditionVariable compute_queue_condition;
    mutable ConditionVariable graphics_queue_condition;
    mutable ConditionVariable transfer_queue_condition;

    // default blob allocator for each queue
    mutable std::vector<VkAllocator*> blob_allocators;
    mutable Mutex blob_allocator_lock;

    // default staging allocator for each queue
    mutable std::vector<VkAllocator*> staging_allocators;
    mutable Mutex staging_allocator_lock;

    // nearest sampler for texelfetch
    VkSampler texelfetch_sampler;

    // dummy buffer and image
    VkAllocator* dummy_allocator;
    VkMat dummy_buffer;
    VkImageMat dummy_image;
    VkImageMat dummy_image_readonly;

    // device-wide pipeline cache
    PipelineCache* pipeline_cache;

    // utility operator
    // from buffer | image
    // to buffer | image
    // from fp32-b/i | fp16p-b/i | fp16s-b/i
    // to fp32-b/i | fp16p-b/i | fp16s-b/i
    // to pack1 | pack4 | pack8
    mutable ncnn::Packing_vulkan* uop_packing[2][2][3][3][3];
    mutable Mutex uop_lock;
};

int VulkanDevicePrivate::create_dummy_buffer_image()
{
    dummy_allocator = new VkDummyAllocator(vkdev);

    dummy_buffer.create(1, 4u, dummy_allocator);
    dummy_image.create(1, 4u, dummy_allocator);
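    // 0x8086 is the Intel PCI vendor id; the read-only dummy image is skipped
    // on Intel under macOS, presumably to work around a driver/MoltenVK quirk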
#if __APPLE__
    if (vkdev->info.vendor_id() != 0x8086)
        dummy_image_readonly.create(1, 4u, dummy_allocator);
#else
    dummy_image_readonly.create(1, 4u, dummy_allocator);
#endif

    VkDummyCompute cmd(vkdev);

    cmd.record_dummy(dummy_buffer);
    cmd.record_dummy(dummy_image);
#if __APPLE__
    if (vkdev->info.vendor_id() != 0x8086)
        cmd.record_dummy_readonly(dummy_image_readonly);
#else
    cmd.record_dummy_readonly(dummy_image_readonly);
#endif

    cmd.submit_and_wait();

    return 0;
}

void VulkanDevicePrivate::destroy_dummy_buffer_image()
{
    dummy_buffer.release();
    dummy_image.release();
#if __APPLE__
    if (vkdev->info.vendor_id() != 0x8086)
        dummy_image_readonly.release();
#else
    dummy_image_readonly.release();
#endif

    delete dummy_allocator;
}

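// A utility operator is a cached Packing_vulkan layer that converts blobs
// between buffer/image storage, fp32/fp16p/fp16s cast types and pack1/4/8
// element packing. Usage sketch, with index meanings as documented in
// VulkanDevicePrivate above (0=buffer 1=image; 0=fp32 1=fp16p 2=fp16s;
// 0=pack1 1=pack4 2=pack8):
//
//   // buffer(fp32) -> image(fp16s), repacked to pack4
//   const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 1, 0, 2, 1);
//
// The instance is created lazily on first use and reused for the device lifetime.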
const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
{
    MutexLockGuard lock(uop_lock);

    const ncnn::Packing_vulkan* cached_uop = uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index];
    if (cached_uop)
        return cached_uop;

    if ((cast_type_from_index == 1 && cast_type_to_index == 2) || (cast_type_from_index == 2 && cast_type_to_index == 1))
    {
        NCNN_LOGE("no fp16p to/from fp16s conversion");
        return 0;
    }

    // create uop
    Option opt;
    opt.use_image_storage = (storage_type_from == 1 || storage_type_to == 1);
    opt.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
    opt.use_fp16_storage = (cast_type_from_index == 2 || cast_type_to_index == 2);

    if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
    {
        NCNN_LOGE("cannot create uop with use_fp16_packed if not support_fp16_packed");
        return 0;
    }

    if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
    {
        NCNN_LOGE("cannot create uop with use_fp16_storage if not support_fp16_storage");
        return 0;
    }

    // fp16/int8 arithmetic are not necessary for packing
    // and may conflict with storage options
    opt.use_fp16_arithmetic = false;
    opt.use_int8_arithmetic = false;

    // enable pack8 for pack8to1/pack8to4
    opt.use_shader_pack8 = true;

    opt.use_vulkan_compute = true;

    // cache uop pipeline as device member explicitly
    opt.pipeline_cache = 0;

    ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
    uop->vkdev = vkdev;

    ncnn::ParamDict pd;
    pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
    pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16p 3=fp16s
    pd.set(3, cast_type_to_index + 1);
    pd.set(4, storage_type_from); // 0=buffer 1=image
    pd.set(5, storage_type_to);

    uop->load_param(pd);

    uop->create_pipeline(opt);

    uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;

    return uop;
}

void VulkanDevicePrivate::destroy_utility_operator()
{
    Option opt;
    opt.use_vulkan_compute = true;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_arithmetic = false;
    opt.pipeline_cache = 0;

    // from buffer | image
    // to buffer | image
    for (int i0 = 0; i0 < 2; i0++)
    {
        for (int i1 = 0; i1 < 2; i1++)
        {
            opt.use_image_storage = (i0 == 1 || i1 == 1);

            // from fp32-b/i | fp16p-b/i | fp16s-b/i
            // to fp32-b/i | fp16p-b/i | fp16s-b/i
            for (int j0 = 0; j0 < 3; j0++)
            {
                for (int j1 = 0; j1 < 3; j1++)
                {
                    if ((j0 == 1 && j1 == 2) || (j0 == 2 && j1 == 1))
                    {
                        // no fp16p to/from fp16s conversion
                        continue;
                    }

                    opt.use_fp16_packed = (j0 == 1 || j1 == 1);
                    opt.use_fp16_storage = (j0 == 2 || j1 == 2);

                    if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
                        continue;

                    if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
                        continue;

                    // to pack1 | pack4 | pack8
                    for (int k = 0; k < 3; k++)
                    {
                        // enable pack8 for pack8to1/pack8to4
                        opt.use_shader_pack8 = true;

                        ncnn::Layer* uop = uop_packing[i0][i1][j0][j1][k];
                        if (!uop)
                            continue;

                        uop->destroy_pipeline(opt);

                        delete uop;

                        uop_packing[i0][i1][j0][j1][k] = 0;
                    }
                }
            }
        }
    }
}

VulkanDevice::VulkanDevice(int device_index)
    : info(get_gpu_info(device_index)), d(new VulkanDevicePrivate(this))
{
    try_create_gpu_instance();

    std::vector<const char*> enabledExtensions;
    if (info.support_VK_KHR_8bit_storage())
        enabledExtensions.push_back("VK_KHR_8bit_storage");
    if (info.support_VK_KHR_16bit_storage())
        enabledExtensions.push_back("VK_KHR_16bit_storage");
    if (info.support_VK_KHR_bind_memory2())
        enabledExtensions.push_back("VK_KHR_bind_memory2");
    if (info.support_VK_KHR_create_renderpass2())
        enabledExtensions.push_back("VK_KHR_create_renderpass2");
    if (info.support_VK_KHR_dedicated_allocation())
        enabledExtensions.push_back("VK_KHR_dedicated_allocation");
    if (info.support_VK_KHR_descriptor_update_template())
        enabledExtensions.push_back("VK_KHR_descriptor_update_template");
    if (info.support_VK_KHR_external_memory())
        enabledExtensions.push_back("VK_KHR_external_memory");
    if (info.support_VK_KHR_get_memory_requirements2())
        enabledExtensions.push_back("VK_KHR_get_memory_requirements2");
    if (info.support_VK_KHR_maintenance1())
        enabledExtensions.push_back("VK_KHR_maintenance1");
    if (info.support_VK_KHR_maintenance2())
        enabledExtensions.push_back("VK_KHR_maintenance2");
    if (info.support_VK_KHR_maintenance3())
        enabledExtensions.push_back("VK_KHR_maintenance3");
    if (info.support_VK_KHR_multiview())
        enabledExtensions.push_back("VK_KHR_multiview");
    if (info.support_VK_KHR_push_descriptor())
        enabledExtensions.push_back("VK_KHR_push_descriptor");
    if (info.support_VK_KHR_sampler_ycbcr_conversion())
        enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion");
    if (info.support_VK_KHR_shader_float16_int8())
        enabledExtensions.push_back("VK_KHR_shader_float16_int8");
    if (info.support_VK_KHR_shader_float_controls())
        enabledExtensions.push_back("VK_KHR_shader_float_controls");
    if (info.support_VK_KHR_storage_buffer_storage_class())
        enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class");
    if (info.support_VK_KHR_swapchain())
        enabledExtensions.push_back("VK_KHR_swapchain");
    if (info.support_VK_EXT_descriptor_indexing())
        enabledExtensions.push_back("VK_EXT_descriptor_indexing");
    if (info.support_VK_EXT_memory_budget())
        enabledExtensions.push_back("VK_EXT_memory_budget");
    if (info.support_VK_EXT_queue_family_foreign())
        enabledExtensions.push_back("VK_EXT_queue_family_foreign");
#if __ANDROID_API__ >= 26
    if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
        enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer");
#endif // __ANDROID_API__ >= 26

    void* enabledExtensionFeatures = 0;

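    // The feature-enable structs below are chained through pNext exactly like
    // the query chain in create_gpu_instance(); the head of the list is passed
    // as VkDeviceCreateInfo::pNext so vkCreateDevice enables them all at once.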
    // enable int8 storage
    VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures;
    enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
    enabled8BitStorageFeatures.pNext = 0;
    enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage();
    enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = VK_FALSE;
    enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE;
    if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage())
    {
        enabled8BitStorageFeatures.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &enabled8BitStorageFeatures;
    }

    // enable fp16/int16 storage
    VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures;
    enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
    enabled16BitStorageFeatures.pNext = 0;
    enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage();
    enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = VK_FALSE;
    enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE;
    enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE;
    if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage())
    {
        enabled16BitStorageFeatures.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &enabled16BitStorageFeatures;
    }

    // enable fp16/int8 arithmetic
    VkPhysicalDeviceFloat16Int8FeaturesKHR enabledFloat16Int8Features;
    enabledFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
    enabledFloat16Int8Features.pNext = 0;
    enabledFloat16Int8Features.shaderFloat16 = info.support_fp16_arithmetic();
    enabledFloat16Int8Features.shaderInt8 = info.support_int8_arithmetic();
    if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_shader_float16_int8())
    {
        enabledFloat16Int8Features.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &enabledFloat16Int8Features;
    }

    // enable ycbcr conversion
    VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR enabledSamplerYcbcrConversionFeatures;
    enabledSamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR;
    enabledSamplerYcbcrConversionFeatures.pNext = 0;
    enabledSamplerYcbcrConversionFeatures.samplerYcbcrConversion = info.support_ycbcr_conversion();
    if (support_VK_KHR_get_physical_device_properties2 && info.support_ycbcr_conversion())
    {
        enabledSamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &enabledSamplerYcbcrConversionFeatures;
    }

    std::vector<float> compute_queue_priorities(info.compute_queue_count(), 1.f);   // 0.f ~ 1.f
    std::vector<float> graphics_queue_priorities(info.graphics_queue_count(), 1.f); // 0.f ~ 1.f
    std::vector<float> transfer_queue_priorities(info.transfer_queue_count(), 1.f); // 0.f ~ 1.f

    VkDeviceQueueCreateInfo deviceQueueCreateInfos[3];

    VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo;
    deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceComputeQueueCreateInfo.pNext = 0;
    deviceComputeQueueCreateInfo.flags = 0;
    deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index();
    deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count();
    deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data();

    VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo;
    deviceGraphicsQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceGraphicsQueueCreateInfo.pNext = 0;
    deviceGraphicsQueueCreateInfo.flags = 0;
    deviceGraphicsQueueCreateInfo.queueFamilyIndex = info.graphics_queue_family_index();
    deviceGraphicsQueueCreateInfo.queueCount = info.graphics_queue_count();
    deviceGraphicsQueueCreateInfo.pQueuePriorities = graphics_queue_priorities.data();

    VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo;
    deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceTransferQueueCreateInfo.pNext = 0;
    deviceTransferQueueCreateInfo.flags = 0;
    deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index();
    deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count();
    deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data();

    VkDeviceCreateInfo deviceCreateInfo;
    deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
    deviceCreateInfo.pNext = enabledExtensionFeatures;
    deviceCreateInfo.flags = 0;
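    // Vulkan forbids duplicate queueFamilyIndex entries in pQueueCreateInfos,
    // so families that alias each other are collapsed into a single entry here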
    if (info.compute_queue_family_index() == info.graphics_queue_family_index() && info.compute_queue_family_index() == info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 1;
    }
    else if (info.compute_queue_family_index() == info.graphics_queue_family_index() && info.compute_queue_family_index() != info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceQueueCreateInfos[1] = deviceTransferQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 2;
    }
    else if (info.compute_queue_family_index() != info.graphics_queue_family_index() && info.graphics_queue_family_index() == info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 2;
    }
    else // if (info.compute_queue_family_index() != info.graphics_queue_family_index() && info.graphics_queue_family_index() != info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo;
        deviceQueueCreateInfos[2] = deviceTransferQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 3;
    }
    deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos;
    deviceCreateInfo.enabledLayerCount = 0;
    deviceCreateInfo.ppEnabledLayerNames = 0;
    deviceCreateInfo.enabledExtensionCount = enabledExtensions.size();
    deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
    deviceCreateInfo.pEnabledFeatures = 0; // VkPhysicalDeviceFeatures pointer

    VkResult ret = vkCreateDevice(info.physical_device(), &deviceCreateInfo, 0, &d->device);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateDevice failed %d", ret);
    }

    init_device_extension();

    d->free_compute_queue_count = 0;
    d->free_graphics_queue_count = 0;
    d->free_transfer_queue_count = 0;

    d->free_compute_queue_count = info.compute_queue_count();
    d->compute_queues.resize(info.compute_queue_count());
    d->blob_allocators.resize(info.compute_queue_count());
    d->staging_allocators.resize(info.compute_queue_count());
    for (uint32_t i = 0; i < info.compute_queue_count(); i++)
    {
        vkGetDeviceQueue(d->device, info.compute_queue_family_index(), i, &d->compute_queues[i]);
        d->blob_allocators[i] = new VkBlobAllocator(this);
        d->staging_allocators[i] = new VkStagingAllocator(this);
    }
    if (info.compute_queue_family_index() != info.graphics_queue_family_index())
    {
        d->free_graphics_queue_count = info.graphics_queue_count();
        d->graphics_queues.resize(info.graphics_queue_count());
        for (uint32_t i = 0; i < info.graphics_queue_count(); i++)
        {
            vkGetDeviceQueue(d->device, info.graphics_queue_family_index(), i, &d->graphics_queues[i]);
        }
    }
    if (info.compute_queue_family_index() != info.transfer_queue_family_index() && info.graphics_queue_family_index() != info.transfer_queue_family_index())
    {
        d->free_transfer_queue_count = info.transfer_queue_count();
        d->transfer_queues.resize(info.transfer_queue_count());
        for (uint32_t i = 0; i < info.transfer_queue_count(); i++)
        {
            vkGetDeviceQueue(d->device, info.transfer_queue_family_index(), i, &d->transfer_queues[i]);
        }
    }

    // prepare immutable texelfetch sampler
    {
        VkSamplerCreateInfo samplerCreateInfo;
        samplerCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
        samplerCreateInfo.pNext = 0;
        samplerCreateInfo.flags = 0;
        samplerCreateInfo.magFilter = VK_FILTER_NEAREST;
        samplerCreateInfo.minFilter = VK_FILTER_NEAREST;
        samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
        samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.mipLodBias = 0.0f;
        samplerCreateInfo.anisotropyEnable = VK_FALSE;
        samplerCreateInfo.maxAnisotropy = 1;
        samplerCreateInfo.compareEnable = VK_FALSE;
        samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER;
        samplerCreateInfo.minLod = 0.0f;
        samplerCreateInfo.maxLod = 0.0f;
        samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
        samplerCreateInfo.unnormalizedCoordinates = VK_TRUE;

        d->texelfetch_sampler = 0;
        ret = vkCreateSampler(d->device, &samplerCreateInfo, 0, &d->texelfetch_sampler);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateSampler failed %d", ret);
        }
    }

    d->create_dummy_buffer_image();

    d->pipeline_cache = new PipelineCache(this);

    memset(d->uop_packing, 0, sizeof(d->uop_packing));
}

VulkanDevice::~VulkanDevice()
{
    d->destroy_utility_operator();

    d->destroy_dummy_buffer_image();

    if (d->texelfetch_sampler)
    {
        vkDestroySampler(d->device, d->texelfetch_sampler, 0);
    }

    for (size_t i = 0; i < d->blob_allocators.size(); i++)
    {
        delete d->blob_allocators[i];
    }
    d->blob_allocators.clear();
    for (size_t i = 0; i < d->staging_allocators.size(); i++)
    {
        delete d->staging_allocators[i];
    }
    d->staging_allocators.clear();

    delete d->pipeline_cache;

    vkDestroyDevice(d->device, 0);

    delete d;
}

VulkanDevice::VulkanDevice(const VulkanDevice&)
    : info(get_gpu_info(0)), d(0)
{
}

VulkanDevice& VulkanDevice::operator=(const VulkanDevice&)
{
    return *this;
}

VkDevice VulkanDevice::vkdevice() const
{
    return d->device;
}

VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
{
    VkShaderModuleCreateInfo shaderModuleCreateInfo;
    shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
    shaderModuleCreateInfo.pNext = 0;
    shaderModuleCreateInfo.flags = 0;
    shaderModuleCreateInfo.codeSize = spv_data_size;
    shaderModuleCreateInfo.pCode = spv_data;

    VkShaderModule shader_module;
    VkResult ret = vkCreateShaderModule(d->device, &shaderModuleCreateInfo, 0, &shader_module);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateShaderModule failed %d", ret);
        return 0;
    }

    return shader_module;
}

static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
{
    uint32_t local_size_x_id = -1;
    uint32_t local_size_y_id = -1;
    uint32_t local_size_z_id = -1;
    uint32_t gl_WorkGroupSize_id = -1;

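    // A SPIR-V module begins with a 5-word header (magic, version, generator,
    // bound, schema); every instruction after it starts with one word encoding
    // (wordcount << 16) | opcode, which is what the loop below walks.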
    const uint32_t* p = code;
    uint32_t* dp = dstcode;

    // skip the 5-word header: magic, version, generator, bound, schema
    memcpy(dp, p, 5 * sizeof(uint32_t));
    p += 5;
    dp += 5;

    // foreach op
    while ((const unsigned char*)p < (const unsigned char*)code + size)
    {
        uint32_t opcode = p[0];

        uint16_t wordcount = opcode >> 16;
        uint16_t op = opcode & 0xffff;

        if (op == 16) // OpExecutionMode
        {
            uint32_t mode = p[2];
            if (mode == 17) // LocalSize
            {
                memcpy(dp, p, wordcount * sizeof(uint32_t));

                // set local_size_xyz
                dp[3] = local_size_x;
                dp[4] = local_size_y;
                dp[5] = local_size_z;

                p += wordcount;
                dp += wordcount;
                continue;
            }
        }
        else if (op == 50) // OpSpecConstant
        {
            uint32_t id = p[2];
            if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id)
            {
                p += wordcount;
                continue;
            }
        }
        else if (op == 51) // OpSpecConstantComposite
        {
            uint32_t id = p[2];
            if (id == gl_WorkGroupSize_id)
            {
                if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id))
                {
                    p += wordcount;
                    continue;
                }
            }
        }
        else if (op == 71) // OpDecorate
        {
            uint32_t id = p[1];
            uint32_t decoration = p[2];
            if (decoration == 1) // SpecId
            {
                uint32_t specid = p[3];
                if (specid == 233) local_size_x_id = id;
                if (specid == 234) local_size_y_id = id;
                if (specid == 235) local_size_z_id = id;
                if (specid == 233 || specid == 234 || specid == 235)
                {
                    p += wordcount;
                    continue;
                }
            }
            else if (decoration == 11) // BuiltIn
            {
                uint32_t builtin = p[3];
                if (builtin == 25) // WorkgroupSize
                {
                    gl_WorkGroupSize_id = id;
                    p += wordcount;
                    continue;
                }
            }
        }

        memcpy(dp, p, wordcount * sizeof(uint32_t));
        p += wordcount;
        dp += wordcount;
    }

    *dstsize = (unsigned char*)dp - (unsigned char*)dstcode;
}

VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
{
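    // injection only ever drops or rewrites words in place, never inserts any,
    // so the output cannot outgrow the input and a same-size buffer suffices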
    uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
    size_t spv_data_size_modified = spv_data_size;
    inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);

    VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);

    free(spv_data_modified);

    return shader_module;
}

int VulkanDevice::create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const
{
    if (binding_count == 0)
    {
        *descriptorset_layout = 0;
        return 0;
    }

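    // binding_type convention, as mapped below:
    //   1 = storage buffer, 2 = storage image, 3 = combined image sampler
    //   (type 3 always samples through the immutable texelfetch sampler)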
    std::vector<VkDescriptorSetLayoutBinding> descriptorSetLayoutBindings(binding_count);
    for (int i = 0; i < binding_count; i++)
    {
        int binding_type = binding_types[i];

        descriptorSetLayoutBindings[i].binding = i;
        descriptorSetLayoutBindings[i].descriptorCount = 1;
        descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;

        if (binding_type == 1)
        {
            descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
            descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
        }
        else if (binding_type == 2)
        {
            descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
            descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
        }
        else // if (binding_type == 3)
        {
            descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
            descriptorSetLayoutBindings[i].pImmutableSamplers = immutable_texelfetch_sampler(); // we always use texelfetch
        }
    }

    VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo;
    descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
    descriptorSetLayoutCreateInfo.pNext = 0;
    descriptorSetLayoutCreateInfo.flags = 0;
    descriptorSetLayoutCreateInfo.bindingCount = binding_count;
    descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings.data();

    if (info.support_VK_KHR_push_descriptor())
    {
        descriptorSetLayoutCreateInfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
    }

    VkResult ret = vkCreateDescriptorSetLayout(d->device, &descriptorSetLayoutCreateInfo, 0, descriptorset_layout);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateDescriptorSetLayout failed %d", ret);
        return -1;
    }

    return 0;
}

int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const
{
    VkPushConstantRange pushConstantRange;
    pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
    pushConstantRange.offset = 0;
    pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count;

    VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo;
    pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
    pipelineLayoutCreateInfo.pNext = 0;
    pipelineLayoutCreateInfo.flags = 0;

    if (descriptorset_layout)
    {
        pipelineLayoutCreateInfo.setLayoutCount = 1;
        pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout;
    }
    else
    {
        pipelineLayoutCreateInfo.setLayoutCount = 0;
        pipelineLayoutCreateInfo.pSetLayouts = 0;
    }

    if (push_constant_count > 0)
    {
        pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
        pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
    }
    else
    {
        pipelineLayoutCreateInfo.pushConstantRangeCount = 0;
        pipelineLayoutCreateInfo.pPushConstantRanges = 0;
    }

    VkResult ret = vkCreatePipelineLayout(d->device, &pipelineLayoutCreateInfo, 0, pipeline_layout);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreatePipelineLayout failed %d", ret);
        return -1;
    }

    return 0;
}

int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const
{
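    // specialization constants are laid out back to back: the i-th value maps
    // to constant_id = i at byte offset i * sizeof(vk_specialization_type)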
    const int specialization_count = specializations.size();

    std::vector<VkSpecializationMapEntry> specializationMapEntries(specialization_count);
    for (int i = 0; i < specialization_count; i++)
    {
        specializationMapEntries[i].constantID = i;
        specializationMapEntries[i].offset = i * sizeof(vk_specialization_type);
        specializationMapEntries[i].size = sizeof(vk_specialization_type);
    }

    VkSpecializationInfo specializationInfo;
    specializationInfo.mapEntryCount = specializationMapEntries.size();
    specializationInfo.pMapEntries = specializationMapEntries.data();
    specializationInfo.dataSize = specializations.size() * sizeof(vk_specialization_type);
    specializationInfo.pData = specializations.data();

    VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo;
    pipelineShaderStageCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
    pipelineShaderStageCreateInfo.pNext = 0;
    pipelineShaderStageCreateInfo.flags = 0;
    pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
    pipelineShaderStageCreateInfo.module = shader_module;
    pipelineShaderStageCreateInfo.pName = "main";
    pipelineShaderStageCreateInfo.pSpecializationInfo = &specializationInfo;

    VkComputePipelineCreateInfo computePipelineCreateInfo;
    computePipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
    computePipelineCreateInfo.pNext = 0;
    computePipelineCreateInfo.flags = 0;
    computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
    computePipelineCreateInfo.layout = pipeline_layout;
    computePipelineCreateInfo.basePipelineHandle = 0;
    computePipelineCreateInfo.basePipelineIndex = 0;

    VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateComputePipelines failed %d", ret);
        return -1;
    }

    return 0;
}

int VulkanDevice::create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const
{
    if (binding_count == 0)
    {
        *descriptor_update_template = 0;
        return 0;
    }

    std::vector<VkDescriptorUpdateTemplateEntryKHR> descriptorUpdateTemplateEntries(binding_count);
    size_t offset = 0;
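    // entries are packed back to back in one flat blob; each entry's
    // offset/stride tell the driver where to read its VkDescriptorBufferInfo
    // or VkDescriptorImageInfo when the template is applied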
    for (int i = 0; i < binding_count; i++) // TODO do not update weights
    {
        int binding_type = binding_types[i];

        descriptorUpdateTemplateEntries[i].dstBinding = i;
        descriptorUpdateTemplateEntries[i].dstArrayElement = 0;
        descriptorUpdateTemplateEntries[i].descriptorCount = 1;
        descriptorUpdateTemplateEntries[i].offset = offset;

        if (binding_type == 1)
        {
            descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
            descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorBufferInfo);
        }
        else if (binding_type == 2)
        {
            descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
            descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
        }
        else // if (binding_type == 3)
        {
            descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
            descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
        }

        offset += descriptorUpdateTemplateEntries[i].stride;
    }

    VkDescriptorUpdateTemplateCreateInfoKHR descriptorUpdateTemplateCreateInfo;
    descriptorUpdateTemplateCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR;
    descriptorUpdateTemplateCreateInfo.pNext = 0;
    descriptorUpdateTemplateCreateInfo.flags = 0;
    descriptorUpdateTemplateCreateInfo.descriptorUpdateEntryCount = binding_count; // TODO do not update weights
    descriptorUpdateTemplateCreateInfo.pDescriptorUpdateEntries = descriptorUpdateTemplateEntries.data();
    if (info.support_VK_KHR_push_descriptor())
    {
        descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR;
    }
    else
    {
        descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR;
    }
    // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR
    // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL  :(
    descriptorUpdateTemplateCreateInfo.descriptorSetLayout = descriptorset_layout;
    descriptorUpdateTemplateCreateInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
    descriptorUpdateTemplateCreateInfo.pipelineLayout = pipeline_layout;
    descriptorUpdateTemplateCreateInfo.set = 0;

    VkResult ret = vkCreateDescriptorUpdateTemplateKHR(d->device, &descriptorUpdateTemplateCreateInfo, 0, descriptor_update_template);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateDescriptorUpdateTemplateKHR failed %d", ret);
        return -1;
    }

    return 0;
}

uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
{
    const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();

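    // four passes, from strictest to loosest: required+preferred+!preferred_not,
    // then required+preferred, then required+!preferred_not, then required only.
    // Illustrative call, picking device-local memory that is ideally not
    // host-visible:
    //   find_memory_index(bits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
    //                     0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);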
    // first try, find required and with preferred and without preferred_not
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1 << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred && (memoryType.propertyFlags & preferred))
                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
            {
                return i;
            }
        }
    }

    // second try, find required and with preferred
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1 << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred && (memoryType.propertyFlags & preferred)))
            {
                return i;
            }
        }
    }

    // third try, find required and without preferred_not
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1 << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
            {
                return i;
            }
        }
    }

    // fourth try, find any required
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1 << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required)
            {
                return i;
            }
        }
    }

    NCNN_LOGE("no such memory type %u %u %u %u", memory_type_bits, required, preferred, preferred_not);
    return -1;
}

bool VulkanDevice::is_mappable(uint32_t memory_type_index) const
{
    const VkMemoryType& memoryType = info.physical_device_memory_properties().memoryTypes[memory_type_index];

    return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
}

bool VulkanDevice::is_coherent(uint32_t memory_type_index) const
{
    const VkMemoryType& memoryType = info.physical_device_memory_properties().memoryTypes[memory_type_index];

    return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
}

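// Queues are a pooled resource guarded by a mutex and condition variable;
// callers pair acquire_queue() with reclaim_queue(). Usage sketch:
//
//   VkQueue q = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
//   // ... vkQueueSubmit / vkQueueWaitIdle on q ...
//   vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), q);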
VkQueue VulkanDevice::acquire_queue(uint32_t queue_family_index) const
{
    if (queue_family_index != info.compute_queue_family_index()
            && queue_family_index != info.graphics_queue_family_index()
            && queue_family_index != info.transfer_queue_family_index())
    {
        NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
        return 0;
    }

    Mutex& queue_lock = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_lock
                        : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_lock
                        : d->transfer_queue_lock;

    queue_lock.lock();

    ConditionVariable& queue_condition = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_condition
                                         : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_condition
                                         : d->transfer_queue_condition;

    int& free_queue_count = queue_family_index == info.compute_queue_family_index() ? d->free_compute_queue_count
                            : queue_family_index == info.graphics_queue_family_index() ? d->free_graphics_queue_count
                            : d->free_transfer_queue_count;

    while (free_queue_count == 0)
    {
        // no free queues, wait for reclaims from other threads
        queue_condition.wait(queue_lock);
    }

    std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues
                                   : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queues
                                   : d->transfer_queues;

    VkQueue queue = 0;
    for (size_t i = 0; i < queues.size(); i++)
    {
        if (queues[i])
        {
            queue = queues[i];
            queues[i] = 0;
            break;
        }
    }

    if (!queue)
    {
        NCNN_LOGE("FATAL ERROR! out of hardware queues in family %u", queue_family_index);
    }
2575 
2576     free_queue_count -= 1;
2577 
2578     queue_lock.unlock();
2579 
2580     queue_condition.signal();
2581 
2582     return queue;
2583 }

void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const
{
    if (queue_family_index != info.compute_queue_family_index()
            && queue_family_index != info.graphics_queue_family_index()
            && queue_family_index != info.transfer_queue_family_index())
    {
        NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
        return;
    }

    Mutex& queue_lock = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_lock
                        : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_lock
                        : d->transfer_queue_lock;

    queue_lock.lock();

    ConditionVariable& queue_condition = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_condition
                                         : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_condition
                                         : d->transfer_queue_condition;

    int& free_queue_count = queue_family_index == info.compute_queue_family_index() ? d->free_compute_queue_count
                            : queue_family_index == info.graphics_queue_family_index() ? d->free_graphics_queue_count
                            : d->free_transfer_queue_count;

    std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues
                                   : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queues
                                   : d->transfer_queues;

    size_t i = 0;
    for (; i < queues.size(); i++)
    {
        if (!queues[i])
        {
            queues[i] = queue;
            break;
        }
    }

    if (i == queues.size())
    {
        NCNN_LOGE("FATAL ERROR! reclaim_queue get wild queue %u %p", queue_family_index, queue);
    }

    free_queue_count += 1;

    queue_lock.unlock();

    queue_condition.signal();
}
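
// Usage sketch (illustrative, not called in this file): acquire_queue() and
// reclaim_queue() must be paired on the same queue family, otherwise the free
// queue pool leaks and later callers block forever in acquire_queue().
//
//     uint32_t qfi = vkdev->info.compute_queue_family_index();
//     VkQueue queue = vkdev->acquire_queue(qfi);
//     // ... vkQueueSubmit() work on queue ...
//     vkdev->reclaim_queue(qfi, queue);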

VkAllocator* VulkanDevice::acquire_blob_allocator() const
{
    MutexLockGuard lock(d->blob_allocator_lock);

    for (int i = 0; i < (int)d->blob_allocators.size(); i++)
    {
        VkAllocator* allocator = d->blob_allocators[i];
        if (allocator)
        {
            d->blob_allocators[i] = 0;
            return allocator;
        }
    }

    // pre-allocated allocator exhausted, create new
    VkAllocator* allocator = new VkBlobAllocator(this);
    d->blob_allocators.push_back(allocator);
    // mark the new slot as checked out; reclaim_blob_allocator() returns it to the pool
    d->blob_allocators[d->blob_allocators.size() - 1] = 0;
    return allocator;
}

void VulkanDevice::reclaim_blob_allocator(VkAllocator* allocator) const
{
    MutexLockGuard lock(d->blob_allocator_lock);

    for (int i = 0; i < (int)d->blob_allocators.size(); i++)
    {
        if (!d->blob_allocators[i])
        {
            d->blob_allocators[i] = allocator;
            return;
        }
    }

    NCNN_LOGE("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator);
}

VkAllocator* VulkanDevice::acquire_staging_allocator() const
{
    MutexLockGuard lock(d->staging_allocator_lock);

    for (int i = 0; i < (int)d->staging_allocators.size(); i++)
    {
        VkAllocator* allocator = d->staging_allocators[i];
        if (allocator)
        {
            d->staging_allocators[i] = 0;
            return allocator;
        }
    }

    // pre-allocated allocator exhausted, create new
    VkAllocator* allocator = new VkStagingAllocator(this);
    d->staging_allocators.push_back(allocator);
    // mark the new slot as checked out; reclaim_staging_allocator() returns it to the pool
    d->staging_allocators[d->staging_allocators.size() - 1] = 0;
    return allocator;
}

void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const
{
    MutexLockGuard lock(d->staging_allocator_lock);

    for (int i = 0; i < (int)d->staging_allocators.size(); i++)
    {
        if (!d->staging_allocators[i])
        {
            d->staging_allocators[i] = allocator;
            return;
        }
    }

    NCNN_LOGE("FATAL ERROR! reclaim_staging_allocator get wild allocator %p", allocator);
}
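
// Usage sketch (illustrative): allocators follow the same checkout/return
// discipline as queues.
//
//     VkAllocator* blob_allocator = vkdev->acquire_blob_allocator();
//     VkAllocator* staging_allocator = vkdev->acquire_staging_allocator();
//     // ... allocate device blobs and staging buffers ...
//     vkdev->reclaim_blob_allocator(blob_allocator);
//     vkdev->reclaim_staging_allocator(staging_allocator);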

const VkSampler* VulkanDevice::immutable_texelfetch_sampler() const
{
    return &d->texelfetch_sampler;
}

VkMat VulkanDevice::get_dummy_buffer() const
{
    return d->dummy_buffer;
}

VkImageMat VulkanDevice::get_dummy_image() const
{
    return d->dummy_image;
}
VkImageMat VulkanDevice::get_dummy_image_readonly() const
{
#if __APPLE__
    // 0x8086 is the intel vendor id, avoid the readonly dummy image there on macOS
    if (info.vendor_id() == 0x8086)
        return d->dummy_image;
#endif
    return d->dummy_image_readonly;
}

const PipelineCache* VulkanDevice::get_pipeline_cache() const
{
    return d->pipeline_cache;
}

bool VulkanDevice::shape_support_image_storage(const Mat& shape) const
{
    int dims = shape.dims;
    int width = shape.w;
    int height = shape.h;
    int depth = shape.c;
    int elempack = shape.elempack;

    // large elempack spills on image w
    if (elempack == 8) width *= 2;
    if (elempack == 16) width *= 4;
    if (elempack == 32) width *= 8;
    if (elempack == 64) width *= 16;

    if (dims == 1)
    {
        if (width > (int)info.max_image_dimension_1d())
        {
            return false;
        }
    }
    else if (dims == 2)
    {
        if (width > (int)info.max_image_dimension_2d() || height > (int)info.max_image_dimension_2d())
        {
            return false;
        }
    }
    else // if (dims == 3)
    {
        if (width > (int)info.max_image_dimension_3d() || height > (int)info.max_image_dimension_3d() || depth > (int)info.max_image_dimension_3d())
        {
            return false;
        }
    }

    return true;
}
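
// Usage sketch (illustrative): probe before choosing the image storage path,
// falling back to buffer storage when the shape exceeds the device image
// dimension limits.
//
//     if (opt.use_image_storage && vkdev->shape_support_image_storage(shape))
//         ; // upload as VkImageMat
//     else
//         ; // upload as VkMat buffer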

uint32_t VulkanDevice::get_heap_budget() const
{
    const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();

    // the first device local heap
    uint32_t device_local_heap_index = 0;
    uint32_t device_local_heap_size = 0;
    for (uint32_t i = 0; i < memory_properties.memoryHeapCount; i++)
    {
        const VkMemoryHeap& memoryHeap = memory_properties.memoryHeaps[i];
        if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
        {
            device_local_heap_index = i;
            device_local_heap_size = memoryHeap.size / 1024 / 1024;
            break;
        }
    }

    if (!info.support_VK_EXT_memory_budget())
    {
        // NCNN_LOGE("heap budget from assumption\n");

        // we usually cannot use the whole heap
        // 70% for 4G+
        // 50% for 4G-
        return device_local_heap_size >= 4000 ? device_local_heap_size * 0.7 : device_local_heap_size * 0.5;
    }

    VkPhysicalDeviceMemoryBudgetPropertiesEXT memoryBudgetProperties;
    memoryBudgetProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT;
    memoryBudgetProperties.pNext = 0;

    VkPhysicalDeviceMemoryProperties2KHR memoryProperties;
    memoryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR;
    memoryProperties.pNext = &memoryBudgetProperties;

    vkGetPhysicalDeviceMemoryProperties2KHR(info.physical_device(), &memoryProperties);

    return memoryBudgetProperties.heapBudget[device_local_heap_index] / 1024 / 1024;
}
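
// Usage sketch (illustrative): the budget is reported in MB, so a caller can
// do a coarse capacity check before committing large allocations.
//
//     uint32_t budget_mb = vkdev->get_heap_budget();
//     if (estimated_weight_mb > budget_mb)
//         NCNN_LOGE("weights may exceed the device-local heap budget");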

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const
{
    // buffer2buffer uop is created with use_image_storage disabled
    Option opt = _opt;
    opt.use_image_storage = false;

    int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;

    int cast_type_from_index;
    if (src.elembits() == 32)
    {
        cast_type_from_index = 0;
    }
    else // if (src.elembits() == 16)
    {
        if (cast_type_to_index != 0)
        {
            cast_type_from_index = cast_type_to_index;
        }
        else if (info.support_fp16_storage())
        {
            cast_type_from_index = 2;
        }
        else // if (info.support_fp16_packed())
        {
            cast_type_from_index = 1;
        }
    }

    // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

    const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
    uop->forward(src, dst, cmd, opt);
}
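
// Usage sketch (illustrative): repack a pack1 buffer into pack4 inside a
// recorded compute sequence.
//
//     ncnn::VkCompute cmd(vkdev);
//     ncnn::VkMat dst;
//     vkdev->convert_packing(src, dst, 4, cmd, opt);
//     cmd.submit_and_wait();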

void VulkanDevice::convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
{
    int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;

    int cast_type_from_index;
    if (src.elembits() == 32)
    {
        cast_type_from_index = 0;
    }
    else // if (src.elembits() == 16)
    {
        if (cast_type_to_index != 0)
        {
            cast_type_from_index = cast_type_to_index;
        }
        else if (info.support_fp16_storage())
        {
            cast_type_from_index = 2;
        }
        else // if (info.support_fp16_packed())
        {
            cast_type_from_index = 1;
        }
    }

    // NCNN_LOGE("convert_packing i2i %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

    const ncnn::Packing_vulkan* uop = d->get_utility_operator(1, 1, cast_type_from_index, cast_type_to_index, packing_type_to_index);
    uop->forward(src, dst, cmd, opt);
}

void VulkanDevice::convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
{
    int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;

    int cast_type_from_index;
    if (src.elembits() == 32)
    {
        cast_type_from_index = 0;
    }
    else // if (src.elembits() == 16)
    {
        if (cast_type_to_index != 0)
        {
            cast_type_from_index = cast_type_to_index;
        }
        else if (info.support_fp16_storage())
        {
            cast_type_from_index = 2;
        }
        else // if (info.support_fp16_packed())
        {
            cast_type_from_index = 1;
        }
    }

    // NCNN_LOGE("convert_packing b2i %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

    const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 1, cast_type_from_index, cast_type_to_index, packing_type_to_index);
    uop->forward(src, dst, cmd, opt);
}

void VulkanDevice::convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
{
    int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;

    int cast_type_from_index;
    if (src.elembits() == 32)
    {
        cast_type_from_index = 0;
    }
    else // if (src.elembits() == 16)
    {
        if (cast_type_to_index != 0)
        {
            cast_type_from_index = cast_type_to_index;
        }
        else if (info.support_fp16_storage())
        {
            cast_type_from_index = 2;
        }
        else // if (info.support_fp16_packed())
        {
            cast_type_from_index = 1;
        }
    }

    // NCNN_LOGE("convert_packing i2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

    const ncnn::Packing_vulkan* uop = d->get_utility_operator(1, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
    uop->forward(src, dst, cmd, opt);
}

int VulkanDevice::init_device_extension()
{
    if (info.support_VK_KHR_bind_memory2())
    {
        vkBindBufferMemory2KHR = (PFN_vkBindBufferMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindBufferMemory2KHR");
        vkBindImageMemory2KHR = (PFN_vkBindImageMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindImageMemory2KHR");
    }

    if (info.support_VK_KHR_create_renderpass2())
    {
        vkCmdBeginRenderPass2KHR = (PFN_vkCmdBeginRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdBeginRenderPass2KHR");
        vkCmdEndRenderPass2KHR = (PFN_vkCmdEndRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdEndRenderPass2KHR");
        vkCmdNextSubpass2KHR = (PFN_vkCmdNextSubpass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdNextSubpass2KHR");
        vkCreateRenderPass2KHR = (PFN_vkCreateRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCreateRenderPass2KHR");
    }

    if (info.support_VK_KHR_descriptor_update_template())
    {
        vkCreateDescriptorUpdateTemplateKHR = (PFN_vkCreateDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCreateDescriptorUpdateTemplateKHR");
        vkDestroyDescriptorUpdateTemplateKHR = (PFN_vkDestroyDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkDestroyDescriptorUpdateTemplateKHR");
        vkUpdateDescriptorSetWithTemplateKHR = (PFN_vkUpdateDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkUpdateDescriptorSetWithTemplateKHR");
    }

    if (info.support_VK_KHR_get_memory_requirements2())
    {
        vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageMemoryRequirements2KHR");
        vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetBufferMemoryRequirements2KHR");
        vkGetImageSparseMemoryRequirements2KHR = (PFN_vkGetImageSparseMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageSparseMemoryRequirements2KHR");
    }

    if (info.support_VK_KHR_maintenance1())
    {
        vkTrimCommandPoolKHR = (PFN_vkTrimCommandPoolKHR)vkGetDeviceProcAddr(d->device, "vkTrimCommandPoolKHR");
    }

    if (info.support_VK_KHR_maintenance3())
    {
        vkGetDescriptorSetLayoutSupportKHR = (PFN_vkGetDescriptorSetLayoutSupportKHR)vkGetDeviceProcAddr(d->device, "vkGetDescriptorSetLayoutSupportKHR");
    }

    if (info.support_VK_KHR_push_descriptor())
    {
        if (info.support_VK_KHR_descriptor_update_template())
        {
            vkCmdPushDescriptorSetWithTemplateKHR = (PFN_vkCmdPushDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetWithTemplateKHR");
        }

        vkCmdPushDescriptorSetKHR = (PFN_vkCmdPushDescriptorSetKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetKHR");
    }

    if (info.support_VK_KHR_sampler_ycbcr_conversion())
    {
        vkCreateSamplerYcbcrConversionKHR = (PFN_vkCreateSamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkCreateSamplerYcbcrConversionKHR");
        vkDestroySamplerYcbcrConversionKHR = (PFN_vkDestroySamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkDestroySamplerYcbcrConversionKHR");
    }

    if (info.support_VK_KHR_swapchain())
    {
        vkCreateSwapchainKHR = (PFN_vkCreateSwapchainKHR)vkGetDeviceProcAddr(d->device, "vkCreateSwapchainKHR");
        vkDestroySwapchainKHR = (PFN_vkDestroySwapchainKHR)vkGetDeviceProcAddr(d->device, "vkDestroySwapchainKHR");
        vkGetSwapchainImagesKHR = (PFN_vkGetSwapchainImagesKHR)vkGetDeviceProcAddr(d->device, "vkGetSwapchainImagesKHR");
        vkAcquireNextImageKHR = (PFN_vkAcquireNextImageKHR)vkGetDeviceProcAddr(d->device, "vkAcquireNextImageKHR");
        vkQueuePresentKHR = (PFN_vkQueuePresentKHR)vkGetDeviceProcAddr(d->device, "vkQueuePresentKHR");
    }

#if __ANDROID_API__ >= 26
    if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
    {
        vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(d->device, "vkGetAndroidHardwareBufferPropertiesANDROID");
        vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(d->device, "vkGetMemoryAndroidHardwareBufferANDROID");
    }
#endif // __ANDROID_API__ >= 26

    return 0;
}

VulkanDevice* get_gpu_device(int device_index)
{
    try_create_gpu_instance();

    if (device_index < 0 || device_index >= g_gpu_count)
        return 0;

    MutexLockGuard lock(g_default_vkdev_lock);

    if (!g_default_vkdev[device_index])
        g_default_vkdev[device_index] = new VulkanDevice(device_index);

    return g_default_vkdev[device_index];
}
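
// Usage sketch (illustrative):
//
//     ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(ncnn::get_default_gpu_index());
//     if (!vkdev)
//         ; // no usable vulkan device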

static TBuiltInResource get_default_TBuiltInResource()
{
    TBuiltInResource resource;

    resource.maxLights = 32;
    resource.maxClipPlanes = 6;
    resource.maxTextureUnits = 32;
    resource.maxTextureCoords = 32;
    resource.maxVertexAttribs = 64;
    resource.maxVertexUniformComponents = 4096;
    resource.maxVaryingFloats = 64;
    resource.maxVertexTextureImageUnits = 32;
    resource.maxCombinedTextureImageUnits = 80;
    resource.maxTextureImageUnits = 32;
    resource.maxFragmentUniformComponents = 4096;
    resource.maxDrawBuffers = 32;
    resource.maxVertexUniformVectors = 128;
    resource.maxVaryingVectors = 8;
    resource.maxFragmentUniformVectors = 16;
    resource.maxVertexOutputVectors = 16;
    resource.maxFragmentInputVectors = 15;
    resource.minProgramTexelOffset = -8;
    resource.maxProgramTexelOffset = 7;
    resource.maxClipDistances = 8;
    resource.maxComputeWorkGroupCountX = 65535;
    resource.maxComputeWorkGroupCountY = 65535;
    resource.maxComputeWorkGroupCountZ = 65535;
    resource.maxComputeWorkGroupSizeX = 1024;
    resource.maxComputeWorkGroupSizeY = 1024;
    resource.maxComputeWorkGroupSizeZ = 64;
    resource.maxComputeUniformComponents = 1024;
    resource.maxComputeTextureImageUnits = 16;
    resource.maxComputeImageUniforms = 8;
    resource.maxComputeAtomicCounters = 8;
    resource.maxComputeAtomicCounterBuffers = 1;
    resource.maxVaryingComponents = 60;
    resource.maxVertexOutputComponents = 64;
    resource.maxGeometryInputComponents = 64;
    resource.maxGeometryOutputComponents = 128;
    resource.maxFragmentInputComponents = 128;
    resource.maxImageUnits = 8;
    resource.maxCombinedImageUnitsAndFragmentOutputs = 8;
    resource.maxCombinedShaderOutputResources = 8;
    resource.maxImageSamples = 0;
    resource.maxVertexImageUniforms = 0;
    resource.maxTessControlImageUniforms = 0;
    resource.maxTessEvaluationImageUniforms = 0;
    resource.maxGeometryImageUniforms = 0;
    resource.maxFragmentImageUniforms = 8;
    resource.maxCombinedImageUniforms = 8;
    resource.maxGeometryTextureImageUnits = 16;
    resource.maxGeometryOutputVertices = 256;
    resource.maxGeometryTotalOutputComponents = 1024;
    resource.maxGeometryUniformComponents = 1024;
    resource.maxGeometryVaryingComponents = 64;
    resource.maxTessControlInputComponents = 128;
    resource.maxTessControlOutputComponents = 128;
    resource.maxTessControlTextureImageUnits = 16;
    resource.maxTessControlUniformComponents = 1024;
    resource.maxTessControlTotalOutputComponents = 4096;
    resource.maxTessEvaluationInputComponents = 128;
    resource.maxTessEvaluationOutputComponents = 128;
    resource.maxTessEvaluationTextureImageUnits = 16;
    resource.maxTessEvaluationUniformComponents = 1024;
    resource.maxTessPatchComponents = 120;
    resource.maxPatchVertices = 32;
    resource.maxTessGenLevel = 64;
    resource.maxViewports = 16;
    resource.maxVertexAtomicCounters = 0;
    resource.maxTessControlAtomicCounters = 0;
    resource.maxTessEvaluationAtomicCounters = 0;
    resource.maxGeometryAtomicCounters = 0;
    resource.maxFragmentAtomicCounters = 8;
    resource.maxCombinedAtomicCounters = 8;
    resource.maxAtomicCounterBindings = 1;
    resource.maxVertexAtomicCounterBuffers = 0;
    resource.maxTessControlAtomicCounterBuffers = 0;
    resource.maxTessEvaluationAtomicCounterBuffers = 0;
    resource.maxGeometryAtomicCounterBuffers = 0;
    resource.maxFragmentAtomicCounterBuffers = 1;
    resource.maxCombinedAtomicCounterBuffers = 1;
    resource.maxAtomicCounterBufferSize = 16384;
    resource.maxTransformFeedbackBuffers = 4;
    resource.maxTransformFeedbackInterleavedComponents = 64;
    resource.maxCullDistances = 8;
    resource.maxCombinedClipAndCullDistances = 8;
    resource.maxSamples = 4;
    resource.maxMeshOutputVerticesNV = 256;
    resource.maxMeshOutputPrimitivesNV = 512;
    resource.maxMeshWorkGroupSizeX_NV = 32;
    resource.maxMeshWorkGroupSizeY_NV = 1;
    resource.maxMeshWorkGroupSizeZ_NV = 1;
    resource.maxTaskWorkGroupSizeX_NV = 32;
    resource.maxTaskWorkGroupSizeY_NV = 1;
    resource.maxTaskWorkGroupSizeZ_NV = 1;
    resource.maxMeshViewCountNV = 4;

    // TODO compile-time glslang version check
    // resource.maxDualSourceDrawBuffersEXT = 1;

    resource.limits.nonInductiveForLoops = 1;
    resource.limits.whileLoops = 1;
    resource.limits.doWhileLoops = 1;
    resource.limits.generalUniformIndexing = 1;
    resource.limits.generalAttributeMatrixVectorIndexing = 1;
    resource.limits.generalVaryingIndexing = 1;
    resource.limits.generalSamplerIndexing = 1;
    resource.limits.generalVariableIndexing = 1;
    resource.limits.generalConstantMatrixVectorIndexing = 1;

    return resource;
}

int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv)
{
    // note: strlen() already excludes the '\0' terminator; the -1 additionally drops the final character of the source
    int length = strlen(comp_string) - 1;
    return compile_spirv_module(comp_string, length, opt, spirv);
}
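
// Usage sketch (illustrative): compile a caller-provided compute shader to
// SPIR-V words with the size-taking overload.
//
//     static const char my_comp[] = "#version 450\nvoid main() { }\n";
//     ncnn::Option opt; // default options
//     std::vector<uint32_t> spirv;
//     int ret = ncnn::compile_spirv_module(my_comp, sizeof(my_comp) - 1, opt, spirv);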

int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv)
{
    std::vector<std::pair<const char*, const char*> > custom_defines;

    if (opt.use_fp16_storage)
    {
        custom_defines.push_back(std::make_pair("sfp", "float16_t"));
        custom_defines.push_back(std::make_pair("sfpvec2", "f16vec2"));
        custom_defines.push_back(std::make_pair("sfpvec4", "f16vec4"));

        if (opt.use_fp16_arithmetic)
        {
            custom_defines.push_back(std::make_pair("sfpvec8", "f16mat2x4"));
            custom_defines.push_back(std::make_pair("sfpmat4", "f16mat4"));
        }
    }
    else if (opt.use_fp16_packed)
    {
        custom_defines.push_back(std::make_pair("sfp", "float"));
        custom_defines.push_back(std::make_pair("sfpvec2", "uint"));
        custom_defines.push_back(std::make_pair("sfpvec4", "uvec2"));
        custom_defines.push_back(std::make_pair("sfpvec8", "uvec4"));
    }
    else
    {
        custom_defines.push_back(std::make_pair("sfp", "float"));
        custom_defines.push_back(std::make_pair("sfpvec2", "vec2"));
        custom_defines.push_back(std::make_pair("sfpvec4", "vec4"));
        custom_defines.push_back(std::make_pair("sfpvec8", "mat2x4"));
        custom_defines.push_back(std::make_pair("sfpmat4", "mat4"));
    }

    if (opt.use_fp16_arithmetic)
    {
        custom_defines.push_back(std::make_pair("afp", "float16_t"));
        custom_defines.push_back(std::make_pair("afpvec2", "f16vec2"));
        custom_defines.push_back(std::make_pair("afpvec4", "f16vec4"));
        custom_defines.push_back(std::make_pair("afpvec8", "f16mat2x4"));
        custom_defines.push_back(std::make_pair("afpmat4", "f16mat4"));
    }
    else
    {
        custom_defines.push_back(std::make_pair("afp", "float"));
        custom_defines.push_back(std::make_pair("afpvec2", "vec2"));
        custom_defines.push_back(std::make_pair("afpvec4", "vec4"));
        custom_defines.push_back(std::make_pair("afpvec8", "mat2x4"));
        custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
    }

    if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
    {
        custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
        custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
        custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
        custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
        custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
    }
    else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
    {
        custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float16_t(buf[i])"));
        custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float(v);}"));
        custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"));
        custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "f16vec2(unpackHalf2x16(buf[i]))"));
        custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(vec2(v));}"));
        custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"));
        custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"));
        custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
        custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"));
        custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"));
        custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
    }
    else if (opt.use_fp16_storage)
    {
        custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float(buf[i])"));
        custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}"));
        custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"));
        custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "vec2(buf[i])"));
        custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}"));
        custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(buf[i])"));
        custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}"));
        custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"));
        custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"));
        custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"));
        custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"));
    }
    else if (opt.use_fp16_packed)
    {
        custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"));
        custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])"));
        custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v);}"));
        custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"));
        custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"));
        custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
        custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"));
        custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"));
        custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
    }
    else
    {
        custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
        custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
        custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"));
        custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
        custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
        custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
        custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
        custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
        custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
    }

    if (opt.use_image_storage)
    {
        if (opt.use_fp16_storage)
        {
            custom_defines.push_back(std::make_pair("imfmtc1", "r16f"));
            custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
            custom_defines.push_back(std::make_pair("unfp", "mediump"));
        }
        else if (opt.use_fp16_packed)
        {
            custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
            custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
            custom_defines.push_back(std::make_pair("unfp", "mediump"));
        }
        else
        {
            custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
            custom_defines.push_back(std::make_pair("imfmtc4", "rgba32f"));
            custom_defines.push_back(std::make_pair("unfp", "highp"));
        }

        if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
        {
            custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
            custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
            custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
            custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
            custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
            custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
            custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
            custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
            custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
            custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
            custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
            custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
            custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}"));
            custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"));
            custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"));
            custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
        }
        else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
        {
            custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
            custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
            custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
            custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
            custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
            custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
            custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
            custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
            custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
            custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
            custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
            custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
            custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
        }
        else if (opt.use_fp16_storage)
        {
            custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
            custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
            custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
            custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
            custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
            custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
            custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
            custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
            custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
            custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
            custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
            custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
            custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
        }
        else if (opt.use_fp16_packed)
        {
            custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
            custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
            custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
            custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
            custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
            custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
            custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
            custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
            custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
            custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
            custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
            custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
            custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
            custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
            custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
            custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
            custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
            custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
        }
3461         else
3462         {
3463             custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3464             custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3465             custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3466             custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3467             custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3468             custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3469             custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3470             custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3471             custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3472             custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3473             custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3474             custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3475             custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3476             custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3477             custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3478             custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3479             custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3480             custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3481             custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3482             custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3483             custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3484             custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3485             custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3486             custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3487             custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3488             custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3489             custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3490         }
3491     }
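
    // Note on the *_ld8/*_st8/*_cp8 macros above: a pack8 element does not fit
    // in a single texel, so it is stored as two adjacent vec4 texels along x.
    // That is why every x coordinate is scaled by 2 (p.x*2 and p.x*2+1) while
    // y and z are left untouched, and why the loads assemble a mat2x4.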

    custom_defines.push_back(std::make_pair("psc(x)", "(x==0?p.x:x)"));

    if (opt.use_fp16_storage)
    {
        custom_defines.push_back(std::make_pair("NCNN_fp16_storage", "1"));
    }
    else if (opt.use_fp16_packed)
    {
        custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
    }

    if (opt.use_fp16_arithmetic)
    {
        custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
    }

    if (opt.use_int8_storage)
    {
        custom_defines.push_back(std::make_pair("NCNN_int8_storage", "1"));
    }
    else if (opt.use_int8_packed)
    {
        custom_defines.push_back(std::make_pair("NCNN_int8_packed", "1"));
    }

    if (opt.use_int8_arithmetic)
    {
        custom_defines.push_back(std::make_pair("NCNN_int8_arithmetic", "1"));
    }

    if (opt.use_image_storage)
    {
        custom_defines.push_back(std::make_pair("NCNN_image_shader", "1"));
    }

    if (opt.use_subgroup_basic)
    {
        custom_defines.push_back(std::make_pair("NCNN_subgroup_basic", "1"));

        if (opt.use_subgroup_vote)
        {
            custom_defines.push_back(std::make_pair("NCNN_subgroup_vote", "1"));
        }
        if (opt.use_subgroup_ballot)
        {
            custom_defines.push_back(std::make_pair("NCNN_subgroup_ballot", "1"));
        }
        if (opt.use_subgroup_shuffle)
        {
            custom_defines.push_back(std::make_pair("NCNN_subgroup_shuffle", "1"));
        }
    }
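
    // The NCNN_* macros above act as feature guards inside the layer shaders.
    // A .comp source typically brackets an optional path like this (sketch,
    // following the convention used by the shader sources):
    //
    //   #if NCNN_fp16_storage
    //   #extension GL_EXT_shader_16bit_storage: require
    //   #endif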

    std::string preamble;
    std::vector<std::string> processes;

    processes.resize(custom_defines.size());
    for (size_t i = 0; i < custom_defines.size(); i++)
    {
        const char* key = custom_defines[i].first;
        const char* def = custom_defines[i].second;

        preamble += std::string("#define ") + key + " " + def + "\n";
        processes[i] = std::string("define-macro ") + key + "=" + def;
    }
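
    // For example, the pair ("NCNN_fp16_storage", "1") contributes the line
    // "#define NCNN_fp16_storage 1" to the preamble and the entry
    // "define-macro NCNN_fp16_storage=1" to the process list that glslang
    // records in the module's debug info.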

    bool compile_success = true;

    {
        glslang::TShader s(EShLangCompute);

        s.setStringsWithLengths(&comp_data, &comp_data_size, 1);

        s.setPreamble(preamble.c_str());
        s.addProcesses(processes);
        s.setEntryPoint("main");
        s.setSourceEntryPoint("main");

        s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1);

        if (opt.use_subgroup_basic)
        {
            // subgroup ops need Vulkan 1.1 and SPIR-V 1.3
            s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_1);
            s.setEnvTarget(glslang::EShTargetSpv, glslang::EShTargetSpv_1_3);
        }
        else
        {
            s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0);
            s.setEnvTarget(glslang::EShTargetSpv, glslang::EShTargetSpv_1_0);
        }

        TBuiltInResource resources = get_default_TBuiltInResource();

        bool pr = s.parse(&resources, 100, false, EShMsgDefault);
        if (!pr)
        {
            NCNN_LOGE("compile spir-v module failed");
            NCNN_LOGE("%s", s.getInfoLog());
            NCNN_LOGE("%s", s.getInfoDebugLog());

            compile_success = false;
        }
        else
        {
            glslang::TIntermediate* ir = s.getIntermediate();
            glslang::GlslangToSpv(*ir, spirv);
        }
    }

    return compile_success ? 0 : -1;
}

int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv)
{
    if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
    {
        NCNN_LOGE("no such shader module %d", shader_type_index);
        return -1;
    }

    const char* comp_data = layer_shader_registry[shader_type_index].comp_data;
    int comp_data_size = layer_shader_registry[shader_type_index].comp_data_size;

    return compile_spirv_module(comp_data, comp_data_size, opt, spirv);
}
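
// Usage sketch (not part of this file): compiling one registered layer shader
// by its registry index. Index values come from the generated
// layer_shader_type.h; the literal 0 below is purely illustrative.
//
//   ncnn::Option opt;
//   opt.use_fp16_storage = false;
//
//   std::vector<uint32_t> spirv;
//   int ret = ncnn::compile_spirv_module(0, opt, spirv);
//   // ret == 0 on success, and spirv then holds the SPIR-V words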

int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info)
{
    shader_info.specialization_count = 0;
    shader_info.binding_count = 0;
    shader_info.push_constant_count = 0;

    // magic sentinel (stored as unsigned), overwritten when an OpName "parameter" is seen
    uint32_t parameter_id = -233;

    int specialization_count = 0;
    int binding_count = 0;
    int push_constant_count = 0;

    // id -> binding_type
    std::vector<int> id_types;

    // binding_id -> binding_type
    std::vector<int> binding_types;

    const uint32_t* p = spv_data;
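
    // SPIR-V module header layout (five 32-bit words, per the SPIR-V spec):
    //   word 0: magic number 0x07230203
    //   word 1: version
    //   word 2: generator magic
    //   word 3: bound (every result <id> in the module is smaller than this)
    //   word 4: schema (reserved, 0)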
    int bound = p[3];

    id_types.resize(bound);

    // skip the 5-word header: magic, version, generator, bound, schema
    p += 5;

    // foreach instruction
    while ((const unsigned char*)p < (const unsigned char*)spv_data + spv_data_size)
    {
        uint32_t opcode = p[0];

        uint16_t wordcount = opcode >> 16;
        uint16_t op = opcode & 0xffff;
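
        // The first word of every instruction packs the word count in the high
        // 16 bits and the opcode in the low 16 bits. For example,
        // "OpDecorate %id Binding 0" is 4 words with opcode 71, so its first
        // word is (4 << 16) | 71 == 0x00040047.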

        if (op == 5) // OpName
        {
            uint32_t id = p[1];
            const char* name = (const char*)&p[2];
            if (strcmp(name, "parameter") == 0)
            {
                parameter_id = id;
            }
        }
        else if (op == 6) // OpMemberName
        {
            uint32_t id = p[1];
            if (id == parameter_id)
            {
                push_constant_count++;
            }
        }
        else if (op == 25) // OpTypeImage
        {
            uint32_t id = p[1];
            id_types[id] = 2;
        }
        else if (op == 27) // OpTypeSampledImage
        {
            uint32_t id = p[1];
            id_types[id] = 3;
        }
        else if (op == 32) // OpTypePointer
        {
            uint32_t id = p[1];
            uint32_t storage_class = p[2];
            uint32_t type = p[3];
            if (storage_class == 0) // UniformConstant
            {
                id_types[id] = id_types[type];
            }
            if (storage_class == 2) // Uniform
            {
                id_types[id] = id_types[type];
            }
            if (storage_class == 12) // StorageBuffer
            {
                id_types[type] = 1;
                id_types[id] = id_types[type];
            }
        }
        else if (op == 59) // OpVariable
        {
            uint32_t id = p[1];
            uint32_t var_id = p[2];
            uint32_t storage_class = p[3];
            if (storage_class == 0) // UniformConstant
            {
                id_types[var_id] = id_types[id];
            }
            if (storage_class == 2) // Uniform
            {
                id_types[var_id] = id_types[id];
            }
            if (storage_class == 12) // StorageBuffer
            {
                id_types[var_id] = id_types[id];
            }
        }
        else if (op == 71) // OpDecorate
        {
            uint32_t id = p[1];
            uint32_t decoration = p[2];
            uint32_t binding_id = p[3];
            if (decoration == 1) // SpecId
            {
                specialization_count++;
            }
            if (decoration == 3) // BufferBlock
            {
                id_types[id] = 1;
            }
            else if (decoration == 33) // Binding
            {
                binding_count = std::max(binding_count, (int)binding_id + 1);

                binding_types.resize(binding_count);
                binding_types[binding_id] = id;
            }
        }

        p += wordcount;
    }

    if (binding_count > 16)
    {
        NCNN_LOGE("too many bindings %d", binding_count);
        return -1;
    }

    shader_info.specialization_count = specialization_count;
    shader_info.binding_count = binding_count;
    shader_info.push_constant_count = push_constant_count;

    // resolve binding_types
    for (int i = 0; i < binding_count; i++)
    {
        shader_info.binding_types[i] = id_types[binding_types[i]];
    }

    return 0;
}
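
// Usage sketch (not part of this file): inspecting a freshly compiled module.
// The binding_types values follow the assignments in the parser above:
// 1 = storage buffer, 2 = storage image, 3 = sampled image. Note that
// spv_data_size is in bytes, hence the * 4 on the word count.
//
//   std::vector<uint32_t> spirv;
//   ncnn::compile_spirv_module(0, opt, spirv);
//
//   ncnn::ShaderInfo si;
//   if (ncnn::resolve_shader_info(spirv.data(), spirv.size() * 4, si) == 0)
//   {
//       // si.binding_count descriptor slots, si.push_constant_count push constants
//   }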

} // namespace ncnn

#endif // NCNN_VULKAN