1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "gpu.h"
16 
17 #if NCNN_VULKAN
18 
19 #include <math.h>
20 #include <string.h>
21 #include <vulkan/vulkan.h>
22 
23 #include "glslang/SPIRV/GlslangToSpv.h"
24 #include "glslang/glslang/Public/ShaderLang.h"
25 
26 #include "command.h"
27 #include "layer.h"
28 #include "layer/vulkan/packing_vulkan.h"
29 #include "layer_type.h"
30 #include "mat.h"
31 #include "pipelinecache.h"
32 
33 // There is known issue that vkDestroyDebugUtilsMessengerEXT crash on exit when vulkan validation layer enabled
34 // upstream fix https://github.com/KhronosGroup/Vulkan-Loader/pull/539
35 #define ENABLE_VALIDATION_LAYER 0
36 
37 namespace ncnn {
38 
39 // global
40 static Mutex g_instance_lock;
41 
42 class __ncnn_vulkan_instance_holder
43 {
44 public:
__ncnn_vulkan_instance_holder()45     __ncnn_vulkan_instance_holder()
46     {
47         instance = 0;
48 #if ENABLE_VALIDATION_LAYER
49         callback = 0;
50 #endif
51     }
52 
~__ncnn_vulkan_instance_holder()53     ~__ncnn_vulkan_instance_holder()
54     {
55         destroy_gpu_instance();
56     }
57 
operator VkInstance()58     operator VkInstance()
59     {
60         return instance;
61     }
62 
63     VkInstance instance;
64 #if ENABLE_VALIDATION_LAYER
65     VkDebugUtilsMessengerEXT callback;
66 #endif
67 };
68 static __ncnn_vulkan_instance_holder g_instance;
69 
70 static int g_gpu_count = 0;
71 static int g_default_gpu_index = -1;
72 
73 // NOTE 8 is large enough i think ...
74 #define NCNN_MAX_GPU_COUNT 8
75 static GpuInfo* g_gpu_infos[NCNN_MAX_GPU_COUNT] = {0};
76 
77 // default vulkan device
78 static Mutex g_default_vkdev_lock;
79 static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0};
80 
81 struct layer_shader_registry_entry
82 {
83     const char* comp_data;
84     int comp_data_size;
85 };
86 
87 #include "layer_shader_spv_data.h"
88 
89 static const layer_shader_registry_entry layer_shader_registry[] = {
90 #include "layer_shader_registry.h"
91 };
92 
93 static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
94 
95 int support_VK_KHR_external_memory_capabilities = 0;
96 int support_VK_KHR_get_physical_device_properties2 = 0;
97 int support_VK_KHR_get_surface_capabilities2 = 0;
98 int support_VK_KHR_surface = 0;
99 int support_VK_EXT_debug_utils = 0;
100 #if __ANDROID_API__ >= 26
101 int support_VK_KHR_android_surface = 0;
102 #endif // __ANDROID_API__ >= 26
103 
104 // VK_KHR_external_memory_capabilities
105 PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0;
106 
107 // VK_KHR_get_physical_device_properties2
108 PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0;
109 PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0;
110 PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0;
111 PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0;
112 PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0;
113 PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0;
114 PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR = 0;
115 
116 // VK_KHR_get_surface_capabilities2
117 PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0;
118 PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0;
119 
120 // VK_KHR_surface
121 PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0;
122 PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0;
123 PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0;
124 PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0;
125 PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0;
126 
127 #if __ANDROID_API__ >= 26
128 // VK_KHR_android_surface
129 PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0;
130 #endif // __ANDROID_API__ >= 26
131 
132 class GpuInfoPrivate
133 {
134 public:
135     // vulkan physical device
136     VkPhysicalDevice physical_device;
137 
138     // memory properties
139     VkPhysicalDeviceMemoryProperties physical_device_memory_properties;
140 
141     // info
142     uint32_t api_version;
143     uint32_t driver_version;
144     uint32_t vendor_id;
145     uint32_t device_id;
146     char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
147     uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
148 
149     // 0 = discrete gpu
150     // 1 = integrated gpu
151     // 2 = virtual gpu
152     // 3 = cpu
153     int type;
154 
155     // hardware limit
156     uint32_t max_shared_memory_size;
157     uint32_t max_workgroup_count_x;
158     uint32_t max_workgroup_count_y;
159     uint32_t max_workgroup_count_z;
160     uint32_t max_workgroup_invocations;
161     uint32_t max_workgroup_size_x;
162     uint32_t max_workgroup_size_y;
163     uint32_t max_workgroup_size_z;
164     size_t memory_map_alignment;
165     size_t buffer_offset_alignment;
166     size_t non_coherent_atom_size;
167     size_t buffer_image_granularity;
168     uint32_t max_image_dimension_1d;
169     uint32_t max_image_dimension_2d;
170     uint32_t max_image_dimension_3d;
171     float timestamp_period;
172 
173     // runtime
174     uint32_t compute_queue_family_index;
175     uint32_t graphics_queue_family_index;
176     uint32_t transfer_queue_family_index;
177 
178     uint32_t compute_queue_count;
179     uint32_t graphics_queue_count;
180     uint32_t transfer_queue_count;
181 
182     // property
183     bool unified_compute_transfer_queue;
184 
185     // subgroup
186     uint32_t subgroup_size;
187     bool support_subgroup_basic;
188     bool support_subgroup_vote;
189     bool support_subgroup_ballot;
190     bool support_subgroup_shuffle;
191 
192     // bug is not feature
193     bool bug_storage_buffer_no_l1;
194     bool bug_corrupted_online_pipeline_cache;
195     bool bug_buffer_image_load_zero;
196 
197     // but sometimes bug is a feature
198     bool bug_implicit_fp16_arithmetic;
199 
200     // fp16 and int8 feature
201     bool support_fp16_packed;
202     bool support_fp16_storage;
203     bool support_fp16_arithmetic;
204     bool support_int8_packed;
205     bool support_int8_storage;
206     bool support_int8_arithmetic;
207 
208     // ycbcr conversion feature
209     bool support_ycbcr_conversion;
210 
211     bool support_reserved_0;
212 
213     // extension capability
214     int support_VK_KHR_8bit_storage;
215     int support_VK_KHR_16bit_storage;
216     int support_VK_KHR_bind_memory2;
217     int support_VK_KHR_create_renderpass2;
218     int support_VK_KHR_dedicated_allocation;
219     int support_VK_KHR_descriptor_update_template;
220     int support_VK_KHR_external_memory;
221     int support_VK_KHR_get_memory_requirements2;
222     int support_VK_KHR_maintenance1;
223     int support_VK_KHR_maintenance2;
224     int support_VK_KHR_maintenance3;
225     int support_VK_KHR_multiview;
226     int support_VK_KHR_push_descriptor;
227     int support_VK_KHR_sampler_ycbcr_conversion;
228     int support_VK_KHR_shader_float16_int8;
229     int support_VK_KHR_shader_float_controls;
230     int support_VK_KHR_storage_buffer_storage_class;
231     int support_VK_KHR_swapchain;
232     int support_VK_EXT_descriptor_indexing;
233     int support_VK_EXT_memory_budget;
234     int support_VK_EXT_queue_family_foreign;
235 #if __ANDROID_API__ >= 26
236     int support_VK_ANDROID_external_memory_android_hardware_buffer;
237 #endif // __ANDROID_API__ >= 26
238 };
239 
GpuInfo()240 GpuInfo::GpuInfo()
241     : d(new GpuInfoPrivate)
242 {
243 }
244 
~GpuInfo()245 GpuInfo::~GpuInfo()
246 {
247     delete d;
248 }
249 
GpuInfo(const GpuInfo &)250 GpuInfo::GpuInfo(const GpuInfo&)
251     : d(0)
252 {
253 }
254 
operator =(const GpuInfo &)255 GpuInfo& GpuInfo::operator=(const GpuInfo&)
256 {
257     return *this;
258 }
259 
physical_device() const260 VkPhysicalDevice GpuInfo::physical_device() const
261 {
262     return d->physical_device;
263 }
264 
physical_device_memory_properties() const265 const VkPhysicalDeviceMemoryProperties& GpuInfo::physical_device_memory_properties() const
266 {
267     return d->physical_device_memory_properties;
268 }
269 
api_version() const270 uint32_t GpuInfo::api_version() const
271 {
272     return d->api_version;
273 }
274 
driver_version() const275 uint32_t GpuInfo::driver_version() const
276 {
277     return d->driver_version;
278 }
279 
vendor_id() const280 uint32_t GpuInfo::vendor_id() const
281 {
282     return d->vendor_id;
283 }
284 
device_id() const285 uint32_t GpuInfo::device_id() const
286 {
287     return d->device_id;
288 }
289 
device_name() const290 const char* GpuInfo::device_name() const
291 {
292     return d->device_name;
293 }
294 
pipeline_cache_uuid() const295 uint8_t* GpuInfo::pipeline_cache_uuid() const
296 {
297     return d->pipeline_cache_uuid;
298 }
299 
type() const300 int GpuInfo::type() const
301 {
302     return d->type;
303 }
304 
max_shared_memory_size() const305 uint32_t GpuInfo::max_shared_memory_size() const
306 {
307     return d->max_shared_memory_size;
308 }
309 
max_workgroup_count_x() const310 uint32_t GpuInfo::max_workgroup_count_x() const
311 {
312     return d->max_workgroup_count_x;
313 }
314 
max_workgroup_count_y() const315 uint32_t GpuInfo::max_workgroup_count_y() const
316 {
317     return d->max_workgroup_count_y;
318 }
319 
max_workgroup_count_z() const320 uint32_t GpuInfo::max_workgroup_count_z() const
321 {
322     return d->max_workgroup_count_z;
323 }
324 
max_workgroup_invocations() const325 uint32_t GpuInfo::max_workgroup_invocations() const
326 {
327     return d->max_workgroup_invocations;
328 }
329 
max_workgroup_size_x() const330 uint32_t GpuInfo::max_workgroup_size_x() const
331 {
332     return d->max_workgroup_size_x;
333 }
334 
max_workgroup_size_y() const335 uint32_t GpuInfo::max_workgroup_size_y() const
336 {
337     return d->max_workgroup_size_y;
338 }
339 
max_workgroup_size_z() const340 uint32_t GpuInfo::max_workgroup_size_z() const
341 {
342     return d->max_workgroup_size_z;
343 }
344 
memory_map_alignment() const345 size_t GpuInfo::memory_map_alignment() const
346 {
347     return d->memory_map_alignment;
348 }
349 
buffer_offset_alignment() const350 size_t GpuInfo::buffer_offset_alignment() const
351 {
352     return d->buffer_offset_alignment;
353 }
354 
non_coherent_atom_size() const355 size_t GpuInfo::non_coherent_atom_size() const
356 {
357     return d->non_coherent_atom_size;
358 }
359 
buffer_image_granularity() const360 size_t GpuInfo::buffer_image_granularity() const
361 {
362     return d->buffer_image_granularity;
363 }
364 
max_image_dimension_1d() const365 uint32_t GpuInfo::max_image_dimension_1d() const
366 {
367     return d->max_image_dimension_1d;
368 }
369 
max_image_dimension_2d() const370 uint32_t GpuInfo::max_image_dimension_2d() const
371 {
372     return d->max_image_dimension_2d;
373 }
374 
max_image_dimension_3d() const375 uint32_t GpuInfo::max_image_dimension_3d() const
376 {
377     return d->max_image_dimension_3d;
378 }
379 
timestamp_period() const380 float GpuInfo::timestamp_period() const
381 {
382     return d->timestamp_period;
383 }
384 
compute_queue_family_index() const385 uint32_t GpuInfo::compute_queue_family_index() const
386 {
387     return d->compute_queue_family_index;
388 }
389 
graphics_queue_family_index() const390 uint32_t GpuInfo::graphics_queue_family_index() const
391 {
392     return d->graphics_queue_family_index;
393 }
394 
transfer_queue_family_index() const395 uint32_t GpuInfo::transfer_queue_family_index() const
396 {
397     return d->transfer_queue_family_index;
398 }
399 
compute_queue_count() const400 uint32_t GpuInfo::compute_queue_count() const
401 {
402     return d->compute_queue_count;
403 }
404 
graphics_queue_count() const405 uint32_t GpuInfo::graphics_queue_count() const
406 {
407     return d->graphics_queue_count;
408 }
409 
transfer_queue_count() const410 uint32_t GpuInfo::transfer_queue_count() const
411 {
412     return d->transfer_queue_count;
413 }
414 
unified_compute_transfer_queue() const415 bool GpuInfo::unified_compute_transfer_queue() const
416 {
417     return d->unified_compute_transfer_queue;
418 }
419 
subgroup_size() const420 uint32_t GpuInfo::subgroup_size() const
421 {
422     return d->subgroup_size;
423 }
424 
support_subgroup_basic() const425 bool GpuInfo::support_subgroup_basic() const
426 {
427     return d->support_subgroup_basic;
428 }
429 
support_subgroup_vote() const430 bool GpuInfo::support_subgroup_vote() const
431 {
432     return d->support_subgroup_vote;
433 }
434 
support_subgroup_ballot() const435 bool GpuInfo::support_subgroup_ballot() const
436 {
437     return d->support_subgroup_ballot;
438 }
439 
support_subgroup_shuffle() const440 bool GpuInfo::support_subgroup_shuffle() const
441 {
442     return d->support_subgroup_shuffle;
443 }
444 
bug_storage_buffer_no_l1() const445 bool GpuInfo::bug_storage_buffer_no_l1() const
446 {
447     return d->bug_storage_buffer_no_l1;
448 }
449 
bug_corrupted_online_pipeline_cache() const450 bool GpuInfo::bug_corrupted_online_pipeline_cache() const
451 {
452     return d->bug_corrupted_online_pipeline_cache;
453 }
454 
bug_buffer_image_load_zero() const455 bool GpuInfo::bug_buffer_image_load_zero() const
456 {
457     return d->bug_buffer_image_load_zero;
458 }
459 
bug_implicit_fp16_arithmetic() const460 bool GpuInfo::bug_implicit_fp16_arithmetic() const
461 {
462     return d->bug_implicit_fp16_arithmetic;
463 }
464 
support_fp16_packed() const465 bool GpuInfo::support_fp16_packed() const
466 {
467     return d->support_fp16_packed;
468 }
469 
support_fp16_storage() const470 bool GpuInfo::support_fp16_storage() const
471 {
472     return d->support_fp16_storage;
473 }
474 
support_fp16_arithmetic() const475 bool GpuInfo::support_fp16_arithmetic() const
476 {
477     return d->support_fp16_arithmetic;
478 }
479 
support_int8_packed() const480 bool GpuInfo::support_int8_packed() const
481 {
482     return d->support_int8_packed;
483 }
484 
support_int8_storage() const485 bool GpuInfo::support_int8_storage() const
486 {
487     return d->support_int8_storage;
488 }
489 
support_int8_arithmetic() const490 bool GpuInfo::support_int8_arithmetic() const
491 {
492     return d->support_int8_arithmetic;
493 }
494 
support_ycbcr_conversion() const495 bool GpuInfo::support_ycbcr_conversion() const
496 {
497     return d->support_ycbcr_conversion;
498 }
499 
support_VK_KHR_8bit_storage() const500 int GpuInfo::support_VK_KHR_8bit_storage() const
501 {
502     return d->support_VK_KHR_8bit_storage;
503 }
504 
support_VK_KHR_16bit_storage() const505 int GpuInfo::support_VK_KHR_16bit_storage() const
506 {
507     return d->support_VK_KHR_16bit_storage;
508 }
509 
support_VK_KHR_bind_memory2() const510 int GpuInfo::support_VK_KHR_bind_memory2() const
511 {
512     return d->support_VK_KHR_bind_memory2;
513 }
514 
support_VK_KHR_create_renderpass2() const515 int GpuInfo::support_VK_KHR_create_renderpass2() const
516 {
517     return d->support_VK_KHR_create_renderpass2;
518 }
519 
support_VK_KHR_dedicated_allocation() const520 int GpuInfo::support_VK_KHR_dedicated_allocation() const
521 {
522     return d->support_VK_KHR_dedicated_allocation;
523 }
524 
support_VK_KHR_descriptor_update_template() const525 int GpuInfo::support_VK_KHR_descriptor_update_template() const
526 {
527     return d->support_VK_KHR_descriptor_update_template;
528 }
529 
support_VK_KHR_external_memory() const530 int GpuInfo::support_VK_KHR_external_memory() const
531 {
532     return d->support_VK_KHR_external_memory;
533 }
534 
support_VK_KHR_get_memory_requirements2() const535 int GpuInfo::support_VK_KHR_get_memory_requirements2() const
536 {
537     return d->support_VK_KHR_get_memory_requirements2;
538 }
539 
support_VK_KHR_maintenance1() const540 int GpuInfo::support_VK_KHR_maintenance1() const
541 {
542     return d->support_VK_KHR_maintenance1;
543 }
544 
support_VK_KHR_maintenance2() const545 int GpuInfo::support_VK_KHR_maintenance2() const
546 {
547     return d->support_VK_KHR_maintenance2;
548 }
549 
support_VK_KHR_maintenance3() const550 int GpuInfo::support_VK_KHR_maintenance3() const
551 {
552     return d->support_VK_KHR_maintenance3;
553 }
554 
support_VK_KHR_multiview() const555 int GpuInfo::support_VK_KHR_multiview() const
556 {
557     return d->support_VK_KHR_multiview;
558 }
559 
support_VK_KHR_push_descriptor() const560 int GpuInfo::support_VK_KHR_push_descriptor() const
561 {
562     return d->support_VK_KHR_push_descriptor;
563 }
564 
support_VK_KHR_sampler_ycbcr_conversion() const565 int GpuInfo::support_VK_KHR_sampler_ycbcr_conversion() const
566 {
567     return d->support_VK_KHR_sampler_ycbcr_conversion;
568 }
569 
support_VK_KHR_shader_float16_int8() const570 int GpuInfo::support_VK_KHR_shader_float16_int8() const
571 {
572     return d->support_VK_KHR_shader_float16_int8;
573 }
574 
support_VK_KHR_shader_float_controls() const575 int GpuInfo::support_VK_KHR_shader_float_controls() const
576 {
577     return d->support_VK_KHR_shader_float_controls;
578 }
579 
support_VK_KHR_storage_buffer_storage_class() const580 int GpuInfo::support_VK_KHR_storage_buffer_storage_class() const
581 {
582     return d->support_VK_KHR_storage_buffer_storage_class;
583 }
584 
support_VK_KHR_swapchain() const585 int GpuInfo::support_VK_KHR_swapchain() const
586 {
587     return d->support_VK_KHR_swapchain;
588 }
589 
support_VK_EXT_descriptor_indexing() const590 int GpuInfo::support_VK_EXT_descriptor_indexing() const
591 {
592     return d->support_VK_EXT_descriptor_indexing;
593 }
594 
support_VK_EXT_memory_budget() const595 int GpuInfo::support_VK_EXT_memory_budget() const
596 {
597     return d->support_VK_EXT_memory_budget;
598 }
599 
support_VK_EXT_queue_family_foreign() const600 int GpuInfo::support_VK_EXT_queue_family_foreign() const
601 {
602     return d->support_VK_EXT_queue_family_foreign;
603 }
604 
605 #if __ANDROID_API__ >= 26
support_VK_ANDROID_external_memory_android_hardware_buffer() const606 int GpuInfo::support_VK_ANDROID_external_memory_android_hardware_buffer() const
607 {
608     return d->support_VK_ANDROID_external_memory_android_hardware_buffer;
609 }
610 #endif // __ANDROID_API__ >= 26
611 
init_instance_extension()612 static int init_instance_extension()
613 {
614     if (support_VK_KHR_external_memory_capabilities)
615     {
616         vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR");
617     }
618 
619     if (support_VK_KHR_get_physical_device_properties2)
620     {
621         vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR");
622         vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR");
623         vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR");
624         vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR");
625         vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR");
626         vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR");
627         vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR");
628     }
629 
630     if (support_VK_KHR_get_surface_capabilities2)
631     {
632         vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR");
633         vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR");
634     }
635 
636     if (support_VK_KHR_surface)
637     {
638         vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR");
639         vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR");
640         vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR");
641         vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR");
642         vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR");
643     }
644 
645 #if __ANDROID_API__ >= 26
646     if (support_VK_KHR_android_surface)
647     {
648         vkCreateAndroidSurfaceKHR = (PFN_vkCreateAndroidSurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkCreateAndroidSurfaceKHR");
649     }
650 #endif // __ANDROID_API__ >= 26
651 
652     return 0;
653 }
654 
655 #if ENABLE_VALIDATION_LAYER
debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT,VkDebugUtilsMessageTypeFlagsEXT,const VkDebugUtilsMessengerCallbackDataEXT * pCallbackData,void *)656 static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
657     VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/,
658     VkDebugUtilsMessageTypeFlagsEXT /*messageType*/,
659     const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData,
660     void* /*pUserData*/)
661 {
662     NCNN_LOGE("validation layer: %s", pCallbackData->pMessage);
663 
664     return VK_FALSE;
665 }
666 
CreateDebugUtilsMessengerEXT(VkInstance instance,const VkDebugUtilsMessengerCreateInfoEXT * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkDebugUtilsMessengerEXT * pCallback)667 static VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback)
668 {
669     PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT");
670     if (func)
671         return func(instance, pCreateInfo, pAllocator, pCallback);
672 
673     return VK_ERROR_EXTENSION_NOT_PRESENT;
674 }
675 
DestroyDebugUtilsMessengerEXT(VkInstance instance,VkDebugUtilsMessengerEXT callback,const VkAllocationCallbacks * pAllocator)676 static void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator)
677 {
678     PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT");
679     if (func)
680         func(instance, callback, pAllocator);
681 }
682 #endif // ENABLE_VALIDATION_LAYER
683 
find_device_compute_queue(const std::vector<VkQueueFamilyProperties> & queueFamilyProperties)684 static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
685 {
686     // first try, compute only queue
687     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
688     {
689         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
690 
691         if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
692                 && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
693         {
694             return i;
695         }
696     }
697 
698     // second try, any queue with compute and graphics
699     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
700     {
701         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
702 
703         if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
704                 && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
705         {
706             return i;
707         }
708     }
709 
710     // third try, any queue with compute
711     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
712     {
713         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
714 
715         if (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
716         {
717             return i;
718         }
719     }
720 
721     //     NCNN_LOGE("no compute queue");
722     return -1;
723 }
724 
find_device_graphics_queue(const std::vector<VkQueueFamilyProperties> & queueFamilyProperties)725 static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
726 {
727     // first try, graphics only queue
728     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
729     {
730         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
731 
732         if ((queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
733                 && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT))
734         {
735             return i;
736         }
737     }
738 
739     // second try, any queue with graphics and compute
740     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
741     {
742         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
743 
744         if ((queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
745                 && (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT))
746         {
747             return i;
748         }
749     }
750 
751     // third try, any queue with graphics
752     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
753     {
754         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
755 
756         if (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
757         {
758             return i;
759         }
760     }
761 
762     //     NCNN_LOGE("no graphics queue");
763     return -1;
764 }
765 
find_device_transfer_queue(const std::vector<VkQueueFamilyProperties> & queueFamilyProperties)766 static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
767 {
768     // first try, transfer only queue
769     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
770     {
771         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
772 
773         if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
774                 && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
775                 && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
776         {
777             return i;
778         }
779     }
780 
781     // second try, any queue with transfer
782     for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
783     {
784         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
785 
786         if (queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
787         {
788             return i;
789         }
790     }
791 
792     // third try, use compute queue
793     uint32_t compute_queue_index = find_device_compute_queue(queueFamilyProperties);
794     if (compute_queue_index != (uint32_t)-1)
795     {
796         return compute_queue_index;
797     }
798 
799     // fourth try, use graphics queue
800     uint32_t graphics_queue_index = find_device_graphics_queue(queueFamilyProperties);
801     if (graphics_queue_index != (uint32_t)-1)
802     {
803         return graphics_queue_index;
804     }
805 
806     //     NCNN_LOGE("no transfer queue");
807     return -1;
808 }
809 
find_default_vulkan_device_index()810 static int find_default_vulkan_device_index()
811 {
812     // first try, discrete gpu
813     for (int i = 0; i < g_gpu_count; i++)
814     {
815         if (g_gpu_infos[i]->type() == 0)
816             return i;
817     }
818 
819     // second try, integrated gpu
820     for (int i = 0; i < g_gpu_count; i++)
821     {
822         if (g_gpu_infos[i]->type() == 1)
823             return i;
824     }
825 
826     // third try, any probed device
827     if (g_gpu_count > 0)
828         return 0;
829 
830     NCNN_LOGE("no vulkan device");
831     return -1;
832 }
833 
create_gpu_instance()834 int create_gpu_instance()
835 {
836     MutexLockGuard lock(g_instance_lock);
837 
838     if ((VkInstance)g_instance != 0)
839         return 0;
840 
841     // NCNN_LOGE("create_gpu_instance");
842 
843     VkResult ret;
844 
845     std::vector<const char*> enabledLayers;
846 
847 #if ENABLE_VALIDATION_LAYER
848     uint32_t instanceLayerPropertyCount;
849     ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL);
850     if (ret != VK_SUCCESS)
851     {
852         NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
853         return -1;
854     }
855 
856     std::vector<VkLayerProperties> instanceLayerProperties(instanceLayerPropertyCount);
857     ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data());
858     if (ret != VK_SUCCESS)
859     {
860         NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
861         return -1;
862     }
863 
864     for (uint32_t i = 0; i < instanceLayerPropertyCount; i++)
865     {
866         const VkLayerProperties& lp = instanceLayerProperties[i];
867         //         NCNN_LOGE("instance layer %s = %u", lp.layerName, lp.implementationVersion);
868 
869         if (strcmp(lp.layerName, "VK_LAYER_LUNARG_standard_validation") == 0)
870         {
871             enabledLayers.push_back("VK_LAYER_LUNARG_standard_validation");
872         }
873         if (strcmp(lp.layerName, "VK_LAYER_LUNARG_parameter_validation") == 0)
874         {
875             enabledLayers.push_back("VK_LAYER_LUNARG_parameter_validation");
876         }
877         if (strcmp(lp.layerName, "VK_LAYER_KHRONOS_validation") == 0)
878         {
879             enabledLayers.push_back("VK_LAYER_KHRONOS_validation");
880         }
881     }
882 #endif // ENABLE_VALIDATION_LAYER
883 
884     std::vector<const char*> enabledExtensions;
885 
886     uint32_t instanceExtensionPropertyCount;
887     ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL);
888     if (ret != VK_SUCCESS)
889     {
890         NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
891         return -1;
892     }
893 
894     std::vector<VkExtensionProperties> instanceExtensionProperties(instanceExtensionPropertyCount);
895     ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data());
896     if (ret != VK_SUCCESS)
897     {
898         NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
899         return -1;
900     }
901 
902     support_VK_KHR_get_physical_device_properties2 = 0;
903     support_VK_KHR_get_surface_capabilities2 = 0;
904     support_VK_KHR_surface = 0;
905     support_VK_EXT_debug_utils = 0;
906 #if __ANDROID_API__ >= 26
907     support_VK_KHR_android_surface = 0;
908 #endif // __ANDROID_API__ >= 26
909     for (uint32_t j = 0; j < instanceExtensionPropertyCount; j++)
910     {
911         const VkExtensionProperties& exp = instanceExtensionProperties[j];
912         //         NCNN_LOGE("instance extension %s = %u", exp.extensionName, exp.specVersion);
913 
914         if (strcmp(exp.extensionName, "VK_KHR_external_memory_capabilities") == 0)
915             support_VK_KHR_external_memory_capabilities = exp.specVersion;
916         else if (strcmp(exp.extensionName, "VK_KHR_get_physical_device_properties2") == 0)
917             support_VK_KHR_get_physical_device_properties2 = exp.specVersion;
918         else if (strcmp(exp.extensionName, "VK_KHR_get_surface_capabilities2") == 0)
919             support_VK_KHR_get_surface_capabilities2 = exp.specVersion;
920         else if (strcmp(exp.extensionName, "VK_KHR_surface") == 0)
921             support_VK_KHR_surface = exp.specVersion;
922         else if (strcmp(exp.extensionName, "VK_EXT_debug_utils") == 0)
923             support_VK_EXT_debug_utils = exp.specVersion;
924 #if __ANDROID_API__ >= 26
925         else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0)
926             support_VK_KHR_android_surface = exp.specVersion;
927 #endif // __ANDROID_API__ >= 26
928     }
929 
930     if (support_VK_KHR_external_memory_capabilities)
931         enabledExtensions.push_back("VK_KHR_external_memory_capabilities");
932     if (support_VK_KHR_get_physical_device_properties2)
933         enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
934     if (support_VK_KHR_get_surface_capabilities2)
935         enabledExtensions.push_back("VK_KHR_get_surface_capabilities2");
936     if (support_VK_KHR_surface)
937         enabledExtensions.push_back("VK_KHR_surface");
938 #if ENABLE_VALIDATION_LAYER
939     if (support_VK_EXT_debug_utils)
940         enabledExtensions.push_back("VK_EXT_debug_utils");
941 #endif // ENABLE_VALIDATION_LAYER
942 #if __ANDROID_API__ >= 26
943     if (support_VK_KHR_android_surface)
944         enabledExtensions.push_back("VK_KHR_android_surface");
945 #endif // __ANDROID_API__ >= 26
946 
947     uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
948     typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
949     PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
950     if (vkEnumerateInstanceVersion)
951     {
952         ret = vkEnumerateInstanceVersion(&instance_api_version);
953         if (ret != VK_SUCCESS)
954         {
955             NCNN_LOGE("vkEnumerateInstanceVersion failed %d", ret);
956             return -1;
957         }
958     }
959 
960     // NCNN_LOGE("instance apiVersion = %u.%u.%u", VK_VERSION_MAJOR(instance_api_version), VK_VERSION_MINOR(instance_api_version), VK_VERSION_PATCH(instance_api_version));
961 
962     VkApplicationInfo applicationInfo;
963     applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
964     applicationInfo.pNext = 0;
965     applicationInfo.pApplicationName = "ncnn";
966     applicationInfo.applicationVersion = 0;
967     applicationInfo.pEngineName = "ncnn";
968     applicationInfo.engineVersion = 20201010;
969     applicationInfo.apiVersion = instance_api_version;
970 
971     VkInstanceCreateInfo instanceCreateInfo;
972     instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
973     instanceCreateInfo.pNext = 0;
974     instanceCreateInfo.flags = 0;
975     instanceCreateInfo.pApplicationInfo = &applicationInfo;
976     instanceCreateInfo.enabledLayerCount = enabledLayers.size();
977     instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data();
978     instanceCreateInfo.enabledExtensionCount = enabledExtensions.size();
979     instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
980 
981     VkInstance instance = 0;
982     ret = vkCreateInstance(&instanceCreateInfo, 0, &instance);
983     if (ret != VK_SUCCESS)
984     {
985         NCNN_LOGE("vkCreateInstance failed %d", ret);
986         return -1;
987     }
988 
989     g_instance.instance = instance;
990 
991 #if ENABLE_VALIDATION_LAYER
992     if (support_VK_EXT_debug_utils)
993     {
994         VkDebugUtilsMessengerCreateInfoEXT createInfo = {};
995         createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
996         createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
997         createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
998         createInfo.pfnUserCallback = debugCallback;
999         createInfo.pUserData = 0;
1000         ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &g_instance.callback);
1001         if (ret != VK_SUCCESS)
1002         {
1003             NCNN_LOGE("CreateDebugUtilsMessengerEXT failed %d", ret);
1004             return -1;
1005         }
1006     }
1007 #endif // ENABLE_VALIDATION_LAYER
1008 
1009     init_instance_extension();
1010 
1011     uint32_t physicalDeviceCount = 0;
1012     ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0);
1013     if (ret != VK_SUCCESS)
1014     {
1015         NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
1016         return -1;
1017     }
1018 
1019     if (physicalDeviceCount > NCNN_MAX_GPU_COUNT)
1020         physicalDeviceCount = NCNN_MAX_GPU_COUNT;
1021 
1022     std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);
1023 
1024     ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data());
1025     if (ret != VK_SUCCESS)
1026     {
1027         NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
1028         return -1;
1029     }
1030 
1031     // find proper device and queue
1032     int gpu_info_index = 0;
1033     for (uint32_t i = 0; i < physicalDeviceCount; i++)
1034     {
1035         const VkPhysicalDevice& physicalDevice = physicalDevices[i];
1036         delete g_gpu_infos[gpu_info_index];
1037         g_gpu_infos[gpu_info_index] = new GpuInfo;
1038         GpuInfoPrivate& gpu_info = *(g_gpu_infos[gpu_info_index]->d);
1039 
1040         // device type
1041         VkPhysicalDeviceProperties physicalDeviceProperties;
1042         vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties);
1043 
1044         //         NCNN_LOGE("[%u] apiVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion),
1045         //             VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion));
1046         //         NCNN_LOGE("[%u] driverVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion),
1047         //             VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion));
1048         //         NCNN_LOGE("[%u] vendorID = %x", i, physicalDeviceProperties.vendorID);
1049         //         NCNN_LOGE("[%u] deviceID = %x", i, physicalDeviceProperties.deviceID);
1050         //         NCNN_LOGE("[%u] deviceType = %x", i, physicalDeviceProperties.deviceType);
1051         //         NCNN_LOGE("[%u] deviceName = %s", i, physicalDeviceProperties.deviceName);
1052         //         NCNN_LOGE("[%u] pipelineCacheUUID = %u", i, physicalDeviceProperties.pipelineCacheUUID);
1053 
1054         // mali
1055         // t760 = 0x13b5 0x7500001
1056         // t860 = 0x13b5 0x8602000
1057         // t880 = 0x13b5 0x8800020
1058         // g51  = 0x13b5 0x70901010
1059         // g52  = 0x13b5 0x74021000
1060         // g71  = 0x13b5 0x60a00002
1061         // g72  = 0x13b5 0x62210001
1062         // g76  = 0x13b5 0x72110000
1063         // g77  = 0x13b5 0x90800011
1064 
1065         // adreno
1066         // 506 = 0x5143 0x5000600
1067         // 510 = 0x5143 0x5010000
1068         // 512 = 0x5143 0x5010200
1069         // 530 = 0x5143 0x5030004
1070         // 540 = 0x5143 0x5040001
1071         // 616 = 0x5143 0x6010600
1072         // 630 = 0x5143 0x6030001
1073         // 640 = 0x5143 0x6040001
1074         // 650 = 0x5143 0x6050002
1075 
1076         gpu_info.bug_storage_buffer_no_l1 = false;
1077         gpu_info.bug_corrupted_online_pipeline_cache = false;
1078         gpu_info.bug_implicit_fp16_arithmetic = false;
1079 
1080         if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 66))
1081         {
1082             // qcom adreno with old buggy driver cannot share created pipeline properly
1083             gpu_info.bug_corrupted_online_pipeline_cache = true;
1084         }
1085 
1086         if (physicalDeviceProperties.vendorID == 0x5143 && !(physicalDeviceProperties.deviceID == 0x6040001 || physicalDeviceProperties.deviceID == 0x6050002))
1087         {
1088             // NOTE but qcom855/qcom855plus/qcom865 are known exceptions
1089             // qcom adreno storage buffer without L1 cache
1090             gpu_info.bug_storage_buffer_no_l1 = true;
1091         }
1092 
1093         if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 1, 87))
1094         {
1095             // HACK buffer2image before image-read dependency does not work properly
1096             // even promised with full image memory barrier on old adreno driver
1097             // TODO figure out a proper workaround without hurt speed too much
1098             // TODO only for old drivers
1099             gpu_info.bug_buffer_image_load_zero = true;
1100         }
1101 
1102         if (physicalDeviceProperties.vendorID == 0x13b5
1103                 && (physicalDeviceProperties.deviceID == 0x7500001
1104                     || physicalDeviceProperties.deviceID == 0x8602000
1105                     || physicalDeviceProperties.deviceID == 0x8800020
1106                     || physicalDeviceProperties.deviceID == 0x70901010
1107                     || physicalDeviceProperties.deviceID == 0x74021000
1108                     || physicalDeviceProperties.deviceID == 0x60a00002
1109                     || physicalDeviceProperties.deviceID == 0x62210001))
1110         {
1111             // NOTE rk3288/rk3399/t880/g51/g52/g71/g72
1112             // however, g76/g77 has explicit fp16 arithmetic
1113             // arm mali driver accept spirv with fp16 arithmetic
1114             gpu_info.bug_implicit_fp16_arithmetic = true;
1115         }
1116 
1117         if (physicalDeviceProperties.vendorID == 0x5143
1118                 && (physicalDeviceProperties.deviceID == 0x6030001
1119                     || physicalDeviceProperties.deviceID == 0x6040001
1120                     || physicalDeviceProperties.deviceID == 0x6050002))
1121         {
1122             // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
1123             // qcom adreno driver accept spirv with fp16 arithmetic
1124             gpu_info.bug_implicit_fp16_arithmetic = true;
1125         }
1126 
1127         gpu_info.physical_device = physicalDevice;
1128 
1129         // info
1130         gpu_info.api_version = physicalDeviceProperties.apiVersion;
1131         gpu_info.driver_version = physicalDeviceProperties.driverVersion;
1132         gpu_info.vendor_id = physicalDeviceProperties.vendorID;
1133         gpu_info.device_id = physicalDeviceProperties.deviceID;
1134         memcpy(gpu_info.device_name, physicalDeviceProperties.deviceName, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
1135         memcpy(gpu_info.pipeline_cache_uuid, physicalDeviceProperties.pipelineCacheUUID, VK_UUID_SIZE);
1136 
1137         if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)
1138             gpu_info.type = 0;
1139         else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
1140             gpu_info.type = 1;
1141         else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU)
1142             gpu_info.type = 2;
1143         else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU)
1144             gpu_info.type = 3;
1145         else
1146             gpu_info.type = -1;
1147 
1148         // device capability
1149         gpu_info.max_shared_memory_size = physicalDeviceProperties.limits.maxComputeSharedMemorySize;
1150 
1151         gpu_info.max_workgroup_count_x = physicalDeviceProperties.limits.maxComputeWorkGroupCount[0];
1152         gpu_info.max_workgroup_count_y = physicalDeviceProperties.limits.maxComputeWorkGroupCount[1];
1153         gpu_info.max_workgroup_count_z = physicalDeviceProperties.limits.maxComputeWorkGroupCount[2];
1154 
1155         gpu_info.max_workgroup_invocations = physicalDeviceProperties.limits.maxComputeWorkGroupInvocations;
1156 
1157         gpu_info.max_workgroup_size_x = physicalDeviceProperties.limits.maxComputeWorkGroupSize[0];
1158         gpu_info.max_workgroup_size_y = physicalDeviceProperties.limits.maxComputeWorkGroupSize[1];
1159         gpu_info.max_workgroup_size_z = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2];
1160 
1161         gpu_info.memory_map_alignment = physicalDeviceProperties.limits.minMemoryMapAlignment;
1162         gpu_info.buffer_offset_alignment = physicalDeviceProperties.limits.minStorageBufferOffsetAlignment;
1163         gpu_info.non_coherent_atom_size = physicalDeviceProperties.limits.nonCoherentAtomSize;
1164         gpu_info.buffer_image_granularity = physicalDeviceProperties.limits.bufferImageGranularity;
1165         gpu_info.max_image_dimension_1d = physicalDeviceProperties.limits.maxImageDimension1D;
1166         gpu_info.max_image_dimension_2d = physicalDeviceProperties.limits.maxImageDimension2D;
1167         gpu_info.max_image_dimension_3d = physicalDeviceProperties.limits.maxImageDimension3D;
1168 
1169         gpu_info.timestamp_period = physicalDeviceProperties.limits.timestampPeriod;
1170 
1171         //         NCNN_LOGE("[%u] max_shared_memory_size = %u", i, gpu_info.max_shared_memory_size);
1172         //         NCNN_LOGE("[%u] max_workgroup_count = %u %u %u", i, gpu_info.max_workgroup_count[0], gpu_info.max_workgroup_count[1], gpu_info.max_workgroup_count[2]);
1173         //         NCNN_LOGE("[%u] max_workgroup_invocations = %u", i, gpu_info.max_workgroup_invocations);
1174         //         NCNN_LOGE("[%u] max_workgroup_size = %u %u %u", i, gpu_info.max_workgroup_size[0], gpu_info.max_workgroup_size[1], gpu_info.max_workgroup_size[2]);
1175         //         NCNN_LOGE("[%u] memory_map_alignment = %lu", i, gpu_info.memory_map_alignment);
1176         //         NCNN_LOGE("[%u] buffer_offset_alignment = %lu", i, gpu_info.buffer_offset_alignment);
1177 
1178         // find compute queue
1179         uint32_t queueFamilyPropertiesCount;
1180         vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0);
1181 
1182         std::vector<VkQueueFamilyProperties> queueFamilyProperties(queueFamilyPropertiesCount);
1183         vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data());
1184 
1185         gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties);
1186         gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties);
1187         gpu_info.transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties);
1188 
1189         gpu_info.compute_queue_count = queueFamilyProperties[gpu_info.compute_queue_family_index].queueCount;
1190         gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount;
1191         gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount;
1192 
1193         gpu_info.unified_compute_transfer_queue = gpu_info.compute_queue_family_index == gpu_info.transfer_queue_family_index;
1194 
1195         // additional device properties
1196         gpu_info.subgroup_size = 64;
1197         gpu_info.support_subgroup_basic = false;
1198         gpu_info.support_subgroup_vote = false;
1199         gpu_info.support_subgroup_ballot = false;
1200         gpu_info.support_subgroup_shuffle = false;
1201         if (support_VK_KHR_get_physical_device_properties2)
1202         {
1203             void* queryDeviceProperties = 0;
1204 
1205             // query subgroup
1206             VkPhysicalDeviceSubgroupProperties physicalDeviceSubgroupProperties;
1207             physicalDeviceSubgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
1208             physicalDeviceSubgroupProperties.pNext = queryDeviceProperties;
1209             if (VK_VERSION_MAJOR(instance_api_version) >= 1 && VK_VERSION_MINOR(instance_api_version) >= 1)
1210             {
1211                 queryDeviceProperties = &physicalDeviceSubgroupProperties;
1212             }
1213 
1214             VkPhysicalDeviceProperties2KHR queryProperties;
1215             queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
1216             queryProperties.pNext = queryDeviceProperties;
1217 
1218             vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);
1219 
1220             if (VK_VERSION_MAJOR(instance_api_version) >= 1 && VK_VERSION_MINOR(instance_api_version) >= 1)
1221             {
1222                 gpu_info.subgroup_size = physicalDeviceSubgroupProperties.subgroupSize;
1223                 if (physicalDeviceSubgroupProperties.supportedStages & VK_SHADER_STAGE_COMPUTE_BIT)
1224                 {
1225                     gpu_info.support_subgroup_basic = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT;
1226                     gpu_info.support_subgroup_vote = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_VOTE_BIT;
1227                     gpu_info.support_subgroup_ballot = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT;
1228                     gpu_info.support_subgroup_shuffle = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_SHUFFLE_BIT;
1229                 }
1230             }
1231             else
1232             {
1233                 if (physicalDeviceProperties.vendorID == 0x5143) // qcom adreno prefer very large workgroup :P
1234                     gpu_info.subgroup_size = 128;
1235                 if (physicalDeviceProperties.vendorID == 0x13b5) // arm mali
1236                     gpu_info.subgroup_size = 16;
1237                 if (physicalDeviceProperties.vendorID == 0x1010) // imgtec powervr
1238                     gpu_info.subgroup_size = 32;
1239                 if (physicalDeviceProperties.vendorID == 0x1002) // amd
1240                     gpu_info.subgroup_size = 64;
1241                 if (physicalDeviceProperties.vendorID == 0x10de) // nvidia
1242                     gpu_info.subgroup_size = 32;
1243                 if (physicalDeviceProperties.vendorID == 0x8086) // intel
1244                     gpu_info.subgroup_size = 32;
1245             }
1246         }
1247 
1248         // cache memory properties
1249         vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physical_device_memory_properties);
1250 
1251         // get device extension
1252         uint32_t deviceExtensionPropertyCount = 0;
1253         ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL);
1254         if (ret != VK_SUCCESS)
1255         {
1256             NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret);
1257             return -1;
1258         }
1259 
1260         std::vector<VkExtensionProperties> deviceExtensionProperties(deviceExtensionPropertyCount);
1261         ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data());
1262         if (ret != VK_SUCCESS)
1263         {
1264             NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret);
1265             return -1;
1266         }
1267 
1268         // extension capability
1269         gpu_info.support_VK_KHR_8bit_storage = 0;
1270         gpu_info.support_VK_KHR_16bit_storage = 0;
1271         gpu_info.support_VK_KHR_bind_memory2 = 0;
1272         gpu_info.support_VK_KHR_create_renderpass2 = 0;
1273         gpu_info.support_VK_KHR_dedicated_allocation = 0;
1274         gpu_info.support_VK_KHR_descriptor_update_template = 0;
1275         gpu_info.support_VK_KHR_external_memory = 0;
1276         gpu_info.support_VK_KHR_get_memory_requirements2 = 0;
1277         gpu_info.support_VK_KHR_maintenance1 = 0;
1278         gpu_info.support_VK_KHR_maintenance2 = 0;
1279         gpu_info.support_VK_KHR_maintenance3 = 0;
1280         gpu_info.support_VK_KHR_multiview = 0;
1281         gpu_info.support_VK_KHR_push_descriptor = 0;
1282         gpu_info.support_VK_KHR_sampler_ycbcr_conversion = 0;
1283         gpu_info.support_VK_KHR_shader_float16_int8 = 0;
1284         gpu_info.support_VK_KHR_shader_float_controls = 0;
1285         gpu_info.support_VK_KHR_storage_buffer_storage_class = 0;
1286         gpu_info.support_VK_KHR_swapchain = 0;
1287         gpu_info.support_VK_EXT_descriptor_indexing = 0;
1288         gpu_info.support_VK_EXT_memory_budget = 0;
1289         gpu_info.support_VK_EXT_queue_family_foreign = 0;
1290 #if __ANDROID_API__ >= 26
1291         gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0;
1292 #endif // __ANDROID_API__ >= 26
1293         for (uint32_t j = 0; j < deviceExtensionPropertyCount; j++)
1294         {
1295             const VkExtensionProperties& exp = deviceExtensionProperties[j];
1296             // NCNN_LOGE("device extension %s = %u", exp.extensionName, exp.specVersion);
1297 
1298             if (strcmp(exp.extensionName, "VK_KHR_8bit_storage") == 0)
1299                 gpu_info.support_VK_KHR_8bit_storage = exp.specVersion;
1300             else if (strcmp(exp.extensionName, "VK_KHR_16bit_storage") == 0)
1301                 gpu_info.support_VK_KHR_16bit_storage = exp.specVersion;
1302             else if (strcmp(exp.extensionName, "VK_KHR_bind_memory2") == 0)
1303                 gpu_info.support_VK_KHR_bind_memory2 = exp.specVersion;
1304             else if (strcmp(exp.extensionName, "VK_KHR_create_renderpass2") == 0)
1305                 gpu_info.support_VK_KHR_create_renderpass2 = exp.specVersion;
1306             else if (strcmp(exp.extensionName, "VK_KHR_dedicated_allocation") == 0)
1307                 gpu_info.support_VK_KHR_dedicated_allocation = exp.specVersion;
1308             else if (strcmp(exp.extensionName, "VK_KHR_descriptor_update_template") == 0)
1309                 gpu_info.support_VK_KHR_descriptor_update_template = exp.specVersion;
1310             else if (strcmp(exp.extensionName, "VK_KHR_external_memory") == 0)
1311                 gpu_info.support_VK_KHR_external_memory = exp.specVersion;
1312             else if (strcmp(exp.extensionName, "VK_KHR_get_memory_requirements2") == 0)
1313                 gpu_info.support_VK_KHR_get_memory_requirements2 = exp.specVersion;
1314             else if (strcmp(exp.extensionName, "VK_KHR_maintenance1") == 0)
1315                 gpu_info.support_VK_KHR_maintenance1 = exp.specVersion;
1316             else if (strcmp(exp.extensionName, "VK_KHR_maintenance2") == 0)
1317                 gpu_info.support_VK_KHR_maintenance2 = exp.specVersion;
1318             else if (strcmp(exp.extensionName, "VK_KHR_maintenance3") == 0)
1319                 gpu_info.support_VK_KHR_maintenance3 = exp.specVersion;
1320             else if (strcmp(exp.extensionName, "VK_KHR_multiview") == 0)
1321                 gpu_info.support_VK_KHR_multiview = exp.specVersion;
1322             else if (strcmp(exp.extensionName, "VK_KHR_push_descriptor") == 0)
1323                 gpu_info.support_VK_KHR_push_descriptor = exp.specVersion;
1324             else if (strcmp(exp.extensionName, "VK_KHR_sampler_ycbcr_conversion") == 0)
1325                 gpu_info.support_VK_KHR_sampler_ycbcr_conversion = exp.specVersion;
1326             else if (strcmp(exp.extensionName, "VK_KHR_shader_float16_int8") == 0)
1327                 gpu_info.support_VK_KHR_shader_float16_int8 = exp.specVersion;
1328             else if (strcmp(exp.extensionName, "VK_KHR_shader_float_controls") == 0)
1329                 gpu_info.support_VK_KHR_shader_float_controls = exp.specVersion;
1330             else if (strcmp(exp.extensionName, "VK_KHR_storage_buffer_storage_class") == 0)
1331                 gpu_info.support_VK_KHR_storage_buffer_storage_class = exp.specVersion;
1332             else if (strcmp(exp.extensionName, "VK_KHR_swapchain") == 0)
1333                 gpu_info.support_VK_KHR_swapchain = exp.specVersion;
1334             else if (strcmp(exp.extensionName, "VK_EXT_descriptor_indexing") == 0)
1335                 gpu_info.support_VK_EXT_descriptor_indexing = exp.specVersion;
1336             else if (strcmp(exp.extensionName, "VK_EXT_memory_budget") == 0)
1337                 gpu_info.support_VK_EXT_memory_budget = exp.specVersion;
1338             else if (strcmp(exp.extensionName, "VK_EXT_queue_family_foreign") == 0)
1339                 gpu_info.support_VK_EXT_queue_family_foreign = exp.specVersion;
1340 #if __ANDROID_API__ >= 26
1341             else if (strcmp(exp.extensionName, "VK_ANDROID_external_memory_android_hardware_buffer") == 0)
1342                 gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = exp.specVersion;
1343 #endif // __ANDROID_API__ >= 26
1344         }
1345 
1346         // check features
1347         gpu_info.support_fp16_packed = true;
1348         gpu_info.support_fp16_storage = false;
1349         gpu_info.support_fp16_arithmetic = false;
1350         gpu_info.support_int8_packed = true;
1351         gpu_info.support_int8_storage = false;
1352         gpu_info.support_int8_arithmetic = false;
1353         gpu_info.support_ycbcr_conversion = false;
1354         if (support_VK_KHR_get_physical_device_properties2)
1355         {
1356             void* queryExtensionFeatures = 0;
1357 
1358             // query int8 storage
1359             VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures;
1360             query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
1361             query8BitStorageFeatures.pNext = 0;
1362             if (gpu_info.support_VK_KHR_8bit_storage)
1363             {
1364                 query8BitStorageFeatures.pNext = queryExtensionFeatures;
1365                 queryExtensionFeatures = &query8BitStorageFeatures;
1366             }
1367 
1368             // query fp16/int16 storage
1369             VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures;
1370             query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
1371             query16BitStorageFeatures.pNext = 0;
1372             if (gpu_info.support_VK_KHR_16bit_storage)
1373             {
1374                 query16BitStorageFeatures.pNext = queryExtensionFeatures;
1375                 queryExtensionFeatures = &query16BitStorageFeatures;
1376             }
1377 
1378             // query fp16/int8 arithmetic
1379             VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features;
1380             queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
1381             queryFloat16Int8Features.pNext = 0;
1382             if (gpu_info.support_VK_KHR_shader_float16_int8)
1383             {
1384                 queryFloat16Int8Features.pNext = queryExtensionFeatures;
1385                 queryExtensionFeatures = &queryFloat16Int8Features;
1386             }
1387 
1388             // query ycbcr_conversion
1389             VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
1390             querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR;
1391             querySamplerYcbcrConversionFeatures.pNext = 0;
1392             if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
1393             {
1394                 querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures;
1395                 queryExtensionFeatures = &querySamplerYcbcrConversionFeatures;
1396             }
1397 
1398             VkPhysicalDeviceFeatures2KHR queryFeatures;
1399             queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR;
1400             queryFeatures.pNext = queryExtensionFeatures;
1401 
1402             vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures);
1403 
1404             if (gpu_info.support_VK_KHR_8bit_storage)
1405             {
1406                 gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess;
1407             }
1408             if (gpu_info.support_VK_KHR_16bit_storage && queryFeatures.features.shaderStorageImageExtendedFormats)
1409             {
1410                 // shaderStorageImageExtendedFormats enables r16f format in storage image
1411                 gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess;
1412             }
1413             if (gpu_info.support_VK_KHR_shader_float16_int8)
1414             {
1415                 gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
1416                 gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
1417             }
1418             if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
1419             {
1420                 gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion;
1421             }
1422         }
1423         else
1424         {
1425             //             // TODO
1426             //             VkPhysicalDeviceFeatures features;
1427             //             vkGetPhysicalDeviceFeatures(physicalDevice, &features);
1428         }
1429 
1430         if (physicalDeviceProperties.vendorID == 0x13b5 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 82))
1431         {
1432             // the 16bit_storage implementation of arm mali driver is buggy :[
1433             gpu_info.support_fp16_storage = false;
1434         }
1435 
1436         if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82))
1437         {
1438             // the 16bit_storage implementation of vivante gc1700 driver is buggy :[
1439             gpu_info.support_fp16_storage = false;
1440         }
1441 
1442         if (gpu_info.bug_implicit_fp16_arithmetic)
1443         {
1444             // force capability on as long as the driver accept spirv with fp16 arithmetic :D
1445             gpu_info.support_fp16_arithmetic = true;
1446         }
1447 
1448         NCNN_LOGE("[%u %s]  queueC=%u[%u]  queueG=%u[%u]  queueT=%u[%u]", i, physicalDeviceProperties.deviceName,
1449                   gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
1450                   gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count,
1451                   gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count);
1452 
1453         NCNN_LOGE("[%u %s]  bugsbn1=%d  bugbilz=%d  bugcopc=%d  bugihfa=%d", i, physicalDeviceProperties.deviceName,
1454                   gpu_info.bug_storage_buffer_no_l1, gpu_info.bug_buffer_image_load_zero, gpu_info.bug_corrupted_online_pipeline_cache, gpu_info.bug_implicit_fp16_arithmetic);
1455 
1456         NCNN_LOGE("[%u %s]  fp16-p/s/a=%d/%d/%d  int8-p/s/a=%d/%d/%d", i, physicalDeviceProperties.deviceName,
1457                   gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
1458                   gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);
1459 
1460         NCNN_LOGE("[%u %s]  subgroup=%u  basic=%d  vote=%d  ballot=%d  shuffle=%d", i, physicalDeviceProperties.deviceName,
1461                   gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
1462                   gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);
1463 
1464         gpu_info_index++;
1465     }
1466 
1467     g_gpu_count = gpu_info_index;
1468 
1469     // the default gpu device
1470     g_default_gpu_index = find_default_vulkan_device_index();
1471 
1472     glslang::InitializeProcess();
1473 
1474     return 0;
1475 }
1476 
destroy_gpu_instance()1477 void destroy_gpu_instance()
1478 {
1479     MutexLockGuard lock(g_instance_lock);
1480 
1481     if ((VkInstance)g_instance == 0)
1482         return;
1483 
1484     // NCNN_LOGE("destroy_gpu_instance");
1485 
1486     glslang::FinalizeProcess();
1487 
1488     for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++)
1489     {
1490         delete g_default_vkdev[i];
1491         g_default_vkdev[i] = 0;
1492 
1493         delete g_gpu_infos[i];
1494         g_gpu_infos[i] = 0;
1495     }
1496 
1497 #if ENABLE_VALIDATION_LAYER
1498     if (support_VK_EXT_debug_utils)
1499     {
1500         DestroyDebugUtilsMessengerEXT(g_instance, g_instance.callback, NULL);
1501         g_instance.callback = 0;
1502     }
1503 #endif // ENABLE_VALIDATION_LAYER
1504 
1505     vkDestroyInstance(g_instance, 0);
1506 
1507     g_instance.instance = 0;
1508 }
1509 
is_gpu_instance_ready()1510 static bool is_gpu_instance_ready()
1511 {
1512     MutexLockGuard lock(g_instance_lock);
1513 
1514     return (VkInstance)g_instance != 0;
1515 }
1516 
try_create_gpu_instance()1517 static void try_create_gpu_instance()
1518 {
1519     if (!is_gpu_instance_ready())
1520         create_gpu_instance();
1521 }
1522 
get_gpu_count()1523 int get_gpu_count()
1524 {
1525     try_create_gpu_instance();
1526 
1527     return g_gpu_count;
1528 }
1529 
get_default_gpu_index()1530 int get_default_gpu_index()
1531 {
1532     try_create_gpu_instance();
1533 
1534     return g_default_gpu_index;
1535 }
1536 
get_gpu_info(int device_index)1537 const GpuInfo& get_gpu_info(int device_index)
1538 {
1539     try_create_gpu_instance();
1540 
1541     return *g_gpu_infos[device_index];
1542 }
1543 
1544 class VkDummyAllocator : public VkBlobAllocator
1545 {
1546 public:
1547     // NOTE 16k is large enough I think ...
VkDummyAllocator(const VulkanDevice * _vkdev)1548     VkDummyAllocator(const VulkanDevice* _vkdev)
1549         : VkBlobAllocator(_vkdev, 16 * 1024)
1550     {
1551     }
1552 };
1553 
1554 class VkDummyCompute : public VkCompute
1555 {
1556 public:
VkDummyCompute(const VulkanDevice * _vkdev)1557     VkDummyCompute(const VulkanDevice* _vkdev)
1558         : VkCompute(_vkdev)
1559     {
1560     }
1561 
record_dummy(const VkMat & buffer)1562     void record_dummy(const VkMat& buffer)
1563     {
1564         barrier_readwrite(buffer);
1565     }
1566 
record_dummy(const VkImageMat & image)1567     void record_dummy(const VkImageMat& image)
1568     {
1569         barrier_readwrite(image);
1570     }
1571 
record_dummy_readonly(const VkImageMat & image)1572     void record_dummy_readonly(const VkImageMat& image)
1573     {
1574         barrier_readonly(image);
1575     }
1576 };
1577 
1578 class VulkanDevicePrivate
1579 {
1580 public:
VulkanDevicePrivate(VulkanDevice * _vkdev)1581     VulkanDevicePrivate(VulkanDevice* _vkdev)
1582         : vkdev(_vkdev)
1583     {
1584     }
1585     VulkanDevice* const vkdev;
1586 
1587     // dummy buffer and image
1588     int create_dummy_buffer_image();
1589     void destroy_dummy_buffer_image();
1590 
1591     // utility operator
1592     const ncnn::Packing_vulkan* get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const;
1593     void destroy_utility_operator();
1594 
1595     VkDevice device;
1596 
1597     // hardware queue
1598     mutable std::vector<VkQueue> compute_queues;
1599     mutable std::vector<VkQueue> graphics_queues;
1600     mutable std::vector<VkQueue> transfer_queues;
1601     mutable Mutex queue_lock;
1602 
1603     // default blob allocator for each queue
1604     mutable std::vector<VkAllocator*> blob_allocators;
1605     mutable Mutex blob_allocator_lock;
1606 
1607     // default staging allocator for each queue
1608     mutable std::vector<VkAllocator*> staging_allocators;
1609     mutable Mutex staging_allocator_lock;
1610 
1611     // nearest sampler for texelfetch
1612     VkSampler texelfetch_sampler;
1613 
1614     // dummy buffer and image
1615     VkAllocator* dummy_allocator;
1616     VkMat dummy_buffer;
1617     VkImageMat dummy_image;
1618     VkImageMat dummy_image_readonly;
1619 
1620     // device-wide pipeline cache
1621     PipelineCache* pipeline_cache;
1622 
1623     // utility operator
1624     // from buffer | image
1625     // to buffer | image
1626     // from fp32-b/i | fp16p-b/i | fp16s-b/i
1627     // to fp32-b/i | fp16p-b/i | fp16s-b/i
1628     // to pack1 | pack4 | pack8
1629     mutable ncnn::Packing_vulkan* uop_packing[2][2][3][3][3];
1630     mutable Mutex uop_lock;
1631 };
1632 
create_dummy_buffer_image()1633 int VulkanDevicePrivate::create_dummy_buffer_image()
1634 {
1635     dummy_allocator = new VkDummyAllocator(vkdev);
1636 
1637     dummy_buffer.create(1, 4u, dummy_allocator);
1638     dummy_image.create(1, 4u, dummy_allocator);
1639 #if __APPLE__
1640     if (vkdev->info.vendor_id() != 0x8086)
1641         dummy_image_readonly.create(1, 4u, dummy_allocator);
1642 #else
1643     dummy_image_readonly.create(1, 4u, dummy_allocator);
1644 #endif
1645 
1646     VkDummyCompute cmd(vkdev);
1647 
1648     cmd.record_dummy(dummy_buffer);
1649     cmd.record_dummy(dummy_image);
1650 #if __APPLE__
1651     if (vkdev->info.vendor_id() != 0x8086)
1652         cmd.record_dummy_readonly(dummy_image_readonly);
1653 #else
1654     cmd.record_dummy_readonly(dummy_image_readonly);
1655 #endif
1656 
1657     cmd.submit_and_wait();
1658 
1659     return 0;
1660 }
1661 
destroy_dummy_buffer_image()1662 void VulkanDevicePrivate::destroy_dummy_buffer_image()
1663 {
1664     dummy_buffer.release();
1665     dummy_image.release();
1666 #if __APPLE__
1667     if (vkdev->info.vendor_id() != 0x8086)
1668         dummy_image_readonly.release();
1669 #else
1670     dummy_image_readonly.release();
1671 #endif
1672 
1673     delete dummy_allocator;
1674 }
1675 
get_utility_operator(int storage_type_from,int storage_type_to,int cast_type_from_index,int cast_type_to_index,int packing_type_to_index) const1676 const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
1677 {
1678     MutexLockGuard lock(uop_lock);
1679 
1680     const ncnn::Packing_vulkan* cached_uop = uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index];
1681     if (cached_uop)
1682         return cached_uop;
1683 
1684     if ((cast_type_from_index == 1 && cast_type_to_index == 2) || (cast_type_from_index == 2 && cast_type_to_index == 1))
1685     {
1686         NCNN_LOGE("no fp16p to/from fp16s conversion");
1687         return 0;
1688     }
1689 
1690     // create uop
1691     Option opt;
1692     opt.use_image_storage = (storage_type_from == 1 || storage_type_to == 1);
1693     opt.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
1694     opt.use_fp16_storage = (cast_type_from_index == 2 || cast_type_to_index == 2);
1695 
1696     if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
1697     {
1698         NCNN_LOGE("cannot create uop with use_fp16_packed if not support_fp16_packed");
1699         return 0;
1700     }
1701 
1702     if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
1703     {
1704         NCNN_LOGE("cannot create uop with use_fp16_storage if not support_fp16_storage");
1705         return 0;
1706     }
1707 
1708     // fp16/int8 arithmetic are not necessary for packing
1709     // and may conflict with storage options
1710     opt.use_fp16_arithmetic = false;
1711     opt.use_int8_arithmetic = false;
1712 
1713     // enable pack8 for pack8to1/pack8to4
1714     opt.use_shader_pack8 = true;
1715 
1716     opt.use_vulkan_compute = true;
1717 
1718     // cache uop pipeline as device member explicitly
1719     opt.pipeline_cache = 0;
1720 
1721     ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
1722     uop->vkdev = vkdev;
1723 
1724     ncnn::ParamDict pd;
1725     pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
1726     pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16p 3=fp16s
1727     pd.set(3, cast_type_to_index + 1);
1728     pd.set(4, storage_type_from); // 0=buffer 1=image
1729     pd.set(5, storage_type_to);
1730 
1731     uop->load_param(pd);
1732 
1733     uop->create_pipeline(opt);
1734 
1735     uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
1736 
1737     return uop;
1738 }
1739 
destroy_utility_operator()1740 void VulkanDevicePrivate::destroy_utility_operator()
1741 {
1742     Option opt;
1743     opt.use_vulkan_compute = true;
1744     opt.use_fp16_arithmetic = false;
1745     opt.use_int8_arithmetic = false;
1746     opt.pipeline_cache = 0;
1747 
1748     // from buffer | image
1749     // to buffer | image
1750     for (int i0 = 0; i0 < 2; i0++)
1751     {
1752         for (int i1 = 0; i1 < 2; i1++)
1753         {
1754             opt.use_image_storage = (i0 == 1 || i1 == 1);
1755 
1756             // from fp32-b/i | fp16p-b/i | fp16s-b/i
1757             // to fp32-b/i | fp16p-b/i | fp16s-b/i
1758             for (int j0 = 0; j0 < 3; j0++)
1759             {
1760                 for (int j1 = 0; j1 < 3; j1++)
1761                 {
1762                     if ((j0 == 1 && j1 == 2) || (j0 == 2 && j1 == 1))
1763                     {
1764                         // no fp16p to/from fp16s conversion
1765                         continue;
1766                     }
1767 
1768                     opt.use_fp16_packed = (j0 == 1 || j1 == 1);
1769                     opt.use_fp16_storage = (j0 == 2 || j1 == 2);
1770 
1771                     if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
1772                         continue;
1773 
1774                     if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
1775                         continue;
1776 
1777                     // to pack1 | pack4 | pack8
1778                     for (int k = 0; k < 3; k++)
1779                     {
1780                         // enable pack8 for pack8to1/pack8to4
1781                         opt.use_shader_pack8 = true;
1782 
1783                         ncnn::Layer* uop = uop_packing[i0][i1][j0][j1][k];
1784                         if (!uop)
1785                             continue;
1786 
1787                         uop->destroy_pipeline(opt);
1788 
1789                         delete uop;
1790 
1791                         uop_packing[i0][i1][j0][j1][k] = 0;
1792                     }
1793                 }
1794             }
1795         }
1796     }
1797 }
1798 
VulkanDevice(int device_index)1799 VulkanDevice::VulkanDevice(int device_index)
1800     : info(get_gpu_info(device_index)), d(new VulkanDevicePrivate(this))
1801 {
1802     try_create_gpu_instance();
1803 
1804     std::vector<const char*> enabledExtensions;
1805     if (info.support_VK_KHR_8bit_storage())
1806         enabledExtensions.push_back("VK_KHR_8bit_storage");
1807     if (info.support_VK_KHR_16bit_storage())
1808         enabledExtensions.push_back("VK_KHR_16bit_storage");
1809     if (info.support_VK_KHR_bind_memory2())
1810         enabledExtensions.push_back("VK_KHR_bind_memory2");
1811     if (info.support_VK_KHR_create_renderpass2())
1812         enabledExtensions.push_back("VK_KHR_create_renderpass2");
1813     if (info.support_VK_KHR_dedicated_allocation())
1814         enabledExtensions.push_back("VK_KHR_dedicated_allocation");
1815     if (info.support_VK_KHR_descriptor_update_template())
1816         enabledExtensions.push_back("VK_KHR_descriptor_update_template");
1817     if (info.support_VK_KHR_external_memory())
1818         enabledExtensions.push_back("VK_KHR_external_memory");
1819     if (info.support_VK_KHR_get_memory_requirements2())
1820         enabledExtensions.push_back("VK_KHR_get_memory_requirements2");
1821     if (info.support_VK_KHR_maintenance1())
1822         enabledExtensions.push_back("VK_KHR_maintenance1");
1823     if (info.support_VK_KHR_maintenance2())
1824         enabledExtensions.push_back("VK_KHR_maintenance2");
1825     if (info.support_VK_KHR_maintenance3())
1826         enabledExtensions.push_back("VK_KHR_maintenance3");
1827     if (info.support_VK_KHR_multiview())
1828         enabledExtensions.push_back("VK_KHR_multiview");
1829     if (info.support_VK_KHR_push_descriptor())
1830         enabledExtensions.push_back("VK_KHR_push_descriptor");
1831     if (info.support_VK_KHR_sampler_ycbcr_conversion())
1832         enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion");
1833     if (info.support_VK_KHR_shader_float16_int8())
1834         enabledExtensions.push_back("VK_KHR_shader_float16_int8");
1835     if (info.support_VK_KHR_shader_float_controls())
1836         enabledExtensions.push_back("VK_KHR_shader_float_controls");
1837     if (info.support_VK_KHR_storage_buffer_storage_class())
1838         enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class");
1839     if (info.support_VK_KHR_swapchain())
1840         enabledExtensions.push_back("VK_KHR_swapchain");
1841     if (info.support_VK_EXT_descriptor_indexing())
1842         enabledExtensions.push_back("VK_EXT_descriptor_indexing");
1843     if (info.support_VK_EXT_memory_budget())
1844         enabledExtensions.push_back("VK_EXT_memory_budget");
1845     if (info.support_VK_EXT_queue_family_foreign())
1846         enabledExtensions.push_back("VK_EXT_queue_family_foreign");
1847 #if __ANDROID_API__ >= 26
1848     if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
1849         enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer");
1850 #endif // __ANDROID_API__ >= 26
1851 
1852     void* enabledExtensionFeatures = 0;
1853 
1854     // enable int8 storage
1855     VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures;
1856     enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
1857     enabled8BitStorageFeatures.pNext = 0;
1858     enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage();
1859     enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = VK_FALSE;
1860     enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE;
1861     if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage())
1862     {
1863         enabled8BitStorageFeatures.pNext = enabledExtensionFeatures;
1864         enabledExtensionFeatures = &enabled8BitStorageFeatures;
1865     }
1866 
1867     // enable fp16/int16 storage
1868     VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures;
1869     enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
1870     enabled16BitStorageFeatures.pNext = 0;
1871     enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage();
1872     enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = VK_FALSE;
1873     enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE;
1874     enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE;
1875     if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage())
1876     {
1877         enabled16BitStorageFeatures.pNext = enabledExtensionFeatures;
1878         enabledExtensionFeatures = &enabled16BitStorageFeatures;
1879     }
1880 
1881     // enable fp16/int8 arithmetic
1882     VkPhysicalDeviceFloat16Int8FeaturesKHR enabledFloat16Int8Features;
1883     enabledFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
1884     enabledFloat16Int8Features.pNext = 0;
1885     enabledFloat16Int8Features.shaderFloat16 = info.support_fp16_arithmetic();
1886     enabledFloat16Int8Features.shaderInt8 = info.support_int8_arithmetic();
1887     if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_shader_float16_int8())
1888     {
1889         enabledFloat16Int8Features.pNext = enabledExtensionFeatures;
1890         enabledExtensionFeatures = &enabledFloat16Int8Features;
1891     }
1892 
1893     // enable ycbcr conversion
1894     VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
1895     querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR;
1896     querySamplerYcbcrConversionFeatures.pNext = 0;
1897     querySamplerYcbcrConversionFeatures.samplerYcbcrConversion = info.support_ycbcr_conversion();
1898     if (support_VK_KHR_get_physical_device_properties2 && info.support_ycbcr_conversion())
1899     {
1900         querySamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures;
1901         enabledExtensionFeatures = &querySamplerYcbcrConversionFeatures;
1902     }
1903 
1904     std::vector<float> compute_queue_priorities(info.compute_queue_count(), 1.f);   // 0.f ~ 1.f
1905     std::vector<float> graphics_queue_priorities(info.graphics_queue_count(), 1.f); // 0.f ~ 1.f
1906     std::vector<float> transfer_queue_priorities(info.transfer_queue_count(), 1.f); // 0.f ~ 1.f
1907 
1908     VkDeviceQueueCreateInfo deviceQueueCreateInfos[3];
1909 
1910     VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo;
1911     deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
1912     deviceComputeQueueCreateInfo.pNext = 0;
1913     deviceComputeQueueCreateInfo.flags = 0;
1914     deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index();
1915     deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count();
1916     deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data();
1917 
1918     VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo;
1919     deviceGraphicsQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
1920     deviceGraphicsQueueCreateInfo.pNext = 0;
1921     deviceGraphicsQueueCreateInfo.flags = 0;
1922     deviceGraphicsQueueCreateInfo.queueFamilyIndex = info.graphics_queue_family_index();
1923     deviceGraphicsQueueCreateInfo.queueCount = info.graphics_queue_count();
1924     deviceGraphicsQueueCreateInfo.pQueuePriorities = graphics_queue_priorities.data();
1925 
1926     VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo;
1927     deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
1928     deviceTransferQueueCreateInfo.pNext = 0;
1929     deviceTransferQueueCreateInfo.flags = 0;
1930     deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index();
1931     deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count();
1932     deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data();
1933 
1934     VkDeviceCreateInfo deviceCreateInfo;
1935     deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
1936     deviceCreateInfo.pNext = enabledExtensionFeatures;
1937     deviceCreateInfo.flags = 0;
1938     if (info.compute_queue_family_index() == info.graphics_queue_family_index() && info.compute_queue_family_index() == info.transfer_queue_family_index())
1939     {
1940         deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
1941         deviceCreateInfo.queueCreateInfoCount = 1;
1942     }
1943     else if (info.compute_queue_family_index() == info.graphics_queue_family_index() && info.compute_queue_family_index() != info.transfer_queue_family_index())
1944     {
1945         deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
1946         deviceQueueCreateInfos[1] = deviceTransferQueueCreateInfo;
1947         deviceCreateInfo.queueCreateInfoCount = 2;
1948     }
1949     else if (info.compute_queue_family_index() != info.graphics_queue_family_index() && info.graphics_queue_family_index() == info.transfer_queue_family_index())
1950     {
1951         deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
1952         deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo;
1953         deviceCreateInfo.queueCreateInfoCount = 2;
1954     }
1955     else // if (info.compute_queue_family_index() != info.graphics_queue_family_index() && info.graphics_queue_family_index() != info.transfer_queue_family_index())
1956     {
1957         deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
1958         deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo;
1959         deviceQueueCreateInfos[2] = deviceTransferQueueCreateInfo;
1960         deviceCreateInfo.queueCreateInfoCount = 3;
1961     }
1962     deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos;
1963     deviceCreateInfo.enabledLayerCount = 0;
1964     deviceCreateInfo.ppEnabledLayerNames = 0;
1965     deviceCreateInfo.enabledExtensionCount = enabledExtensions.size();
1966     deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
1967     deviceCreateInfo.pEnabledFeatures = 0; // VkPhysicalDeviceFeatures pointer
1968 
1969     VkResult ret = vkCreateDevice(info.physical_device(), &deviceCreateInfo, 0, &d->device);
1970     if (ret != VK_SUCCESS)
1971     {
1972         NCNN_LOGE("vkCreateDevice failed %d", ret);
1973     }
1974 
1975     init_device_extension();
1976 
1977     d->compute_queues.resize(info.compute_queue_count());
1978     d->blob_allocators.resize(info.compute_queue_count());
1979     d->staging_allocators.resize(info.compute_queue_count());
1980     for (uint32_t i = 0; i < info.compute_queue_count(); i++)
1981     {
1982         vkGetDeviceQueue(d->device, info.compute_queue_family_index(), i, &d->compute_queues[i]);
1983         d->blob_allocators[i] = new VkBlobAllocator(this);
1984         d->staging_allocators[i] = new VkStagingAllocator(this);
1985     }
1986     if (info.compute_queue_family_index() != info.graphics_queue_family_index())
1987     {
1988         d->graphics_queues.resize(info.graphics_queue_count());
1989         for (uint32_t i = 0; i < info.graphics_queue_count(); i++)
1990         {
1991             vkGetDeviceQueue(d->device, info.graphics_queue_family_index(), i, &d->graphics_queues[i]);
1992         }
1993     }
1994     if (info.compute_queue_family_index() != info.transfer_queue_family_index() && info.graphics_queue_family_index() != info.transfer_queue_family_index())
1995     {
1996         d->transfer_queues.resize(info.transfer_queue_count());
1997         for (uint32_t i = 0; i < info.transfer_queue_count(); i++)
1998         {
1999             vkGetDeviceQueue(d->device, info.transfer_queue_family_index(), i, &d->transfer_queues[i]);
2000         }
2001     }
2002 
2003     // prepare immutable texelfetch sampler
2004     {
2005         VkSamplerCreateInfo samplerCreateInfo;
2006         samplerCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
2007         samplerCreateInfo.pNext = 0;
2008         samplerCreateInfo.flags = 0;
2009         samplerCreateInfo.magFilter = VK_FILTER_NEAREST;
2010         samplerCreateInfo.minFilter = VK_FILTER_NEAREST;
2011         samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
2012         samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
2013         samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
2014         samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
2015         samplerCreateInfo.mipLodBias = 0.0f;
2016         samplerCreateInfo.anisotropyEnable = VK_FALSE;
2017         samplerCreateInfo.maxAnisotropy = 1;
2018         samplerCreateInfo.compareEnable = VK_FALSE;
2019         samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER;
2020         samplerCreateInfo.minLod = 0.0f;
2021         samplerCreateInfo.maxLod = 0.0f;
2022         samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
2023         samplerCreateInfo.unnormalizedCoordinates = VK_TRUE;
2024 
2025         d->texelfetch_sampler = 0;
2026         ret = vkCreateSampler(d->device, &samplerCreateInfo, 0, &d->texelfetch_sampler);
2027         if (ret != VK_SUCCESS)
2028         {
2029             NCNN_LOGE("vkCreateSampler failed %d", ret);
2030         }
2031     }
2032 
2033     d->create_dummy_buffer_image();
2034 
2035     d->pipeline_cache = new PipelineCache(this);
2036 
2037     memset(d->uop_packing, 0, sizeof(d->uop_packing));
2038 }
2039 
~VulkanDevice()2040 VulkanDevice::~VulkanDevice()
2041 {
2042     d->destroy_utility_operator();
2043 
2044     d->destroy_dummy_buffer_image();
2045 
2046     if (d->texelfetch_sampler)
2047     {
2048         vkDestroySampler(d->device, d->texelfetch_sampler, 0);
2049     }
2050 
2051     for (size_t i = 0; i < d->blob_allocators.size(); i++)
2052     {
2053         delete d->blob_allocators[i];
2054     }
2055     d->blob_allocators.clear();
2056     for (size_t i = 0; i < d->staging_allocators.size(); i++)
2057     {
2058         delete d->staging_allocators[i];
2059     }
2060     d->staging_allocators.clear();
2061 
2062     delete d->pipeline_cache;
2063 
2064     vkDestroyDevice(d->device, 0);
2065 
2066     delete d;
2067 }
2068 
VulkanDevice(const VulkanDevice &)2069 VulkanDevice::VulkanDevice(const VulkanDevice&)
2070     : info(get_gpu_info(0)), d(0)
2071 {
2072 }
2073 
operator =(const VulkanDevice &)2074 VulkanDevice& VulkanDevice::operator=(const VulkanDevice&)
2075 {
2076     return *this;
2077 }
2078 
vkdevice() const2079 VkDevice VulkanDevice::vkdevice() const
2080 {
2081     return d->device;
2082 }
2083 
compile_shader_module(const uint32_t * spv_data,size_t spv_data_size) const2084 VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
2085 {
2086     VkShaderModuleCreateInfo shaderModuleCreateInfo;
2087     shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
2088     shaderModuleCreateInfo.pNext = 0;
2089     shaderModuleCreateInfo.flags = 0;
2090     shaderModuleCreateInfo.codeSize = spv_data_size;
2091     shaderModuleCreateInfo.pCode = spv_data;
2092 
2093     VkShaderModule shader_module;
2094     VkResult ret = vkCreateShaderModule(d->device, &shaderModuleCreateInfo, 0, &shader_module);
2095     if (ret != VK_SUCCESS)
2096     {
2097         NCNN_LOGE("vkCreateShaderModule failed %d", ret);
2098         return 0;
2099     }
2100 
2101     return shader_module;
2102 }
2103 
inject_local_size_xyz(const uint32_t * code,size_t size,uint32_t local_size_x,uint32_t local_size_y,uint32_t local_size_z,uint32_t * dstcode,size_t * dstsize)2104 static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
2105 {
2106     uint32_t local_size_x_id = -1;
2107     uint32_t local_size_y_id = -1;
2108     uint32_t local_size_z_id = -1;
2109     uint32_t gl_WorkGroupSize_id = -1;
2110 
2111     const uint32_t* p = code;
2112     uint32_t* dp = dstcode;
2113 
2114     // skip magic version generator bound schema
2115     memcpy(dp, p, 5 * sizeof(uint32_t));
2116     p += 5;
2117     dp += 5;
2118 
2119     // foreach op
2120     while ((const unsigned char*)p < (const unsigned char*)code + size)
2121     {
2122         uint32_t opcode = p[0];
2123 
2124         uint16_t wordcount = opcode >> 16;
2125         uint16_t op = opcode & 0xffff;
2126 
2127         if (op == 16) // OpExecutionMode
2128         {
2129             uint32_t mode = p[2];
2130             if (mode == 17) // LocalSize
2131             {
2132                 memcpy(dp, p, wordcount * sizeof(uint32_t));
2133 
2134                 // set local_size_xyz
2135                 dp[3] = local_size_x;
2136                 dp[4] = local_size_y;
2137                 dp[5] = local_size_z;
2138 
2139                 p += wordcount;
2140                 dp += wordcount;
2141                 continue;
2142             }
2143         }
2144         else if (op == 50) // OpSpecConstant
2145         {
2146             uint32_t id = p[2];
2147             if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id)
2148             {
2149                 p += wordcount;
2150                 continue;
2151             }
2152         }
2153         else if (op == 51) // OpSpecConstantComposite
2154         {
2155             uint32_t id = p[2];
2156             if (id == gl_WorkGroupSize_id)
2157             {
2158                 if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id))
2159                 {
2160                     p += wordcount;
2161                     continue;
2162                 }
2163             }
2164         }
2165         else if (op == 71) // OpDecorate
2166         {
2167             uint32_t id = p[1];
2168             uint32_t decoration = p[2];
2169             if (decoration == 1) // SpecId
2170             {
2171                 uint32_t specid = p[3];
2172                 if (specid == 233) local_size_x_id = id;
2173                 if (specid == 234) local_size_y_id = id;
2174                 if (specid == 235) local_size_z_id = id;
2175                 if (specid == 233 || specid == 234 || specid == 235)
2176                 {
2177                     p += wordcount;
2178                     continue;
2179                 }
2180             }
2181             else if (decoration == 11) // BuiltIn
2182             {
2183                 uint32_t builtin = p[3];
2184                 if (builtin == 25) // WorkgroupSize
2185                 {
2186                     gl_WorkGroupSize_id = id;
2187                     p += wordcount;
2188                     continue;
2189                 }
2190             }
2191         }
2192 
2193         memcpy(dp, p, wordcount * sizeof(uint32_t));
2194         p += wordcount;
2195         dp += wordcount;
2196     }
2197 
2198     *dstsize = (unsigned char*)dp - (unsigned char*)dstcode;
2199 }
2200 
compile_shader_module(const uint32_t * spv_data,size_t spv_data_size,uint32_t local_size_x,uint32_t local_size_y,uint32_t local_size_z) const2201 VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
2202 {
2203     uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
2204     size_t spv_data_size_modified = spv_data_size;
2205     inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);
2206 
2207     VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);
2208 
2209     free(spv_data_modified);
2210 
2211     return shader_module;
2212 }
2213 
create_descriptorset_layout(int binding_count,const int * binding_types,VkDescriptorSetLayout * descriptorset_layout) const2214 int VulkanDevice::create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const
2215 {
2216     if (binding_count == 0)
2217     {
2218         *descriptorset_layout = 0;
2219         return 0;
2220     }
2221 
2222     std::vector<VkDescriptorSetLayoutBinding> descriptorSetLayoutBindings(binding_count);
2223     for (int i = 0; i < binding_count; i++)
2224     {
2225         int binding_type = binding_types[i];
2226 
2227         descriptorSetLayoutBindings[i].binding = i;
2228         descriptorSetLayoutBindings[i].descriptorCount = 1;
2229         descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
2230 
2231         if (binding_type == 1)
2232         {
2233             descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
2234             descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
2235         }
2236         else if (binding_type == 2)
2237         {
2238             descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
2239             descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
2240         }
2241         else // if (binding_type == 3)
2242         {
2243             descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
2244             descriptorSetLayoutBindings[i].pImmutableSamplers = immutable_texelfetch_sampler(); // we always use texelfetch
2245         }
2246     }
2247 
2248     VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo;
2249     descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
2250     descriptorSetLayoutCreateInfo.pNext = 0;
2251     descriptorSetLayoutCreateInfo.flags = 0;
2252     descriptorSetLayoutCreateInfo.bindingCount = binding_count;
2253     descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings.data();
2254 
2255     if (info.support_VK_KHR_push_descriptor())
2256     {
2257         descriptorSetLayoutCreateInfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
2258     }
2259 
2260     VkResult ret = vkCreateDescriptorSetLayout(d->device, &descriptorSetLayoutCreateInfo, 0, descriptorset_layout);
2261     if (ret != VK_SUCCESS)
2262     {
2263         NCNN_LOGE("vkCreateDescriptorSetLayout failed %d", ret);
2264         return -1;
2265     }
2266 
2267     return 0;
2268 }
2269 
create_pipeline_layout(int push_constant_count,VkDescriptorSetLayout descriptorset_layout,VkPipelineLayout * pipeline_layout) const2270 int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const
2271 {
2272     VkPushConstantRange pushConstantRange;
2273     pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
2274     pushConstantRange.offset = 0;
2275     pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count;
2276 
2277     VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo;
2278     pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
2279     pipelineLayoutCreateInfo.pNext = 0;
2280     pipelineLayoutCreateInfo.flags = 0;
2281 
2282     if (descriptorset_layout)
2283     {
2284         pipelineLayoutCreateInfo.setLayoutCount = 1;
2285         pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout;
2286     }
2287     else
2288     {
2289         pipelineLayoutCreateInfo.setLayoutCount = 0;
2290         pipelineLayoutCreateInfo.pSetLayouts = 0;
2291     }
2292 
2293     if (push_constant_count > 0)
2294     {
2295         pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
2296         pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
2297     }
2298     else
2299     {
2300         pipelineLayoutCreateInfo.pushConstantRangeCount = 0;
2301         pipelineLayoutCreateInfo.pPushConstantRanges = 0;
2302     }
2303 
2304     VkResult ret = vkCreatePipelineLayout(d->device, &pipelineLayoutCreateInfo, 0, pipeline_layout);
2305     if (ret != VK_SUCCESS)
2306     {
2307         NCNN_LOGE("vkCreatePipelineLayout failed %d", ret);
2308         return -1;
2309     }
2310 
2311     return 0;
2312 }
2313 
create_pipeline(VkShaderModule shader_module,VkPipelineLayout pipeline_layout,const std::vector<vk_specialization_type> & specializations,VkPipeline * pipeline) const2314 int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const
2315 {
2316     const int specialization_count = specializations.size();
2317 
2318     std::vector<VkSpecializationMapEntry> specializationMapEntries(specialization_count);
2319     for (int i = 0; i < specialization_count; i++)
2320     {
2321         specializationMapEntries[i].constantID = i;
2322         specializationMapEntries[i].offset = i * sizeof(vk_specialization_type);
2323         specializationMapEntries[i].size = sizeof(vk_specialization_type);
2324     }
2325 
2326     VkSpecializationInfo specializationInfo;
2327     specializationInfo.mapEntryCount = specializationMapEntries.size();
2328     specializationInfo.pMapEntries = specializationMapEntries.data();
2329     specializationInfo.dataSize = specializations.size() * sizeof(vk_specialization_type);
2330     specializationInfo.pData = specializations.data();
2331 
2332     VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo;
2333     pipelineShaderStageCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
2334     pipelineShaderStageCreateInfo.pNext = 0;
2335     pipelineShaderStageCreateInfo.flags = 0;
2336     pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
2337     pipelineShaderStageCreateInfo.module = shader_module;
2338     pipelineShaderStageCreateInfo.pName = "main";
2339     pipelineShaderStageCreateInfo.pSpecializationInfo = &specializationInfo;
2340 
2341     VkComputePipelineCreateInfo computePipelineCreateInfo;
2342     computePipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
2343     computePipelineCreateInfo.pNext = 0;
2344     computePipelineCreateInfo.flags = 0;
2345     computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
2346     computePipelineCreateInfo.layout = pipeline_layout;
2347     computePipelineCreateInfo.basePipelineHandle = 0;
2348     computePipelineCreateInfo.basePipelineIndex = 0;
2349 
2350     VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline);
2351     if (ret != VK_SUCCESS)
2352     {
2353         NCNN_LOGE("vkCreateComputePipelines failed %d", ret);
2354         return -1;
2355     }
2356 
2357     return 0;
2358 }
2359 
create_descriptor_update_template(int binding_count,const int * binding_types,VkDescriptorSetLayout descriptorset_layout,VkPipelineLayout pipeline_layout,VkDescriptorUpdateTemplateKHR * descriptor_update_template) const2360 int VulkanDevice::create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const
2361 {
2362     if (binding_count == 0)
2363     {
2364         *descriptor_update_template = 0;
2365         return 0;
2366     }
2367 
2368     std::vector<VkDescriptorUpdateTemplateEntryKHR> descriptorUpdateTemplateEntries(binding_count);
2369     size_t offset = 0;
2370     for (int i = 0; i < binding_count; i++) // TODO do not update weights
2371     {
2372         int binding_type = binding_types[i];
2373 
2374         descriptorUpdateTemplateEntries[i].dstBinding = i;
2375         descriptorUpdateTemplateEntries[i].dstArrayElement = 0;
2376         descriptorUpdateTemplateEntries[i].descriptorCount = 1;
2377         descriptorUpdateTemplateEntries[i].offset = offset;
2378 
2379         if (binding_type == 1)
2380         {
2381             descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
2382             descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorBufferInfo);
2383         }
2384         else if (binding_type == 2)
2385         {
2386             descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
2387             descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
2388         }
2389         else // if (binding_type == 3)
2390         {
2391             descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
2392             descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
2393         }
2394 
2395         offset += descriptorUpdateTemplateEntries[i].stride;
2396     }
2397 
2398     VkDescriptorUpdateTemplateCreateInfoKHR descriptorUpdateTemplateCreateInfo;
2399     descriptorUpdateTemplateCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR;
2400     descriptorUpdateTemplateCreateInfo.pNext = 0;
2401     descriptorUpdateTemplateCreateInfo.flags = 0;
2402     descriptorUpdateTemplateCreateInfo.descriptorUpdateEntryCount = binding_count; // TODO do not update weights
2403     descriptorUpdateTemplateCreateInfo.pDescriptorUpdateEntries = descriptorUpdateTemplateEntries.data();
2404     if (info.support_VK_KHR_push_descriptor())
2405     {
2406         descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR;
2407     }
2408     else
2409     {
2410         descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR;
2411     }
2412     // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR
2413     // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL  :(
2414     descriptorUpdateTemplateCreateInfo.descriptorSetLayout = descriptorset_layout;
2415     descriptorUpdateTemplateCreateInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
2416     descriptorUpdateTemplateCreateInfo.pipelineLayout = pipeline_layout;
2417     descriptorUpdateTemplateCreateInfo.set = 0;
2418 
2419     VkResult ret = vkCreateDescriptorUpdateTemplateKHR(d->device, &descriptorUpdateTemplateCreateInfo, 0, descriptor_update_template);
2420     if (ret != VK_SUCCESS)
2421     {
2422         NCNN_LOGE("vkCreateDescriptorUpdateTemplateKHR failed %d", ret);
2423         return -1;
2424     }
2425 
2426     return 0;
2427 }
2428 
find_memory_index(uint32_t memory_type_bits,VkFlags required,VkFlags preferred,VkFlags preferred_not) const2429 uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
2430 {
2431     const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();
2432 
2433     // first try, find required and with preferred and without preferred_not
2434     for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2435     {
2436         bool is_required = (1 << i) & memory_type_bits;
2437         if (is_required)
2438         {
2439             const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2440             if ((memoryType.propertyFlags & required) == required
2441                     && (preferred && (memoryType.propertyFlags & preferred))
2442                     && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
2443             {
2444                 return i;
2445             }
2446         }
2447     }
2448 
2449     // second try, find required and with preferred
2450     for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2451     {
2452         bool is_required = (1 << i) & memory_type_bits;
2453         if (is_required)
2454         {
2455             const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2456             if ((memoryType.propertyFlags & required) == required
2457                     && (preferred && (memoryType.propertyFlags & preferred)))
2458             {
2459                 return i;
2460             }
2461         }
2462     }
2463 
2464     // third try, find required and without preferred_not
2465     for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2466     {
2467         bool is_required = (1 << i) & memory_type_bits;
2468         if (is_required)
2469         {
2470             const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2471             if ((memoryType.propertyFlags & required) == required
2472                     && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
2473             {
2474                 return i;
2475             }
2476         }
2477     }
2478 
2479     // fourth try, find any required
2480     for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2481     {
2482         bool is_required = (1 << i) & memory_type_bits;
2483         if (is_required)
2484         {
2485             const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2486             if ((memoryType.propertyFlags & required) == required)
2487             {
2488                 return i;
2489             }
2490         }
2491     }
2492 
2493     NCNN_LOGE("no such memory type %u %u %u %u", memory_type_bits, required, preferred, preferred_not);
2494     return -1;
2495 }
2496 
is_mappable(uint32_t memory_type_index) const2497 bool VulkanDevice::is_mappable(uint32_t memory_type_index) const
2498 {
2499     const VkMemoryType& memoryType = info.physical_device_memory_properties().memoryTypes[memory_type_index];
2500 
2501     return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2502 }
2503 
is_coherent(uint32_t memory_type_index) const2504 bool VulkanDevice::is_coherent(uint32_t memory_type_index) const
2505 {
2506     const VkMemoryType& memoryType = info.physical_device_memory_properties().memoryTypes[memory_type_index];
2507 
2508     return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2509 }
2510 
acquire_queue(uint32_t queue_family_index) const2511 VkQueue VulkanDevice::acquire_queue(uint32_t queue_family_index) const
2512 {
2513     if (queue_family_index != info.compute_queue_family_index()
2514             && queue_family_index != info.graphics_queue_family_index()
2515             && queue_family_index != info.transfer_queue_family_index())
2516     {
2517         NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
2518         return 0;
2519     }
2520 
2521     MutexLockGuard lock(d->queue_lock);
2522 
2523     std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues
2524                                    : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queues
2525                                    : d->transfer_queues;
2526     for (int i = 0; i < (int)queues.size(); i++)
2527     {
2528         VkQueue queue = queues[i];
2529         if (queue)
2530         {
2531             queues[i] = 0;
2532             return queue;
2533         }
2534     }
2535 
2536     NCNN_LOGE("out of hardware queue %u", queue_family_index);
2537     return 0;
2538 }
2539 
reclaim_queue(uint32_t queue_family_index,VkQueue queue) const2540 void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const
2541 {
2542     if (queue_family_index != info.compute_queue_family_index()
2543             && queue_family_index != info.graphics_queue_family_index()
2544             && queue_family_index != info.transfer_queue_family_index())
2545     {
2546         NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
2547         return;
2548     }
2549 
2550     MutexLockGuard lock(d->queue_lock);
2551 
2552     std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues
2553                                    : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queues
2554                                    : d->transfer_queues;
2555     for (int i = 0; i < (int)queues.size(); i++)
2556     {
2557         if (!queues[i])
2558         {
2559             queues[i] = queue;
2560             return;
2561         }
2562     }
2563 
2564     NCNN_LOGE("FATAL ERROR! reclaim_queue get wild queue %u %p", queue_family_index, queue);
2565 }
2566 
acquire_blob_allocator() const2567 VkAllocator* VulkanDevice::acquire_blob_allocator() const
2568 {
2569     MutexLockGuard lock(d->blob_allocator_lock);
2570 
2571     for (int i = 0; i < (int)d->blob_allocators.size(); i++)
2572     {
2573         VkAllocator* allocator = d->blob_allocators[i];
2574         if (allocator)
2575         {
2576             d->blob_allocators[i] = 0;
2577             return allocator;
2578         }
2579     }
2580 
2581     // pre-allocated allcator exhausted, create new
2582     VkAllocator* allocator = new VkBlobAllocator(this);
2583     d->blob_allocators.push_back(allocator);
2584     d->blob_allocators[d->blob_allocators.size() - 1] = 0;
2585     return allocator;
2586 }
2587 
reclaim_blob_allocator(VkAllocator * allocator) const2588 void VulkanDevice::reclaim_blob_allocator(VkAllocator* allocator) const
2589 {
2590     MutexLockGuard lock(d->blob_allocator_lock);
2591 
2592     for (int i = 0; i < (int)d->blob_allocators.size(); i++)
2593     {
2594         if (!d->blob_allocators[i])
2595         {
2596             d->blob_allocators[i] = allocator;
2597             return;
2598         }
2599     }
2600 
2601     NCNN_LOGE("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator);
2602 }
2603 
acquire_staging_allocator() const2604 VkAllocator* VulkanDevice::acquire_staging_allocator() const
2605 {
2606     MutexLockGuard lock(d->staging_allocator_lock);
2607 
2608     for (int i = 0; i < (int)d->staging_allocators.size(); i++)
2609     {
2610         VkAllocator* allocator = d->staging_allocators[i];
2611         if (allocator)
2612         {
2613             d->staging_allocators[i] = 0;
2614             return allocator;
2615         }
2616     }
2617 
2618     // pre-allocated allcator exhausted, create new
2619     VkAllocator* allocator = new VkStagingAllocator(this);
2620     d->staging_allocators.push_back(allocator);
2621     d->staging_allocators[d->staging_allocators.size() - 1] = 0;
2622     return allocator;
2623 }
2624 
reclaim_staging_allocator(VkAllocator * allocator) const2625 void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const
2626 {
2627     MutexLockGuard lock(d->staging_allocator_lock);
2628 
2629     for (int i = 0; i < (int)d->staging_allocators.size(); i++)
2630     {
2631         if (!d->staging_allocators[i])
2632         {
2633             d->staging_allocators[i] = allocator;
2634             return;
2635         }
2636     }
2637 
2638     NCNN_LOGE("FATAL ERROR! reclaim_staging_allocator get wild allocator %p", allocator);
2639 }
2640 
immutable_texelfetch_sampler() const2641 const VkSampler* VulkanDevice::immutable_texelfetch_sampler() const
2642 {
2643     return &d->texelfetch_sampler;
2644 }
2645 
get_dummy_buffer() const2646 VkMat VulkanDevice::get_dummy_buffer() const
2647 {
2648     return d->dummy_buffer;
2649 }
2650 
get_dummy_image() const2651 VkImageMat VulkanDevice::get_dummy_image() const
2652 {
2653     return d->dummy_image;
2654 }
2655 
get_dummy_image_readonly() const2656 VkImageMat VulkanDevice::get_dummy_image_readonly() const
2657 {
2658 #if __APPLE__
2659     if (info.vendor_id() == 0x8086)
2660         return d->dummy_image;
2661 #endif
2662     return d->dummy_image_readonly;
2663 }
2664 
get_pipeline_cache() const2665 const PipelineCache* VulkanDevice::get_pipeline_cache() const
2666 {
2667     return d->pipeline_cache;
2668 }
2669 
shape_support_image_storage(const Mat & shape) const2670 bool VulkanDevice::shape_support_image_storage(const Mat& shape) const
2671 {
2672     int dims = shape.dims;
2673     int width = shape.w;
2674     int height = shape.h;
2675     int depth = shape.c;
2676     int elempack = shape.elempack;
2677 
2678     // large elempack spills on image w
2679     if (elempack == 8) width *= 2;
2680     if (elempack == 16) width *= 4;
2681     if (elempack == 32) width *= 8;
2682     if (elempack == 64) width *= 16;
2683 
2684     if (dims == 1)
2685     {
2686         if (width > (int)info.max_image_dimension_1d())
2687         {
2688             return false;
2689         }
2690     }
2691     else if (dims == 2)
2692     {
2693         if (width > (int)info.max_image_dimension_2d() || height > (int)info.max_image_dimension_2d())
2694         {
2695             return false;
2696         }
2697     }
2698     else // if (dims == 3)
2699     {
2700         if (width > (int)info.max_image_dimension_3d() || height > (int)info.max_image_dimension_3d() || depth > (int)info.max_image_dimension_3d())
2701         {
2702             return false;
2703         }
2704     }
2705 
2706     return true;
2707 }
2708 
get_heap_budget() const2709 uint32_t VulkanDevice::get_heap_budget() const
2710 {
2711     const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();
2712 
2713     // the first device local heap
2714     uint32_t device_local_heap_index = 0;
2715     uint32_t device_local_heap_size = 0;
2716     for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2717     {
2718         const VkMemoryHeap& memoryHeap = memory_properties.memoryHeaps[i];
2719         if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
2720         {
2721             device_local_heap_index = i;
2722             device_local_heap_size = memoryHeap.size / 1024 / 1024;
2723             break;
2724         }
2725     }
2726 
2727     if (!info.support_VK_EXT_memory_budget())
2728     {
2729         //         NCNN_LOGE("heap budget from assumption\n");
2730 
2731         // we usually cannot use all heap
2732         // 70% for 4G+
2733         // 50% for 4G-
2734         return device_local_heap_size >= 4000 ? device_local_heap_size * 0.7 : device_local_heap_size * 0.5;
2735     }
2736 
2737     VkPhysicalDeviceMemoryBudgetPropertiesEXT memoryBudgetProperties;
2738     memoryBudgetProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT;
2739     memoryBudgetProperties.pNext = 0;
2740 
2741     VkPhysicalDeviceMemoryProperties2KHR memoryProperties;
2742     memoryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR;
2743     memoryProperties.pNext = &memoryBudgetProperties;
2744 
2745     vkGetPhysicalDeviceMemoryProperties2KHR(info.physical_device(), &memoryProperties);
2746 
2747     return memoryBudgetProperties.heapBudget[device_local_heap_index] / 1024 / 1024;
2748 }
2749 
convert_packing(const VkMat & src,VkMat & dst,int dst_elempack,VkCompute & cmd,const Option & _opt) const2750 void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const
2751 {
2752     // buffer2buffer uop is created with use_image_storage disabled
2753     Option opt = _opt;
2754     opt.use_image_storage = false;
2755 
2756     int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2757     int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2758 
2759     int cast_type_from_index;
2760     if (src.elembits() == 32)
2761     {
2762         cast_type_from_index = 0;
2763     }
2764     else // if (src.elembits() == 16)
2765     {
2766         if (cast_type_to_index != 0)
2767         {
2768             cast_type_from_index = cast_type_to_index;
2769         }
2770         else if (info.support_fp16_storage())
2771         {
2772             cast_type_from_index = 2;
2773         }
2774         else // if (info.support_fp16_packed())
2775         {
2776             cast_type_from_index = 1;
2777         }
2778     }
2779 
2780     // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2781 
2782     const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2783     uop->forward(src, dst, cmd, opt);
2784 }
2785 
convert_packing(const VkImageMat & src,VkImageMat & dst,int dst_elempack,VkCompute & cmd,const Option & opt) const2786 void VulkanDevice::convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
2787 {
2788     int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2789     int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2790 
2791     int cast_type_from_index;
2792     if (src.elembits() == 32)
2793     {
2794         cast_type_from_index = 0;
2795     }
2796     else // if (src.elembits() == 16)
2797     {
2798         if (cast_type_to_index != 0)
2799         {
2800             cast_type_from_index = cast_type_to_index;
2801         }
2802         else if (info.support_fp16_storage())
2803         {
2804             cast_type_from_index = 2;
2805         }
2806         else // if (info.support_fp16_packed())
2807         {
2808             cast_type_from_index = 1;
2809         }
2810     }
2811 
2812     // NCNN_LOGE("convert_packing i2i %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2813 
2814     const ncnn::Packing_vulkan* uop = d->get_utility_operator(1, 1, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2815     uop->forward(src, dst, cmd, opt);
2816 }
2817 
convert_packing(const VkMat & src,VkImageMat & dst,int dst_elempack,VkCompute & cmd,const Option & opt) const2818 void VulkanDevice::convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
2819 {
2820     int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2821     int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2822 
2823     int cast_type_from_index;
2824     if (src.elembits() == 32)
2825     {
2826         cast_type_from_index = 0;
2827     }
2828     else // if (src.elembits() == 16)
2829     {
2830         if (cast_type_to_index != 0)
2831         {
2832             cast_type_from_index = cast_type_to_index;
2833         }
2834         else if (info.support_fp16_storage())
2835         {
2836             cast_type_from_index = 2;
2837         }
2838         else // if (info.support_fp16_packed())
2839         {
2840             cast_type_from_index = 1;
2841         }
2842     }
2843 
2844     // NCNN_LOGE("convert_packing b2i %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2845 
2846     const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 1, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2847     uop->forward(src, dst, cmd, opt);
2848 }
2849 
convert_packing(const VkImageMat & src,VkMat & dst,int dst_elempack,VkCompute & cmd,const Option & opt) const2850 void VulkanDevice::convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
2851 {
2852     int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2853     int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2854 
2855     int cast_type_from_index;
2856     if (src.elembits() == 32)
2857     {
2858         cast_type_from_index = 0;
2859     }
2860     else // if (src.elembits() == 16)
2861     {
2862         if (cast_type_to_index != 0)
2863         {
2864             cast_type_from_index = cast_type_to_index;
2865         }
2866         else if (info.support_fp16_storage())
2867         {
2868             cast_type_from_index = 2;
2869         }
2870         else // if (info.support_fp16_packed())
2871         {
2872             cast_type_from_index = 1;
2873         }
2874     }
2875 
2876     // NCNN_LOGE("convert_packing i2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2877 
2878     const ncnn::Packing_vulkan* uop = d->get_utility_operator(1, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2879     uop->forward(src, dst, cmd, opt);
2880 }
2881 
init_device_extension()2882 int VulkanDevice::init_device_extension()
2883 {
2884     if (info.support_VK_KHR_bind_memory2())
2885     {
2886         vkBindBufferMemory2KHR = (PFN_vkBindBufferMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindBufferMemory2KHR");
2887         vkBindImageMemory2KHR = (PFN_vkBindImageMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindImageMemory2KHR");
2888     }
2889 
2890     if (info.support_VK_KHR_create_renderpass2())
2891     {
2892         vkCmdBeginRenderPass2KHR = (PFN_vkCmdBeginRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdBeginRenderPass2KHR");
2893         vkCmdEndRenderPass2KHR = (PFN_vkCmdEndRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdEndRenderPass2KHR");
2894         vkCmdNextSubpass2KHR = (PFN_vkCmdNextSubpass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdNextSubpass2KHR");
2895         vkCreateRenderPass2KHR = (PFN_vkCreateRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCreateRenderPass2KHR");
2896     }
2897 
2898     if (info.support_VK_KHR_descriptor_update_template())
2899     {
2900         vkCreateDescriptorUpdateTemplateKHR = (PFN_vkCreateDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCreateDescriptorUpdateTemplateKHR");
2901         vkDestroyDescriptorUpdateTemplateKHR = (PFN_vkDestroyDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkDestroyDescriptorUpdateTemplateKHR");
2902         vkUpdateDescriptorSetWithTemplateKHR = (PFN_vkUpdateDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkUpdateDescriptorSetWithTemplateKHR");
2903     }
2904 
2905     if (info.support_VK_KHR_get_memory_requirements2())
2906     {
2907         vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageMemoryRequirements2KHR");
2908         vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetBufferMemoryRequirements2KHR");
2909         vkGetImageSparseMemoryRequirements2KHR = (PFN_vkGetImageSparseMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageSparseMemoryRequirements2KHR");
2910     }
2911 
2912     if (info.support_VK_KHR_maintenance1())
2913     {
2914         vkTrimCommandPoolKHR = (PFN_vkTrimCommandPoolKHR)vkGetDeviceProcAddr(d->device, "vkTrimCommandPoolKHR");
2915     }
2916 
2917     if (info.support_VK_KHR_maintenance3())
2918     {
2919         vkGetDescriptorSetLayoutSupportKHR = (PFN_vkGetDescriptorSetLayoutSupportKHR)vkGetDeviceProcAddr(d->device, "vkGetDescriptorSetLayoutSupportKHR");
2920     }
2921 
2922     if (info.support_VK_KHR_push_descriptor())
2923     {
2924         if (info.support_VK_KHR_descriptor_update_template())
2925         {
2926             vkCmdPushDescriptorSetWithTemplateKHR = (PFN_vkCmdPushDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetWithTemplateKHR");
2927         }
2928 
2929         vkCmdPushDescriptorSetKHR = (PFN_vkCmdPushDescriptorSetKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetKHR");
2930     }
2931 
2932     if (info.support_VK_KHR_sampler_ycbcr_conversion())
2933     {
2934         vkCreateSamplerYcbcrConversionKHR = (PFN_vkCreateSamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkCreateSamplerYcbcrConversionKHR");
2935         vkDestroySamplerYcbcrConversionKHR = (PFN_vkDestroySamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkDestroySamplerYcbcrConversionKHR");
2936     }
2937 
2938     if (info.support_VK_KHR_swapchain())
2939     {
2940         vkCreateSwapchainKHR = (PFN_vkCreateSwapchainKHR)vkGetDeviceProcAddr(d->device, "vkCreateSwapchainKHR");
2941         vkDestroySwapchainKHR = (PFN_vkDestroySwapchainKHR)vkGetDeviceProcAddr(d->device, "vkDestroySwapchainKHR");
2942         vkGetSwapchainImagesKHR = (PFN_vkGetSwapchainImagesKHR)vkGetDeviceProcAddr(d->device, "vkGetSwapchainImagesKHR");
2943         vkAcquireNextImageKHR = (PFN_vkAcquireNextImageKHR)vkGetDeviceProcAddr(d->device, "vkAcquireNextImageKHR");
2944         vkQueuePresentKHR = (PFN_vkQueuePresentKHR)vkGetDeviceProcAddr(d->device, "vkQueuePresentKHR");
2945     }
2946 
2947 #if __ANDROID_API__ >= 26
2948     if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
2949     {
2950         vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(d->device, "vkGetAndroidHardwareBufferPropertiesANDROID");
2951         vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(d->device, "vkGetMemoryAndroidHardwareBufferANDROID");
2952     }
2953 #endif // __ANDROID_API__ >= 26
2954 
2955     return 0;
2956 }
2957 
get_gpu_device(int device_index)2958 VulkanDevice* get_gpu_device(int device_index)
2959 {
2960     try_create_gpu_instance();
2961 
2962     if (device_index < 0 || device_index >= g_gpu_count)
2963         return 0;
2964 
2965     MutexLockGuard lock(g_default_vkdev_lock);
2966 
2967     if (!g_default_vkdev[device_index])
2968         g_default_vkdev[device_index] = new VulkanDevice(device_index);
2969 
2970     return g_default_vkdev[device_index];
2971 }
2972 
get_default_TBuiltInResource()2973 static TBuiltInResource get_default_TBuiltInResource()
2974 {
2975     TBuiltInResource resource;
2976 
2977     resource.maxLights = 32;
2978     resource.maxClipPlanes = 6;
2979     resource.maxTextureUnits = 32;
2980     resource.maxTextureCoords = 32;
2981     resource.maxVertexAttribs = 64;
2982     resource.maxVertexUniformComponents = 4096;
2983     resource.maxVaryingFloats = 64;
2984     resource.maxVertexTextureImageUnits = 32;
2985     resource.maxCombinedTextureImageUnits = 80;
2986     resource.maxTextureImageUnits = 32;
2987     resource.maxFragmentUniformComponents = 4096;
2988     resource.maxDrawBuffers = 32;
2989     resource.maxVertexUniformVectors = 128;
2990     resource.maxVaryingVectors = 8;
2991     resource.maxFragmentUniformVectors = 16;
2992     resource.maxVertexOutputVectors = 16;
2993     resource.maxFragmentInputVectors = 15;
2994     resource.minProgramTexelOffset = -8;
2995     resource.maxProgramTexelOffset = 7;
2996     resource.maxClipDistances = 8;
2997     resource.maxComputeWorkGroupCountX = 65535;
2998     resource.maxComputeWorkGroupCountY = 65535;
2999     resource.maxComputeWorkGroupCountZ = 65535;
3000     resource.maxComputeWorkGroupSizeX = 1024;
3001     resource.maxComputeWorkGroupSizeY = 1024;
3002     resource.maxComputeWorkGroupSizeZ = 64;
3003     resource.maxComputeUniformComponents = 1024;
3004     resource.maxComputeTextureImageUnits = 16;
3005     resource.maxComputeImageUniforms = 8;
3006     resource.maxComputeAtomicCounters = 8;
3007     resource.maxComputeAtomicCounterBuffers = 1;
3008     resource.maxVaryingComponents = 60;
3009     resource.maxVertexOutputComponents = 64;
3010     resource.maxGeometryInputComponents = 64;
3011     resource.maxGeometryOutputComponents = 128;
3012     resource.maxFragmentInputComponents = 128;
3013     resource.maxImageUnits = 8;
3014     resource.maxCombinedImageUnitsAndFragmentOutputs = 8;
3015     resource.maxCombinedShaderOutputResources = 8;
3016     resource.maxImageSamples = 0;
3017     resource.maxVertexImageUniforms = 0;
3018     resource.maxTessControlImageUniforms = 0;
3019     resource.maxTessEvaluationImageUniforms = 0;
3020     resource.maxGeometryImageUniforms = 0;
3021     resource.maxFragmentImageUniforms = 8;
3022     resource.maxCombinedImageUniforms = 8;
3023     resource.maxGeometryTextureImageUnits = 16;
3024     resource.maxGeometryOutputVertices = 256;
3025     resource.maxGeometryTotalOutputComponents = 1024;
3026     resource.maxGeometryUniformComponents = 1024;
3027     resource.maxGeometryVaryingComponents = 64;
3028     resource.maxTessControlInputComponents = 128;
3029     resource.maxTessControlOutputComponents = 128;
3030     resource.maxTessControlTextureImageUnits = 16;
3031     resource.maxTessControlUniformComponents = 1024;
3032     resource.maxTessControlTotalOutputComponents = 4096;
3033     resource.maxTessEvaluationInputComponents = 128;
3034     resource.maxTessEvaluationOutputComponents = 128;
3035     resource.maxTessEvaluationTextureImageUnits = 16;
3036     resource.maxTessEvaluationUniformComponents = 1024;
3037     resource.maxTessPatchComponents = 120;
3038     resource.maxPatchVertices = 32;
3039     resource.maxTessGenLevel = 64;
3040     resource.maxViewports = 16;
3041     resource.maxVertexAtomicCounters = 0;
3042     resource.maxTessControlAtomicCounters = 0;
3043     resource.maxTessEvaluationAtomicCounters = 0;
3044     resource.maxGeometryAtomicCounters = 0;
3045     resource.maxFragmentAtomicCounters = 8;
3046     resource.maxCombinedAtomicCounters = 8;
3047     resource.maxAtomicCounterBindings = 1;
3048     resource.maxVertexAtomicCounterBuffers = 0;
3049     resource.maxTessControlAtomicCounterBuffers = 0;
3050     resource.maxTessEvaluationAtomicCounterBuffers = 0;
3051     resource.maxGeometryAtomicCounterBuffers = 0;
3052     resource.maxFragmentAtomicCounterBuffers = 1;
3053     resource.maxCombinedAtomicCounterBuffers = 1;
3054     resource.maxAtomicCounterBufferSize = 16384;
3055     resource.maxTransformFeedbackBuffers = 4;
3056     resource.maxTransformFeedbackInterleavedComponents = 64;
3057     resource.maxCullDistances = 8;
3058     resource.maxCombinedClipAndCullDistances = 8;
3059     resource.maxSamples = 4;
3060     resource.maxMeshOutputVerticesNV = 256;
3061     resource.maxMeshOutputPrimitivesNV = 512;
3062     resource.maxMeshWorkGroupSizeX_NV = 32;
3063     resource.maxMeshWorkGroupSizeY_NV = 1;
3064     resource.maxMeshWorkGroupSizeZ_NV = 1;
3065     resource.maxTaskWorkGroupSizeX_NV = 32;
3066     resource.maxTaskWorkGroupSizeY_NV = 1;
3067     resource.maxTaskWorkGroupSizeZ_NV = 1;
3068     resource.maxMeshViewCountNV = 4;
3069 
3070     // TODO compile-time glslang version check
3071     // resource.maxDualSourceDrawBuffersEXT = 1;
3072 
3073     resource.limits.nonInductiveForLoops = 1;
3074     resource.limits.whileLoops = 1;
3075     resource.limits.doWhileLoops = 1;
3076     resource.limits.generalUniformIndexing = 1;
3077     resource.limits.generalAttributeMatrixVectorIndexing = 1;
3078     resource.limits.generalVaryingIndexing = 1;
3079     resource.limits.generalSamplerIndexing = 1;
3080     resource.limits.generalVariableIndexing = 1;
3081     resource.limits.generalConstantMatrixVectorIndexing = 1;
3082 
3083     return resource;
3084 }
3085 
compile_spirv_module(const char * comp_string,const Option & opt,std::vector<uint32_t> & spirv)3086 int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv)
3087 {
3088     // -1 for omitting the tail '\0'
3089     int length = strlen(comp_string) - 1;
3090     return compile_spirv_module(comp_string, length, opt, spirv);
3091 }
3092 
compile_spirv_module(const char * comp_data,int comp_data_size,const Option & opt,std::vector<uint32_t> & spirv)3093 int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv)
3094 {
3095     std::vector<std::pair<const char*, const char*> > custom_defines;
3096 
3097     if (opt.use_fp16_storage)
3098     {
3099         custom_defines.push_back(std::make_pair("sfp", "float16_t"));
3100         custom_defines.push_back(std::make_pair("sfpvec2", "f16vec2"));
3101         custom_defines.push_back(std::make_pair("sfpvec4", "f16vec4"));
3102 
3103         if (opt.use_fp16_arithmetic)
3104         {
3105             custom_defines.push_back(std::make_pair("sfpvec8", "f16mat2x4"));
3106             custom_defines.push_back(std::make_pair("sfpmat4", "f16mat4"));
3107         }
3108     }
3109     else if (opt.use_fp16_packed)
3110     {
3111         custom_defines.push_back(std::make_pair("sfp", "float"));
3112         custom_defines.push_back(std::make_pair("sfpvec2", "uint"));
3113         custom_defines.push_back(std::make_pair("sfpvec4", "uvec2"));
3114         custom_defines.push_back(std::make_pair("sfpvec8", "uvec4"));
3115     }
3116     else
3117     {
3118         custom_defines.push_back(std::make_pair("sfp", "float"));
3119         custom_defines.push_back(std::make_pair("sfpvec2", "vec2"));
3120         custom_defines.push_back(std::make_pair("sfpvec4", "vec4"));
3121         custom_defines.push_back(std::make_pair("sfpvec8", "mat2x4"));
3122         custom_defines.push_back(std::make_pair("sfpmat4", "mat4"));
3123     }
3124 
3125     if (opt.use_fp16_arithmetic)
3126     {
3127         custom_defines.push_back(std::make_pair("afp", "float16_t"));
3128         custom_defines.push_back(std::make_pair("afpvec2", "f16vec2"));
3129         custom_defines.push_back(std::make_pair("afpvec4", "f16vec4"));
3130         custom_defines.push_back(std::make_pair("afpvec8", "f16mat2x4"));
3131         custom_defines.push_back(std::make_pair("afpmat4", "f16mat4"));
3132     }
3133     else
3134     {
3135         custom_defines.push_back(std::make_pair("afp", "float"));
3136         custom_defines.push_back(std::make_pair("afpvec2", "vec2"));
3137         custom_defines.push_back(std::make_pair("afpvec4", "vec4"));
3138         custom_defines.push_back(std::make_pair("afpvec8", "mat2x4"));
3139         custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
3140     }
3141 
3142     if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
3143     {
3144         custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
3145         custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
3146         custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3147         custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
3148         custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
3149         custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
3150         custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
3151         custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3152         custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
3153         custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
3154         custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3155         custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
3156         custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
3157         custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
3158         custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
3159         custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3160         custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
3161         custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
3162         custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
3163         custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
3164     }
3165     else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
3166     {
3167         custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float16_t(buf[i])"));
3168         custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float(v);}"));
3169         custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3170         custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"));
3171         custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"));
3172         custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "f16vec2(unpackHalf2x16(buf[i]))"));
3173         custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(vec2(v))}"));
3174         custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3175         custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"));
3176         custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"));
3177         custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3178         custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
3179         custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
3180         custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"));
3181         custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"));
3182         custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3183         custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
3184         custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
3185     }
3186     else if (opt.use_fp16_storage)
3187     {
3188         custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float(buf[i])"));
3189         custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}"));
3190         custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3191         custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"));
3192         custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"));
3193         custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "vec2(buf[i])"));
3194         custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}"));
3195         custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3196         custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(buf[i])"));
3197         custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}"));
3198         custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3199         custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
3200         custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"));
3201         custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"));
3202         custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"));
3203         custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"));
3204         custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"));
3205         custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"));
3206     }
3207     else if (opt.use_fp16_packed)
3208     {
3209         custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
3210         custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
3211         custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3212         custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"));
3213         custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"));
3214         custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])"));
3215         custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v)}"));
3216         custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3217         custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"));
3218         custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"));
3219         custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3220         custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
3221         custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
3222         custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"));
3223         custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"));
3224         custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3225         custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
3226         custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
3227     }
3228     else
3229     {
3230         custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
3231         custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
3232         custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3233         custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
3234         custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
3235         custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
3236         custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
3237         custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3238         custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
3239         custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
3240         custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3241         custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"));
3242         custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
3243         custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
3244         custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
3245         custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3246         custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
3247         custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
3248         custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
3249         custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
3250     }
3251 
3252     if (opt.use_image_storage)
3253     {
3254         if (opt.use_fp16_storage)
3255         {
3256             custom_defines.push_back(std::make_pair("imfmtc1", "r16f"));
3257             custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
3258             custom_defines.push_back(std::make_pair("unfp", "mediump"));
3259         }
3260         else if (opt.use_fp16_packed)
3261         {
3262             custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
3263             custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
3264             custom_defines.push_back(std::make_pair("unfp", "mediump"));
3265         }
3266         else
3267         {
3268             custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
3269             custom_defines.push_back(std::make_pair("imfmtc4", "rgba32f"));
3270             custom_defines.push_back(std::make_pair("unfp", "highp"));
3271         }
3272 
3273         if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
3274         {
3275             custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3276             custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3277             custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3278             custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
3279             custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
3280             custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
3281             custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3282             custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3283             custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3284             custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3285             custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3286             custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3287             custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
3288             custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
3289             custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
3290             custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3291             custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3292             custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3293             custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3294             custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3295             custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3296             custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}"));
3297             custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"));
3298             custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"));
3299             custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3300             custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3301             custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3302         }
3303         else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
3304         {
3305             custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3306             custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3307             custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3308             custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3309             custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3310             custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3311             custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3312             custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3313             custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3314             custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3315             custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3316             custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3317             custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3318             custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3319             custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3320             custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3321             custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3322             custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3323             custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3324             custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3325             custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3326             custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3327             custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3328             custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3329             custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3330             custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3331             custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3332         }
3333         else if (opt.use_fp16_storage)
3334         {
3335             custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3336             custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3337             custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3338             custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3339             custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3340             custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3341             custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3342             custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3343             custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3344             custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3345             custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3346             custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3347             custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3348             custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3349             custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3350             custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3351             custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3352             custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3353             custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3354             custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3355             custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3356             custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3357             custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3358             custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3359             custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3360             custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3361             custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3362         }
3363         else if (opt.use_fp16_packed)
3364         {
3365             custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3366             custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3367             custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3368             custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3369             custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3370             custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3371             custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3372             custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3373             custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3374             custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3375             custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3376             custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3377             custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3378             custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3379             custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3380             custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3381             custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3382             custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3383             custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3384             custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3385             custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3386             custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3387             custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3388             custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3389             custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3390             custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3391             custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3392         }
3393         else
3394         {
3395             custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3396             custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3397             custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3398             custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3399             custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3400             custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3401             custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3402             custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3403             custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3404             custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3405             custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3406             custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3407             custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3408             custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3409             custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3410             custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3411             custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3412             custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3413             custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3414             custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3415             custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3416             custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3417             custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3418             custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3419             custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3420             custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3421             custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3422         }
3423     }
3424 
3425     custom_defines.push_back(std::make_pair("psc(x)", "(x==0?p.x:x)"));
3426 
3427     if (opt.use_fp16_storage)
3428     {
3429         custom_defines.push_back(std::make_pair("NCNN_fp16_storage", "1"));
3430     }
3431     else if (opt.use_fp16_packed)
3432     {
3433         custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
3434     }
3435 
3436     if (opt.use_fp16_arithmetic)
3437     {
3438         custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
3439     }
3440 
3441     if (opt.use_int8_storage)
3442     {
3443         custom_defines.push_back(std::make_pair("NCNN_int8_storage", "1"));
3444     }
3445     else if (opt.use_int8_packed)
3446     {
3447         custom_defines.push_back(std::make_pair("NCNN_int8_packed", "1"));
3448     }
3449 
3450     if (opt.use_int8_arithmetic)
3451     {
3452         custom_defines.push_back(std::make_pair("NCNN_int8_arithmetic", "1"));
3453     }
3454 
3455     if (opt.use_image_storage)
3456     {
3457         custom_defines.push_back(std::make_pair("NCNN_image_shader", "1"));
3458     }
3459 
3460     if (opt.use_subgroup_basic)
3461     {
3462         custom_defines.push_back(std::make_pair("NCNN_subgroup_basic", "1"));
3463 
3464         if (opt.use_subgroup_vote)
3465         {
3466             custom_defines.push_back(std::make_pair("NCNN_subgroup_vote", "1"));
3467         }
3468         if (opt.use_subgroup_ballot)
3469         {
3470             custom_defines.push_back(std::make_pair("NCNN_subgroup_ballot", "1"));
3471         }
3472         if (opt.use_subgroup_shuffle)
3473         {
3474             custom_defines.push_back(std::make_pair("NCNN_subgroup_shuffle", "1"));
3475         }
3476     }
3477 
3478     std::string preamble;
3479     std::vector<std::string> processes;
3480 
3481     processes.resize(custom_defines.size());
3482     for (size_t i = 0; i < custom_defines.size(); i++)
3483     {
3484         const char* key = custom_defines[i].first;
3485         const char* def = custom_defines[i].second;
3486 
3487         preamble += std::string("#define ") + key + " " + def + "\n";
3488         processes[i] = std::string("define-macro ") + key + "=" + def;
3489     }
3490 
3491     bool compile_success = true;
3492 
3493     {
3494         glslang::TShader s(EShLangCompute);
3495 
3496         s.setStringsWithLengths(&comp_data, &comp_data_size, 1);
3497 
3498         s.setPreamble(preamble.c_str());
3499         s.addProcesses(processes);
3500         s.setEntryPoint("main");
3501         s.setSourceEntryPoint("main");
3502 
3503         s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1);
3504 
3505         if (opt.use_subgroup_basic)
3506         {
3507             // subgroup need vulkan-1.1 and spirv-1.3
3508             s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_1);
3509             s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_3);
3510         }
3511         else
3512         {
3513             s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0);
3514             s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_0);
3515         }
3516 
3517         TBuiltInResource resources = get_default_TBuiltInResource();
3518 
3519         bool pr = s.parse(&resources, 100, false, EShMsgDefault);
3520         if (!pr)
3521         {
3522             NCNN_LOGE("compile spir-v module failed");
3523             NCNN_LOGE("%s", s.getInfoLog());
3524             NCNN_LOGE("%s", s.getInfoDebugLog());
3525 
3526             compile_success = false;
3527         }
3528         else
3529         {
3530             glslang::TIntermediate* ir = s.getIntermediate();
3531             glslang::GlslangToSpv(*ir, spirv);
3532         }
3533     }
3534 
3535     return compile_success ? 0 : -1;
3536 }
3537 
compile_spirv_module(int shader_type_index,const Option & opt,std::vector<uint32_t> & spirv)3538 int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv)
3539 {
3540     if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
3541     {
3542         NCNN_LOGE("no such shader module %d", shader_type_index);
3543         return -1;
3544     }
3545 
3546     const char* comp_data = layer_shader_registry[shader_type_index].comp_data;
3547     int comp_data_size = layer_shader_registry[shader_type_index].comp_data_size;
3548 
3549     return compile_spirv_module(comp_data, comp_data_size, opt, spirv);
3550 }
3551 
resolve_shader_info(const uint32_t * spv_data,size_t spv_data_size,ShaderInfo & shader_info)3552 int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info)
3553 {
3554     shader_info.specialization_count = 0;
3555     shader_info.binding_count = 0;
3556     shader_info.push_constant_count = 0;
3557 
3558     uint32_t parameter_id = -233;
3559 
3560     int specialization_count = 0;
3561     int binding_count = 0;
3562     int push_constant_count = 0;
3563 
3564     // id -> binding_type
3565     std::vector<int> id_types;
3566 
3567     // binding_id -> binding_type
3568     std::vector<int> binding_types;
3569 
3570     const uint32_t* p = spv_data;
3571 
3572     int bound = p[3];
3573 
3574     id_types.resize(bound);
3575 
3576     // skip magic version generator bound schema
3577     p += 5;
3578 
3579     // foreach op
3580     while ((const unsigned char*)p < (const unsigned char*)spv_data + spv_data_size)
3581     {
3582         uint32_t opcode = p[0];
3583 
3584         uint16_t wordcount = opcode >> 16;
3585         uint16_t op = opcode & 0xffff;
3586 
3587         if (op == 5) // OpName
3588         {
3589             uint32_t id = p[1];
3590             const char* name = (const char*)&p[2];
3591             if (strcmp(name, "parameter") == 0)
3592             {
3593                 parameter_id = id;
3594             }
3595         }
3596         else if (op == 6) // OpMemberName
3597         {
3598             uint32_t id = p[1];
3599             if (id == parameter_id)
3600             {
3601                 push_constant_count++;
3602             }
3603         }
3604         else if (op == 25) // OpTypeImage
3605         {
3606             uint32_t id = p[1];
3607             id_types[id] = 2;
3608         }
3609         else if (op == 27) // OpTypeSampledImage
3610         {
3611             uint32_t id = p[1];
3612             id_types[id] = 3;
3613         }
3614         else if (op == 32) // OpTypePointer
3615         {
3616             uint32_t id = p[1];
3617             uint32_t storage_class = p[2];
3618             uint32_t type = p[3];
3619             if (storage_class == 0) // UniformConstant
3620             {
3621                 id_types[id] = id_types[type];
3622             }
3623             if (storage_class == 2) // Uniform
3624             {
3625                 id_types[id] = id_types[type];
3626             }
3627             if (storage_class == 12) // StorageBuffer
3628             {
3629                 id_types[type] = 1;
3630                 id_types[id] = id_types[type];
3631             }
3632         }
3633         else if (op == 59) // OpVariable
3634         {
3635             uint32_t id = p[1];
3636             uint32_t var_id = p[2];
3637             uint32_t storage_class = p[3];
3638             if (storage_class == 0) // UniformConstant
3639             {
3640                 id_types[var_id] = id_types[id];
3641             }
3642             if (storage_class == 2) // Uniform
3643             {
3644                 id_types[var_id] = id_types[id];
3645             }
3646             if (storage_class == 12) // StorageBuffer
3647             {
3648                 id_types[var_id] = id_types[id];
3649             }
3650         }
3651         else if (op == 71) // OpDecorate
3652         {
3653             uint32_t id = p[1];
3654             uint32_t decoration = p[2];
3655             uint32_t binding_id = p[3];
3656             if (decoration == 1) // SpecId
3657             {
3658                 specialization_count++;
3659             }
3660             if (decoration == 3) // BufferBlock
3661             {
3662                 id_types[id] = 1;
3663             }
3664             else if (decoration == 33) // Binding
3665             {
3666                 binding_count = std::max(binding_count, (int)binding_id + 1);
3667 
3668                 binding_types.resize(binding_count);
3669                 binding_types[binding_id] = id;
3670             }
3671         }
3672 
3673         p += wordcount;
3674     }
3675 
3676     if (binding_count > 16)
3677     {
3678         NCNN_LOGE("too many binding %d", binding_count);
3679         return -1;
3680     }
3681 
3682     shader_info.specialization_count = specialization_count;
3683     shader_info.binding_count = binding_count;
3684     shader_info.push_constant_count = push_constant_count;
3685 
3686     // resolve binding_types
3687     for (int i = 0; i < binding_count; i++)
3688     {
3689         shader_info.binding_types[i] = id_types[binding_types[i]];
3690     }
3691 
3692     return 0;
3693 }
3694 
3695 } // namespace ncnn
3696 
3697 #endif // NCNN_VULKAN
3698