1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "gpu.h"
16
17 #if NCNN_VULKAN
18
19 #include <math.h>
20 #include <string.h>
21 #include <vulkan/vulkan.h>
22
23 #include "glslang/SPIRV/GlslangToSpv.h"
24 #include "glslang/Public/ShaderLang.h"
25
26 #include "command.h"
27 #include "layer.h"
28 #include "layer/vulkan/packing_vulkan.h"
29 #include "layer_type.h"
30 #include "mat.h"
31 #include "pipelinecache.h"
32
33 // There is known issue that vkDestroyDebugUtilsMessengerEXT crash on exit when vulkan validation layer enabled
34 // upstream fix https://github.com/KhronosGroup/Vulkan-Loader/pull/539
35 #define ENABLE_VALIDATION_LAYER 0
36
37 namespace ncnn {
38
39 // global
40 static Mutex g_instance_lock;
41
42 class __ncnn_vulkan_instance_holder
43 {
44 public:
__ncnn_vulkan_instance_holder()45 __ncnn_vulkan_instance_holder()
46 {
47 instance = 0;
48 #if ENABLE_VALIDATION_LAYER
49 callback = 0;
50 #endif
51 }
52
~__ncnn_vulkan_instance_holder()53 ~__ncnn_vulkan_instance_holder()
54 {
55 destroy_gpu_instance();
56 }
57
operator VkInstance()58 operator VkInstance()
59 {
60 return instance;
61 }
62
63 VkInstance instance;
64 #if ENABLE_VALIDATION_LAYER
65 VkDebugUtilsMessengerEXT callback;
66 #endif
67 };
68 static __ncnn_vulkan_instance_holder g_instance;
69
70 static int g_gpu_count = 0;
71 static int g_default_gpu_index = -1;
72
73 // NOTE 8 is large enough i think ...
74 #define NCNN_MAX_GPU_COUNT 8
75 static GpuInfo* g_gpu_infos[NCNN_MAX_GPU_COUNT] = {0};
76
77 // default vulkan device
78 static Mutex g_default_vkdev_lock;
79 static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0};
80
81 struct layer_shader_registry_entry
82 {
83 const char* comp_data;
84 int comp_data_size;
85 };
86
87 #include "layer_shader_spv_data.h"
88
89 static const layer_shader_registry_entry layer_shader_registry[] = {
90 #include "layer_shader_registry.h"
91 };
92
93 static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
94
95 int support_VK_KHR_external_memory_capabilities = 0;
96 int support_VK_KHR_get_physical_device_properties2 = 0;
97 int support_VK_KHR_get_surface_capabilities2 = 0;
98 int support_VK_KHR_surface = 0;
99 int support_VK_EXT_debug_utils = 0;
100 #if __ANDROID_API__ >= 26
101 int support_VK_KHR_android_surface = 0;
102 #endif // __ANDROID_API__ >= 26
103
104 // VK_KHR_external_memory_capabilities
105 PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0;
106
107 // VK_KHR_get_physical_device_properties2
108 PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0;
109 PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0;
110 PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0;
111 PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0;
112 PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0;
113 PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0;
114 PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR = 0;
115
116 // VK_KHR_get_surface_capabilities2
117 PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0;
118 PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0;
119
120 // VK_KHR_surface
121 PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0;
122 PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0;
123 PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0;
124 PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0;
125 PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0;
126
127 #if __ANDROID_API__ >= 26
128 // VK_KHR_android_surface
129 PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0;
130 #endif // __ANDROID_API__ >= 26
131
132 class GpuInfoPrivate
133 {
134 public:
135 // vulkan physical device
136 VkPhysicalDevice physical_device;
137
138 // memory properties
139 VkPhysicalDeviceMemoryProperties physical_device_memory_properties;
140
141 // info
142 uint32_t api_version;
143 uint32_t driver_version;
144 uint32_t vendor_id;
145 uint32_t device_id;
146 char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
147 uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
148
149 // 0 = discrete gpu
150 // 1 = integrated gpu
151 // 2 = virtual gpu
152 // 3 = cpu
153 int type;
154
155 // hardware limit
156 uint32_t max_shared_memory_size;
157 uint32_t max_workgroup_count_x;
158 uint32_t max_workgroup_count_y;
159 uint32_t max_workgroup_count_z;
160 uint32_t max_workgroup_invocations;
161 uint32_t max_workgroup_size_x;
162 uint32_t max_workgroup_size_y;
163 uint32_t max_workgroup_size_z;
164 size_t memory_map_alignment;
165 size_t buffer_offset_alignment;
166 size_t non_coherent_atom_size;
167 size_t buffer_image_granularity;
168 uint32_t max_image_dimension_1d;
169 uint32_t max_image_dimension_2d;
170 uint32_t max_image_dimension_3d;
171 float timestamp_period;
172
173 // runtime
174 uint32_t compute_queue_family_index;
175 uint32_t graphics_queue_family_index;
176 uint32_t transfer_queue_family_index;
177
178 uint32_t compute_queue_count;
179 uint32_t graphics_queue_count;
180 uint32_t transfer_queue_count;
181
182 // property
183 bool unified_compute_transfer_queue;
184
185 // subgroup
186 uint32_t subgroup_size;
187 bool support_subgroup_basic;
188 bool support_subgroup_vote;
189 bool support_subgroup_ballot;
190 bool support_subgroup_shuffle;
191
192 // bug is not feature
193 bool bug_storage_buffer_no_l1;
194 bool bug_corrupted_online_pipeline_cache;
195 bool bug_buffer_image_load_zero;
196
197 // but sometimes bug is a feature
198 bool bug_implicit_fp16_arithmetic;
199
200 // fp16 and int8 feature
201 bool support_fp16_packed;
202 bool support_fp16_storage;
203 bool support_fp16_arithmetic;
204 bool support_int8_packed;
205 bool support_int8_storage;
206 bool support_int8_arithmetic;
207
208 // ycbcr conversion feature
209 bool support_ycbcr_conversion;
210
211 bool support_reserved_0;
212
213 // extension capability
214 int support_VK_KHR_8bit_storage;
215 int support_VK_KHR_16bit_storage;
216 int support_VK_KHR_bind_memory2;
217 int support_VK_KHR_create_renderpass2;
218 int support_VK_KHR_dedicated_allocation;
219 int support_VK_KHR_descriptor_update_template;
220 int support_VK_KHR_external_memory;
221 int support_VK_KHR_get_memory_requirements2;
222 int support_VK_KHR_maintenance1;
223 int support_VK_KHR_maintenance2;
224 int support_VK_KHR_maintenance3;
225 int support_VK_KHR_multiview;
226 int support_VK_KHR_push_descriptor;
227 int support_VK_KHR_sampler_ycbcr_conversion;
228 int support_VK_KHR_shader_float16_int8;
229 int support_VK_KHR_shader_float_controls;
230 int support_VK_KHR_storage_buffer_storage_class;
231 int support_VK_KHR_swapchain;
232 int support_VK_EXT_descriptor_indexing;
233 int support_VK_EXT_memory_budget;
234 int support_VK_EXT_queue_family_foreign;
235 #if __ANDROID_API__ >= 26
236 int support_VK_ANDROID_external_memory_android_hardware_buffer;
237 #endif // __ANDROID_API__ >= 26
238 };
239
GpuInfo()240 GpuInfo::GpuInfo()
241 : d(new GpuInfoPrivate)
242 {
243 }
244
~GpuInfo()245 GpuInfo::~GpuInfo()
246 {
247 delete d;
248 }
249
GpuInfo(const GpuInfo &)250 GpuInfo::GpuInfo(const GpuInfo&)
251 : d(0)
252 {
253 }
254
operator =(const GpuInfo &)255 GpuInfo& GpuInfo::operator=(const GpuInfo&)
256 {
257 return *this;
258 }
259
physical_device() const260 VkPhysicalDevice GpuInfo::physical_device() const
261 {
262 return d->physical_device;
263 }
264
physical_device_memory_properties() const265 const VkPhysicalDeviceMemoryProperties& GpuInfo::physical_device_memory_properties() const
266 {
267 return d->physical_device_memory_properties;
268 }
269
api_version() const270 uint32_t GpuInfo::api_version() const
271 {
272 return d->api_version;
273 }
274
driver_version() const275 uint32_t GpuInfo::driver_version() const
276 {
277 return d->driver_version;
278 }
279
vendor_id() const280 uint32_t GpuInfo::vendor_id() const
281 {
282 return d->vendor_id;
283 }
284
device_id() const285 uint32_t GpuInfo::device_id() const
286 {
287 return d->device_id;
288 }
289
device_name() const290 const char* GpuInfo::device_name() const
291 {
292 return d->device_name;
293 }
294
pipeline_cache_uuid() const295 uint8_t* GpuInfo::pipeline_cache_uuid() const
296 {
297 return d->pipeline_cache_uuid;
298 }
299
type() const300 int GpuInfo::type() const
301 {
302 return d->type;
303 }
304
max_shared_memory_size() const305 uint32_t GpuInfo::max_shared_memory_size() const
306 {
307 return d->max_shared_memory_size;
308 }
309
max_workgroup_count_x() const310 uint32_t GpuInfo::max_workgroup_count_x() const
311 {
312 return d->max_workgroup_count_x;
313 }
314
max_workgroup_count_y() const315 uint32_t GpuInfo::max_workgroup_count_y() const
316 {
317 return d->max_workgroup_count_y;
318 }
319
max_workgroup_count_z() const320 uint32_t GpuInfo::max_workgroup_count_z() const
321 {
322 return d->max_workgroup_count_z;
323 }
324
max_workgroup_invocations() const325 uint32_t GpuInfo::max_workgroup_invocations() const
326 {
327 return d->max_workgroup_invocations;
328 }
329
max_workgroup_size_x() const330 uint32_t GpuInfo::max_workgroup_size_x() const
331 {
332 return d->max_workgroup_size_x;
333 }
334
max_workgroup_size_y() const335 uint32_t GpuInfo::max_workgroup_size_y() const
336 {
337 return d->max_workgroup_size_y;
338 }
339
max_workgroup_size_z() const340 uint32_t GpuInfo::max_workgroup_size_z() const
341 {
342 return d->max_workgroup_size_z;
343 }
344
memory_map_alignment() const345 size_t GpuInfo::memory_map_alignment() const
346 {
347 return d->memory_map_alignment;
348 }
349
buffer_offset_alignment() const350 size_t GpuInfo::buffer_offset_alignment() const
351 {
352 return d->buffer_offset_alignment;
353 }
354
non_coherent_atom_size() const355 size_t GpuInfo::non_coherent_atom_size() const
356 {
357 return d->non_coherent_atom_size;
358 }
359
buffer_image_granularity() const360 size_t GpuInfo::buffer_image_granularity() const
361 {
362 return d->buffer_image_granularity;
363 }
364
max_image_dimension_1d() const365 uint32_t GpuInfo::max_image_dimension_1d() const
366 {
367 return d->max_image_dimension_1d;
368 }
369
max_image_dimension_2d() const370 uint32_t GpuInfo::max_image_dimension_2d() const
371 {
372 return d->max_image_dimension_2d;
373 }
374
max_image_dimension_3d() const375 uint32_t GpuInfo::max_image_dimension_3d() const
376 {
377 return d->max_image_dimension_3d;
378 }
379
timestamp_period() const380 float GpuInfo::timestamp_period() const
381 {
382 return d->timestamp_period;
383 }
384
compute_queue_family_index() const385 uint32_t GpuInfo::compute_queue_family_index() const
386 {
387 return d->compute_queue_family_index;
388 }
389
graphics_queue_family_index() const390 uint32_t GpuInfo::graphics_queue_family_index() const
391 {
392 return d->graphics_queue_family_index;
393 }
394
transfer_queue_family_index() const395 uint32_t GpuInfo::transfer_queue_family_index() const
396 {
397 return d->transfer_queue_family_index;
398 }
399
compute_queue_count() const400 uint32_t GpuInfo::compute_queue_count() const
401 {
402 return d->compute_queue_count;
403 }
404
graphics_queue_count() const405 uint32_t GpuInfo::graphics_queue_count() const
406 {
407 return d->graphics_queue_count;
408 }
409
transfer_queue_count() const410 uint32_t GpuInfo::transfer_queue_count() const
411 {
412 return d->transfer_queue_count;
413 }
414
unified_compute_transfer_queue() const415 bool GpuInfo::unified_compute_transfer_queue() const
416 {
417 return d->unified_compute_transfer_queue;
418 }
419
subgroup_size() const420 uint32_t GpuInfo::subgroup_size() const
421 {
422 return d->subgroup_size;
423 }
424
support_subgroup_basic() const425 bool GpuInfo::support_subgroup_basic() const
426 {
427 return d->support_subgroup_basic;
428 }
429
support_subgroup_vote() const430 bool GpuInfo::support_subgroup_vote() const
431 {
432 return d->support_subgroup_vote;
433 }
434
support_subgroup_ballot() const435 bool GpuInfo::support_subgroup_ballot() const
436 {
437 return d->support_subgroup_ballot;
438 }
439
support_subgroup_shuffle() const440 bool GpuInfo::support_subgroup_shuffle() const
441 {
442 return d->support_subgroup_shuffle;
443 }
444
bug_storage_buffer_no_l1() const445 bool GpuInfo::bug_storage_buffer_no_l1() const
446 {
447 return d->bug_storage_buffer_no_l1;
448 }
449
bug_corrupted_online_pipeline_cache() const450 bool GpuInfo::bug_corrupted_online_pipeline_cache() const
451 {
452 return d->bug_corrupted_online_pipeline_cache;
453 }
454
bug_buffer_image_load_zero() const455 bool GpuInfo::bug_buffer_image_load_zero() const
456 {
457 return d->bug_buffer_image_load_zero;
458 }
459
bug_implicit_fp16_arithmetic() const460 bool GpuInfo::bug_implicit_fp16_arithmetic() const
461 {
462 return d->bug_implicit_fp16_arithmetic;
463 }
464
support_fp16_packed() const465 bool GpuInfo::support_fp16_packed() const
466 {
467 return d->support_fp16_packed;
468 }
469
support_fp16_storage() const470 bool GpuInfo::support_fp16_storage() const
471 {
472 return d->support_fp16_storage;
473 }
474
support_fp16_arithmetic() const475 bool GpuInfo::support_fp16_arithmetic() const
476 {
477 return d->support_fp16_arithmetic;
478 }
479
support_int8_packed() const480 bool GpuInfo::support_int8_packed() const
481 {
482 return d->support_int8_packed;
483 }
484
support_int8_storage() const485 bool GpuInfo::support_int8_storage() const
486 {
487 return d->support_int8_storage;
488 }
489
support_int8_arithmetic() const490 bool GpuInfo::support_int8_arithmetic() const
491 {
492 return d->support_int8_arithmetic;
493 }
494
support_ycbcr_conversion() const495 bool GpuInfo::support_ycbcr_conversion() const
496 {
497 return d->support_ycbcr_conversion;
498 }
499
support_VK_KHR_8bit_storage() const500 int GpuInfo::support_VK_KHR_8bit_storage() const
501 {
502 return d->support_VK_KHR_8bit_storage;
503 }
504
support_VK_KHR_16bit_storage() const505 int GpuInfo::support_VK_KHR_16bit_storage() const
506 {
507 return d->support_VK_KHR_16bit_storage;
508 }
509
support_VK_KHR_bind_memory2() const510 int GpuInfo::support_VK_KHR_bind_memory2() const
511 {
512 return d->support_VK_KHR_bind_memory2;
513 }
514
support_VK_KHR_create_renderpass2() const515 int GpuInfo::support_VK_KHR_create_renderpass2() const
516 {
517 return d->support_VK_KHR_create_renderpass2;
518 }
519
support_VK_KHR_dedicated_allocation() const520 int GpuInfo::support_VK_KHR_dedicated_allocation() const
521 {
522 return d->support_VK_KHR_dedicated_allocation;
523 }
524
support_VK_KHR_descriptor_update_template() const525 int GpuInfo::support_VK_KHR_descriptor_update_template() const
526 {
527 return d->support_VK_KHR_descriptor_update_template;
528 }
529
support_VK_KHR_external_memory() const530 int GpuInfo::support_VK_KHR_external_memory() const
531 {
532 return d->support_VK_KHR_external_memory;
533 }
534
support_VK_KHR_get_memory_requirements2() const535 int GpuInfo::support_VK_KHR_get_memory_requirements2() const
536 {
537 return d->support_VK_KHR_get_memory_requirements2;
538 }
539
support_VK_KHR_maintenance1() const540 int GpuInfo::support_VK_KHR_maintenance1() const
541 {
542 return d->support_VK_KHR_maintenance1;
543 }
544
support_VK_KHR_maintenance2() const545 int GpuInfo::support_VK_KHR_maintenance2() const
546 {
547 return d->support_VK_KHR_maintenance2;
548 }
549
support_VK_KHR_maintenance3() const550 int GpuInfo::support_VK_KHR_maintenance3() const
551 {
552 return d->support_VK_KHR_maintenance3;
553 }
554
support_VK_KHR_multiview() const555 int GpuInfo::support_VK_KHR_multiview() const
556 {
557 return d->support_VK_KHR_multiview;
558 }
559
support_VK_KHR_push_descriptor() const560 int GpuInfo::support_VK_KHR_push_descriptor() const
561 {
562 return d->support_VK_KHR_push_descriptor;
563 }
564
support_VK_KHR_sampler_ycbcr_conversion() const565 int GpuInfo::support_VK_KHR_sampler_ycbcr_conversion() const
566 {
567 return d->support_VK_KHR_sampler_ycbcr_conversion;
568 }
569
support_VK_KHR_shader_float16_int8() const570 int GpuInfo::support_VK_KHR_shader_float16_int8() const
571 {
572 return d->support_VK_KHR_shader_float16_int8;
573 }
574
support_VK_KHR_shader_float_controls() const575 int GpuInfo::support_VK_KHR_shader_float_controls() const
576 {
577 return d->support_VK_KHR_shader_float_controls;
578 }
579
support_VK_KHR_storage_buffer_storage_class() const580 int GpuInfo::support_VK_KHR_storage_buffer_storage_class() const
581 {
582 return d->support_VK_KHR_storage_buffer_storage_class;
583 }
584
support_VK_KHR_swapchain() const585 int GpuInfo::support_VK_KHR_swapchain() const
586 {
587 return d->support_VK_KHR_swapchain;
588 }
589
support_VK_EXT_descriptor_indexing() const590 int GpuInfo::support_VK_EXT_descriptor_indexing() const
591 {
592 return d->support_VK_EXT_descriptor_indexing;
593 }
594
support_VK_EXT_memory_budget() const595 int GpuInfo::support_VK_EXT_memory_budget() const
596 {
597 return d->support_VK_EXT_memory_budget;
598 }
599
support_VK_EXT_queue_family_foreign() const600 int GpuInfo::support_VK_EXT_queue_family_foreign() const
601 {
602 return d->support_VK_EXT_queue_family_foreign;
603 }
604
605 #if __ANDROID_API__ >= 26
support_VK_ANDROID_external_memory_android_hardware_buffer() const606 int GpuInfo::support_VK_ANDROID_external_memory_android_hardware_buffer() const
607 {
608 return d->support_VK_ANDROID_external_memory_android_hardware_buffer;
609 }
610 #endif // __ANDROID_API__ >= 26
611
init_instance_extension()612 static int init_instance_extension()
613 {
614 if (support_VK_KHR_external_memory_capabilities)
615 {
616 vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR");
617 }
618
619 if (support_VK_KHR_get_physical_device_properties2)
620 {
621 vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR");
622 vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR");
623 vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR");
624 vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR");
625 vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR");
626 vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR");
627 vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR");
628 }
629
630 if (support_VK_KHR_get_surface_capabilities2)
631 {
632 vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR");
633 vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR");
634 }
635
636 if (support_VK_KHR_surface)
637 {
638 vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR");
639 vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR");
640 vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR");
641 vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR");
642 vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR");
643 }
644
645 #if __ANDROID_API__ >= 26
646 if (support_VK_KHR_android_surface)
647 {
648 vkCreateAndroidSurfaceKHR = (PFN_vkCreateAndroidSurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkCreateAndroidSurfaceKHR");
649 }
650 #endif // __ANDROID_API__ >= 26
651
652 return 0;
653 }
654
#if ENABLE_VALIDATION_LAYER
// Debug-utils messenger callback: logs every validation message.
// Returning VK_FALSE tells the layer not to abort the triggering call.
static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
    VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/,
    VkDebugUtilsMessageTypeFlagsEXT /*messageType*/,
    const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData,
    void* /*pUserData*/)
{
    NCNN_LOGE("validation layer: %s", pCallbackData->pMessage);

    return VK_FALSE;
}

// Loader trampoline for vkCreateDebugUtilsMessengerEXT (extension entry
// point, must be fetched via vkGetInstanceProcAddr).
static VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback)
{
    PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT");
    if (func)
        return func(instance, pCreateInfo, pAllocator, pCallback);

    return VK_ERROR_EXTENSION_NOT_PRESENT;
}

// Loader trampoline for vkDestroyDebugUtilsMessengerEXT; silently no-ops
// when the extension entry point is unavailable.
static void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator)
{
    PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT");
    if (func)
        func(instance, callback, pAllocator);
}
#endif // ENABLE_VALIDATION_LAYER
683
find_device_compute_queue(const std::vector<VkQueueFamilyProperties> & queueFamilyProperties)684 static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
685 {
686 // first try, compute only queue
687 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
688 {
689 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
690
691 if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
692 && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
693 {
694 return i;
695 }
696 }
697
698 // second try, any queue with compute and graphics
699 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
700 {
701 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
702
703 if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
704 && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
705 {
706 return i;
707 }
708 }
709
710 // third try, any queue with compute
711 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
712 {
713 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
714
715 if (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
716 {
717 return i;
718 }
719 }
720
721 // NCNN_LOGE("no compute queue");
722 return -1;
723 }
724
find_device_graphics_queue(const std::vector<VkQueueFamilyProperties> & queueFamilyProperties)725 static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
726 {
727 // first try, graphics only queue
728 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
729 {
730 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
731
732 if ((queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
733 && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT))
734 {
735 return i;
736 }
737 }
738
739 // second try, any queue with graphics and compute
740 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
741 {
742 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
743
744 if ((queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
745 && (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT))
746 {
747 return i;
748 }
749 }
750
751 // third try, any queue with graphics
752 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
753 {
754 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
755
756 if (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)
757 {
758 return i;
759 }
760 }
761
762 // NCNN_LOGE("no graphics queue");
763 return -1;
764 }
765
find_device_transfer_queue(const std::vector<VkQueueFamilyProperties> & queueFamilyProperties)766 static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
767 {
768 // first try, transfer only queue
769 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
770 {
771 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
772
773 if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
774 && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
775 && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
776 {
777 return i;
778 }
779 }
780
781 // second try, any queue with transfer
782 for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
783 {
784 const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
785
786 if (queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
787 {
788 return i;
789 }
790 }
791
792 // third try, use compute queue
793 uint32_t compute_queue_index = find_device_compute_queue(queueFamilyProperties);
794 if (compute_queue_index != (uint32_t)-1)
795 {
796 return compute_queue_index;
797 }
798
799 // fourth try, use graphics queue
800 uint32_t graphics_queue_index = find_device_graphics_queue(queueFamilyProperties);
801 if (graphics_queue_index != (uint32_t)-1)
802 {
803 return graphics_queue_index;
804 }
805
806 // NCNN_LOGE("no transfer queue");
807 return -1;
808 }
809
find_default_vulkan_device_index()810 static int find_default_vulkan_device_index()
811 {
812 // first try, discrete gpu
813 for (int i = 0; i < g_gpu_count; i++)
814 {
815 if (g_gpu_infos[i]->type() == 0)
816 return i;
817 }
818
819 // second try, integrated gpu
820 for (int i = 0; i < g_gpu_count; i++)
821 {
822 if (g_gpu_infos[i]->type() == 1)
823 return i;
824 }
825
826 // third try, any probed device
827 if (g_gpu_count > 0)
828 return 0;
829
830 NCNN_LOGE("no vulkan device");
831 return -1;
832 }
833
create_gpu_instance()834 int create_gpu_instance()
835 {
836 MutexLockGuard lock(g_instance_lock);
837
838 if ((VkInstance)g_instance != 0)
839 return 0;
840
841 // NCNN_LOGE("create_gpu_instance");
842
843 VkResult ret;
844
845 std::vector<const char*> enabledLayers;
846
847 #if ENABLE_VALIDATION_LAYER
848 uint32_t instanceLayerPropertyCount;
849 ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL);
850 if (ret != VK_SUCCESS)
851 {
852 NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
853 return -1;
854 }
855
856 std::vector<VkLayerProperties> instanceLayerProperties(instanceLayerPropertyCount);
857 ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data());
858 if (ret != VK_SUCCESS)
859 {
860 NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
861 return -1;
862 }
863
864 for (uint32_t i = 0; i < instanceLayerPropertyCount; i++)
865 {
866 const VkLayerProperties& lp = instanceLayerProperties[i];
867 // NCNN_LOGE("instance layer %s = %u", lp.layerName, lp.implementationVersion);
868
869 if (strcmp(lp.layerName, "VK_LAYER_LUNARG_standard_validation") == 0)
870 {
871 enabledLayers.push_back("VK_LAYER_LUNARG_standard_validation");
872 }
873 if (strcmp(lp.layerName, "VK_LAYER_LUNARG_parameter_validation") == 0)
874 {
875 enabledLayers.push_back("VK_LAYER_LUNARG_parameter_validation");
876 }
877 if (strcmp(lp.layerName, "VK_LAYER_KHRONOS_validation") == 0)
878 {
879 enabledLayers.push_back("VK_LAYER_KHRONOS_validation");
880 }
881 }
882 #endif // ENABLE_VALIDATION_LAYER
883
884 std::vector<const char*> enabledExtensions;
885
886 uint32_t instanceExtensionPropertyCount;
887 ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL);
888 if (ret != VK_SUCCESS)
889 {
890 NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
891 return -1;
892 }
893
894 std::vector<VkExtensionProperties> instanceExtensionProperties(instanceExtensionPropertyCount);
895 ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data());
896 if (ret != VK_SUCCESS)
897 {
898 NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
899 return -1;
900 }
901
902 support_VK_KHR_get_physical_device_properties2 = 0;
903 support_VK_KHR_get_surface_capabilities2 = 0;
904 support_VK_KHR_surface = 0;
905 support_VK_EXT_debug_utils = 0;
906 #if __ANDROID_API__ >= 26
907 support_VK_KHR_android_surface = 0;
908 #endif // __ANDROID_API__ >= 26
909 for (uint32_t j = 0; j < instanceExtensionPropertyCount; j++)
910 {
911 const VkExtensionProperties& exp = instanceExtensionProperties[j];
912 // NCNN_LOGE("instance extension %s = %u", exp.extensionName, exp.specVersion);
913
914 if (strcmp(exp.extensionName, "VK_KHR_external_memory_capabilities") == 0)
915 support_VK_KHR_external_memory_capabilities = exp.specVersion;
916 else if (strcmp(exp.extensionName, "VK_KHR_get_physical_device_properties2") == 0)
917 support_VK_KHR_get_physical_device_properties2 = exp.specVersion;
918 else if (strcmp(exp.extensionName, "VK_KHR_get_surface_capabilities2") == 0)
919 support_VK_KHR_get_surface_capabilities2 = exp.specVersion;
920 else if (strcmp(exp.extensionName, "VK_KHR_surface") == 0)
921 support_VK_KHR_surface = exp.specVersion;
922 else if (strcmp(exp.extensionName, "VK_EXT_debug_utils") == 0)
923 support_VK_EXT_debug_utils = exp.specVersion;
924 #if __ANDROID_API__ >= 26
925 else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0)
926 support_VK_KHR_android_surface = exp.specVersion;
927 #endif // __ANDROID_API__ >= 26
928 }
929
930 if (support_VK_KHR_external_memory_capabilities)
931 enabledExtensions.push_back("VK_KHR_external_memory_capabilities");
932 if (support_VK_KHR_get_physical_device_properties2)
933 enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
934 if (support_VK_KHR_get_surface_capabilities2)
935 enabledExtensions.push_back("VK_KHR_get_surface_capabilities2");
936 if (support_VK_KHR_surface)
937 enabledExtensions.push_back("VK_KHR_surface");
938 #if ENABLE_VALIDATION_LAYER
939 if (support_VK_EXT_debug_utils)
940 enabledExtensions.push_back("VK_EXT_debug_utils");
941 #endif // ENABLE_VALIDATION_LAYER
942 #if __ANDROID_API__ >= 26
943 if (support_VK_KHR_android_surface)
944 enabledExtensions.push_back("VK_KHR_android_surface");
945 #endif // __ANDROID_API__ >= 26
946
947 uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
948 typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
949 PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
950 if (vkEnumerateInstanceVersion)
951 {
952 ret = vkEnumerateInstanceVersion(&instance_api_version);
953 if (ret != VK_SUCCESS)
954 {
955 NCNN_LOGE("vkEnumerateInstanceVersion failed %d", ret);
956 return -1;
957 }
958 }
959
960 // NCNN_LOGE("instance apiVersion = %u.%u.%u", VK_VERSION_MAJOR(instance_api_version), VK_VERSION_MINOR(instance_api_version), VK_VERSION_PATCH(instance_api_version));
961
962 VkApplicationInfo applicationInfo;
963 applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
964 applicationInfo.pNext = 0;
965 applicationInfo.pApplicationName = "ncnn";
966 applicationInfo.applicationVersion = 0;
967 applicationInfo.pEngineName = "ncnn";
968 applicationInfo.engineVersion = 20201010;
969 applicationInfo.apiVersion = instance_api_version;
970
971 VkInstanceCreateInfo instanceCreateInfo;
972 instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
973 instanceCreateInfo.pNext = 0;
974 instanceCreateInfo.flags = 0;
975 instanceCreateInfo.pApplicationInfo = &applicationInfo;
976 instanceCreateInfo.enabledLayerCount = enabledLayers.size();
977 instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data();
978 instanceCreateInfo.enabledExtensionCount = enabledExtensions.size();
979 instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
980
981 VkInstance instance = 0;
982 ret = vkCreateInstance(&instanceCreateInfo, 0, &instance);
983 if (ret != VK_SUCCESS)
984 {
985 NCNN_LOGE("vkCreateInstance failed %d", ret);
986 return -1;
987 }
988
989 g_instance.instance = instance;
990
991 #if ENABLE_VALIDATION_LAYER
992 if (support_VK_EXT_debug_utils)
993 {
994 VkDebugUtilsMessengerCreateInfoEXT createInfo = {};
995 createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
996 createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
997 createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
998 createInfo.pfnUserCallback = debugCallback;
999 createInfo.pUserData = 0;
1000 ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &g_instance.callback);
1001 if (ret != VK_SUCCESS)
1002 {
1003 NCNN_LOGE("CreateDebugUtilsMessengerEXT failed %d", ret);
1004 return -1;
1005 }
1006 }
1007 #endif // ENABLE_VALIDATION_LAYER
1008
1009 init_instance_extension();
1010
1011 uint32_t physicalDeviceCount = 0;
1012 ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0);
1013 if (ret != VK_SUCCESS)
1014 {
1015 NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
1016 return -1;
1017 }
1018
1019 if (physicalDeviceCount > NCNN_MAX_GPU_COUNT)
1020 physicalDeviceCount = NCNN_MAX_GPU_COUNT;
1021
1022 std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);
1023
1024 ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data());
1025 if (ret != VK_SUCCESS)
1026 {
1027 NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
1028 return -1;
1029 }
1030
1031 // find proper device and queue
1032 int gpu_info_index = 0;
1033 for (uint32_t i = 0; i < physicalDeviceCount; i++)
1034 {
1035 const VkPhysicalDevice& physicalDevice = physicalDevices[i];
1036 delete g_gpu_infos[gpu_info_index];
1037 g_gpu_infos[gpu_info_index] = new GpuInfo;
1038 GpuInfoPrivate& gpu_info = *(g_gpu_infos[gpu_info_index]->d);
1039
1040 // device type
1041 VkPhysicalDeviceProperties physicalDeviceProperties;
1042 vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties);
1043
1044 // NCNN_LOGE("[%u] apiVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion),
1045 // VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion));
1046 // NCNN_LOGE("[%u] driverVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion),
1047 // VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion));
1048 // NCNN_LOGE("[%u] vendorID = %x", i, physicalDeviceProperties.vendorID);
1049 // NCNN_LOGE("[%u] deviceID = %x", i, physicalDeviceProperties.deviceID);
1050 // NCNN_LOGE("[%u] deviceType = %x", i, physicalDeviceProperties.deviceType);
1051 // NCNN_LOGE("[%u] deviceName = %s", i, physicalDeviceProperties.deviceName);
1052 // NCNN_LOGE("[%u] pipelineCacheUUID = %u", i, physicalDeviceProperties.pipelineCacheUUID);
1053
1054 // mali
1055 // t760 = 0x13b5 0x7500001
1056 // t860 = 0x13b5 0x8602000
1057 // t880 = 0x13b5 0x8800020
1058 // g51 = 0x13b5 0x70901010
1059 // g52 = 0x13b5 0x74021000
1060 // g71 = 0x13b5 0x60a00002
1061 // g72 = 0x13b5 0x62210001
1062 // g76 = 0x13b5 0x72110000
1063 // g77 = 0x13b5 0x90800011
1064
1065 // adreno
1066 // 506 = 0x5143 0x5000600
1067 // 510 = 0x5143 0x5010000
1068 // 512 = 0x5143 0x5010200
1069 // 530 = 0x5143 0x5030004
1070 // 540 = 0x5143 0x5040001
1071 // 616 = 0x5143 0x6010600
1072 // 630 = 0x5143 0x6030001
1073 // 640 = 0x5143 0x6040001
1074 // 650 = 0x5143 0x6050002
1075
1076 gpu_info.bug_storage_buffer_no_l1 = false;
1077 gpu_info.bug_corrupted_online_pipeline_cache = false;
1078 gpu_info.bug_implicit_fp16_arithmetic = false;
1079
1080 if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 66))
1081 {
1082 // qcom adreno with old buggy driver cannot share created pipeline properly
1083 gpu_info.bug_corrupted_online_pipeline_cache = true;
1084 }
1085
1086 if (physicalDeviceProperties.vendorID == 0x5143 && !(physicalDeviceProperties.deviceID == 0x6040001 || physicalDeviceProperties.deviceID == 0x6050002))
1087 {
1088 // NOTE but qcom855/qcom855plus/qcom865 are known exceptions
1089 // qcom adreno storage buffer without L1 cache
1090 gpu_info.bug_storage_buffer_no_l1 = true;
1091 }
1092
1093 if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 1, 87))
1094 {
1095 // HACK buffer2image before image-read dependency does not work properly
1096 // even promised with full image memory barrier on old adreno driver
1097 // TODO figure out a proper workaround without hurt speed too much
1098 // TODO only for old drivers
1099 gpu_info.bug_buffer_image_load_zero = true;
1100 }
1101
1102 if (physicalDeviceProperties.vendorID == 0x13b5
1103 && (physicalDeviceProperties.deviceID == 0x7500001
1104 || physicalDeviceProperties.deviceID == 0x8602000
1105 || physicalDeviceProperties.deviceID == 0x8800020
1106 || physicalDeviceProperties.deviceID == 0x70901010
1107 || physicalDeviceProperties.deviceID == 0x74021000
1108 || physicalDeviceProperties.deviceID == 0x60a00002
1109 || physicalDeviceProperties.deviceID == 0x62210001))
1110 {
1111 // NOTE rk3288/rk3399/t880/g51/g52/g71/g72
1112 // however, g76/g77 has explicit fp16 arithmetic
1113 // arm mali driver accept spirv with fp16 arithmetic
1114 gpu_info.bug_implicit_fp16_arithmetic = true;
1115 }
1116
1117 if (physicalDeviceProperties.vendorID == 0x5143
1118 && (physicalDeviceProperties.deviceID == 0x6030001
1119 || physicalDeviceProperties.deviceID == 0x6040001
1120 || physicalDeviceProperties.deviceID == 0x6050002))
1121 {
1122 // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
1123 // qcom adreno driver accept spirv with fp16 arithmetic
1124 gpu_info.bug_implicit_fp16_arithmetic = true;
1125 }
1126
1127 gpu_info.physical_device = physicalDevice;
1128
1129 // info
1130 gpu_info.api_version = physicalDeviceProperties.apiVersion;
1131 gpu_info.driver_version = physicalDeviceProperties.driverVersion;
1132 gpu_info.vendor_id = physicalDeviceProperties.vendorID;
1133 gpu_info.device_id = physicalDeviceProperties.deviceID;
1134 memcpy(gpu_info.device_name, physicalDeviceProperties.deviceName, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
1135 memcpy(gpu_info.pipeline_cache_uuid, physicalDeviceProperties.pipelineCacheUUID, VK_UUID_SIZE);
1136
1137 if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)
1138 gpu_info.type = 0;
1139 else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
1140 gpu_info.type = 1;
1141 else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU)
1142 gpu_info.type = 2;
1143 else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU)
1144 gpu_info.type = 3;
1145 else
1146 gpu_info.type = -1;
1147
1148 // device capability
1149 gpu_info.max_shared_memory_size = physicalDeviceProperties.limits.maxComputeSharedMemorySize;
1150
1151 gpu_info.max_workgroup_count_x = physicalDeviceProperties.limits.maxComputeWorkGroupCount[0];
1152 gpu_info.max_workgroup_count_y = physicalDeviceProperties.limits.maxComputeWorkGroupCount[1];
1153 gpu_info.max_workgroup_count_z = physicalDeviceProperties.limits.maxComputeWorkGroupCount[2];
1154
1155 gpu_info.max_workgroup_invocations = physicalDeviceProperties.limits.maxComputeWorkGroupInvocations;
1156
1157 gpu_info.max_workgroup_size_x = physicalDeviceProperties.limits.maxComputeWorkGroupSize[0];
1158 gpu_info.max_workgroup_size_y = physicalDeviceProperties.limits.maxComputeWorkGroupSize[1];
1159 gpu_info.max_workgroup_size_z = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2];
1160
1161 gpu_info.memory_map_alignment = physicalDeviceProperties.limits.minMemoryMapAlignment;
1162 gpu_info.buffer_offset_alignment = physicalDeviceProperties.limits.minStorageBufferOffsetAlignment;
1163 gpu_info.non_coherent_atom_size = physicalDeviceProperties.limits.nonCoherentAtomSize;
1164 gpu_info.buffer_image_granularity = physicalDeviceProperties.limits.bufferImageGranularity;
1165 gpu_info.max_image_dimension_1d = physicalDeviceProperties.limits.maxImageDimension1D;
1166 gpu_info.max_image_dimension_2d = physicalDeviceProperties.limits.maxImageDimension2D;
1167 gpu_info.max_image_dimension_3d = physicalDeviceProperties.limits.maxImageDimension3D;
1168
1169 gpu_info.timestamp_period = physicalDeviceProperties.limits.timestampPeriod;
1170
1171 // NCNN_LOGE("[%u] max_shared_memory_size = %u", i, gpu_info.max_shared_memory_size);
1172 // NCNN_LOGE("[%u] max_workgroup_count = %u %u %u", i, gpu_info.max_workgroup_count[0], gpu_info.max_workgroup_count[1], gpu_info.max_workgroup_count[2]);
1173 // NCNN_LOGE("[%u] max_workgroup_invocations = %u", i, gpu_info.max_workgroup_invocations);
1174 // NCNN_LOGE("[%u] max_workgroup_size = %u %u %u", i, gpu_info.max_workgroup_size[0], gpu_info.max_workgroup_size[1], gpu_info.max_workgroup_size[2]);
1175 // NCNN_LOGE("[%u] memory_map_alignment = %lu", i, gpu_info.memory_map_alignment);
1176 // NCNN_LOGE("[%u] buffer_offset_alignment = %lu", i, gpu_info.buffer_offset_alignment);
1177
1178 // find compute queue
1179 uint32_t queueFamilyPropertiesCount;
1180 vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0);
1181
1182 std::vector<VkQueueFamilyProperties> queueFamilyProperties(queueFamilyPropertiesCount);
1183 vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data());
1184
1185 gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties);
1186 gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties);
1187 gpu_info.transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties);
1188
1189 gpu_info.compute_queue_count = queueFamilyProperties[gpu_info.compute_queue_family_index].queueCount;
1190 gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount;
1191 gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount;
1192
1193 gpu_info.unified_compute_transfer_queue = gpu_info.compute_queue_family_index == gpu_info.transfer_queue_family_index;
1194
1195 // additional device properties
1196 gpu_info.subgroup_size = 64;
1197 gpu_info.support_subgroup_basic = false;
1198 gpu_info.support_subgroup_vote = false;
1199 gpu_info.support_subgroup_ballot = false;
1200 gpu_info.support_subgroup_shuffle = false;
1201 if (support_VK_KHR_get_physical_device_properties2)
1202 {
1203 void* queryDeviceProperties = 0;
1204
1205 // query subgroup
1206 VkPhysicalDeviceSubgroupProperties physicalDeviceSubgroupProperties;
1207 physicalDeviceSubgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
1208 physicalDeviceSubgroupProperties.pNext = queryDeviceProperties;
1209 if (VK_VERSION_MAJOR(instance_api_version) >= 1 && VK_VERSION_MINOR(instance_api_version) >= 1)
1210 {
1211 queryDeviceProperties = &physicalDeviceSubgroupProperties;
1212 }
1213
1214 VkPhysicalDeviceProperties2KHR queryProperties;
1215 queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
1216 queryProperties.pNext = queryDeviceProperties;
1217
1218 vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);
1219
1220 if (VK_VERSION_MAJOR(instance_api_version) >= 1 && VK_VERSION_MINOR(instance_api_version) >= 1)
1221 {
1222 gpu_info.subgroup_size = physicalDeviceSubgroupProperties.subgroupSize;
1223 if (physicalDeviceSubgroupProperties.supportedStages & VK_SHADER_STAGE_COMPUTE_BIT)
1224 {
1225 gpu_info.support_subgroup_basic = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT;
1226 gpu_info.support_subgroup_vote = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_VOTE_BIT;
1227 gpu_info.support_subgroup_ballot = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT;
1228 gpu_info.support_subgroup_shuffle = physicalDeviceSubgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_SHUFFLE_BIT;
1229 }
1230 }
1231 else
1232 {
1233 if (physicalDeviceProperties.vendorID == 0x5143) // qcom adreno prefer very large workgroup :P
1234 gpu_info.subgroup_size = 128;
1235 if (physicalDeviceProperties.vendorID == 0x13b5) // arm mali
1236 gpu_info.subgroup_size = 16;
1237 if (physicalDeviceProperties.vendorID == 0x1010) // imgtec powervr
1238 gpu_info.subgroup_size = 32;
1239 if (physicalDeviceProperties.vendorID == 0x1002) // amd
1240 gpu_info.subgroup_size = 64;
1241 if (physicalDeviceProperties.vendorID == 0x10de) // nvidia
1242 gpu_info.subgroup_size = 32;
1243 if (physicalDeviceProperties.vendorID == 0x8086) // intel
1244 gpu_info.subgroup_size = 32;
1245 }
1246 }
1247
1248 // cache memory properties
1249 vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physical_device_memory_properties);
1250
1251 // get device extension
1252 uint32_t deviceExtensionPropertyCount = 0;
1253 ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL);
1254 if (ret != VK_SUCCESS)
1255 {
1256 NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret);
1257 return -1;
1258 }
1259
1260 std::vector<VkExtensionProperties> deviceExtensionProperties(deviceExtensionPropertyCount);
1261 ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data());
1262 if (ret != VK_SUCCESS)
1263 {
1264 NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret);
1265 return -1;
1266 }
1267
1268 // extension capability
1269 gpu_info.support_VK_KHR_8bit_storage = 0;
1270 gpu_info.support_VK_KHR_16bit_storage = 0;
1271 gpu_info.support_VK_KHR_bind_memory2 = 0;
1272 gpu_info.support_VK_KHR_create_renderpass2 = 0;
1273 gpu_info.support_VK_KHR_dedicated_allocation = 0;
1274 gpu_info.support_VK_KHR_descriptor_update_template = 0;
1275 gpu_info.support_VK_KHR_external_memory = 0;
1276 gpu_info.support_VK_KHR_get_memory_requirements2 = 0;
1277 gpu_info.support_VK_KHR_maintenance1 = 0;
1278 gpu_info.support_VK_KHR_maintenance2 = 0;
1279 gpu_info.support_VK_KHR_maintenance3 = 0;
1280 gpu_info.support_VK_KHR_multiview = 0;
1281 gpu_info.support_VK_KHR_push_descriptor = 0;
1282 gpu_info.support_VK_KHR_sampler_ycbcr_conversion = 0;
1283 gpu_info.support_VK_KHR_shader_float16_int8 = 0;
1284 gpu_info.support_VK_KHR_shader_float_controls = 0;
1285 gpu_info.support_VK_KHR_storage_buffer_storage_class = 0;
1286 gpu_info.support_VK_KHR_swapchain = 0;
1287 gpu_info.support_VK_EXT_descriptor_indexing = 0;
1288 gpu_info.support_VK_EXT_memory_budget = 0;
1289 gpu_info.support_VK_EXT_queue_family_foreign = 0;
1290 #if __ANDROID_API__ >= 26
1291 gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0;
1292 #endif // __ANDROID_API__ >= 26
1293 for (uint32_t j = 0; j < deviceExtensionPropertyCount; j++)
1294 {
1295 const VkExtensionProperties& exp = deviceExtensionProperties[j];
1296 // NCNN_LOGE("device extension %s = %u", exp.extensionName, exp.specVersion);
1297
1298 if (strcmp(exp.extensionName, "VK_KHR_8bit_storage") == 0)
1299 gpu_info.support_VK_KHR_8bit_storage = exp.specVersion;
1300 else if (strcmp(exp.extensionName, "VK_KHR_16bit_storage") == 0)
1301 gpu_info.support_VK_KHR_16bit_storage = exp.specVersion;
1302 else if (strcmp(exp.extensionName, "VK_KHR_bind_memory2") == 0)
1303 gpu_info.support_VK_KHR_bind_memory2 = exp.specVersion;
1304 else if (strcmp(exp.extensionName, "VK_KHR_create_renderpass2") == 0)
1305 gpu_info.support_VK_KHR_create_renderpass2 = exp.specVersion;
1306 else if (strcmp(exp.extensionName, "VK_KHR_dedicated_allocation") == 0)
1307 gpu_info.support_VK_KHR_dedicated_allocation = exp.specVersion;
1308 else if (strcmp(exp.extensionName, "VK_KHR_descriptor_update_template") == 0)
1309 gpu_info.support_VK_KHR_descriptor_update_template = exp.specVersion;
1310 else if (strcmp(exp.extensionName, "VK_KHR_external_memory") == 0)
1311 gpu_info.support_VK_KHR_external_memory = exp.specVersion;
1312 else if (strcmp(exp.extensionName, "VK_KHR_get_memory_requirements2") == 0)
1313 gpu_info.support_VK_KHR_get_memory_requirements2 = exp.specVersion;
1314 else if (strcmp(exp.extensionName, "VK_KHR_maintenance1") == 0)
1315 gpu_info.support_VK_KHR_maintenance1 = exp.specVersion;
1316 else if (strcmp(exp.extensionName, "VK_KHR_maintenance2") == 0)
1317 gpu_info.support_VK_KHR_maintenance2 = exp.specVersion;
1318 else if (strcmp(exp.extensionName, "VK_KHR_maintenance3") == 0)
1319 gpu_info.support_VK_KHR_maintenance3 = exp.specVersion;
1320 else if (strcmp(exp.extensionName, "VK_KHR_multiview") == 0)
1321 gpu_info.support_VK_KHR_multiview = exp.specVersion;
1322 else if (strcmp(exp.extensionName, "VK_KHR_push_descriptor") == 0)
1323 gpu_info.support_VK_KHR_push_descriptor = exp.specVersion;
1324 else if (strcmp(exp.extensionName, "VK_KHR_sampler_ycbcr_conversion") == 0)
1325 gpu_info.support_VK_KHR_sampler_ycbcr_conversion = exp.specVersion;
1326 else if (strcmp(exp.extensionName, "VK_KHR_shader_float16_int8") == 0)
1327 gpu_info.support_VK_KHR_shader_float16_int8 = exp.specVersion;
1328 else if (strcmp(exp.extensionName, "VK_KHR_shader_float_controls") == 0)
1329 gpu_info.support_VK_KHR_shader_float_controls = exp.specVersion;
1330 else if (strcmp(exp.extensionName, "VK_KHR_storage_buffer_storage_class") == 0)
1331 gpu_info.support_VK_KHR_storage_buffer_storage_class = exp.specVersion;
1332 else if (strcmp(exp.extensionName, "VK_KHR_swapchain") == 0)
1333 gpu_info.support_VK_KHR_swapchain = exp.specVersion;
1334 else if (strcmp(exp.extensionName, "VK_EXT_descriptor_indexing") == 0)
1335 gpu_info.support_VK_EXT_descriptor_indexing = exp.specVersion;
1336 else if (strcmp(exp.extensionName, "VK_EXT_memory_budget") == 0)
1337 gpu_info.support_VK_EXT_memory_budget = exp.specVersion;
1338 else if (strcmp(exp.extensionName, "VK_EXT_queue_family_foreign") == 0)
1339 gpu_info.support_VK_EXT_queue_family_foreign = exp.specVersion;
1340 #if __ANDROID_API__ >= 26
1341 else if (strcmp(exp.extensionName, "VK_ANDROID_external_memory_android_hardware_buffer") == 0)
1342 gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = exp.specVersion;
1343 #endif // __ANDROID_API__ >= 26
1344 }
1345
1346 // check features
1347 gpu_info.support_fp16_packed = true;
1348 gpu_info.support_fp16_storage = false;
1349 gpu_info.support_fp16_arithmetic = false;
1350 gpu_info.support_int8_packed = true;
1351 gpu_info.support_int8_storage = false;
1352 gpu_info.support_int8_arithmetic = false;
1353 gpu_info.support_ycbcr_conversion = false;
1354 if (support_VK_KHR_get_physical_device_properties2)
1355 {
1356 void* queryExtensionFeatures = 0;
1357
1358 // query int8 storage
1359 VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures;
1360 query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
1361 query8BitStorageFeatures.pNext = 0;
1362 if (gpu_info.support_VK_KHR_8bit_storage)
1363 {
1364 query8BitStorageFeatures.pNext = queryExtensionFeatures;
1365 queryExtensionFeatures = &query8BitStorageFeatures;
1366 }
1367
1368 // query fp16/int16 storage
1369 VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures;
1370 query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
1371 query16BitStorageFeatures.pNext = 0;
1372 if (gpu_info.support_VK_KHR_16bit_storage)
1373 {
1374 query16BitStorageFeatures.pNext = queryExtensionFeatures;
1375 queryExtensionFeatures = &query16BitStorageFeatures;
1376 }
1377
1378 // query fp16/int8 arithmetic
1379 VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features;
1380 queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
1381 queryFloat16Int8Features.pNext = 0;
1382 if (gpu_info.support_VK_KHR_shader_float16_int8)
1383 {
1384 queryFloat16Int8Features.pNext = queryExtensionFeatures;
1385 queryExtensionFeatures = &queryFloat16Int8Features;
1386 }
1387
1388 // query ycbcr_conversion
1389 VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
1390 querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR;
1391 querySamplerYcbcrConversionFeatures.pNext = 0;
1392 if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
1393 {
1394 querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures;
1395 queryExtensionFeatures = &querySamplerYcbcrConversionFeatures;
1396 }
1397
1398 VkPhysicalDeviceFeatures2KHR queryFeatures;
1399 queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR;
1400 queryFeatures.pNext = queryExtensionFeatures;
1401
1402 vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures);
1403
1404 if (gpu_info.support_VK_KHR_8bit_storage)
1405 {
1406 gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess;
1407 }
1408 if (gpu_info.support_VK_KHR_16bit_storage && queryFeatures.features.shaderStorageImageExtendedFormats)
1409 {
1410 // shaderStorageImageExtendedFormats enables r16f format in storage image
1411 gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess;
1412 }
1413 if (gpu_info.support_VK_KHR_shader_float16_int8)
1414 {
1415 gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
1416 gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
1417 }
1418 if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
1419 {
1420 gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion;
1421 }
1422 }
1423 else
1424 {
1425 // // TODO
1426 // VkPhysicalDeviceFeatures features;
1427 // vkGetPhysicalDeviceFeatures(physicalDevice, &features);
1428 }
1429
1430 if (physicalDeviceProperties.vendorID == 0x13b5 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 82))
1431 {
1432 // the 16bit_storage implementation of arm mali driver is buggy :[
1433 gpu_info.support_fp16_storage = false;
1434 }
1435
1436 if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82))
1437 {
1438 // the 16bit_storage implementation of vivante gc1700 driver is buggy :[
1439 gpu_info.support_fp16_storage = false;
1440 }
1441
1442 if (gpu_info.bug_implicit_fp16_arithmetic)
1443 {
1444 // force capability on as long as the driver accept spirv with fp16 arithmetic :D
1445 gpu_info.support_fp16_arithmetic = true;
1446 }
1447
1448 NCNN_LOGE("[%u %s] queueC=%u[%u] queueG=%u[%u] queueT=%u[%u]", i, physicalDeviceProperties.deviceName,
1449 gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
1450 gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count,
1451 gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count);
1452
1453 NCNN_LOGE("[%u %s] bugsbn1=%d bugbilz=%d bugcopc=%d bugihfa=%d", i, physicalDeviceProperties.deviceName,
1454 gpu_info.bug_storage_buffer_no_l1, gpu_info.bug_buffer_image_load_zero, gpu_info.bug_corrupted_online_pipeline_cache, gpu_info.bug_implicit_fp16_arithmetic);
1455
1456 NCNN_LOGE("[%u %s] fp16-p/s/a=%d/%d/%d int8-p/s/a=%d/%d/%d", i, physicalDeviceProperties.deviceName,
1457 gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
1458 gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);
1459
1460 NCNN_LOGE("[%u %s] subgroup=%u basic=%d vote=%d ballot=%d shuffle=%d", i, physicalDeviceProperties.deviceName,
1461 gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
1462 gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);
1463
1464 gpu_info_index++;
1465 }
1466
1467 g_gpu_count = gpu_info_index;
1468
1469 // the default gpu device
1470 g_default_gpu_index = find_default_vulkan_device_index();
1471
1472 glslang::InitializeProcess();
1473
1474 return 0;
1475 }
1476
// Tear down the global vulkan instance and everything created from it.
// Safe to call when no instance exists (returns immediately).
// Teardown order matters: cached devices and per-gpu infos are destroyed
// first, then the debug messenger (if enabled), then the instance itself.
void destroy_gpu_instance()
{
    MutexLockGuard lock(g_instance_lock);

    // nothing to do if the instance was never created or already destroyed
    if ((VkInstance)g_instance == 0)
        return;

    // NCNN_LOGE("destroy_gpu_instance");

    // shut down the glslang shader compiler started in create_gpu_instance
    glslang::FinalizeProcess();

    // destroy every cached default device and gpu info slot
    for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++)
    {
        delete g_default_vkdev[i];
        g_default_vkdev[i] = 0;

        delete g_gpu_infos[i];
        g_gpu_infos[i] = 0;
    }

#if ENABLE_VALIDATION_LAYER
    if (support_VK_EXT_debug_utils)
    {
        // the messenger must be destroyed before the instance that owns it
        DestroyDebugUtilsMessengerEXT(g_instance, g_instance.callback, NULL);
        g_instance.callback = 0;
    }
#endif // ENABLE_VALIDATION_LAYER

    vkDestroyInstance(g_instance, 0);

    // mark destroyed so is_gpu_instance_ready() reports false afterwards
    g_instance.instance = 0;
}
1509
is_gpu_instance_ready()1510 static bool is_gpu_instance_ready()
1511 {
1512 MutexLockGuard lock(g_instance_lock);
1513
1514 return (VkInstance)g_instance != 0;
1515 }
1516
try_create_gpu_instance()1517 static void try_create_gpu_instance()
1518 {
1519 if (!is_gpu_instance_ready())
1520 create_gpu_instance();
1521 }
1522
get_gpu_count()1523 int get_gpu_count()
1524 {
1525 try_create_gpu_instance();
1526
1527 return g_gpu_count;
1528 }
1529
get_default_gpu_index()1530 int get_default_gpu_index()
1531 {
1532 try_create_gpu_instance();
1533
1534 return g_default_gpu_index;
1535 }
1536
get_gpu_info(int device_index)1537 const GpuInfo& get_gpu_info(int device_index)
1538 {
1539 try_create_gpu_instance();
1540
1541 return *g_gpu_infos[device_index];
1542 }
1543
// Small blob allocator used only to back the warm-up dummy buffer/image
// created in VulkanDevicePrivate::create_dummy_buffer_image().
class VkDummyAllocator : public VkBlobAllocator
{
public:
    // NOTE 16k is large enough I think ...
    VkDummyAllocator(const VulkanDevice* _vkdev)
        : VkBlobAllocator(_vkdev, 16 * 1024)
    {
    }
};
1553
// Thin VkCompute wrapper that exposes the protected barrier helpers so the
// dummy buffer/image can be transitioned once at device creation time.
class VkDummyCompute : public VkCompute
{
public:
    VkDummyCompute(const VulkanDevice* _vkdev)
        : VkCompute(_vkdev)
    {
    }

    // record a read-write barrier for the dummy buffer
    void record_dummy(const VkMat& buffer)
    {
        barrier_readwrite(buffer);
    }

    // record a read-write barrier for the dummy image
    void record_dummy(const VkImageMat& image)
    {
        barrier_readwrite(image);
    }

    // record a read-only barrier for the dummy read-only image
    void record_dummy_readonly(const VkImageMat& image)
    {
        barrier_readonly(image);
    }
};
1577
// Private implementation state for VulkanDevice (pimpl).
// Owns the VkDevice handle, the per-queue-family queue pools, the default
// allocators, the dummy placeholder resources and the cached packing uops.
class VulkanDevicePrivate
{
public:
    VulkanDevicePrivate(VulkanDevice* _vkdev)
        : vkdev(_vkdev)
    {
    }
    // back-pointer to the owning device; set once and never reseated
    VulkanDevice* const vkdev;

    // dummy buffer and image
    int create_dummy_buffer_image();
    void destroy_dummy_buffer_image();

    // utility operator
    const ncnn::Packing_vulkan* get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const;
    void destroy_utility_operator();

    VkDevice device;

    // hardware queue
    // queues are handed out from these pools; the free_* counters together
    // with the per-family mutex + condition variable let acquirers block
    // until a queue of that family is returned
    mutable std::vector<VkQueue> compute_queues;
    mutable std::vector<VkQueue> graphics_queues;
    mutable std::vector<VkQueue> transfer_queues;
    mutable int free_compute_queue_count;
    mutable int free_graphics_queue_count;
    mutable int free_transfer_queue_count;
    mutable Mutex compute_queue_lock;
    mutable Mutex graphics_queue_lock;
    mutable Mutex transfer_queue_lock;
    mutable ConditionVariable compute_queue_condition;
    mutable ConditionVariable graphics_queue_condition;
    mutable ConditionVariable transfer_queue_condition;

    // default blob allocator for each queue
    mutable std::vector<VkAllocator*> blob_allocators;
    mutable Mutex blob_allocator_lock;

    // default staging allocator for each queue
    mutable std::vector<VkAllocator*> staging_allocators;
    mutable Mutex staging_allocator_lock;

    // nearest sampler for texelfetch
    VkSampler texelfetch_sampler;

    // dummy buffer and image
    VkAllocator* dummy_allocator;
    VkMat dummy_buffer;
    VkImageMat dummy_image;
    VkImageMat dummy_image_readonly;

    // device-wide pipeline cache
    PipelineCache* pipeline_cache;

    // utility operator
    // from buffer | image
    // to buffer | image
    // from fp32-b/i | fp16p-b/i | fp16s-b/i
    // to fp32-b/i | fp16p-b/i | fp16s-b/i
    // to pack1 | pack4 | pack8
    mutable ncnn::Packing_vulkan* uop_packing[2][2][3][3][3];
    mutable Mutex uop_lock;
};
1640
create_dummy_buffer_image()1641 int VulkanDevicePrivate::create_dummy_buffer_image()
1642 {
1643 dummy_allocator = new VkDummyAllocator(vkdev);
1644
1645 dummy_buffer.create(1, 4u, dummy_allocator);
1646 dummy_image.create(1, 4u, dummy_allocator);
1647 #if __APPLE__
1648 if (vkdev->info.vendor_id() != 0x8086)
1649 dummy_image_readonly.create(1, 4u, dummy_allocator);
1650 #else
1651 dummy_image_readonly.create(1, 4u, dummy_allocator);
1652 #endif
1653
1654 VkDummyCompute cmd(vkdev);
1655
1656 cmd.record_dummy(dummy_buffer);
1657 cmd.record_dummy(dummy_image);
1658 #if __APPLE__
1659 if (vkdev->info.vendor_id() != 0x8086)
1660 cmd.record_dummy_readonly(dummy_image_readonly);
1661 #else
1662 cmd.record_dummy_readonly(dummy_image_readonly);
1663 #endif
1664
1665 cmd.submit_and_wait();
1666
1667 return 0;
1668 }
1669
// Release the dummy placeholder resources and their dedicated allocator.
// Mirrors create_dummy_buffer_image(): the read-only image is guarded by the
// same apple/intel condition, and all mats are released before deleting the
// allocator that backs them.
void VulkanDevicePrivate::destroy_dummy_buffer_image()
{
    dummy_buffer.release();
    dummy_image.release();
#if __APPLE__
    // dummy_image_readonly was only created on non-intel gpu (see create_dummy_buffer_image)
    if (vkdev->info.vendor_id() != 0x8086)
        dummy_image_readonly.release();
#else
    dummy_image_readonly.release();
#endif

    // backing memory can only go away once all referencing mats are released
    delete dummy_allocator;
}
1683
get_utility_operator(int storage_type_from,int storage_type_to,int cast_type_from_index,int cast_type_to_index,int packing_type_to_index) const1684 const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
1685 {
1686 MutexLockGuard lock(uop_lock);
1687
1688 const ncnn::Packing_vulkan* cached_uop = uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index];
1689 if (cached_uop)
1690 return cached_uop;
1691
1692 if ((cast_type_from_index == 1 && cast_type_to_index == 2) || (cast_type_from_index == 2 && cast_type_to_index == 1))
1693 {
1694 NCNN_LOGE("no fp16p to/from fp16s conversion");
1695 return 0;
1696 }
1697
1698 // create uop
1699 Option opt;
1700 opt.use_image_storage = (storage_type_from == 1 || storage_type_to == 1);
1701 opt.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
1702 opt.use_fp16_storage = (cast_type_from_index == 2 || cast_type_to_index == 2);
1703
1704 if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
1705 {
1706 NCNN_LOGE("cannot create uop with use_fp16_packed if not support_fp16_packed");
1707 return 0;
1708 }
1709
1710 if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
1711 {
1712 NCNN_LOGE("cannot create uop with use_fp16_storage if not support_fp16_storage");
1713 return 0;
1714 }
1715
1716 // fp16/int8 arithmetic are not necessary for packing
1717 // and may conflict with storage options
1718 opt.use_fp16_arithmetic = false;
1719 opt.use_int8_arithmetic = false;
1720
1721 // enable pack8 for pack8to1/pack8to4
1722 opt.use_shader_pack8 = true;
1723
1724 opt.use_vulkan_compute = true;
1725
1726 // cache uop pipeline as device member explicitly
1727 opt.pipeline_cache = 0;
1728
1729 ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
1730 uop->vkdev = vkdev;
1731
1732 ncnn::ParamDict pd;
1733 pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
1734 pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16p 3=fp16s
1735 pd.set(3, cast_type_to_index + 1);
1736 pd.set(4, storage_type_from); // 0=buffer 1=image
1737 pd.set(5, storage_type_to);
1738
1739 uop->load_param(pd);
1740
1741 uop->create_pipeline(opt);
1742
1743 uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
1744
1745 return uop;
1746 }
1747
destroy_utility_operator()1748 void VulkanDevicePrivate::destroy_utility_operator()
1749 {
1750 Option opt;
1751 opt.use_vulkan_compute = true;
1752 opt.use_fp16_arithmetic = false;
1753 opt.use_int8_arithmetic = false;
1754 opt.pipeline_cache = 0;
1755
1756 // from buffer | image
1757 // to buffer | image
1758 for (int i0 = 0; i0 < 2; i0++)
1759 {
1760 for (int i1 = 0; i1 < 2; i1++)
1761 {
1762 opt.use_image_storage = (i0 == 1 || i1 == 1);
1763
1764 // from fp32-b/i | fp16p-b/i | fp16s-b/i
1765 // to fp32-b/i | fp16p-b/i | fp16s-b/i
1766 for (int j0 = 0; j0 < 3; j0++)
1767 {
1768 for (int j1 = 0; j1 < 3; j1++)
1769 {
1770 if ((j0 == 1 && j1 == 2) || (j0 == 2 && j1 == 1))
1771 {
1772 // no fp16p to/from fp16s conversion
1773 continue;
1774 }
1775
1776 opt.use_fp16_packed = (j0 == 1 || j1 == 1);
1777 opt.use_fp16_storage = (j0 == 2 || j1 == 2);
1778
1779 if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
1780 continue;
1781
1782 if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
1783 continue;
1784
1785 // to pack1 | pack4 | pack8
1786 for (int k = 0; k < 3; k++)
1787 {
1788 // enable pack8 for pack8to1/pack8to4
1789 opt.use_shader_pack8 = true;
1790
1791 ncnn::Layer* uop = uop_packing[i0][i1][j0][j1][k];
1792 if (!uop)
1793 continue;
1794
1795 uop->destroy_pipeline(opt);
1796
1797 delete uop;
1798
1799 uop_packing[i0][i1][j0][j1][k] = 0;
1800 }
1801 }
1802 }
1803 }
1804 }
1805 }
1806
// Build the logical vulkan device for the gpu at device_index:
//  - enable every device extension GpuInfo reports as supported
//  - chain the supported feature structs onto VkDeviceCreateInfo.pNext
//  - create queues for each distinct queue family (compute/graphics/transfer)
//  - create per-compute-queue blob/staging allocators, an immutable
//    texelfetch sampler, the dummy buffer/image and the device pipeline cache
VulkanDevice::VulkanDevice(int device_index)
    : info(get_gpu_info(device_index)), d(new VulkanDevicePrivate(this))
{
    try_create_gpu_instance();

    // collect the device extensions to enable, only when supported
    std::vector<const char*> enabledExtensions;
    if (info.support_VK_KHR_8bit_storage())
        enabledExtensions.push_back("VK_KHR_8bit_storage");
    if (info.support_VK_KHR_16bit_storage())
        enabledExtensions.push_back("VK_KHR_16bit_storage");
    if (info.support_VK_KHR_bind_memory2())
        enabledExtensions.push_back("VK_KHR_bind_memory2");
    if (info.support_VK_KHR_create_renderpass2())
        enabledExtensions.push_back("VK_KHR_create_renderpass2");
    if (info.support_VK_KHR_dedicated_allocation())
        enabledExtensions.push_back("VK_KHR_dedicated_allocation");
    if (info.support_VK_KHR_descriptor_update_template())
        enabledExtensions.push_back("VK_KHR_descriptor_update_template");
    if (info.support_VK_KHR_external_memory())
        enabledExtensions.push_back("VK_KHR_external_memory");
    if (info.support_VK_KHR_get_memory_requirements2())
        enabledExtensions.push_back("VK_KHR_get_memory_requirements2");
    if (info.support_VK_KHR_maintenance1())
        enabledExtensions.push_back("VK_KHR_maintenance1");
    if (info.support_VK_KHR_maintenance2())
        enabledExtensions.push_back("VK_KHR_maintenance2");
    if (info.support_VK_KHR_maintenance3())
        enabledExtensions.push_back("VK_KHR_maintenance3");
    if (info.support_VK_KHR_multiview())
        enabledExtensions.push_back("VK_KHR_multiview");
    if (info.support_VK_KHR_push_descriptor())
        enabledExtensions.push_back("VK_KHR_push_descriptor");
    if (info.support_VK_KHR_sampler_ycbcr_conversion())
        enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion");
    if (info.support_VK_KHR_shader_float16_int8())
        enabledExtensions.push_back("VK_KHR_shader_float16_int8");
    if (info.support_VK_KHR_shader_float_controls())
        enabledExtensions.push_back("VK_KHR_shader_float_controls");
    if (info.support_VK_KHR_storage_buffer_storage_class())
        enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class");
    if (info.support_VK_KHR_swapchain())
        enabledExtensions.push_back("VK_KHR_swapchain");
    if (info.support_VK_EXT_descriptor_indexing())
        enabledExtensions.push_back("VK_EXT_descriptor_indexing");
    if (info.support_VK_EXT_memory_budget())
        enabledExtensions.push_back("VK_EXT_memory_budget");
    if (info.support_VK_EXT_queue_family_foreign())
        enabledExtensions.push_back("VK_EXT_queue_family_foreign");
#if __ANDROID_API__ >= 26
    if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
        enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer");
#endif // __ANDROID_API__ >= 26

    // head of the feature-struct chain passed as VkDeviceCreateInfo.pNext;
    // each supported feature struct below is pushed onto the front of it
    void* enabledExtensionFeatures = 0;

    // enable int8 storage
    VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures;
    enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
    enabled8BitStorageFeatures.pNext = 0;
    enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage();
    enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = VK_FALSE;
    enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE;
    if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage())
    {
        enabled8BitStorageFeatures.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &enabled8BitStorageFeatures;
    }

    // enable fp16/int16 storage
    VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures;
    enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
    enabled16BitStorageFeatures.pNext = 0;
    enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage();
    enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = VK_FALSE;
    enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE;
    enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE;
    if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage())
    {
        enabled16BitStorageFeatures.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &enabled16BitStorageFeatures;
    }

    // enable fp16/int8 arithmetic
    VkPhysicalDeviceFloat16Int8FeaturesKHR enabledFloat16Int8Features;
    enabledFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
    enabledFloat16Int8Features.pNext = 0;
    enabledFloat16Int8Features.shaderFloat16 = info.support_fp16_arithmetic();
    enabledFloat16Int8Features.shaderInt8 = info.support_int8_arithmetic();
    if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_shader_float16_int8())
    {
        enabledFloat16Int8Features.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &enabledFloat16Int8Features;
    }

    // enable ycbcr conversion
    VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
    querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR;
    querySamplerYcbcrConversionFeatures.pNext = 0;
    querySamplerYcbcrConversionFeatures.samplerYcbcrConversion = info.support_ycbcr_conversion();
    if (support_VK_KHR_get_physical_device_properties2 && info.support_ycbcr_conversion())
    {
        querySamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &querySamplerYcbcrConversionFeatures;
    }

    // one priority entry per queue requested from each family
    std::vector<float> compute_queue_priorities(info.compute_queue_count(), 1.f);   // 0.f ~ 1.f
    std::vector<float> graphics_queue_priorities(info.graphics_queue_count(), 1.f); // 0.f ~ 1.f
    std::vector<float> transfer_queue_priorities(info.transfer_queue_count(), 1.f); // 0.f ~ 1.f

    VkDeviceQueueCreateInfo deviceQueueCreateInfos[3];

    VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo;
    deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceComputeQueueCreateInfo.pNext = 0;
    deviceComputeQueueCreateInfo.flags = 0;
    deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index();
    deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count();
    deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data();

    VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo;
    deviceGraphicsQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceGraphicsQueueCreateInfo.pNext = 0;
    deviceGraphicsQueueCreateInfo.flags = 0;
    deviceGraphicsQueueCreateInfo.queueFamilyIndex = info.graphics_queue_family_index();
    deviceGraphicsQueueCreateInfo.queueCount = info.graphics_queue_count();
    deviceGraphicsQueueCreateInfo.pQueuePriorities = graphics_queue_priorities.data();

    VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo;
    deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceTransferQueueCreateInfo.pNext = 0;
    deviceTransferQueueCreateInfo.flags = 0;
    deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index();
    deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count();
    deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data();

    VkDeviceCreateInfo deviceCreateInfo;
    deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
    deviceCreateInfo.pNext = enabledExtensionFeatures;
    deviceCreateInfo.flags = 0;
    // a queue family may be shared between compute/graphics/transfer;
    // only pass one VkDeviceQueueCreateInfo per distinct family
    if (info.compute_queue_family_index() == info.graphics_queue_family_index() && info.compute_queue_family_index() == info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 1;
    }
    else if (info.compute_queue_family_index() == info.graphics_queue_family_index() && info.compute_queue_family_index() != info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceQueueCreateInfos[1] = deviceTransferQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 2;
    }
    else if (info.compute_queue_family_index() != info.graphics_queue_family_index() && info.graphics_queue_family_index() == info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 2;
    }
    else // if (info.compute_queue_family_index() != info.graphics_queue_family_index() && info.graphics_queue_family_index() != info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo;
        deviceQueueCreateInfos[2] = deviceTransferQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 3;
    }
    deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos;
    deviceCreateInfo.enabledLayerCount = 0;
    deviceCreateInfo.ppEnabledLayerNames = 0;
    // NOTE: implicit size_t -> uint32_t narrowing; extension count is small
    deviceCreateInfo.enabledExtensionCount = enabledExtensions.size();
    deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
    deviceCreateInfo.pEnabledFeatures = 0; // VkPhysicalDeviceFeatures pointer

    // on failure only a log is emitted; the remaining setup still runs
    VkResult ret = vkCreateDevice(info.physical_device(), &deviceCreateInfo, 0, &d->device);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateDevice failed %d", ret);
    }

    init_device_extension();

    // queue bookkeeping: a family shared with compute contributes no extra
    // graphics/transfer queues, so those free counts stay 0 in that case
    d->free_compute_queue_count = 0;
    d->free_graphics_queue_count = 0;
    d->free_transfer_queue_count = 0;

    // one blob allocator and one staging allocator per compute queue
    d->free_compute_queue_count = info.compute_queue_count();
    d->compute_queues.resize(info.compute_queue_count());
    d->blob_allocators.resize(info.compute_queue_count());
    d->staging_allocators.resize(info.compute_queue_count());
    for (uint32_t i = 0; i < info.compute_queue_count(); i++)
    {
        vkGetDeviceQueue(d->device, info.compute_queue_family_index(), i, &d->compute_queues[i]);
        d->blob_allocators[i] = new VkBlobAllocator(this);
        d->staging_allocators[i] = new VkStagingAllocator(this);
    }
    if (info.compute_queue_family_index() != info.graphics_queue_family_index())
    {
        d->free_graphics_queue_count = info.graphics_queue_count();
        d->graphics_queues.resize(info.graphics_queue_count());
        for (uint32_t i = 0; i < info.graphics_queue_count(); i++)
        {
            vkGetDeviceQueue(d->device, info.graphics_queue_family_index(), i, &d->graphics_queues[i]);
        }
    }
    if (info.compute_queue_family_index() != info.transfer_queue_family_index() && info.graphics_queue_family_index() != info.transfer_queue_family_index())
    {
        d->free_transfer_queue_count = info.transfer_queue_count();
        d->transfer_queues.resize(info.transfer_queue_count());
        for (uint32_t i = 0; i < info.transfer_queue_count(); i++)
        {
            vkGetDeviceQueue(d->device, info.transfer_queue_family_index(), i, &d->transfer_queues[i]);
        }
    }

    // prepare immutable texelfetch sampler
    {
        // nearest filtering, clamp-to-edge, and unnormalized (integer texel)
        // coordinates: the sampler only ever serves texelfetch-style reads
        VkSamplerCreateInfo samplerCreateInfo;
        samplerCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
        samplerCreateInfo.pNext = 0;
        samplerCreateInfo.flags = 0;
        samplerCreateInfo.magFilter = VK_FILTER_NEAREST;
        samplerCreateInfo.minFilter = VK_FILTER_NEAREST;
        samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
        samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.mipLodBias = 0.0f;
        samplerCreateInfo.anisotropyEnable = VK_FALSE;
        samplerCreateInfo.maxAnisotropy = 1;
        samplerCreateInfo.compareEnable = VK_FALSE;
        samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER;
        samplerCreateInfo.minLod = 0.0f;
        samplerCreateInfo.maxLod = 0.0f;
        samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
        samplerCreateInfo.unnormalizedCoordinates = VK_TRUE;

        d->texelfetch_sampler = 0;
        ret = vkCreateSampler(d->device, &samplerCreateInfo, 0, &d->texelfetch_sampler);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateSampler failed %d", ret);
        }
    }

    d->create_dummy_buffer_image();

    d->pipeline_cache = new PipelineCache(this);

    // uop cache starts empty; entries are filled by get_utility_operator()
    memset(d->uop_packing, 0, sizeof(d->uop_packing));
}
2054
~VulkanDevice()2055 VulkanDevice::~VulkanDevice()
2056 {
2057 d->destroy_utility_operator();
2058
2059 d->destroy_dummy_buffer_image();
2060
2061 if (d->texelfetch_sampler)
2062 {
2063 vkDestroySampler(d->device, d->texelfetch_sampler, 0);
2064 }
2065
2066 for (size_t i = 0; i < d->blob_allocators.size(); i++)
2067 {
2068 delete d->blob_allocators[i];
2069 }
2070 d->blob_allocators.clear();
2071 for (size_t i = 0; i < d->staging_allocators.size(); i++)
2072 {
2073 delete d->staging_allocators[i];
2074 }
2075 d->staging_allocators.clear();
2076
2077 delete d->pipeline_cache;
2078
2079 vkDestroyDevice(d->device, 0);
2080
2081 delete d;
2082 }
2083
// Non-functional copy constructor.
// NOTE(review): the copy is left with d = 0 (null private data), so a copied
// VulkanDevice is unusable — presumably declared private in the header to
// forbid copying; confirm against gpu.h.
VulkanDevice::VulkanDevice(const VulkanDevice&)
    : info(get_gpu_info(0)), d(0)
{
}
2088
// Non-functional copy assignment: deliberately a no-op.
// NOTE(review): presumably declared private to forbid assignment; confirm.
VulkanDevice& VulkanDevice::operator=(const VulkanDevice&)
{
    return *this;
}
2093
// Returns the raw VkDevice handle owned by this VulkanDevice.
VkDevice VulkanDevice::vkdevice() const
{
    return d->device;
}
2098
compile_shader_module(const uint32_t * spv_data,size_t spv_data_size) const2099 VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
2100 {
2101 VkShaderModuleCreateInfo shaderModuleCreateInfo;
2102 shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
2103 shaderModuleCreateInfo.pNext = 0;
2104 shaderModuleCreateInfo.flags = 0;
2105 shaderModuleCreateInfo.codeSize = spv_data_size;
2106 shaderModuleCreateInfo.pCode = spv_data;
2107
2108 VkShaderModule shader_module;
2109 VkResult ret = vkCreateShaderModule(d->device, &shaderModuleCreateInfo, 0, &shader_module);
2110 if (ret != VK_SUCCESS)
2111 {
2112 NCNN_LOGE("vkCreateShaderModule failed %d", ret);
2113 return 0;
2114 }
2115
2116 return shader_module;
2117 }
2118
// Rewrite a spir-v blob so that the workgroup size becomes the fixed
// (local_size_x, local_size_y, local_size_z):
//  - an OpExecutionMode LocalSize instruction gets its three literals patched
//  - the specialization-constant machinery for local size (SpecId 233/234/235
//    decorations, their OpSpecConstant definitions, and the WorkgroupSize
//    builtin composite) is stripped from the output
// dstcode must hold at least `size` bytes; the result can only shrink.
// *dstsize receives the output size in bytes.
static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
{
    // result ids of the local-size spec constants and the WorkgroupSize builtin
    uint32_t id_local_size_x = (uint32_t)-1;
    uint32_t id_local_size_y = (uint32_t)-1;
    uint32_t id_local_size_z = (uint32_t)-1;
    uint32_t id_workgroup_size = (uint32_t)-1;

    const uint32_t* src = code;
    uint32_t* dst = dstcode;

    // copy the 5-word header (magic, version, generator, bound, schema) verbatim
    memcpy(dst, src, 5 * sizeof(uint32_t));
    src += 5;
    dst += 5;

    // walk the instruction stream: first word of each op is (wordcount << 16) | opcode
    while ((const unsigned char*)src < (const unsigned char*)code + size)
    {
        const uint32_t first_word = src[0];
        const uint16_t nwords = first_word >> 16;
        const uint16_t opcode = first_word & 0xffff;

        bool drop = false;    // strip this instruction from the output
        bool patched = false; // already copied with modified operands

        switch (opcode)
        {
        case 16: // OpExecutionMode
            if (src[2] == 17) // LocalSize
            {
                memcpy(dst, src, nwords * sizeof(uint32_t));
                // overwrite the x/y/z literals
                dst[3] = local_size_x;
                dst[4] = local_size_y;
                dst[5] = local_size_z;
                patched = true;
            }
            break;
        case 50: // OpSpecConstant
        {
            const uint32_t result_id = src[2];
            drop = (result_id == id_local_size_x || result_id == id_local_size_y || result_id == id_local_size_z);
            break;
        }
        case 51: // OpSpecConstantComposite
            // the gl_WorkGroupSize uvec3 built from the local-size constants
            drop = (src[2] == id_workgroup_size && nwords == 6
                    && (src[3] == id_local_size_x || src[4] == id_local_size_y || src[5] == id_local_size_z));
            break;
        case 71: // OpDecorate
        {
            const uint32_t target_id = src[1];
            const uint32_t decoration = src[2];
            if (decoration == 1) // SpecId
            {
                const uint32_t specid = src[3];
                if (specid == 233) id_local_size_x = target_id;
                if (specid == 234) id_local_size_y = target_id;
                if (specid == 235) id_local_size_z = target_id;
                drop = (specid == 233 || specid == 234 || specid == 235);
            }
            else if (decoration == 11 && src[3] == 25) // BuiltIn WorkgroupSize
            {
                id_workgroup_size = target_id;
                drop = true;
            }
            break;
        }
        default:
            break;
        }

        if (patched)
        {
            src += nwords;
            dst += nwords;
            continue;
        }
        if (drop)
        {
            src += nwords;
            continue;
        }

        // ordinary instruction: copy through unchanged
        memcpy(dst, src, nwords * sizeof(uint32_t));
        src += nwords;
        dst += nwords;
    }

    *dstsize = (unsigned char*)dst - (unsigned char*)dstcode;
}
2215
compile_shader_module(const uint32_t * spv_data,size_t spv_data_size,uint32_t local_size_x,uint32_t local_size_y,uint32_t local_size_z) const2216 VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
2217 {
2218 uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
2219 size_t spv_data_size_modified = spv_data_size;
2220 inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);
2221
2222 VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);
2223
2224 free(spv_data_modified);
2225
2226 return shader_module;
2227 }
2228
create_descriptorset_layout(int binding_count,const int * binding_types,VkDescriptorSetLayout * descriptorset_layout) const2229 int VulkanDevice::create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const
2230 {
2231 if (binding_count == 0)
2232 {
2233 *descriptorset_layout = 0;
2234 return 0;
2235 }
2236
2237 std::vector<VkDescriptorSetLayoutBinding> descriptorSetLayoutBindings(binding_count);
2238 for (int i = 0; i < binding_count; i++)
2239 {
2240 int binding_type = binding_types[i];
2241
2242 descriptorSetLayoutBindings[i].binding = i;
2243 descriptorSetLayoutBindings[i].descriptorCount = 1;
2244 descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
2245
2246 if (binding_type == 1)
2247 {
2248 descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
2249 descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
2250 }
2251 else if (binding_type == 2)
2252 {
2253 descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
2254 descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
2255 }
2256 else // if (binding_type == 3)
2257 {
2258 descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
2259 descriptorSetLayoutBindings[i].pImmutableSamplers = immutable_texelfetch_sampler(); // we always use texelfetch
2260 }
2261 }
2262
2263 VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo;
2264 descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
2265 descriptorSetLayoutCreateInfo.pNext = 0;
2266 descriptorSetLayoutCreateInfo.flags = 0;
2267 descriptorSetLayoutCreateInfo.bindingCount = binding_count;
2268 descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings.data();
2269
2270 if (info.support_VK_KHR_push_descriptor())
2271 {
2272 descriptorSetLayoutCreateInfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
2273 }
2274
2275 VkResult ret = vkCreateDescriptorSetLayout(d->device, &descriptorSetLayoutCreateInfo, 0, descriptorset_layout);
2276 if (ret != VK_SUCCESS)
2277 {
2278 NCNN_LOGE("vkCreateDescriptorSetLayout failed %d", ret);
2279 return -1;
2280 }
2281
2282 return 0;
2283 }
2284
create_pipeline_layout(int push_constant_count,VkDescriptorSetLayout descriptorset_layout,VkPipelineLayout * pipeline_layout) const2285 int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const
2286 {
2287 VkPushConstantRange pushConstantRange;
2288 pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
2289 pushConstantRange.offset = 0;
2290 pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count;
2291
2292 VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo;
2293 pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
2294 pipelineLayoutCreateInfo.pNext = 0;
2295 pipelineLayoutCreateInfo.flags = 0;
2296
2297 if (descriptorset_layout)
2298 {
2299 pipelineLayoutCreateInfo.setLayoutCount = 1;
2300 pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout;
2301 }
2302 else
2303 {
2304 pipelineLayoutCreateInfo.setLayoutCount = 0;
2305 pipelineLayoutCreateInfo.pSetLayouts = 0;
2306 }
2307
2308 if (push_constant_count > 0)
2309 {
2310 pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
2311 pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
2312 }
2313 else
2314 {
2315 pipelineLayoutCreateInfo.pushConstantRangeCount = 0;
2316 pipelineLayoutCreateInfo.pPushConstantRanges = 0;
2317 }
2318
2319 VkResult ret = vkCreatePipelineLayout(d->device, &pipelineLayoutCreateInfo, 0, pipeline_layout);
2320 if (ret != VK_SUCCESS)
2321 {
2322 NCNN_LOGE("vkCreatePipelineLayout failed %d", ret);
2323 return -1;
2324 }
2325
2326 return 0;
2327 }
2328
create_pipeline(VkShaderModule shader_module,VkPipelineLayout pipeline_layout,const std::vector<vk_specialization_type> & specializations,VkPipeline * pipeline) const2329 int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const
2330 {
2331 const int specialization_count = specializations.size();
2332
2333 std::vector<VkSpecializationMapEntry> specializationMapEntries(specialization_count);
2334 for (int i = 0; i < specialization_count; i++)
2335 {
2336 specializationMapEntries[i].constantID = i;
2337 specializationMapEntries[i].offset = i * sizeof(vk_specialization_type);
2338 specializationMapEntries[i].size = sizeof(vk_specialization_type);
2339 }
2340
2341 VkSpecializationInfo specializationInfo;
2342 specializationInfo.mapEntryCount = specializationMapEntries.size();
2343 specializationInfo.pMapEntries = specializationMapEntries.data();
2344 specializationInfo.dataSize = specializations.size() * sizeof(vk_specialization_type);
2345 specializationInfo.pData = specializations.data();
2346
2347 VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo;
2348 pipelineShaderStageCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
2349 pipelineShaderStageCreateInfo.pNext = 0;
2350 pipelineShaderStageCreateInfo.flags = 0;
2351 pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
2352 pipelineShaderStageCreateInfo.module = shader_module;
2353 pipelineShaderStageCreateInfo.pName = "main";
2354 pipelineShaderStageCreateInfo.pSpecializationInfo = &specializationInfo;
2355
2356 VkComputePipelineCreateInfo computePipelineCreateInfo;
2357 computePipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
2358 computePipelineCreateInfo.pNext = 0;
2359 computePipelineCreateInfo.flags = 0;
2360 computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo;
2361 computePipelineCreateInfo.layout = pipeline_layout;
2362 computePipelineCreateInfo.basePipelineHandle = 0;
2363 computePipelineCreateInfo.basePipelineIndex = 0;
2364
2365 VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline);
2366 if (ret != VK_SUCCESS)
2367 {
2368 NCNN_LOGE("vkCreateComputePipelines failed %d", ret);
2369 return -1;
2370 }
2371
2372 return 0;
2373 }
2374
create_descriptor_update_template(int binding_count,const int * binding_types,VkDescriptorSetLayout descriptorset_layout,VkPipelineLayout pipeline_layout,VkDescriptorUpdateTemplateKHR * descriptor_update_template) const2375 int VulkanDevice::create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const
2376 {
2377 if (binding_count == 0)
2378 {
2379 *descriptor_update_template = 0;
2380 return 0;
2381 }
2382
2383 std::vector<VkDescriptorUpdateTemplateEntryKHR> descriptorUpdateTemplateEntries(binding_count);
2384 size_t offset = 0;
2385 for (int i = 0; i < binding_count; i++) // TODO do not update weights
2386 {
2387 int binding_type = binding_types[i];
2388
2389 descriptorUpdateTemplateEntries[i].dstBinding = i;
2390 descriptorUpdateTemplateEntries[i].dstArrayElement = 0;
2391 descriptorUpdateTemplateEntries[i].descriptorCount = 1;
2392 descriptorUpdateTemplateEntries[i].offset = offset;
2393
2394 if (binding_type == 1)
2395 {
2396 descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
2397 descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorBufferInfo);
2398 }
2399 else if (binding_type == 2)
2400 {
2401 descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
2402 descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
2403 }
2404 else // if (binding_type == 3)
2405 {
2406 descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
2407 descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
2408 }
2409
2410 offset += descriptorUpdateTemplateEntries[i].stride;
2411 }
2412
2413 VkDescriptorUpdateTemplateCreateInfoKHR descriptorUpdateTemplateCreateInfo;
2414 descriptorUpdateTemplateCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR;
2415 descriptorUpdateTemplateCreateInfo.pNext = 0;
2416 descriptorUpdateTemplateCreateInfo.flags = 0;
2417 descriptorUpdateTemplateCreateInfo.descriptorUpdateEntryCount = binding_count; // TODO do not update weights
2418 descriptorUpdateTemplateCreateInfo.pDescriptorUpdateEntries = descriptorUpdateTemplateEntries.data();
2419 if (info.support_VK_KHR_push_descriptor())
2420 {
2421 descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR;
2422 }
2423 else
2424 {
2425 descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR;
2426 }
2427 // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR
2428 // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL :(
2429 descriptorUpdateTemplateCreateInfo.descriptorSetLayout = descriptorset_layout;
2430 descriptorUpdateTemplateCreateInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
2431 descriptorUpdateTemplateCreateInfo.pipelineLayout = pipeline_layout;
2432 descriptorUpdateTemplateCreateInfo.set = 0;
2433
2434 VkResult ret = vkCreateDescriptorUpdateTemplateKHR(d->device, &descriptorUpdateTemplateCreateInfo, 0, descriptor_update_template);
2435 if (ret != VK_SUCCESS)
2436 {
2437 NCNN_LOGE("vkCreateDescriptorUpdateTemplateKHR failed %d", ret);
2438 return -1;
2439 }
2440
2441 return 0;
2442 }
2443
find_memory_index(uint32_t memory_type_bits,VkFlags required,VkFlags preferred,VkFlags preferred_not) const2444 uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
2445 {
2446 const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();
2447
2448 // first try, find required and with preferred and without preferred_not
2449 for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2450 {
2451 bool is_required = (1 << i) & memory_type_bits;
2452 if (is_required)
2453 {
2454 const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2455 if ((memoryType.propertyFlags & required) == required
2456 && (preferred && (memoryType.propertyFlags & preferred))
2457 && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
2458 {
2459 return i;
2460 }
2461 }
2462 }
2463
2464 // second try, find required and with preferred
2465 for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2466 {
2467 bool is_required = (1 << i) & memory_type_bits;
2468 if (is_required)
2469 {
2470 const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2471 if ((memoryType.propertyFlags & required) == required
2472 && (preferred && (memoryType.propertyFlags & preferred)))
2473 {
2474 return i;
2475 }
2476 }
2477 }
2478
2479 // third try, find required and without preferred_not
2480 for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2481 {
2482 bool is_required = (1 << i) & memory_type_bits;
2483 if (is_required)
2484 {
2485 const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2486 if ((memoryType.propertyFlags & required) == required
2487 && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
2488 {
2489 return i;
2490 }
2491 }
2492 }
2493
2494 // fourth try, find any required
2495 for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2496 {
2497 bool is_required = (1 << i) & memory_type_bits;
2498 if (is_required)
2499 {
2500 const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
2501 if ((memoryType.propertyFlags & required) == required)
2502 {
2503 return i;
2504 }
2505 }
2506 }
2507
2508 NCNN_LOGE("no such memory type %u %u %u %u", memory_type_bits, required, preferred, preferred_not);
2509 return -1;
2510 }
2511
is_mappable(uint32_t memory_type_index) const2512 bool VulkanDevice::is_mappable(uint32_t memory_type_index) const
2513 {
2514 const VkMemoryType& memoryType = info.physical_device_memory_properties().memoryTypes[memory_type_index];
2515
2516 return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2517 }
2518
is_coherent(uint32_t memory_type_index) const2519 bool VulkanDevice::is_coherent(uint32_t memory_type_index) const
2520 {
2521 const VkMemoryType& memoryType = info.physical_device_memory_properties().memoryTypes[memory_type_index];
2522
2523 return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2524 }
2525
acquire_queue(uint32_t queue_family_index) const2526 VkQueue VulkanDevice::acquire_queue(uint32_t queue_family_index) const
2527 {
2528 if (queue_family_index != info.compute_queue_family_index()
2529 && queue_family_index != info.graphics_queue_family_index()
2530 && queue_family_index != info.transfer_queue_family_index())
2531 {
2532 NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
2533 return 0;
2534 }
2535
2536 Mutex& queue_lock = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_lock
2537 : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_lock
2538 : d->transfer_queue_lock;
2539
2540 queue_lock.lock();
2541
2542 ConditionVariable& queue_condition = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_condition
2543 : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_condition
2544 : d->transfer_queue_condition;
2545
2546 int& free_queue_count = queue_family_index == info.compute_queue_family_index() ? d->free_compute_queue_count
2547 : queue_family_index == info.graphics_queue_family_index() ? d->free_graphics_queue_count
2548 : d->free_transfer_queue_count;
2549
2550 while (free_queue_count == 0)
2551 {
2552 // no free queues, wait for recleams from other threads
2553 queue_condition.wait(queue_lock);
2554 }
2555
2556 std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues
2557 : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queues
2558 : d->transfer_queues;
2559
2560 VkQueue queue = 0;
2561 for (size_t i = 0; i < queues.size(); i++)
2562 {
2563 if (queues[i])
2564 {
2565 queue = queues[i];
2566 queues[i] = 0;
2567 break;
2568 }
2569 }
2570
2571 if (!queue)
2572 {
2573 NCNN_LOGE("FATAL ERROR! out of hardware queue %u", queue_family_index);
2574 }
2575
2576 free_queue_count -= 1;
2577
2578 queue_lock.unlock();
2579
2580 queue_condition.signal();
2581
2582 return queue;
2583 }
2584
reclaim_queue(uint32_t queue_family_index,VkQueue queue) const2585 void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const
2586 {
2587 if (queue_family_index != info.compute_queue_family_index()
2588 && queue_family_index != info.graphics_queue_family_index()
2589 && queue_family_index != info.transfer_queue_family_index())
2590 {
2591 NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
2592 return;
2593 }
2594
2595 Mutex& queue_lock = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_lock
2596 : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_lock
2597 : d->transfer_queue_lock;
2598
2599 queue_lock.lock();
2600
2601 ConditionVariable& queue_condition = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_condition
2602 : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queue_condition
2603 : d->transfer_queue_condition;
2604
2605 int& free_queue_count = queue_family_index == info.compute_queue_family_index() ? d->free_compute_queue_count
2606 : queue_family_index == info.graphics_queue_family_index() ? d->free_graphics_queue_count
2607 : d->free_transfer_queue_count;
2608
2609 std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues
2610 : queue_family_index == info.graphics_queue_family_index() ? d->graphics_queues
2611 : d->transfer_queues;
2612
2613 size_t i = 0;
2614 for (; i < queues.size(); i++)
2615 {
2616 if (!queues[i])
2617 {
2618 queues[i] = queue;
2619 break;
2620 }
2621 }
2622
2623 if (i == queues.size())
2624 {
2625 NCNN_LOGE("FATAL ERROR! reclaim_queue get wild queue %u %p", queue_family_index, queue);
2626 }
2627
2628 free_queue_count += 1;
2629
2630 queue_lock.unlock();
2631
2632 queue_condition.signal();
2633 }
2634
acquire_blob_allocator() const2635 VkAllocator* VulkanDevice::acquire_blob_allocator() const
2636 {
2637 MutexLockGuard lock(d->blob_allocator_lock);
2638
2639 for (int i = 0; i < (int)d->blob_allocators.size(); i++)
2640 {
2641 VkAllocator* allocator = d->blob_allocators[i];
2642 if (allocator)
2643 {
2644 d->blob_allocators[i] = 0;
2645 return allocator;
2646 }
2647 }
2648
2649 // pre-allocated allcator exhausted, create new
2650 VkAllocator* allocator = new VkBlobAllocator(this);
2651 d->blob_allocators.push_back(allocator);
2652 d->blob_allocators[d->blob_allocators.size() - 1] = 0;
2653 return allocator;
2654 }
2655
reclaim_blob_allocator(VkAllocator * allocator) const2656 void VulkanDevice::reclaim_blob_allocator(VkAllocator* allocator) const
2657 {
2658 MutexLockGuard lock(d->blob_allocator_lock);
2659
2660 for (int i = 0; i < (int)d->blob_allocators.size(); i++)
2661 {
2662 if (!d->blob_allocators[i])
2663 {
2664 d->blob_allocators[i] = allocator;
2665 return;
2666 }
2667 }
2668
2669 NCNN_LOGE("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator);
2670 }
2671
acquire_staging_allocator() const2672 VkAllocator* VulkanDevice::acquire_staging_allocator() const
2673 {
2674 MutexLockGuard lock(d->staging_allocator_lock);
2675
2676 for (int i = 0; i < (int)d->staging_allocators.size(); i++)
2677 {
2678 VkAllocator* allocator = d->staging_allocators[i];
2679 if (allocator)
2680 {
2681 d->staging_allocators[i] = 0;
2682 return allocator;
2683 }
2684 }
2685
2686 // pre-allocated allcator exhausted, create new
2687 VkAllocator* allocator = new VkStagingAllocator(this);
2688 d->staging_allocators.push_back(allocator);
2689 d->staging_allocators[d->staging_allocators.size() - 1] = 0;
2690 return allocator;
2691 }
2692
reclaim_staging_allocator(VkAllocator * allocator) const2693 void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const
2694 {
2695 MutexLockGuard lock(d->staging_allocator_lock);
2696
2697 for (int i = 0; i < (int)d->staging_allocators.size(); i++)
2698 {
2699 if (!d->staging_allocators[i])
2700 {
2701 d->staging_allocators[i] = allocator;
2702 return;
2703 }
2704 }
2705
2706 NCNN_LOGE("FATAL ERROR! reclaim_staging_allocator get wild allocator %p", allocator);
2707 }
2708
immutable_texelfetch_sampler() const2709 const VkSampler* VulkanDevice::immutable_texelfetch_sampler() const
2710 {
2711 return &d->texelfetch_sampler;
2712 }
2713
get_dummy_buffer() const2714 VkMat VulkanDevice::get_dummy_buffer() const
2715 {
2716 return d->dummy_buffer;
2717 }
2718
get_dummy_image() const2719 VkImageMat VulkanDevice::get_dummy_image() const
2720 {
2721 return d->dummy_image;
2722 }
2723
get_dummy_image_readonly() const2724 VkImageMat VulkanDevice::get_dummy_image_readonly() const
2725 {
2726 #if __APPLE__
2727 if (info.vendor_id() == 0x8086)
2728 return d->dummy_image;
2729 #endif
2730 return d->dummy_image_readonly;
2731 }
2732
get_pipeline_cache() const2733 const PipelineCache* VulkanDevice::get_pipeline_cache() const
2734 {
2735 return d->pipeline_cache;
2736 }
2737
shape_support_image_storage(const Mat & shape) const2738 bool VulkanDevice::shape_support_image_storage(const Mat& shape) const
2739 {
2740 int dims = shape.dims;
2741 int width = shape.w;
2742 int height = shape.h;
2743 int depth = shape.c;
2744 int elempack = shape.elempack;
2745
2746 // large elempack spills on image w
2747 if (elempack == 8) width *= 2;
2748 if (elempack == 16) width *= 4;
2749 if (elempack == 32) width *= 8;
2750 if (elempack == 64) width *= 16;
2751
2752 if (dims == 1)
2753 {
2754 if (width > (int)info.max_image_dimension_1d())
2755 {
2756 return false;
2757 }
2758 }
2759 else if (dims == 2)
2760 {
2761 if (width > (int)info.max_image_dimension_2d() || height > (int)info.max_image_dimension_2d())
2762 {
2763 return false;
2764 }
2765 }
2766 else // if (dims == 3)
2767 {
2768 if (width > (int)info.max_image_dimension_3d() || height > (int)info.max_image_dimension_3d() || depth > (int)info.max_image_dimension_3d())
2769 {
2770 return false;
2771 }
2772 }
2773
2774 return true;
2775 }
2776
get_heap_budget() const2777 uint32_t VulkanDevice::get_heap_budget() const
2778 {
2779 const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();
2780
2781 // the first device local heap
2782 uint32_t device_local_heap_index = 0;
2783 uint32_t device_local_heap_size = 0;
2784 for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
2785 {
2786 const VkMemoryHeap& memoryHeap = memory_properties.memoryHeaps[i];
2787 if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
2788 {
2789 device_local_heap_index = i;
2790 device_local_heap_size = memoryHeap.size / 1024 / 1024;
2791 break;
2792 }
2793 }
2794
2795 if (!info.support_VK_EXT_memory_budget())
2796 {
2797 // NCNN_LOGE("heap budget from assumption\n");
2798
2799 // we usually cannot use all heap
2800 // 70% for 4G+
2801 // 50% for 4G-
2802 return device_local_heap_size >= 4000 ? device_local_heap_size * 0.7 : device_local_heap_size * 0.5;
2803 }
2804
2805 VkPhysicalDeviceMemoryBudgetPropertiesEXT memoryBudgetProperties;
2806 memoryBudgetProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT;
2807 memoryBudgetProperties.pNext = 0;
2808
2809 VkPhysicalDeviceMemoryProperties2KHR memoryProperties;
2810 memoryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR;
2811 memoryProperties.pNext = &memoryBudgetProperties;
2812
2813 vkGetPhysicalDeviceMemoryProperties2KHR(info.physical_device(), &memoryProperties);
2814
2815 return memoryBudgetProperties.heapBudget[device_local_heap_index] / 1024 / 1024;
2816 }
2817
convert_packing(const VkMat & src,VkMat & dst,int dst_elempack,VkCompute & cmd,const Option & _opt) const2818 void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const
2819 {
2820 // buffer2buffer uop is created with use_image_storage disabled
2821 Option opt = _opt;
2822 opt.use_image_storage = false;
2823
2824 int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2825 int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2826
2827 int cast_type_from_index;
2828 if (src.elembits() == 32)
2829 {
2830 cast_type_from_index = 0;
2831 }
2832 else // if (src.elembits() == 16)
2833 {
2834 if (cast_type_to_index != 0)
2835 {
2836 cast_type_from_index = cast_type_to_index;
2837 }
2838 else if (info.support_fp16_storage())
2839 {
2840 cast_type_from_index = 2;
2841 }
2842 else // if (info.support_fp16_packed())
2843 {
2844 cast_type_from_index = 1;
2845 }
2846 }
2847
2848 // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2849
2850 const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2851 uop->forward(src, dst, cmd, opt);
2852 }
2853
convert_packing(const VkImageMat & src,VkImageMat & dst,int dst_elempack,VkCompute & cmd,const Option & opt) const2854 void VulkanDevice::convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
2855 {
2856 int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2857 int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2858
2859 int cast_type_from_index;
2860 if (src.elembits() == 32)
2861 {
2862 cast_type_from_index = 0;
2863 }
2864 else // if (src.elembits() == 16)
2865 {
2866 if (cast_type_to_index != 0)
2867 {
2868 cast_type_from_index = cast_type_to_index;
2869 }
2870 else if (info.support_fp16_storage())
2871 {
2872 cast_type_from_index = 2;
2873 }
2874 else // if (info.support_fp16_packed())
2875 {
2876 cast_type_from_index = 1;
2877 }
2878 }
2879
2880 // NCNN_LOGE("convert_packing i2i %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2881
2882 const ncnn::Packing_vulkan* uop = d->get_utility_operator(1, 1, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2883 uop->forward(src, dst, cmd, opt);
2884 }
2885
convert_packing(const VkMat & src,VkImageMat & dst,int dst_elempack,VkCompute & cmd,const Option & opt) const2886 void VulkanDevice::convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
2887 {
2888 int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2889 int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2890
2891 int cast_type_from_index;
2892 if (src.elembits() == 32)
2893 {
2894 cast_type_from_index = 0;
2895 }
2896 else // if (src.elembits() == 16)
2897 {
2898 if (cast_type_to_index != 0)
2899 {
2900 cast_type_from_index = cast_type_to_index;
2901 }
2902 else if (info.support_fp16_storage())
2903 {
2904 cast_type_from_index = 2;
2905 }
2906 else // if (info.support_fp16_packed())
2907 {
2908 cast_type_from_index = 1;
2909 }
2910 }
2911
2912 // NCNN_LOGE("convert_packing b2i %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2913
2914 const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 1, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2915 uop->forward(src, dst, cmd, opt);
2916 }
2917
convert_packing(const VkImageMat & src,VkMat & dst,int dst_elempack,VkCompute & cmd,const Option & opt) const2918 void VulkanDevice::convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
2919 {
2920 int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
2921 int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
2922
2923 int cast_type_from_index;
2924 if (src.elembits() == 32)
2925 {
2926 cast_type_from_index = 0;
2927 }
2928 else // if (src.elembits() == 16)
2929 {
2930 if (cast_type_to_index != 0)
2931 {
2932 cast_type_from_index = cast_type_to_index;
2933 }
2934 else if (info.support_fp16_storage())
2935 {
2936 cast_type_from_index = 2;
2937 }
2938 else // if (info.support_fp16_packed())
2939 {
2940 cast_type_from_index = 1;
2941 }
2942 }
2943
2944 // NCNN_LOGE("convert_packing i2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
2945
2946 const ncnn::Packing_vulkan* uop = d->get_utility_operator(1, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
2947 uop->forward(src, dst, cmd, opt);
2948 }
2949
init_device_extension()2950 int VulkanDevice::init_device_extension()
2951 {
2952 if (info.support_VK_KHR_bind_memory2())
2953 {
2954 vkBindBufferMemory2KHR = (PFN_vkBindBufferMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindBufferMemory2KHR");
2955 vkBindImageMemory2KHR = (PFN_vkBindImageMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindImageMemory2KHR");
2956 }
2957
2958 if (info.support_VK_KHR_create_renderpass2())
2959 {
2960 vkCmdBeginRenderPass2KHR = (PFN_vkCmdBeginRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdBeginRenderPass2KHR");
2961 vkCmdEndRenderPass2KHR = (PFN_vkCmdEndRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdEndRenderPass2KHR");
2962 vkCmdNextSubpass2KHR = (PFN_vkCmdNextSubpass2KHR)vkGetDeviceProcAddr(d->device, "vkCmdNextSubpass2KHR");
2963 vkCreateRenderPass2KHR = (PFN_vkCreateRenderPass2KHR)vkGetDeviceProcAddr(d->device, "vkCreateRenderPass2KHR");
2964 }
2965
2966 if (info.support_VK_KHR_descriptor_update_template())
2967 {
2968 vkCreateDescriptorUpdateTemplateKHR = (PFN_vkCreateDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCreateDescriptorUpdateTemplateKHR");
2969 vkDestroyDescriptorUpdateTemplateKHR = (PFN_vkDestroyDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkDestroyDescriptorUpdateTemplateKHR");
2970 vkUpdateDescriptorSetWithTemplateKHR = (PFN_vkUpdateDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkUpdateDescriptorSetWithTemplateKHR");
2971 }
2972
2973 if (info.support_VK_KHR_get_memory_requirements2())
2974 {
2975 vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageMemoryRequirements2KHR");
2976 vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetBufferMemoryRequirements2KHR");
2977 vkGetImageSparseMemoryRequirements2KHR = (PFN_vkGetImageSparseMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageSparseMemoryRequirements2KHR");
2978 }
2979
2980 if (info.support_VK_KHR_maintenance1())
2981 {
2982 vkTrimCommandPoolKHR = (PFN_vkTrimCommandPoolKHR)vkGetDeviceProcAddr(d->device, "vkTrimCommandPoolKHR");
2983 }
2984
2985 if (info.support_VK_KHR_maintenance3())
2986 {
2987 vkGetDescriptorSetLayoutSupportKHR = (PFN_vkGetDescriptorSetLayoutSupportKHR)vkGetDeviceProcAddr(d->device, "vkGetDescriptorSetLayoutSupportKHR");
2988 }
2989
2990 if (info.support_VK_KHR_push_descriptor())
2991 {
2992 if (info.support_VK_KHR_descriptor_update_template())
2993 {
2994 vkCmdPushDescriptorSetWithTemplateKHR = (PFN_vkCmdPushDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetWithTemplateKHR");
2995 }
2996
2997 vkCmdPushDescriptorSetKHR = (PFN_vkCmdPushDescriptorSetKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetKHR");
2998 }
2999
3000 if (info.support_VK_KHR_sampler_ycbcr_conversion())
3001 {
3002 vkCreateSamplerYcbcrConversionKHR = (PFN_vkCreateSamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkCreateSamplerYcbcrConversionKHR");
3003 vkDestroySamplerYcbcrConversionKHR = (PFN_vkDestroySamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkDestroySamplerYcbcrConversionKHR");
3004 }
3005
3006 if (info.support_VK_KHR_swapchain())
3007 {
3008 vkCreateSwapchainKHR = (PFN_vkCreateSwapchainKHR)vkGetDeviceProcAddr(d->device, "vkCreateSwapchainKHR");
3009 vkDestroySwapchainKHR = (PFN_vkDestroySwapchainKHR)vkGetDeviceProcAddr(d->device, "vkDestroySwapchainKHR");
3010 vkGetSwapchainImagesKHR = (PFN_vkGetSwapchainImagesKHR)vkGetDeviceProcAddr(d->device, "vkGetSwapchainImagesKHR");
3011 vkAcquireNextImageKHR = (PFN_vkAcquireNextImageKHR)vkGetDeviceProcAddr(d->device, "vkAcquireNextImageKHR");
3012 vkQueuePresentKHR = (PFN_vkQueuePresentKHR)vkGetDeviceProcAddr(d->device, "vkQueuePresentKHR");
3013 }
3014
3015 #if __ANDROID_API__ >= 26
3016 if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
3017 {
3018 vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(d->device, "vkGetAndroidHardwareBufferPropertiesANDROID");
3019 vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(d->device, "vkGetMemoryAndroidHardwareBufferANDROID");
3020 }
3021 #endif // __ANDROID_API__ >= 26
3022
3023 return 0;
3024 }
3025
get_gpu_device(int device_index)3026 VulkanDevice* get_gpu_device(int device_index)
3027 {
3028 try_create_gpu_instance();
3029
3030 if (device_index < 0 || device_index >= g_gpu_count)
3031 return 0;
3032
3033 MutexLockGuard lock(g_default_vkdev_lock);
3034
3035 if (!g_default_vkdev[device_index])
3036 g_default_vkdev[device_index] = new VulkanDevice(device_index);
3037
3038 return g_default_vkdev[device_index];
3039 }
3040
get_default_TBuiltInResource()3041 static TBuiltInResource get_default_TBuiltInResource()
3042 {
3043 TBuiltInResource resource;
3044
3045 resource.maxLights = 32;
3046 resource.maxClipPlanes = 6;
3047 resource.maxTextureUnits = 32;
3048 resource.maxTextureCoords = 32;
3049 resource.maxVertexAttribs = 64;
3050 resource.maxVertexUniformComponents = 4096;
3051 resource.maxVaryingFloats = 64;
3052 resource.maxVertexTextureImageUnits = 32;
3053 resource.maxCombinedTextureImageUnits = 80;
3054 resource.maxTextureImageUnits = 32;
3055 resource.maxFragmentUniformComponents = 4096;
3056 resource.maxDrawBuffers = 32;
3057 resource.maxVertexUniformVectors = 128;
3058 resource.maxVaryingVectors = 8;
3059 resource.maxFragmentUniformVectors = 16;
3060 resource.maxVertexOutputVectors = 16;
3061 resource.maxFragmentInputVectors = 15;
3062 resource.minProgramTexelOffset = -8;
3063 resource.maxProgramTexelOffset = 7;
3064 resource.maxClipDistances = 8;
3065 resource.maxComputeWorkGroupCountX = 65535;
3066 resource.maxComputeWorkGroupCountY = 65535;
3067 resource.maxComputeWorkGroupCountZ = 65535;
3068 resource.maxComputeWorkGroupSizeX = 1024;
3069 resource.maxComputeWorkGroupSizeY = 1024;
3070 resource.maxComputeWorkGroupSizeZ = 64;
3071 resource.maxComputeUniformComponents = 1024;
3072 resource.maxComputeTextureImageUnits = 16;
3073 resource.maxComputeImageUniforms = 8;
3074 resource.maxComputeAtomicCounters = 8;
3075 resource.maxComputeAtomicCounterBuffers = 1;
3076 resource.maxVaryingComponents = 60;
3077 resource.maxVertexOutputComponents = 64;
3078 resource.maxGeometryInputComponents = 64;
3079 resource.maxGeometryOutputComponents = 128;
3080 resource.maxFragmentInputComponents = 128;
3081 resource.maxImageUnits = 8;
3082 resource.maxCombinedImageUnitsAndFragmentOutputs = 8;
3083 resource.maxCombinedShaderOutputResources = 8;
3084 resource.maxImageSamples = 0;
3085 resource.maxVertexImageUniforms = 0;
3086 resource.maxTessControlImageUniforms = 0;
3087 resource.maxTessEvaluationImageUniforms = 0;
3088 resource.maxGeometryImageUniforms = 0;
3089 resource.maxFragmentImageUniforms = 8;
3090 resource.maxCombinedImageUniforms = 8;
3091 resource.maxGeometryTextureImageUnits = 16;
3092 resource.maxGeometryOutputVertices = 256;
3093 resource.maxGeometryTotalOutputComponents = 1024;
3094 resource.maxGeometryUniformComponents = 1024;
3095 resource.maxGeometryVaryingComponents = 64;
3096 resource.maxTessControlInputComponents = 128;
3097 resource.maxTessControlOutputComponents = 128;
3098 resource.maxTessControlTextureImageUnits = 16;
3099 resource.maxTessControlUniformComponents = 1024;
3100 resource.maxTessControlTotalOutputComponents = 4096;
3101 resource.maxTessEvaluationInputComponents = 128;
3102 resource.maxTessEvaluationOutputComponents = 128;
3103 resource.maxTessEvaluationTextureImageUnits = 16;
3104 resource.maxTessEvaluationUniformComponents = 1024;
3105 resource.maxTessPatchComponents = 120;
3106 resource.maxPatchVertices = 32;
3107 resource.maxTessGenLevel = 64;
3108 resource.maxViewports = 16;
3109 resource.maxVertexAtomicCounters = 0;
3110 resource.maxTessControlAtomicCounters = 0;
3111 resource.maxTessEvaluationAtomicCounters = 0;
3112 resource.maxGeometryAtomicCounters = 0;
3113 resource.maxFragmentAtomicCounters = 8;
3114 resource.maxCombinedAtomicCounters = 8;
3115 resource.maxAtomicCounterBindings = 1;
3116 resource.maxVertexAtomicCounterBuffers = 0;
3117 resource.maxTessControlAtomicCounterBuffers = 0;
3118 resource.maxTessEvaluationAtomicCounterBuffers = 0;
3119 resource.maxGeometryAtomicCounterBuffers = 0;
3120 resource.maxFragmentAtomicCounterBuffers = 1;
3121 resource.maxCombinedAtomicCounterBuffers = 1;
3122 resource.maxAtomicCounterBufferSize = 16384;
3123 resource.maxTransformFeedbackBuffers = 4;
3124 resource.maxTransformFeedbackInterleavedComponents = 64;
3125 resource.maxCullDistances = 8;
3126 resource.maxCombinedClipAndCullDistances = 8;
3127 resource.maxSamples = 4;
3128 resource.maxMeshOutputVerticesNV = 256;
3129 resource.maxMeshOutputPrimitivesNV = 512;
3130 resource.maxMeshWorkGroupSizeX_NV = 32;
3131 resource.maxMeshWorkGroupSizeY_NV = 1;
3132 resource.maxMeshWorkGroupSizeZ_NV = 1;
3133 resource.maxTaskWorkGroupSizeX_NV = 32;
3134 resource.maxTaskWorkGroupSizeY_NV = 1;
3135 resource.maxTaskWorkGroupSizeZ_NV = 1;
3136 resource.maxMeshViewCountNV = 4;
3137
3138 // TODO compile-time glslang version check
3139 // resource.maxDualSourceDrawBuffersEXT = 1;
3140
3141 resource.limits.nonInductiveForLoops = 1;
3142 resource.limits.whileLoops = 1;
3143 resource.limits.doWhileLoops = 1;
3144 resource.limits.generalUniformIndexing = 1;
3145 resource.limits.generalAttributeMatrixVectorIndexing = 1;
3146 resource.limits.generalVaryingIndexing = 1;
3147 resource.limits.generalSamplerIndexing = 1;
3148 resource.limits.generalVariableIndexing = 1;
3149 resource.limits.generalConstantMatrixVectorIndexing = 1;
3150
3151 return resource;
3152 }
3153
compile_spirv_module(const char * comp_string,const Option & opt,std::vector<uint32_t> & spirv)3154 int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv)
3155 {
3156 // -1 for omitting the tail '\0'
3157 int length = strlen(comp_string) - 1;
3158 return compile_spirv_module(comp_string, length, opt, spirv);
3159 }
3160
compile_spirv_module(const char * comp_data,int comp_data_size,const Option & opt,std::vector<uint32_t> & spirv)3161 int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv)
3162 {
3163 std::vector<std::pair<const char*, const char*> > custom_defines;
3164
3165 if (opt.use_fp16_storage)
3166 {
3167 custom_defines.push_back(std::make_pair("sfp", "float16_t"));
3168 custom_defines.push_back(std::make_pair("sfpvec2", "f16vec2"));
3169 custom_defines.push_back(std::make_pair("sfpvec4", "f16vec4"));
3170
3171 if (opt.use_fp16_arithmetic)
3172 {
3173 custom_defines.push_back(std::make_pair("sfpvec8", "f16mat2x4"));
3174 custom_defines.push_back(std::make_pair("sfpmat4", "f16mat4"));
3175 }
3176 }
3177 else if (opt.use_fp16_packed)
3178 {
3179 custom_defines.push_back(std::make_pair("sfp", "float"));
3180 custom_defines.push_back(std::make_pair("sfpvec2", "uint"));
3181 custom_defines.push_back(std::make_pair("sfpvec4", "uvec2"));
3182 custom_defines.push_back(std::make_pair("sfpvec8", "uvec4"));
3183 }
3184 else
3185 {
3186 custom_defines.push_back(std::make_pair("sfp", "float"));
3187 custom_defines.push_back(std::make_pair("sfpvec2", "vec2"));
3188 custom_defines.push_back(std::make_pair("sfpvec4", "vec4"));
3189 custom_defines.push_back(std::make_pair("sfpvec8", "mat2x4"));
3190 custom_defines.push_back(std::make_pair("sfpmat4", "mat4"));
3191 }
3192
3193 if (opt.use_fp16_arithmetic)
3194 {
3195 custom_defines.push_back(std::make_pair("afp", "float16_t"));
3196 custom_defines.push_back(std::make_pair("afpvec2", "f16vec2"));
3197 custom_defines.push_back(std::make_pair("afpvec4", "f16vec4"));
3198 custom_defines.push_back(std::make_pair("afpvec8", "f16mat2x4"));
3199 custom_defines.push_back(std::make_pair("afpmat4", "f16mat4"));
3200 }
3201 else
3202 {
3203 custom_defines.push_back(std::make_pair("afp", "float"));
3204 custom_defines.push_back(std::make_pair("afpvec2", "vec2"));
3205 custom_defines.push_back(std::make_pair("afpvec4", "vec4"));
3206 custom_defines.push_back(std::make_pair("afpvec8", "mat2x4"));
3207 custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
3208 }
3209
3210 if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
3211 {
3212 custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
3213 custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
3214 custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3215 custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
3216 custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
3217 custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
3218 custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
3219 custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3220 custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
3221 custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
3222 custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3223 custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
3224 custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
3225 custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
3226 custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
3227 custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3228 custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
3229 custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
3230 custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
3231 custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
3232 }
3233 else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
3234 {
3235 custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float16_t(buf[i])"));
3236 custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float(v);}"));
3237 custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3238 custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"));
3239 custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"));
3240 custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "f16vec2(unpackHalf2x16(buf[i]))"));
3241 custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(vec2(v))}"));
3242 custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3243 custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"));
3244 custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"));
3245 custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3246 custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
3247 custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
3248 custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"));
3249 custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"));
3250 custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3251 custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
3252 custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
3253 }
3254 else if (opt.use_fp16_storage)
3255 {
3256 custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float(buf[i])"));
3257 custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}"));
3258 custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3259 custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"));
3260 custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"));
3261 custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "vec2(buf[i])"));
3262 custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}"));
3263 custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3264 custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(buf[i])"));
3265 custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}"));
3266 custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3267 custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
3268 custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"));
3269 custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"));
3270 custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"));
3271 custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"));
3272 custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"));
3273 custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"));
3274 }
3275 else if (opt.use_fp16_packed)
3276 {
3277 custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
3278 custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
3279 custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3280 custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"));
3281 custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"));
3282 custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])"));
3283 custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v)}"));
3284 custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3285 custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"));
3286 custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"));
3287 custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3288 custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
3289 custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
3290 custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"));
3291 custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"));
3292 custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3293 custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
3294 custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
3295 }
3296 else
3297 {
3298 custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
3299 custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
3300 custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3301 custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
3302 custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
3303 custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
3304 custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
3305 custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3306 custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
3307 custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
3308 custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3309 custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"));
3310 custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
3311 custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
3312 custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
3313 custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
3314 custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
3315 custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
3316 custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
3317 custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
3318 }
3319
3320 if (opt.use_image_storage)
3321 {
3322 if (opt.use_fp16_storage)
3323 {
3324 custom_defines.push_back(std::make_pair("imfmtc1", "r16f"));
3325 custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
3326 custom_defines.push_back(std::make_pair("unfp", "mediump"));
3327 }
3328 else if (opt.use_fp16_packed)
3329 {
3330 custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
3331 custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
3332 custom_defines.push_back(std::make_pair("unfp", "mediump"));
3333 }
3334 else
3335 {
3336 custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
3337 custom_defines.push_back(std::make_pair("imfmtc4", "rgba32f"));
3338 custom_defines.push_back(std::make_pair("unfp", "highp"));
3339 }
3340
3341 if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
3342 {
3343 custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3344 custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3345 custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3346 custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
3347 custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
3348 custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"));
3349 custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3350 custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3351 custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3352 custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3353 custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3354 custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3355 custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
3356 custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
3357 custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
3358 custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3359 custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3360 custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3361 custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3362 custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3363 custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3364 custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}"));
3365 custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"));
3366 custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"));
3367 custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3368 custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3369 custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3370 }
3371 else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
3372 {
3373 custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3374 custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3375 custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
3376 custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3377 custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3378 custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3379 custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3380 custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3381 custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3382 custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3383 custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3384 custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
3385 custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3386 custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3387 custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3388 custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3389 custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3390 custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3391 custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3392 custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3393 custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3394 custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3395 custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3396 custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3397 custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3398 custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3399 custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3400 }
3401 else if (opt.use_fp16_storage)
3402 {
3403 custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3404 custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3405 custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3406 custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3407 custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3408 custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3409 custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3410 custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3411 custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3412 custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3413 custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3414 custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3415 custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3416 custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3417 custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3418 custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3419 custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3420 custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3421 custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3422 custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3423 custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3424 custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3425 custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3426 custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3427 custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3428 custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3429 custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3430 }
3431 else if (opt.use_fp16_packed)
3432 {
3433 custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3434 custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3435 custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3436 custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3437 custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3438 custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3439 custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3440 custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3441 custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3442 custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3443 custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3444 custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3445 custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3446 custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3447 custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3448 custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3449 custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3450 custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3451 custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3452 custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3453 custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3454 custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3455 custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3456 custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3457 custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3458 custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3459 custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3460 }
3461 else
3462 {
3463 custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3464 custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3465 custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
3466 custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3467 custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3468 custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
3469 custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3470 custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3471 custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3472 custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3473 custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3474 custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
3475 custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3476 custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3477 custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
3478 custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3479 custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3480 custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
3481 custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
3482 custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
3483 custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
3484 custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
3485 custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
3486 custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
3487 custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
3488 custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
3489 custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
3490 }
3491 }
3492
3493 custom_defines.push_back(std::make_pair("psc(x)", "(x==0?p.x:x)"));
3494
3495 if (opt.use_fp16_storage)
3496 {
3497 custom_defines.push_back(std::make_pair("NCNN_fp16_storage", "1"));
3498 }
3499 else if (opt.use_fp16_packed)
3500 {
3501 custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
3502 }
3503
3504 if (opt.use_fp16_arithmetic)
3505 {
3506 custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
3507 }
3508
3509 if (opt.use_int8_storage)
3510 {
3511 custom_defines.push_back(std::make_pair("NCNN_int8_storage", "1"));
3512 }
3513 else if (opt.use_int8_packed)
3514 {
3515 custom_defines.push_back(std::make_pair("NCNN_int8_packed", "1"));
3516 }
3517
3518 if (opt.use_int8_arithmetic)
3519 {
3520 custom_defines.push_back(std::make_pair("NCNN_int8_arithmetic", "1"));
3521 }
3522
3523 if (opt.use_image_storage)
3524 {
3525 custom_defines.push_back(std::make_pair("NCNN_image_shader", "1"));
3526 }
3527
3528 if (opt.use_subgroup_basic)
3529 {
3530 custom_defines.push_back(std::make_pair("NCNN_subgroup_basic", "1"));
3531
3532 if (opt.use_subgroup_vote)
3533 {
3534 custom_defines.push_back(std::make_pair("NCNN_subgroup_vote", "1"));
3535 }
3536 if (opt.use_subgroup_ballot)
3537 {
3538 custom_defines.push_back(std::make_pair("NCNN_subgroup_ballot", "1"));
3539 }
3540 if (opt.use_subgroup_shuffle)
3541 {
3542 custom_defines.push_back(std::make_pair("NCNN_subgroup_shuffle", "1"));
3543 }
3544 }
3545
3546 std::string preamble;
3547 std::vector<std::string> processes;
3548
3549 processes.resize(custom_defines.size());
3550 for (size_t i = 0; i < custom_defines.size(); i++)
3551 {
3552 const char* key = custom_defines[i].first;
3553 const char* def = custom_defines[i].second;
3554
3555 preamble += std::string("#define ") + key + " " + def + "\n";
3556 processes[i] = std::string("define-macro ") + key + "=" + def;
3557 }
3558
3559 bool compile_success = true;
3560
3561 {
3562 glslang::TShader s(EShLangCompute);
3563
3564 s.setStringsWithLengths(&comp_data, &comp_data_size, 1);
3565
3566 s.setPreamble(preamble.c_str());
3567 s.addProcesses(processes);
3568 s.setEntryPoint("main");
3569 s.setSourceEntryPoint("main");
3570
3571 s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1);
3572
3573 if (opt.use_subgroup_basic)
3574 {
3575 // subgroup need vulkan-1.1 and spirv-1.3
3576 s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_1);
3577 s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_3);
3578 }
3579 else
3580 {
3581 s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0);
3582 s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_0);
3583 }
3584
3585 TBuiltInResource resources = get_default_TBuiltInResource();
3586
3587 bool pr = s.parse(&resources, 100, false, EShMsgDefault);
3588 if (!pr)
3589 {
3590 NCNN_LOGE("compile spir-v module failed");
3591 NCNN_LOGE("%s", s.getInfoLog());
3592 NCNN_LOGE("%s", s.getInfoDebugLog());
3593
3594 compile_success = false;
3595 }
3596 else
3597 {
3598 glslang::TIntermediate* ir = s.getIntermediate();
3599 glslang::GlslangToSpv(*ir, spirv);
3600 }
3601 }
3602
3603 return compile_success ? 0 : -1;
3604 }
3605
compile_spirv_module(int shader_type_index,const Option & opt,std::vector<uint32_t> & spirv)3606 int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv)
3607 {
3608 if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
3609 {
3610 NCNN_LOGE("no such shader module %d", shader_type_index);
3611 return -1;
3612 }
3613
3614 const char* comp_data = layer_shader_registry[shader_type_index].comp_data;
3615 int comp_data_size = layer_shader_registry[shader_type_index].comp_data_size;
3616
3617 return compile_spirv_module(comp_data, comp_data_size, opt, spirv);
3618 }
3619
// Minimal SPIR-V reflection: walks the raw SPIR-V word stream once and fills
// shader_info with the counts the pipeline layer needs — specialization
// constants, push-constant struct members, and descriptor bindings along with
// each binding's type. Returns 0 on success, -1 if more than 16 bindings are
// declared. Assumes spv_data points to a valid SPIR-V module (no header or
// bounds validation beyond the overall size) — caller must guarantee this.
int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info)
{
    shader_info.specialization_count = 0;
    shader_info.binding_count = 0;
    shader_info.push_constant_count = 0;

    // Sentinel: wraps to a huge uint32_t that cannot collide with a real SPIR-V id,
    // so OpMemberName entries are only counted once the "parameter" struct is seen.
    uint32_t parameter_id = -233;

    int specialization_count = 0;
    int binding_count = 0;
    int push_constant_count = 0;

    // id -> binding_type
    std::vector<int> id_types;

    // binding_id -> binding_type
    std::vector<int> binding_types;

    const uint32_t* p = spv_data;

    // Word 3 of the SPIR-V header is the id bound (one past the largest id used).
    int bound = p[3];

    id_types.resize(bound);

    // skip magic version generator bound schema
    p += 5;

    // foreach op
    while ((const unsigned char*)p < (const unsigned char*)spv_data + spv_data_size)
    {
        uint32_t opcode = p[0];

        // Each instruction's first word packs the word count (high 16 bits)
        // and the opcode number (low 16 bits).
        uint16_t wordcount = opcode >> 16;
        uint16_t op = opcode & 0xffff;

        if (op == 5) // OpName
        {
            // Remember the id of the struct literally named "parameter" —
            // by ncnn shader convention this is the push-constant block.
            uint32_t id = p[1];
            const char* name = (const char*)&p[2];
            if (strcmp(name, "parameter") == 0)
            {
                parameter_id = id;
            }
        }
        else if (op == 6) // OpMemberName
        {
            // Every named member of the "parameter" struct is one push constant.
            uint32_t id = p[1];
            if (id == parameter_id)
            {
                push_constant_count++;
            }
        }
        else if (op == 25) // OpTypeImage
        {
            // binding_type 2 == storage image
            uint32_t id = p[1];
            id_types[id] = 2;
        }
        else if (op == 27) // OpTypeSampledImage
        {
            // binding_type 3 == combined image sampler
            uint32_t id = p[1];
            id_types[id] = 3;
        }
        else if (op == 32) // OpTypePointer
        {
            // Propagate the pointee's binding type onto the pointer type id, so
            // the later OpVariable can inherit it through one more hop.
            uint32_t id = p[1];
            uint32_t storage_class = p[2];
            uint32_t type = p[3];
            if (storage_class == 0) // UniformConstant
            {
                id_types[id] = id_types[type];
            }
            if (storage_class == 2) // Uniform
            {
                id_types[id] = id_types[type];
            }
            if (storage_class == 12) // StorageBuffer
            {
                // SPIR-V 1.3+ storage buffers use this storage class instead of
                // the BufferBlock decoration, so tag the pointee here (type 1).
                id_types[type] = 1;
                id_types[id] = id_types[type];
            }
        }
        else if (op == 59) // OpVariable
        {
            // NOTE: p[1] is the variable's result *type* id and p[2] its result id
            // in this layout; copy the type's binding type onto the variable id.
            uint32_t id = p[1];
            uint32_t var_id = p[2];
            uint32_t storage_class = p[3];
            if (storage_class == 0) // UniformConstant
            {
                id_types[var_id] = id_types[id];
            }
            if (storage_class == 2) // Uniform
            {
                id_types[var_id] = id_types[id];
            }
            if (storage_class == 12) // StorageBuffer
            {
                id_types[var_id] = id_types[id];
            }
        }
        else if (op == 71) // OpDecorate
        {
            uint32_t id = p[1];
            uint32_t decoration = p[2];
            uint32_t binding_id = p[3];
            if (decoration == 1) // SpecId
            {
                specialization_count++;
            }
            if (decoration == 3) // BufferBlock
            {
                // Pre-1.3 style storage buffer: Uniform storage class + BufferBlock.
                id_types[id] = 1;
            }
            else if (decoration == 33) // Binding
            {
                // Track the highest binding slot seen; resize() preserves
                // earlier entries, so out-of-order Binding decorations are fine.
                binding_count = std::max(binding_count, (int)binding_id + 1);

                binding_types.resize(binding_count);
                binding_types[binding_id] = id;
            }
        }

        p += wordcount;
    }

    // ncnn descriptor set layouts support at most 16 bindings.
    if (binding_count > 16)
    {
        NCNN_LOGE("too many binding %d", binding_count);
        return -1;
    }

    shader_info.specialization_count = specialization_count;
    shader_info.binding_count = binding_count;
    shader_info.push_constant_count = push_constant_count;

    // resolve binding_types
    for (int i = 0; i < binding_count; i++)
    {
        shader_info.binding_types[i] = id_types[binding_types[i]];
    }

    return 0;
}
3762
3763 } // namespace ncnn
3764
3765 #endif // NCNN_VULKAN
3766