1 /* Copyright (c) 2017-2020 Hans-Kristian Arntzen
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining
4 * a copy of this software and associated documentation files (the
5 * "Software"), to deal in the Software without restriction, including
6 * without limitation the rights to use, copy, modify, merge, publish,
7 * distribute, sublicense, and/or sell copies of the Software, and to
8 * permit persons to whom the Software is furnished to do so, subject to
9 * the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "device.hpp"
24 #include "format.hpp"
25 #include "type_to_string.hpp"
26 #include "quirks.hpp"
27 #include "timer.hpp"
28 #include <algorithm>
29 #include <string.h>
30 #include <stdlib.h>
31
32 #ifdef _WIN32
33 #define WIN32_LEAN_AND_MEAN
34 #include <windows.h>
35 #endif
36
37 #ifdef GRANITE_VULKAN_FILESYSTEM
38 #include "string_helpers.hpp"
39 #endif
40
41 #ifdef GRANITE_VULKAN_MT
42 #include "thread_id.hpp"
43 static unsigned get_thread_index()
44 {
45 return Vulkan::get_current_thread_index();
46 }
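// In MT builds, LOCK() serializes access to device-wide state, while DRAIN_FRAME_LOCK()
// additionally waits until the outstanding command buffer counter drops to zero before
// proceeding. The single-threaded fallbacks below are a no-op and an assert, respectively.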
47 #define LOCK() std::lock_guard<std::mutex> holder__{lock.lock}
48 #define DRAIN_FRAME_LOCK() \
49 std::unique_lock<std::mutex> holder__{lock.lock}; \
50 lock.cond.wait(holder__, [&]() { \
51 return lock.counter == 0; \
52 })
53 #else
54 #define LOCK() ((void)0)
55 #define DRAIN_FRAME_LOCK() VK_ASSERT(lock.counter == 0)
56 static unsigned get_thread_index()
57 {
58 return 0;
59 }
60 #endif
61
62 using namespace std;
63 using namespace Util;
64
65 namespace Vulkan
66 {
67 Device::Device()
68 : framebuffer_allocator(this)
69 , transient_allocator(this)
70 #ifdef GRANITE_VULKAN_FILESYSTEM
71 , shader_manager(this)
72 , texture_manager(this)
73 #endif
74 {
75 #ifdef GRANITE_VULKAN_MT
76 cookie.store(0);
77 #endif
78
79 if (const char *env = getenv("GRANITE_TIMESTAMP_TRACE"))
80 {
81 LOGI("Tracing timestamps to %s.\n", env);
82 if (!init_timestamp_trace(env))
83 LOGE("Failed to init timestamp trace.\n");
84 }
85 }
86
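// Returns a plain binary VkSemaphore in the unsignalled state, recycled from the
// semaphore manager. "Legacy" distinguishes it from timeline semaphores.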
87 Semaphore Device::request_legacy_semaphore()
88 {
89 LOCK();
90 auto semaphore = managers.semaphore.request_cleared_semaphore();
91 Semaphore ptr(handle_pool.semaphores.allocate(this, semaphore, false));
92 return ptr;
93 }
94
95 Semaphore Device::request_external_semaphore(VkSemaphore semaphore, bool signalled)
96 {
97 LOCK();
98 VK_ASSERT(semaphore);
99 Semaphore ptr(handle_pool.semaphores.allocate(this, semaphore, signalled));
100 return ptr;
101 }
102
103 #ifndef _WIN32
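// Imports a POSIX file descriptor as a binary semaphore (non-Windows only).
// The import uses TEMPORARY permanence; the wrapper is marked as externally
// signalled and destroyed once a wait consumes it.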
104 Semaphore Device::request_imported_semaphore(int fd, VkExternalSemaphoreHandleTypeFlagBitsKHR handle_type)
105 {
106 LOCK();
107 if (!ext.supports_external)
108 return {};
109
110 VkExternalSemaphorePropertiesKHR props = { VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR };
111 VkPhysicalDeviceExternalSemaphoreInfoKHR info = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR };
112 info.handleType = handle_type;
113
114 vkGetPhysicalDeviceExternalSemaphorePropertiesKHR(gpu, &info, &props);
115 if ((props.externalSemaphoreFeatures & VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR) == 0)
116 return Semaphore(nullptr);
117
118 auto semaphore = managers.semaphore.request_cleared_semaphore();
119
120 VkImportSemaphoreFdInfoKHR import = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR };
121 import.fd = fd;
122 import.semaphore = semaphore;
123 import.handleType = handle_type;
124 import.flags = VK_SEMAPHORE_IMPORT_TEMPORARY_BIT_KHR;
125 Semaphore ptr(handle_pool.semaphores.allocate(this, semaphore, false));
126
127 if (table->vkImportSemaphoreFdKHR(device, &import) != VK_SUCCESS)
128 return Semaphore(nullptr);
129
130 ptr->signal_external();
131 ptr->destroy_on_consume();
132 return ptr;
133 }
134 #endif
135
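// Registers a semaphore which the next submission to the given queue type must wait on
// at the specified pipeline stages. With flush == true, pending work for that queue is
// flushed first so it does not pick up the wait.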
136 void Device::add_wait_semaphore(CommandBuffer::Type type, Semaphore semaphore, VkPipelineStageFlags stages, bool flush)
137 {
138 LOCK();
139 add_wait_semaphore_nolock(type, semaphore, stages, flush);
140 }
141
142 void Device::add_wait_semaphore_nolock(CommandBuffer::Type type, Semaphore semaphore, VkPipelineStageFlags stages,
143 bool flush)
144 {
145 VK_ASSERT(stages != 0);
146 if (flush)
147 flush_frame(type);
148 auto &data = get_queue_data(type);
149
150 #ifdef VULKAN_DEBUG
151 for (auto &sem : data.wait_semaphores)
152 VK_ASSERT(sem.get() != semaphore.get());
153 #endif
154
155 semaphore->signal_pending_wait();
156 data.wait_semaphores.push_back(semaphore);
157 data.wait_stages.push_back(stages);
158 data.need_fence = true;
159
160 // Sanity check.
161 VK_ASSERT(data.wait_semaphores.size() < 16 * 1024);
162 }
163
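// Creates an image whose pixels can be written directly from the CPU. Preferably this is
// a host-visible linear image in GENERAL layout; if the implementation cannot provide one,
// it falls back to an optimally tiled image plus a host-visible staging buffer which is
// copied over on unmap.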
164 LinearHostImageHandle Device::create_linear_host_image(const LinearHostImageCreateInfo &info)
165 {
166 if ((info.usage & ~VK_IMAGE_USAGE_SAMPLED_BIT) != 0)
167 return LinearHostImageHandle(nullptr);
168
169 ImageCreateInfo create_info;
170 create_info.width = info.width;
171 create_info.height = info.height;
172 create_info.domain =
173 (info.flags & LINEAR_HOST_IMAGE_HOST_CACHED_BIT) != 0 ?
174 ImageDomain::LinearHostCached :
175 ImageDomain::LinearHost;
176 create_info.levels = 1;
177 create_info.layers = 1;
178 create_info.initial_layout = VK_IMAGE_LAYOUT_GENERAL;
179 create_info.format = info.format;
180 create_info.samples = VK_SAMPLE_COUNT_1_BIT;
181 create_info.usage = info.usage;
182 create_info.type = VK_IMAGE_TYPE_2D;
183
184 if ((info.flags & LINEAR_HOST_IMAGE_REQUIRE_LINEAR_FILTER_BIT) != 0)
185 create_info.misc |= IMAGE_MISC_VERIFY_FORMAT_FEATURE_SAMPLED_LINEAR_FILTER_BIT;
186 if ((info.flags & LINEAR_HOST_IMAGE_IGNORE_DEVICE_LOCAL_BIT) != 0)
187 create_info.misc |= IMAGE_MISC_LINEAR_IMAGE_IGNORE_DEVICE_LOCAL_BIT;
188
189 BufferHandle cpu_image;
190 auto gpu_image = create_image(create_info);
191 if (!gpu_image)
192 {
193 // Fall-back to staging buffer.
194 create_info.domain = ImageDomain::Physical;
195 create_info.initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
196 create_info.misc = IMAGE_MISC_CONCURRENT_QUEUE_GRAPHICS_BIT | IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_TRANSFER_BIT;
197 create_info.usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
198 gpu_image = create_image(create_info);
199 if (!gpu_image)
200 return LinearHostImageHandle(nullptr);
201
202 BufferCreateInfo buffer;
203 buffer.domain =
204 (info.flags & LINEAR_HOST_IMAGE_HOST_CACHED_BIT) != 0 ?
205 BufferDomain::CachedHost :
206 BufferDomain::Host;
207 buffer.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
208 buffer.size = info.width * info.height * TextureFormatLayout::format_block_size(info.format, format_to_aspect_mask(info.format));
209 cpu_image = create_buffer(buffer);
210 if (!cpu_image)
211 return LinearHostImageHandle(nullptr);
212 }
213 else
214 gpu_image->set_layout(Layout::General);
215
216 return LinearHostImageHandle(handle_pool.linear_images.allocate(this, move(gpu_image), move(cpu_image), info.stages));
217 }
218
219 void *Device::map_linear_host_image(const LinearHostImage &image, MemoryAccessFlags access)
220 {
221 void *host = managers.memory.map_memory(image.get_host_visible_allocation(), access,
222 0, image.get_host_visible_allocation().get_size());
223 return host;
224 }
225
226 void Device::unmap_linear_host_image_and_sync(const LinearHostImage &image, MemoryAccessFlags access)
227 {
228 managers.memory.unmap_memory(image.get_host_visible_allocation(), access,
229 0, image.get_host_visible_allocation().get_size());
230 if (image.need_staging_copy())
231 {
232 // Kinda icky fallback, shouldn't really be used on discrete cards.
233 auto cmd = request_command_buffer(CommandBuffer::Type::AsyncTransfer);
234 cmd->image_barrier(image.get_image(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
235 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0,
236 VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
237 cmd->copy_buffer_to_image(image.get_image(), image.get_host_visible_buffer(),
238 0, {},
239 { image.get_image().get_width(), image.get_image().get_height(), 1 },
240 0, 0, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 });
241
242 // Don't care about dstAccessMask, semaphore takes care of everything.
243 cmd->image_barrier(image.get_image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
244 VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
245 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0);
246
247 Semaphore sem;
248 submit(cmd, nullptr, 1, &sem);
249
250 // The queue type is an assumption. Should add some parameter for that.
251 add_wait_semaphore(CommandBuffer::Type::Generic, sem, image.get_used_pipeline_stages(), true);
252 }
253 }
254
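// Minimal usage sketch (buffer, data and size are placeholders): map for writing,
// copy, then unmap with the same access flags so non-coherent memory gets flushed.
//   void *ptr = device.map_host_buffer(*buffer, MEMORY_ACCESS_WRITE_BIT);
//   memcpy(ptr, data, size);
//   device.unmap_host_buffer(*buffer, MEMORY_ACCESS_WRITE_BIT);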
255 void *Device::map_host_buffer(const Buffer &buffer, MemoryAccessFlags access)
256 {
257 void *host = managers.memory.map_memory(buffer.get_allocation(), access, 0, buffer.get_create_info().size);
258 return host;
259 }
260
261 void Device::unmap_host_buffer(const Buffer &buffer, MemoryAccessFlags access)
262 {
263 managers.memory.unmap_memory(buffer.get_allocation(), access, 0, buffer.get_create_info().size);
264 }
265
266 void *Device::map_host_buffer(const Buffer &buffer, MemoryAccessFlags access, VkDeviceSize offset, VkDeviceSize length)
267 {
268 VK_ASSERT(offset + length <= buffer.get_create_info().size);
269 void *host = managers.memory.map_memory(buffer.get_allocation(), access, offset, length);
270 return host;
271 }
272
273 void Device::unmap_host_buffer(const Buffer &buffer, MemoryAccessFlags access, VkDeviceSize offset, VkDeviceSize length)
274 {
275 VK_ASSERT(offset + length <= buffer.get_create_info().size);
276 managers.memory.unmap_memory(buffer.get_allocation(), access, offset, length);
277 }
278
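// Shaders and programs are cached by hash. request_shader() hashes the SPIR-V blob and
// returns the existing module if one was already created for the same data.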
279 Shader *Device::request_shader(const uint32_t *data, size_t size)
280 {
281 Util::Hasher hasher;
282 hasher.data(data, size);
283
284 auto hash = hasher.get();
285 auto *ret = shaders.find(hash);
286 if (!ret)
287 ret = shaders.emplace_yield(hash, hash, this, data, size);
288 return ret;
289 }
290
291 Shader *Device::request_shader_by_hash(Hash hash)
292 {
293 return shaders.find(hash);
294 }
295
296 Program *Device::request_program(Vulkan::Shader *compute_shader)
297 {
298 if (!compute_shader)
299 return nullptr;
300
301 Util::Hasher hasher;
302 hasher.u64(compute_shader->get_hash());
303
304 auto hash = hasher.get();
305 auto *ret = programs.find(hash);
306 if (!ret)
307 ret = programs.emplace_yield(hash, this, compute_shader);
308 return ret;
309 }
310
311 Program *Device::request_program(const uint32_t *compute_data, size_t compute_size)
312 {
313 if (!compute_size)
314 return nullptr;
315
316 auto *compute_shader = request_shader(compute_data, compute_size);
317 return request_program(compute_shader);
318 }
319
320 Program *Device::request_program(Shader *vertex, Shader *fragment)
321 {
322 if (!vertex || !fragment)
323 return nullptr;
324
325 Util::Hasher hasher;
326 hasher.u64(vertex->get_hash());
327 hasher.u64(fragment->get_hash());
328
329 auto hash = hasher.get();
330 auto *ret = programs.find(hash);
331
332 if (!ret)
333 ret = programs.emplace_yield(hash, this, vertex, fragment);
334 return ret;
335 }
336
337 Program *Device::request_program(const uint32_t *vertex_data, size_t vertex_size, const uint32_t *fragment_data,
338 size_t fragment_size)
339 {
340 if (!vertex_size || !fragment_size)
341 return nullptr;
342
343 auto *vertex = request_shader(vertex_data, vertex_size);
344 auto *fragment = request_shader(fragment_data, fragment_size);
345 return request_program(vertex, fragment);
346 }
347
348 PipelineLayout *Device::request_pipeline_layout(const CombinedResourceLayout &layout)
349 {
350 Hasher h;
351 h.data(reinterpret_cast<const uint32_t *>(layout.sets), sizeof(layout.sets));
352 h.data(&layout.stages_for_bindings[0][0], sizeof(layout.stages_for_bindings));
353 h.u32(layout.push_constant_range.stageFlags);
354 h.u32(layout.push_constant_range.size);
355 h.data(layout.spec_constant_mask, sizeof(layout.spec_constant_mask));
356 h.u32(layout.attribute_mask);
357 h.u32(layout.render_target_mask);
358
359 auto hash = h.get();
360 auto *ret = pipeline_layouts.find(hash);
361 if (!ret)
362 ret = pipeline_layouts.emplace_yield(hash, hash, this, layout);
363 return ret;
364 }
365
366 DescriptorSetAllocator *Device::request_descriptor_set_allocator(const DescriptorSetLayout &layout, const uint32_t *stages_for_bindings)
367 {
368 Hasher h;
369 h.data(reinterpret_cast<const uint32_t *>(&layout), sizeof(layout));
370 h.data(stages_for_bindings, sizeof(uint32_t) * VULKAN_NUM_BINDINGS);
371 auto hash = h.get();
372
373 auto *ret = descriptor_set_allocators.find(hash);
374 if (!ret)
375 ret = descriptor_set_allocators.emplace_yield(hash, hash, this, layout, stages_for_bindings);
376 return ret;
377 }
378
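// Merges the reflected resource layouts of all shader stages in the program into one
// CombinedResourceLayout, validates bindless and array usage, and requests the matching
// pipeline layout.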
379 void Device::bake_program(Program &program)
380 {
381 CombinedResourceLayout layout;
382 if (program.get_shader(ShaderStage::Vertex))
383 layout.attribute_mask = program.get_shader(ShaderStage::Vertex)->get_layout().input_mask;
384 if (program.get_shader(ShaderStage::Fragment))
385 layout.render_target_mask = program.get_shader(ShaderStage::Fragment)->get_layout().output_mask;
386
387 layout.descriptor_set_mask = 0;
388
389 for (unsigned i = 0; i < static_cast<unsigned>(ShaderStage::Count); i++)
390 {
391 auto *shader = program.get_shader(static_cast<ShaderStage>(i));
392 if (!shader)
393 continue;
394
395 uint32_t stage_mask = 1u << i;
396
397 auto &shader_layout = shader->get_layout();
398 for (unsigned set = 0; set < VULKAN_NUM_DESCRIPTOR_SETS; set++)
399 {
400 layout.sets[set].sampled_image_mask |= shader_layout.sets[set].sampled_image_mask;
401 layout.sets[set].storage_image_mask |= shader_layout.sets[set].storage_image_mask;
402 layout.sets[set].uniform_buffer_mask |= shader_layout.sets[set].uniform_buffer_mask;
403 layout.sets[set].storage_buffer_mask |= shader_layout.sets[set].storage_buffer_mask;
404 layout.sets[set].sampled_buffer_mask |= shader_layout.sets[set].sampled_buffer_mask;
405 layout.sets[set].input_attachment_mask |= shader_layout.sets[set].input_attachment_mask;
406 layout.sets[set].sampler_mask |= shader_layout.sets[set].sampler_mask;
407 layout.sets[set].separate_image_mask |= shader_layout.sets[set].separate_image_mask;
408 layout.sets[set].fp_mask |= shader_layout.sets[set].fp_mask;
409
410 for_each_bit(shader_layout.sets[set].immutable_sampler_mask, [&](uint32_t binding) {
411 StockSampler sampler = get_immutable_sampler(shader_layout.sets[set], binding);
412
413 // Do we already have an immutable sampler? Make sure it matches the layout.
414 if (has_immutable_sampler(layout.sets[set], binding))
415 {
416 if (sampler != get_immutable_sampler(layout.sets[set], binding))
417 LOGE("Immutable sampler mismatch detected!\n");
418 }
419
420 set_immutable_sampler(layout.sets[set], binding, sampler);
421 });
422
423 uint32_t active_binds =
424 shader_layout.sets[set].sampled_image_mask |
425 shader_layout.sets[set].storage_image_mask |
426 shader_layout.sets[set].uniform_buffer_mask |
427 shader_layout.sets[set].storage_buffer_mask |
428 shader_layout.sets[set].sampled_buffer_mask |
429 shader_layout.sets[set].input_attachment_mask |
430 shader_layout.sets[set].sampler_mask |
431 shader_layout.sets[set].separate_image_mask;
432
433 if (active_binds)
434 layout.stages_for_sets[set] |= stage_mask;
435
436 for_each_bit(active_binds, [&](uint32_t bit) {
437 layout.stages_for_bindings[set][bit] |= stage_mask;
438
439 auto &combined_size = layout.sets[set].array_size[bit];
440 auto &shader_size = shader_layout.sets[set].array_size[bit];
441 if (combined_size && combined_size != shader_size)
442 LOGE("Mismatch between array sizes in different shaders.\n");
443 else
444 combined_size = shader_size;
445 });
446 }
447
448 // Merge push constant ranges into one range.
449 // Do not try to split into multiple ranges as it just complicates things for no obvious gain.
450 if (shader_layout.push_constant_size != 0)
451 {
452 layout.push_constant_range.stageFlags |= 1u << i;
453 layout.push_constant_range.size =
454 std::max(layout.push_constant_range.size, shader_layout.push_constant_size);
455 }
456
457 layout.spec_constant_mask[i] = shader_layout.spec_constant_mask;
458 layout.combined_spec_constant_mask |= shader_layout.spec_constant_mask;
459 layout.bindless_descriptor_set_mask |= shader_layout.bindless_set_mask;
460 }
461
462 for (unsigned set = 0; set < VULKAN_NUM_DESCRIPTOR_SETS; set++)
463 {
464 if (layout.stages_for_sets[set] != 0)
465 {
466 layout.descriptor_set_mask |= 1u << set;
467
468 for (unsigned binding = 0; binding < VULKAN_NUM_BINDINGS; binding++)
469 {
470 auto &array_size = layout.sets[set].array_size[binding];
471 if (array_size == DescriptorSetLayout::UNSIZED_ARRAY)
472 {
473 for (unsigned i = 1; i < VULKAN_NUM_BINDINGS; i++)
474 {
475 if (layout.stages_for_bindings[set][i] != 0)
476 LOGE("Using bindless for set = %u, but binding = %u has a descriptor attached to it.\n", set, i);
477 }
478
479 // Allows us to have one unified descriptor set layout for bindless.
480 layout.stages_for_bindings[set][binding] = VK_SHADER_STAGE_ALL;
481 }
482 else if (array_size == 0)
483 {
484 array_size = 1;
485 }
486 else
487 {
488 for (unsigned i = 1; i < array_size; i++)
489 {
490 if (layout.stages_for_bindings[set][binding + i] != 0)
491 {
492 LOGE("Detected binding aliasing for (%u, %u). Binding array with %u elements starting at (%u, %u) overlaps.\n",
493 set, binding + i, array_size, set, binding);
494 }
495 }
496 }
497 }
498 }
499 }
500
501 Hasher h;
502 h.u32(layout.push_constant_range.stageFlags);
503 h.u32(layout.push_constant_range.size);
504 layout.push_constant_layout_hash = h.get();
505 program.set_pipeline_layout(request_pipeline_layout(layout));
506 }
507
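// The serialized cache blob is prefixed with the device's pipelineCacheUUID. If the UUID
// does not match the current device/driver, the data is ignored and a fresh, empty
// pipeline cache is created instead.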
508 bool Device::init_pipeline_cache(const uint8_t *data, size_t size)
509 {
510 static const auto uuid_size = sizeof(gpu_props.pipelineCacheUUID);
511
512 VkPipelineCacheCreateInfo info = { VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO };
513 if (!data || size < uuid_size)
514 {
515 LOGI("Creating a fresh pipeline cache.\n");
516 }
517 else if (memcmp(data, gpu_props.pipelineCacheUUID, uuid_size) != 0)
518 {
519 LOGI("Pipeline cache UUID changed.\n");
520 }
521 else
522 {
523 info.initialDataSize = size - uuid_size;
524 info.pInitialData = data + uuid_size;
525 LOGI("Initializing pipeline cache.\n");
526 }
527
528 if (pipeline_cache != VK_NULL_HANDLE)
529 table->vkDestroyPipelineCache(device, pipeline_cache, nullptr);
530 pipeline_cache = VK_NULL_HANDLE;
531 return table->vkCreatePipelineCache(device, &info, nullptr, &pipeline_cache) == VK_SUCCESS;
532 }
533
534 static inline char to_hex(uint8_t v)
535 {
536 if (v < 10)
537 return char('0' + v);
538 else
539 return char('a' + (v - 10));
540 }
541
542 string Device::get_pipeline_cache_string() const
543 {
544 string res;
545 res.reserve(sizeof(gpu_props.pipelineCacheUUID) * 2);
546
547 for (auto &c : gpu_props.pipelineCacheUUID)
548 {
549 res += to_hex(uint8_t((c >> 4) & 0xf));
550 res += to_hex(uint8_t(c & 0xf));
551 }
552
553 return res;
554 }
555
556 void Device::init_pipeline_cache()
557 {
558 #ifdef GRANITE_VULKAN_FILESYSTEM
559 auto file = Granite::Global::filesystem()->open(Util::join("cache://pipeline_cache_", get_pipeline_cache_string(), ".bin"),
560 Granite::FileMode::ReadOnly);
561 if (file)
562 {
563 auto size = file->get_size();
564 auto *mapped = static_cast<uint8_t *>(file->map());
565 if (mapped && !init_pipeline_cache(mapped, size))
566 LOGE("Failed to initialize pipeline cache.\n");
567 }
568 else if (!init_pipeline_cache(nullptr, 0))
569 LOGE("Failed to initialize pipeline cache.\n");
570 #endif
571 }
572
573 size_t Device::get_pipeline_cache_size()
574 {
575 if (pipeline_cache == VK_NULL_HANDLE)
576 return 0;
577
578 static const auto uuid_size = sizeof(gpu_props.pipelineCacheUUID);
579 size_t size = 0;
580 if (table->vkGetPipelineCacheData(device, pipeline_cache, &size, nullptr) != VK_SUCCESS)
581 {
582 LOGE("Failed to get pipeline cache data.\n");
583 return 0;
584 }
585
586 return size + uuid_size;
587 }
588
589 bool Device::get_pipeline_cache_data(uint8_t *data, size_t size)
590 {
591 if (pipeline_cache == VK_NULL_HANDLE)
592 return false;
593
594 static const auto uuid_size = sizeof(gpu_props.pipelineCacheUUID);
595 if (size < uuid_size)
596 return false;
597
598 size -= uuid_size;
599 memcpy(data, gpu_props.pipelineCacheUUID, uuid_size);
600 data += uuid_size;
601
602 if (table->vkGetPipelineCacheData(device, pipeline_cache, &size, data) != VK_SUCCESS)
603 {
604 LOGE("Failed to get pipeline cache data.\n");
605 return false;
606 }
607
608 return true;
609 }
610
611 void Device::flush_pipeline_cache()
612 {
613 #ifdef GRANITE_VULKAN_FILESYSTEM
614 size_t size = get_pipeline_cache_size();
615 if (!size)
616 {
617 LOGE("Failed to get pipeline cache size.\n");
618 return;
619 }
620
621 auto file = Granite::Global::filesystem()->open(Util::join("cache://pipeline_cache_", get_pipeline_cache_string(), ".bin"),
622 Granite::FileMode::WriteOnly);
623 if (!file)
624 {
625 LOGE("Failed to open pipeline cache file for writing.\n");
626 return;
627 }
628
629 uint8_t *data = static_cast<uint8_t *>(file->map_write(size));
630 if (!data)
631 {
632 LOGE("Failed to map pipeline cache file for writing.\n");
633 return;
634 }
635
636 if (!get_pipeline_cache_data(data, size))
637 {
638 LOGE("Failed to get pipeline cache data.\n");
639 return;
640 }
641 #endif
642 }
643
644 void Device::init_workarounds()
645 {
646 workarounds = {};
647
648 #ifdef __APPLE__
649 // Events are not supported in MoltenVK.
650 workarounds.emulate_event_as_pipeline_barrier = true;
651 LOGW("Emulating events as pipeline barriers on Metal emulation.\n");
652 #else
653 if (gpu_props.vendorID == VENDOR_ID_NVIDIA &&
654 #ifdef _WIN32
655 VK_VERSION_MAJOR(gpu_props.driverVersion) < 417)
656 #else
657 VK_VERSION_MAJOR(gpu_props.driverVersion) < 415)
658 #endif
659 {
660 workarounds.force_store_in_render_pass = true;
661 LOGW("Detected workaround for render pass STORE_OP_STORE.\n");
662 }
663
664 if (gpu_props.vendorID == VENDOR_ID_QCOM)
665 {
666 // Apparently, we need to use STORE_OP_STORE in all render passes no matter what ...
667 workarounds.force_store_in_render_pass = true;
668 workarounds.broken_color_write_mask = true;
669 LOGW("Detected workaround for render pass STORE_OP_STORE.\n");
670 LOGW("Detected workaround for broken color write masks.\n");
671 }
672
673 // UNDEFINED -> COLOR_ATTACHMENT_OPTIMAL stalls, so need to acquire async.
674 if (gpu_props.vendorID == VENDOR_ID_ARM)
675 {
676 LOGW("Workaround applied: Acquiring WSI images early on Mali.\n");
677 LOGW("Workaround applied: Emulating events as pipeline barriers.\n");
678 LOGW("Workaround applied: Optimize ALL_GRAPHICS_BIT barriers.\n");
679
680 // All performance related workarounds.
681 workarounds.wsi_acquire_barrier_is_expensive = true;
682 workarounds.emulate_event_as_pipeline_barrier = true;
683 workarounds.optimize_all_graphics_barrier = true;
684 }
685 #endif
686 }
687
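// One-time initialization from the Context: copies queue handles and device properties,
// then sets up workarounds, stock samplers, the pipeline cache, timeline semaphores,
// bindless allocators, frame contexts and the various pool managers.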
688 void Device::set_context(const Context &context)
689 {
690 table = &context.get_device_table();
691
692 #ifdef GRANITE_VULKAN_MT
693 register_thread_index(0);
694 #endif
695 instance = context.get_instance();
696 gpu = context.get_gpu();
697 device = context.get_device();
698 num_thread_indices = context.get_num_thread_indices();
699
700 graphics_queue_family_index = context.get_graphics_queue_family();
701 graphics_queue = context.get_graphics_queue();
702 compute_queue_family_index = context.get_compute_queue_family();
703 compute_queue = context.get_compute_queue();
704 transfer_queue_family_index = context.get_transfer_queue_family();
705 transfer_queue = context.get_transfer_queue();
706 timestamp_valid_bits = context.get_timestamp_valid_bits();
707
708 mem_props = context.get_mem_props();
709 gpu_props = context.get_gpu_props();
710 ext = context.get_enabled_device_features();
711
712 init_workarounds();
713
714 init_stock_samplers();
715 init_pipeline_cache();
716
717 init_timeline_semaphores();
718 init_bindless();
719
720 #ifdef ANDROID
721 init_frame_contexts(3); // Android needs a bit more ... ;)
722 #else
723 init_frame_contexts(2); // By default, regular double buffer between CPU and GPU.
724 #endif
725
726 managers.memory.init(this);
727 managers.memory.set_supports_dedicated_allocation(ext.supports_dedicated);
728 managers.semaphore.init(this);
729 managers.fence.init(this);
730 managers.event.init(this);
731 managers.vbo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
732 ImplementationQuirks::get().staging_need_device_local);
733 managers.ibo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
734 ImplementationQuirks::get().staging_need_device_local);
735 managers.ubo.init(this, 256 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.minUniformBufferOffsetAlignment),
736 VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
737 ImplementationQuirks::get().staging_need_device_local);
738 managers.ubo.set_spill_region_size(VULKAN_MAX_UBO_SIZE);
739 managers.staging.init(this, 64 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.optimalBufferCopyOffsetAlignment),
740 VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
741 false);
742
743 graphics.performance_query_pool.init_device(this, graphics_queue_family_index);
744 if (graphics_queue_family_index != compute_queue_family_index)
745 compute.performance_query_pool.init_device(this, compute_queue_family_index);
746 if (graphics_queue_family_index != transfer_queue_family_index &&
747 compute_queue_family_index != transfer_queue_family_index)
748 {
749 transfer.performance_query_pool.init_device(this, transfer_queue_family_index);
750 }
751
752 #ifdef GRANITE_VULKAN_FOSSILIZE
753 init_pipeline_state();
754 #endif
755 #ifdef GRANITE_VULKAN_FILESYSTEM
756 init_shader_manager_cache();
757 #endif
758
759 init_calibrated_timestamps();
760 }
761
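// Creates the shared descriptor set allocators used for bindless separate sampled images
// (one integer variant, one floating-point variant). Requires descriptor indexing support.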
762 void Device::init_bindless()
763 {
764 if (!ext.supports_descriptor_indexing)
765 return;
766
767 DescriptorSetLayout layout;
768
769 layout.array_size[0] = DescriptorSetLayout::UNSIZED_ARRAY;
770 for (unsigned i = 1; i < VULKAN_NUM_BINDINGS; i++)
771 layout.array_size[i] = 1;
772
773 layout.separate_image_mask = 1;
774 uint32_t stages_for_sets[VULKAN_NUM_BINDINGS] = { VK_SHADER_STAGE_ALL };
775 bindless_sampled_image_allocator_integer = request_descriptor_set_allocator(layout, stages_for_sets);
776 layout.fp_mask = 1;
777 bindless_sampled_image_allocator_fp = request_descriptor_set_allocator(layout, stages_for_sets);
778 }
779
780 void Device::init_timeline_semaphores()
781 {
782 if (!ext.timeline_semaphore_features.timelineSemaphore)
783 return;
784
785 VkSemaphoreTypeCreateInfoKHR type_info = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR };
786 VkSemaphoreCreateInfo info = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO };
787 info.pNext = &type_info;
788 type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR;
789 type_info.initialValue = 0;
790 if (table->vkCreateSemaphore(device, &info, nullptr, &graphics.timeline_semaphore) != VK_SUCCESS)
791 LOGE("Failed to create timeline semaphore.\n");
792 if (table->vkCreateSemaphore(device, &info, nullptr, &compute.timeline_semaphore) != VK_SUCCESS)
793 LOGE("Failed to create timeline semaphore.\n");
794 if (table->vkCreateSemaphore(device, &info, nullptr, &transfer.timeline_semaphore) != VK_SUCCESS)
795 LOGE("Failed to create timeline semaphore.\n");
796 }
797
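// Creates the fixed set of stock samplers (nearest/linear/trilinear, clamp/wrap, shadow
// compare, YUV) and, when supported, the YCbCr conversion objects they rely on.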
798 void Device::init_stock_samplers()
799 {
800 if (ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
801 {
802 for (auto &sampler : samplers_ycbcr)
803 {
804 if (sampler)
805 table->vkDestroySamplerYcbcrConversion(device, sampler, nullptr);
806 sampler = VK_NULL_HANDLE;
807 }
808
809 VkSamplerYcbcrConversionCreateInfo info = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO };
810 info.ycbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709;
811 info.ycbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_NARROW;
812 info.components = {
813 VK_COMPONENT_SWIZZLE_IDENTITY,
814 VK_COMPONENT_SWIZZLE_IDENTITY,
815 VK_COMPONENT_SWIZZLE_IDENTITY,
816 VK_COMPONENT_SWIZZLE_IDENTITY,
817 };
818 info.chromaFilter = VK_FILTER_LINEAR;
819 info.xChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
820 info.yChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
821 info.forceExplicitReconstruction = VK_FALSE;
822
823 info.format = VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
824 table->vkCreateSamplerYcbcrConversionKHR(device, &info, nullptr,
825 &samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV420P_3PLANE)]);
826
827 info.format = VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM;
828 table->vkCreateSamplerYcbcrConversionKHR(device, &info, nullptr,
829 &samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV422P_3PLANE)]);
830
831 info.format = VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM;
832 table->vkCreateSamplerYcbcrConversionKHR(device, &info, nullptr,
833 &samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV444P_3PLANE)]);
834 }
835
836 SamplerCreateInfo info = {};
837 info.max_lod = VK_LOD_CLAMP_NONE;
838 info.max_anisotropy = 1.0f;
839
840 for (unsigned i = 0; i < static_cast<unsigned>(StockSampler::Count); i++)
841 {
842 auto mode = static_cast<StockSampler>(i);
843
844 switch (mode)
845 {
846 case StockSampler::NearestShadow:
847 case StockSampler::LinearShadow:
848 info.compare_enable = true;
849 info.compare_op = VK_COMPARE_OP_LESS_OR_EQUAL;
850 break;
851
852 default:
853 info.compare_enable = false;
854 break;
855 }
856
857 switch (mode)
858 {
859 case StockSampler::TrilinearClamp:
860 case StockSampler::TrilinearWrap:
861 info.mipmap_mode = VK_SAMPLER_MIPMAP_MODE_LINEAR;
862 break;
863
864 default:
865 info.mipmap_mode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
866 break;
867 }
868
869 switch (mode)
870 {
871 case StockSampler::LinearClamp:
872 case StockSampler::LinearWrap:
873 case StockSampler::TrilinearClamp:
874 case StockSampler::TrilinearWrap:
875 case StockSampler::LinearShadow:
876 case StockSampler::LinearYUV420P:
877 case StockSampler::LinearYUV422P:
878 case StockSampler::LinearYUV444P:
879 info.mag_filter = VK_FILTER_LINEAR;
880 info.min_filter = VK_FILTER_LINEAR;
881 break;
882
883 default:
884 info.mag_filter = VK_FILTER_NEAREST;
885 info.min_filter = VK_FILTER_NEAREST;
886 break;
887 }
888
889 switch (mode)
890 {
891 default:
892 case StockSampler::LinearWrap:
893 case StockSampler::NearestWrap:
894 case StockSampler::TrilinearWrap:
895 info.address_mode_u = VK_SAMPLER_ADDRESS_MODE_REPEAT;
896 info.address_mode_v = VK_SAMPLER_ADDRESS_MODE_REPEAT;
897 info.address_mode_w = VK_SAMPLER_ADDRESS_MODE_REPEAT;
898 break;
899
900 case StockSampler::LinearClamp:
901 case StockSampler::NearestClamp:
902 case StockSampler::TrilinearClamp:
903 case StockSampler::NearestShadow:
904 case StockSampler::LinearShadow:
905 case StockSampler::LinearYUV420P:
906 case StockSampler::LinearYUV422P:
907 case StockSampler::LinearYUV444P:
908 info.address_mode_u = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
909 info.address_mode_v = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
910 info.address_mode_w = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
911 break;
912 }
913
914 samplers[i] = create_sampler(info, mode);
915 }
916 }
917
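// Retires the current buffer block and requests a new one of the given size (or clears it
// when size is zero). Untouched blocks of the standard size go straight back to the pool;
// partially written ones are queued for a CPU-to-GPU copy (when the CPU and GPU buffers
// differ) and recycled with the current frame.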
918 static void request_block(Device &device, BufferBlock &block, VkDeviceSize size,
919 BufferPool &pool, std::vector<BufferBlock> *dma, std::vector<BufferBlock> &recycle)
920 {
921 if (block.mapped)
922 device.unmap_host_buffer(*block.cpu, MEMORY_ACCESS_WRITE_BIT);
923
924 if (block.offset == 0)
925 {
926 if (block.size == pool.get_block_size())
927 pool.recycle_block(move(block));
928 }
929 else
930 {
931 if (block.cpu != block.gpu)
932 {
933 VK_ASSERT(dma);
934 dma->push_back(block);
935 }
936
937 if (block.size == pool.get_block_size())
938 recycle.push_back(block);
939 }
940
941 if (size)
942 block = pool.request_block(size);
943 else
944 block = {};
945 }
946
947 void Device::request_vertex_block(BufferBlock &block, VkDeviceSize size)
948 {
949 LOCK();
950 request_vertex_block_nolock(block, size);
951 }
952
953 void Device::request_vertex_block_nolock(BufferBlock &block, VkDeviceSize size)
954 {
955 request_block(*this, block, size, managers.vbo, &dma.vbo, frame().vbo_blocks);
956 }
957
958 void Device::request_index_block(BufferBlock &block, VkDeviceSize size)
959 {
960 LOCK();
961 request_index_block_nolock(block, size);
962 }
963
964 void Device::request_index_block_nolock(BufferBlock &block, VkDeviceSize size)
965 {
966 request_block(*this, block, size, managers.ibo, &dma.ibo, frame().ibo_blocks);
967 }
968
969 void Device::request_uniform_block(BufferBlock &block, VkDeviceSize size)
970 {
971 LOCK();
972 request_uniform_block_nolock(block, size);
973 }
974
975 void Device::request_uniform_block_nolock(BufferBlock &block, VkDeviceSize size)
976 {
977 request_block(*this, block, size, managers.ubo, &dma.ubo, frame().ubo_blocks);
978 }
979
980 void Device::request_staging_block(BufferBlock &block, VkDeviceSize size)
981 {
982 LOCK();
983 request_staging_block_nolock(block, size);
984 }
985
986 void Device::request_staging_block_nolock(BufferBlock &block, VkDeviceSize size)
987 {
988 request_block(*this, block, size, managers.staging, nullptr, frame().staging_blocks);
989 }
990
991 void Device::submit(CommandBufferHandle &cmd, Fence *fence, unsigned semaphore_count, Semaphore *semaphores)
992 {
993 cmd->end_debug_channel();
994
995 LOCK();
996 submit_nolock(move(cmd), fence, semaphore_count, semaphores);
997 }
998
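// AsyncGraphics resolves to the async compute queue when the implementation exposes a
// second queue in the same family as graphics; otherwise it aliases the generic graphics
// queue. All other types map to themselves.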
999 CommandBuffer::Type Device::get_physical_queue_type(CommandBuffer::Type queue_type) const
1000 {
1001 if (queue_type != CommandBuffer::Type::AsyncGraphics)
1002 {
1003 return queue_type;
1004 }
1005 else
1006 {
1007 if (graphics_queue_family_index == compute_queue_family_index && graphics_queue != compute_queue)
1008 return CommandBuffer::Type::AsyncCompute;
1009 else
1010 return CommandBuffer::Type::Generic;
1011 }
1012 }
1013
1014 void Device::submit_nolock(CommandBufferHandle cmd, Fence *fence, unsigned semaphore_count, Semaphore *semaphores)
1015 {
1016 auto type = cmd->get_command_buffer_type();
1017 auto &submissions = get_queue_submissions(type);
1018 #ifdef VULKAN_DEBUG
1019 auto &pool = get_command_pool(type, cmd->get_thread_index());
1020 pool.signal_submitted(cmd->get_command_buffer());
1021 #endif
1022
1023 bool profiled_submit = cmd->has_profiling();
1024
1025 if (profiled_submit)
1026 {
1027 LOGI("Submitting profiled command buffer, draining GPU.\n");
1028 auto &query_pool = get_performance_query_pool(type);
1029 // Profiled submit, drain GPU before submitting to make sure there's no overlap going on.
1030 query_pool.end_command_buffer(cmd->get_command_buffer());
1031 Fence drain_fence;
1032 submit_empty_nolock(type, &drain_fence, 0, nullptr, -1);
1033 drain_fence->wait();
1034 drain_fence->set_internal_sync_object();
1035 }
1036
1037 cmd->end();
1038 submissions.push_back(move(cmd));
1039
1040 InternalFence signalled_fence;
1041
1042 if (fence || semaphore_count)
1043 {
1044 submit_queue(type, fence ? &signalled_fence : nullptr,
1045 semaphore_count, semaphores,
1046 profiled_submit ? 0 : -1);
1047 }
1048
1049 if (fence)
1050 {
1051 VK_ASSERT(!*fence);
1052 if (signalled_fence.value)
1053 *fence = Fence(handle_pool.fences.allocate(this, signalled_fence.value, signalled_fence.timeline));
1054 else
1055 *fence = Fence(handle_pool.fences.allocate(this, signalled_fence.fence));
1056 }
1057
1058 if (profiled_submit)
1059 {
1060 // Drain queue again and report results.
1061 LOGI("Submitted profiled command buffer, draining GPU and reporting ...\n");
1062 auto &query_pool = get_performance_query_pool(type);
1063 Fence drain_fence;
1064 submit_empty_nolock(type, &drain_fence, 0, nullptr, fence || semaphore_count ? -1 : 0);
1065 drain_fence->wait();
1066 drain_fence->set_internal_sync_object();
1067 query_pool.report();
1068 }
1069
1070 decrement_frame_counter_nolock();
1071 }
1072
1073 void Device::submit_empty(CommandBuffer::Type type, Fence *fence,
1074 unsigned semaphore_count, Semaphore *semaphores)
1075 {
1076 LOCK();
1077 submit_empty_nolock(type, fence, semaphore_count, semaphores, -1);
1078 }
1079
1080 void Device::submit_empty_nolock(CommandBuffer::Type type, Fence *fence,
1081 unsigned semaphore_count, Semaphore *semaphores, int profiling_iteration)
1082 {
1083 if (type != CommandBuffer::Type::AsyncTransfer)
1084 flush_frame(CommandBuffer::Type::AsyncTransfer);
1085
1086 InternalFence signalled_fence;
1087 submit_queue(type, fence ? &signalled_fence : nullptr, semaphore_count, semaphores, profiling_iteration);
1088 if (fence)
1089 {
1090 if (signalled_fence.value)
1091 *fence = Fence(handle_pool.fences.allocate(this, signalled_fence.value, signalled_fence.timeline));
1092 else
1093 *fence = Fence(handle_pool.fences.allocate(this, signalled_fence.fence));
1094 }
1095 }
1096
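// Submits no command buffers, only the pending wait semaphores, requested signal
// semaphores and optional fence, bumping the per-queue timeline. Used for fencing,
// cross-queue synchronization and draining.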
1097 void Device::submit_empty_inner(CommandBuffer::Type type, InternalFence *fence,
1098 unsigned semaphore_count, Semaphore *semaphores)
1099 {
1100 auto &data = get_queue_data(type);
1101 VkSubmitInfo submit = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
1102 VkTimelineSemaphoreSubmitInfoKHR timeline_info = { VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR };
1103
1104 if (ext.timeline_semaphore_features.timelineSemaphore)
1105 submit.pNext = &timeline_info;
1106
1107 VkSemaphore timeline_semaphore = data.timeline_semaphore;
1108 uint64_t timeline_value = ++data.current_timeline;
1109
1110 VkQueue queue = get_vk_queue(type);
1111 switch (type)
1112 {
1113 default:
1114 case CommandBuffer::Type::Generic:
1115 frame().timeline_fence_graphics = data.current_timeline;
1116 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1117 if (ext.timeline_semaphore_features.timelineSemaphore)
1118 {
1119 LOGI("Signal graphics: (%p) %u\n",
1120 reinterpret_cast<void *>(timeline_semaphore),
1121 unsigned(data.current_timeline));
1122 }
1123 #endif
1124 break;
1125
1126 case CommandBuffer::Type::AsyncCompute:
1127 frame().timeline_fence_compute = data.current_timeline;
1128 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1129 if (ext.timeline_semaphore_features.timelineSemaphore)
1130 {
1131 LOGI("Signal compute: (%p) %u\n",
1132 reinterpret_cast<void *>(timeline_semaphore),
1133 unsigned(data.current_timeline));
1134 }
1135 #endif
1136 break;
1137
1138 case CommandBuffer::Type::AsyncTransfer:
1139 frame().timeline_fence_transfer = data.current_timeline;
1140 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1141 if (ext.timeline_semaphore_features.timelineSemaphore)
1142 {
1143 LOGI("Signal transfer: (%p) %u\n",
1144 reinterpret_cast<void *>(timeline_semaphore),
1145 unsigned(data.current_timeline));
1146 }
1147 #endif
1148 break;
1149 }
1150
1151 // Add external signal semaphores.
1152 SmallVector<VkSemaphore> signals;
1153 if (ext.timeline_semaphore_features.timelineSemaphore)
1154 {
1155 // Signal once and distribute the timeline value to all.
1156 timeline_info.signalSemaphoreValueCount = 1;
1157 timeline_info.pSignalSemaphoreValues = &timeline_value;
1158 submit.signalSemaphoreCount = 1;
1159 submit.pSignalSemaphores = &timeline_semaphore;
1160
1161 if (fence)
1162 {
1163 fence->timeline = timeline_semaphore;
1164 fence->value = timeline_value;
1165 fence->fence = VK_NULL_HANDLE;
1166 }
1167
1168 for (unsigned i = 0; i < semaphore_count; i++)
1169 {
1170 VK_ASSERT(!semaphores[i]);
1171 semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, timeline_value, timeline_semaphore));
1172 }
1173 }
1174 else
1175 {
1176 if (fence)
1177 {
1178 fence->timeline = VK_NULL_HANDLE;
1179 fence->value = 0;
1180 }
1181
1182 for (unsigned i = 0; i < semaphore_count; i++)
1183 {
1184 VkSemaphore cleared_semaphore = managers.semaphore.request_cleared_semaphore();
1185 signals.push_back(cleared_semaphore);
1186 VK_ASSERT(!semaphores[i]);
1187 semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, cleared_semaphore, true));
1188 }
1189
1190 submit.signalSemaphoreCount = signals.size();
1191 if (!signals.empty())
1192 submit.pSignalSemaphores = signals.data();
1193 }
1194
1195 // Add external wait semaphores.
1196 SmallVector<VkSemaphore> waits;
1197 SmallVector<uint64_t> waits_count;
1198 auto stages = move(data.wait_stages);
1199
1200 for (auto &semaphore : data.wait_semaphores)
1201 {
1202 auto wait = semaphore->consume();
1203 if (!semaphore->get_timeline_value())
1204 {
1205 if (semaphore->can_recycle())
1206 frame().recycled_semaphores.push_back(wait);
1207 else
1208 frame().destroyed_semaphores.push_back(wait);
1209 }
1210 waits.push_back(wait);
1211 waits_count.push_back(semaphore->get_timeline_value());
1212 }
1213
1214 data.wait_stages.clear();
1215 data.wait_semaphores.clear();
1216
1217 submit.waitSemaphoreCount = waits.size();
1218 if (!stages.empty())
1219 submit.pWaitDstStageMask = stages.data();
1220 if (!waits.empty())
1221 submit.pWaitSemaphores = waits.data();
1222
1223 if (!waits_count.empty())
1224 {
1225 timeline_info.waitSemaphoreValueCount = waits_count.size();
1226 timeline_info.pWaitSemaphoreValues = waits_count.data();
1227 }
1228
1229 VkFence cleared_fence = fence && !ext.timeline_semaphore_features.timelineSemaphore ?
1230 managers.fence.request_cleared_fence() :
1231 VK_NULL_HANDLE;
1232 if (fence)
1233 fence->fence = cleared_fence;
1234
1235 QueryPoolHandle start_ts, end_ts;
1236 if (json_timestamp_origin)
1237 start_ts = write_calibrated_timestamp_nolock();
1238
1239 if (queue_lock_callback)
1240 queue_lock_callback();
1241 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1242 if (cleared_fence)
1243 LOGI("Signalling Fence: %llx\n", reinterpret_cast<unsigned long long>(cleared_fence));
1244 #endif
1245
1246 VkResult result = table->vkQueueSubmit(queue, 1, &submit, cleared_fence);
1247 if (ImplementationQuirks::get().queue_wait_on_submission)
1248 table->vkQueueWaitIdle(queue);
1249 if (queue_unlock_callback)
1250 queue_unlock_callback();
1251
1252 if (json_timestamp_origin)
1253 {
1254 end_ts = write_calibrated_timestamp_nolock();
1255 register_time_interval_nolock("CPU", std::move(start_ts), std::move(end_ts), "submit", "");
1256 }
1257
1258 if (result != VK_SUCCESS)
1259 LOGE("vkQueueSubmit failed (code: %d).\n", int(result));
1260 if (result == VK_ERROR_DEVICE_LOST)
1261 report_checkpoints();
1262
1263 if (!ext.timeline_semaphore_features.timelineSemaphore)
1264 data.need_fence = true;
1265
1266 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1267 const char *queue_name = nullptr;
1268 switch (type)
1269 {
1270 default:
1271 case CommandBuffer::Type::Generic:
1272 queue_name = "Graphics";
1273 break;
1274 case CommandBuffer::Type::AsyncCompute:
1275 queue_name = "Compute";
1276 break;
1277 case CommandBuffer::Type::AsyncTransfer:
1278 queue_name = "Transfer";
1279 break;
1280 }
1281
1282 LOGI("Empty submission to %s queue:\n", queue_name);
1283 for (uint32_t i = 0; i < submit.waitSemaphoreCount; i++)
1284 {
1285 LOGI(" Waiting for semaphore: %llx in stages %s\n",
1286 reinterpret_cast<unsigned long long>(submit.pWaitSemaphores[i]),
1287 stage_flags_to_string(submit.pWaitDstStageMask[i]).c_str());
1288 }
1289
1290 for (uint32_t i = 0; i < submit.signalSemaphoreCount; i++)
1291 {
1292 LOGI(" Signalling semaphore: %llx\n",
1293 reinterpret_cast<unsigned long long>(submit.pSignalSemaphores[i]));
1294 }
1295 #endif
1296 }
1297
1298 Fence Device::request_legacy_fence()
1299 {
1300 VkFence fence = managers.fence.request_cleared_fence();
1301 return Fence(handle_pool.fences.allocate(this, fence));
1302 }
1303
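// Submits a staging command buffer and makes its transfer writes visible to later
// consumers: a plain pipeline barrier on single-queue setups, or cross-queue semaphores
// to the graphics and/or compute queues otherwise.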
1304 void Device::submit_staging(CommandBufferHandle &cmd, VkBufferUsageFlags usage, bool flush)
1305 {
1306 auto access = buffer_usage_to_possible_access(usage);
1307 auto stages = buffer_usage_to_possible_stages(usage);
1308 VkQueue src_queue = get_vk_queue(cmd->get_command_buffer_type());
1309
1310 if (src_queue == graphics_queue && src_queue == compute_queue)
1311 {
1312 // For single-queue systems, just use a pipeline barrier.
1313 cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, stages, access);
1314 submit_nolock(cmd, nullptr, 0, nullptr);
1315 }
1316 else
1317 {
1318 auto compute_stages = stages &
1319 (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
1320 VK_PIPELINE_STAGE_TRANSFER_BIT |
1321 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT);
1322
1323 auto compute_access = access &
1324 (VK_ACCESS_SHADER_READ_BIT |
1325 VK_ACCESS_SHADER_WRITE_BIT |
1326 VK_ACCESS_TRANSFER_READ_BIT |
1327 VK_ACCESS_UNIFORM_READ_BIT |
1328 VK_ACCESS_TRANSFER_WRITE_BIT |
1329 VK_ACCESS_INDIRECT_COMMAND_READ_BIT);
1330
1331 auto graphics_stages = stages;
1332
1333 if (src_queue == graphics_queue)
1334 {
1335 cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
1336 graphics_stages, access);
1337
1338 if (compute_stages != 0)
1339 {
1340 Semaphore sem;
1341 submit_nolock(cmd, nullptr, 1, &sem);
1342 add_wait_semaphore_nolock(CommandBuffer::Type::AsyncCompute, sem, compute_stages, flush);
1343 }
1344 else
1345 submit_nolock(cmd, nullptr, 0, nullptr);
1346 }
1347 else if (src_queue == compute_queue)
1348 {
1349 cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
1350 compute_stages, compute_access);
1351
1352 if (graphics_stages != 0)
1353 {
1354 Semaphore sem;
1355 submit_nolock(cmd, nullptr, 1, &sem);
1356 add_wait_semaphore_nolock(CommandBuffer::Type::Generic, sem, graphics_stages, flush);
1357 }
1358 else
1359 submit_nolock(cmd, nullptr, 0, nullptr);
1360 }
1361 else
1362 {
1363 if (graphics_stages != 0 && compute_stages != 0)
1364 {
1365 Semaphore semaphores[2];
1366 submit_nolock(cmd, nullptr, 2, semaphores);
1367 add_wait_semaphore_nolock(CommandBuffer::Type::Generic, semaphores[0], graphics_stages, flush);
1368 add_wait_semaphore_nolock(CommandBuffer::Type::AsyncCompute, semaphores[1], compute_stages, flush);
1369 }
1370 else if (graphics_stages != 0)
1371 {
1372 Semaphore sem;
1373 submit_nolock(cmd, nullptr, 1, &sem);
1374 add_wait_semaphore_nolock(CommandBuffer::Type::Generic, sem, graphics_stages, flush);
1375 }
1376 else if (compute_stages != 0)
1377 {
1378 Semaphore sem;
1379 submit_nolock(cmd, nullptr, 1, &sem);
1380 add_wait_semaphore_nolock(CommandBuffer::Type::AsyncCompute, sem, compute_stages, flush);
1381 }
1382 else
1383 submit_nolock(cmd, nullptr, 0, nullptr);
1384 }
1385 }
1386 }
1387
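// Flushes all queued command buffers for the (physical) queue type in a single
// vkQueueSubmit call, splitting the batch where the swapchain is first touched so the
// WSI acquire semaphore is waited on, and the release semaphore signalled, by the
// correct submission.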
1388 void Device::submit_queue(CommandBuffer::Type type, InternalFence *fence,
1389 unsigned semaphore_count, Semaphore *semaphores, int profiling_iteration)
1390 {
1391 type = get_physical_queue_type(type);
1392
1393 // Always check if we need to flush pending transfers.
1394 if (type != CommandBuffer::Type::AsyncTransfer)
1395 flush_frame(CommandBuffer::Type::AsyncTransfer);
1396
1397 auto &data = get_queue_data(type);
1398 auto &submissions = get_queue_submissions(type);
1399
1400 if (submissions.empty())
1401 {
1402 if (fence || semaphore_count)
1403 submit_empty_inner(type, fence, semaphore_count, semaphores);
1404 return;
1405 }
1406
1407 VkSemaphore timeline_semaphore = data.timeline_semaphore;
1408 uint64_t timeline_value = ++data.current_timeline;
1409
1410 VkQueue queue = get_vk_queue(type);
1411 switch (type)
1412 {
1413 default:
1414 case CommandBuffer::Type::Generic:
1415 frame().timeline_fence_graphics = data.current_timeline;
1416 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1417 LOGI("Signal graphics: (%p) %u\n",
1418 reinterpret_cast<void *>(timeline_semaphore),
1419 unsigned(data.current_timeline));
1420 #endif
1421 break;
1422
1423 case CommandBuffer::Type::AsyncCompute:
1424 frame().timeline_fence_compute = data.current_timeline;
1425 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1426 LOGI("Signal compute: (%p) %u\n",
1427 reinterpret_cast<void *>(timeline_semaphore),
1428 unsigned(data.current_timeline));
1429 #endif
1430 break;
1431
1432 case CommandBuffer::Type::AsyncTransfer:
1433 frame().timeline_fence_transfer = data.current_timeline;
1434 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1435 LOGI("Signal transfer: (%p) %u\n",
1436 reinterpret_cast<void *>(timeline_semaphore),
1437 unsigned(data.current_timeline));
1438 #endif
1439 break;
1440 }
1441
1442 SmallVector<VkCommandBuffer> cmds;
1443 cmds.reserve(submissions.size());
1444
1445 SmallVector<VkSubmitInfo> submits;
1446 SmallVector<VkTimelineSemaphoreSubmitInfoKHR> timeline_infos;
1447
1448 submits.reserve(2);
1449 timeline_infos.reserve(2);
1450
1451 size_t last_cmd = 0;
1452
1453 SmallVector<VkSemaphore> waits[2];
1454 SmallVector<uint64_t> wait_counts[2];
1455 SmallVector<VkFlags> wait_stages[2];
1456 SmallVector<VkSemaphore> signals[2];
1457 SmallVector<uint64_t> signal_counts[2];
1458
1459 // Add external wait semaphores.
1460 wait_stages[0] = move(data.wait_stages);
1461
1462 for (auto &semaphore : data.wait_semaphores)
1463 {
1464 auto wait = semaphore->consume();
1465 if (!semaphore->get_timeline_value())
1466 {
1467 if (semaphore->can_recycle())
1468 frame().recycled_semaphores.push_back(wait);
1469 else
1470 frame().destroyed_semaphores.push_back(wait);
1471 }
1472 wait_counts[0].push_back(semaphore->get_timeline_value());
1473 waits[0].push_back(wait);
1474 }
1475 data.wait_stages.clear();
1476 data.wait_semaphores.clear();
1477
1478 for (auto &cmd : submissions)
1479 {
1480 if (cmd->swapchain_touched() && !wsi.touched && !wsi.consumed)
1481 {
1482 if (!cmds.empty())
1483 {
1484 // Push all pending cmd buffers to their own submission.
1485 submits.emplace_back();
1486
1487 timeline_infos.emplace_back();
1488 auto &timeline_info = timeline_infos.back();
1489 timeline_info = { VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR };
1490
1491 auto &submit = submits.back();
1492 submit = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
1493 if (ext.timeline_semaphore_features.timelineSemaphore)
1494 submit.pNext = &timeline_info;
1495
1496 submit.commandBufferCount = cmds.size() - last_cmd;
1497 submit.pCommandBuffers = cmds.data() + last_cmd;
1498 last_cmd = cmds.size();
1499 }
1500 wsi.touched = true;
1501 }
1502
1503 cmds.push_back(cmd->get_command_buffer());
1504 }
1505
1506 if (cmds.size() > last_cmd)
1507 {
1508 unsigned index = submits.size();
1509
1510 // Push all pending cmd buffers to their own submission.
1511 timeline_infos.emplace_back();
1512 auto &timeline_info = timeline_infos.back();
1513 timeline_info = { VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR };
1514
1515 submits.emplace_back();
1516 auto &submit = submits.back();
1517 submit = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
1518
1519 if (ext.timeline_semaphore_features.timelineSemaphore)
1520 submit.pNext = &timeline_info;
1521
1522 submit.commandBufferCount = cmds.size() - last_cmd;
1523 submit.pCommandBuffers = cmds.data() + last_cmd;
1524 if (wsi.touched && !wsi.consumed)
1525 {
1526 static const VkFlags wait = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
1527 if (wsi.acquire && wsi.acquire->get_semaphore() != VK_NULL_HANDLE)
1528 {
1529 VK_ASSERT(wsi.acquire->is_signalled());
1530 VkSemaphore sem = wsi.acquire->consume();
1531
1532 waits[index].push_back(sem);
1533 wait_counts[index].push_back(wsi.acquire->get_timeline_value());
1534 wait_stages[index].push_back(wait);
1535
1536 if (!wsi.acquire->get_timeline_value())
1537 {
1538 if (wsi.acquire->can_recycle())
1539 frame().recycled_semaphores.push_back(sem);
1540 else
1541 frame().destroyed_semaphores.push_back(sem);
1542 }
1543
1544 wsi.acquire.reset();
1545 }
1546
1547 VkSemaphore release = managers.semaphore.request_cleared_semaphore();
1548 wsi.release = Semaphore(handle_pool.semaphores.allocate(this, release, true));
1549 wsi.release->set_internal_sync_object();
1550 signals[index].push_back(wsi.release->get_semaphore());
1551 signal_counts[index].push_back(0);
1552 wsi.consumed = true;
1553 }
1554 last_cmd = cmds.size();
1555 }
1556
1557 VkFence cleared_fence = fence && !ext.timeline_semaphore_features.timelineSemaphore ?
1558 managers.fence.request_cleared_fence() :
1559 VK_NULL_HANDLE;
1560
1561 if (fence)
1562 fence->fence = cleared_fence;
1563
1564 // Add external signal semaphores.
1565 if (ext.timeline_semaphore_features.timelineSemaphore)
1566 {
1567 // Signal once and distribute the timeline value to all.
1568 signals[submits.size() - 1].push_back(timeline_semaphore);
1569 signal_counts[submits.size() - 1].push_back(timeline_value);
1570
1571 if (fence)
1572 {
1573 fence->timeline = timeline_semaphore;
1574 fence->value = timeline_value;
1575 fence->fence = VK_NULL_HANDLE;
1576 }
1577
1578 for (unsigned i = 0; i < semaphore_count; i++)
1579 {
1580 VK_ASSERT(!semaphores[i]);
1581 semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, timeline_value, timeline_semaphore));
1582 }
1583 }
1584 else
1585 {
1586 if (fence)
1587 {
1588 fence->timeline = VK_NULL_HANDLE;
1589 fence->value = 0;
1590 }
1591
1592 for (unsigned i = 0; i < semaphore_count; i++)
1593 {
1594 VkSemaphore cleared_semaphore = managers.semaphore.request_cleared_semaphore();
1595 signals[submits.size() - 1].push_back(cleared_semaphore);
1596 signal_counts[submits.size() - 1].push_back(0);
1597 VK_ASSERT(!semaphores[i]);
1598 semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, cleared_semaphore, true));
1599 }
1600 }
1601
1602 VkPerformanceQuerySubmitInfoKHR profiling_infos[2];
1603
1604 for (unsigned i = 0; i < submits.size(); i++)
1605 {
1606 auto &submit = submits[i];
1607 auto &timeline_submit = timeline_infos[i];
1608
1609 if (profiling_iteration >= 0)
1610 {
1611 profiling_infos[i] = { VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR };
1612 profiling_infos[i].counterPassIndex = uint32_t(profiling_iteration);
1613 if (submit.pNext)
1614 timeline_submit.pNext = &profiling_infos[i];
1615 else
1616 submit.pNext = &profiling_infos[i];
1617 }
1618
1619 submit.waitSemaphoreCount = waits[i].size();
1620 submit.pWaitSemaphores = waits[i].data();
1621 submit.pWaitDstStageMask = wait_stages[i].data();
1622 timeline_submit.waitSemaphoreValueCount = submit.waitSemaphoreCount;
1623 timeline_submit.pWaitSemaphoreValues = wait_counts[i].data();
1624
1625 submit.signalSemaphoreCount = signals[i].size();
1626 submit.pSignalSemaphores = signals[i].data();
1627 timeline_submit.signalSemaphoreValueCount = submit.signalSemaphoreCount;
1628 timeline_submit.pSignalSemaphoreValues = signal_counts[i].data();
1629 }
1630
1631 QueryPoolHandle start_ts, end_ts;
1632 if (json_timestamp_origin)
1633 start_ts = write_calibrated_timestamp_nolock();
1634
1635 if (queue_lock_callback)
1636 queue_lock_callback();
1637 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1638 if (cleared_fence)
1639 LOGI("Signalling fence: %llx\n", reinterpret_cast<unsigned long long>(cleared_fence));
1640 #endif
1641 VkResult result = table->vkQueueSubmit(queue, submits.size(), submits.data(), cleared_fence);
1642 if (ImplementationQuirks::get().queue_wait_on_submission)
1643 table->vkQueueWaitIdle(queue);
1644 if (queue_unlock_callback)
1645 queue_unlock_callback();
1646
1647 if (json_timestamp_origin)
1648 {
1649 end_ts = write_calibrated_timestamp_nolock();
1650 register_time_interval_nolock("CPU", std::move(start_ts), std::move(end_ts), "submit", "");
1651 }
1652
1653 if (result != VK_SUCCESS)
1654 LOGE("vkQueueSubmit failed (code: %d).\n", int(result));
1655 if (result == VK_ERROR_DEVICE_LOST)
1656 report_checkpoints();
1657 submissions.clear();
1658
1659 if (!ext.timeline_semaphore_features.timelineSemaphore)
1660 data.need_fence = true;
1661
1662 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1663 const char *queue_name = nullptr;
1664 switch (type)
1665 {
1666 default:
1667 case CommandBuffer::Type::Generic:
1668 queue_name = "Graphics";
1669 break;
1670 case CommandBuffer::Type::AsyncCompute:
1671 queue_name = "Compute";
1672 break;
1673 case CommandBuffer::Type::AsyncTransfer:
1674 queue_name = "Transfer";
1675 break;
1676 }
1677
1678 for (auto &submit : submits)
1679 {
1680 LOGI("Submission to %s queue:\n", queue_name);
1681 for (uint32_t i = 0; i < submit.waitSemaphoreCount; i++)
1682 {
1683 LOGI(" Waiting for semaphore: %llx in stages %s\n",
1684 reinterpret_cast<unsigned long long>(submit.pWaitSemaphores[i]),
1685 stage_flags_to_string(submit.pWaitDstStageMask[i]).c_str());
1686 }
1687
1688 for (uint32_t i = 0; i < submit.commandBufferCount; i++)
1689 LOGI(" Command Buffer %llx\n", reinterpret_cast<unsigned long long>(submit.pCommandBuffers[i]));
1690
1691 for (uint32_t i = 0; i < submit.signalSemaphoreCount; i++)
1692 {
1693 LOGI(" Signalling semaphore: %llx\n",
1694 reinterpret_cast<unsigned long long>(submit.pSignalSemaphores[i]));
1695 }
1696 }
1697 #endif
1698 }
1699
1700 void Device::flush_frame(CommandBuffer::Type type)
1701 {
1702 if (type == CommandBuffer::Type::AsyncTransfer)
1703 sync_buffer_blocks();
1704 submit_queue(type, nullptr, 0, nullptr);
1705 }
1706
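// Copies any pending chain-allocated vertex/index/uniform buffer blocks from
// their host-visible staging side to the GPU side on the async transfer queue.
// Graphics and compute are intentionally not flushed here (see comment below).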
1707 void Device::sync_buffer_blocks()
1708 {
1709 if (dma.vbo.empty() && dma.ibo.empty() && dma.ubo.empty())
1710 return;
1711
1712 VkBufferUsageFlags usage = 0;
1713
1714 auto cmd = request_command_buffer_nolock(get_thread_index(), CommandBuffer::Type::AsyncTransfer, false);
1715
1716 cmd->begin_region("buffer-block-sync");
1717
1718 for (auto &block : dma.vbo)
1719 {
1720 VK_ASSERT(block.offset != 0);
1721 cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
1722 usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
1723 }
1724
1725 for (auto &block : dma.ibo)
1726 {
1727 VK_ASSERT(block.offset != 0);
1728 cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
1729 usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
1730 }
1731
1732 for (auto &block : dma.ubo)
1733 {
1734 VK_ASSERT(block.offset != 0);
1735 cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
1736 usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
1737 }
1738
1739 dma.vbo.clear();
1740 dma.ibo.clear();
1741 dma.ubo.clear();
1742
1743 cmd->end_region();
1744
1745 // Do not flush graphics or compute in this context.
1746 // We must be able to inject semaphores into all currently enqueued graphics / compute.
1747 submit_staging(cmd, usage, false);
1748 }
1749
1750 void Device::end_frame_context()
1751 {
1752 DRAIN_FRAME_LOCK();
1753 end_frame_nolock();
1754 }
1755
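// Flushes every queue that has pending submissions for this frame and records
// a fence per queue (or relies on the per-queue timeline semaphores) so the
// frame context can later wait for all work submitted during the frame.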
1756 void Device::end_frame_nolock()
1757 {
1758 	// Handles were kept alive until end-of-frame; release them now if appropriate.
1759 for (auto &image : frame().keep_alive_images)
1760 {
1761 image->set_internal_sync_object();
1762 image->get_view().set_internal_sync_object();
1763 }
1764 frame().keep_alive_images.clear();
1765
1766 // Make sure we have a fence which covers all submissions in the frame.
1767 InternalFence fence;
1768
1769 if (transfer.need_fence || !frame().transfer_submissions.empty())
1770 {
1771 submit_queue(CommandBuffer::Type::AsyncTransfer, &fence, 0, nullptr);
1772 if (fence.fence != VK_NULL_HANDLE)
1773 {
1774 frame().wait_fences.push_back(fence.fence);
1775 frame().recycle_fences.push_back(fence.fence);
1776 }
1777 transfer.need_fence = false;
1778 }
1779
1780 if (graphics.need_fence || !frame().graphics_submissions.empty())
1781 {
1782 submit_queue(CommandBuffer::Type::Generic, &fence, 0, nullptr);
1783 if (fence.fence != VK_NULL_HANDLE)
1784 {
1785 frame().wait_fences.push_back(fence.fence);
1786 frame().recycle_fences.push_back(fence.fence);
1787 }
1788 graphics.need_fence = false;
1789 }
1790
1791 if (compute.need_fence || !frame().compute_submissions.empty())
1792 {
1793 submit_queue(CommandBuffer::Type::AsyncCompute, &fence, 0, nullptr);
1794 if (fence.fence != VK_NULL_HANDLE)
1795 {
1796 frame().wait_fences.push_back(fence.fence);
1797 frame().recycle_fences.push_back(fence.fence);
1798 }
1799 compute.need_fence = false;
1800 }
1801 }
1802
1803 void Device::flush_frame()
1804 {
1805 LOCK();
1806 flush_frame_nolock();
1807 }
1808
1809 void Device::flush_frame_nolock()
1810 {
1811 flush_frame(CommandBuffer::Type::AsyncTransfer);
1812 flush_frame(CommandBuffer::Type::Generic);
1813 flush_frame(CommandBuffer::Type::AsyncCompute);
1814 }
1815
1816 Device::QueueData &Device::get_queue_data(CommandBuffer::Type type)
1817 {
1818 switch (get_physical_queue_type(type))
1819 {
1820 default:
1821 case CommandBuffer::Type::Generic:
1822 return graphics;
1823 case CommandBuffer::Type::AsyncCompute:
1824 return compute;
1825 case CommandBuffer::Type::AsyncTransfer:
1826 return transfer;
1827 }
1828 }
1829
1830 VkQueue Device::get_vk_queue(CommandBuffer::Type type) const
1831 {
1832 switch (get_physical_queue_type(type))
1833 {
1834 default:
1835 case CommandBuffer::Type::Generic:
1836 return graphics_queue;
1837 case CommandBuffer::Type::AsyncCompute:
1838 return compute_queue;
1839 case CommandBuffer::Type::AsyncTransfer:
1840 return transfer_queue;
1841 }
1842 }
1843
1844 PerformanceQueryPool &Device::get_performance_query_pool(CommandBuffer::Type type)
1845 {
1846 switch (get_physical_queue_type(type))
1847 {
1848 default:
1849 case CommandBuffer::Type::Generic:
1850 return graphics.performance_query_pool;
1851 case CommandBuffer::Type::AsyncCompute:
1852 if (graphics_queue_family_index == compute_queue_family_index)
1853 return graphics.performance_query_pool;
1854 else
1855 return compute.performance_query_pool;
1856 case CommandBuffer::Type::AsyncTransfer:
1857 if (graphics_queue_family_index == transfer_queue_family_index)
1858 return graphics.performance_query_pool;
1859 else if (compute_queue_family_index == transfer_queue_family_index)
1860 return compute.performance_query_pool;
1861 else
1862 return transfer.performance_query_pool;
1863 }
1864 }
1865
1866 CommandPool &Device::get_command_pool(CommandBuffer::Type type, unsigned thread)
1867 {
1868 switch (get_physical_queue_type(type))
1869 {
1870 default:
1871 case CommandBuffer::Type::Generic:
1872 return frame().graphics_cmd_pool[thread];
1873 case CommandBuffer::Type::AsyncCompute:
1874 return frame().compute_cmd_pool[thread];
1875 case CommandBuffer::Type::AsyncTransfer:
1876 return frame().transfer_cmd_pool[thread];
1877 }
1878 }
1879
1880 Util::SmallVector<CommandBufferHandle> &Device::get_queue_submissions(CommandBuffer::Type type)
1881 {
1882 switch (get_physical_queue_type(type))
1883 {
1884 default:
1885 case CommandBuffer::Type::Generic:
1886 return frame().graphics_submissions;
1887 case CommandBuffer::Type::AsyncCompute:
1888 return frame().compute_submissions;
1889 case CommandBuffer::Type::AsyncTransfer:
1890 return frame().transfer_submissions;
1891 }
1892 }
1893
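// Requests a primary command buffer for the calling thread's index.
// Typical usage is roughly the following sketch (it assumes the public
// Device::submit() entry point declared elsewhere in this interface):
//
//   auto cmd = device.request_command_buffer();
//   // ... record commands ...
//   device.submit(cmd);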
1894 CommandBufferHandle Device::request_command_buffer(CommandBuffer::Type type)
1895 {
1896 return request_command_buffer_for_thread(get_thread_index(), type);
1897 }
1898
1899 CommandBufferHandle Device::request_command_buffer_for_thread(unsigned thread_index, CommandBuffer::Type type)
1900 {
1901 LOCK();
1902 return request_command_buffer_nolock(thread_index, type, false);
1903 }
1904
1905 CommandBufferHandle Device::request_profiled_command_buffer(CommandBuffer::Type type)
1906 {
1907 return request_profiled_command_buffer_for_thread(get_thread_index(), type);
1908 }
1909
1910 CommandBufferHandle Device::request_profiled_command_buffer_for_thread(unsigned thread_index,
1911 CommandBuffer::Type type)
1912 {
1913 LOCK();
1914 return request_command_buffer_nolock(thread_index, type, true);
1915 }
1916
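// Common path for all command buffer requests: pulls a command buffer from the
// per-thread command pool for the queue type, begins it for one-time submit,
// and bumps the frame counter so the frame context cannot advance while the
// command buffer is still being recorded.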
1917 CommandBufferHandle Device::request_command_buffer_nolock(unsigned thread_index, CommandBuffer::Type type, bool profiled)
1918 {
1919 #ifndef GRANITE_VULKAN_MT
1920 VK_ASSERT(thread_index == 0);
1921 #endif
1922 auto cmd = get_command_pool(type, thread_index).request_command_buffer();
1923
1924 if (profiled && !ext.performance_query_features.performanceCounterQueryPools)
1925 {
1926 LOGW("Profiling is not supported on this device.\n");
1927 profiled = false;
1928 }
1929
1930 VkCommandBufferBeginInfo info = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
1931 info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
1932 table->vkBeginCommandBuffer(cmd, &info);
1933 add_frame_counter_nolock();
1934 CommandBufferHandle handle(handle_pool.command_buffers.allocate(this, cmd, pipeline_cache, type));
1935 handle->set_thread_index(thread_index);
1936
1937 if (profiled)
1938 {
1939 auto &query_pool = get_performance_query_pool(type);
1940 handle->enable_profiling();
1941 query_pool.begin_command_buffer(handle->get_command_buffer());
1942 }
1943
1944 return handle;
1945 }
1946
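// Ends a secondary command buffer and records it into the given primary via
// vkCmdExecuteCommands. The secondary no longer counts towards the frame
// counter once it has been handed off here.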
1947 void Device::submit_secondary(CommandBuffer &primary, CommandBuffer &secondary)
1948 {
1949 {
1950 LOCK();
1951 secondary.end();
1952 decrement_frame_counter_nolock();
1953
1954 #ifdef VULKAN_DEBUG
1955 auto &pool = get_command_pool(secondary.get_command_buffer_type(),
1956 secondary.get_thread_index());
1957 pool.signal_submitted(secondary.get_command_buffer());
1958 #endif
1959 }
1960
1961 VkCommandBuffer secondary_cmd = secondary.get_command_buffer();
1962 table->vkCmdExecuteCommands(primary.get_command_buffer(), 1, &secondary_cmd);
1963 }
1964
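// Requests a secondary command buffer which inherits the compatible render
// pass of the given framebuffer and subpass, intended for recording render
// pass contents from worker threads.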
1965 CommandBufferHandle Device::request_secondary_command_buffer_for_thread(unsigned thread_index,
1966 const Framebuffer *framebuffer,
1967 unsigned subpass,
1968 CommandBuffer::Type type)
1969 {
1970 LOCK();
1971
1972 auto cmd = get_command_pool(type, thread_index).request_secondary_command_buffer();
1973 VkCommandBufferBeginInfo info = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
1974 VkCommandBufferInheritanceInfo inherit = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO };
1975
1976 inherit.framebuffer = VK_NULL_HANDLE;
1977 inherit.renderPass = framebuffer->get_compatible_render_pass().get_render_pass();
1978 inherit.subpass = subpass;
1979 info.pInheritanceInfo = &inherit;
1980 info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT | VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1981
1982 table->vkBeginCommandBuffer(cmd, &info);
1983 add_frame_counter_nolock();
1984 CommandBufferHandle handle(handle_pool.command_buffers.allocate(this, cmd, pipeline_cache, type));
1985 handle->set_thread_index(thread_index);
1986 handle->set_is_secondary();
1987 return handle;
1988 }
1989
1990 void Device::set_acquire_semaphore(unsigned index, Semaphore acquire)
1991 {
1992 wsi.acquire = move(acquire);
1993 wsi.index = index;
1994 wsi.touched = false;
1995 wsi.consumed = false;
1996
1997 if (wsi.acquire)
1998 {
1999 wsi.acquire->set_internal_sync_object();
2000 VK_ASSERT(wsi.acquire->is_signalled());
2001 }
2002 }
2003
2004 Semaphore Device::consume_release_semaphore()
2005 {
2006 auto ret = move(wsi.release);
2007 wsi.release.reset();
2008 return ret;
2009 }
2010
2011 const Sampler &Device::get_stock_sampler(StockSampler sampler) const
2012 {
2013 return *samplers[static_cast<unsigned>(sampler)];
2014 }
2015
2016 bool Device::swapchain_touched() const
2017 {
2018 return wsi.touched;
2019 }
2020
2021 Device::~Device()
2022 {
2023 wait_idle();
2024
2025 managers.timestamps.log_simple();
2026
2027 wsi.acquire.reset();
2028 wsi.release.reset();
2029 wsi.swapchain.clear();
2030
2031 if (pipeline_cache != VK_NULL_HANDLE)
2032 {
2033 flush_pipeline_cache();
2034 table->vkDestroyPipelineCache(device, pipeline_cache, nullptr);
2035 }
2036
2037 #ifdef GRANITE_VULKAN_FILESYSTEM
2038 flush_shader_manager_cache();
2039 #endif
2040
2041 #ifdef GRANITE_VULKAN_FOSSILIZE
2042 flush_pipeline_state();
2043 #endif
2044
2045 framebuffer_allocator.clear();
2046 transient_allocator.clear();
2047 for (auto &sampler : samplers)
2048 sampler.reset();
2049
2050 for (auto &sampler : samplers_ycbcr)
2051 if (sampler)
2052 table->vkDestroySamplerYcbcrConversion(device, sampler, nullptr);
2053
2054 deinit_timeline_semaphores();
2055 }
2056
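// Destroys the per-queue timeline semaphores and clears the per-frame wait
// values that reference them, so no frame context attempts to wait on a
// destroyed semaphore afterwards.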
2057 void Device::deinit_timeline_semaphores()
2058 {
2059 if (graphics.timeline_semaphore != VK_NULL_HANDLE)
2060 table->vkDestroySemaphore(device, graphics.timeline_semaphore, nullptr);
2061 if (compute.timeline_semaphore != VK_NULL_HANDLE)
2062 table->vkDestroySemaphore(device, compute.timeline_semaphore, nullptr);
2063 if (transfer.timeline_semaphore != VK_NULL_HANDLE)
2064 table->vkDestroySemaphore(device, transfer.timeline_semaphore, nullptr);
2065
2066 graphics.timeline_semaphore = VK_NULL_HANDLE;
2067 compute.timeline_semaphore = VK_NULL_HANDLE;
2068 transfer.timeline_semaphore = VK_NULL_HANDLE;
2069
2070 // Make sure we don't accidentally try to wait for these after we destroy the semaphores.
2071 for (auto &frame : per_frame)
2072 {
2073 frame->timeline_fence_graphics = 0;
2074 frame->timeline_fence_compute = 0;
2075 frame->timeline_fence_transfer = 0;
2076 frame->graphics_timeline_semaphore = VK_NULL_HANDLE;
2077 frame->compute_timeline_semaphore = VK_NULL_HANDLE;
2078 frame->transfer_timeline_semaphore = VK_NULL_HANDLE;
2079 }
2080 }
2081
2082 void Device::init_frame_contexts(unsigned count)
2083 {
2084 DRAIN_FRAME_LOCK();
2085 wait_idle_nolock();
2086
2087 // Clear out caches which might contain stale data from now on.
2088 framebuffer_allocator.clear();
2089 transient_allocator.clear();
2090 per_frame.clear();
2091
2092 for (unsigned i = 0; i < count; i++)
2093 {
2094 auto frame = unique_ptr<PerFrame>(new PerFrame(this, i));
2095 per_frame.emplace_back(move(frame));
2096 }
2097 }
2098
2099 void Device::init_external_swapchain(const vector<ImageHandle> &swapchain_images)
2100 {
2101 DRAIN_FRAME_LOCK();
2102 wsi.swapchain.clear();
2103 wait_idle_nolock();
2104
2105 wsi.index = 0;
2106 wsi.touched = false;
2107 wsi.consumed = false;
2108 for (auto &image : swapchain_images)
2109 {
2110 wsi.swapchain.push_back(image);
2111 if (image)
2112 {
2113 wsi.swapchain.back()->set_internal_sync_object();
2114 wsi.swapchain.back()->get_view().set_internal_sync_object();
2115 }
2116 }
2117 }
2118
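// Wraps the raw swapchain VkImages in Image handles. The VkImages themselves
// are not owned by us (disown_image), but the views created here are, and both
// image and view are marked as internal sync objects since WSI resources are
// recycled by the device rather than by the caller.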
2119 void Device::init_swapchain(const vector<VkImage> &swapchain_images, unsigned width, unsigned height, VkFormat format)
2120 {
2121 DRAIN_FRAME_LOCK();
2122 wsi.swapchain.clear();
2123 wait_idle_nolock();
2124
2125 const auto info = ImageCreateInfo::render_target(width, height, format);
2126
2127 wsi.index = 0;
2128 wsi.touched = false;
2129 wsi.consumed = false;
2130 for (auto &image : swapchain_images)
2131 {
2132 VkImageViewCreateInfo view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
2133 view_info.image = image;
2134 view_info.format = format;
2135 view_info.components.r = VK_COMPONENT_SWIZZLE_R;
2136 view_info.components.g = VK_COMPONENT_SWIZZLE_G;
2137 view_info.components.b = VK_COMPONENT_SWIZZLE_B;
2138 view_info.components.a = VK_COMPONENT_SWIZZLE_A;
2139 view_info.subresourceRange.aspectMask = format_to_aspect_mask(format);
2140 view_info.subresourceRange.baseMipLevel = 0;
2141 view_info.subresourceRange.baseArrayLayer = 0;
2142 view_info.subresourceRange.levelCount = 1;
2143 view_info.subresourceRange.layerCount = 1;
2144 view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;
2145
2146 VkImageView image_view;
2147 if (table->vkCreateImageView(device, &view_info, nullptr, &image_view) != VK_SUCCESS)
2148 			LOGE("Failed to create view for backbuffer.\n");
2149
2150 auto backbuffer = ImageHandle(handle_pool.images.allocate(this, image, image_view, DeviceAllocation{}, info, VK_IMAGE_VIEW_TYPE_2D));
2151 backbuffer->set_internal_sync_object();
2152 backbuffer->disown_image();
2153 backbuffer->get_view().set_internal_sync_object();
2154 wsi.swapchain.push_back(backbuffer);
2155 set_name(*backbuffer, "backbuffer");
2156 backbuffer->set_swapchain_layout(VK_IMAGE_LAYOUT_PRESENT_SRC_KHR);
2157 }
2158 }
2159
2160 Device::PerFrame::PerFrame(Device *device_, unsigned frame_index_)
2161 : device(*device_)
2162 , frame_index(frame_index_)
2163 , table(device_->get_device_table())
2164 , managers(device_->managers)
2165 , query_pool(device_)
2166 {
2167 graphics_timeline_semaphore = device.graphics.timeline_semaphore;
2168 compute_timeline_semaphore = device.compute.timeline_semaphore;
2169 transfer_timeline_semaphore = device.transfer.timeline_semaphore;
2170
2171 unsigned count = device_->num_thread_indices;
2172 graphics_cmd_pool.reserve(count);
2173 compute_cmd_pool.reserve(count);
2174 transfer_cmd_pool.reserve(count);
2175 for (unsigned i = 0; i < count; i++)
2176 {
2177 graphics_cmd_pool.emplace_back(device_, device_->graphics_queue_family_index);
2178 compute_cmd_pool.emplace_back(device_, device_->compute_queue_family_index);
2179 transfer_cmd_pool.emplace_back(device_, device_->transfer_queue_family_index);
2180 }
2181 }
2182
2183 void Device::keep_handle_alive(ImageHandle handle)
2184 {
2185 LOCK();
2186 frame().keep_alive_images.push_back(move(handle));
2187 }
2188
2189 void Device::free_memory_nolock(const DeviceAllocation &alloc)
2190 {
2191 frame().allocations.push_back(alloc);
2192 }
2193
2194 #ifdef VULKAN_DEBUG
2195
2196 template <typename T, typename U>
2197 static inline bool exists(const T &container, const U &value)
2198 {
2199 return find(begin(container), end(container), value) != end(container);
2200 }
2201
2202 #endif
2203
2204 void Device::destroy_pipeline(VkPipeline pipeline)
2205 {
2206 LOCK();
2207 destroy_pipeline_nolock(pipeline);
2208 }
2209
2210 void Device::reset_fence(VkFence fence, bool observed_wait)
2211 {
2212 LOCK();
2213 reset_fence_nolock(fence, observed_wait);
2214 }
2215
2216 void Device::destroy_buffer(VkBuffer buffer)
2217 {
2218 LOCK();
2219 destroy_buffer_nolock(buffer);
2220 }
2221
2222 void Device::destroy_descriptor_pool(VkDescriptorPool desc_pool)
2223 {
2224 LOCK();
2225 destroy_descriptor_pool_nolock(desc_pool);
2226 }
2227
2228 void Device::destroy_buffer_view(VkBufferView view)
2229 {
2230 LOCK();
2231 destroy_buffer_view_nolock(view);
2232 }
2233
2234 void Device::destroy_event(VkEvent event)
2235 {
2236 LOCK();
2237 destroy_event_nolock(event);
2238 }
2239
2240 void Device::destroy_framebuffer(VkFramebuffer framebuffer)
2241 {
2242 LOCK();
2243 destroy_framebuffer_nolock(framebuffer);
2244 }
2245
2246 void Device::destroy_image(VkImage image)
2247 {
2248 LOCK();
2249 destroy_image_nolock(image);
2250 }
2251
2252 void Device::destroy_semaphore(VkSemaphore semaphore)
2253 {
2254 LOCK();
2255 destroy_semaphore_nolock(semaphore);
2256 }
2257
2258 void Device::recycle_semaphore(VkSemaphore semaphore)
2259 {
2260 LOCK();
2261 recycle_semaphore_nolock(semaphore);
2262 }
2263
2264 void Device::free_memory(const DeviceAllocation &alloc)
2265 {
2266 LOCK();
2267 free_memory_nolock(alloc);
2268 }
2269
2270 void Device::destroy_sampler(VkSampler sampler)
2271 {
2272 LOCK();
2273 destroy_sampler_nolock(sampler);
2274 }
2275
2276 void Device::destroy_image_view(VkImageView view)
2277 {
2278 LOCK();
2279 destroy_image_view_nolock(view);
2280 }
2281
2282 void Device::destroy_pipeline_nolock(VkPipeline pipeline)
2283 {
2284 VK_ASSERT(!exists(frame().destroyed_pipelines, pipeline));
2285 frame().destroyed_pipelines.push_back(pipeline);
2286 }
2287
2288 void Device::destroy_image_view_nolock(VkImageView view)
2289 {
2290 VK_ASSERT(!exists(frame().destroyed_image_views, view));
2291 frame().destroyed_image_views.push_back(view);
2292 }
2293
2294 void Device::destroy_buffer_view_nolock(VkBufferView view)
2295 {
2296 VK_ASSERT(!exists(frame().destroyed_buffer_views, view));
2297 frame().destroyed_buffer_views.push_back(view);
2298 }
2299
2300 void Device::destroy_semaphore_nolock(VkSemaphore semaphore)
2301 {
2302 VK_ASSERT(!exists(frame().destroyed_semaphores, semaphore));
2303 frame().destroyed_semaphores.push_back(semaphore);
2304 }
2305
2306 void Device::recycle_semaphore_nolock(VkSemaphore semaphore)
2307 {
2308 VK_ASSERT(!exists(frame().recycled_semaphores, semaphore));
2309 frame().recycled_semaphores.push_back(semaphore);
2310 }
2311
2312 void Device::destroy_event_nolock(VkEvent event)
2313 {
2314 VK_ASSERT(!exists(frame().recycled_events, event));
2315 frame().recycled_events.push_back(event);
2316 }
2317
2318 void Device::reset_fence_nolock(VkFence fence, bool observed_wait)
2319 {
2320 if (observed_wait)
2321 {
2322 table->vkResetFences(device, 1, &fence);
2323 managers.fence.recycle_fence(fence);
2324 }
2325 else
2326 frame().recycle_fences.push_back(fence);
2327 }
2328
2329 PipelineEvent Device::request_pipeline_event()
2330 {
2331 return PipelineEvent(handle_pool.events.allocate(this, managers.event.request_cleared_event()));
2332 }
2333
2334 void Device::destroy_image_nolock(VkImage image)
2335 {
2336 VK_ASSERT(!exists(frame().destroyed_images, image));
2337 frame().destroyed_images.push_back(image);
2338 }
2339
2340 void Device::destroy_buffer_nolock(VkBuffer buffer)
2341 {
2342 VK_ASSERT(!exists(frame().destroyed_buffers, buffer));
2343 frame().destroyed_buffers.push_back(buffer);
2344 }
2345
2346 void Device::destroy_descriptor_pool_nolock(VkDescriptorPool desc_pool)
2347 {
2348 VK_ASSERT(!exists(frame().destroyed_descriptor_pools, desc_pool));
2349 frame().destroyed_descriptor_pools.push_back(desc_pool);
2350 }
2351
2352 void Device::destroy_sampler_nolock(VkSampler sampler)
2353 {
2354 VK_ASSERT(!exists(frame().destroyed_samplers, sampler));
2355 frame().destroyed_samplers.push_back(sampler);
2356 }
2357
2358 void Device::destroy_framebuffer_nolock(VkFramebuffer framebuffer)
2359 {
2360 VK_ASSERT(!exists(frame().destroyed_framebuffers, framebuffer));
2361 frame().destroyed_framebuffers.push_back(framebuffer);
2362 }
2363
2364 void Device::clear_wait_semaphores()
2365 {
2366 for (auto &sem : graphics.wait_semaphores)
2367 table->vkDestroySemaphore(device, sem->consume(), nullptr);
2368 for (auto &sem : compute.wait_semaphores)
2369 table->vkDestroySemaphore(device, sem->consume(), nullptr);
2370 for (auto &sem : transfer.wait_semaphores)
2371 table->vkDestroySemaphore(device, sem->consume(), nullptr);
2372
2373 graphics.wait_semaphores.clear();
2374 graphics.wait_stages.clear();
2375 compute.wait_semaphores.clear();
2376 compute.wait_stages.clear();
2377 transfer.wait_semaphores.clear();
2378 transfer.wait_stages.clear();
2379 }
2380
2381 void Device::wait_idle()
2382 {
2383 DRAIN_FRAME_LOCK();
2384 wait_idle_nolock();
2385 }
2386
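// Ends the current frame, waits for the device to go fully idle, and then
// resets all per-frame state (buffer pools, command pools, allocators) in one
// go. Callers must guarantee that no command buffers are being recorded, which
// DRAIN_FRAME_LOCK in the public entry points enforces.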
2387 void Device::wait_idle_nolock()
2388 {
2389 if (!per_frame.empty())
2390 end_frame_nolock();
2391
2392 if (device != VK_NULL_HANDLE)
2393 {
2394 if (queue_lock_callback)
2395 queue_lock_callback();
2396 auto result = table->vkDeviceWaitIdle(device);
2397 if (result != VK_SUCCESS)
2398 LOGE("vkDeviceWaitIdle failed with code: %d\n", result);
2399 if (result == VK_ERROR_DEVICE_LOST)
2400 report_checkpoints();
2401 if (queue_unlock_callback)
2402 queue_unlock_callback();
2403 }
2404
2405 clear_wait_semaphores();
2406
2407 // Free memory for buffer pools.
2408 managers.vbo.reset();
2409 managers.ubo.reset();
2410 managers.ibo.reset();
2411 managers.staging.reset();
2412 for (auto &frame : per_frame)
2413 {
2414 frame->vbo_blocks.clear();
2415 frame->ibo_blocks.clear();
2416 frame->ubo_blocks.clear();
2417 frame->staging_blocks.clear();
2418 }
2419
2420 framebuffer_allocator.clear();
2421 transient_allocator.clear();
2422 for (auto &allocator : descriptor_set_allocators)
2423 allocator.clear();
2424
2425 for (auto &frame : per_frame)
2426 {
2427 		// We have already waited for the device to go idle, so waiting for these fences is unnecessary; it would also not be safe.
2428 frame->wait_fences.clear();
2429 frame->begin();
2430 }
2431 }
2432
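// Advances to the next frame context: flushes pending submissions for the
// current frame, begins a new frame for the various allocators, and recycles
// everything the incoming frame context was still holding from its last use.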
2433 void Device::next_frame_context()
2434 {
2435 DRAIN_FRAME_LOCK();
2436
2437 if (frame_context_begin_ts)
2438 {
2439 auto frame_context_end_ts = write_calibrated_timestamp_nolock();
2440 register_time_interval_nolock("CPU", std::move(frame_context_begin_ts), std::move(frame_context_end_ts), "command submissions", "");
2441 frame_context_begin_ts = {};
2442 }
2443
2444 	// Flush the frame here, as we might have pending staging command buffers from the init stage.
2445 end_frame_nolock();
2446
2447 framebuffer_allocator.begin_frame();
2448 transient_allocator.begin_frame();
2449 for (auto &allocator : descriptor_set_allocators)
2450 allocator.begin_frame();
2451
2452 VK_ASSERT(!per_frame.empty());
2453 frame_context_index++;
2454 if (frame_context_index >= per_frame.size())
2455 frame_context_index = 0;
2456
2457 frame().begin();
2458 recalibrate_timestamps();
2459 frame_context_begin_ts = write_calibrated_timestamp_nolock();
2460 }
2461
2462 QueryPoolHandle Device::write_timestamp(VkCommandBuffer cmd, VkPipelineStageFlagBits stage)
2463 {
2464 LOCK();
2465 return write_timestamp_nolock(cmd, stage);
2466 }
2467
2468 QueryPoolHandle Device::write_timestamp_nolock(VkCommandBuffer cmd, VkPipelineStageFlagBits stage)
2469 {
2470 return frame().query_pool.write_timestamp(cmd, stage);
2471 }
2472
2473 QueryPoolHandle Device::write_calibrated_timestamp()
2474 {
2475 LOCK();
2476 return write_calibrated_timestamp_nolock();
2477 }
2478
2479 QueryPoolHandle Device::write_calibrated_timestamp_nolock()
2480 {
2481 if (!json_trace_file)
2482 return {};
2483
2484 auto handle = QueryPoolHandle(handle_pool.query.allocate(this));
2485 handle->signal_timestamp_ticks(get_calibrated_timestamp());
2486 return handle;
2487 }
2488
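// Fallback calibration path when VK_EXT_calibrated_timestamps is not usable:
// submit a single timestamp write, wait for it, and pair the resulting GPU tick
// with the midpoint of the CPU-side submit/wait interval.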
2489 void Device::recalibrate_timestamps_fallback()
2490 {
2491 wait_idle_nolock();
2492 auto cmd = request_command_buffer_nolock(0, CommandBuffer::Type::Generic, false);
2493 auto ts = write_timestamp_nolock(cmd->get_command_buffer(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
2494 if (!ts)
2495 return;
2496 auto start_ts = Util::get_current_time_nsecs();
2497 submit_nolock(cmd, nullptr, 0, nullptr);
2498 wait_idle_nolock();
2499 auto end_ts = Util::get_current_time_nsecs();
2500 auto host_ts = (start_ts + end_ts) / 2;
2501
2502 LOGI("Calibrated timestamps with a fallback method. Uncertainty: %.3f us.\n", 1e-3 * (end_ts - start_ts));
2503
2504 calibrated_timestamp_host = host_ts;
2505 VK_ASSERT(ts->is_signalled());
2506 calibrated_timestamp_device = ts->get_timestamp_ticks();
2507 }
2508
2509 void Device::init_calibrated_timestamps()
2510 {
2511 if (!get_device_features().supports_calibrated_timestamps)
2512 {
2513 recalibrate_timestamps_fallback();
2514 return;
2515 }
2516
2517 uint32_t count;
2518 vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(gpu, &count, nullptr);
2519 std::vector<VkTimeDomainEXT> domains(count);
2520 if (vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(gpu, &count, domains.data()) != VK_SUCCESS)
2521 return;
2522
2523 bool supports_device_domain = false;
2524 for (auto &domain : domains)
2525 {
2526 if (domain == VK_TIME_DOMAIN_DEVICE_EXT)
2527 {
2528 supports_device_domain = true;
2529 break;
2530 }
2531 }
2532
2533 if (!supports_device_domain)
2534 return;
2535
2536 for (auto &domain : domains)
2537 {
2538 #ifdef _WIN32
2539 const auto supported_domain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
2540 #else
2541 const auto supported_domain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT;
2542 #endif
2543 if (domain == supported_domain)
2544 {
2545 calibrated_time_domain = domain;
2546 break;
2547 }
2548 }
2549
2550 if (calibrated_time_domain == VK_TIME_DOMAIN_DEVICE_EXT)
2551 {
2552 LOGE("Could not find a suitable time domain for calibrated timestamps.\n");
2553 return;
2554 }
2555
2556 if (!resample_calibrated_timestamps())
2557 {
2558 LOGE("Failed to get calibrated timestamps.\n");
2559 calibrated_time_domain = VK_TIME_DOMAIN_DEVICE_EXT;
2560 return;
2561 }
2562 }
2563
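// Samples the host time domain and the device time domain in a single
// vkGetCalibratedTimestampsEXT call so that later GPU timestamps can be mapped
// onto the CPU timeline.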
2564 bool Device::resample_calibrated_timestamps()
2565 {
2566 VkCalibratedTimestampInfoEXT infos[2] = {};
2567 infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
2568 infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
2569 infos[0].timeDomain = calibrated_time_domain;
2570 infos[1].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
2571 uint64_t timestamps[2] = {};
2572 uint64_t max_deviation[2] = {};
2573
2574 if (table->vkGetCalibratedTimestampsEXT(device, 2, infos, timestamps, max_deviation) != VK_SUCCESS)
2575 {
2576 LOGE("Failed to get calibrated timestamps.\n");
2577 calibrated_time_domain = VK_TIME_DOMAIN_DEVICE_EXT;
2578 return false;
2579 }
2580
2581 calibrated_timestamp_host = timestamps[0];
2582 calibrated_timestamp_device = timestamps[1];
2583
2584 #ifdef _WIN32
2585 LARGE_INTEGER freq;
2586 QueryPerformanceFrequency(&freq);
2587 calibrated_timestamp_host = int64_t(1e9 * calibrated_timestamp_host / double(freq.QuadPart));
2588 #endif
2589 return true;
2590 }
2591
2592 void Device::recalibrate_timestamps()
2593 {
2594 // Don't bother recalibrating timestamps if we're not tracing.
2595 if (!json_trace_file)
2596 return;
2597
2598 // Recalibrate every once in a while ...
2599 timestamp_calibration_counter++;
2600 if (timestamp_calibration_counter < 1000)
2601 return;
2602 timestamp_calibration_counter = 0;
2603
2604 if (calibrated_time_domain == VK_TIME_DOMAIN_DEVICE_EXT)
2605 recalibrate_timestamps_fallback();
2606 else
2607 resample_calibrated_timestamps();
2608 }
2609
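// Extrapolates a device-timebase timestamp for "now" from the most recent
// calibration point, clamped so the reported values remain monotonic.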
2610 int64_t Device::get_calibrated_timestamp()
2611 {
2612 int64_t nsecs = Util::get_current_time_nsecs();
2613
2614 auto offset_from_calibration = double(nsecs - calibrated_timestamp_host);
2615 auto ticks_in_device_timebase = int64_t(offset_from_calibration / double(gpu_props.limits.timestampPeriod));
2616 int64_t reported = calibrated_timestamp_device + ticks_in_device_timebase;
2617 reported = std::max(reported, last_calibrated_timestamp_host);
2618 last_calibrated_timestamp_host = reported;
2619 return reported;
2620 }
2621
2622 void Device::register_time_interval(std::string tid, QueryPoolHandle start_ts, QueryPoolHandle end_ts, std::string tag, std::string extra)
2623 {
2624 LOCK();
2625 register_time_interval_nolock(std::move(tid), std::move(start_ts), std::move(end_ts), std::move(tag), std::move(extra));
2626 }
2627
2628 void Device::register_time_interval_nolock(std::string tid, QueryPoolHandle start_ts, QueryPoolHandle end_ts,
2629 std::string tag, std::string extra)
2630 {
2631 if (start_ts && end_ts)
2632 {
2633 TimestampInterval *timestamp_tag = managers.timestamps.get_timestamp_tag(tag.c_str());
2634 #ifdef VULKAN_DEBUG
2635 if (start_ts->is_signalled() && end_ts->is_signalled())
2636 VK_ASSERT(end_ts->get_timestamp_ticks() >= start_ts->get_timestamp_ticks());
2637 #endif
2638 frame().timestamp_intervals.push_back({ std::move(tid), move(start_ts), move(end_ts), timestamp_tag, std::move(extra) });
2639 }
2640 }
2641
2642 void Device::add_frame_counter_nolock()
2643 {
2644 lock.counter++;
2645 }
2646
2647 void Device::decrement_frame_counter_nolock()
2648 {
2649 VK_ASSERT(lock.counter > 0);
2650 lock.counter--;
2651 #ifdef GRANITE_VULKAN_MT
2652 lock.cond.notify_one();
2653 #endif
2654 }
2655
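// Called whenever this frame context is (re)entered: waits for all work that
// was submitted the last time the context was used (via timeline semaphores or
// fences), then recycles or destroys every object whose destruction was
// deferred to this frame.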
2656 void Device::PerFrame::begin()
2657 {
2658 VkDevice vkdevice = device.get_device();
2659
2660 Vulkan::QueryPoolHandle wait_fence_ts;
2661 if (!in_destructor && device.json_timestamp_origin)
2662 wait_fence_ts = device.write_calibrated_timestamp_nolock();
2663
2664 if (device.get_device_features().timeline_semaphore_features.timelineSemaphore &&
2665 graphics_timeline_semaphore && compute_timeline_semaphore && transfer_timeline_semaphore)
2666 {
2667 VkSemaphoreWaitInfoKHR info = { VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR };
2668 const VkSemaphore semaphores[3] = { graphics_timeline_semaphore, compute_timeline_semaphore, transfer_timeline_semaphore };
2669 const uint64_t values[3] = { timeline_fence_graphics, timeline_fence_compute, timeline_fence_transfer };
2670
2671 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2672 if (device.get_device_features().timeline_semaphore_features.timelineSemaphore)
2673 {
2674 LOGI("Waiting for graphics (%p) %u\n",
2675 reinterpret_cast<void *>(graphics_timeline_semaphore),
2676 unsigned(timeline_fence_graphics));
2677 LOGI("Waiting for compute (%p) %u\n",
2678 reinterpret_cast<void *>(compute_timeline_semaphore),
2679 unsigned(timeline_fence_compute));
2680 LOGI("Waiting for transfer (%p) %u\n",
2681 reinterpret_cast<void *>(transfer_timeline_semaphore),
2682 unsigned(timeline_fence_transfer));
2683 }
2684 #endif
2685
2686 info.pSemaphores = semaphores;
2687 info.pValues = values;
2688 info.semaphoreCount = 3;
2689 table.vkWaitSemaphoresKHR(vkdevice, &info, UINT64_MAX);
2690 }
2691
2692 // If we're using timeline semaphores, these paths should never be hit.
2693 if (!wait_fences.empty())
2694 {
2695 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2696 for (auto &fence : wait_fences)
2697 LOGI("Waiting for Fence: %llx\n", reinterpret_cast<unsigned long long>(fence));
2698 #endif
2699 table.vkWaitForFences(vkdevice, wait_fences.size(), wait_fences.data(), VK_TRUE, UINT64_MAX);
2700 wait_fences.clear();
2701 }
2702
2703 if (!in_destructor && device.json_timestamp_origin)
2704 device.register_time_interval_nolock("CPU", std::move(wait_fence_ts), device.write_calibrated_timestamp_nolock(), "fence", "");
2705
2706 // If we're using timeline semaphores, these paths should never be hit.
2707 if (!recycle_fences.empty())
2708 {
2709 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2710 for (auto &fence : recycle_fences)
2711 LOGI("Recycling Fence: %llx\n", reinterpret_cast<unsigned long long>(fence));
2712 #endif
2713 table.vkResetFences(vkdevice, recycle_fences.size(), recycle_fences.data());
2714 for (auto &fence : recycle_fences)
2715 managers.fence.recycle_fence(fence);
2716 recycle_fences.clear();
2717 }
2718
2719 for (auto &pool : graphics_cmd_pool)
2720 pool.begin();
2721 for (auto &pool : compute_cmd_pool)
2722 pool.begin();
2723 for (auto &pool : transfer_cmd_pool)
2724 pool.begin();
2725 query_pool.begin();
2726
2727 for (auto &channel : debug_channels)
2728 device.parse_debug_channel(channel);
2729
2730 	// Free the debug channel buffers here; their backing buffers are recycled immediately through destroyed_buffers right below.
2731 debug_channels.clear();
2732
2733 for (auto &framebuffer : destroyed_framebuffers)
2734 table.vkDestroyFramebuffer(vkdevice, framebuffer, nullptr);
2735 for (auto &sampler : destroyed_samplers)
2736 table.vkDestroySampler(vkdevice, sampler, nullptr);
2737 for (auto &pipeline : destroyed_pipelines)
2738 table.vkDestroyPipeline(vkdevice, pipeline, nullptr);
2739 for (auto &view : destroyed_image_views)
2740 table.vkDestroyImageView(vkdevice, view, nullptr);
2741 for (auto &view : destroyed_buffer_views)
2742 table.vkDestroyBufferView(vkdevice, view, nullptr);
2743 for (auto &image : destroyed_images)
2744 table.vkDestroyImage(vkdevice, image, nullptr);
2745 for (auto &buffer : destroyed_buffers)
2746 table.vkDestroyBuffer(vkdevice, buffer, nullptr);
2747 for (auto &semaphore : destroyed_semaphores)
2748 table.vkDestroySemaphore(vkdevice, semaphore, nullptr);
2749 for (auto &pool : destroyed_descriptor_pools)
2750 table.vkDestroyDescriptorPool(vkdevice, pool, nullptr);
2751 for (auto &semaphore : recycled_semaphores)
2752 {
2753 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2754 LOGI("Recycling semaphore: %llx\n", reinterpret_cast<unsigned long long>(semaphore));
2755 #endif
2756 managers.semaphore.recycle(semaphore);
2757 }
2758 for (auto &event : recycled_events)
2759 managers.event.recycle(event);
2760 for (auto &alloc : allocations)
2761 alloc.free_immediate(managers.memory);
2762
2763 for (auto &block : vbo_blocks)
2764 managers.vbo.recycle_block(move(block));
2765 for (auto &block : ibo_blocks)
2766 managers.ibo.recycle_block(move(block));
2767 for (auto &block : ubo_blocks)
2768 managers.ubo.recycle_block(move(block));
2769 for (auto &block : staging_blocks)
2770 managers.staging.recycle_block(move(block));
2771 vbo_blocks.clear();
2772 ibo_blocks.clear();
2773 ubo_blocks.clear();
2774 staging_blocks.clear();
2775
2776 destroyed_framebuffers.clear();
2777 destroyed_samplers.clear();
2778 destroyed_pipelines.clear();
2779 destroyed_image_views.clear();
2780 destroyed_buffer_views.clear();
2781 destroyed_images.clear();
2782 destroyed_buffers.clear();
2783 destroyed_semaphores.clear();
2784 destroyed_descriptor_pools.clear();
2785 recycled_semaphores.clear();
2786 recycled_events.clear();
2787 allocations.clear();
2788
2789 int64_t min_timestamp_us = std::numeric_limits<int64_t>::max();
2790 int64_t max_timestamp_us = 0;
2791
2792 for (auto &ts : timestamp_intervals)
2793 {
2794 if (ts.end_ts->is_signalled() && ts.start_ts->is_signalled())
2795 {
2796 ts.timestamp_tag->accumulate_time(
2797 device.convert_timestamp_delta(ts.start_ts->get_timestamp_ticks(), ts.end_ts->get_timestamp_ticks()));
2798 device.write_json_timestamp_range(frame_index, ts.tid.c_str(), ts.timestamp_tag->get_tag().c_str(),
2799 ts.extra.c_str(),
2800 ts.start_ts->get_timestamp_ticks(), ts.end_ts->get_timestamp_ticks(),
2801 min_timestamp_us, max_timestamp_us);
2802 }
2803 }
2804 device.write_json_timestamp_range_us(frame_index, "CPU + GPU", "full frame lifetime", min_timestamp_us, max_timestamp_us);
2805 managers.timestamps.mark_end_of_frame_context();
2806 timestamp_intervals.clear();
2807 }
2808
2809 Device::PerFrame::~PerFrame()
2810 {
2811 in_destructor = true;
2812 begin();
2813 }
2814
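// Selects a memory type for buffer allocations by walking a small per-domain
// priority list of property-flag combinations and returning the first memory
// type that satisfies both the type mask and the property set.
// Returns UINT32_MAX if no candidate matches.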
2815 uint32_t Device::find_memory_type(BufferDomain domain, uint32_t mask)
2816 {
2817 uint32_t prio[3] = {};
2818 switch (domain)
2819 {
2820 case BufferDomain::Device:
2821 prio[0] = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2822 break;
2823
2824 case BufferDomain::LinkedDeviceHost:
2825 prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2826 prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2827 prio[2] = prio[1];
2828 break;
2829
2830 case BufferDomain::LinkedDeviceHostPreferDevice:
2831 prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2832 prio[1] = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2833 prio[2] = prio[1];
2834 break;
2835
2836 case BufferDomain::Host:
2837 prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2838 prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2839 prio[2] = prio[1];
2840 break;
2841
2842 case BufferDomain::CachedHost:
2843 prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
2844 prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2845 prio[2] = prio[1];
2846 break;
2847
2848 case BufferDomain::CachedCoherentHostPreferCached:
2849 prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2850 prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
2851 prio[2] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2852 break;
2853
2854 case BufferDomain::CachedCoherentHostPreferCoherent:
2855 prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2856 prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2857 prio[2] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2858 break;
2859 }
2860
2861 for (auto &p : prio)
2862 {
2863 for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++)
2864 {
2865 if ((1u << i) & mask)
2866 {
2867 uint32_t flags = mem_props.memoryTypes[i].propertyFlags;
2868 if ((flags & p) == p)
2869 return i;
2870 }
2871 }
2872 }
2873
2874 return UINT32_MAX;
2875 }
2876
2877 uint32_t Device::find_memory_type(ImageDomain domain, uint32_t mask)
2878 {
2879 uint32_t desired = 0, fallback = 0;
2880 switch (domain)
2881 {
2882 case ImageDomain::Physical:
2883 desired = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2884 fallback = 0;
2885 break;
2886
2887 case ImageDomain::Transient:
2888 desired = VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT;
2889 fallback = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2890 break;
2891
2892 case ImageDomain::LinearHostCached:
2893 desired = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
2894 fallback = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2895 break;
2896
2897 case ImageDomain::LinearHost:
2898 desired = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2899 fallback = 0;
2900 break;
2901 }
2902
2903 for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++)
2904 {
2905 if ((1u << i) & mask)
2906 {
2907 uint32_t flags = mem_props.memoryTypes[i].propertyFlags;
2908 if ((flags & desired) == desired)
2909 return i;
2910 }
2911 }
2912
2913 for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++)
2914 {
2915 if ((1u << i) & mask)
2916 {
2917 uint32_t flags = mem_props.memoryTypes[i].propertyFlags;
2918 if ((flags & fallback) == fallback)
2919 return i;
2920 }
2921 }
2922
2923 return UINT32_MAX;
2924 }
2925
2926 static inline VkImageViewType get_image_view_type(const ImageCreateInfo &create_info, const ImageViewCreateInfo *view)
2927 {
2928 unsigned layers = view ? view->layers : create_info.layers;
2929 unsigned base_layer = view ? view->base_layer : 0;
2930
2931 if (layers == VK_REMAINING_ARRAY_LAYERS)
2932 layers = create_info.layers - base_layer;
2933
2934 bool force_array =
2935 view ? (view->misc & IMAGE_VIEW_MISC_FORCE_ARRAY_BIT) : (create_info.misc & IMAGE_MISC_FORCE_ARRAY_BIT);
2936
2937 switch (create_info.type)
2938 {
2939 case VK_IMAGE_TYPE_1D:
2940 VK_ASSERT(create_info.width >= 1);
2941 VK_ASSERT(create_info.height == 1);
2942 VK_ASSERT(create_info.depth == 1);
2943 VK_ASSERT(create_info.samples == VK_SAMPLE_COUNT_1_BIT);
2944
2945 if (layers > 1 || force_array)
2946 return VK_IMAGE_VIEW_TYPE_1D_ARRAY;
2947 else
2948 return VK_IMAGE_VIEW_TYPE_1D;
2949
2950 case VK_IMAGE_TYPE_2D:
2951 VK_ASSERT(create_info.width >= 1);
2952 VK_ASSERT(create_info.height >= 1);
2953 VK_ASSERT(create_info.depth == 1);
2954
2955 if ((create_info.flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) && (layers % 6) == 0)
2956 {
2957 VK_ASSERT(create_info.width == create_info.height);
2958
2959 if (layers > 6 || force_array)
2960 return VK_IMAGE_VIEW_TYPE_CUBE_ARRAY;
2961 else
2962 return VK_IMAGE_VIEW_TYPE_CUBE;
2963 }
2964 else
2965 {
2966 if (layers > 1 || force_array)
2967 return VK_IMAGE_VIEW_TYPE_2D_ARRAY;
2968 else
2969 return VK_IMAGE_VIEW_TYPE_2D;
2970 }
2971
2972 case VK_IMAGE_TYPE_3D:
2973 VK_ASSERT(create_info.width >= 1);
2974 VK_ASSERT(create_info.height >= 1);
2975 VK_ASSERT(create_info.depth >= 1);
2976 return VK_IMAGE_VIEW_TYPE_3D;
2977
2978 default:
2979 VK_ASSERT(0 && "bogus");
2980 return VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
2981 }
2982 }
2983
2984 BufferViewHandle Device::create_buffer_view(const BufferViewCreateInfo &view_info)
2985 {
2986 VkBufferViewCreateInfo info = { VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO };
2987 info.buffer = view_info.buffer->get_buffer();
2988 info.format = view_info.format;
2989 info.offset = view_info.offset;
2990 info.range = view_info.range;
2991
2992 VkBufferView view;
2993 auto res = table->vkCreateBufferView(device, &info, nullptr, &view);
2994 if (res != VK_SUCCESS)
2995 return BufferViewHandle(nullptr);
2996
2997 return BufferViewHandle(handle_pool.buffer_views.allocate(this, view, view_info));
2998 }
2999
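// RAII helper used while building an image and its associated views. If
// creation fails part-way through, the destructor cleans up everything created
// so far; on success the caller clears 'owned' to transfer ownership out.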
3000 class ImageResourceHolder
3001 {
3002 public:
3003 	explicit ImageResourceHolder(Device *device_)
3004 : device(device_)
3005 , table(device_->get_device_table())
3006 {
3007 }
3008
3009 	~ImageResourceHolder()
3010 {
3011 if (owned)
3012 cleanup();
3013 }
3014
3015 Device *device;
3016 const VolkDeviceTable &table;
3017
3018 VkImage image = VK_NULL_HANDLE;
3019 VkDeviceMemory memory = VK_NULL_HANDLE;
3020 VkImageView image_view = VK_NULL_HANDLE;
3021 VkImageView depth_view = VK_NULL_HANDLE;
3022 VkImageView stencil_view = VK_NULL_HANDLE;
3023 VkImageView unorm_view = VK_NULL_HANDLE;
3024 VkImageView srgb_view = VK_NULL_HANDLE;
3025 VkImageViewType default_view_type = VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
3026 vector<VkImageView> rt_views;
3027 DeviceAllocation allocation;
3028 DeviceAllocator *allocator = nullptr;
3029 bool owned = true;
3030
3031 	VkImageViewType get_default_view_type() const
3032 {
3033 return default_view_type;
3034 }
3035
3036 	bool setup_conversion_info(VkImageViewCreateInfo &create_info, VkSamplerYcbcrConversionInfo &conversion)
3037 {
3038 switch (create_info.format)
3039 {
3040 case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
3041 if (!device->get_device_features().sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3042 return false;
3043 create_info.pNext = &conversion;
3044 conversion = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3045 conversion.conversion = device->samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV420P_3PLANE)];
3046 break;
3047
3048 case VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM:
3049 if (!device->get_device_features().sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3050 return false;
3051 create_info.pNext = &conversion;
3052 conversion = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3053 conversion.conversion = device->samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV422P_3PLANE)];
3054 break;
3055
3056 case VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM:
3057 if (!device->get_device_features().sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3058 return false;
3059 create_info.pNext = &conversion;
3060 conversion = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3061 conversion.conversion = device->samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV444P_3PLANE)];
3062 break;
3063
3064 default:
3065 break;
3066 }
3067
3068 return true;
3069 }
3070
3071 	bool create_default_views(const ImageCreateInfo &create_info, const VkImageViewCreateInfo *view_info,
3072 bool create_unorm_srgb_views = false, const VkFormat *view_formats = nullptr)
3073 {
3074 VkDevice vkdevice = device->get_device();
3075
3076 if ((create_info.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
3077 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) == 0)
3078 {
3079 LOGE("Cannot create image view unless certain usage flags are present.\n");
3080 return false;
3081 }
3082
3083 VkImageViewCreateInfo default_view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
3084 VkSamplerYcbcrConversionInfo conversion_info = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3085
3086 if (!view_info)
3087 {
3088 default_view_info.image = image;
3089 default_view_info.format = create_info.format;
3090 default_view_info.components = create_info.swizzle;
3091 default_view_info.subresourceRange.aspectMask = format_to_aspect_mask(default_view_info.format);
3092 default_view_info.viewType = get_image_view_type(create_info, nullptr);
3093 default_view_info.subresourceRange.baseMipLevel = 0;
3094 default_view_info.subresourceRange.baseArrayLayer = 0;
3095 default_view_info.subresourceRange.levelCount = create_info.levels;
3096 default_view_info.subresourceRange.layerCount = create_info.layers;
3097
3098 default_view_type = default_view_info.viewType;
3099 }
3100 else
3101 default_view_info = *view_info;
3102
3103 view_info = &default_view_info;
3104 if (!setup_conversion_info(default_view_info, conversion_info))
3105 return false;
3106
3107 if (!create_alt_views(create_info, *view_info))
3108 return false;
3109
3110 if (!create_render_target_views(create_info, *view_info))
3111 return false;
3112
3113 if (!create_default_view(*view_info))
3114 return false;
3115
3116 if (create_unorm_srgb_views)
3117 {
3118 auto info = *view_info;
3119
3120 info.format = view_formats[0];
3121 if (table.vkCreateImageView(vkdevice, &info, nullptr, &unorm_view) != VK_SUCCESS)
3122 return false;
3123
3124 info.format = view_formats[1];
3125 if (table.vkCreateImageView(vkdevice, &info, nullptr, &srgb_view) != VK_SUCCESS)
3126 return false;
3127 }
3128
3129 return true;
3130 }
3131
3132 private:
3133 	bool create_render_target_views(const ImageCreateInfo &image_create_info, const VkImageViewCreateInfo &info)
3134 {
3135 rt_views.reserve(info.subresourceRange.layerCount);
3136
3137 if (info.viewType == VK_IMAGE_VIEW_TYPE_3D)
3138 return true;
3139
3140 		// If we have a render target and this is not the trivial case (layers == 1, levels == 1),
3141 		// create an array of render target views, one per layer, each covering a single layer at the base mip level.
3142 if ((image_create_info.usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) != 0 &&
3143 ((info.subresourceRange.levelCount > 1) || (info.subresourceRange.layerCount > 1)))
3144 {
3145 auto view_info = info;
3146 view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;
3147 view_info.subresourceRange.baseMipLevel = info.subresourceRange.baseMipLevel;
3148 for (uint32_t layer = 0; layer < info.subresourceRange.layerCount; layer++)
3149 {
3150 view_info.subresourceRange.levelCount = 1;
3151 view_info.subresourceRange.layerCount = 1;
3152 view_info.subresourceRange.baseArrayLayer = layer + info.subresourceRange.baseArrayLayer;
3153
3154 VkImageView rt_view;
3155 if (table.vkCreateImageView(device->get_device(), &view_info, nullptr, &rt_view) != VK_SUCCESS)
3156 return false;
3157
3158 rt_views.push_back(rt_view);
3159 }
3160 }
3161
3162 return true;
3163 }
3164
3165 	bool create_alt_views(const ImageCreateInfo &image_create_info, const VkImageViewCreateInfo &info)
3166 {
3167 if (info.viewType == VK_IMAGE_VIEW_TYPE_CUBE ||
3168 info.viewType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY ||
3169 info.viewType == VK_IMAGE_VIEW_TYPE_3D)
3170 {
3171 return true;
3172 }
3173
3174 VkDevice vkdevice = device->get_device();
3175
3176 if (info.subresourceRange.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
3177 {
3178 if ((image_create_info.usage & ~VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) != 0)
3179 {
3180 // Sanity check. Don't want to implement layered views for this.
3181 if (info.subresourceRange.levelCount > 1)
3182 {
3183 LOGE("Cannot create depth stencil attachments with more than 1 mip level currently, and non-DS usage flags.\n");
3184 return false;
3185 }
3186
3187 if (info.subresourceRange.layerCount > 1)
3188 {
3189 LOGE("Cannot create layered depth stencil attachments with non-DS usage flags.\n");
3190 return false;
3191 }
3192
3193 auto view_info = info;
3194
3195 // We need this to be able to sample the texture, or otherwise use it as a non-pure DS attachment.
3196 view_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
3197 if (table.vkCreateImageView(vkdevice, &view_info, nullptr, &depth_view) != VK_SUCCESS)
3198 return false;
3199
3200 view_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
3201 if (table.vkCreateImageView(vkdevice, &view_info, nullptr, &stencil_view) != VK_SUCCESS)
3202 return false;
3203 }
3204 }
3205
3206 return true;
3207 }
3208
3209 	bool create_default_view(const VkImageViewCreateInfo &info)
3210 {
3211 VkDevice vkdevice = device->get_device();
3212
3213 // Create the normal image view. This one contains every subresource.
3214 if (table.vkCreateImageView(vkdevice, &info, nullptr, &image_view) != VK_SUCCESS)
3215 return false;
3216
3217 return true;
3218 }
3219
3220 	void cleanup()
3221 {
3222 VkDevice vkdevice = device->get_device();
3223
3224 if (image_view)
3225 table.vkDestroyImageView(vkdevice, image_view, nullptr);
3226 if (depth_view)
3227 table.vkDestroyImageView(vkdevice, depth_view, nullptr);
3228 if (stencil_view)
3229 table.vkDestroyImageView(vkdevice, stencil_view, nullptr);
3230 if (unorm_view)
3231 table.vkDestroyImageView(vkdevice, unorm_view, nullptr);
3232 if (srgb_view)
3233 table.vkDestroyImageView(vkdevice, srgb_view, nullptr);
3234 for (auto &view : rt_views)
3235 table.vkDestroyImageView(vkdevice, view, nullptr);
3236
3237 if (image)
3238 table.vkDestroyImage(vkdevice, image, nullptr);
3239 if (memory)
3240 table.vkFreeMemory(vkdevice, memory, nullptr);
3241 if (allocator)
3242 allocation.free_immediate(*allocator);
3243 }
3244 };
3245
3246 ImageViewHandle Device::create_image_view(const ImageViewCreateInfo &create_info)
3247 {
3248 ImageResourceHolder holder(this);
3249 auto &image_create_info = create_info.image->get_create_info();
3250
3251 VkFormat format = create_info.format != VK_FORMAT_UNDEFINED ? create_info.format : image_create_info.format;
3252
3253 VkImageViewCreateInfo view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
3254 view_info.image = create_info.image->get_image();
3255 view_info.format = format;
3256 view_info.components = create_info.swizzle;
3257 view_info.subresourceRange.aspectMask = format_to_aspect_mask(format);
3258 view_info.subresourceRange.baseMipLevel = create_info.base_level;
3259 view_info.subresourceRange.baseArrayLayer = create_info.base_layer;
3260 view_info.subresourceRange.levelCount = create_info.levels;
3261 view_info.subresourceRange.layerCount = create_info.layers;
3262
3263 if (create_info.view_type == VK_IMAGE_VIEW_TYPE_RANGE_SIZE)
3264 view_info.viewType = get_image_view_type(image_create_info, &create_info);
3265 else
3266 view_info.viewType = create_info.view_type;
3267
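	// Resolve VK_REMAINING_MIP_LEVELS / VK_REMAINING_ARRAY_LAYERS into explicit counts before creating the views.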
3268 unsigned num_levels;
3269 if (view_info.subresourceRange.levelCount == VK_REMAINING_MIP_LEVELS)
3270 num_levels = create_info.image->get_create_info().levels - view_info.subresourceRange.baseMipLevel;
3271 else
3272 num_levels = view_info.subresourceRange.levelCount;
3273
3274 unsigned num_layers;
3275 if (view_info.subresourceRange.layerCount == VK_REMAINING_ARRAY_LAYERS)
3276 num_layers = create_info.image->get_create_info().layers - view_info.subresourceRange.baseArrayLayer;
3277 else
3278 num_layers = view_info.subresourceRange.layerCount;
3279
3280 view_info.subresourceRange.levelCount = num_levels;
3281 view_info.subresourceRange.layerCount = num_layers;
3282
3283 if (!holder.create_default_views(image_create_info, &view_info))
3284 return ImageViewHandle(nullptr);
3285
3286 ImageViewCreateInfo tmp = create_info;
3287 tmp.format = format;
3288 ImageViewHandle ret(handle_pool.image_views.allocate(this, holder.image_view, tmp));
3289 if (ret)
3290 {
3291 holder.owned = false;
3292 ret->set_alt_views(holder.depth_view, holder.stencil_view);
3293 ret->set_render_target_views(move(holder.rt_views));
3294 return ret;
3295 }
3296 else
3297 return ImageViewHandle(nullptr);
3298 }
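
// Example (sketch, not part of the API surface): requesting a mip-tail view of an existing image.
// Field names follow the ImageViewCreateInfo usage above; the handle accessor image.get() is an
// assumption for illustration.
//
//   ImageViewCreateInfo view_info = {};
//   view_info.image = image.get();
//   view_info.format = VK_FORMAT_UNDEFINED;              // inherit the image's format
//   view_info.view_type = VK_IMAGE_VIEW_TYPE_RANGE_SIZE; // let create_image_view() derive the type
//   view_info.base_level = 1;
//   view_info.levels = VK_REMAINING_MIP_LEVELS;
//   view_info.base_layer = 0;
//   view_info.layers = VK_REMAINING_ARRAY_LAYERS;
//   ImageViewHandle mip_tail = device.create_image_view(view_info);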
3299
3300 #ifndef _WIN32
3301 ImageHandle Device::create_imported_image(int fd, VkDeviceSize size, uint32_t memory_type,
3302 VkExternalMemoryHandleTypeFlagBitsKHR handle_type,
3303 const ImageCreateInfo &create_info)
3304 {
3305 if (!ext.supports_external)
3306 return {};
3307
3308 ImageResourceHolder holder(this);
3309
3310 VkImageCreateInfo info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
3311 info.format = create_info.format;
3312 info.extent.width = create_info.width;
3313 info.extent.height = create_info.height;
3314 info.extent.depth = create_info.depth;
3315 info.imageType = create_info.type;
3316 info.mipLevels = create_info.levels;
3317 info.arrayLayers = create_info.layers;
3318 info.samples = create_info.samples;
3319 info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
3320 info.tiling = VK_IMAGE_TILING_OPTIMAL;
3321 info.usage = create_info.usage;
3322 info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
3323 info.flags = create_info.flags;
3324 VK_ASSERT(create_info.domain != ImageDomain::Transient);
3325
3326 VkExternalMemoryImageCreateInfoKHR externalInfo = { VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR };
3327 externalInfo.handleTypes = handle_type;
3328 info.pNext = &externalInfo;
3329
3330 VK_ASSERT(image_format_is_supported(create_info.format, image_usage_to_features(info.usage), info.tiling));
3331
3332 if (table->vkCreateImage(device, &info, nullptr, &holder.image) != VK_SUCCESS)
3333 return ImageHandle(nullptr);
3334
3335 VkMemoryAllocateInfo alloc_info = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
3336 alloc_info.allocationSize = size;
3337 alloc_info.memoryTypeIndex = memory_type;
3338
3339 VkMemoryDedicatedAllocateInfoKHR dedicated_info = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR };
3340 dedicated_info.image = holder.image;
3341 alloc_info.pNext = &dedicated_info;
3342
3343 VkImportMemoryFdInfoKHR fd_info = { VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR };
3344 fd_info.handleType = handle_type;
3345 fd_info.fd = fd;
3346 dedicated_info.pNext = &fd_info;
3347
3348 VkMemoryRequirements reqs;
3349 table->vkGetImageMemoryRequirements(device, holder.image, &reqs);
3350 if (reqs.size > size)
3351 return ImageHandle(nullptr);
3352
3353 if (((1u << memory_type) & reqs.memoryTypeBits) == 0)
3354 return ImageHandle(nullptr);
3355
3356 if (table->vkAllocateMemory(device, &alloc_info, nullptr, &holder.memory) != VK_SUCCESS)
3357 return ImageHandle(nullptr);
3358
3359 if (table->vkBindImageMemory(device, holder.image, holder.memory, 0) != VK_SUCCESS)
3360 return ImageHandle(nullptr);
3361
3362 // Create default image views.
3363 	// The app could of course do this on its own, but it's very handy to have these created automatically.
3364 VkImageViewType view_type = VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
3365 if (info.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
3366 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
3367 {
3368 if (!holder.create_default_views(create_info, nullptr))
3369 return ImageHandle(nullptr);
3370 view_type = holder.get_default_view_type();
3371 }
3372
3373 auto allocation = DeviceAllocation::make_imported_allocation(holder.memory, size, memory_type);
3374 ImageHandle handle(handle_pool.images.allocate(this, holder.image, holder.image_view, allocation, create_info, view_type));
3375 if (handle)
3376 {
3377 holder.owned = false;
3378 handle->get_view().set_alt_views(holder.depth_view, holder.stencil_view);
3379 handle->get_view().set_render_target_views(move(holder.rt_views));
3380
3381 // Set possible dstStage and dstAccess.
3382 handle->set_stage_flags(image_usage_to_possible_stages(info.usage));
3383 handle->set_access_flags(image_usage_to_possible_access(info.usage));
3384 return handle;
3385 }
3386 else
3387 return ImageHandle(nullptr);
3388 }
3389 #endif
3390
3391 InitialImageBuffer Device::create_image_staging_buffer(const TextureFormatLayout &layout)
3392 {
3393 InitialImageBuffer result;
3394
3395 BufferCreateInfo buffer_info = {};
3396 buffer_info.domain = BufferDomain::Host;
3397 buffer_info.size = layout.get_required_size();
3398 buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
3399 result.buffer = create_buffer(buffer_info, nullptr);
3400 set_name(*result.buffer, "image-upload-staging-buffer");
3401
3402 auto *mapped = static_cast<uint8_t *>(map_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT));
3403 memcpy(mapped, layout.data(), layout.get_required_size());
3404 unmap_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT);
3405
3406 layout.build_buffer_image_copies(result.blits);
3407 return result;
3408 }
3409
3410 InitialImageBuffer Device::create_image_staging_buffer(const ImageCreateInfo &info, const ImageInitialData *initial)
3411 {
3412 InitialImageBuffer result;
3413
3414 bool generate_mips = (info.misc & IMAGE_MISC_GENERATE_MIPS_BIT) != 0;
3415 TextureFormatLayout layout;
3416
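	// Decide how many mip levels to upload: just the top level when mips are generated on the GPU,
	// otherwise every level (deriving a full chain when levels == 0).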
3417 unsigned copy_levels;
3418 if (generate_mips)
3419 copy_levels = 1;
3420 else if (info.levels == 0)
3421 copy_levels = TextureFormatLayout::num_miplevels(info.width, info.height, info.depth);
3422 else
3423 copy_levels = info.levels;
3424
3425 switch (info.type)
3426 {
3427 case VK_IMAGE_TYPE_1D:
3428 layout.set_1d(info.format, info.width, info.layers, copy_levels);
3429 break;
3430 case VK_IMAGE_TYPE_2D:
3431 layout.set_2d(info.format, info.width, info.height, info.layers, copy_levels);
3432 break;
3433 case VK_IMAGE_TYPE_3D:
3434 layout.set_3d(info.format, info.width, info.height, info.depth, copy_levels);
3435 break;
3436 default:
3437 return {};
3438 }
3439
3440 BufferCreateInfo buffer_info = {};
3441 buffer_info.domain = BufferDomain::Host;
3442 buffer_info.size = layout.get_required_size();
3443 buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
3444 result.buffer = create_buffer(buffer_info, nullptr);
3445 set_name(*result.buffer, "image-upload-staging-buffer");
3446
3447 // And now, do the actual copy.
3448 auto *mapped = static_cast<uint8_t *>(map_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT));
3449 unsigned index = 0;
3450
3451 layout.set_buffer(mapped, layout.get_required_size());
3452
3453 for (unsigned level = 0; level < copy_levels; level++)
3454 {
3455 const auto &mip_info = layout.get_mip_info(level);
3456 uint32_t dst_height_stride = layout.get_layer_size(level);
3457 size_t row_size = layout.get_row_size(level);
3458
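			// The source data may use its own row length / image height; fall back to the tightly packed mip strides when not specified.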
3459 for (unsigned layer = 0; layer < info.layers; layer++, index++)
3460 {
3461 uint32_t src_row_length =
3462 initial[index].row_length ? initial[index].row_length : mip_info.row_length;
3463 uint32_t src_array_height =
3464 initial[index].image_height ? initial[index].image_height : mip_info.image_height;
3465
3466 uint32_t src_row_stride = layout.row_byte_stride(src_row_length);
3467 uint32_t src_height_stride = layout.layer_byte_stride(src_array_height, src_row_stride);
3468
3469 uint8_t *dst = static_cast<uint8_t *>(layout.data(layer, level));
3470 const uint8_t *src = static_cast<const uint8_t *>(initial[index].data);
3471
3472 for (uint32_t z = 0; z < mip_info.depth; z++)
3473 for (uint32_t y = 0; y < mip_info.block_image_height; y++)
3474 memcpy(dst + z * dst_height_stride + y * row_size, src + z * src_height_stride + y * src_row_stride, row_size);
3475 }
3476 }
3477
3478 unmap_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT);
3479 layout.build_buffer_image_copies(result.blits);
3480 return result;
3481 }
3482
3483 YCbCrImageHandle Device::create_ycbcr_image(const YCbCrImageCreateInfo &create_info)
3484 {
3485 if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3486 return YCbCrImageHandle(nullptr);
3487
3488 VkFormatProperties format_properties = {};
3489 get_format_properties(format_ycbcr_planar_vk_format(create_info.format), &format_properties);
3490
3491 if ((format_properties.optimalTilingFeatures & VK_FORMAT_FEATURE_DISJOINT_BIT) == 0)
3492 {
3493 LOGE("YCbCr format does not support DISJOINT_BIT.\n");
3494 return YCbCrImageHandle(nullptr);
3495 }
3496
3497 if ((format_properties.optimalTilingFeatures & VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT) == 0)
3498 {
3499 LOGE("YCbCr format does not support MIDPOINT_CHROMA_SAMPLES_BIT.\n");
3500 return YCbCrImageHandle(nullptr);
3501 }
3502
3503 if ((format_properties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT) == 0)
3504 {
3505 LOGE("YCbCr format does not support YCBCR_CONVERSION_LINEAR_FILTER_BIT.\n");
3506 return YCbCrImageHandle(nullptr);
3507 }
3508
3509 ImageHandle ycbcr_image;
3510 ImageHandle plane_handles[3];
3511 unsigned num_planes = format_ycbcr_num_planes(create_info.format);
3512
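	// Create one single-plane image per plane first; the disjoint YCbCr image then aliases their memory.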
3513 for (unsigned i = 0; i < num_planes; i++)
3514 {
3515 ImageCreateInfo plane_info = ImageCreateInfo::immutable_2d_image(
3516 create_info.width,
3517 create_info.height,
3518 format_ycbcr_plane_vk_format(create_info.format, i));
3519 plane_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
3520 plane_info.initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
3521
3522 plane_info.width >>= format_ycbcr_downsample_ratio_log2(create_info.format, 0, i);
3523 plane_info.height >>= format_ycbcr_downsample_ratio_log2(create_info.format, 1, i);
3524 plane_info.flags = VK_IMAGE_CREATE_ALIAS_BIT; // Will alias directly over the YCbCr image.
3525 plane_info.misc = IMAGE_MISC_FORCE_NO_DEDICATED_BIT;
3526 plane_handles[i] = create_image(plane_info);
3527 if (!plane_handles[i])
3528 {
3529 LOGE("Failed to create plane image.\n");
3530 return YCbCrImageHandle(nullptr);
3531 }
3532 }
3533
3534 ImageCreateInfo ycbcr_info = ImageCreateInfo::immutable_2d_image(
3535 create_info.width,
3536 create_info.height,
3537 format_ycbcr_planar_vk_format(create_info.format));
3538 ycbcr_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
3539 ycbcr_info.initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
3540 ycbcr_info.flags = VK_IMAGE_CREATE_DISJOINT_BIT | VK_IMAGE_CREATE_ALIAS_BIT;
3541 ycbcr_info.misc = IMAGE_MISC_FORCE_NO_DEDICATED_BIT;
3542
3543 const DeviceAllocation *allocations[3];
3544 for (unsigned i = 0; i < num_planes; i++)
3545 allocations[i] = &plane_handles[i]->get_allocation();
3546 ycbcr_info.memory_aliases = allocations;
3547 ycbcr_info.num_memory_aliases = num_planes;
3548
3549 ycbcr_image = create_image(ycbcr_info);
3550 if (!ycbcr_image)
3551 return YCbCrImageHandle(nullptr);
3552
3553 YCbCrImageHandle handle(handle_pool.ycbcr_images.allocate(this, create_info.format, ycbcr_image, plane_handles, num_planes));
3554 return handle;
3555 }
3556
3557 ImageHandle Device::create_image(const ImageCreateInfo &create_info, const ImageInitialData *initial)
3558 {
3559 if (initial)
3560 {
3561 auto staging_buffer = create_image_staging_buffer(create_info, initial);
3562 return create_image_from_staging_buffer(create_info, &staging_buffer);
3563 }
3564 else
3565 return create_image_from_staging_buffer(create_info, nullptr);
3566 }
3567
3568 bool Device::allocate_image_memory(DeviceAllocation *allocation, const ImageCreateInfo &info,
3569 VkImage image, VkImageTiling tiling)
3570 {
3571 if ((info.flags & VK_IMAGE_CREATE_DISJOINT_BIT) != 0 && info.num_memory_aliases == 0)
3572 {
3573 LOGE("Must use memory aliases when creating a DISJOINT planar image.\n");
3574 return false;
3575 }
3576
3577 if (info.num_memory_aliases != 0)
3578 {
3579 *allocation = {};
3580
3581 unsigned num_planes = format_ycbcr_num_planes(info.format);
3582 if (info.num_memory_aliases < num_planes)
3583 return false;
3584
3585 if (num_planes == 1)
3586 {
3587 VkMemoryRequirements reqs;
3588 table->vkGetImageMemoryRequirements(device, image, &reqs);
3589 auto &alias = *info.memory_aliases[0];
3590
3591 // Verify we can actually use this aliased allocation.
3592 if ((reqs.memoryTypeBits & (1u << alias.memory_type)) == 0)
3593 return false;
3594 if (reqs.size > alias.size)
3595 return false;
3596 if (((alias.offset + reqs.alignment - 1) & ~(reqs.alignment - 1)) != alias.offset)
3597 return false;
3598
3599 if (table->vkBindImageMemory(device, image, alias.get_memory(), alias.get_offset()) != VK_SUCCESS)
3600 return false;
3601 }
3602 else
3603 {
3604 if (!ext.supports_bind_memory2 || !ext.supports_get_memory_requirements2)
3605 return false;
3606
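			// Disjoint multi-planar path: query per-plane requirements and bind each plane to its alias via vkBindImageMemory2KHR.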
3607 VkBindImageMemoryInfo bind_infos[3];
3608 VkBindImagePlaneMemoryInfo bind_plane_infos[3];
3609 VK_ASSERT(num_planes <= 3);
3610
3611 for (unsigned plane = 0; plane < num_planes; plane++)
3612 {
3613 				VkMemoryRequirements2KHR memory_req = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR };
3614 				VkImageMemoryRequirementsInfo2KHR image_info = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR };
3615 image_info.image = image;
3616
3617 VkImagePlaneMemoryRequirementsInfo plane_info = { VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO_KHR };
3618 plane_info.planeAspect = static_cast<VkImageAspectFlagBits>(VK_IMAGE_ASPECT_PLANE_0_BIT << plane);
3619 image_info.pNext = &plane_info;
3620
3621 table->vkGetImageMemoryRequirements2KHR(device, &image_info, &memory_req);
3622 auto &reqs = memory_req.memoryRequirements;
3623 auto &alias = *info.memory_aliases[plane];
3624
3625 // Verify we can actually use this aliased allocation.
3626 if ((reqs.memoryTypeBits & (1u << alias.memory_type)) == 0)
3627 return false;
3628 if (reqs.size > alias.size)
3629 return false;
3630 if (((alias.offset + reqs.alignment - 1) & ~(reqs.alignment - 1)) != alias.offset)
3631 return false;
3632
3633 bind_infos[plane] = { VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO };
3634 bind_infos[plane].image = image;
3635 bind_infos[plane].memory = alias.base;
3636 bind_infos[plane].memoryOffset = alias.offset;
3637 bind_infos[plane].pNext = &bind_plane_infos[plane];
3638
3639 bind_plane_infos[plane] = { VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO };
3640 bind_plane_infos[plane].planeAspect = static_cast<VkImageAspectFlagBits>(VK_IMAGE_ASPECT_PLANE_0_BIT << plane);
3641 }
3642
3643 if (table->vkBindImageMemory2KHR(device, num_planes, bind_infos) != VK_SUCCESS)
3644 return false;
3645 }
3646 }
3647 else
3648 {
3649 VkMemoryRequirements reqs;
3650 table->vkGetImageMemoryRequirements(device, image, &reqs);
3651
3652 // If we intend to alias with other images bump the alignment to something very high.
3653 // This is kind of crude, but should be high enough to allow YCbCr disjoint aliasing on any implementation.
3654 if (info.flags & VK_IMAGE_CREATE_ALIAS_BIT)
3655 if (reqs.alignment < 64 * 1024)
3656 reqs.alignment = 64 * 1024;
3657
3658 uint32_t memory_type = find_memory_type(info.domain, reqs.memoryTypeBits);
3659 if (memory_type == UINT32_MAX)
3660 {
3661 LOGE("Failed to find memory type.\n");
3662 return false;
3663 }
3664
3665 if (tiling == VK_IMAGE_TILING_LINEAR &&
3666 (info.misc & IMAGE_MISC_LINEAR_IMAGE_IGNORE_DEVICE_LOCAL_BIT) == 0)
3667 {
3668 // Is it also device local?
3669 if ((mem_props.memoryTypes[memory_type].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == 0)
3670 return false;
3671 }
3672
3673 if (!managers.memory.allocate_image_memory(reqs.size, reqs.alignment, memory_type,
3674 tiling == VK_IMAGE_TILING_OPTIMAL ? ALLOCATION_TILING_OPTIMAL
3675 : ALLOCATION_TILING_LINEAR,
3676 allocation, image,
3677 (info.misc & IMAGE_MISC_FORCE_NO_DEDICATED_BIT) != 0))
3678 {
3679 LOGE("Failed to allocate image memory (type %u, size: %u).\n", unsigned(memory_type), unsigned(reqs.size));
3680 return false;
3681 }
3682
3683 if (table->vkBindImageMemory(device, image, allocation->get_memory(),
3684 allocation->get_offset()) != VK_SUCCESS)
3685 {
3686 LOGE("Failed to bind image memory.\n");
3687 return false;
3688 }
3689 }
3690
3691 return true;
3692 }
3693
3694 ImageHandle Device::create_image_from_staging_buffer(const ImageCreateInfo &create_info,
3695 const InitialImageBuffer *staging_buffer)
3696 {
3697 ImageResourceHolder holder(this);
3698
3699 VkImageCreateInfo info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
3700 info.format = create_info.format;
3701 info.extent.width = create_info.width;
3702 info.extent.height = create_info.height;
3703 info.extent.depth = create_info.depth;
3704 info.imageType = create_info.type;
3705 info.mipLevels = create_info.levels;
3706 info.arrayLayers = create_info.layers;
3707 info.samples = create_info.samples;
3708
3709 if (create_info.domain == ImageDomain::LinearHostCached || create_info.domain == ImageDomain::LinearHost)
3710 {
3711 info.tiling = VK_IMAGE_TILING_LINEAR;
3712 info.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;
3713 }
3714 else
3715 {
3716 info.tiling = VK_IMAGE_TILING_OPTIMAL;
3717 info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
3718 }
3719
3720 info.usage = create_info.usage;
3721 info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
3722 if (create_info.domain == ImageDomain::Transient)
3723 info.usage |= VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT;
3724 if (staging_buffer)
3725 info.usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
3726
3727 info.flags = create_info.flags;
3728
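	// levels == 0 requests a full mip chain derived from the image extent.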
3729 if (info.mipLevels == 0)
3730 info.mipLevels = image_num_miplevels(info.extent);
3731
3732 VkImageFormatListCreateInfoKHR format_info = { VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO_KHR };
3733 VkFormat view_formats[2];
3734 format_info.pViewFormats = view_formats;
3735 format_info.viewFormatCount = 2;
3736 bool create_unorm_srgb_views = false;
3737
3738 if (create_info.misc & IMAGE_MISC_MUTABLE_SRGB_BIT)
3739 {
3740 format_info.viewFormatCount = ImageCreateInfo::compute_view_formats(create_info, view_formats);
3741 if (format_info.viewFormatCount != 0)
3742 {
3743 create_unorm_srgb_views = true;
3744 if (ext.supports_image_format_list)
3745 info.pNext = &format_info;
3746 }
3747 }
3748
3749 if ((create_info.usage & VK_IMAGE_USAGE_STORAGE_BIT) ||
3750 (create_info.misc & IMAGE_MISC_MUTABLE_SRGB_BIT))
3751 {
3752 info.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
3753 }
3754
3755 // Only do this conditionally.
3756 // On AMD, using CONCURRENT with async compute disables compression.
3757 uint32_t sharing_indices[3] = {};
3758
3759 uint32_t queue_flags = create_info.misc & (IMAGE_MISC_CONCURRENT_QUEUE_GRAPHICS_BIT |
3760 IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_COMPUTE_BIT |
3761 IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_GRAPHICS_BIT |
3762 IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_TRANSFER_BIT);
3763 bool concurrent_queue = queue_flags != 0;
3764 if (concurrent_queue)
3765 {
3766 info.sharingMode = VK_SHARING_MODE_CONCURRENT;
3767
3768 const auto add_unique_family = [&](uint32_t family) {
3769 for (uint32_t i = 0; i < info.queueFamilyIndexCount; i++)
3770 {
3771 if (sharing_indices[i] == family)
3772 return;
3773 }
3774 sharing_indices[info.queueFamilyIndexCount++] = family;
3775 };
3776
3777 if (queue_flags & (IMAGE_MISC_CONCURRENT_QUEUE_GRAPHICS_BIT | IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_GRAPHICS_BIT))
3778 add_unique_family(graphics_queue_family_index);
3779 if (queue_flags & IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_COMPUTE_BIT)
3780 add_unique_family(compute_queue_family_index);
3781 if (staging_buffer || (queue_flags & IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_TRANSFER_BIT) != 0)
3782 add_unique_family(transfer_queue_family_index);
3783
3784 if (info.queueFamilyIndexCount > 1)
3785 info.pQueueFamilyIndices = sharing_indices;
3786 else
3787 {
3788 info.pQueueFamilyIndices = nullptr;
3789 info.queueFamilyIndexCount = 0;
3790 info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
3791 }
3792 }
3793
3794 VkFormatFeatureFlags check_extra_features = 0;
3795 if ((create_info.misc & IMAGE_MISC_VERIFY_FORMAT_FEATURE_SAMPLED_LINEAR_FILTER_BIT) != 0)
3796 check_extra_features |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
3797
3798 if (info.tiling == VK_IMAGE_TILING_LINEAR)
3799 {
3800 if (staging_buffer)
3801 return ImageHandle(nullptr);
3802
3803 // Do some more stringent checks.
3804 if (info.mipLevels > 1)
3805 return ImageHandle(nullptr);
3806 if (info.arrayLayers > 1)
3807 return ImageHandle(nullptr);
3808 if (info.imageType != VK_IMAGE_TYPE_2D)
3809 return ImageHandle(nullptr);
3810 if (info.samples != VK_SAMPLE_COUNT_1_BIT)
3811 return ImageHandle(nullptr);
3812
3813 VkImageFormatProperties props;
3814 if (!get_image_format_properties(info.format, info.imageType, info.tiling, info.usage, info.flags, &props))
3815 return ImageHandle(nullptr);
3816
3817 if (!props.maxArrayLayers ||
3818 !props.maxMipLevels ||
3819 (info.extent.width > props.maxExtent.width) ||
3820 (info.extent.height > props.maxExtent.height) ||
3821 (info.extent.depth > props.maxExtent.depth))
3822 {
3823 return ImageHandle(nullptr);
3824 }
3825 }
3826
3827 if (!image_format_is_supported(create_info.format, image_usage_to_features(info.usage) | check_extra_features, info.tiling))
3828 {
3829 LOGE("Format %u is not supported for usage flags!\n", unsigned(create_info.format));
3830 return ImageHandle(nullptr);
3831 }
3832
3833 if (table->vkCreateImage(device, &info, nullptr, &holder.image) != VK_SUCCESS)
3834 {
3835 LOGE("Failed to create image in vkCreateImage.\n");
3836 return ImageHandle(nullptr);
3837 }
3838
3839 if (!allocate_image_memory(&holder.allocation, create_info, holder.image, info.tiling))
3840 {
3841 LOGE("Failed to allocate memory for image.\n");
3842 return ImageHandle(nullptr);
3843 }
3844
3845 auto tmpinfo = create_info;
3846 tmpinfo.usage = info.usage;
3847 tmpinfo.flags = info.flags;
3848 tmpinfo.levels = info.mipLevels;
3849
3850 bool has_view = (info.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
3851 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) != 0;
3852
3853 VkImageViewType view_type = VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
3854 if (has_view)
3855 {
3856 if (!holder.create_default_views(tmpinfo, nullptr, create_unorm_srgb_views, view_formats))
3857 return ImageHandle(nullptr);
3858 view_type = holder.get_default_view_type();
3859 }
3860
3861 ImageHandle handle(handle_pool.images.allocate(this, holder.image, holder.image_view, holder.allocation, tmpinfo, view_type));
3862 if (handle)
3863 {
3864 holder.owned = false;
3865 if (has_view)
3866 {
3867 handle->get_view().set_alt_views(holder.depth_view, holder.stencil_view);
3868 handle->get_view().set_render_target_views(move(holder.rt_views));
3869 handle->get_view().set_unorm_view(holder.unorm_view);
3870 handle->get_view().set_srgb_view(holder.srgb_view);
3871 }
3872
3873 // Set possible dstStage and dstAccess.
3874 handle->set_stage_flags(image_usage_to_possible_stages(info.usage));
3875 handle->set_access_flags(image_usage_to_possible_access(info.usage));
3876 }
3877
3878 // Copy initial data to texture.
3879 if (staging_buffer)
3880 {
3881 VK_ASSERT(create_info.domain != ImageDomain::Transient);
3882 VK_ASSERT(create_info.initial_layout != VK_IMAGE_LAYOUT_UNDEFINED);
3883 bool generate_mips = (create_info.misc & IMAGE_MISC_GENERATE_MIPS_BIT) != 0;
3884
3885 // If graphics_queue != transfer_queue, we will use a semaphore, so no srcAccess mask is necessary.
3886 VkAccessFlags final_transition_src_access = 0;
3887 if (generate_mips)
3888 final_transition_src_access = VK_ACCESS_TRANSFER_READ_BIT; // Validation complains otherwise.
3889 else if (graphics_queue == transfer_queue)
3890 final_transition_src_access = VK_ACCESS_TRANSFER_WRITE_BIT;
3891
3892 VkAccessFlags prepare_src_access = graphics_queue == transfer_queue ? VK_ACCESS_TRANSFER_WRITE_BIT : 0;
3893 bool need_mipmap_barrier = true;
3894 bool need_initial_barrier = true;
3895
3896 // Now we've used the TRANSFER queue to copy data over to the GPU.
3897 		// For mipmapping we move over to the graphics queue, since the transfer
3898 		// queue is only intended for CPU <-> GPU copies.
3899
3900 // For concurrent queue mode, we just need to inject a semaphore.
3901 // For non-concurrent queue mode, we will have to inject ownership transfer barrier if the queue families do not match.
3902
3903 auto graphics_cmd = request_command_buffer(CommandBuffer::Type::Generic);
3904 CommandBufferHandle transfer_cmd;
3905
3906 // Don't split the upload into multiple command buffers unless we have to.
3907 if (transfer_queue != graphics_queue)
3908 transfer_cmd = request_command_buffer(CommandBuffer::Type::AsyncTransfer);
3909 else
3910 transfer_cmd = graphics_cmd;
3911
3912 transfer_cmd->image_barrier(*handle, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
3913 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, VK_PIPELINE_STAGE_TRANSFER_BIT,
3914 VK_ACCESS_TRANSFER_WRITE_BIT);
3915
3916 transfer_cmd->begin_region("copy-image-to-gpu");
3917 transfer_cmd->copy_buffer_to_image(*handle, *staging_buffer->buffer, staging_buffer->blits.size(), staging_buffer->blits.data());
3918 transfer_cmd->end_region();
3919
3920 if (transfer_queue != graphics_queue)
3921 {
3922 VkPipelineStageFlags dst_stages =
3923 generate_mips ? VkPipelineStageFlags(VK_PIPELINE_STAGE_TRANSFER_BIT) : handle->get_stage_flags();
3924
3925 			// A semaphore alone is not enough; we also need a release + acquire barrier pair
3926 			// to transfer ownership from the transfer queue family to the graphics queue family.
3927 if (!concurrent_queue && transfer_queue_family_index != graphics_queue_family_index)
3928 {
3929 need_mipmap_barrier = false;
3930
3931 VkImageMemoryBarrier release = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER };
3932 release.image = handle->get_image();
3933 release.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
3934 release.dstAccessMask = 0;
3935 release.srcQueueFamilyIndex = transfer_queue_family_index;
3936 release.dstQueueFamilyIndex = graphics_queue_family_index;
3937 release.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
3938
3939 if (generate_mips)
3940 {
3941 release.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
3942 release.subresourceRange.levelCount = 1;
3943 }
3944 else
3945 {
3946 release.newLayout = create_info.initial_layout;
3947 release.subresourceRange.levelCount = info.mipLevels;
3948 need_initial_barrier = false;
3949 }
3950
3951 release.subresourceRange.aspectMask = format_to_aspect_mask(info.format);
3952 release.subresourceRange.layerCount = info.arrayLayers;
3953
3954 VkImageMemoryBarrier acquire = release;
3955 acquire.srcAccessMask = 0;
3956
3957 if (generate_mips)
3958 acquire.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
3959 else
3960 acquire.dstAccessMask = handle->get_access_flags() & image_layout_to_possible_access(create_info.initial_layout);
3961
3962 transfer_cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
3963 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
3964 0, nullptr, 0, nullptr, 1, &release);
3965
3966 graphics_cmd->barrier(dst_stages,
3967 dst_stages,
3968 0, nullptr, 0, nullptr, 1, &acquire);
3969 }
3970
3971 Semaphore sem;
3972 submit(transfer_cmd, nullptr, 1, &sem);
3973 add_wait_semaphore(CommandBuffer::Type::Generic, sem, dst_stages, true);
3974 }
3975
3976 if (generate_mips)
3977 {
3978 graphics_cmd->begin_region("mipgen");
3979 graphics_cmd->barrier_prepare_generate_mipmap(*handle, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
3980 VK_PIPELINE_STAGE_TRANSFER_BIT,
3981 prepare_src_access, need_mipmap_barrier);
3982 graphics_cmd->generate_mipmap(*handle);
3983 graphics_cmd->end_region();
3984 }
3985
3986 if (need_initial_barrier)
3987 {
3988 graphics_cmd->image_barrier(
3989 *handle, generate_mips ? VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
3990 create_info.initial_layout,
3991 VK_PIPELINE_STAGE_TRANSFER_BIT, final_transition_src_access,
3992 handle->get_stage_flags(),
3993 handle->get_access_flags() & image_layout_to_possible_access(create_info.initial_layout));
3994 }
3995
3996 bool share_compute = concurrent_queue && graphics_queue != compute_queue;
3997 bool share_async_graphics = get_physical_queue_type(CommandBuffer::Type::AsyncGraphics) == CommandBuffer::Type::AsyncCompute;
3998
3999 // For concurrent queue, make sure that compute can see the final image as well.
4000 // Also add semaphore if the compute queue can be used for async graphics as well.
4001 if (share_compute || share_async_graphics)
4002 {
4003 Semaphore sem;
4004 submit(graphics_cmd, nullptr, 1, &sem);
4005
4006 VkPipelineStageFlags dst_stages = handle->get_stage_flags();
4007 if (graphics_queue_family_index != compute_queue_family_index)
4008 dst_stages &= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT;
4009 add_wait_semaphore(CommandBuffer::Type::AsyncCompute, sem, dst_stages, true);
4010 }
4011 else
4012 submit(graphics_cmd);
4013 }
4014 else if (create_info.initial_layout != VK_IMAGE_LAYOUT_UNDEFINED)
4015 {
4016 VK_ASSERT(create_info.domain != ImageDomain::Transient);
4017 auto cmd = request_command_buffer(CommandBuffer::Type::Generic);
4018 cmd->image_barrier(*handle, info.initialLayout, create_info.initial_layout,
4019 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, handle->get_stage_flags(),
4020 handle->get_access_flags() &
4021 image_layout_to_possible_access(create_info.initial_layout));
4022
4023 // For concurrent queue, make sure that compute can see the final image as well.
4024 if (concurrent_queue && graphics_queue != compute_queue)
4025 {
4026 Semaphore sem;
4027 submit(cmd, nullptr, 1, &sem);
4028 add_wait_semaphore(CommandBuffer::Type::AsyncCompute,
4029 sem, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, true);
4030 }
4031 else
4032 submit(cmd);
4033 }
4034
4035 return handle;
4036 }
4037
4038 static VkSamplerCreateInfo fill_vk_sampler_info(const SamplerCreateInfo &sampler_info)
4039 {
4040 VkSamplerCreateInfo info = { VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO };
4041
4042 info.magFilter = sampler_info.mag_filter;
4043 info.minFilter = sampler_info.min_filter;
4044 info.mipmapMode = sampler_info.mipmap_mode;
4045 info.addressModeU = sampler_info.address_mode_u;
4046 info.addressModeV = sampler_info.address_mode_v;
4047 info.addressModeW = sampler_info.address_mode_w;
4048 info.mipLodBias = sampler_info.mip_lod_bias;
4049 info.anisotropyEnable = sampler_info.anisotropy_enable;
4050 info.maxAnisotropy = sampler_info.max_anisotropy;
4051 info.compareEnable = sampler_info.compare_enable;
4052 info.compareOp = sampler_info.compare_op;
4053 info.minLod = sampler_info.min_lod;
4054 info.maxLod = sampler_info.max_lod;
4055 info.borderColor = sampler_info.border_color;
4056 info.unnormalizedCoordinates = sampler_info.unnormalized_coordinates;
4057 return info;
4058 }
4059
4060 SamplerHandle Device::create_sampler(const SamplerCreateInfo &sampler_info, StockSampler stock_sampler)
4061 {
4062 auto info = fill_vk_sampler_info(sampler_info);
4063 VkSampler sampler;
4064
4065 VkSamplerYcbcrConversionInfo conversion_info = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
4066
4067 switch (stock_sampler)
4068 {
4069 case StockSampler::LinearYUV420P:
4070 if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
4071 return SamplerHandle(nullptr);
4072 info.pNext = &conversion_info;
4073 conversion_info.conversion = samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV420P_3PLANE)];
4074 break;
4075
4076 case StockSampler::LinearYUV422P:
4077 if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
4078 return SamplerHandle(nullptr);
4079 info.pNext = &conversion_info;
4080 conversion_info.conversion = samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV422P_3PLANE)];
4081 break;
4082
4083 case StockSampler::LinearYUV444P:
4084 if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
4085 return SamplerHandle(nullptr);
4086 info.pNext = &conversion_info;
4087 conversion_info.conversion = samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV444P_3PLANE)];
4088 break;
4089
4090 default:
4091 info.pNext = nullptr;
4092 break;
4093 }
4094
4095 if (table->vkCreateSampler(device, &info, nullptr, &sampler) != VK_SUCCESS)
4096 return SamplerHandle(nullptr);
4097 #ifdef GRANITE_VULKAN_FOSSILIZE
4098 register_sampler(sampler, Fossilize::Hash(stock_sampler) | 0x10000, info);
4099 #else
4100 (void)stock_sampler;
4101 #endif
4102 SamplerHandle handle(handle_pool.samplers.allocate(this, sampler, sampler_info));
4103 handle->set_internal_sync_object();
4104 return handle;
4105 }
4106
4107 SamplerHandle Device::create_sampler(const SamplerCreateInfo &sampler_info)
4108 {
4109 auto info = fill_vk_sampler_info(sampler_info);
4110 VkSampler sampler;
4111 if (table->vkCreateSampler(device, &info, nullptr, &sampler) != VK_SUCCESS)
4112 return SamplerHandle(nullptr);
4113 return SamplerHandle(handle_pool.samplers.allocate(this, sampler, sampler_info));
4114 }
4115
4116 BindlessDescriptorPoolHandle Device::create_bindless_descriptor_pool(BindlessResourceType type,
4117 unsigned num_sets, unsigned num_descriptors)
4118 {
4119 if (!ext.supports_descriptor_indexing)
4120 return BindlessDescriptorPoolHandle{ nullptr };
4121
4122 DescriptorSetAllocator *allocator = nullptr;
4123
4124 switch (type)
4125 {
4126 case BindlessResourceType::ImageFP:
4127 allocator = bindless_sampled_image_allocator_fp;
4128 break;
4129
4130 case BindlessResourceType::ImageInt:
4131 allocator = bindless_sampled_image_allocator_integer;
4132 break;
4133
4134 default:
4135 break;
4136 }
4137
4138 VkDescriptorPool pool = VK_NULL_HANDLE;
4139 if (allocator)
4140 pool = allocator->allocate_bindless_pool(num_sets, num_descriptors);
4141
4142 if (!pool)
4143 {
4144 LOGE("Failed to allocate bindless pool.\n");
4145 return BindlessDescriptorPoolHandle{ nullptr };
4146 }
4147
4148 auto *handle = handle_pool.bindless_descriptor_pool.allocate(this, allocator, pool);
4149 return BindlessDescriptorPoolHandle{ handle };
4150 }
4151
4152 void Device::fill_buffer_sharing_indices(VkBufferCreateInfo &info, uint32_t *sharing_indices)
4153 {
4154 if (graphics_queue_family_index != compute_queue_family_index ||
4155 graphics_queue_family_index != transfer_queue_family_index)
4156 {
4157 // For buffers, always just use CONCURRENT access modes,
4158 // so we don't have to deal with acquire/release barriers in async compute.
4159 info.sharingMode = VK_SHARING_MODE_CONCURRENT;
4160
4161 sharing_indices[info.queueFamilyIndexCount++] = graphics_queue_family_index;
4162
4163 if (graphics_queue_family_index != compute_queue_family_index)
4164 sharing_indices[info.queueFamilyIndexCount++] = compute_queue_family_index;
4165
4166 if (graphics_queue_family_index != transfer_queue_family_index &&
4167 compute_queue_family_index != transfer_queue_family_index)
4168 {
4169 sharing_indices[info.queueFamilyIndexCount++] = transfer_queue_family_index;
4170 }
4171
4172 info.pQueueFamilyIndices = sharing_indices;
4173 }
4174 }
4175
4176 BufferHandle Device::create_imported_host_buffer(const BufferCreateInfo &create_info, VkExternalMemoryHandleTypeFlagBits type, void *host_buffer)
4177 {
4178 if (create_info.domain != BufferDomain::Host &&
4179 create_info.domain != BufferDomain::CachedHost &&
4180 create_info.domain != BufferDomain::CachedCoherentHostPreferCached &&
4181 create_info.domain != BufferDomain::CachedCoherentHostPreferCoherent)
4182 {
4183 return BufferHandle{};
4184 }
4185
4186 if (!ext.supports_external_memory_host)
4187 return BufferHandle{};
4188
4189 if ((reinterpret_cast<uintptr_t>(host_buffer) & (ext.host_memory_properties.minImportedHostPointerAlignment - 1)) != 0)
4190 {
4191 LOGE("Host buffer is not aligned appropriately.\n");
4192 return BufferHandle{};
4193 }
4194
4195 VkMemoryHostPointerPropertiesEXT host_pointer_props = { VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT };
4196 if (table->vkGetMemoryHostPointerPropertiesEXT(device, type, host_buffer, &host_pointer_props) != VK_SUCCESS)
4197 {
4198 LOGE("Host pointer is not importable.\n");
4199 return BufferHandle{};
4200 }
4201
4202 VkBufferCreateInfo info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
4203 info.size = create_info.size;
4204 info.usage = create_info.usage;
4205 info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
4206
4207 uint32_t sharing_indices[3];
4208 fill_buffer_sharing_indices(info, sharing_indices);
4209
4210 VkBuffer buffer;
4211 VkMemoryRequirements reqs;
4212 if (table->vkCreateBuffer(device, &info, nullptr, &buffer) != VK_SUCCESS)
4213 return BufferHandle{};
4214
4215 table->vkGetBufferMemoryRequirements(device, buffer, &reqs);
4216
4217 reqs.memoryTypeBits &= host_pointer_props.memoryTypeBits;
4218 if (reqs.memoryTypeBits == 0)
4219 {
4220 LOGE("No compatible host pointer types are available.\n");
4221 table->vkDestroyBuffer(device, buffer, nullptr);
4222 return BufferHandle{};
4223 }
4224
4225 uint32_t memory_type = find_memory_type(create_info.domain, reqs.memoryTypeBits);
4226 if (memory_type == UINT32_MAX)
4227 {
4228 LOGE("Failed to find memory type.\n");
4229 table->vkDestroyBuffer(device, buffer, nullptr);
4230 return BufferHandle{};
4231 }
4232
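	// The imported allocation size must be a multiple of minImportedHostPointerAlignment, so round it up.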
4233 VkMemoryAllocateInfo alloc_info = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
4234 alloc_info.allocationSize = (create_info.size + ext.host_memory_properties.minImportedHostPointerAlignment - 1) &
4235 ~(ext.host_memory_properties.minImportedHostPointerAlignment - 1);
4236 alloc_info.memoryTypeIndex = memory_type;
4237
4238 VkImportMemoryHostPointerInfoEXT import = { VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT };
4239 import.handleType = type;
4240 import.pHostPointer = host_buffer;
4241 alloc_info.pNext = &import;
4242
4243 VkDeviceMemory memory;
4244 if (table->vkAllocateMemory(device, &alloc_info, nullptr, &memory) != VK_SUCCESS)
4245 {
4246 table->vkDestroyBuffer(device, buffer, nullptr);
4247 return BufferHandle{};
4248 }
4249
4250 auto allocation = DeviceAllocation::make_imported_allocation(memory, info.size, memory_type);
4251 if (table->vkMapMemory(device, memory, 0, VK_WHOLE_SIZE, 0, reinterpret_cast<void **>(&allocation.host_base)) != VK_SUCCESS)
4252 {
4253 allocation.free_immediate(managers.memory);
4254 table->vkDestroyBuffer(device, buffer, nullptr);
4255 return BufferHandle{};
4256 }
4257
4258 if (table->vkBindBufferMemory(device, buffer, memory, 0) != VK_SUCCESS)
4259 {
4260 allocation.free_immediate(managers.memory);
4261 table->vkDestroyBuffer(device, buffer, nullptr);
4262 return BufferHandle{};
4263 }
4264
4265 BufferHandle handle(handle_pool.buffers.allocate(this, buffer, allocation, create_info));
4266 return handle;
4267 }
4268
4269 BufferHandle Device::create_buffer(const BufferCreateInfo &create_info, const void *initial)
4270 {
4271 VkBuffer buffer;
4272 VkMemoryRequirements reqs;
4273 DeviceAllocation allocation;
4274
4275 bool zero_initialize = (create_info.misc & BUFFER_MISC_ZERO_INITIALIZE_BIT) != 0;
4276 if (initial && zero_initialize)
4277 {
4278 LOGE("Cannot initialize buffer with data and clear.\n");
4279 return BufferHandle{};
4280 }
4281
4282 VkBufferCreateInfo info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
4283 info.size = create_info.size;
4284 info.usage = create_info.usage | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
4285 info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
4286
4287 uint32_t sharing_indices[3];
4288 fill_buffer_sharing_indices(info, sharing_indices);
4289
4290 if (table->vkCreateBuffer(device, &info, nullptr, &buffer) != VK_SUCCESS)
4291 return BufferHandle(nullptr);
4292
4293 table->vkGetBufferMemoryRequirements(device, buffer, &reqs);
4294
4295 uint32_t memory_type = find_memory_type(create_info.domain, reqs.memoryTypeBits);
4296 if (memory_type == UINT32_MAX)
4297 {
4298 LOGE("Failed to find memory type.\n");
4299 table->vkDestroyBuffer(device, buffer, nullptr);
4300 return BufferHandle(nullptr);
4301 }
4302
4303 if (!managers.memory.allocate(reqs.size, reqs.alignment, memory_type, ALLOCATION_TILING_LINEAR, &allocation))
4304 {
4305 		// This memory type is rather scarce, so fall back to the Host type if we've exhausted it.
4306 if (create_info.domain == BufferDomain::LinkedDeviceHost)
4307 {
4308 LOGW("Exhausted LinkedDeviceHost memory, falling back to host.\n");
4309 memory_type = find_memory_type(BufferDomain::Host, reqs.memoryTypeBits);
4310 if (memory_type == UINT32_MAX)
4311 {
4312 LOGE("Failed to find memory type.\n");
4313 table->vkDestroyBuffer(device, buffer, nullptr);
4314 return BufferHandle(nullptr);
4315 }
4316
4317 if (!managers.memory.allocate(reqs.size, reqs.alignment, memory_type, ALLOCATION_TILING_LINEAR, &allocation))
4318 {
4319 table->vkDestroyBuffer(device, buffer, nullptr);
4320 return BufferHandle(nullptr);
4321 }
4322 }
4323 else
4324 {
4325 table->vkDestroyBuffer(device, buffer, nullptr);
4326 return BufferHandle(nullptr);
4327 }
4328 }
4329
4330 if (table->vkBindBufferMemory(device, buffer, allocation.get_memory(), allocation.get_offset()) != VK_SUCCESS)
4331 {
4332 allocation.free_immediate(managers.memory);
4333 table->vkDestroyBuffer(device, buffer, nullptr);
4334 return BufferHandle(nullptr);
4335 }
4336
4337 auto tmpinfo = create_info;
4338 tmpinfo.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
4339 BufferHandle handle(handle_pool.buffers.allocate(this, buffer, allocation, tmpinfo));
4340
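	// Device-only memory cannot be mapped, so initial contents go through a staging copy (or a GPU-side fill for zero-init).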
4341 if (create_info.domain == BufferDomain::Device && (initial || zero_initialize) && !memory_type_is_host_visible(memory_type))
4342 {
4343 CommandBufferHandle cmd;
4344 if (initial)
4345 {
4346 auto staging_info = create_info;
4347 staging_info.domain = BufferDomain::Host;
4348 auto staging_buffer = create_buffer(staging_info, initial);
4349 set_name(*staging_buffer, "buffer-upload-staging-buffer");
4350
4351 cmd = request_command_buffer(CommandBuffer::Type::AsyncTransfer);
4352 cmd->begin_region("copy-buffer-staging");
4353 cmd->copy_buffer(*handle, *staging_buffer);
4354 cmd->end_region();
4355 }
4356 else
4357 {
4358 cmd = request_command_buffer(CommandBuffer::Type::AsyncCompute);
4359 cmd->begin_region("fill-buffer-staging");
4360 cmd->fill_buffer(*handle, 0);
4361 cmd->end_region();
4362 }
4363
4364 LOCK();
4365 submit_staging(cmd, info.usage, true);
4366 }
4367 else if (initial || zero_initialize)
4368 {
4369 void *ptr = managers.memory.map_memory(allocation, MEMORY_ACCESS_WRITE_BIT, 0, allocation.get_size());
4370 if (!ptr)
4371 return BufferHandle(nullptr);
4372
4373 if (initial)
4374 memcpy(ptr, initial, create_info.size);
4375 else
4376 memset(ptr, 0, create_info.size);
4377 managers.memory.unmap_memory(allocation, MEMORY_ACCESS_WRITE_BIT, 0, allocation.get_size());
4378 }
4379 return handle;
4380 }
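
// Example (sketch): creating a device-local buffer with initial contents uploaded through the
// staging path above. Field names follow the BufferCreateInfo usage in create_buffer(); the
// vertex data is a placeholder.
//
//   BufferCreateInfo buf_info = {};
//   buf_info.domain = BufferDomain::Device;
//   buf_info.size = sizeof(vertex_data);
//   buf_info.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
//   BufferHandle vbo = device.create_buffer(buf_info, vertex_data);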
4381
4382 bool Device::memory_type_is_device_optimal(uint32_t type) const
4383 {
4384 return (mem_props.memoryTypes[type].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0;
4385 }
4386
4387 bool Device::memory_type_is_host_visible(uint32_t type) const
4388 {
4389 return (mem_props.memoryTypes[type].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0;
4390 }
4391
4392 void Device::get_format_properties(VkFormat format, VkFormatProperties *properties)
4393 {
4394 vkGetPhysicalDeviceFormatProperties(gpu, format, properties);
4395 }
4396
4397 bool Device::get_image_format_properties(VkFormat format, VkImageType type, VkImageTiling tiling,
4398 VkImageUsageFlags usage, VkImageCreateFlags flags,
4399 VkImageFormatProperties *properties)
4400 {
4401 auto res = vkGetPhysicalDeviceImageFormatProperties(gpu, format, type, tiling, usage, flags,
4402 properties);
4403 return res == VK_SUCCESS;
4404 }
4405
4406 bool Device::image_format_is_supported(VkFormat format, VkFormatFeatureFlags required, VkImageTiling tiling) const
4407 {
4408 VkFormatProperties props;
4409 vkGetPhysicalDeviceFormatProperties(gpu, format, &props);
4410 auto flags = tiling == VK_IMAGE_TILING_OPTIMAL ? props.optimalTilingFeatures : props.linearTilingFeatures;
4411 return (flags & required) == required;
4412 }
4413
4414 VkFormat Device::get_default_depth_stencil_format() const
4415 {
4416 if (image_format_is_supported(VK_FORMAT_D24_UNORM_S8_UINT, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4417 return VK_FORMAT_D24_UNORM_S8_UINT;
4418 if (image_format_is_supported(VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4419 return VK_FORMAT_D32_SFLOAT_S8_UINT;
4420
4421 return VK_FORMAT_UNDEFINED;
4422 }
4423
4424 VkFormat Device::get_default_depth_format() const
4425 {
4426 if (image_format_is_supported(VK_FORMAT_D32_SFLOAT, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4427 return VK_FORMAT_D32_SFLOAT;
4428 if (image_format_is_supported(VK_FORMAT_X8_D24_UNORM_PACK32, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4429 return VK_FORMAT_X8_D24_UNORM_PACK32;
4430 if (image_format_is_supported(VK_FORMAT_D16_UNORM, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4431 return VK_FORMAT_D16_UNORM;
4432
4433 return VK_FORMAT_UNDEFINED;
4434 }
4435
4436 uint64_t Device::allocate_cookie()
4437 {
4438 // Reserve lower bits for "special purposes".
4439 #ifdef GRANITE_VULKAN_MT
4440 return cookie.fetch_add(16, memory_order_relaxed) + 16;
4441 #else
4442 cookie += 16;
4443 return cookie;
4444 #endif
4445 }
4446
4447 const RenderPass &Device::request_render_pass(const RenderPassInfo &info, bool compatible)
4448 {
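	// Hash everything that affects render pass compatibility; load/store ops, clears and layouts
	// are only hashed further down when an exact (non-compatible) render pass is requested.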
4449 Hasher h;
4450 VkFormat formats[VULKAN_NUM_ATTACHMENTS];
4451 VkFormat depth_stencil;
4452 uint32_t lazy = 0;
4453 uint32_t optimal = 0;
4454
4455 for (unsigned i = 0; i < info.num_color_attachments; i++)
4456 {
4457 VK_ASSERT(info.color_attachments[i]);
4458 formats[i] = info.color_attachments[i]->get_format();
4459 if (info.color_attachments[i]->get_image().get_create_info().domain == ImageDomain::Transient)
4460 lazy |= 1u << i;
4461 if (info.color_attachments[i]->get_image().get_layout_type() == Layout::Optimal)
4462 optimal |= 1u << i;
4463
4464 // This can change external subpass dependencies, so it must always be hashed.
4465 h.u32(info.color_attachments[i]->get_image().get_swapchain_layout());
4466 }
4467
4468 if (info.depth_stencil)
4469 {
4470 if (info.depth_stencil->get_image().get_create_info().domain == ImageDomain::Transient)
4471 lazy |= 1u << info.num_color_attachments;
4472 if (info.depth_stencil->get_image().get_layout_type() == Layout::Optimal)
4473 optimal |= 1u << info.num_color_attachments;
4474 }
4475
4476 // For multiview, base layer is encoded into the view mask.
4477 if (info.num_layers > 1)
4478 {
4479 h.u32(info.base_layer);
4480 h.u32(info.num_layers);
4481 }
4482 else
4483 {
4484 h.u32(0);
4485 h.u32(info.num_layers);
4486 }
4487
4488 h.u32(info.num_subpasses);
4489 for (unsigned i = 0; i < info.num_subpasses; i++)
4490 {
4491 h.u32(info.subpasses[i].num_color_attachments);
4492 h.u32(info.subpasses[i].num_input_attachments);
4493 h.u32(info.subpasses[i].num_resolve_attachments);
4494 h.u32(static_cast<uint32_t>(info.subpasses[i].depth_stencil_mode));
4495 for (unsigned j = 0; j < info.subpasses[i].num_color_attachments; j++)
4496 h.u32(info.subpasses[i].color_attachments[j]);
4497 for (unsigned j = 0; j < info.subpasses[i].num_input_attachments; j++)
4498 h.u32(info.subpasses[i].input_attachments[j]);
4499 for (unsigned j = 0; j < info.subpasses[i].num_resolve_attachments; j++)
4500 h.u32(info.subpasses[i].resolve_attachments[j]);
4501 }
4502
4503 depth_stencil = info.depth_stencil ? info.depth_stencil->get_format() : VK_FORMAT_UNDEFINED;
4504 h.data(formats, info.num_color_attachments * sizeof(VkFormat));
4505 h.u32(info.num_color_attachments);
4506 h.u32(depth_stencil);
4507
4508 // Compatible render passes do not care about load/store, or image layouts.
4509 if (!compatible)
4510 {
4511 h.u32(info.op_flags);
4512 h.u32(info.clear_attachments);
4513 h.u32(info.load_attachments);
4514 h.u32(info.store_attachments);
4515 h.u32(optimal);
4516 }
4517
4518 // Lazy flag can change external subpass dependencies, which is not compatible.
4519 h.u32(lazy);
4520
4521 auto hash = h.get();
4522
4523 auto *ret = render_passes.find(hash);
4524 if (!ret)
4525 ret = render_passes.emplace_yield(hash, hash, this, info);
4526 return *ret;
4527 }
4528
4529 const Framebuffer &Device::request_framebuffer(const RenderPassInfo &info)
4530 {
4531 return framebuffer_allocator.request_framebuffer(info);
4532 }
4533
4534 ImageView &Device::get_transient_attachment(unsigned width, unsigned height, VkFormat format,
4535 unsigned index, unsigned samples, unsigned layers)
4536 {
4537 return transient_allocator.request_attachment(width, height, format, index, samples, layers);
4538 }
4539
4540 ImageView &Device::get_swapchain_view()
4541 {
4542 VK_ASSERT(wsi.index < wsi.swapchain.size());
4543 return wsi.swapchain[wsi.index]->get_view();
4544 }
4545
4546 ImageView &Device::get_swapchain_view(unsigned index)
4547 {
4548 VK_ASSERT(index < wsi.swapchain.size());
4549 return wsi.swapchain[index]->get_view();
4550 }
4551
4552 unsigned Device::get_num_frame_contexts() const
4553 {
4554 return unsigned(per_frame.size());
4555 }
4556
4557 unsigned Device::get_num_swapchain_images() const
4558 {
4559 return unsigned(wsi.swapchain.size());
4560 }
4561
4562 unsigned Device::get_swapchain_index() const
4563 {
4564 return wsi.index;
4565 }
4566
4567 unsigned Device::get_current_frame_context() const
4568 {
4569 return frame_context_index;
4570 }
4571
4572 RenderPassInfo Device::get_swapchain_render_pass(SwapchainRenderPass style)
4573 {
4574 RenderPassInfo info;
4575 info.num_color_attachments = 1;
4576 info.color_attachments[0] = &get_swapchain_view();
4577 info.clear_attachments = ~0u;
4578 info.store_attachments = 1u << 0;
4579
4580 switch (style)
4581 {
4582 case SwapchainRenderPass::Depth:
4583 {
4584 info.op_flags |= RENDER_PASS_OP_CLEAR_DEPTH_STENCIL_BIT;
4585 info.depth_stencil =
4586 &get_transient_attachment(wsi.swapchain[wsi.index]->get_create_info().width,
4587 wsi.swapchain[wsi.index]->get_create_info().height, get_default_depth_format());
4588 break;
4589 }
4590
4591 case SwapchainRenderPass::DepthStencil:
4592 {
4593 info.op_flags |= RENDER_PASS_OP_CLEAR_DEPTH_STENCIL_BIT;
4594 info.depth_stencil =
4595 &get_transient_attachment(wsi.swapchain[wsi.index]->get_create_info().width,
4596 wsi.swapchain[wsi.index]->get_create_info().height, get_default_depth_stencil_format());
4597 break;
4598 }
4599
4600 default:
4601 break;
4602 }
4603 return info;
4604 }
4605
4606 void Device::set_queue_lock(std::function<void()> lock_callback, std::function<void()> unlock_callback)
4607 {
4608 queue_lock_callback = move(lock_callback);
4609 queue_unlock_callback = move(unlock_callback);
4610 }
4611
4612 void Device::set_name(const Buffer &buffer, const char *name)
4613 {
4614 if (ext.supports_debug_utils)
4615 {
4616 VkDebugUtilsObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT };
4617 info.objectType = VK_OBJECT_TYPE_BUFFER;
4618 info.objectHandle = (uint64_t)buffer.get_buffer();
4619 info.pObjectName = name;
4620 if (vkSetDebugUtilsObjectNameEXT)
4621 vkSetDebugUtilsObjectNameEXT(device, &info);
4622 }
4623 else if (ext.supports_debug_marker)
4624 {
4625 VkDebugMarkerObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_NAME_INFO_EXT };
4626 info.objectType = VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT;
4627 info.object = (uint64_t)buffer.get_buffer();
4628 info.pObjectName = name;
4629 table->vkDebugMarkerSetObjectNameEXT(device, &info);
4630 }
4631 }
4632
set_name(const Image & image,const char * name)4633 void Device::set_name(const Image &image, const char *name)
4634 {
4635 if (ext.supports_debug_utils)
4636 {
4637 VkDebugUtilsObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT };
4638 info.objectType = VK_OBJECT_TYPE_IMAGE;
4639 info.objectHandle = (uint64_t)image.get_image();
4640 info.pObjectName = name;
4641 if (vkSetDebugUtilsObjectNameEXT)
4642 vkSetDebugUtilsObjectNameEXT(device, &info);
4643 }
4644 else if (ext.supports_debug_marker)
4645 {
4646 VkDebugMarkerObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_NAME_INFO_EXT };
4647 info.objectType = VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT;
4648 info.object = (uint64_t)image.get_image();
4649 info.pObjectName = name;
4650 table->vkDebugMarkerSetObjectNameEXT(device, &info);
4651 }
4652 }
4653
set_name(const CommandBuffer & cmd,const char * name)4654 void Device::set_name(const CommandBuffer &cmd, const char *name)
4655 {
4656 if (ext.supports_debug_utils)
4657 {
4658 VkDebugUtilsObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT };
4659 info.objectType = VK_OBJECT_TYPE_COMMAND_BUFFER;
4660 info.objectHandle = (uint64_t)cmd.get_command_buffer();
4661 info.pObjectName = name;
4662 if (vkSetDebugUtilsObjectNameEXT)
4663 vkSetDebugUtilsObjectNameEXT(device, &info);
4664 }
4665 else if (ext.supports_debug_marker)
4666 {
4667 VkDebugMarkerObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_NAME_INFO_EXT };
4668 info.objectType = VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT;
4669 info.object = (uint64_t)cmd.get_command_buffer();
4670 info.pObjectName = name;
4671 table->vkDebugMarkerSetObjectNameEXT(device, &info);
4672 }
4673 }
4674
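// Dumps the most recent checkpoints recorded via VK_NV_device_diagnostic_checkpoints.
// Primarily useful after a device-lost error to narrow down where the GPU faulted.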
void Device::report_checkpoints()
{
	if (!ext.supports_nv_device_diagnostic_checkpoints)
		return;

	uint32_t graphics_count;
	table->vkGetQueueCheckpointDataNV(graphics_queue, &graphics_count, nullptr);
	vector<VkCheckpointDataNV> graphics_data(graphics_count);
	for (auto &g : graphics_data)
		g.sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV;
	table->vkGetQueueCheckpointDataNV(graphics_queue, &graphics_count, graphics_data.data());

	uint32_t compute_count;
	table->vkGetQueueCheckpointDataNV(compute_queue, &compute_count, nullptr);
	vector<VkCheckpointDataNV> compute_data(compute_count);
	for (auto &g : compute_data)
		g.sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV;
	table->vkGetQueueCheckpointDataNV(compute_queue, &compute_count, compute_data.data());

	uint32_t transfer_count;
	table->vkGetQueueCheckpointDataNV(transfer_queue, &transfer_count, nullptr);
	vector<VkCheckpointDataNV> transfer_data(transfer_count);
	for (auto &g : transfer_data)
		g.sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV;
	table->vkGetQueueCheckpointDataNV(transfer_queue, &transfer_count, transfer_data.data());

	if (!graphics_data.empty())
	{
		LOGI("Checkpoints for graphics queue:\n");
		for (auto &g : graphics_data)
			LOGI(" Stage %u:\n%s\n", g.stage, static_cast<const char *>(g.pCheckpointMarker));
	}

	if (!compute_data.empty())
	{
		LOGI("Checkpoints for compute queue:\n");
		for (auto &g : compute_data)
			LOGI(" Stage %u:\n%s\n", g.stage, static_cast<const char *>(g.pCheckpointMarker));
	}

	if (!transfer_data.empty())
	{
		LOGI("Checkpoints for transfer queue:\n");
		for (auto &g : transfer_data)
			LOGI(" Stage %u:\n%s\n", g.stage, static_cast<const char *>(g.pCheckpointMarker));
	}
}

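// Rough flow for performance counters (a sketch only; counter names must come from the
// descriptions returned by query_available_performance_counters, and the queue type
// enumerant is assumed to be CommandBuffer::Type::Generic for the graphics queue):
//   uint32_t count;
//   const VkPerformanceCounterKHR *counters;
//   const VkPerformanceCounterDescriptionKHR *descs;
//   device.query_available_performance_counters(CommandBuffer::Type::Generic, &count, &counters, &descs);
//   device.init_performance_counters({ "some-counter-name" });
//   if (device.acquire_profiling())
//   {
//       // ... record, submit and read back profiled work ...
//       device.release_profiling();
//   }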
void Device::query_available_performance_counters(CommandBuffer::Type type, uint32_t *count,
                                                  const VkPerformanceCounterKHR **counters,
                                                  const VkPerformanceCounterDescriptionKHR **desc)
{
	auto &query_pool = get_performance_query_pool(type);
	*count = query_pool.get_num_counters();
	*counters = query_pool.get_available_counters();
	*desc = query_pool.get_available_counter_descs();
}

bool Device::init_performance_counters(const std::vector<std::string> &names)
{
	if (!graphics.performance_query_pool.init_counters(names))
		return false;

	if (compute_queue_family_index != graphics_queue_family_index &&
	    !compute.performance_query_pool.init_counters(names))
	{
		return false;
	}

	if (transfer_queue_family_index != compute_queue_family_index &&
	    transfer_queue_family_index != graphics_queue_family_index &&
	    !transfer.performance_query_pool.init_counters(names))
	{
		return false;
	}

	return true;
}

void Device::release_profiling()
{
	table->vkReleaseProfilingLockKHR(device);
}

bool Device::acquire_profiling()
{
	if (!ext.performance_query_features.performanceCounterQueryPools)
		return false;

	VkAcquireProfilingLockInfoKHR info = { VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR };
	info.timeout = UINT64_MAX;
	if (table->vkAcquireProfilingLockKHR(device, &info) != VK_SUCCESS)
	{
		LOGE("Failed to acquire profiling lock.\n");
		return false;
	}

	return true;
}

void Device::add_debug_channel_buffer(DebugChannelInterface *iface, std::string tag, Vulkan::BufferHandle buffer)
{
	buffer->set_internal_sync_object();
	LOCK();
	frame().debug_channels.push_back({ iface, std::move(tag), std::move(buffer) });
}

void Device::parse_debug_channel(const PerFrame::DebugChannel &channel)
{
	if (!channel.iface)
		return;

	size_t size = channel.buffer->get_create_info().size;
	if (size <= sizeof(uint32_t))
	{
		LOGE("Debug channel buffer is too small.\n");
		return;
	}

	auto *words = static_cast<const DebugChannelInterface::Word *>(map_host_buffer(*channel.buffer, MEMORY_ACCESS_READ_BIT));

	// Format for the debug channel.
	// Word 0: Atomic counter used by shader.
	// Word 1-*: [total message length, code, x, y, z, args]
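	// Example: a message with two payload arguments occupies 7 words:
	//   [7, code, x, y, z, arg0, arg1]
	// so words[0] >= 5 always holds for a well-formed message, and words[0] - 5
	// is the argument count passed on to DebugChannelInterface::message().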

	size -= sizeof(uint32_t);
	size /= sizeof(uint32_t);

	if (words[0].u32 > size)
	{
		LOGW("Debug channel overflowed and messages were dropped. Consider increasing debug channel size to at least %u bytes.\n",
		     unsigned((words[0].u32 + 1) * sizeof(uint32_t)));
	}

	words++;

	while (size != 0 && words[0].u32 >= 5 && words[0].u32 <= size)
	{
		channel.iface->message(channel.tag, words[1].u32,
		                       words[2].u32, words[3].u32, words[4].u32,
		                       words[0].u32 - 5, &words[5]);
		size -= words[0].u32;
		words += words[0].u32;
	}

	unmap_host_buffer(*channel.buffer, MEMORY_ACCESS_READ_BIT);
}

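// Sign-extension trick for computing a signed tick delta when fewer than 64 timestamp
// bits are valid: shifting both values up by (64 - valid_bits) aligns the valid bits
// with the top of the word, the unsigned subtraction then wraps correctly modulo
// 2^valid_bits, and the arithmetic shift back down sign-extends the result.
// Worked example with valid_bits = 36: start = 2^36 - 100, end = 50 (after a wrap)
// yields a delta of +150 rather than a huge negative number.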
static int64_t convert_to_signed_delta(uint64_t start_ticks, uint64_t end_ticks, unsigned valid_bits)
{
	unsigned shamt = 64 - valid_bits;
	start_ticks <<= shamt;
	end_ticks <<= shamt;
	auto ticks_delta = int64_t(end_ticks - start_ticks);
	ticks_delta >>= shamt;
	return ticks_delta;
}

double Device::convert_timestamp_delta(uint64_t start_ticks, uint64_t end_ticks) const
{
	int64_t ticks_delta = convert_to_signed_delta(start_ticks, end_ticks, timestamp_valid_bits);
	return double(ticks_delta) * gpu_props.limits.timestampPeriod * 1e-9;
}

uint64_t Device::update_wrapped_base_timestamp(uint64_t end_ticks)
{
	json_base_timestamp_value += convert_to_signed_delta(json_base_timestamp_value, end_ticks, timestamp_valid_bits);
	return json_base_timestamp_value;
}

bool Device::init_timestamp_trace(const char *path)
{
	// Use the Chrome tracing format. It's trivial to emit and we get a frontend for free :)
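	// The output is a Trace Event Format JSON array; viewers such as chrome://tracing
	// generally accept it even though the array is never explicitly terminated
	// (see JSONTraceFileDeleter below).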
	json_trace_file.reset();
	json_trace_file.reset(fopen(path, "w"));
	if (json_trace_file)
		fprintf(json_trace_file.get(), "[");
	return bool(json_trace_file);
}

int64_t Device::convert_timestamp_to_absolute_usec(uint64_t ts)
{
	// Ensure that we deal with timestamp wraparound correctly.
	// On some hardware, we have < 64 valid bits and the timestamp counters will wrap around at some interval.
	// As long as timestamps come in at a reasonably steady pace, we can deal with wraparound cleanly.
	ts = update_wrapped_base_timestamp(ts);
	if (json_timestamp_origin == 0)
		json_timestamp_origin = ts;

	auto delta_ts = int64_t(ts - json_timestamp_origin);
	auto us = int64_t(double(delta_ts) * gpu_props.limits.timestampPeriod * 1e-3);
	return us;
}

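// Emits a "B"/"E" (begin/end) duration event pair in the Trace Event Format.
// The frame index is used as the pid field so that each frame gets its own group
// in the viewer, while the caller-provided tid string names the track.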
void Device::write_json_timestamp_range(unsigned frame_index, const char *tid,
                                        const char *name, const char *extra,
                                        uint64_t start_ts, uint64_t end_ts,
                                        int64_t &min_us, int64_t &max_us)
{
	if (!json_trace_file)
		return;

	int64_t absolute_start = convert_timestamp_to_absolute_usec(start_ts);
	int64_t absolute_end = convert_timestamp_to_absolute_usec(end_ts);

	VK_ASSERT(absolute_start <= absolute_end);

	min_us = std::min(absolute_start, min_us);
	max_us = std::max(absolute_end, max_us);

	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s%s%s\", \"ph\": \"B\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, *extra != '\0' ? " " : "", extra, tid, frame_index, static_cast<long long>(absolute_start));
	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s%s%s\", \"ph\": \"E\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, *extra != '\0' ? " " : "", extra, tid, frame_index, static_cast<long long>(absolute_end));
}

void Device::write_json_timestamp_range_us(unsigned frame_index, const char *tid, const char *name, int64_t start_us, int64_t end_us)
{
	if (!json_trace_file)
		return;
	if (start_us > end_us)
		return;

	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s\", \"ph\": \"B\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, tid, frame_index, static_cast<long long>(start_us));
	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s\", \"ph\": \"E\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, tid, frame_index, static_cast<long long>(end_us));
}

void Device::JSONTraceFileDeleter::operator()(FILE *file)
{
	// The JSON array is intentionally left unterminated; that way every element,
	// including the last one, can be followed by "," and trace viewers still load it.
	if (file)
		fclose(file);
}

#ifdef GRANITE_VULKAN_FILESYSTEM
TextureManager &Device::get_texture_manager()
{
	return texture_manager;
}

ShaderManager &Device::get_shader_manager()
{
	return shader_manager;
}
#endif

#ifdef GRANITE_VULKAN_FILESYSTEM
void Device::init_shader_manager_cache()
{
	//if (!shader_manager.load_shader_cache("assets://shader_cache.json"))
	//	shader_manager.load_shader_cache("cache://shader_cache.json");
	shader_manager.load_shader_cache("assets://shader_cache.json");
}

void Device::flush_shader_manager_cache()
{
	shader_manager.save_shader_cache("cache://shader_cache.json");
}
#endif

const VolkDeviceTable &Device::get_device_table() const
{
	return *table;
}

#ifndef GRANITE_RENDERDOC_CAPTURE
bool Device::init_renderdoc_capture()
{
	LOGE("RenderDoc API capture is not enabled in this build.\n");
	return false;
}

void Device::begin_renderdoc_capture()
{
}

void Device::end_renderdoc_capture()
{
}
#endif

}