1 /* Copyright (c) 2017-2020 Hans-Kristian Arntzen
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining
4  * a copy of this software and associated documentation files (the
5  * "Software"), to deal in the Software without restriction, including
6  * without limitation the rights to use, copy, modify, merge, publish,
7  * distribute, sublicense, and/or sell copies of the Software, and to
8  * permit persons to whom the Software is furnished to do so, subject to
9  * the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "device.hpp"
24 #include "format.hpp"
25 #include "type_to_string.hpp"
26 #include "quirks.hpp"
27 #include "timer.hpp"
28 #include <algorithm>
29 #include <string.h>
30 #include <stdlib.h>
31 
32 #ifdef _WIN32
33 #define WIN32_LEAN_AND_MEAN
34 #include <windows.h>
35 #endif
36 
37 #ifdef GRANITE_VULKAN_FILESYSTEM
38 #include "string_helpers.hpp"
39 #endif
40 
41 #ifdef GRANITE_VULKAN_MT
42 #include "thread_id.hpp"
43 static unsigned get_thread_index()
44 {
45 	return Vulkan::get_current_thread_index();
46 }
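// LOCK() guards the device's internal state with a plain mutex.
// DRAIN_FRAME_LOCK() additionally waits until all outstanding command buffers
// have been submitted back to the device (lock.counter reaches zero).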
47 #define LOCK() std::lock_guard<std::mutex> holder__{lock.lock}
48 #define DRAIN_FRAME_LOCK() \
49 	std::unique_lock<std::mutex> holder__{lock.lock}; \
50 	lock.cond.wait(holder__, [&]() { \
51 		return lock.counter == 0; \
52 	})
53 #else
54 #define LOCK() ((void)0)
55 #define DRAIN_FRAME_LOCK() VK_ASSERT(lock.counter == 0)
56 static unsigned get_thread_index()
57 {
58 	return 0;
59 }
60 #endif
61 
62 using namespace std;
63 using namespace Util;
64 
65 namespace Vulkan
66 {
67 Device::Device()
68     : framebuffer_allocator(this)
69     , transient_allocator(this)
70 #ifdef GRANITE_VULKAN_FILESYSTEM
71 	, shader_manager(this)
72 	, texture_manager(this)
73 #endif
74 {
75 #ifdef GRANITE_VULKAN_MT
76 	cookie.store(0);
77 #endif
78 
79 	if (const char *env = getenv("GRANITE_TIMESTAMP_TRACE"))
80 	{
81 		LOGI("Tracing timestamps to %s.\n", env);
82 		if (!init_timestamp_trace(env))
83 			LOGE("Failed to init timestamp trace.\n");
84 	}
85 }
86 
87 Semaphore Device::request_legacy_semaphore()
88 {
89 	LOCK();
90 	auto semaphore = managers.semaphore.request_cleared_semaphore();
91 	Semaphore ptr(handle_pool.semaphores.allocate(this, semaphore, false));
92 	return ptr;
93 }
94 
95 Semaphore Device::request_external_semaphore(VkSemaphore semaphore, bool signalled)
96 {
97 	LOCK();
98 	VK_ASSERT(semaphore);
99 	Semaphore ptr(handle_pool.semaphores.allocate(this, semaphore, signalled));
100 	return ptr;
101 }
102 
103 #ifndef _WIN32
104 Semaphore Device::request_imported_semaphore(int fd, VkExternalSemaphoreHandleTypeFlagBitsKHR handle_type)
105 {
106 	LOCK();
107 	if (!ext.supports_external)
108 		return {};
109 
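	// Check that the implementation can actually import this handle type before creating a semaphore for it.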
110 	VkExternalSemaphorePropertiesKHR props = { VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR };
111 	VkPhysicalDeviceExternalSemaphoreInfoKHR info = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR };
112 	info.handleType = handle_type;
113 
114 	vkGetPhysicalDeviceExternalSemaphorePropertiesKHR(gpu, &info, &props);
115 	if ((props.externalSemaphoreFeatures & VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR) == 0)
116 		return Semaphore(nullptr);
117 
118 	auto semaphore = managers.semaphore.request_cleared_semaphore();
119 
120 	VkImportSemaphoreFdInfoKHR import = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR };
121 	import.fd = fd;
122 	import.semaphore = semaphore;
123 	import.handleType = handle_type;
124 	import.flags = VK_SEMAPHORE_IMPORT_TEMPORARY_BIT_KHR;
125 	Semaphore ptr(handle_pool.semaphores.allocate(this, semaphore, false));
126 
127 	if (table->vkImportSemaphoreFdKHR(device, &import) != VK_SUCCESS)
128 		return Semaphore(nullptr);
129 
130 	ptr->signal_external();
131 	ptr->destroy_on_consume();
132 	return ptr;
133 }
134 #endif
135 
136 void Device::add_wait_semaphore(CommandBuffer::Type type, Semaphore semaphore, VkPipelineStageFlags stages, bool flush)
137 {
138 	LOCK();
139 	add_wait_semaphore_nolock(type, semaphore, stages, flush);
140 }
141 
142 void Device::add_wait_semaphore_nolock(CommandBuffer::Type type, Semaphore semaphore, VkPipelineStageFlags stages,
143                                        bool flush)
144 {
145 	VK_ASSERT(stages != 0);
146 	if (flush)
147 		flush_frame(type);
148 	auto &data = get_queue_data(type);
149 
150 #ifdef VULKAN_DEBUG
151 	for (auto &sem : data.wait_semaphores)
152 		VK_ASSERT(sem.get() != semaphore.get());
153 #endif
154 
155 	semaphore->signal_pending_wait();
156 	data.wait_semaphores.push_back(semaphore);
157 	data.wait_stages.push_back(stages);
158 	data.need_fence = true;
159 
160 	// Sanity check.
161 	VK_ASSERT(data.wait_semaphores.size() < 16 * 1024);
162 }
163 
164 LinearHostImageHandle Device::create_linear_host_image(const LinearHostImageCreateInfo &info)
165 {
166 	if ((info.usage & ~VK_IMAGE_USAGE_SAMPLED_BIT) != 0)
167 		return LinearHostImageHandle(nullptr);
168 
169 	ImageCreateInfo create_info;
170 	create_info.width = info.width;
171 	create_info.height = info.height;
172 	create_info.domain =
173 			(info.flags & LINEAR_HOST_IMAGE_HOST_CACHED_BIT) != 0 ?
174 			ImageDomain::LinearHostCached :
175 			ImageDomain::LinearHost;
176 	create_info.levels = 1;
177 	create_info.layers = 1;
178 	create_info.initial_layout = VK_IMAGE_LAYOUT_GENERAL;
179 	create_info.format = info.format;
180 	create_info.samples = VK_SAMPLE_COUNT_1_BIT;
181 	create_info.usage = info.usage;
182 	create_info.type = VK_IMAGE_TYPE_2D;
183 
184 	if ((info.flags & LINEAR_HOST_IMAGE_REQUIRE_LINEAR_FILTER_BIT) != 0)
185 		create_info.misc |= IMAGE_MISC_VERIFY_FORMAT_FEATURE_SAMPLED_LINEAR_FILTER_BIT;
186 	if ((info.flags & LINEAR_HOST_IMAGE_IGNORE_DEVICE_LOCAL_BIT) != 0)
187 		create_info.misc |= IMAGE_MISC_LINEAR_IMAGE_IGNORE_DEVICE_LOCAL_BIT;
188 
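	// Try to create the image directly in host-visible, linearly tiled memory first.
	// If that fails, fall back below to an optimally tiled image plus a host staging buffer.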
189 	BufferHandle cpu_image;
190 	auto gpu_image = create_image(create_info);
191 	if (!gpu_image)
192 	{
193 		// Fall-back to staging buffer.
194 		create_info.domain = ImageDomain::Physical;
195 		create_info.initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
196 		create_info.misc = IMAGE_MISC_CONCURRENT_QUEUE_GRAPHICS_BIT | IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_TRANSFER_BIT;
197 		create_info.usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
198 		gpu_image = create_image(create_info);
199 		if (!gpu_image)
200 			return LinearHostImageHandle(nullptr);
201 
202 		BufferCreateInfo buffer;
203 		buffer.domain =
204 				(info.flags & LINEAR_HOST_IMAGE_HOST_CACHED_BIT) != 0 ?
205 				BufferDomain::CachedHost :
206 				BufferDomain::Host;
207 		buffer.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
208 		buffer.size = info.width * info.height * TextureFormatLayout::format_block_size(info.format, format_to_aspect_mask(info.format));
209 		cpu_image = create_buffer(buffer);
210 		if (!cpu_image)
211 			return LinearHostImageHandle(nullptr);
212 	}
213 	else
214 		gpu_image->set_layout(Layout::General);
215 
216 	return LinearHostImageHandle(handle_pool.linear_images.allocate(this, move(gpu_image), move(cpu_image), info.stages));
217 }
218 
219 void *Device::map_linear_host_image(const LinearHostImage &image, MemoryAccessFlags access)
220 {
221 	void *host = managers.memory.map_memory(image.get_host_visible_allocation(), access,
222 	                                        0, image.get_host_visible_allocation().get_size());
223 	return host;
224 }
225 
226 void Device::unmap_linear_host_image_and_sync(const LinearHostImage &image, MemoryAccessFlags access)
227 {
228 	managers.memory.unmap_memory(image.get_host_visible_allocation(), access,
229 	                             0, image.get_host_visible_allocation().get_size());
230 	if (image.need_staging_copy())
231 	{
232 		// Kinda icky fallback, shouldn't really be used on discrete cards.
233 		auto cmd = request_command_buffer(CommandBuffer::Type::AsyncTransfer);
234 		cmd->image_barrier(image.get_image(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
235 		                   VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0,
236 		                   VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
237 		cmd->copy_buffer_to_image(image.get_image(), image.get_host_visible_buffer(),
238 		                          0, {},
239 		                          { image.get_image().get_width(), image.get_image().get_height(), 1 },
240 		                          0, 0, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 });
241 
242 		// Don't care about dstAccessMask, semaphore takes care of everything.
243 		cmd->image_barrier(image.get_image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
244 		                   VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
245 		                   VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0);
246 
247 		Semaphore sem;
248 		submit(cmd, nullptr, 1, &sem);
249 
250 		// The queue type is an assumption. Should add some parameter for that.
251 		add_wait_semaphore(CommandBuffer::Type::Generic, sem, image.get_used_pipeline_stages(), true);
252 	}
253 }
254 
255 void *Device::map_host_buffer(const Buffer &buffer, MemoryAccessFlags access)
256 {
257 	void *host = managers.memory.map_memory(buffer.get_allocation(), access, 0, buffer.get_create_info().size);
258 	return host;
259 }
260 
261 void Device::unmap_host_buffer(const Buffer &buffer, MemoryAccessFlags access)
262 {
263 	managers.memory.unmap_memory(buffer.get_allocation(), access, 0, buffer.get_create_info().size);
264 }
265 
266 void *Device::map_host_buffer(const Buffer &buffer, MemoryAccessFlags access, VkDeviceSize offset, VkDeviceSize length)
267 {
268 	VK_ASSERT(offset + length <= buffer.get_create_info().size);
269 	void *host = managers.memory.map_memory(buffer.get_allocation(), access, offset, length);
270 	return host;
271 }
272 
273 void Device::unmap_host_buffer(const Buffer &buffer, MemoryAccessFlags access, VkDeviceSize offset, VkDeviceSize length)
274 {
275 	VK_ASSERT(offset + length <= buffer.get_create_info().size);
276 	managers.memory.unmap_memory(buffer.get_allocation(), access, offset, length);
277 }
278 
279 Shader *Device::request_shader(const uint32_t *data, size_t size)
280 {
281 	Util::Hasher hasher;
282 	hasher.data(data, size);
283 
284 	auto hash = hasher.get();
285 	auto *ret = shaders.find(hash);
286 	if (!ret)
287 		ret = shaders.emplace_yield(hash, hash, this, data, size);
288 	return ret;
289 }
290 
291 Shader *Device::request_shader_by_hash(Hash hash)
292 {
293 	return shaders.find(hash);
294 }
295 
296 Program *Device::request_program(Vulkan::Shader *compute_shader)
297 {
298 	if (!compute_shader)
299 		return nullptr;
300 
301 	Util::Hasher hasher;
302 	hasher.u64(compute_shader->get_hash());
303 
304 	auto hash = hasher.get();
305 	auto *ret = programs.find(hash);
306 	if (!ret)
307 		ret = programs.emplace_yield(hash, this, compute_shader);
308 	return ret;
309 }
310 
311 Program *Device::request_program(const uint32_t *compute_data, size_t compute_size)
312 {
313 	if (!compute_size)
314 		return nullptr;
315 
316 	auto *compute_shader = request_shader(compute_data, compute_size);
317 	return request_program(compute_shader);
318 }
319 
320 Program *Device::request_program(Shader *vertex, Shader *fragment)
321 {
322 	if (!vertex || !fragment)
323 		return nullptr;
324 
325 	Util::Hasher hasher;
326 	hasher.u64(vertex->get_hash());
327 	hasher.u64(fragment->get_hash());
328 
329 	auto hash = hasher.get();
330 	auto *ret = programs.find(hash);
331 
332 	if (!ret)
333 		ret = programs.emplace_yield(hash, this, vertex, fragment);
334 	return ret;
335 }
336 
337 Program *Device::request_program(const uint32_t *vertex_data, size_t vertex_size, const uint32_t *fragment_data,
338                                  size_t fragment_size)
339 {
340 	if (!vertex_size || !fragment_size)
341 		return nullptr;
342 
343 	auto *vertex = request_shader(vertex_data, vertex_size);
344 	auto *fragment = request_shader(fragment_data, fragment_size);
345 	return request_program(vertex, fragment);
346 }
347 
348 PipelineLayout *Device::request_pipeline_layout(const CombinedResourceLayout &layout)
349 {
350 	Hasher h;
351 	h.data(reinterpret_cast<const uint32_t *>(layout.sets), sizeof(layout.sets));
352 	h.data(&layout.stages_for_bindings[0][0], sizeof(layout.stages_for_bindings));
353 	h.u32(layout.push_constant_range.stageFlags);
354 	h.u32(layout.push_constant_range.size);
355 	h.data(layout.spec_constant_mask, sizeof(layout.spec_constant_mask));
356 	h.u32(layout.attribute_mask);
357 	h.u32(layout.render_target_mask);
358 
359 	auto hash = h.get();
360 	auto *ret = pipeline_layouts.find(hash);
361 	if (!ret)
362 		ret = pipeline_layouts.emplace_yield(hash, hash, this, layout);
363 	return ret;
364 }
365 
366 DescriptorSetAllocator *Device::request_descriptor_set_allocator(const DescriptorSetLayout &layout, const uint32_t *stages_for_bindings)
367 {
368 	Hasher h;
369 	h.data(reinterpret_cast<const uint32_t *>(&layout), sizeof(layout));
370 	h.data(stages_for_bindings, sizeof(uint32_t) * VULKAN_NUM_BINDINGS);
371 	auto hash = h.get();
372 
373 	auto *ret = descriptor_set_allocators.find(hash);
374 	if (!ret)
375 		ret = descriptor_set_allocators.emplace_yield(hash, hash, this, layout, stages_for_bindings);
376 	return ret;
377 }
378 
379 void Device::bake_program(Program &program)
380 {
381 	CombinedResourceLayout layout;
382 	if (program.get_shader(ShaderStage::Vertex))
383 		layout.attribute_mask = program.get_shader(ShaderStage::Vertex)->get_layout().input_mask;
384 	if (program.get_shader(ShaderStage::Fragment))
385 		layout.render_target_mask = program.get_shader(ShaderStage::Fragment)->get_layout().output_mask;
386 
387 	layout.descriptor_set_mask = 0;
388 
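	// Merge the reflected resource layout of every shader stage in the program
	// into one combined layout, tracking which stages touch each set and binding.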
389 	for (unsigned i = 0; i < static_cast<unsigned>(ShaderStage::Count); i++)
390 	{
391 		auto *shader = program.get_shader(static_cast<ShaderStage>(i));
392 		if (!shader)
393 			continue;
394 
395 		uint32_t stage_mask = 1u << i;
396 
397 		auto &shader_layout = shader->get_layout();
398 		for (unsigned set = 0; set < VULKAN_NUM_DESCRIPTOR_SETS; set++)
399 		{
400 			layout.sets[set].sampled_image_mask |= shader_layout.sets[set].sampled_image_mask;
401 			layout.sets[set].storage_image_mask |= shader_layout.sets[set].storage_image_mask;
402 			layout.sets[set].uniform_buffer_mask |= shader_layout.sets[set].uniform_buffer_mask;
403 			layout.sets[set].storage_buffer_mask |= shader_layout.sets[set].storage_buffer_mask;
404 			layout.sets[set].sampled_buffer_mask |= shader_layout.sets[set].sampled_buffer_mask;
405 			layout.sets[set].input_attachment_mask |= shader_layout.sets[set].input_attachment_mask;
406 			layout.sets[set].sampler_mask |= shader_layout.sets[set].sampler_mask;
407 			layout.sets[set].separate_image_mask |= shader_layout.sets[set].separate_image_mask;
408 			layout.sets[set].fp_mask |= shader_layout.sets[set].fp_mask;
409 
410 			for_each_bit(shader_layout.sets[set].immutable_sampler_mask, [&](uint32_t binding) {
411 				StockSampler sampler = get_immutable_sampler(shader_layout.sets[set], binding);
412 
413 				// Do we already have an immutable sampler? Make sure it matches the layout.
414 				if (has_immutable_sampler(layout.sets[set], binding))
415 				{
416 					if (sampler != get_immutable_sampler(layout.sets[set], binding))
417 						LOGE("Immutable sampler mismatch detected!\n");
418 				}
419 
420 				set_immutable_sampler(layout.sets[set], binding, sampler);
421 			});
422 
423 			uint32_t active_binds =
424 					shader_layout.sets[set].sampled_image_mask |
425 					shader_layout.sets[set].storage_image_mask |
426 					shader_layout.sets[set].uniform_buffer_mask |
427 					shader_layout.sets[set].storage_buffer_mask |
428 					shader_layout.sets[set].sampled_buffer_mask |
429 					shader_layout.sets[set].input_attachment_mask |
430 					shader_layout.sets[set].sampler_mask |
431 					shader_layout.sets[set].separate_image_mask;
432 
433 			if (active_binds)
434 				layout.stages_for_sets[set] |= stage_mask;
435 
436 			for_each_bit(active_binds, [&](uint32_t bit) {
437 				layout.stages_for_bindings[set][bit] |= stage_mask;
438 
439 				auto &combined_size = layout.sets[set].array_size[bit];
440 				auto &shader_size = shader_layout.sets[set].array_size[bit];
441 				if (combined_size && combined_size != shader_size)
442 					LOGE("Mismatch between array sizes in different shaders.\n");
443 				else
444 					combined_size = shader_size;
445 			});
446 		}
447 
448 		// Merge push constant ranges into one range.
449 		// Do not try to split into multiple ranges as it just complicates things for no obvious gain.
450 		if (shader_layout.push_constant_size != 0)
451 		{
452 			layout.push_constant_range.stageFlags |= 1u << i;
453 			layout.push_constant_range.size =
454 					std::max(layout.push_constant_range.size, shader_layout.push_constant_size);
455 		}
456 
457 		layout.spec_constant_mask[i] = shader_layout.spec_constant_mask;
458 		layout.combined_spec_constant_mask |= shader_layout.spec_constant_mask;
459 		layout.bindless_descriptor_set_mask |= shader_layout.bindless_set_mask;
460 	}
461 
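	// Post-process each active set: bindless (UNSIZED_ARRAY) bindings are promoted to
	// VK_SHADER_STAGE_ALL, bindings without an explicit array size default to 1, and
	// binding arrays that overlap other bindings are reported.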
462 	for (unsigned set = 0; set < VULKAN_NUM_DESCRIPTOR_SETS; set++)
463 	{
464 		if (layout.stages_for_sets[set] != 0)
465 		{
466 			layout.descriptor_set_mask |= 1u << set;
467 
468 			for (unsigned binding = 0; binding < VULKAN_NUM_BINDINGS; binding++)
469 			{
470 				auto &array_size = layout.sets[set].array_size[binding];
471 				if (array_size == DescriptorSetLayout::UNSIZED_ARRAY)
472 				{
473 					for (unsigned i = 1; i < VULKAN_NUM_BINDINGS; i++)
474 					{
475 						if (layout.stages_for_bindings[set][i] != 0)
476 							LOGE("Using bindless for set = %u, but binding = %u has a descriptor attached to it.\n", set, i);
477 					}
478 
479 					// Allows us to have one unified descriptor set layout for bindless.
480 					layout.stages_for_bindings[set][binding] = VK_SHADER_STAGE_ALL;
481 				}
482 				else if (array_size == 0)
483 				{
484 					array_size = 1;
485 				}
486 				else
487 				{
488 					for (unsigned i = 1; i < array_size; i++)
489 					{
490 						if (layout.stages_for_bindings[set][binding + i] != 0)
491 						{
492 							LOGE("Detected binding aliasing for (%u, %u). Binding array with %u elements starting at (%u, %u) overlaps.\n",
493 							     set, binding + i, array_size, set, binding);
494 						}
495 					}
496 				}
497 			}
498 		}
499 	}
500 
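	// Hash only the push constant range; this identifies layouts that are
	// push-constant compatible with each other.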
501 	Hasher h;
502 	h.u32(layout.push_constant_range.stageFlags);
503 	h.u32(layout.push_constant_range.size);
504 	layout.push_constant_layout_hash = h.get();
505 	program.set_pipeline_layout(request_pipeline_layout(layout));
506 }
507 
508 bool Device::init_pipeline_cache(const uint8_t *data, size_t size)
509 {
510 	static const auto uuid_size = sizeof(gpu_props.pipelineCacheUUID);
511 
512 	VkPipelineCacheCreateInfo info = { VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO };
513 	if (!data || size < uuid_size)
514 	{
515 		LOGI("Creating a fresh pipeline cache.\n");
516 	}
517 	else if (memcmp(data, gpu_props.pipelineCacheUUID, uuid_size) != 0)
518 	{
519 		LOGI("Pipeline cache UUID changed.\n");
520 	}
521 	else
522 	{
523 		info.initialDataSize = size - uuid_size;
524 		info.pInitialData = data + uuid_size;
525 		LOGI("Initializing pipeline cache.\n");
526 	}
527 
528 	if (pipeline_cache != VK_NULL_HANDLE)
529 		table->vkDestroyPipelineCache(device, pipeline_cache, nullptr);
530 	pipeline_cache = VK_NULL_HANDLE;
531 	return table->vkCreatePipelineCache(device, &info, nullptr, &pipeline_cache) == VK_SUCCESS;
532 }
533 
534 static inline char to_hex(uint8_t v)
535 {
536 	if (v < 10)
537 		return char('0' + v);
538 	else
539 		return char('a' + (v - 10));
540 }
541 
542 string Device::get_pipeline_cache_string() const
543 {
544 	string res;
545 	res.reserve(sizeof(gpu_props.pipelineCacheUUID) * 2);
546 
547 	for (auto &c : gpu_props.pipelineCacheUUID)
548 	{
549 		res += to_hex(uint8_t((c >> 4) & 0xf));
550 		res += to_hex(uint8_t(c & 0xf));
551 	}
552 
553 	return res;
554 }
555 
556 void Device::init_pipeline_cache()
557 {
558 #ifdef GRANITE_VULKAN_FILESYSTEM
559 	auto file = Granite::Global::filesystem()->open(Util::join("cache://pipeline_cache_", get_pipeline_cache_string(), ".bin"),
560 	                                                Granite::FileMode::ReadOnly);
561 	if (file)
562 	{
563 		auto size = file->get_size();
564 		auto *mapped = static_cast<uint8_t *>(file->map());
565 		if (mapped && !init_pipeline_cache(mapped, size))
566 			LOGE("Failed to initialize pipeline cache.\n");
567 	}
568 	else if (!init_pipeline_cache(nullptr, 0))
569 		LOGE("Failed to initialize pipeline cache.\n");
570 #endif
571 }
572 
573 size_t Device::get_pipeline_cache_size()
574 {
575 	if (pipeline_cache == VK_NULL_HANDLE)
576 		return 0;
577 
578 	static const auto uuid_size = sizeof(gpu_props.pipelineCacheUUID);
579 	size_t size = 0;
580 	if (table->vkGetPipelineCacheData(device, pipeline_cache, &size, nullptr) != VK_SUCCESS)
581 	{
582 		LOGE("Failed to get pipeline cache data.\n");
583 		return 0;
584 	}
585 
586 	return size + uuid_size;
587 }
588 
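// Serialized cache layout: gpu_props.pipelineCacheUUID followed by the raw blob from
// vkGetPipelineCacheData(). The UUID prefix lets init_pipeline_cache() reject blobs
// produced by a different driver or GPU.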
589 bool Device::get_pipeline_cache_data(uint8_t *data, size_t size)
590 {
591 	if (pipeline_cache == VK_NULL_HANDLE)
592 		return false;
593 
594 	static const auto uuid_size = sizeof(gpu_props.pipelineCacheUUID);
595 	if (size < uuid_size)
596 		return false;
597 
598 	size -= uuid_size;
599 	memcpy(data, gpu_props.pipelineCacheUUID, uuid_size);
600 	data += uuid_size;
601 
602 	if (table->vkGetPipelineCacheData(device, pipeline_cache, &size, data) != VK_SUCCESS)
603 	{
604 		LOGE("Failed to get pipeline cache data.\n");
605 		return false;
606 	}
607 
608 	return true;
609 }
610 
611 void Device::flush_pipeline_cache()
612 {
613 #ifdef GRANITE_VULKAN_FILESYSTEM
614 	size_t size = get_pipeline_cache_size();
615 	if (!size)
616 	{
617 		LOGE("Failed to get pipeline cache size.\n");
618 		return;
619 	}
620 
621 	auto file = Granite::Global::filesystem()->open(Util::join("cache://pipeline_cache_", get_pipeline_cache_string(), ".bin"),
622 	                                                Granite::FileMode::WriteOnly);
623 	if (!file)
624 	{
625 		LOGE("Failed to open pipeline cache file for writing.\n");
626 		return;
627 	}
628 
629 	uint8_t *data = static_cast<uint8_t *>(file->map_write(size));
630 	if (!data)
631 	{
632 		LOGE("Failed to map pipeline cache file for writing.\n");
633 		return;
634 	}
635 
636 	if (!get_pipeline_cache_data(data, size))
637 	{
638 		LOGE("Failed to get pipeline cache data.\n");
639 		return;
640 	}
641 #endif
642 }
643 
644 void Device::init_workarounds()
645 {
646 	workarounds = {};
647 
648 #ifdef __APPLE__
649 	// Events are not supported in MoltenVK.
650 	workarounds.emulate_event_as_pipeline_barrier = true;
651 	LOGW("Emulating events as pipeline barriers on Metal emulation.\n");
652 #else
653 	if (gpu_props.vendorID == VENDOR_ID_NVIDIA &&
654 #ifdef _WIN32
655 	    VK_VERSION_MAJOR(gpu_props.driverVersion) < 417)
656 #else
657 	    VK_VERSION_MAJOR(gpu_props.driverVersion) < 415)
658 #endif
659 	{
660 		workarounds.force_store_in_render_pass = true;
661 		LOGW("Detected workaround for render pass STORE_OP_STORE.\n");
662 	}
663 
664 	if (gpu_props.vendorID == VENDOR_ID_QCOM)
665 	{
666 		// Apparently, we need to use STORE_OP_STORE in all render passes no matter what ...
667 		workarounds.force_store_in_render_pass = true;
668 		workarounds.broken_color_write_mask = true;
669 		LOGW("Detected workaround for render pass STORE_OP_STORE.\n");
670 		LOGW("Detected workaround for broken color write masks.\n");
671 	}
672 
673 	// UNDEFINED -> COLOR_ATTACHMENT_OPTIMAL stalls, so need to acquire async.
674 	if (gpu_props.vendorID == VENDOR_ID_ARM)
675 	{
676 		LOGW("Workaround applied: Acquiring WSI images early on Mali.\n");
677 		LOGW("Workaround applied: Emulating events as pipeline barriers.\n");
678 		LOGW("Workaround applied: Optimize ALL_GRAPHICS_BIT barriers.\n");
679 
680 		// All performance related workarounds.
681 		workarounds.wsi_acquire_barrier_is_expensive = true;
682 		workarounds.emulate_event_as_pipeline_barrier = true;
683 		workarounds.optimize_all_graphics_barrier = true;
684 	}
685 #endif
686 }
687 
688 void Device::set_context(const Context &context)
689 {
690 	table = &context.get_device_table();
691 
692 #ifdef GRANITE_VULKAN_MT
693 	register_thread_index(0);
694 #endif
695 	instance = context.get_instance();
696 	gpu = context.get_gpu();
697 	device = context.get_device();
698 	num_thread_indices = context.get_num_thread_indices();
699 
700 	graphics_queue_family_index = context.get_graphics_queue_family();
701 	graphics_queue = context.get_graphics_queue();
702 	compute_queue_family_index = context.get_compute_queue_family();
703 	compute_queue = context.get_compute_queue();
704 	transfer_queue_family_index = context.get_transfer_queue_family();
705 	transfer_queue = context.get_transfer_queue();
706 	timestamp_valid_bits = context.get_timestamp_valid_bits();
707 
708 	mem_props = context.get_mem_props();
709 	gpu_props = context.get_gpu_props();
710 	ext = context.get_enabled_device_features();
711 
712 	init_workarounds();
713 
714 	init_stock_samplers();
715 	init_pipeline_cache();
716 
717 	init_timeline_semaphores();
718 	init_bindless();
719 
720 #ifdef ANDROID
721 	init_frame_contexts(3); // Android needs a bit more ... ;)
722 #else
723 	init_frame_contexts(2); // By default, regular double buffer between CPU and GPU.
724 #endif
725 
726 	managers.memory.init(this);
727 	managers.memory.set_supports_dedicated_allocation(ext.supports_dedicated);
728 	managers.semaphore.init(this);
729 	managers.fence.init(this);
730 	managers.event.init(this);
731 	managers.vbo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
732 	                  ImplementationQuirks::get().staging_need_device_local);
733 	managers.ibo.init(this, 4 * 1024, 16, VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
734 	                  ImplementationQuirks::get().staging_need_device_local);
735 	managers.ubo.init(this, 256 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.minUniformBufferOffsetAlignment),
736 	                  VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
737 	                  ImplementationQuirks::get().staging_need_device_local);
738 	managers.ubo.set_spill_region_size(VULKAN_MAX_UBO_SIZE);
739 	managers.staging.init(this, 64 * 1024, std::max<VkDeviceSize>(16u, gpu_props.limits.optimalBufferCopyOffsetAlignment),
740 	                      VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
741 	                      false);
742 
743 	graphics.performance_query_pool.init_device(this, graphics_queue_family_index);
744 	if (graphics_queue_family_index != compute_queue_family_index)
745 		compute.performance_query_pool.init_device(this, compute_queue_family_index);
746 	if (graphics_queue_family_index != transfer_queue_family_index &&
747 	    compute_queue_family_index != transfer_queue_family_index)
748 	{
749 		transfer.performance_query_pool.init_device(this, transfer_queue_family_index);
750 	}
751 
752 #ifdef GRANITE_VULKAN_FOSSILIZE
753 	init_pipeline_state();
754 #endif
755 #ifdef GRANITE_VULKAN_FILESYSTEM
756 	init_shader_manager_cache();
757 #endif
758 
759 	init_calibrated_timestamps();
760 }
761 
762 void Device::init_bindless()
763 {
764 	if (!ext.supports_descriptor_indexing)
765 		return;
766 
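	// Two shared bindless layouts for sampled images: binding 0 is an unsized array,
	// with one allocator for integer formats and one (fp_mask set) for float formats.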
767 	DescriptorSetLayout layout;
768 
769 	layout.array_size[0] = DescriptorSetLayout::UNSIZED_ARRAY;
770 	for (unsigned i = 1; i < VULKAN_NUM_BINDINGS; i++)
771 		layout.array_size[i] = 1;
772 
773 	layout.separate_image_mask = 1;
774 	uint32_t stages_for_sets[VULKAN_NUM_BINDINGS] = { VK_SHADER_STAGE_ALL };
775 	bindless_sampled_image_allocator_integer = request_descriptor_set_allocator(layout, stages_for_sets);
776 	layout.fp_mask = 1;
777 	bindless_sampled_image_allocator_fp = request_descriptor_set_allocator(layout, stages_for_sets);
778 }
779 
780 void Device::init_timeline_semaphores()
781 {
782 	if (!ext.timeline_semaphore_features.timelineSemaphore)
783 		return;
784 
785 	VkSemaphoreTypeCreateInfoKHR type_info = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR };
786 	VkSemaphoreCreateInfo info = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO };
787 	info.pNext = &type_info;
788 	type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR;
789 	type_info.initialValue = 0;
790 	if (table->vkCreateSemaphore(device, &info, nullptr, &graphics.timeline_semaphore) != VK_SUCCESS)
791 		LOGE("Failed to create timeline semaphore.\n");
792 	if (table->vkCreateSemaphore(device, &info, nullptr, &compute.timeline_semaphore) != VK_SUCCESS)
793 		LOGE("Failed to create timeline semaphore.\n");
794 	if (table->vkCreateSemaphore(device, &info, nullptr, &transfer.timeline_semaphore) != VK_SUCCESS)
795 		LOGE("Failed to create timeline semaphore.\n");
796 }
797 
798 void Device::init_stock_samplers()
799 {
800 	if (ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
801 	{
802 		for (auto &sampler : samplers_ycbcr)
803 		{
804 			if (sampler)
805 				table->vkDestroySamplerYcbcrConversion(device, sampler, nullptr);
806 			sampler = VK_NULL_HANDLE;
807 		}
808 
809 		VkSamplerYcbcrConversionCreateInfo info = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO };
810 		info.ycbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709;
811 		info.ycbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_NARROW;
812 		info.components = {
813 			VK_COMPONENT_SWIZZLE_IDENTITY,
814 			VK_COMPONENT_SWIZZLE_IDENTITY,
815 			VK_COMPONENT_SWIZZLE_IDENTITY,
816 			VK_COMPONENT_SWIZZLE_IDENTITY,
817 		};
818 		info.chromaFilter = VK_FILTER_LINEAR;
819 		info.xChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
820 		info.yChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
821 		info.forceExplicitReconstruction = VK_FALSE;
822 
823 		info.format = VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
824 		table->vkCreateSamplerYcbcrConversionKHR(device, &info, nullptr,
825 		                                         &samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV420P_3PLANE)]);
826 
827 		info.format = VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM;
828 		table->vkCreateSamplerYcbcrConversionKHR(device, &info, nullptr,
829 		                                         &samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV422P_3PLANE)]);
830 
831 		info.format = VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM;
832 		table->vkCreateSamplerYcbcrConversionKHR(device, &info, nullptr,
833 		                                         &samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV444P_3PLANE)]);
834 	}
835 
836 	SamplerCreateInfo info = {};
837 	info.max_lod = VK_LOD_CLAMP_NONE;
838 	info.max_anisotropy = 1.0f;
839 
840 	for (unsigned i = 0; i < static_cast<unsigned>(StockSampler::Count); i++)
841 	{
842 		auto mode = static_cast<StockSampler>(i);
843 
844 		switch (mode)
845 		{
846 		case StockSampler::NearestShadow:
847 		case StockSampler::LinearShadow:
848 			info.compare_enable = true;
849 			info.compare_op = VK_COMPARE_OP_LESS_OR_EQUAL;
850 			break;
851 
852 		default:
853 			info.compare_enable = false;
854 			break;
855 		}
856 
857 		switch (mode)
858 		{
859 		case StockSampler::TrilinearClamp:
860 		case StockSampler::TrilinearWrap:
861 			info.mipmap_mode = VK_SAMPLER_MIPMAP_MODE_LINEAR;
862 			break;
863 
864 		default:
865 			info.mipmap_mode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
866 			break;
867 		}
868 
869 		switch (mode)
870 		{
871 		case StockSampler::LinearClamp:
872 		case StockSampler::LinearWrap:
873 		case StockSampler::TrilinearClamp:
874 		case StockSampler::TrilinearWrap:
875 		case StockSampler::LinearShadow:
876 		case StockSampler::LinearYUV420P:
877 		case StockSampler::LinearYUV422P:
878 		case StockSampler::LinearYUV444P:
879 			info.mag_filter = VK_FILTER_LINEAR;
880 			info.min_filter = VK_FILTER_LINEAR;
881 			break;
882 
883 		default:
884 			info.mag_filter = VK_FILTER_NEAREST;
885 			info.min_filter = VK_FILTER_NEAREST;
886 			break;
887 		}
888 
889 		switch (mode)
890 		{
891 		default:
892 		case StockSampler::LinearWrap:
893 		case StockSampler::NearestWrap:
894 		case StockSampler::TrilinearWrap:
895 			info.address_mode_u = VK_SAMPLER_ADDRESS_MODE_REPEAT;
896 			info.address_mode_v = VK_SAMPLER_ADDRESS_MODE_REPEAT;
897 			info.address_mode_w = VK_SAMPLER_ADDRESS_MODE_REPEAT;
898 			break;
899 
900 		case StockSampler::LinearClamp:
901 		case StockSampler::NearestClamp:
902 		case StockSampler::TrilinearClamp:
903 		case StockSampler::NearestShadow:
904 		case StockSampler::LinearShadow:
905 		case StockSampler::LinearYUV420P:
906 		case StockSampler::LinearYUV422P:
907 		case StockSampler::LinearYUV444P:
908 			info.address_mode_u = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
909 			info.address_mode_v = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
910 			info.address_mode_w = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
911 			break;
912 		}
913 
914 		samplers[i] = create_sampler(info, mode);
915 	}
916 }
917 
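// Retire the current block before requesting a new one: flush its mapping, send untouched
// full-size blocks straight back to the pool, recycle used full-size blocks with the frame,
// and queue blocks that still need a CPU -> GPU copy on the DMA list.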
918 static void request_block(Device &device, BufferBlock &block, VkDeviceSize size,
919                           BufferPool &pool, std::vector<BufferBlock> *dma, std::vector<BufferBlock> &recycle)
920 {
921 	if (block.mapped)
922 		device.unmap_host_buffer(*block.cpu, MEMORY_ACCESS_WRITE_BIT);
923 
924 	if (block.offset == 0)
925 	{
926 		if (block.size == pool.get_block_size())
927 			pool.recycle_block(move(block));
928 	}
929 	else
930 	{
931 		if (block.cpu != block.gpu)
932 		{
933 			VK_ASSERT(dma);
934 			dma->push_back(block);
935 		}
936 
937 		if (block.size == pool.get_block_size())
938 			recycle.push_back(block);
939 	}
940 
941 	if (size)
942 		block = pool.request_block(size);
943 	else
944 		block = {};
945 }
946 
947 void Device::request_vertex_block(BufferBlock &block, VkDeviceSize size)
948 {
949 	LOCK();
950 	request_vertex_block_nolock(block, size);
951 }
952 
953 void Device::request_vertex_block_nolock(BufferBlock &block, VkDeviceSize size)
954 {
955 	request_block(*this, block, size, managers.vbo, &dma.vbo, frame().vbo_blocks);
956 }
957 
958 void Device::request_index_block(BufferBlock &block, VkDeviceSize size)
959 {
960 	LOCK();
961 	request_index_block_nolock(block, size);
962 }
963 
964 void Device::request_index_block_nolock(BufferBlock &block, VkDeviceSize size)
965 {
966 	request_block(*this, block, size, managers.ibo, &dma.ibo, frame().ibo_blocks);
967 }
968 
969 void Device::request_uniform_block(BufferBlock &block, VkDeviceSize size)
970 {
971 	LOCK();
972 	request_uniform_block_nolock(block, size);
973 }
974 
975 void Device::request_uniform_block_nolock(BufferBlock &block, VkDeviceSize size)
976 {
977 	request_block(*this, block, size, managers.ubo, &dma.ubo, frame().ubo_blocks);
978 }
979 
980 void Device::request_staging_block(BufferBlock &block, VkDeviceSize size)
981 {
982 	LOCK();
983 	request_staging_block_nolock(block, size);
984 }
985 
986 void Device::request_staging_block_nolock(BufferBlock &block, VkDeviceSize size)
987 {
988 	request_block(*this, block, size, managers.staging, nullptr, frame().staging_blocks);
989 }
990 
991 void Device::submit(CommandBufferHandle &cmd, Fence *fence, unsigned semaphore_count, Semaphore *semaphores)
992 {
993 	cmd->end_debug_channel();
994 
995 	LOCK();
996 	submit_nolock(move(cmd), fence, semaphore_count, semaphores);
997 }
998 
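// AsyncGraphics resolves to the async compute queue when the compute queue shares the
// graphics queue family but is a distinct VkQueue; otherwise it falls back to the
// generic graphics queue.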
999 CommandBuffer::Type Device::get_physical_queue_type(CommandBuffer::Type queue_type) const
1000 {
1001 	if (queue_type != CommandBuffer::Type::AsyncGraphics)
1002 	{
1003 		return queue_type;
1004 	}
1005 	else
1006 	{
1007 		if (graphics_queue_family_index == compute_queue_family_index && graphics_queue != compute_queue)
1008 			return CommandBuffer::Type::AsyncCompute;
1009 		else
1010 			return CommandBuffer::Type::Generic;
1011 	}
1012 }
1013 
1014 void Device::submit_nolock(CommandBufferHandle cmd, Fence *fence, unsigned semaphore_count, Semaphore *semaphores)
1015 {
1016 	auto type = cmd->get_command_buffer_type();
1017 	auto &submissions = get_queue_submissions(type);
1018 #ifdef VULKAN_DEBUG
1019 	auto &pool = get_command_pool(type, cmd->get_thread_index());
1020 	pool.signal_submitted(cmd->get_command_buffer());
1021 #endif
1022 
1023 	bool profiled_submit = cmd->has_profiling();
1024 
1025 	if (profiled_submit)
1026 	{
1027 		LOGI("Submitting profiled command buffer, draining GPU.\n");
1028 		auto &query_pool = get_performance_query_pool(type);
1029 		// Profiled submit, drain GPU before submitting to make sure there's no overlap going on.
1030 		query_pool.end_command_buffer(cmd->get_command_buffer());
1031 		Fence drain_fence;
1032 		submit_empty_nolock(type, &drain_fence, 0, nullptr, -1);
1033 		drain_fence->wait();
1034 		drain_fence->set_internal_sync_object();
1035 	}
1036 
1037 	cmd->end();
1038 	submissions.push_back(move(cmd));
1039 
1040 	InternalFence signalled_fence;
1041 
1042 	if (fence || semaphore_count)
1043 	{
1044 		submit_queue(type, fence ? &signalled_fence : nullptr,
1045 		             semaphore_count, semaphores,
1046 		             profiled_submit ? 0 : -1);
1047 	}
1048 
1049 	if (fence)
1050 	{
1051 		VK_ASSERT(!*fence);
1052 		if (signalled_fence.value)
1053 			*fence = Fence(handle_pool.fences.allocate(this, signalled_fence.value, signalled_fence.timeline));
1054 		else
1055 			*fence = Fence(handle_pool.fences.allocate(this, signalled_fence.fence));
1056 	}
1057 
1058 	if (profiled_submit)
1059 	{
1060 		// Drain queue again and report results.
1061 		LOGI("Submitted profiled command buffer, draining GPU and reporting ...\n");
1062 		auto &query_pool = get_performance_query_pool(type);
1063 		Fence drain_fence;
1064 		submit_empty_nolock(type, &drain_fence, 0, nullptr, fence || semaphore_count ? -1 : 0);
1065 		drain_fence->wait();
1066 		drain_fence->set_internal_sync_object();
1067 		query_pool.report();
1068 	}
1069 
1070 	decrement_frame_counter_nolock();
1071 }
1072 
1073 void Device::submit_empty(CommandBuffer::Type type, Fence *fence,
1074                           unsigned semaphore_count, Semaphore *semaphores)
1075 {
1076 	LOCK();
1077 	submit_empty_nolock(type, fence, semaphore_count, semaphores, -1);
1078 }
1079 
1080 void Device::submit_empty_nolock(CommandBuffer::Type type, Fence *fence,
1081                                  unsigned semaphore_count, Semaphore *semaphores, int profiling_iteration)
1082 {
1083 	if (type != CommandBuffer::Type::AsyncTransfer)
1084 		flush_frame(CommandBuffer::Type::AsyncTransfer);
1085 
1086 	InternalFence signalled_fence;
1087 	submit_queue(type, fence ? &signalled_fence : nullptr, semaphore_count, semaphores, profiling_iteration);
1088 	if (fence)
1089 	{
1090 		if (signalled_fence.value)
1091 			*fence = Fence(handle_pool.fences.allocate(this, signalled_fence.value, signalled_fence.timeline));
1092 		else
1093 			*fence = Fence(handle_pool.fences.allocate(this, signalled_fence.fence));
1094 	}
1095 }
1096 
1097 void Device::submit_empty_inner(CommandBuffer::Type type, InternalFence *fence,
1098                                 unsigned semaphore_count, Semaphore *semaphores)
1099 {
1100 	auto &data = get_queue_data(type);
1101 	VkSubmitInfo submit = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
1102 	VkTimelineSemaphoreSubmitInfoKHR timeline_info = { VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR };
1103 
1104 	if (ext.timeline_semaphore_features.timelineSemaphore)
1105 		submit.pNext = &timeline_info;
1106 
1107 	VkSemaphore timeline_semaphore = data.timeline_semaphore;
1108 	uint64_t timeline_value = ++data.current_timeline;
1109 
1110 	VkQueue queue = get_vk_queue(type);
1111 	switch (type)
1112 	{
1113 	default:
1114 	case CommandBuffer::Type::Generic:
1115 		frame().timeline_fence_graphics = data.current_timeline;
1116 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1117 		if (ext.timeline_semaphore_features.timelineSemaphore)
1118 		{
1119 			LOGI("Signal graphics: (%p) %u\n",
1120 			     reinterpret_cast<void *>(timeline_semaphore),
1121 			     unsigned(data.current_timeline));
1122 		}
1123 #endif
1124 		break;
1125 
1126 	case CommandBuffer::Type::AsyncCompute:
1127 		frame().timeline_fence_compute = data.current_timeline;
1128 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1129 		if (ext.timeline_semaphore_features.timelineSemaphore)
1130 		{
1131 			LOGI("Signal compute: (%p) %u\n",
1132 			     reinterpret_cast<void *>(timeline_semaphore),
1133 			     unsigned(data.current_timeline));
1134 		}
1135 #endif
1136 		break;
1137 
1138 	case CommandBuffer::Type::AsyncTransfer:
1139 		frame().timeline_fence_transfer = data.current_timeline;
1140 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1141 		if (ext.timeline_semaphore_features.timelineSemaphore)
1142 		{
1143 			LOGI("Signal transfer: (%p) %u\n",
1144 			     reinterpret_cast<void *>(timeline_semaphore),
1145 			     unsigned(data.current_timeline));
1146 		}
1147 #endif
1148 		break;
1149 	}
1150 
1151 	// Add external signal semaphores.
1152 	SmallVector<VkSemaphore> signals;
1153 	if (ext.timeline_semaphore_features.timelineSemaphore)
1154 	{
1155 		// Signal once and distribute the timeline value to all.
1156 		timeline_info.signalSemaphoreValueCount = 1;
1157 		timeline_info.pSignalSemaphoreValues = &timeline_value;
1158 		submit.signalSemaphoreCount = 1;
1159 		submit.pSignalSemaphores = &timeline_semaphore;
1160 
1161 		if (fence)
1162 		{
1163 			fence->timeline = timeline_semaphore;
1164 			fence->value = timeline_value;
1165 			fence->fence = VK_NULL_HANDLE;
1166 		}
1167 
1168 		for (unsigned i = 0; i < semaphore_count; i++)
1169 		{
1170 			VK_ASSERT(!semaphores[i]);
1171 			semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, timeline_value, timeline_semaphore));
1172 		}
1173 	}
1174 	else
1175 	{
1176 		if (fence)
1177 		{
1178 			fence->timeline = VK_NULL_HANDLE;
1179 			fence->value = 0;
1180 		}
1181 
1182 		for (unsigned i = 0; i < semaphore_count; i++)
1183 		{
1184 			VkSemaphore cleared_semaphore = managers.semaphore.request_cleared_semaphore();
1185 			signals.push_back(cleared_semaphore);
1186 			VK_ASSERT(!semaphores[i]);
1187 			semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, cleared_semaphore, true));
1188 		}
1189 
1190 		submit.signalSemaphoreCount = signals.size();
1191 		if (!signals.empty())
1192 			submit.pSignalSemaphores = signals.data();
1193 	}
1194 
1195 	// Add external wait semaphores.
1196 	SmallVector<VkSemaphore> waits;
1197 	SmallVector<uint64_t> waits_count;
1198 	auto stages = move(data.wait_stages);
1199 
1200 	for (auto &semaphore : data.wait_semaphores)
1201 	{
1202 		auto wait = semaphore->consume();
1203 		if (!semaphore->get_timeline_value())
1204 		{
1205 			if (semaphore->can_recycle())
1206 				frame().recycled_semaphores.push_back(wait);
1207 			else
1208 				frame().destroyed_semaphores.push_back(wait);
1209 		}
1210 		waits.push_back(wait);
1211 		waits_count.push_back(semaphore->get_timeline_value());
1212 	}
1213 
1214 	data.wait_stages.clear();
1215 	data.wait_semaphores.clear();
1216 
1217 	submit.waitSemaphoreCount = waits.size();
1218 	if (!stages.empty())
1219 		submit.pWaitDstStageMask = stages.data();
1220 	if (!waits.empty())
1221 		submit.pWaitSemaphores = waits.data();
1222 
1223 	if (!waits_count.empty())
1224 	{
1225 		timeline_info.waitSemaphoreValueCount = waits_count.size();
1226 		timeline_info.pWaitSemaphoreValues = waits_count.data();
1227 	}
1228 
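	// A binary VkFence is only needed when timeline semaphores are unavailable;
	// with timelines, the (semaphore, value) pair recorded above serves as the fence.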
1229 	VkFence cleared_fence = fence && !ext.timeline_semaphore_features.timelineSemaphore ?
1230 	                        managers.fence.request_cleared_fence() :
1231 	                        VK_NULL_HANDLE;
1232 	if (fence)
1233 		fence->fence = cleared_fence;
1234 
1235 	QueryPoolHandle start_ts, end_ts;
1236 	if (json_timestamp_origin)
1237 		start_ts = write_calibrated_timestamp_nolock();
1238 
1239 	if (queue_lock_callback)
1240 		queue_lock_callback();
1241 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1242 	if (cleared_fence)
1243 		LOGI("Signalling Fence: %llx\n", reinterpret_cast<unsigned long long>(cleared_fence));
1244 #endif
1245 
1246 	VkResult result = table->vkQueueSubmit(queue, 1, &submit, cleared_fence);
1247 	if (ImplementationQuirks::get().queue_wait_on_submission)
1248 		table->vkQueueWaitIdle(queue);
1249 	if (queue_unlock_callback)
1250 		queue_unlock_callback();
1251 
1252 	if (json_timestamp_origin)
1253 	{
1254 		end_ts = write_calibrated_timestamp_nolock();
1255 		register_time_interval_nolock("CPU", std::move(start_ts), std::move(end_ts), "submit", "");
1256 	}
1257 
1258 	if (result != VK_SUCCESS)
1259 		LOGE("vkQueueSubmit failed (code: %d).\n", int(result));
1260 	if (result == VK_ERROR_DEVICE_LOST)
1261 		report_checkpoints();
1262 
1263 	if (!ext.timeline_semaphore_features.timelineSemaphore)
1264 		data.need_fence = true;
1265 
1266 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1267 	const char *queue_name = nullptr;
1268 	switch (type)
1269 	{
1270 	default:
1271 	case CommandBuffer::Type::Generic:
1272 		queue_name = "Graphics";
1273 		break;
1274 	case CommandBuffer::Type::AsyncCompute:
1275 		queue_name = "Compute";
1276 		break;
1277 	case CommandBuffer::Type::AsyncTransfer:
1278 		queue_name = "Transfer";
1279 		break;
1280 	}
1281 
1282 	LOGI("Empty submission to %s queue:\n", queue_name);
1283 	for (uint32_t i = 0; i < submit.waitSemaphoreCount; i++)
1284 	{
1285 		LOGI("  Waiting for semaphore: %llx in stages %s\n",
1286 		     reinterpret_cast<unsigned long long>(submit.pWaitSemaphores[i]),
1287 		     stage_flags_to_string(submit.pWaitDstStageMask[i]).c_str());
1288 	}
1289 
1290 	for (uint32_t i = 0; i < submit.signalSemaphoreCount; i++)
1291 	{
1292 		LOGI("  Signalling semaphore: %llx\n",
1293 		     reinterpret_cast<unsigned long long>(submit.pSignalSemaphores[i]));
1294 	}
1295 #endif
1296 }
1297 
1298 Fence Device::request_legacy_fence()
1299 {
1300 	VkFence fence = managers.fence.request_cleared_fence();
1301 	return Fence(handle_pool.fences.allocate(this, fence));
1302 }
1303 
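// Publish a finished staging upload to its consumers. On a single-queue device a plain
// pipeline barrier is enough; otherwise the command buffer is submitted with semaphores
// that the graphics and/or compute queues wait on before touching the data.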
1304 void Device::submit_staging(CommandBufferHandle &cmd, VkBufferUsageFlags usage, bool flush)
1305 {
1306 	auto access = buffer_usage_to_possible_access(usage);
1307 	auto stages = buffer_usage_to_possible_stages(usage);
1308 	VkQueue src_queue = get_vk_queue(cmd->get_command_buffer_type());
1309 
1310 	if (src_queue == graphics_queue && src_queue == compute_queue)
1311 	{
1312 		// For single-queue systems, just use a pipeline barrier.
1313 		cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, stages, access);
1314 		submit_nolock(cmd, nullptr, 0, nullptr);
1315 	}
1316 	else
1317 	{
1318 		auto compute_stages = stages &
1319 		                      (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
1320 		                       VK_PIPELINE_STAGE_TRANSFER_BIT |
1321 		                       VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT);
1322 
1323 		auto compute_access = access &
1324 		                      (VK_ACCESS_SHADER_READ_BIT |
1325 		                       VK_ACCESS_SHADER_WRITE_BIT |
1326 		                       VK_ACCESS_TRANSFER_READ_BIT |
1327 		                       VK_ACCESS_UNIFORM_READ_BIT |
1328 		                       VK_ACCESS_TRANSFER_WRITE_BIT |
1329 		                       VK_ACCESS_INDIRECT_COMMAND_READ_BIT);
1330 
1331 		auto graphics_stages = stages;
1332 
1333 		if (src_queue == graphics_queue)
1334 		{
1335 			cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
1336 			             graphics_stages, access);
1337 
1338 			if (compute_stages != 0)
1339 			{
1340 				Semaphore sem;
1341 				submit_nolock(cmd, nullptr, 1, &sem);
1342 				add_wait_semaphore_nolock(CommandBuffer::Type::AsyncCompute, sem, compute_stages, flush);
1343 			}
1344 			else
1345 				submit_nolock(cmd, nullptr, 0, nullptr);
1346 		}
1347 		else if (src_queue == compute_queue)
1348 		{
1349 			cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
1350 			             compute_stages, compute_access);
1351 
1352 			if (graphics_stages != 0)
1353 			{
1354 				Semaphore sem;
1355 				submit_nolock(cmd, nullptr, 1, &sem);
1356 				add_wait_semaphore_nolock(CommandBuffer::Type::Generic, sem, graphics_stages, flush);
1357 			}
1358 			else
1359 				submit_nolock(cmd, nullptr, 0, nullptr);
1360 		}
1361 		else
1362 		{
1363 			if (graphics_stages != 0 && compute_stages != 0)
1364 			{
1365 				Semaphore semaphores[2];
1366 				submit_nolock(cmd, nullptr, 2, semaphores);
1367 				add_wait_semaphore_nolock(CommandBuffer::Type::Generic, semaphores[0], graphics_stages, flush);
1368 				add_wait_semaphore_nolock(CommandBuffer::Type::AsyncCompute, semaphores[1], compute_stages, flush);
1369 			}
1370 			else if (graphics_stages != 0)
1371 			{
1372 				Semaphore sem;
1373 				submit_nolock(cmd, nullptr, 1, &sem);
1374 				add_wait_semaphore_nolock(CommandBuffer::Type::Generic, sem, graphics_stages, flush);
1375 			}
1376 			else if (compute_stages != 0)
1377 			{
1378 				Semaphore sem;
1379 				submit_nolock(cmd, nullptr, 1, &sem);
1380 				add_wait_semaphore_nolock(CommandBuffer::Type::AsyncCompute, sem, compute_stages, flush);
1381 			}
1382 			else
1383 				submit_nolock(cmd, nullptr, 0, nullptr);
1384 		}
1385 	}
1386 }
1387 
1388 void Device::submit_queue(CommandBuffer::Type type, InternalFence *fence,
1389                           unsigned semaphore_count, Semaphore *semaphores, int profiling_iteration)
1390 {
1391 	type = get_physical_queue_type(type);
1392 
1393 	// Always check if we need to flush pending transfers.
1394 	if (type != CommandBuffer::Type::AsyncTransfer)
1395 		flush_frame(CommandBuffer::Type::AsyncTransfer);
1396 
1397 	auto &data = get_queue_data(type);
1398 	auto &submissions = get_queue_submissions(type);
1399 
1400 	if (submissions.empty())
1401 	{
1402 		if (fence || semaphore_count)
1403 			submit_empty_inner(type, fence, semaphore_count, semaphores);
1404 		return;
1405 	}
1406 
1407 	VkSemaphore timeline_semaphore = data.timeline_semaphore;
1408 	uint64_t timeline_value = ++data.current_timeline;
1409 
1410 	VkQueue queue = get_vk_queue(type);
1411 	switch (type)
1412 	{
1413 	default:
1414 	case CommandBuffer::Type::Generic:
1415 		frame().timeline_fence_graphics = data.current_timeline;
1416 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1417 		LOGI("Signal graphics: (%p) %u\n",
1418 			 reinterpret_cast<void *>(timeline_semaphore),
1419 			 unsigned(data.current_timeline));
1420 #endif
1421 		break;
1422 
1423 	case CommandBuffer::Type::AsyncCompute:
1424 		frame().timeline_fence_compute = data.current_timeline;
1425 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1426 		LOGI("Signal compute: (%p) %u\n",
1427 			 reinterpret_cast<void *>(timeline_semaphore),
1428 			 unsigned(data.current_timeline));
1429 #endif
1430 		break;
1431 
1432 	case CommandBuffer::Type::AsyncTransfer:
1433 		frame().timeline_fence_transfer = data.current_timeline;
1434 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1435 		LOGI("Signal transfer: (%p) %u\n",
1436 			 reinterpret_cast<void *>(timeline_semaphore),
1437 			 unsigned(data.current_timeline));
1438 #endif
1439 		break;
1440 	}
1441 
1442 	SmallVector<VkCommandBuffer> cmds;
1443 	cmds.reserve(submissions.size());
1444 
1445 	SmallVector<VkSubmitInfo> submits;
1446 	SmallVector<VkTimelineSemaphoreSubmitInfoKHR> timeline_infos;
1447 
1448 	submits.reserve(2);
1449 	timeline_infos.reserve(2);
1450 
1451 	size_t last_cmd = 0;
1452 
1453 	SmallVector<VkSemaphore> waits[2];
1454 	SmallVector<uint64_t> wait_counts[2];
1455 	SmallVector<VkFlags> wait_stages[2];
1456 	SmallVector<VkSemaphore> signals[2];
1457 	SmallVector<uint64_t> signal_counts[2];
1458 
1459 	// Add external wait semaphores.
1460 	wait_stages[0] = move(data.wait_stages);
1461 
1462 	for (auto &semaphore : data.wait_semaphores)
1463 	{
1464 		auto wait = semaphore->consume();
1465 		if (!semaphore->get_timeline_value())
1466 		{
1467 			if (semaphore->can_recycle())
1468 				frame().recycled_semaphores.push_back(wait);
1469 			else
1470 				frame().destroyed_semaphores.push_back(wait);
1471 		}
1472 		wait_counts[0].push_back(semaphore->get_timeline_value());
1473 		waits[0].push_back(wait);
1474 	}
1475 	data.wait_stages.clear();
1476 	data.wait_semaphores.clear();
1477 
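	// Split the batch at the first command buffer that touches the swapchain, so only the
	// later submission has to wait on the WSI acquire semaphore and signal the release semaphore.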
1478 	for (auto &cmd : submissions)
1479 	{
1480 		if (cmd->swapchain_touched() && !wsi.touched && !wsi.consumed)
1481 		{
1482 			if (!cmds.empty())
1483 			{
1484 				// Push all pending cmd buffers to their own submission.
1485 				submits.emplace_back();
1486 
1487 				timeline_infos.emplace_back();
1488 				auto &timeline_info = timeline_infos.back();
1489 				timeline_info = { VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR };
1490 
1491 				auto &submit = submits.back();
1492 				submit = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
1493 				if (ext.timeline_semaphore_features.timelineSemaphore)
1494 					submit.pNext = &timeline_info;
1495 
1496 				submit.commandBufferCount = cmds.size() - last_cmd;
1497 				submit.pCommandBuffers = cmds.data() + last_cmd;
1498 				last_cmd = cmds.size();
1499 			}
1500 			wsi.touched = true;
1501 		}
1502 
1503 		cmds.push_back(cmd->get_command_buffer());
1504 	}
1505 
1506 	if (cmds.size() > last_cmd)
1507 	{
1508 		unsigned index = submits.size();
1509 
1510 		// Push all pending cmd buffers to their own submission.
1511 		timeline_infos.emplace_back();
1512 		auto &timeline_info = timeline_infos.back();
1513 		timeline_info = { VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR };
1514 
1515 		submits.emplace_back();
1516 		auto &submit = submits.back();
1517 		submit = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
1518 
1519 		if (ext.timeline_semaphore_features.timelineSemaphore)
1520 			submit.pNext = &timeline_info;
1521 
1522 		submit.commandBufferCount = cmds.size() - last_cmd;
1523 		submit.pCommandBuffers = cmds.data() + last_cmd;
1524 		if (wsi.touched && !wsi.consumed)
1525 		{
1526 			static const VkFlags wait = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
1527 			if (wsi.acquire && wsi.acquire->get_semaphore() != VK_NULL_HANDLE)
1528 			{
1529 				VK_ASSERT(wsi.acquire->is_signalled());
1530 				VkSemaphore sem = wsi.acquire->consume();
1531 
1532 				waits[index].push_back(sem);
1533 				wait_counts[index].push_back(wsi.acquire->get_timeline_value());
1534 				wait_stages[index].push_back(wait);
1535 
1536 				if (!wsi.acquire->get_timeline_value())
1537 				{
1538 					if (wsi.acquire->can_recycle())
1539 						frame().recycled_semaphores.push_back(sem);
1540 					else
1541 						frame().destroyed_semaphores.push_back(sem);
1542 				}
1543 
1544 				wsi.acquire.reset();
1545 			}
1546 
1547 			VkSemaphore release = managers.semaphore.request_cleared_semaphore();
1548 			wsi.release = Semaphore(handle_pool.semaphores.allocate(this, release, true));
1549 			wsi.release->set_internal_sync_object();
1550 			signals[index].push_back(wsi.release->get_semaphore());
1551 			signal_counts[index].push_back(0);
1552 			wsi.consumed = true;
1553 		}
1554 		last_cmd = cmds.size();
1555 	}
1556 
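	// With timeline semaphores available, fences handed back to the caller are backed by the
	// timeline value instead; a binary VkFence is only allocated on the legacy path.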
1557 	VkFence cleared_fence = fence && !ext.timeline_semaphore_features.timelineSemaphore ?
1558 	                        managers.fence.request_cleared_fence() :
1559 	                        VK_NULL_HANDLE;
1560 
1561 	if (fence)
1562 		fence->fence = cleared_fence;
1563 
1564 	// Add external signal semaphores.
1565 	if (ext.timeline_semaphore_features.timelineSemaphore)
1566 	{
1567 		// Signal once and distribute the timeline value to all.
1568 		signals[submits.size() - 1].push_back(timeline_semaphore);
1569 		signal_counts[submits.size() - 1].push_back(timeline_value);
1570 
1571 		if (fence)
1572 		{
1573 			fence->timeline = timeline_semaphore;
1574 			fence->value = timeline_value;
1575 			fence->fence = VK_NULL_HANDLE;
1576 		}
1577 
1578 		for (unsigned i = 0; i < semaphore_count; i++)
1579 		{
1580 			VK_ASSERT(!semaphores[i]);
1581 			semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, timeline_value, timeline_semaphore));
1582 		}
1583 	}
1584 	else
1585 	{
1586 		if (fence)
1587 		{
1588 			fence->timeline = VK_NULL_HANDLE;
1589 			fence->value = 0;
1590 		}
1591 
1592 		for (unsigned i = 0; i < semaphore_count; i++)
1593 		{
1594 			VkSemaphore cleared_semaphore = managers.semaphore.request_cleared_semaphore();
1595 			signals[submits.size() - 1].push_back(cleared_semaphore);
1596 			signal_counts[submits.size() - 1].push_back(0);
1597 			VK_ASSERT(!semaphores[i]);
1598 			semaphores[i] = Semaphore(handle_pool.semaphores.allocate(this, cleared_semaphore, true));
1599 		}
1600 	}
1601 
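	// For profiled submissions, chain VkPerformanceQuerySubmitInfoKHR into each submit's
	// pNext chain (appended after the timeline info when that is already chained).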
1602 	VkPerformanceQuerySubmitInfoKHR profiling_infos[2];
1603 
1604 	for (unsigned i = 0; i < submits.size(); i++)
1605 	{
1606 		auto &submit = submits[i];
1607 		auto &timeline_submit = timeline_infos[i];
1608 
1609 		if (profiling_iteration >= 0)
1610 		{
1611 			profiling_infos[i] = { VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR };
1612 			profiling_infos[i].counterPassIndex = uint32_t(profiling_iteration);
1613 			if (submit.pNext)
1614 				timeline_submit.pNext = &profiling_infos[i];
1615 			else
1616 				submit.pNext = &profiling_infos[i];
1617 		}
1618 
1619 		submit.waitSemaphoreCount = waits[i].size();
1620 		submit.pWaitSemaphores = waits[i].data();
1621 		submit.pWaitDstStageMask = wait_stages[i].data();
1622 		timeline_submit.waitSemaphoreValueCount = submit.waitSemaphoreCount;
1623 		timeline_submit.pWaitSemaphoreValues = wait_counts[i].data();
1624 
1625 		submit.signalSemaphoreCount = signals[i].size();
1626 		submit.pSignalSemaphores = signals[i].data();
1627 		timeline_submit.signalSemaphoreValueCount = submit.signalSemaphoreCount;
1628 		timeline_submit.pSignalSemaphoreValues = signal_counts[i].data();
1629 	}
1630 
1631 	QueryPoolHandle start_ts, end_ts;
1632 	if (json_timestamp_origin)
1633 		start_ts = write_calibrated_timestamp_nolock();
1634 
1635 	if (queue_lock_callback)
1636 		queue_lock_callback();
1637 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1638 	if (cleared_fence)
1639 		LOGI("Signalling fence: %llx\n", reinterpret_cast<unsigned long long>(cleared_fence));
1640 #endif
1641 	VkResult result = table->vkQueueSubmit(queue, submits.size(), submits.data(), cleared_fence);
1642 	if (ImplementationQuirks::get().queue_wait_on_submission)
1643 		table->vkQueueWaitIdle(queue);
1644 	if (queue_unlock_callback)
1645 		queue_unlock_callback();
1646 
1647 	if (json_timestamp_origin)
1648 	{
1649 		end_ts = write_calibrated_timestamp_nolock();
1650 		register_time_interval_nolock("CPU", std::move(start_ts), std::move(end_ts), "submit", "");
1651 	}
1652 
1653 	if (result != VK_SUCCESS)
1654 		LOGE("vkQueueSubmit failed (code: %d).\n", int(result));
1655 	if (result == VK_ERROR_DEVICE_LOST)
1656 		report_checkpoints();
1657 	submissions.clear();
1658 
1659 	if (!ext.timeline_semaphore_features.timelineSemaphore)
1660 		data.need_fence = true;
1661 
1662 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
1663 	const char *queue_name = nullptr;
1664 	switch (type)
1665 	{
1666 	default:
1667 	case CommandBuffer::Type::Generic:
1668 		queue_name = "Graphics";
1669 		break;
1670 	case CommandBuffer::Type::AsyncCompute:
1671 		queue_name = "Compute";
1672 		break;
1673 	case CommandBuffer::Type::AsyncTransfer:
1674 		queue_name = "Transfer";
1675 		break;
1676 	}
1677 
1678 	for (auto &submit : submits)
1679 	{
1680 		LOGI("Submission to %s queue:\n", queue_name);
1681 		for (uint32_t i = 0; i < submit.waitSemaphoreCount; i++)
1682 		{
1683 			LOGI("  Waiting for semaphore: %llx in stages %s\n",
1684 			     reinterpret_cast<unsigned long long>(submit.pWaitSemaphores[i]),
1685 			     stage_flags_to_string(submit.pWaitDstStageMask[i]).c_str());
1686 		}
1687 
1688 		for (uint32_t i = 0; i < submit.commandBufferCount; i++)
1689 			LOGI(" Command Buffer %llx\n", reinterpret_cast<unsigned long long>(submit.pCommandBuffers[i]));
1690 
1691 		for (uint32_t i = 0; i < submit.signalSemaphoreCount; i++)
1692 		{
1693 			LOGI("  Signalling semaphore: %llx\n",
1694 			     reinterpret_cast<unsigned long long>(submit.pSignalSemaphores[i]));
1695 		}
1696 	}
1697 #endif
1698 }
1699 
1700 void Device::flush_frame(CommandBuffer::Type type)
1701 {
1702 	if (type == CommandBuffer::Type::AsyncTransfer)
1703 		sync_buffer_blocks();
1704 	submit_queue(type, nullptr, 0, nullptr);
1705 }
1706 
1707 void Device::sync_buffer_blocks()
1708 {
1709 	if (dma.vbo.empty() && dma.ibo.empty() && dma.ubo.empty())
1710 		return;
1711 
1712 	VkBufferUsageFlags usage = 0;
1713 
1714 	auto cmd = request_command_buffer_nolock(get_thread_index(), CommandBuffer::Type::AsyncTransfer, false);
1715 
1716 	cmd->begin_region("buffer-block-sync");
1717 
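	// Copy each CPU-side staging block into its GPU-side buffer on the transfer queue.
	// 'usage' records which buffer types were written; submit_staging() presumably uses it to
	// decide which consuming stages must wait on the transfer (see the semaphore note below).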
1718 	for (auto &block : dma.vbo)
1719 	{
1720 		VK_ASSERT(block.offset != 0);
1721 		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
1722 		usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
1723 	}
1724 
1725 	for (auto &block : dma.ibo)
1726 	{
1727 		VK_ASSERT(block.offset != 0);
1728 		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
1729 		usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
1730 	}
1731 
1732 	for (auto &block : dma.ubo)
1733 	{
1734 		VK_ASSERT(block.offset != 0);
1735 		cmd->copy_buffer(*block.gpu, 0, *block.cpu, 0, block.offset);
1736 		usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
1737 	}
1738 
1739 	dma.vbo.clear();
1740 	dma.ibo.clear();
1741 	dma.ubo.clear();
1742 
1743 	cmd->end_region();
1744 
1745 	// Do not flush graphics or compute in this context.
1746 	// We must be able to inject semaphores into all currently enqueued graphics / compute.
1747 	submit_staging(cmd, usage, false);
1748 }
1749 
1750 void Device::end_frame_context()
1751 {
1752 	DRAIN_FRAME_LOCK();
1753 	end_frame_nolock();
1754 }
1755 
1756 void Device::end_frame_nolock()
1757 {
1758 	// Handles were kept alive until end-of-frame; free them now if appropriate.
1759 	for (auto &image : frame().keep_alive_images)
1760 	{
1761 		image->set_internal_sync_object();
1762 		image->get_view().set_internal_sync_object();
1763 	}
1764 	frame().keep_alive_images.clear();
1765 
1766 	// Make sure we have a fence which covers all submissions in the frame.
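	// On the timeline semaphore path submit_queue() leaves fence.fence as VK_NULL_HANDLE and
	// completion is tracked through the per-queue timeline values instead.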
1767 	InternalFence fence;
1768 
1769 	if (transfer.need_fence || !frame().transfer_submissions.empty())
1770 	{
1771 		submit_queue(CommandBuffer::Type::AsyncTransfer, &fence, 0, nullptr);
1772 		if (fence.fence != VK_NULL_HANDLE)
1773 		{
1774 			frame().wait_fences.push_back(fence.fence);
1775 			frame().recycle_fences.push_back(fence.fence);
1776 		}
1777 		transfer.need_fence = false;
1778 	}
1779 
1780 	if (graphics.need_fence || !frame().graphics_submissions.empty())
1781 	{
1782 		submit_queue(CommandBuffer::Type::Generic, &fence, 0, nullptr);
1783 		if (fence.fence != VK_NULL_HANDLE)
1784 		{
1785 			frame().wait_fences.push_back(fence.fence);
1786 			frame().recycle_fences.push_back(fence.fence);
1787 		}
1788 		graphics.need_fence = false;
1789 	}
1790 
1791 	if (compute.need_fence || !frame().compute_submissions.empty())
1792 	{
1793 		submit_queue(CommandBuffer::Type::AsyncCompute, &fence, 0, nullptr);
1794 		if (fence.fence != VK_NULL_HANDLE)
1795 		{
1796 			frame().wait_fences.push_back(fence.fence);
1797 			frame().recycle_fences.push_back(fence.fence);
1798 		}
1799 		compute.need_fence = false;
1800 	}
1801 }
1802 
1803 void Device::flush_frame()
1804 {
1805 	LOCK();
1806 	flush_frame_nolock();
1807 }
1808 
1809 void Device::flush_frame_nolock()
1810 {
1811 	flush_frame(CommandBuffer::Type::AsyncTransfer);
1812 	flush_frame(CommandBuffer::Type::Generic);
1813 	flush_frame(CommandBuffer::Type::AsyncCompute);
1814 }
1815 
1816 Device::QueueData &Device::get_queue_data(CommandBuffer::Type type)
1817 {
1818 	switch (get_physical_queue_type(type))
1819 	{
1820 	default:
1821 	case CommandBuffer::Type::Generic:
1822 		return graphics;
1823 	case CommandBuffer::Type::AsyncCompute:
1824 		return compute;
1825 	case CommandBuffer::Type::AsyncTransfer:
1826 		return transfer;
1827 	}
1828 }
1829 
1830 VkQueue Device::get_vk_queue(CommandBuffer::Type type) const
1831 {
1832 	switch (get_physical_queue_type(type))
1833 	{
1834 	default:
1835 	case CommandBuffer::Type::Generic:
1836 		return graphics_queue;
1837 	case CommandBuffer::Type::AsyncCompute:
1838 		return compute_queue;
1839 	case CommandBuffer::Type::AsyncTransfer:
1840 		return transfer_queue;
1841 	}
1842 }
1843 
1844 PerformanceQueryPool &Device::get_performance_query_pool(CommandBuffer::Type type)
1845 {
1846 	switch (get_physical_queue_type(type))
1847 	{
1848 	default:
1849 	case CommandBuffer::Type::Generic:
1850 		return graphics.performance_query_pool;
1851 	case CommandBuffer::Type::AsyncCompute:
1852 		if (graphics_queue_family_index == compute_queue_family_index)
1853 			return graphics.performance_query_pool;
1854 		else
1855 			return compute.performance_query_pool;
1856 	case CommandBuffer::Type::AsyncTransfer:
1857 		if (graphics_queue_family_index == transfer_queue_family_index)
1858 			return graphics.performance_query_pool;
1859 		else if (compute_queue_family_index == transfer_queue_family_index)
1860 			return compute.performance_query_pool;
1861 		else
1862 			return transfer.performance_query_pool;
1863 	}
1864 }
1865 
1866 CommandPool &Device::get_command_pool(CommandBuffer::Type type, unsigned thread)
1867 {
1868 	switch (get_physical_queue_type(type))
1869 	{
1870 	default:
1871 	case CommandBuffer::Type::Generic:
1872 		return frame().graphics_cmd_pool[thread];
1873 	case CommandBuffer::Type::AsyncCompute:
1874 		return frame().compute_cmd_pool[thread];
1875 	case CommandBuffer::Type::AsyncTransfer:
1876 		return frame().transfer_cmd_pool[thread];
1877 	}
1878 }
1879 
1880 Util::SmallVector<CommandBufferHandle> &Device::get_queue_submissions(CommandBuffer::Type type)
1881 {
1882 	switch (get_physical_queue_type(type))
1883 	{
1884 	default:
1885 	case CommandBuffer::Type::Generic:
1886 		return frame().graphics_submissions;
1887 	case CommandBuffer::Type::AsyncCompute:
1888 		return frame().compute_submissions;
1889 	case CommandBuffer::Type::AsyncTransfer:
1890 		return frame().transfer_submissions;
1891 	}
1892 }
1893 
1894 CommandBufferHandle Device::request_command_buffer(CommandBuffer::Type type)
1895 {
1896 	return request_command_buffer_for_thread(get_thread_index(), type);
1897 }
1898 
1899 CommandBufferHandle Device::request_command_buffer_for_thread(unsigned thread_index, CommandBuffer::Type type)
1900 {
1901 	LOCK();
1902 	return request_command_buffer_nolock(thread_index, type, false);
1903 }
1904 
1905 CommandBufferHandle Device::request_profiled_command_buffer(CommandBuffer::Type type)
1906 {
1907 	return request_profiled_command_buffer_for_thread(get_thread_index(), type);
1908 }
1909 
1910 CommandBufferHandle Device::request_profiled_command_buffer_for_thread(unsigned thread_index,
1911                                                                        CommandBuffer::Type type)
1912 {
1913 	LOCK();
1914 	return request_command_buffer_nolock(thread_index, type, true);
1915 }
1916 
1917 CommandBufferHandle Device::request_command_buffer_nolock(unsigned thread_index, CommandBuffer::Type type, bool profiled)
1918 {
1919 #ifndef GRANITE_VULKAN_MT
1920 	VK_ASSERT(thread_index == 0);
1921 #endif
1922 	auto cmd = get_command_pool(type, thread_index).request_command_buffer();
1923 
1924 	if (profiled && !ext.performance_query_features.performanceCounterQueryPools)
1925 	{
1926 		LOGW("Profiling is not supported on this device.\n");
1927 		profiled = false;
1928 	}
1929 
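	// Command buffers are recorded as one-time-submit and recycled through the per-frame
	// command pools when the frame context begins again.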
1930 	VkCommandBufferBeginInfo info = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
1931 	info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
1932 	table->vkBeginCommandBuffer(cmd, &info);
1933 	add_frame_counter_nolock();
1934 	CommandBufferHandle handle(handle_pool.command_buffers.allocate(this, cmd, pipeline_cache, type));
1935 	handle->set_thread_index(thread_index);
1936 
1937 	if (profiled)
1938 	{
1939 		auto &query_pool = get_performance_query_pool(type);
1940 		handle->enable_profiling();
1941 		query_pool.begin_command_buffer(handle->get_command_buffer());
1942 	}
1943 
1944 	return handle;
1945 }
1946 
1947 void Device::submit_secondary(CommandBuffer &primary, CommandBuffer &secondary)
1948 {
1949 	{
1950 		LOCK();
1951 		secondary.end();
1952 		decrement_frame_counter_nolock();
1953 
1954 #ifdef VULKAN_DEBUG
1955 		auto &pool = get_command_pool(secondary.get_command_buffer_type(),
1956 		                              secondary.get_thread_index());
1957 		pool.signal_submitted(secondary.get_command_buffer());
1958 #endif
1959 	}
1960 
1961 	VkCommandBuffer secondary_cmd = secondary.get_command_buffer();
1962 	table->vkCmdExecuteCommands(primary.get_command_buffer(), 1, &secondary_cmd);
1963 }
1964 
1965 CommandBufferHandle Device::request_secondary_command_buffer_for_thread(unsigned thread_index,
1966                                                                         const Framebuffer *framebuffer,
1967                                                                         unsigned subpass,
1968                                                                         CommandBuffer::Type type)
1969 {
1970 	LOCK();
1971 
1972 	auto cmd = get_command_pool(type, thread_index).request_secondary_command_buffer();
1973 	VkCommandBufferBeginInfo info = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
1974 	VkCommandBufferInheritanceInfo inherit = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO };
1975 
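	// Secondary command buffers only need a compatible render pass and subpass to record;
	// no specific framebuffer is bound at this point.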
1976 	inherit.framebuffer = VK_NULL_HANDLE;
1977 	inherit.renderPass = framebuffer->get_compatible_render_pass().get_render_pass();
1978 	inherit.subpass = subpass;
1979 	info.pInheritanceInfo = &inherit;
1980 	info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT | VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1981 
1982 	table->vkBeginCommandBuffer(cmd, &info);
1983 	add_frame_counter_nolock();
1984 	CommandBufferHandle handle(handle_pool.command_buffers.allocate(this, cmd, pipeline_cache, type));
1985 	handle->set_thread_index(thread_index);
1986 	handle->set_is_secondary();
1987 	return handle;
1988 }
1989 
1990 void Device::set_acquire_semaphore(unsigned index, Semaphore acquire)
1991 {
1992 	wsi.acquire = move(acquire);
1993 	wsi.index = index;
1994 	wsi.touched = false;
1995 	wsi.consumed = false;
1996 
1997 	if (wsi.acquire)
1998 	{
1999 		wsi.acquire->set_internal_sync_object();
2000 		VK_ASSERT(wsi.acquire->is_signalled());
2001 	}
2002 }
2003 
2004 Semaphore Device::consume_release_semaphore()
2005 {
2006 	auto ret = move(wsi.release);
2007 	wsi.release.reset();
2008 	return ret;
2009 }
2010 
2011 const Sampler &Device::get_stock_sampler(StockSampler sampler) const
2012 {
2013 	return *samplers[static_cast<unsigned>(sampler)];
2014 }
2015 
2016 bool Device::swapchain_touched() const
2017 {
2018 	return wsi.touched;
2019 }
2020 
2021 Device::~Device()
2022 {
2023 	wait_idle();
2024 
2025 	managers.timestamps.log_simple();
2026 
2027 	wsi.acquire.reset();
2028 	wsi.release.reset();
2029 	wsi.swapchain.clear();
2030 
2031 	if (pipeline_cache != VK_NULL_HANDLE)
2032 	{
2033 		flush_pipeline_cache();
2034 		table->vkDestroyPipelineCache(device, pipeline_cache, nullptr);
2035 	}
2036 
2037 #ifdef GRANITE_VULKAN_FILESYSTEM
2038 	flush_shader_manager_cache();
2039 #endif
2040 
2041 #ifdef GRANITE_VULKAN_FOSSILIZE
2042 	flush_pipeline_state();
2043 #endif
2044 
2045 	framebuffer_allocator.clear();
2046 	transient_allocator.clear();
2047 	for (auto &sampler : samplers)
2048 		sampler.reset();
2049 
2050 	for (auto &sampler : samplers_ycbcr)
2051 		if (sampler)
2052 			table->vkDestroySamplerYcbcrConversion(device, sampler, nullptr);
2053 
2054 	deinit_timeline_semaphores();
2055 }
2056 
2057 void Device::deinit_timeline_semaphores()
2058 {
2059 	if (graphics.timeline_semaphore != VK_NULL_HANDLE)
2060 		table->vkDestroySemaphore(device, graphics.timeline_semaphore, nullptr);
2061 	if (compute.timeline_semaphore != VK_NULL_HANDLE)
2062 		table->vkDestroySemaphore(device, compute.timeline_semaphore, nullptr);
2063 	if (transfer.timeline_semaphore != VK_NULL_HANDLE)
2064 		table->vkDestroySemaphore(device, transfer.timeline_semaphore, nullptr);
2065 
2066 	graphics.timeline_semaphore = VK_NULL_HANDLE;
2067 	compute.timeline_semaphore = VK_NULL_HANDLE;
2068 	transfer.timeline_semaphore = VK_NULL_HANDLE;
2069 
2070 	// Make sure we don't accidentally try to wait for these after we destroy the semaphores.
2071 	for (auto &frame : per_frame)
2072 	{
2073 		frame->timeline_fence_graphics = 0;
2074 		frame->timeline_fence_compute = 0;
2075 		frame->timeline_fence_transfer = 0;
2076 		frame->graphics_timeline_semaphore = VK_NULL_HANDLE;
2077 		frame->compute_timeline_semaphore = VK_NULL_HANDLE;
2078 		frame->transfer_timeline_semaphore = VK_NULL_HANDLE;
2079 	}
2080 }
2081 
2082 void Device::init_frame_contexts(unsigned count)
2083 {
2084 	DRAIN_FRAME_LOCK();
2085 	wait_idle_nolock();
2086 
2087 	// Clear out caches which might contain stale data from now on.
2088 	framebuffer_allocator.clear();
2089 	transient_allocator.clear();
2090 	per_frame.clear();
2091 
2092 	for (unsigned i = 0; i < count; i++)
2093 	{
2094 		auto frame = unique_ptr<PerFrame>(new PerFrame(this, i));
2095 		per_frame.emplace_back(move(frame));
2096 	}
2097 }
2098 
2099 void Device::init_external_swapchain(const vector<ImageHandle> &swapchain_images)
2100 {
2101 	DRAIN_FRAME_LOCK();
2102 	wsi.swapchain.clear();
2103 	wait_idle_nolock();
2104 
2105 	wsi.index = 0;
2106 	wsi.touched = false;
2107 	wsi.consumed = false;
2108 	for (auto &image : swapchain_images)
2109 	{
2110 		wsi.swapchain.push_back(image);
2111 		if (image)
2112 		{
2113 			wsi.swapchain.back()->set_internal_sync_object();
2114 			wsi.swapchain.back()->get_view().set_internal_sync_object();
2115 		}
2116 	}
2117 }
2118 
2119 void Device::init_swapchain(const vector<VkImage> &swapchain_images, unsigned width, unsigned height, VkFormat format)
2120 {
2121 	DRAIN_FRAME_LOCK();
2122 	wsi.swapchain.clear();
2123 	wait_idle_nolock();
2124 
2125 	const auto info = ImageCreateInfo::render_target(width, height, format);
2126 
2127 	wsi.index = 0;
2128 	wsi.touched = false;
2129 	wsi.consumed = false;
2130 	for (auto &image : swapchain_images)
2131 	{
2132 		VkImageViewCreateInfo view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
2133 		view_info.image = image;
2134 		view_info.format = format;
2135 		view_info.components.r = VK_COMPONENT_SWIZZLE_R;
2136 		view_info.components.g = VK_COMPONENT_SWIZZLE_G;
2137 		view_info.components.b = VK_COMPONENT_SWIZZLE_B;
2138 		view_info.components.a = VK_COMPONENT_SWIZZLE_A;
2139 		view_info.subresourceRange.aspectMask = format_to_aspect_mask(format);
2140 		view_info.subresourceRange.baseMipLevel = 0;
2141 		view_info.subresourceRange.baseArrayLayer = 0;
2142 		view_info.subresourceRange.levelCount = 1;
2143 		view_info.subresourceRange.layerCount = 1;
2144 		view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;
2145 
2146 		VkImageView image_view;
2147 		if (table->vkCreateImageView(device, &view_info, nullptr, &image_view) != VK_SUCCESS)
2148 			LOGE("Failed to create view for backbuffer.\n");
2149 
2150 		auto backbuffer = ImageHandle(handle_pool.images.allocate(this, image, image_view, DeviceAllocation{}, info, VK_IMAGE_VIEW_TYPE_2D));
2151 		backbuffer->set_internal_sync_object();
2152 		backbuffer->disown_image();
2153 		backbuffer->get_view().set_internal_sync_object();
2154 		wsi.swapchain.push_back(backbuffer);
2155 		set_name(*backbuffer, "backbuffer");
2156 		backbuffer->set_swapchain_layout(VK_IMAGE_LAYOUT_PRESENT_SRC_KHR);
2157 	}
2158 }
2159 
2160 Device::PerFrame::PerFrame(Device *device_, unsigned frame_index_)
2161     : device(*device_)
2162     , frame_index(frame_index_)
2163     , table(device_->get_device_table())
2164     , managers(device_->managers)
2165     , query_pool(device_)
2166 {
2167 	graphics_timeline_semaphore = device.graphics.timeline_semaphore;
2168 	compute_timeline_semaphore = device.compute.timeline_semaphore;
2169 	transfer_timeline_semaphore = device.transfer.timeline_semaphore;
2170 
2171 	unsigned count = device_->num_thread_indices;
2172 	graphics_cmd_pool.reserve(count);
2173 	compute_cmd_pool.reserve(count);
2174 	transfer_cmd_pool.reserve(count);
2175 	for (unsigned i = 0; i < count; i++)
2176 	{
2177 		graphics_cmd_pool.emplace_back(device_, device_->graphics_queue_family_index);
2178 		compute_cmd_pool.emplace_back(device_, device_->compute_queue_family_index);
2179 		transfer_cmd_pool.emplace_back(device_, device_->transfer_queue_family_index);
2180 	}
2181 }
2182 
2183 void Device::keep_handle_alive(ImageHandle handle)
2184 {
2185 	LOCK();
2186 	frame().keep_alive_images.push_back(move(handle));
2187 }
2188 
2189 void Device::free_memory_nolock(const DeviceAllocation &alloc)
2190 {
2191 	frame().allocations.push_back(alloc);
2192 }
2193 
2194 #ifdef VULKAN_DEBUG
2195 
2196 template <typename T, typename U>
2197 static inline bool exists(const T &container, const U &value)
2198 {
2199 	return find(begin(container), end(container), value) != end(container);
2200 }
2201 
2202 #endif
2203 
2204 void Device::destroy_pipeline(VkPipeline pipeline)
2205 {
2206 	LOCK();
2207 	destroy_pipeline_nolock(pipeline);
2208 }
2209 
2210 void Device::reset_fence(VkFence fence, bool observed_wait)
2211 {
2212 	LOCK();
2213 	reset_fence_nolock(fence, observed_wait);
2214 }
2215 
2216 void Device::destroy_buffer(VkBuffer buffer)
2217 {
2218 	LOCK();
2219 	destroy_buffer_nolock(buffer);
2220 }
2221 
2222 void Device::destroy_descriptor_pool(VkDescriptorPool desc_pool)
2223 {
2224 	LOCK();
2225 	destroy_descriptor_pool_nolock(desc_pool);
2226 }
2227 
2228 void Device::destroy_buffer_view(VkBufferView view)
2229 {
2230 	LOCK();
2231 	destroy_buffer_view_nolock(view);
2232 }
2233 
2234 void Device::destroy_event(VkEvent event)
2235 {
2236 	LOCK();
2237 	destroy_event_nolock(event);
2238 }
2239 
2240 void Device::destroy_framebuffer(VkFramebuffer framebuffer)
2241 {
2242 	LOCK();
2243 	destroy_framebuffer_nolock(framebuffer);
2244 }
2245 
2246 void Device::destroy_image(VkImage image)
2247 {
2248 	LOCK();
2249 	destroy_image_nolock(image);
2250 }
2251 
2252 void Device::destroy_semaphore(VkSemaphore semaphore)
2253 {
2254 	LOCK();
2255 	destroy_semaphore_nolock(semaphore);
2256 }
2257 
2258 void Device::recycle_semaphore(VkSemaphore semaphore)
2259 {
2260 	LOCK();
2261 	recycle_semaphore_nolock(semaphore);
2262 }
2263 
2264 void Device::free_memory(const DeviceAllocation &alloc)
2265 {
2266 	LOCK();
2267 	free_memory_nolock(alloc);
2268 }
2269 
2270 void Device::destroy_sampler(VkSampler sampler)
2271 {
2272 	LOCK();
2273 	destroy_sampler_nolock(sampler);
2274 }
2275 
2276 void Device::destroy_image_view(VkImageView view)
2277 {
2278 	LOCK();
2279 	destroy_image_view_nolock(view);
2280 }
2281 
2282 void Device::destroy_pipeline_nolock(VkPipeline pipeline)
2283 {
2284 	VK_ASSERT(!exists(frame().destroyed_pipelines, pipeline));
2285 	frame().destroyed_pipelines.push_back(pipeline);
2286 }
2287 
2288 void Device::destroy_image_view_nolock(VkImageView view)
2289 {
2290 	VK_ASSERT(!exists(frame().destroyed_image_views, view));
2291 	frame().destroyed_image_views.push_back(view);
2292 }
2293 
2294 void Device::destroy_buffer_view_nolock(VkBufferView view)
2295 {
2296 	VK_ASSERT(!exists(frame().destroyed_buffer_views, view));
2297 	frame().destroyed_buffer_views.push_back(view);
2298 }
2299 
2300 void Device::destroy_semaphore_nolock(VkSemaphore semaphore)
2301 {
2302 	VK_ASSERT(!exists(frame().destroyed_semaphores, semaphore));
2303 	frame().destroyed_semaphores.push_back(semaphore);
2304 }
2305 
2306 void Device::recycle_semaphore_nolock(VkSemaphore semaphore)
2307 {
2308 	VK_ASSERT(!exists(frame().recycled_semaphores, semaphore));
2309 	frame().recycled_semaphores.push_back(semaphore);
2310 }
2311 
2312 void Device::destroy_event_nolock(VkEvent event)
2313 {
2314 	VK_ASSERT(!exists(frame().recycled_events, event));
2315 	frame().recycled_events.push_back(event);
2316 }
2317 
2318 void Device::reset_fence_nolock(VkFence fence, bool observed_wait)
2319 {
2320 	if (observed_wait)
2321 	{
2322 		table->vkResetFences(device, 1, &fence);
2323 		managers.fence.recycle_fence(fence);
2324 	}
2325 	else
2326 		frame().recycle_fences.push_back(fence);
2327 }
2328 
2329 PipelineEvent Device::request_pipeline_event()
2330 {
2331 	return PipelineEvent(handle_pool.events.allocate(this, managers.event.request_cleared_event()));
2332 }
2333 
2334 void Device::destroy_image_nolock(VkImage image)
2335 {
2336 	VK_ASSERT(!exists(frame().destroyed_images, image));
2337 	frame().destroyed_images.push_back(image);
2338 }
2339 
2340 void Device::destroy_buffer_nolock(VkBuffer buffer)
2341 {
2342 	VK_ASSERT(!exists(frame().destroyed_buffers, buffer));
2343 	frame().destroyed_buffers.push_back(buffer);
2344 }
2345 
2346 void Device::destroy_descriptor_pool_nolock(VkDescriptorPool desc_pool)
2347 {
2348 	VK_ASSERT(!exists(frame().destroyed_descriptor_pools, desc_pool));
2349 	frame().destroyed_descriptor_pools.push_back(desc_pool);
2350 }
2351 
2352 void Device::destroy_sampler_nolock(VkSampler sampler)
2353 {
2354 	VK_ASSERT(!exists(frame().destroyed_samplers, sampler));
2355 	frame().destroyed_samplers.push_back(sampler);
2356 }
2357 
2358 void Device::destroy_framebuffer_nolock(VkFramebuffer framebuffer)
2359 {
2360 	VK_ASSERT(!exists(frame().destroyed_framebuffers, framebuffer));
2361 	frame().destroyed_framebuffers.push_back(framebuffer);
2362 }
2363 
2364 void Device::clear_wait_semaphores()
2365 {
2366 	for (auto &sem : graphics.wait_semaphores)
2367 		table->vkDestroySemaphore(device, sem->consume(), nullptr);
2368 	for (auto &sem : compute.wait_semaphores)
2369 		table->vkDestroySemaphore(device, sem->consume(), nullptr);
2370 	for (auto &sem : transfer.wait_semaphores)
2371 		table->vkDestroySemaphore(device, sem->consume(), nullptr);
2372 
2373 	graphics.wait_semaphores.clear();
2374 	graphics.wait_stages.clear();
2375 	compute.wait_semaphores.clear();
2376 	compute.wait_stages.clear();
2377 	transfer.wait_semaphores.clear();
2378 	transfer.wait_stages.clear();
2379 }
2380 
2381 void Device::wait_idle()
2382 {
2383 	DRAIN_FRAME_LOCK();
2384 	wait_idle_nolock();
2385 }
2386 
2387 void Device::wait_idle_nolock()
2388 {
2389 	if (!per_frame.empty())
2390 		end_frame_nolock();
2391 
2392 	if (device != VK_NULL_HANDLE)
2393 	{
2394 		if (queue_lock_callback)
2395 			queue_lock_callback();
2396 		auto result = table->vkDeviceWaitIdle(device);
2397 		if (result != VK_SUCCESS)
2398 			LOGE("vkDeviceWaitIdle failed with code: %d\n", result);
2399 		if (result == VK_ERROR_DEVICE_LOST)
2400 			report_checkpoints();
2401 		if (queue_unlock_callback)
2402 			queue_unlock_callback();
2403 	}
2404 
2405 	clear_wait_semaphores();
2406 
2407 	// Free memory for buffer pools.
2408 	managers.vbo.reset();
2409 	managers.ubo.reset();
2410 	managers.ibo.reset();
2411 	managers.staging.reset();
2412 	for (auto &frame : per_frame)
2413 	{
2414 		frame->vbo_blocks.clear();
2415 		frame->ibo_blocks.clear();
2416 		frame->ubo_blocks.clear();
2417 		frame->staging_blocks.clear();
2418 	}
2419 
2420 	framebuffer_allocator.clear();
2421 	transient_allocator.clear();
2422 	for (auto &allocator : descriptor_set_allocators)
2423 		allocator.clear();
2424 
2425 	for (auto &frame : per_frame)
2426 	{
2427 		// We have already done a full WaitIdle, so there is no need to wait for these fences; it is not safe to do so either.
2428 		frame->wait_fences.clear();
2429 		frame->begin();
2430 	}
2431 }
2432 
2433 void Device::next_frame_context()
2434 {
2435 	DRAIN_FRAME_LOCK();
2436 
2437 	if (frame_context_begin_ts)
2438 	{
2439 		auto frame_context_end_ts = write_calibrated_timestamp_nolock();
2440 		register_time_interval_nolock("CPU", std::move(frame_context_begin_ts), std::move(frame_context_end_ts), "command submissions", "");
2441 		frame_context_begin_ts = {};
2442 	}
2443 
2444 	// Flush the frame here as we might have pending staging command buffers from init stage.
2445 	end_frame_nolock();
2446 
2447 	framebuffer_allocator.begin_frame();
2448 	transient_allocator.begin_frame();
2449 	for (auto &allocator : descriptor_set_allocators)
2450 		allocator.begin_frame();
2451 
2452 	VK_ASSERT(!per_frame.empty());
2453 	frame_context_index++;
2454 	if (frame_context_index >= per_frame.size())
2455 		frame_context_index = 0;
2456 
2457 	frame().begin();
2458 	recalibrate_timestamps();
2459 	frame_context_begin_ts = write_calibrated_timestamp_nolock();
2460 }
2461 
2462 QueryPoolHandle Device::write_timestamp(VkCommandBuffer cmd, VkPipelineStageFlagBits stage)
2463 {
2464 	LOCK();
2465 	return write_timestamp_nolock(cmd, stage);
2466 }
2467 
2468 QueryPoolHandle Device::write_timestamp_nolock(VkCommandBuffer cmd, VkPipelineStageFlagBits stage)
2469 {
2470 	return frame().query_pool.write_timestamp(cmd, stage);
2471 }
2472 
2473 QueryPoolHandle Device::write_calibrated_timestamp()
2474 {
2475 	LOCK();
2476 	return write_calibrated_timestamp_nolock();
2477 }
2478 
2479 QueryPoolHandle Device::write_calibrated_timestamp_nolock()
2480 {
2481 	if (!json_trace_file)
2482 		return {};
2483 
2484 	auto handle = QueryPoolHandle(handle_pool.query.allocate(this));
2485 	handle->signal_timestamp_ticks(get_calibrated_timestamp());
2486 	return handle;
2487 }
2488 
2489 void Device::recalibrate_timestamps_fallback()
2490 {
2491 	wait_idle_nolock();
2492 	auto cmd = request_command_buffer_nolock(0, CommandBuffer::Type::Generic, false);
2493 	auto ts = write_timestamp_nolock(cmd->get_command_buffer(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
2494 	if (!ts)
2495 		return;
2496 	auto start_ts = Util::get_current_time_nsecs();
2497 	submit_nolock(cmd, nullptr, 0, nullptr);
2498 	wait_idle_nolock();
2499 	auto end_ts = Util::get_current_time_nsecs();
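	// Use the midpoint of the CPU-side submit/wait interval as the host time that matches the
	// GPU timestamp; the width of that interval is the uncertainty reported below.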
2500 	auto host_ts = (start_ts + end_ts) / 2;
2501 
2502 	LOGI("Calibrated timestamps with a fallback method. Uncertainty: %.3f us.\n", 1e-3 * (end_ts - start_ts));
2503 
2504 	calibrated_timestamp_host = host_ts;
2505 	VK_ASSERT(ts->is_signalled());
2506 	calibrated_timestamp_device = ts->get_timestamp_ticks();
2507 }
2508 
2509 void Device::init_calibrated_timestamps()
2510 {
2511 	if (!get_device_features().supports_calibrated_timestamps)
2512 	{
2513 		recalibrate_timestamps_fallback();
2514 		return;
2515 	}
2516 
2517 	uint32_t count;
2518 	vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(gpu, &count, nullptr);
2519 	std::vector<VkTimeDomainEXT> domains(count);
2520 	if (vkGetPhysicalDeviceCalibrateableTimeDomainsEXT(gpu, &count, domains.data()) != VK_SUCCESS)
2521 		return;
2522 
2523 	bool supports_device_domain = false;
2524 	for (auto &domain : domains)
2525 	{
2526 		if (domain == VK_TIME_DOMAIN_DEVICE_EXT)
2527 		{
2528 			supports_device_domain = true;
2529 			break;
2530 		}
2531 	}
2532 
2533 	if (!supports_device_domain)
2534 		return;
2535 
2536 	for (auto &domain : domains)
2537 	{
2538 #ifdef _WIN32
2539 		const auto supported_domain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
2540 #else
2541 		const auto supported_domain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT;
2542 #endif
2543 		if (domain == supported_domain)
2544 		{
2545 			calibrated_time_domain = domain;
2546 			break;
2547 		}
2548 	}
2549 
2550 	if (calibrated_time_domain == VK_TIME_DOMAIN_DEVICE_EXT)
2551 	{
2552 		LOGE("Could not find a suitable time domain for calibrated timestamps.\n");
2553 		return;
2554 	}
2555 
2556 	if (!resample_calibrated_timestamps())
2557 	{
2558 		LOGE("Failed to get calibrated timestamps.\n");
2559 		calibrated_time_domain = VK_TIME_DOMAIN_DEVICE_EXT;
2560 		return;
2561 	}
2562 }
2563 
2564 bool Device::resample_calibrated_timestamps()
2565 {
2566 	VkCalibratedTimestampInfoEXT infos[2] = {};
2567 	infos[0].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
2568 	infos[1].sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT;
2569 	infos[0].timeDomain = calibrated_time_domain;
2570 	infos[1].timeDomain = VK_TIME_DOMAIN_DEVICE_EXT;
2571 	uint64_t timestamps[2] = {};
2572 	uint64_t max_deviation[2] = {};
2573 
2574 	if (table->vkGetCalibratedTimestampsEXT(device, 2, infos, timestamps, max_deviation) != VK_SUCCESS)
2575 	{
2576 		LOGE("Failed to get calibrated timestamps.\n");
2577 		calibrated_time_domain = VK_TIME_DOMAIN_DEVICE_EXT;
2578 		return false;
2579 	}
2580 
2581 	calibrated_timestamp_host = timestamps[0];
2582 	calibrated_timestamp_device = timestamps[1];
2583 
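	// On Windows the host domain is QPC ticks; convert them to nanoseconds so both platforms
	// report host timestamps in the same unit.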
2584 #ifdef _WIN32
2585 	LARGE_INTEGER freq;
2586 	QueryPerformanceFrequency(&freq);
2587 	calibrated_timestamp_host = int64_t(1e9 * calibrated_timestamp_host / double(freq.QuadPart));
2588 #endif
2589 	return true;
2590 }
2591 
2592 void Device::recalibrate_timestamps()
2593 {
2594 	// Don't bother recalibrating timestamps if we're not tracing.
2595 	if (!json_trace_file)
2596 		return;
2597 
2598 	// Recalibrate every once in a while ...
2599 	timestamp_calibration_counter++;
2600 	if (timestamp_calibration_counter < 1000)
2601 		return;
2602 	timestamp_calibration_counter = 0;
2603 
2604 	if (calibrated_time_domain == VK_TIME_DOMAIN_DEVICE_EXT)
2605 		recalibrate_timestamps_fallback();
2606 	else
2607 		resample_calibrated_timestamps();
2608 }
2609 
2610 int64_t Device::get_calibrated_timestamp()
2611 {
2612 	int64_t nsecs = Util::get_current_time_nsecs();
2613 
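	// Extrapolate a device timestamp: host nanoseconds since the last calibration are converted
	// to GPU ticks via timestampPeriod (nanoseconds per tick), and the result is clamped so the
	// reported value never goes backwards.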
2614 	auto offset_from_calibration = double(nsecs - calibrated_timestamp_host);
2615 	auto ticks_in_device_timebase = int64_t(offset_from_calibration / double(gpu_props.limits.timestampPeriod));
2616 	int64_t reported = calibrated_timestamp_device + ticks_in_device_timebase;
2617 	reported = std::max(reported, last_calibrated_timestamp_host);
2618 	last_calibrated_timestamp_host = reported;
2619 	return reported;
2620 }
2621 
2622 void Device::register_time_interval(std::string tid, QueryPoolHandle start_ts, QueryPoolHandle end_ts, std::string tag, std::string extra)
2623 {
2624 	LOCK();
2625 	register_time_interval_nolock(std::move(tid), std::move(start_ts), std::move(end_ts), std::move(tag), std::move(extra));
2626 }
2627 
2628 void Device::register_time_interval_nolock(std::string tid, QueryPoolHandle start_ts, QueryPoolHandle end_ts,
2629                                            std::string tag, std::string extra)
2630 {
2631 	if (start_ts && end_ts)
2632 	{
2633 		TimestampInterval *timestamp_tag = managers.timestamps.get_timestamp_tag(tag.c_str());
2634 #ifdef VULKAN_DEBUG
2635 		if (start_ts->is_signalled() && end_ts->is_signalled())
2636 			VK_ASSERT(end_ts->get_timestamp_ticks() >= start_ts->get_timestamp_ticks());
2637 #endif
2638 		frame().timestamp_intervals.push_back({ std::move(tid), move(start_ts), move(end_ts), timestamp_tag, std::move(extra) });
2639 	}
2640 }
2641 
2642 void Device::add_frame_counter_nolock()
2643 {
2644 	lock.counter++;
2645 }
2646 
2647 void Device::decrement_frame_counter_nolock()
2648 {
2649 	VK_ASSERT(lock.counter > 0);
2650 	lock.counter--;
2651 #ifdef GRANITE_VULKAN_MT
2652 	lock.cond.notify_one();
2653 #endif
2654 }
2655 
2656 void Device::PerFrame::begin()
2657 {
2658 	VkDevice vkdevice = device.get_device();
2659 
2660 	Vulkan::QueryPoolHandle wait_fence_ts;
2661 	if (!in_destructor && device.json_timestamp_origin)
2662 		wait_fence_ts = device.write_calibrated_timestamp_nolock();
2663 
2664 	if (device.get_device_features().timeline_semaphore_features.timelineSemaphore &&
2665 	    graphics_timeline_semaphore && compute_timeline_semaphore && transfer_timeline_semaphore)
2666 	{
2667 		VkSemaphoreWaitInfoKHR info = { VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR };
2668 		const VkSemaphore semaphores[3] = { graphics_timeline_semaphore, compute_timeline_semaphore, transfer_timeline_semaphore };
2669 		const uint64_t values[3] = { timeline_fence_graphics, timeline_fence_compute, timeline_fence_transfer };
2670 
2671 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2672 		if (device.get_device_features().timeline_semaphore_features.timelineSemaphore)
2673 		{
2674 			LOGI("Waiting for graphics (%p) %u\n",
2675 			     reinterpret_cast<void *>(graphics_timeline_semaphore),
2676 			     unsigned(timeline_fence_graphics));
2677 			LOGI("Waiting for compute (%p) %u\n",
2678 			     reinterpret_cast<void *>(compute_timeline_semaphore),
2679 			     unsigned(timeline_fence_compute));
2680 			LOGI("Waiting for transfer (%p) %u\n",
2681 			     reinterpret_cast<void *>(transfer_timeline_semaphore),
2682 			     unsigned(timeline_fence_transfer));
2683 		}
2684 #endif
2685 
2686 		info.pSemaphores = semaphores;
2687 		info.pValues = values;
2688 		info.semaphoreCount = 3;
2689 		table.vkWaitSemaphoresKHR(vkdevice, &info, UINT64_MAX);
2690 	}
2691 
2692 	// If we're using timeline semaphores, these paths should never be hit.
2693 	if (!wait_fences.empty())
2694 	{
2695 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2696 		for (auto &fence : wait_fences)
2697 			LOGI("Waiting for Fence: %llx\n", reinterpret_cast<unsigned long long>(fence));
2698 #endif
2699 		table.vkWaitForFences(vkdevice, wait_fences.size(), wait_fences.data(), VK_TRUE, UINT64_MAX);
2700 		wait_fences.clear();
2701 	}
2702 
2703 	if (!in_destructor && device.json_timestamp_origin)
2704 		device.register_time_interval_nolock("CPU", std::move(wait_fence_ts), device.write_calibrated_timestamp_nolock(), "fence", "");
2705 
2706 	// If we're using timeline semaphores, these paths should never be hit.
2707 	if (!recycle_fences.empty())
2708 	{
2709 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2710 		for (auto &fence : recycle_fences)
2711 			LOGI("Recycling Fence: %llx\n", reinterpret_cast<unsigned long long>(fence));
2712 #endif
2713 		table.vkResetFences(vkdevice, recycle_fences.size(), recycle_fences.data());
2714 		for (auto &fence : recycle_fences)
2715 			managers.fence.recycle_fence(fence);
2716 		recycle_fences.clear();
2717 	}
2718 
2719 	for (auto &pool : graphics_cmd_pool)
2720 		pool.begin();
2721 	for (auto &pool : compute_cmd_pool)
2722 		pool.begin();
2723 	for (auto &pool : transfer_cmd_pool)
2724 		pool.begin();
2725 	query_pool.begin();
2726 
2727 	for (auto &channel : debug_channels)
2728 		device.parse_debug_channel(channel);
2729 
2730 	// Free the debug channel buffers here; they are immediately picked up by the destroyed_buffers handling right below.
2731 	debug_channels.clear();
2732 
2733 	for (auto &framebuffer : destroyed_framebuffers)
2734 		table.vkDestroyFramebuffer(vkdevice, framebuffer, nullptr);
2735 	for (auto &sampler : destroyed_samplers)
2736 		table.vkDestroySampler(vkdevice, sampler, nullptr);
2737 	for (auto &pipeline : destroyed_pipelines)
2738 		table.vkDestroyPipeline(vkdevice, pipeline, nullptr);
2739 	for (auto &view : destroyed_image_views)
2740 		table.vkDestroyImageView(vkdevice, view, nullptr);
2741 	for (auto &view : destroyed_buffer_views)
2742 		table.vkDestroyBufferView(vkdevice, view, nullptr);
2743 	for (auto &image : destroyed_images)
2744 		table.vkDestroyImage(vkdevice, image, nullptr);
2745 	for (auto &buffer : destroyed_buffers)
2746 		table.vkDestroyBuffer(vkdevice, buffer, nullptr);
2747 	for (auto &semaphore : destroyed_semaphores)
2748 		table.vkDestroySemaphore(vkdevice, semaphore, nullptr);
2749 	for (auto &pool : destroyed_descriptor_pools)
2750 		table.vkDestroyDescriptorPool(vkdevice, pool, nullptr);
2751 	for (auto &semaphore : recycled_semaphores)
2752 	{
2753 #if defined(VULKAN_DEBUG) && defined(SUBMIT_DEBUG)
2754 		LOGI("Recycling semaphore: %llx\n", reinterpret_cast<unsigned long long>(semaphore));
2755 #endif
2756 		managers.semaphore.recycle(semaphore);
2757 	}
2758 	for (auto &event : recycled_events)
2759 		managers.event.recycle(event);
2760 	for (auto &alloc : allocations)
2761 		alloc.free_immediate(managers.memory);
2762 
2763 	for (auto &block : vbo_blocks)
2764 		managers.vbo.recycle_block(move(block));
2765 	for (auto &block : ibo_blocks)
2766 		managers.ibo.recycle_block(move(block));
2767 	for (auto &block : ubo_blocks)
2768 		managers.ubo.recycle_block(move(block));
2769 	for (auto &block : staging_blocks)
2770 		managers.staging.recycle_block(move(block));
2771 	vbo_blocks.clear();
2772 	ibo_blocks.clear();
2773 	ubo_blocks.clear();
2774 	staging_blocks.clear();
2775 
2776 	destroyed_framebuffers.clear();
2777 	destroyed_samplers.clear();
2778 	destroyed_pipelines.clear();
2779 	destroyed_image_views.clear();
2780 	destroyed_buffer_views.clear();
2781 	destroyed_images.clear();
2782 	destroyed_buffers.clear();
2783 	destroyed_semaphores.clear();
2784 	destroyed_descriptor_pools.clear();
2785 	recycled_semaphores.clear();
2786 	recycled_events.clear();
2787 	allocations.clear();
2788 
2789 	int64_t min_timestamp_us = std::numeric_limits<int64_t>::max();
2790 	int64_t max_timestamp_us = 0;
2791 
2792 	for (auto &ts : timestamp_intervals)
2793 	{
2794 		if (ts.end_ts->is_signalled() && ts.start_ts->is_signalled())
2795 		{
2796 			ts.timestamp_tag->accumulate_time(
2797 			    device.convert_timestamp_delta(ts.start_ts->get_timestamp_ticks(), ts.end_ts->get_timestamp_ticks()));
2798 			device.write_json_timestamp_range(frame_index, ts.tid.c_str(), ts.timestamp_tag->get_tag().c_str(),
2799 			                                  ts.extra.c_str(),
2800 			                                  ts.start_ts->get_timestamp_ticks(), ts.end_ts->get_timestamp_ticks(),
2801 			                                  min_timestamp_us, max_timestamp_us);
2802 		}
2803 	}
2804 	device.write_json_timestamp_range_us(frame_index, "CPU + GPU", "full frame lifetime", min_timestamp_us, max_timestamp_us);
2805 	managers.timestamps.mark_end_of_frame_context();
2806 	timestamp_intervals.clear();
2807 }
2808 
2809 Device::PerFrame::~PerFrame()
2810 {
2811 	in_destructor = true;
2812 	begin();
2813 }
2814 
2815 uint32_t Device::find_memory_type(BufferDomain domain, uint32_t mask)
2816 {
2817 	uint32_t prio[3] = {};
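	// prio[] lists candidate property-flag combinations from most to least preferred;
	// the first memory type allowed by 'mask' that satisfies one of them is returned.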
2818 	switch (domain)
2819 	{
2820 	case BufferDomain::Device:
2821 		prio[0] = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2822 		break;
2823 
2824 	case BufferDomain::LinkedDeviceHost:
2825 		prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2826 		prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2827 		prio[2] = prio[1];
2828 		break;
2829 
2830 	case BufferDomain::LinkedDeviceHostPreferDevice:
2831 		prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2832 		prio[1] = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2833 		prio[2] = prio[1];
2834 		break;
2835 
2836 	case BufferDomain::Host:
2837 		prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2838 		prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2839 		prio[2] = prio[1];
2840 		break;
2841 
2842 	case BufferDomain::CachedHost:
2843 		prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
2844 		prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2845 		prio[2] = prio[1];
2846 		break;
2847 
2848 	case BufferDomain::CachedCoherentHostPreferCached:
2849 		prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2850 		prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
2851 		prio[2] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2852 		break;
2853 
2854 	case BufferDomain::CachedCoherentHostPreferCoherent:
2855 		prio[0] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2856 		prio[1] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
2857 		prio[2] = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2858 		break;
2859 	}
2860 
2861 	for (auto &p : prio)
2862 	{
2863 		for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++)
2864 		{
2865 			if ((1u << i) & mask)
2866 			{
2867 				uint32_t flags = mem_props.memoryTypes[i].propertyFlags;
2868 				if ((flags & p) == p)
2869 					return i;
2870 			}
2871 		}
2872 	}
2873 
2874 	return UINT32_MAX;
2875 }
2876 
2877 uint32_t Device::find_memory_type(ImageDomain domain, uint32_t mask)
2878 {
2879 	uint32_t desired = 0, fallback = 0;
2880 	switch (domain)
2881 	{
2882 	case ImageDomain::Physical:
2883 		desired = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2884 		fallback = 0;
2885 		break;
2886 
2887 	case ImageDomain::Transient:
2888 		desired = VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT;
2889 		fallback = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
2890 		break;
2891 
2892 	case ImageDomain::LinearHostCached:
2893 		desired = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
2894 		fallback = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2895 		break;
2896 
2897 	case ImageDomain::LinearHost:
2898 		desired = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
2899 		fallback = 0;
2900 		break;
2901 	}
2902 
2903 	for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++)
2904 	{
2905 		if ((1u << i) & mask)
2906 		{
2907 			uint32_t flags = mem_props.memoryTypes[i].propertyFlags;
2908 			if ((flags & desired) == desired)
2909 				return i;
2910 		}
2911 	}
2912 
2913 	for (uint32_t i = 0; i < mem_props.memoryTypeCount; i++)
2914 	{
2915 		if ((1u << i) & mask)
2916 		{
2917 			uint32_t flags = mem_props.memoryTypes[i].propertyFlags;
2918 			if ((flags & fallback) == fallback)
2919 				return i;
2920 		}
2921 	}
2922 
2923 	return UINT32_MAX;
2924 }
2925 
2926 static inline VkImageViewType get_image_view_type(const ImageCreateInfo &create_info, const ImageViewCreateInfo *view)
2927 {
2928 	unsigned layers = view ? view->layers : create_info.layers;
2929 	unsigned base_layer = view ? view->base_layer : 0;
2930 
2931 	if (layers == VK_REMAINING_ARRAY_LAYERS)
2932 		layers = create_info.layers - base_layer;
2933 
2934 	bool force_array =
2935 	    view ? (view->misc & IMAGE_VIEW_MISC_FORCE_ARRAY_BIT) : (create_info.misc & IMAGE_MISC_FORCE_ARRAY_BIT);
2936 
2937 	switch (create_info.type)
2938 	{
2939 	case VK_IMAGE_TYPE_1D:
2940 		VK_ASSERT(create_info.width >= 1);
2941 		VK_ASSERT(create_info.height == 1);
2942 		VK_ASSERT(create_info.depth == 1);
2943 		VK_ASSERT(create_info.samples == VK_SAMPLE_COUNT_1_BIT);
2944 
2945 		if (layers > 1 || force_array)
2946 			return VK_IMAGE_VIEW_TYPE_1D_ARRAY;
2947 		else
2948 			return VK_IMAGE_VIEW_TYPE_1D;
2949 
2950 	case VK_IMAGE_TYPE_2D:
2951 		VK_ASSERT(create_info.width >= 1);
2952 		VK_ASSERT(create_info.height >= 1);
2953 		VK_ASSERT(create_info.depth == 1);
2954 
2955 		if ((create_info.flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) && (layers % 6) == 0)
2956 		{
2957 			VK_ASSERT(create_info.width == create_info.height);
2958 
2959 			if (layers > 6 || force_array)
2960 				return VK_IMAGE_VIEW_TYPE_CUBE_ARRAY;
2961 			else
2962 				return VK_IMAGE_VIEW_TYPE_CUBE;
2963 		}
2964 		else
2965 		{
2966 			if (layers > 1 || force_array)
2967 				return VK_IMAGE_VIEW_TYPE_2D_ARRAY;
2968 			else
2969 				return VK_IMAGE_VIEW_TYPE_2D;
2970 		}
2971 
2972 	case VK_IMAGE_TYPE_3D:
2973 		VK_ASSERT(create_info.width >= 1);
2974 		VK_ASSERT(create_info.height >= 1);
2975 		VK_ASSERT(create_info.depth >= 1);
2976 		return VK_IMAGE_VIEW_TYPE_3D;
2977 
2978 	default:
2979 		VK_ASSERT(0 && "bogus");
2980 		return VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
2981 	}
2982 }
2983 
2984 BufferViewHandle Device::create_buffer_view(const BufferViewCreateInfo &view_info)
2985 {
2986 	VkBufferViewCreateInfo info = { VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO };
2987 	info.buffer = view_info.buffer->get_buffer();
2988 	info.format = view_info.format;
2989 	info.offset = view_info.offset;
2990 	info.range = view_info.range;
2991 
2992 	VkBufferView view;
2993 	auto res = table->vkCreateBufferView(device, &info, nullptr, &view);
2994 	if (res != VK_SUCCESS)
2995 		return BufferViewHandle(nullptr);
2996 
2997 	return BufferViewHandle(handle_pool.buffer_views.allocate(this, view, view_info));
2998 }
2999 
3000 class ImageResourceHolder
3001 {
3002 public:
3003 	explicit ImageResourceHolder(Device *device_)
3004 		: device(device_)
3005 		, table(device_->get_device_table())
3006 	{
3007 	}
3008 
3009 	~ImageResourceHolder()
3010 	{
3011 		if (owned)
3012 			cleanup();
3013 	}
3014 
3015 	Device *device;
3016 	const VolkDeviceTable &table;
3017 
3018 	VkImage image = VK_NULL_HANDLE;
3019 	VkDeviceMemory memory = VK_NULL_HANDLE;
3020 	VkImageView image_view = VK_NULL_HANDLE;
3021 	VkImageView depth_view = VK_NULL_HANDLE;
3022 	VkImageView stencil_view = VK_NULL_HANDLE;
3023 	VkImageView unorm_view = VK_NULL_HANDLE;
3024 	VkImageView srgb_view = VK_NULL_HANDLE;
3025 	VkImageViewType default_view_type = VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
3026 	vector<VkImageView> rt_views;
3027 	DeviceAllocation allocation;
3028 	DeviceAllocator *allocator = nullptr;
3029 	bool owned = true;
3030 
3031 	VkImageViewType get_default_view_type() const
3032 	{
3033 		return default_view_type;
3034 	}
3035 
3036 	bool setup_conversion_info(VkImageViewCreateInfo &create_info, VkSamplerYcbcrConversionInfo &conversion)
3037 	{
3038 		switch (create_info.format)
3039 		{
3040 		case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
3041 			if (!device->get_device_features().sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3042 				return false;
3043 			create_info.pNext = &conversion;
3044 			conversion = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3045 			conversion.conversion = device->samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV420P_3PLANE)];
3046 			break;
3047 
3048 		case VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM:
3049 			if (!device->get_device_features().sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3050 				return false;
3051 			create_info.pNext = &conversion;
3052 			conversion = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3053 			conversion.conversion = device->samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV422P_3PLANE)];
3054 			break;
3055 
3056 		case VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM:
3057 			if (!device->get_device_features().sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3058 				return false;
3059 			create_info.pNext = &conversion;
3060 			conversion = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3061 			conversion.conversion = device->samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV444P_3PLANE)];
3062 			break;
3063 
3064 		default:
3065 			break;
3066 		}
3067 
3068 		return true;
3069 	}
3070 
3071 	bool create_default_views(const ImageCreateInfo &create_info, const VkImageViewCreateInfo *view_info,
3072 	                          bool create_unorm_srgb_views = false, const VkFormat *view_formats = nullptr)
3073 	{
3074 		VkDevice vkdevice = device->get_device();
3075 
3076 		if ((create_info.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
3077 		                          VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) == 0)
3078 		{
3079 			LOGE("Cannot create image view unless certain usage flags are present.\n");
3080 			return false;
3081 		}
3082 
3083 		VkImageViewCreateInfo default_view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
3084 		VkSamplerYcbcrConversionInfo conversion_info = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
3085 
3086 		if (!view_info)
3087 		{
3088 			default_view_info.image = image;
3089 			default_view_info.format = create_info.format;
3090 			default_view_info.components = create_info.swizzle;
3091 			default_view_info.subresourceRange.aspectMask = format_to_aspect_mask(default_view_info.format);
3092 			default_view_info.viewType = get_image_view_type(create_info, nullptr);
3093 			default_view_info.subresourceRange.baseMipLevel = 0;
3094 			default_view_info.subresourceRange.baseArrayLayer = 0;
3095 			default_view_info.subresourceRange.levelCount = create_info.levels;
3096 			default_view_info.subresourceRange.layerCount = create_info.layers;
3097 
3098 			default_view_type = default_view_info.viewType;
3099 		}
3100 		else
3101 			default_view_info = *view_info;
3102 
3103 		view_info = &default_view_info;
3104 		if (!setup_conversion_info(default_view_info, conversion_info))
3105 			return false;
3106 
3107 		if (!create_alt_views(create_info, *view_info))
3108 			return false;
3109 
3110 		if (!create_render_target_views(create_info, *view_info))
3111 			return false;
3112 
3113 		if (!create_default_view(*view_info))
3114 			return false;
3115 
3116 		if (create_unorm_srgb_views)
3117 		{
3118 			auto info = *view_info;
3119 
3120 			info.format = view_formats[0];
3121 			if (table.vkCreateImageView(vkdevice, &info, nullptr, &unorm_view) != VK_SUCCESS)
3122 				return false;
3123 
3124 			info.format = view_formats[1];
3125 			if (table.vkCreateImageView(vkdevice, &info, nullptr, &srgb_view) != VK_SUCCESS)
3126 				return false;
3127 		}
3128 
3129 		return true;
3130 	}
3131 
3132 private:
3133 	bool create_render_target_views(const ImageCreateInfo &image_create_info, const VkImageViewCreateInfo &info)
3134 	{
3135 		rt_views.reserve(info.subresourceRange.layerCount);
3136 
3137 		if (info.viewType == VK_IMAGE_VIEW_TYPE_3D)
3138 			return true;
3139 
3140 		// If the image is a render target and we are in the non-trivial case (levels > 1 or layers > 1),
3141 		// create an array of render-target views, one per layer (base mip level only).
3142 		if ((image_create_info.usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) != 0 &&
3143 		    ((info.subresourceRange.levelCount > 1) || (info.subresourceRange.layerCount > 1)))
3144 		{
3145 			auto view_info = info;
3146 			view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;
3147 			view_info.subresourceRange.baseMipLevel = info.subresourceRange.baseMipLevel;
3148 			for (uint32_t layer = 0; layer < info.subresourceRange.layerCount; layer++)
3149 			{
3150 				view_info.subresourceRange.levelCount = 1;
3151 				view_info.subresourceRange.layerCount = 1;
3152 				view_info.subresourceRange.baseArrayLayer = layer + info.subresourceRange.baseArrayLayer;
3153 
3154 				VkImageView rt_view;
3155 				if (table.vkCreateImageView(device->get_device(), &view_info, nullptr, &rt_view) != VK_SUCCESS)
3156 					return false;
3157 
3158 				rt_views.push_back(rt_view);
3159 			}
3160 		}
3161 
3162 		return true;
3163 	}
3164 
3165 	bool create_alt_views(const ImageCreateInfo &image_create_info, const VkImageViewCreateInfo &info)
3166 	{
3167 		if (info.viewType == VK_IMAGE_VIEW_TYPE_CUBE ||
3168 		    info.viewType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY ||
3169 		    info.viewType == VK_IMAGE_VIEW_TYPE_3D)
3170 		{
3171 			return true;
3172 		}
3173 
3174 		VkDevice vkdevice = device->get_device();
3175 
3176 		if (info.subresourceRange.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
3177 		{
3178 			if ((image_create_info.usage & ~VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) != 0)
3179 			{
3180 				// Sanity check. Don't want to implement layered views for this.
3181 				if (info.subresourceRange.levelCount > 1)
3182 				{
3183 					LOGE("Cannot create depth stencil attachments with more than 1 mip level currently, and non-DS usage flags.\n");
3184 					return false;
3185 				}
3186 
3187 				if (info.subresourceRange.layerCount > 1)
3188 				{
3189 					LOGE("Cannot create layered depth stencil attachments with non-DS usage flags.\n");
3190 					return false;
3191 				}
3192 
3193 				auto view_info = info;
3194 
3195 				// We need this to be able to sample the texture, or otherwise use it as a non-pure DS attachment.
3196 				view_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
3197 				if (table.vkCreateImageView(vkdevice, &view_info, nullptr, &depth_view) != VK_SUCCESS)
3198 					return false;
3199 
3200 				view_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
3201 				if (table.vkCreateImageView(vkdevice, &view_info, nullptr, &stencil_view) != VK_SUCCESS)
3202 					return false;
3203 			}
3204 		}
3205 
3206 		return true;
3207 	}
3208 
3209 	bool create_default_view(const VkImageViewCreateInfo &info)
3210 	{
3211 		VkDevice vkdevice = device->get_device();
3212 
3213 		// Create the normal image view. This one contains every subresource.
3214 		if (table.vkCreateImageView(vkdevice, &info, nullptr, &image_view) != VK_SUCCESS)
3215 			return false;
3216 
3217 		return true;
3218 	}
3219 
3220 	void cleanup()
3221 	{
3222 		VkDevice vkdevice = device->get_device();
3223 
3224 		if (image_view)
3225 			table.vkDestroyImageView(vkdevice, image_view, nullptr);
3226 		if (depth_view)
3227 			table.vkDestroyImageView(vkdevice, depth_view, nullptr);
3228 		if (stencil_view)
3229 			table.vkDestroyImageView(vkdevice, stencil_view, nullptr);
3230 		if (unorm_view)
3231 			table.vkDestroyImageView(vkdevice, unorm_view, nullptr);
3232 		if (srgb_view)
3233 			table.vkDestroyImageView(vkdevice, srgb_view, nullptr);
3234 		for (auto &view : rt_views)
3235 			table.vkDestroyImageView(vkdevice, view, nullptr);
3236 
3237 		if (image)
3238 			table.vkDestroyImage(vkdevice, image, nullptr);
3239 		if (memory)
3240 			table.vkFreeMemory(vkdevice, memory, nullptr);
3241 		if (allocator)
3242 			allocation.free_immediate(*allocator);
3243 	}
3244 };
3245 
3246 ImageViewHandle Device::create_image_view(const ImageViewCreateInfo &create_info)
3247 {
3248 	ImageResourceHolder holder(this);
3249 	auto &image_create_info = create_info.image->get_create_info();
3250 
3251 	VkFormat format = create_info.format != VK_FORMAT_UNDEFINED ? create_info.format : image_create_info.format;
3252 
3253 	VkImageViewCreateInfo view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
3254 	view_info.image = create_info.image->get_image();
3255 	view_info.format = format;
3256 	view_info.components = create_info.swizzle;
3257 	view_info.subresourceRange.aspectMask = format_to_aspect_mask(format);
3258 	view_info.subresourceRange.baseMipLevel = create_info.base_level;
3259 	view_info.subresourceRange.baseArrayLayer = create_info.base_layer;
3260 	view_info.subresourceRange.levelCount = create_info.levels;
3261 	view_info.subresourceRange.layerCount = create_info.layers;
3262 
3263 	if (create_info.view_type == VK_IMAGE_VIEW_TYPE_RANGE_SIZE)
3264 		view_info.viewType = get_image_view_type(image_create_info, &create_info);
3265 	else
3266 		view_info.viewType = create_info.view_type;
3267 
3268 	unsigned num_levels;
3269 	if (view_info.subresourceRange.levelCount == VK_REMAINING_MIP_LEVELS)
3270 		num_levels = create_info.image->get_create_info().levels - view_info.subresourceRange.baseMipLevel;
3271 	else
3272 		num_levels = view_info.subresourceRange.levelCount;
3273 
3274 	unsigned num_layers;
3275 	if (view_info.subresourceRange.layerCount == VK_REMAINING_ARRAY_LAYERS)
3276 		num_layers = create_info.image->get_create_info().layers - view_info.subresourceRange.baseArrayLayer;
3277 	else
3278 		num_layers = view_info.subresourceRange.layerCount;
3279 
3280 	view_info.subresourceRange.levelCount = num_levels;
3281 	view_info.subresourceRange.layerCount = num_layers;
3282 
3283 	if (!holder.create_default_views(image_create_info, &view_info))
3284 		return ImageViewHandle(nullptr);
3285 
3286 	ImageViewCreateInfo tmp = create_info;
3287 	tmp.format = format;
3288 	ImageViewHandle ret(handle_pool.image_views.allocate(this, holder.image_view, tmp));
3289 	if (ret)
3290 	{
3291 		holder.owned = false;
3292 		ret->set_alt_views(holder.depth_view, holder.stencil_view);
3293 		ret->set_render_target_views(move(holder.rt_views));
3294 		return ret;
3295 	}
3296 	else
3297 		return ImageViewHandle(nullptr);
3298 }
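
// Usage sketch (editor's addition): requesting a view of a single mip level of an existing image,
// e.g. to bind one level as a storage image. Assumes `image` is an ImageHandle created elsewhere;
// leaving `format` as VK_FORMAT_UNDEFINED inherits the image's own format per the logic above.
//
//     ImageViewCreateInfo view_info = {};
//     view_info.image = image.get();
//     view_info.format = VK_FORMAT_UNDEFINED;          // use the image's format
//     view_info.base_level = 2;                        // view only mip level 2
//     view_info.levels = 1;
//     view_info.base_layer = 0;
//     view_info.layers = VK_REMAINING_ARRAY_LAYERS;
//     ImageViewHandle mip_view = device.create_image_view(view_info);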
3299 
3300 #ifndef _WIN32
3301 ImageHandle Device::create_imported_image(int fd, VkDeviceSize size, uint32_t memory_type,
3302                                           VkExternalMemoryHandleTypeFlagBitsKHR handle_type,
3303                                           const ImageCreateInfo &create_info)
3304 {
3305 	if (!ext.supports_external)
3306 		return {};
3307 
3308 	ImageResourceHolder holder(this);
3309 
3310 	VkImageCreateInfo info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
3311 	info.format = create_info.format;
3312 	info.extent.width = create_info.width;
3313 	info.extent.height = create_info.height;
3314 	info.extent.depth = create_info.depth;
3315 	info.imageType = create_info.type;
3316 	info.mipLevels = create_info.levels;
3317 	info.arrayLayers = create_info.layers;
3318 	info.samples = create_info.samples;
3319 	info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
3320 	info.tiling = VK_IMAGE_TILING_OPTIMAL;
3321 	info.usage = create_info.usage;
3322 	info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
3323 	info.flags = create_info.flags;
3324 	VK_ASSERT(create_info.domain != ImageDomain::Transient);
3325 
3326 	VkExternalMemoryImageCreateInfoKHR externalInfo = { VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR };
3327 	externalInfo.handleTypes = handle_type;
3328 	info.pNext = &externalInfo;
3329 
3330 	VK_ASSERT(image_format_is_supported(create_info.format, image_usage_to_features(info.usage), info.tiling));
3331 
3332 	if (table->vkCreateImage(device, &info, nullptr, &holder.image) != VK_SUCCESS)
3333 		return ImageHandle(nullptr);
3334 
3335 	VkMemoryAllocateInfo alloc_info = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
3336 	alloc_info.allocationSize = size;
3337 	alloc_info.memoryTypeIndex = memory_type;
3338 
3339 	VkMemoryDedicatedAllocateInfoKHR dedicated_info = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR };
3340 	dedicated_info.image = holder.image;
3341 	alloc_info.pNext = &dedicated_info;
3342 
3343 	VkImportMemoryFdInfoKHR fd_info = { VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR };
3344 	fd_info.handleType = handle_type;
3345 	fd_info.fd = fd;
3346 	dedicated_info.pNext = &fd_info;
3347 
3348 	VkMemoryRequirements reqs;
3349 	table->vkGetImageMemoryRequirements(device, holder.image, &reqs);
3350 	if (reqs.size > size)
3351 		return ImageHandle(nullptr);
3352 
3353 	if (((1u << memory_type) & reqs.memoryTypeBits) == 0)
3354 		return ImageHandle(nullptr);
3355 
3356 	if (table->vkAllocateMemory(device, &alloc_info, nullptr, &holder.memory) != VK_SUCCESS)
3357 		return ImageHandle(nullptr);
3358 
3359 	if (table->vkBindImageMemory(device, holder.image, holder.memory, 0) != VK_SUCCESS)
3360 		return ImageHandle(nullptr);
3361 
3362 	// Create default image views.
3363 	// The app could of course do this on its own, but it's very handy to have these created automatically.
3364 	VkImageViewType view_type = VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
3365 	if (info.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
3366 	                  VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
3367 	{
3368 		if (!holder.create_default_views(create_info, nullptr))
3369 			return ImageHandle(nullptr);
3370 		view_type = holder.get_default_view_type();
3371 	}
3372 
3373 	auto allocation = DeviceAllocation::make_imported_allocation(holder.memory, size, memory_type);
3374 	ImageHandle handle(handle_pool.images.allocate(this, holder.image, holder.image_view, allocation, create_info, view_type));
3375 	if (handle)
3376 	{
3377 		holder.owned = false;
3378 		handle->get_view().set_alt_views(holder.depth_view, holder.stencil_view);
3379 		handle->get_view().set_render_target_views(move(holder.rt_views));
3380 
3381 		// Set possible dstStage and dstAccess.
3382 		handle->set_stage_flags(image_usage_to_possible_stages(info.usage));
3383 		handle->set_access_flags(image_usage_to_possible_access(info.usage));
3384 		return handle;
3385 	}
3386 	else
3387 		return ImageHandle(nullptr);
3388 }
3389 #endif
3390 
3391 InitialImageBuffer Device::create_image_staging_buffer(const TextureFormatLayout &layout)
3392 {
3393 	InitialImageBuffer result;
3394 
3395 	BufferCreateInfo buffer_info = {};
3396 	buffer_info.domain = BufferDomain::Host;
3397 	buffer_info.size = layout.get_required_size();
3398 	buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
3399 	result.buffer = create_buffer(buffer_info, nullptr);
3400 	set_name(*result.buffer, "image-upload-staging-buffer");
3401 
3402 	auto *mapped = static_cast<uint8_t *>(map_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT));
3403 	memcpy(mapped, layout.data(), layout.get_required_size());
3404 	unmap_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT);
3405 
3406 	layout.build_buffer_image_copies(result.blits);
3407 	return result;
3408 }
3409 
3410 InitialImageBuffer Device::create_image_staging_buffer(const ImageCreateInfo &info, const ImageInitialData *initial)
3411 {
3412 	InitialImageBuffer result;
3413 
3414 	bool generate_mips = (info.misc & IMAGE_MISC_GENERATE_MIPS_BIT) != 0;
3415 	TextureFormatLayout layout;
3416 
3417 	unsigned copy_levels;
3418 	if (generate_mips)
3419 		copy_levels = 1;
3420 	else if (info.levels == 0)
3421 		copy_levels = TextureFormatLayout::num_miplevels(info.width, info.height, info.depth);
3422 	else
3423 		copy_levels = info.levels;
3424 
3425 	switch (info.type)
3426 	{
3427 	case VK_IMAGE_TYPE_1D:
3428 		layout.set_1d(info.format, info.width, info.layers, copy_levels);
3429 		break;
3430 	case VK_IMAGE_TYPE_2D:
3431 		layout.set_2d(info.format, info.width, info.height, info.layers, copy_levels);
3432 		break;
3433 	case VK_IMAGE_TYPE_3D:
3434 		layout.set_3d(info.format, info.width, info.height, info.depth, copy_levels);
3435 		break;
3436 	default:
3437 		return {};
3438 	}
3439 
3440 	BufferCreateInfo buffer_info = {};
3441 	buffer_info.domain = BufferDomain::Host;
3442 	buffer_info.size = layout.get_required_size();
3443 	buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
3444 	result.buffer = create_buffer(buffer_info, nullptr);
3445 	set_name(*result.buffer, "image-upload-staging-buffer");
3446 
3447 	// And now, do the actual copy.
3448 	auto *mapped = static_cast<uint8_t *>(map_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT));
3449 	unsigned index = 0;
3450 
3451 	layout.set_buffer(mapped, layout.get_required_size());
3452 
3453 	for (unsigned level = 0; level < copy_levels; level++)
3454 	{
3455 		const auto &mip_info = layout.get_mip_info(level);
3456 		uint32_t dst_height_stride = layout.get_layer_size(level);
3457 		size_t row_size = layout.get_row_size(level);
3458 
3459 		for (unsigned layer = 0; layer < info.layers; layer++, index++)
3460 		{
3461 			uint32_t src_row_length =
3462 					initial[index].row_length ? initial[index].row_length : mip_info.row_length;
3463 			uint32_t src_array_height =
3464 					initial[index].image_height ? initial[index].image_height : mip_info.image_height;
3465 
3466 			uint32_t src_row_stride = layout.row_byte_stride(src_row_length);
3467 			uint32_t src_height_stride = layout.layer_byte_stride(src_array_height, src_row_stride);
3468 
3469 			uint8_t *dst = static_cast<uint8_t *>(layout.data(layer, level));
3470 			const uint8_t *src = static_cast<const uint8_t *>(initial[index].data);
3471 
3472 			for (uint32_t z = 0; z < mip_info.depth; z++)
3473 				for (uint32_t y = 0; y < mip_info.block_image_height; y++)
3474 					memcpy(dst + z * dst_height_stride + y * row_size, src + z * src_height_stride + y * src_row_stride, row_size);
3475 		}
3476 	}
3477 
3478 	unmap_host_buffer(*result.buffer, MEMORY_ACCESS_WRITE_BIT);
3479 	layout.build_buffer_image_copies(result.blits);
3480 	return result;
3481 }
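
// Worked example (editor's addition) of the stride defaulting in the copy loop above, assuming
// an uncompressed VK_FORMAT_R8G8B8A8_UNORM level of 16x16 texels with tightly packed source data
// (initial->row_length == 0, initial->image_height == 0):
//
//     src_row_length    = mip_info.row_length   = 16 texels
//     src_array_height  = mip_info.image_height = 16 rows
//     src_row_stride    = 16 * 4                = 64 bytes per row
//     src_height_stride = 16 * 64               = 1024 bytes per slice/layer
//
// A caller that uploads from a larger, padded surface would instead set ImageInitialData::row_length
// and image_height to the padded dimensions; the memcpy loop then skips the padding while still
// writing tightly packed rows into the staging buffer.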
3482 
3483 YCbCrImageHandle Device::create_ycbcr_image(const YCbCrImageCreateInfo &create_info)
3484 {
3485 	if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
3486 		return YCbCrImageHandle(nullptr);
3487 
3488 	VkFormatProperties format_properties = {};
3489 	get_format_properties(format_ycbcr_planar_vk_format(create_info.format), &format_properties);
3490 
3491 	if ((format_properties.optimalTilingFeatures & VK_FORMAT_FEATURE_DISJOINT_BIT) == 0)
3492 	{
3493 		LOGE("YCbCr format does not support DISJOINT_BIT.\n");
3494 		return YCbCrImageHandle(nullptr);
3495 	}
3496 
3497 	if ((format_properties.optimalTilingFeatures & VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT) == 0)
3498 	{
3499 		LOGE("YCbCr format does not support MIDPOINT_CHROMA_SAMPLES_BIT.\n");
3500 		return YCbCrImageHandle(nullptr);
3501 	}
3502 
3503 	if ((format_properties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT) == 0)
3504 	{
3505 		LOGE("YCbCr format does not support YCBCR_CONVERSION_LINEAR_FILTER_BIT.\n");
3506 		return YCbCrImageHandle(nullptr);
3507 	}
3508 
3509 	ImageHandle ycbcr_image;
3510 	ImageHandle plane_handles[3];
3511 	unsigned num_planes = format_ycbcr_num_planes(create_info.format);
3512 
3513 	for (unsigned i = 0; i < num_planes; i++)
3514 	{
3515 		ImageCreateInfo plane_info = ImageCreateInfo::immutable_2d_image(
3516 				create_info.width,
3517 				create_info.height,
3518 				format_ycbcr_plane_vk_format(create_info.format, i));
3519 		plane_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
3520 		plane_info.initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
3521 
3522 		plane_info.width >>= format_ycbcr_downsample_ratio_log2(create_info.format, 0, i);
3523 		plane_info.height >>= format_ycbcr_downsample_ratio_log2(create_info.format, 1, i);
3524 		plane_info.flags = VK_IMAGE_CREATE_ALIAS_BIT; // Will alias directly over the YCbCr image.
3525 		plane_info.misc = IMAGE_MISC_FORCE_NO_DEDICATED_BIT;
3526 		plane_handles[i] = create_image(plane_info);
3527 		if (!plane_handles[i])
3528 		{
3529 			LOGE("Failed to create plane image.\n");
3530 			return YCbCrImageHandle(nullptr);
3531 		}
3532 	}
3533 
3534 	ImageCreateInfo ycbcr_info = ImageCreateInfo::immutable_2d_image(
3535 			create_info.width,
3536 			create_info.height,
3537 			format_ycbcr_planar_vk_format(create_info.format));
3538 	ycbcr_info.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
3539 	ycbcr_info.initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
3540 	ycbcr_info.flags = VK_IMAGE_CREATE_DISJOINT_BIT | VK_IMAGE_CREATE_ALIAS_BIT;
3541 	ycbcr_info.misc = IMAGE_MISC_FORCE_NO_DEDICATED_BIT;
3542 
3543 	const DeviceAllocation *allocations[3];
3544 	for (unsigned i = 0; i < num_planes; i++)
3545 		allocations[i] = &plane_handles[i]->get_allocation();
3546 	ycbcr_info.memory_aliases = allocations;
3547 	ycbcr_info.num_memory_aliases = num_planes;
3548 
3549 	ycbcr_image = create_image(ycbcr_info);
3550 	if (!ycbcr_image)
3551 		return YCbCrImageHandle(nullptr);
3552 
3553 	YCbCrImageHandle handle(handle_pool.ycbcr_images.allocate(this, create_info.format, ycbcr_image, plane_handles, num_planes));
3554 	return handle;
3555 }
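
// Usage sketch (editor's addition): creating a 3-plane YUV 4:2:0 image whose plane images alias
// the combined disjoint image, ready to be sampled through the matching LinearYUV420P stock
// sampler. Field names follow their uses above; requires samplerYcbcrConversion support.
//
//     YCbCrImageCreateInfo ycbcr_info = {};
//     ycbcr_info.format = YCbCrFormat::YUV420P_3PLANE;
//     ycbcr_info.width = 1920;
//     ycbcr_info.height = 1080;
//     YCbCrImageHandle video_frame = device.create_ycbcr_image(ycbcr_info);
//     if (!video_frame)
//         LOGE("YCbCr images not supported on this device.\n");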
3556 
3557 ImageHandle Device::create_image(const ImageCreateInfo &create_info, const ImageInitialData *initial)
3558 {
3559 	if (initial)
3560 	{
3561 		auto staging_buffer = create_image_staging_buffer(create_info, initial);
3562 		return create_image_from_staging_buffer(create_info, &staging_buffer);
3563 	}
3564 	else
3565 		return create_image_from_staging_buffer(create_info, nullptr);
3566 }
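
// Usage sketch (editor's addition): creating an immutable 2D texture from CPU pixel data and
// letting the device generate the mip chain. Assumes `pixels` points to tightly packed RGBA8
// data of width * height * 4 bytes; field names follow their uses elsewhere in this file.
//
//     ImageCreateInfo info = ImageCreateInfo::immutable_2d_image(width, height, VK_FORMAT_R8G8B8A8_SRGB);
//     info.levels = 0;                            // 0 -> allocate the full mip chain
//     info.misc |= IMAGE_MISC_GENERATE_MIPS_BIT;  // upload only level 0, blit the rest
//
//     ImageInitialData initial = {};
//     initial.data = pixels;                      // row_length/image_height == 0 -> tightly packed
//     ImageHandle image = device.create_image(info, &initial);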
3567 
3568 bool Device::allocate_image_memory(DeviceAllocation *allocation, const ImageCreateInfo &info,
3569                                    VkImage image, VkImageTiling tiling)
3570 {
3571 	if ((info.flags & VK_IMAGE_CREATE_DISJOINT_BIT) != 0 && info.num_memory_aliases == 0)
3572 	{
3573 		LOGE("Must use memory aliases when creating a DISJOINT planar image.\n");
3574 		return false;
3575 	}
3576 
3577 	if (info.num_memory_aliases != 0)
3578 	{
3579 		*allocation = {};
3580 
3581 		unsigned num_planes = format_ycbcr_num_planes(info.format);
3582 		if (info.num_memory_aliases < num_planes)
3583 			return false;
3584 
3585 		if (num_planes == 1)
3586 		{
3587 			VkMemoryRequirements reqs;
3588 			table->vkGetImageMemoryRequirements(device, image, &reqs);
3589 			auto &alias = *info.memory_aliases[0];
3590 
3591 			// Verify we can actually use this aliased allocation.
3592 			if ((reqs.memoryTypeBits & (1u << alias.memory_type)) == 0)
3593 				return false;
3594 			if (reqs.size > alias.size)
3595 				return false;
3596 			if (((alias.offset + reqs.alignment - 1) & ~(reqs.alignment - 1)) != alias.offset)
3597 				return false;
3598 
3599 			if (table->vkBindImageMemory(device, image, alias.get_memory(), alias.get_offset()) != VK_SUCCESS)
3600 				return false;
3601 		}
3602 		else
3603 		{
3604 			if (!ext.supports_bind_memory2 || !ext.supports_get_memory_requirements2)
3605 				return false;
3606 
3607 			VkBindImageMemoryInfo bind_infos[3];
3608 			VkBindImagePlaneMemoryInfo bind_plane_infos[3];
3609 			VK_ASSERT(num_planes <= 3);
3610 
3611 			for (unsigned plane = 0; plane < num_planes; plane++)
3612 			{
3613 				VkMemoryRequirements2KHR memory_req = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR };
3614 				VkImageMemoryRequirementsInfo2KHR image_info = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR };
3615 				image_info.image = image;
3616 
3617 				VkImagePlaneMemoryRequirementsInfo plane_info = { VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO_KHR };
3618 				plane_info.planeAspect = static_cast<VkImageAspectFlagBits>(VK_IMAGE_ASPECT_PLANE_0_BIT << plane);
3619 				image_info.pNext = &plane_info;
3620 
3621 				table->vkGetImageMemoryRequirements2KHR(device, &image_info, &memory_req);
3622 				auto &reqs = memory_req.memoryRequirements;
3623 				auto &alias = *info.memory_aliases[plane];
3624 
3625 				// Verify we can actually use this aliased allocation.
3626 				if ((reqs.memoryTypeBits & (1u << alias.memory_type)) == 0)
3627 					return false;
3628 				if (reqs.size > alias.size)
3629 					return false;
3630 				if (((alias.offset + reqs.alignment - 1) & ~(reqs.alignment - 1)) != alias.offset)
3631 					return false;
3632 
3633 				bind_infos[plane] = { VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO };
3634 				bind_infos[plane].image = image;
3635 				bind_infos[plane].memory = alias.base;
3636 				bind_infos[plane].memoryOffset = alias.offset;
3637 				bind_infos[plane].pNext = &bind_plane_infos[plane];
3638 
3639 				bind_plane_infos[plane] = { VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO };
3640 				bind_plane_infos[plane].planeAspect = static_cast<VkImageAspectFlagBits>(VK_IMAGE_ASPECT_PLANE_0_BIT << plane);
3641 			}
3642 
3643 			if (table->vkBindImageMemory2KHR(device, num_planes, bind_infos) != VK_SUCCESS)
3644 				return false;
3645 		}
3646 	}
3647 	else
3648 	{
3649 		VkMemoryRequirements reqs;
3650 		table->vkGetImageMemoryRequirements(device, image, &reqs);
3651 
3652 		// If we intend to alias with other images, bump the alignment to something very high.
3653 		// This is kind of crude, but should be high enough to allow YCbCr disjoint aliasing on any implementation.
3654 		if (info.flags & VK_IMAGE_CREATE_ALIAS_BIT)
3655 			if (reqs.alignment < 64 * 1024)
3656 				reqs.alignment = 64 * 1024;
3657 
3658 		uint32_t memory_type = find_memory_type(info.domain, reqs.memoryTypeBits);
3659 		if (memory_type == UINT32_MAX)
3660 		{
3661 			LOGE("Failed to find memory type.\n");
3662 			return false;
3663 		}
3664 
3665 		if (tiling == VK_IMAGE_TILING_LINEAR &&
3666 		    (info.misc & IMAGE_MISC_LINEAR_IMAGE_IGNORE_DEVICE_LOCAL_BIT) == 0)
3667 		{
3668 			// Is it also device local?
3669 			if ((mem_props.memoryTypes[memory_type].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == 0)
3670 				return false;
3671 		}
3672 
3673 		if (!managers.memory.allocate_image_memory(reqs.size, reqs.alignment, memory_type,
3674 		                                           tiling == VK_IMAGE_TILING_OPTIMAL ? ALLOCATION_TILING_OPTIMAL
3675 		                                                                             : ALLOCATION_TILING_LINEAR,
3676 		                                           allocation, image,
3677 		                                           (info.misc & IMAGE_MISC_FORCE_NO_DEDICATED_BIT) != 0))
3678 		{
3679 			LOGE("Failed to allocate image memory (type %u, size: %u).\n", unsigned(memory_type), unsigned(reqs.size));
3680 			return false;
3681 		}
3682 
3683 		if (table->vkBindImageMemory(device, image, allocation->get_memory(),
3684 		                             allocation->get_offset()) != VK_SUCCESS)
3685 		{
3686 			LOGE("Failed to bind image memory.\n");
3687 			return false;
3688 		}
3689 	}
3690 
3691 	return true;
3692 }
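
// Worked example (editor's addition) of the alias validation above. The expression
// ((offset + alignment - 1) & ~(alignment - 1)) rounds `offset` up to the next multiple of
// `alignment` (a power of two per the Vulkan spec), so it equals `offset` only when the aliased
// allocation is already sufficiently aligned:
//
//     alignment = 0x1000 (4096)
//     offset    = 0x3000 -> (0x3000 + 0xFFF) & ~0xFFF = 0x3000  -> accepted
//     offset    = 0x3100 -> (0x3100 + 0xFFF) & ~0xFFF = 0x4000  -> rejected (mis-aligned alias)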
3693 
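// create_image_from_staging_buffer (below) is the workhorse behind create_image. In summary it:
//   1. Translates ImageCreateInfo into VkImageCreateInfo (tiling, usage, concurrent sharing).
//   2. Creates the VkImage, allocates and binds memory, and builds the default views via ImageResourceHolder.
//   3. If a staging buffer is provided, records the buffer-to-image copy on the transfer queue,
//      inserts a queue family ownership transfer (release + acquire barriers) when the transfer and
//      graphics families differ and the image is not CONCURRENT, optionally generates mips on the
//      graphics queue, and transitions the image to create_info.initial_layout.
//   4. Adds semaphores so the async compute queue also observes the initialized image when
//      concurrent queue usage is requested.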
3694 ImageHandle Device::create_image_from_staging_buffer(const ImageCreateInfo &create_info,
3695                                                      const InitialImageBuffer *staging_buffer)
3696 {
3697 	ImageResourceHolder holder(this);
3698 
3699 	VkImageCreateInfo info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
3700 	info.format = create_info.format;
3701 	info.extent.width = create_info.width;
3702 	info.extent.height = create_info.height;
3703 	info.extent.depth = create_info.depth;
3704 	info.imageType = create_info.type;
3705 	info.mipLevels = create_info.levels;
3706 	info.arrayLayers = create_info.layers;
3707 	info.samples = create_info.samples;
3708 
3709 	if (create_info.domain == ImageDomain::LinearHostCached || create_info.domain == ImageDomain::LinearHost)
3710 	{
3711 		info.tiling = VK_IMAGE_TILING_LINEAR;
3712 		info.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;
3713 	}
3714 	else
3715 	{
3716 		info.tiling = VK_IMAGE_TILING_OPTIMAL;
3717 		info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
3718 	}
3719 
3720 	info.usage = create_info.usage;
3721 	info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
3722 	if (create_info.domain == ImageDomain::Transient)
3723 		info.usage |= VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT;
3724 	if (staging_buffer)
3725 		info.usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
3726 
3727 	info.flags = create_info.flags;
3728 
3729 	if (info.mipLevels == 0)
3730 		info.mipLevels = image_num_miplevels(info.extent);
3731 
3732 	VkImageFormatListCreateInfoKHR format_info = { VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO_KHR };
3733 	VkFormat view_formats[2];
3734 	format_info.pViewFormats = view_formats;
3735 	format_info.viewFormatCount = 2;
3736 	bool create_unorm_srgb_views = false;
3737 
3738 	if (create_info.misc & IMAGE_MISC_MUTABLE_SRGB_BIT)
3739 	{
3740 		format_info.viewFormatCount = ImageCreateInfo::compute_view_formats(create_info, view_formats);
3741 		if (format_info.viewFormatCount != 0)
3742 		{
3743 			create_unorm_srgb_views = true;
3744 			if (ext.supports_image_format_list)
3745 				info.pNext = &format_info;
3746 		}
3747 	}
3748 
3749 	if ((create_info.usage & VK_IMAGE_USAGE_STORAGE_BIT) ||
3750 	    (create_info.misc & IMAGE_MISC_MUTABLE_SRGB_BIT))
3751 	{
3752 		info.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
3753 	}
3754 
3755 	// Only do this conditionally.
3756 	// On AMD, using CONCURRENT with async compute disables compression.
3757 	uint32_t sharing_indices[3] = {};
3758 
3759 	uint32_t queue_flags = create_info.misc & (IMAGE_MISC_CONCURRENT_QUEUE_GRAPHICS_BIT |
3760 	                                           IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_COMPUTE_BIT |
3761 	                                           IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_GRAPHICS_BIT |
3762 	                                           IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_TRANSFER_BIT);
3763 	bool concurrent_queue = queue_flags != 0;
3764 	if (concurrent_queue)
3765 	{
3766 		info.sharingMode = VK_SHARING_MODE_CONCURRENT;
3767 
3768 		const auto add_unique_family = [&](uint32_t family) {
3769 			for (uint32_t i = 0; i < info.queueFamilyIndexCount; i++)
3770 			{
3771 				if (sharing_indices[i] == family)
3772 					return;
3773 			}
3774 			sharing_indices[info.queueFamilyIndexCount++] = family;
3775 		};
3776 
3777 		if (queue_flags & (IMAGE_MISC_CONCURRENT_QUEUE_GRAPHICS_BIT | IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_GRAPHICS_BIT))
3778 			add_unique_family(graphics_queue_family_index);
3779 		if (queue_flags & IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_COMPUTE_BIT)
3780 			add_unique_family(compute_queue_family_index);
3781 		if (staging_buffer || (queue_flags & IMAGE_MISC_CONCURRENT_QUEUE_ASYNC_TRANSFER_BIT) != 0)
3782 			add_unique_family(transfer_queue_family_index);
3783 
3784 		if (info.queueFamilyIndexCount > 1)
3785 			info.pQueueFamilyIndices = sharing_indices;
3786 		else
3787 		{
3788 			info.pQueueFamilyIndices = nullptr;
3789 			info.queueFamilyIndexCount = 0;
3790 			info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
3791 		}
3792 	}
3793 
3794 	VkFormatFeatureFlags check_extra_features = 0;
3795 	if ((create_info.misc & IMAGE_MISC_VERIFY_FORMAT_FEATURE_SAMPLED_LINEAR_FILTER_BIT) != 0)
3796 		check_extra_features |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
3797 
3798 	if (info.tiling == VK_IMAGE_TILING_LINEAR)
3799 	{
3800 		if (staging_buffer)
3801 			return ImageHandle(nullptr);
3802 
3803 		// Do some more stringent checks.
3804 		if (info.mipLevels > 1)
3805 			return ImageHandle(nullptr);
3806 		if (info.arrayLayers > 1)
3807 			return ImageHandle(nullptr);
3808 		if (info.imageType != VK_IMAGE_TYPE_2D)
3809 			return ImageHandle(nullptr);
3810 		if (info.samples != VK_SAMPLE_COUNT_1_BIT)
3811 			return ImageHandle(nullptr);
3812 
3813 		VkImageFormatProperties props;
3814 		if (!get_image_format_properties(info.format, info.imageType, info.tiling, info.usage, info.flags, &props))
3815 			return ImageHandle(nullptr);
3816 
3817 		if (!props.maxArrayLayers ||
3818 		    !props.maxMipLevels ||
3819 		    (info.extent.width > props.maxExtent.width) ||
3820 		    (info.extent.height > props.maxExtent.height) ||
3821 		    (info.extent.depth > props.maxExtent.depth))
3822 		{
3823 			return ImageHandle(nullptr);
3824 		}
3825 	}
3826 
3827 	if (!image_format_is_supported(create_info.format, image_usage_to_features(info.usage) | check_extra_features, info.tiling))
3828 	{
3829 		LOGE("Format %u is not supported for usage flags!\n", unsigned(create_info.format));
3830 		return ImageHandle(nullptr);
3831 	}
3832 
3833 	if (table->vkCreateImage(device, &info, nullptr, &holder.image) != VK_SUCCESS)
3834 	{
3835 		LOGE("Failed to create image in vkCreateImage.\n");
3836 		return ImageHandle(nullptr);
3837 	}
3838 
3839 	if (!allocate_image_memory(&holder.allocation, create_info, holder.image, info.tiling))
3840 	{
3841 		LOGE("Failed to allocate memory for image.\n");
3842 		return ImageHandle(nullptr);
3843 	}
3844 
3845 	auto tmpinfo = create_info;
3846 	tmpinfo.usage = info.usage;
3847 	tmpinfo.flags = info.flags;
3848 	tmpinfo.levels = info.mipLevels;
3849 
3850 	bool has_view = (info.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
3851 	                               VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) != 0;
3852 
3853 	VkImageViewType view_type = VK_IMAGE_VIEW_TYPE_RANGE_SIZE;
3854 	if (has_view)
3855 	{
3856 		if (!holder.create_default_views(tmpinfo, nullptr, create_unorm_srgb_views, view_formats))
3857 			return ImageHandle(nullptr);
3858 		view_type = holder.get_default_view_type();
3859 	}
3860 
3861 	ImageHandle handle(handle_pool.images.allocate(this, holder.image, holder.image_view, holder.allocation, tmpinfo, view_type));
3862 	if (handle)
3863 	{
3864 		holder.owned = false;
3865 		if (has_view)
3866 		{
3867 			handle->get_view().set_alt_views(holder.depth_view, holder.stencil_view);
3868 			handle->get_view().set_render_target_views(move(holder.rt_views));
3869 			handle->get_view().set_unorm_view(holder.unorm_view);
3870 			handle->get_view().set_srgb_view(holder.srgb_view);
3871 		}
3872 
3873 		// Set possible dstStage and dstAccess.
3874 		handle->set_stage_flags(image_usage_to_possible_stages(info.usage));
3875 		handle->set_access_flags(image_usage_to_possible_access(info.usage));
3876 	}
3877 
3878 	// Copy initial data to texture.
3879 	if (staging_buffer)
3880 	{
3881 		VK_ASSERT(create_info.domain != ImageDomain::Transient);
3882 		VK_ASSERT(create_info.initial_layout != VK_IMAGE_LAYOUT_UNDEFINED);
3883 		bool generate_mips = (create_info.misc & IMAGE_MISC_GENERATE_MIPS_BIT) != 0;
3884 
3885 		// If graphics_queue != transfer_queue, we will use a semaphore, so no srcAccess mask is necessary.
3886 		VkAccessFlags final_transition_src_access = 0;
3887 		if (generate_mips)
3888 			final_transition_src_access = VK_ACCESS_TRANSFER_READ_BIT; // Validation complains otherwise.
3889 		else if (graphics_queue == transfer_queue)
3890 			final_transition_src_access = VK_ACCESS_TRANSFER_WRITE_BIT;
3891 
3892 		VkAccessFlags prepare_src_access = graphics_queue == transfer_queue ? VK_ACCESS_TRANSFER_WRITE_BIT : 0;
3893 		bool need_mipmap_barrier = true;
3894 		bool need_initial_barrier = true;
3895 
3896 		// Now we've used the TRANSFER queue to copy data over to the GPU.
3897 		// For mipmapping, we're now moving over to graphics,
3898 		// the transfer queue is designed for CPU <-> GPU and that's it.
3899 
3900 		// For concurrent queue mode, we just need to inject a semaphore.
3901 		// For non-concurrent queue mode, we will have to inject ownership transfer barrier if the queue families do not match.
3902 
3903 		auto graphics_cmd = request_command_buffer(CommandBuffer::Type::Generic);
3904 		CommandBufferHandle transfer_cmd;
3905 
3906 		// Don't split the upload into multiple command buffers unless we have to.
3907 		if (transfer_queue != graphics_queue)
3908 			transfer_cmd = request_command_buffer(CommandBuffer::Type::AsyncTransfer);
3909 		else
3910 			transfer_cmd = graphics_cmd;
3911 
3912 		transfer_cmd->image_barrier(*handle, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
3913 		                            VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, VK_PIPELINE_STAGE_TRANSFER_BIT,
3914 		                            VK_ACCESS_TRANSFER_WRITE_BIT);
3915 
3916 		transfer_cmd->begin_region("copy-image-to-gpu");
3917 		transfer_cmd->copy_buffer_to_image(*handle, *staging_buffer->buffer, staging_buffer->blits.size(), staging_buffer->blits.data());
3918 		transfer_cmd->end_region();
3919 
3920 		if (transfer_queue != graphics_queue)
3921 		{
3922 			VkPipelineStageFlags dst_stages =
3923 					generate_mips ? VkPipelineStageFlags(VK_PIPELINE_STAGE_TRANSFER_BIT) : handle->get_stage_flags();
3924 
3925 			// We can't just use semaphores; we also need a release + acquire barrier pair to transfer
3926 			// ownership from the transfer queue over to the graphics queue ...
3927 			if (!concurrent_queue && transfer_queue_family_index != graphics_queue_family_index)
3928 			{
3929 				need_mipmap_barrier = false;
3930 
3931 				VkImageMemoryBarrier release = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER };
3932 				release.image = handle->get_image();
3933 				release.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
3934 				release.dstAccessMask = 0;
3935 				release.srcQueueFamilyIndex = transfer_queue_family_index;
3936 				release.dstQueueFamilyIndex = graphics_queue_family_index;
3937 				release.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
3938 
3939 				if (generate_mips)
3940 				{
3941 					release.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
3942 					release.subresourceRange.levelCount = 1;
3943 				}
3944 				else
3945 				{
3946 					release.newLayout = create_info.initial_layout;
3947 					release.subresourceRange.levelCount = info.mipLevels;
3948 					need_initial_barrier = false;
3949 				}
3950 
3951 				release.subresourceRange.aspectMask = format_to_aspect_mask(info.format);
3952 				release.subresourceRange.layerCount = info.arrayLayers;
3953 
3954 				VkImageMemoryBarrier acquire = release;
3955 				acquire.srcAccessMask = 0;
3956 
3957 				if (generate_mips)
3958 					acquire.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
3959 				else
3960 					acquire.dstAccessMask = handle->get_access_flags() & image_layout_to_possible_access(create_info.initial_layout);
3961 
3962 				transfer_cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
3963 				                      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
3964 				                      0, nullptr, 0, nullptr, 1, &release);
3965 
3966 				graphics_cmd->barrier(dst_stages,
3967 				                      dst_stages,
3968 				                      0, nullptr, 0, nullptr, 1, &acquire);
3969 			}
3970 
3971 			Semaphore sem;
3972 			submit(transfer_cmd, nullptr, 1, &sem);
3973 			add_wait_semaphore(CommandBuffer::Type::Generic, sem, dst_stages, true);
3974 		}
3975 
3976 		if (generate_mips)
3977 		{
3978 			graphics_cmd->begin_region("mipgen");
3979 			graphics_cmd->barrier_prepare_generate_mipmap(*handle, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
3980 			                                              VK_PIPELINE_STAGE_TRANSFER_BIT,
3981 			                                              prepare_src_access, need_mipmap_barrier);
3982 			graphics_cmd->generate_mipmap(*handle);
3983 			graphics_cmd->end_region();
3984 		}
3985 
3986 		if (need_initial_barrier)
3987 		{
3988 			graphics_cmd->image_barrier(
3989 					*handle, generate_mips ? VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
3990 					create_info.initial_layout,
3991 					VK_PIPELINE_STAGE_TRANSFER_BIT, final_transition_src_access,
3992 					handle->get_stage_flags(),
3993 					handle->get_access_flags() & image_layout_to_possible_access(create_info.initial_layout));
3994 		}
3995 
3996 		bool share_compute = concurrent_queue && graphics_queue != compute_queue;
3997 		bool share_async_graphics = get_physical_queue_type(CommandBuffer::Type::AsyncGraphics) == CommandBuffer::Type::AsyncCompute;
3998 
3999 		// For concurrent queue, make sure that compute can see the final image as well.
4000 		// Also add semaphore if the compute queue can be used for async graphics as well.
4001 		if (share_compute || share_async_graphics)
4002 		{
4003 			Semaphore sem;
4004 			submit(graphics_cmd, nullptr, 1, &sem);
4005 
4006 			VkPipelineStageFlags dst_stages = handle->get_stage_flags();
4007 			if (graphics_queue_family_index != compute_queue_family_index)
4008 				dst_stages &= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT;
4009 			add_wait_semaphore(CommandBuffer::Type::AsyncCompute, sem, dst_stages, true);
4010 		}
4011 		else
4012 			submit(graphics_cmd);
4013 	}
4014 	else if (create_info.initial_layout != VK_IMAGE_LAYOUT_UNDEFINED)
4015 	{
4016 		VK_ASSERT(create_info.domain != ImageDomain::Transient);
4017 		auto cmd = request_command_buffer(CommandBuffer::Type::Generic);
4018 		cmd->image_barrier(*handle, info.initialLayout, create_info.initial_layout,
4019 		                   VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, handle->get_stage_flags(),
4020 		                   handle->get_access_flags() &
4021 		                   image_layout_to_possible_access(create_info.initial_layout));
4022 
4023 		// For concurrent queue, make sure that compute can see the final image as well.
4024 		if (concurrent_queue && graphics_queue != compute_queue)
4025 		{
4026 			Semaphore sem;
4027 			submit(cmd, nullptr, 1, &sem);
4028 			add_wait_semaphore(CommandBuffer::Type::AsyncCompute,
4029 			                   sem, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, true);
4030 		}
4031 		else
4032 			submit(cmd);
4033 	}
4034 
4035 	return handle;
4036 }
4037 
4038 static VkSamplerCreateInfo fill_vk_sampler_info(const SamplerCreateInfo &sampler_info)
4039 {
4040 	VkSamplerCreateInfo info = { VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO };
4041 
4042 	info.magFilter = sampler_info.mag_filter;
4043 	info.minFilter = sampler_info.min_filter;
4044 	info.mipmapMode = sampler_info.mipmap_mode;
4045 	info.addressModeU = sampler_info.address_mode_u;
4046 	info.addressModeV = sampler_info.address_mode_v;
4047 	info.addressModeW = sampler_info.address_mode_w;
4048 	info.mipLodBias = sampler_info.mip_lod_bias;
4049 	info.anisotropyEnable = sampler_info.anisotropy_enable;
4050 	info.maxAnisotropy = sampler_info.max_anisotropy;
4051 	info.compareEnable = sampler_info.compare_enable;
4052 	info.compareOp = sampler_info.compare_op;
4053 	info.minLod = sampler_info.min_lod;
4054 	info.maxLod = sampler_info.max_lod;
4055 	info.borderColor = sampler_info.border_color;
4056 	info.unnormalizedCoordinates = sampler_info.unnormalized_coordinates;
4057 	return info;
4058 }
4059 
4060 SamplerHandle Device::create_sampler(const SamplerCreateInfo &sampler_info, StockSampler stock_sampler)
4061 {
4062 	auto info = fill_vk_sampler_info(sampler_info);
4063 	VkSampler sampler;
4064 
4065 	VkSamplerYcbcrConversionInfo conversion_info = { VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO };
4066 
4067 	switch (stock_sampler)
4068 	{
4069 	case StockSampler::LinearYUV420P:
4070 		if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
4071 			return SamplerHandle(nullptr);
4072 		info.pNext = &conversion_info;
4073 		conversion_info.conversion = samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV420P_3PLANE)];
4074 		break;
4075 
4076 	case StockSampler::LinearYUV422P:
4077 		if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
4078 			return SamplerHandle(nullptr);
4079 		info.pNext = &conversion_info;
4080 		conversion_info.conversion = samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV422P_3PLANE)];
4081 		break;
4082 
4083 	case StockSampler::LinearYUV444P:
4084 		if (!ext.sampler_ycbcr_conversion_features.samplerYcbcrConversion)
4085 			return SamplerHandle(nullptr);
4086 		info.pNext = &conversion_info;
4087 		conversion_info.conversion = samplers_ycbcr[static_cast<unsigned>(YCbCrFormat::YUV444P_3PLANE)];
4088 		break;
4089 
4090 	default:
4091 		info.pNext = nullptr;
4092 		break;
4093 	}
4094 
4095 	if (table->vkCreateSampler(device, &info, nullptr, &sampler) != VK_SUCCESS)
4096 		return SamplerHandle(nullptr);
4097 #ifdef GRANITE_VULKAN_FOSSILIZE
4098 	register_sampler(sampler, Fossilize::Hash(stock_sampler) | 0x10000, info);
4099 #else
4100 	(void)stock_sampler;
4101 #endif
4102 	SamplerHandle handle(handle_pool.samplers.allocate(this, sampler, sampler_info));
4103 	handle->set_internal_sync_object();
4104 	return handle;
4105 }
4106 
4107 SamplerHandle Device::create_sampler(const SamplerCreateInfo &sampler_info)
4108 {
4109 	auto info = fill_vk_sampler_info(sampler_info);
4110 	VkSampler sampler;
4111 	if (table->vkCreateSampler(device, &info, nullptr, &sampler) != VK_SUCCESS)
4112 		return SamplerHandle(nullptr);
4113 	return SamplerHandle(handle_pool.samplers.allocate(this, sampler, sampler_info));
4114 }
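
// Usage sketch (editor's addition): a trilinear, repeating sampler. SamplerCreateInfo mirrors
// VkSamplerCreateInfo field-for-field, as fill_vk_sampler_info() above shows.
//
//     SamplerCreateInfo sampler_info = {};
//     sampler_info.mag_filter = VK_FILTER_LINEAR;
//     sampler_info.min_filter = VK_FILTER_LINEAR;
//     sampler_info.mipmap_mode = VK_SAMPLER_MIPMAP_MODE_LINEAR;
//     sampler_info.address_mode_u = VK_SAMPLER_ADDRESS_MODE_REPEAT;
//     sampler_info.address_mode_v = VK_SAMPLER_ADDRESS_MODE_REPEAT;
//     sampler_info.address_mode_w = VK_SAMPLER_ADDRESS_MODE_REPEAT;
//     sampler_info.max_lod = VK_LOD_CLAMP_NONE;
//     SamplerHandle sampler = device.create_sampler(sampler_info);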
4115 
4116 BindlessDescriptorPoolHandle Device::create_bindless_descriptor_pool(BindlessResourceType type,
4117                                                                      unsigned num_sets, unsigned num_descriptors)
4118 {
4119 	if (!ext.supports_descriptor_indexing)
4120 		return BindlessDescriptorPoolHandle{ nullptr };
4121 
4122 	DescriptorSetAllocator *allocator = nullptr;
4123 
4124 	switch (type)
4125 	{
4126 	case BindlessResourceType::ImageFP:
4127 		allocator = bindless_sampled_image_allocator_fp;
4128 		break;
4129 
4130 	case BindlessResourceType::ImageInt:
4131 		allocator = bindless_sampled_image_allocator_integer;
4132 		break;
4133 
4134 	default:
4135 		break;
4136 	}
4137 
4138 	VkDescriptorPool pool = VK_NULL_HANDLE;
4139 	if (allocator)
4140 		pool = allocator->allocate_bindless_pool(num_sets, num_descriptors);
4141 
4142 	if (!pool)
4143 	{
4144 		LOGE("Failed to allocate bindless pool.\n");
4145 		return BindlessDescriptorPoolHandle{ nullptr };
4146 	}
4147 
4148 	auto *handle = handle_pool.bindless_descriptor_pool.allocate(this, allocator, pool);
4149 	return BindlessDescriptorPoolHandle{ handle };
4150 }
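
// Usage sketch (editor's addition): reserving a bindless descriptor pool for floating-point
// sampled images, e.g. for a texture heap. Requires descriptor indexing support as checked above;
// how the returned pool handle is consumed is outside the scope of this file.
//
//     auto bindless_pool = device.create_bindless_descriptor_pool(
//             BindlessResourceType::ImageFP,
//             16,      // descriptor sets
//             4096);   // total sampled-image descriptors
//     if (!bindless_pool)
//         LOGE("Bindless descriptors not supported.\n");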
4151 
4152 void Device::fill_buffer_sharing_indices(VkBufferCreateInfo &info, uint32_t *sharing_indices)
4153 {
4154 	if (graphics_queue_family_index != compute_queue_family_index ||
4155 	    graphics_queue_family_index != transfer_queue_family_index)
4156 	{
4157 		// For buffers, always just use CONCURRENT access modes,
4158 		// so we don't have to deal with acquire/release barriers in async compute.
4159 		info.sharingMode = VK_SHARING_MODE_CONCURRENT;
4160 
4161 		sharing_indices[info.queueFamilyIndexCount++] = graphics_queue_family_index;
4162 
4163 		if (graphics_queue_family_index != compute_queue_family_index)
4164 			sharing_indices[info.queueFamilyIndexCount++] = compute_queue_family_index;
4165 
4166 		if (graphics_queue_family_index != transfer_queue_family_index &&
4167 		    compute_queue_family_index != transfer_queue_family_index)
4168 		{
4169 			sharing_indices[info.queueFamilyIndexCount++] = transfer_queue_family_index;
4170 		}
4171 
4172 		info.pQueueFamilyIndices = sharing_indices;
4173 	}
4174 }
4175 
4176 BufferHandle Device::create_imported_host_buffer(const BufferCreateInfo &create_info, VkExternalMemoryHandleTypeFlagBits type, void *host_buffer)
4177 {
4178 	if (create_info.domain != BufferDomain::Host &&
4179 	    create_info.domain != BufferDomain::CachedHost &&
4180 	    create_info.domain != BufferDomain::CachedCoherentHostPreferCached &&
4181 	    create_info.domain != BufferDomain::CachedCoherentHostPreferCoherent)
4182 	{
4183 		return BufferHandle{};
4184 	}
4185 
4186 	if (!ext.supports_external_memory_host)
4187 		return BufferHandle{};
4188 
4189 	if ((reinterpret_cast<uintptr_t>(host_buffer) & (ext.host_memory_properties.minImportedHostPointerAlignment - 1)) != 0)
4190 	{
4191 		LOGE("Host buffer is not aligned appropriately.\n");
4192 		return BufferHandle{};
4193 	}
4194 
4195 	VkMemoryHostPointerPropertiesEXT host_pointer_props = { VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT };
4196 	if (table->vkGetMemoryHostPointerPropertiesEXT(device, type, host_buffer, &host_pointer_props) != VK_SUCCESS)
4197 	{
4198 		LOGE("Host pointer is not importable.\n");
4199 		return BufferHandle{};
4200 	}
4201 
4202 	VkBufferCreateInfo info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
4203 	info.size = create_info.size;
4204 	info.usage = create_info.usage;
4205 	info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
4206 
4207 	uint32_t sharing_indices[3];
4208 	fill_buffer_sharing_indices(info, sharing_indices);
4209 
4210 	VkBuffer buffer;
4211 	VkMemoryRequirements reqs;
4212 	if (table->vkCreateBuffer(device, &info, nullptr, &buffer) != VK_SUCCESS)
4213 		return BufferHandle{};
4214 
4215 	table->vkGetBufferMemoryRequirements(device, buffer, &reqs);
4216 
4217 	reqs.memoryTypeBits &= host_pointer_props.memoryTypeBits;
4218 	if (reqs.memoryTypeBits == 0)
4219 	{
4220 		LOGE("No compatible host pointer types are available.\n");
4221 		table->vkDestroyBuffer(device, buffer, nullptr);
4222 		return BufferHandle{};
4223 	}
4224 
4225 	uint32_t memory_type = find_memory_type(create_info.domain, reqs.memoryTypeBits);
4226 	if (memory_type == UINT32_MAX)
4227 	{
4228 		LOGE("Failed to find memory type.\n");
4229 		table->vkDestroyBuffer(device, buffer, nullptr);
4230 		return BufferHandle{};
4231 	}
4232 
4233 	VkMemoryAllocateInfo alloc_info = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
4234 	alloc_info.allocationSize = (create_info.size + ext.host_memory_properties.minImportedHostPointerAlignment - 1) &
4235 	                            ~(ext.host_memory_properties.minImportedHostPointerAlignment - 1);
4236 	alloc_info.memoryTypeIndex = memory_type;
4237 
4238 	VkImportMemoryHostPointerInfoEXT import = { VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT };
4239 	import.handleType = type;
4240 	import.pHostPointer = host_buffer;
4241 	alloc_info.pNext = &import;
4242 
4243 	VkDeviceMemory memory;
4244 	if (table->vkAllocateMemory(device, &alloc_info, nullptr, &memory) != VK_SUCCESS)
4245 	{
4246 		table->vkDestroyBuffer(device, buffer, nullptr);
4247 		return BufferHandle{};
4248 	}
4249 
4250 	auto allocation = DeviceAllocation::make_imported_allocation(memory, info.size, memory_type);
4251 	if (table->vkMapMemory(device, memory, 0, VK_WHOLE_SIZE, 0, reinterpret_cast<void **>(&allocation.host_base)) != VK_SUCCESS)
4252 	{
4253 		allocation.free_immediate(managers.memory);
4254 		table->vkDestroyBuffer(device, buffer, nullptr);
4255 		return BufferHandle{};
4256 	}
4257 
4258 	if (table->vkBindBufferMemory(device, buffer, memory, 0) != VK_SUCCESS)
4259 	{
4260 		allocation.free_immediate(managers.memory);
4261 		table->vkDestroyBuffer(device, buffer, nullptr);
4262 		return BufferHandle{};
4263 	}
4264 
4265 	BufferHandle handle(handle_pool.buffers.allocate(this, buffer, allocation, create_info));
4266 	return handle;
4267 }
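
// Usage sketch (editor's addition): importing an existing, suitably aligned host allocation as a
// Vulkan buffer so the GPU can read it without an extra copy. Requires VK_EXT_external_memory_host;
// the pointer must be aligned to minImportedHostPointerAlignment, which the over-aligned allocation
// below assumes is satisfied (commonly 4 KiB). `align_up` is a hypothetical round-up helper.
//
//     size_t align = 64 * 1024;                     // assumption: >= minImportedHostPointerAlignment
//     size_t size = align_up(payload_size, align);  // aligned_alloc needs a size that is a multiple of align
//     void *host_memory = aligned_alloc(align, size);
//
//     BufferCreateInfo buf_info = {};
//     buf_info.domain = BufferDomain::CachedHost;
//     buf_info.size = size;
//     buf_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
//     BufferHandle imported = device.create_imported_host_buffer(
//             buf_info, VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, host_memory);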
4268 
4269 BufferHandle Device::create_buffer(const BufferCreateInfo &create_info, const void *initial)
4270 {
4271 	VkBuffer buffer;
4272 	VkMemoryRequirements reqs;
4273 	DeviceAllocation allocation;
4274 
4275 	bool zero_initialize = (create_info.misc & BUFFER_MISC_ZERO_INITIALIZE_BIT) != 0;
4276 	if (initial && zero_initialize)
4277 	{
4278 		LOGE("Cannot initialize buffer with data and clear.\n");
4279 		return BufferHandle{};
4280 	}
4281 
4282 	VkBufferCreateInfo info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
4283 	info.size = create_info.size;
4284 	info.usage = create_info.usage | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
4285 	info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
4286 
4287 	uint32_t sharing_indices[3];
4288 	fill_buffer_sharing_indices(info, sharing_indices);
4289 
4290 	if (table->vkCreateBuffer(device, &info, nullptr, &buffer) != VK_SUCCESS)
4291 		return BufferHandle(nullptr);
4292 
4293 	table->vkGetBufferMemoryRequirements(device, buffer, &reqs);
4294 
4295 	uint32_t memory_type = find_memory_type(create_info.domain, reqs.memoryTypeBits);
4296 	if (memory_type == UINT32_MAX)
4297 	{
4298 		LOGE("Failed to find memory type.\n");
4299 		table->vkDestroyBuffer(device, buffer, nullptr);
4300 		return BufferHandle(nullptr);
4301 	}
4302 
4303 	if (!managers.memory.allocate(reqs.size, reqs.alignment, memory_type, ALLOCATION_TILING_LINEAR, &allocation))
4304 	{
4305 		// This memory type is rather scarce, so fall back to the Host type if we've exhausted it.
4306 		if (create_info.domain == BufferDomain::LinkedDeviceHost)
4307 		{
4308 			LOGW("Exhausted LinkedDeviceHost memory, falling back to host.\n");
4309 			memory_type = find_memory_type(BufferDomain::Host, reqs.memoryTypeBits);
4310 			if (memory_type == UINT32_MAX)
4311 			{
4312 				LOGE("Failed to find memory type.\n");
4313 				table->vkDestroyBuffer(device, buffer, nullptr);
4314 				return BufferHandle(nullptr);
4315 			}
4316 
4317 			if (!managers.memory.allocate(reqs.size, reqs.alignment, memory_type, ALLOCATION_TILING_LINEAR, &allocation))
4318 			{
4319 				table->vkDestroyBuffer(device, buffer, nullptr);
4320 				return BufferHandle(nullptr);
4321 			}
4322 		}
4323 		else
4324 		{
4325 			table->vkDestroyBuffer(device, buffer, nullptr);
4326 			return BufferHandle(nullptr);
4327 		}
4328 	}
4329 
4330 	if (table->vkBindBufferMemory(device, buffer, allocation.get_memory(), allocation.get_offset()) != VK_SUCCESS)
4331 	{
4332 		allocation.free_immediate(managers.memory);
4333 		table->vkDestroyBuffer(device, buffer, nullptr);
4334 		return BufferHandle(nullptr);
4335 	}
4336 
4337 	auto tmpinfo = create_info;
4338 	tmpinfo.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
4339 	BufferHandle handle(handle_pool.buffers.allocate(this, buffer, allocation, tmpinfo));
4340 
4341 	if (create_info.domain == BufferDomain::Device && (initial || zero_initialize) && !memory_type_is_host_visible(memory_type))
4342 	{
4343 		CommandBufferHandle cmd;
4344 		if (initial)
4345 		{
4346 			auto staging_info = create_info;
4347 			staging_info.domain = BufferDomain::Host;
4348 			auto staging_buffer = create_buffer(staging_info, initial);
4349 			set_name(*staging_buffer, "buffer-upload-staging-buffer");
4350 
4351 			cmd = request_command_buffer(CommandBuffer::Type::AsyncTransfer);
4352 			cmd->begin_region("copy-buffer-staging");
4353 			cmd->copy_buffer(*handle, *staging_buffer);
4354 			cmd->end_region();
4355 		}
4356 		else
4357 		{
4358 			cmd = request_command_buffer(CommandBuffer::Type::AsyncCompute);
4359 			cmd->begin_region("fill-buffer-staging");
4360 			cmd->fill_buffer(*handle, 0);
4361 			cmd->end_region();
4362 		}
4363 
4364 		LOCK();
4365 		submit_staging(cmd, info.usage, true);
4366 	}
4367 	else if (initial || zero_initialize)
4368 	{
4369 		void *ptr = managers.memory.map_memory(allocation, MEMORY_ACCESS_WRITE_BIT, 0, allocation.get_size());
4370 		if (!ptr)
4371 			return BufferHandle(nullptr);
4372 
4373 		if (initial)
4374 			memcpy(ptr, initial, create_info.size);
4375 		else
4376 			memset(ptr, 0, create_info.size);
4377 		managers.memory.unmap_memory(allocation, MEMORY_ACCESS_WRITE_BIT, 0, allocation.get_size());
4378 	}
4379 	return handle;
4380 }
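
// Usage sketch (editor's addition): a device-local vertex buffer initialized from CPU data.
// Because the domain is Device and that memory type is usually not host-visible, the path above
// stages the data through a host buffer and an async transfer-queue copy automatically.
//
//     static const float vertices[] = { /* ... */ };
//     BufferCreateInfo buf_info = {};
//     buf_info.domain = BufferDomain::Device;
//     buf_info.size = sizeof(vertices);
//     buf_info.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
//     BufferHandle vbo = device.create_buffer(buf_info, vertices);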
4381 
4382 bool Device::memory_type_is_device_optimal(uint32_t type) const
4383 {
4384 	return (mem_props.memoryTypes[type].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0;
4385 }
4386 
4387 bool Device::memory_type_is_host_visible(uint32_t type) const
4388 {
4389 	return (mem_props.memoryTypes[type].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0;
4390 }
4391 
4392 void Device::get_format_properties(VkFormat format, VkFormatProperties *properties)
4393 {
4394 	vkGetPhysicalDeviceFormatProperties(gpu, format, properties);
4395 }
4396 
get_image_format_properties(VkFormat format,VkImageType type,VkImageTiling tiling,VkImageUsageFlags usage,VkImageCreateFlags flags,VkImageFormatProperties * properties)4397 bool Device::get_image_format_properties(VkFormat format, VkImageType type, VkImageTiling tiling,
4398                                          VkImageUsageFlags usage, VkImageCreateFlags flags,
4399                                          VkImageFormatProperties *properties)
4400 {
4401 	auto res = vkGetPhysicalDeviceImageFormatProperties(gpu, format, type, tiling, usage, flags,
4402 	                                                    properties);
4403 	return res == VK_SUCCESS;
4404 }
4405 
image_format_is_supported(VkFormat format,VkFormatFeatureFlags required,VkImageTiling tiling) const4406 bool Device::image_format_is_supported(VkFormat format, VkFormatFeatureFlags required, VkImageTiling tiling) const
4407 {
4408 	VkFormatProperties props;
4409 	vkGetPhysicalDeviceFormatProperties(gpu, format, &props);
4410 	auto flags = tiling == VK_IMAGE_TILING_OPTIMAL ? props.optimalTilingFeatures : props.linearTilingFeatures;
4411 	return (flags & required) == required;
4412 }
4413 
get_default_depth_stencil_format() const4414 VkFormat Device::get_default_depth_stencil_format() const
4415 {
4416 	if (image_format_is_supported(VK_FORMAT_D24_UNORM_S8_UINT, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4417 		return VK_FORMAT_D24_UNORM_S8_UINT;
4418 	if (image_format_is_supported(VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4419 		return VK_FORMAT_D32_SFLOAT_S8_UINT;
4420 
4421 	return VK_FORMAT_UNDEFINED;
4422 }
4423 
get_default_depth_format() const4424 VkFormat Device::get_default_depth_format() const
4425 {
4426 	if (image_format_is_supported(VK_FORMAT_D32_SFLOAT, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4427 		return VK_FORMAT_D32_SFLOAT;
4428 	if (image_format_is_supported(VK_FORMAT_X8_D24_UNORM_PACK32, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4429 		return VK_FORMAT_X8_D24_UNORM_PACK32;
4430 	if (image_format_is_supported(VK_FORMAT_D16_UNORM, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT, VK_IMAGE_TILING_OPTIMAL))
4431 		return VK_FORMAT_D16_UNORM;
4432 
4433 	return VK_FORMAT_UNDEFINED;
4434 }
4435 
allocate_cookie()4436 uint64_t Device::allocate_cookie()
4437 {
4438 	// Reserve lower bits for "special purposes".
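	// Cookies are handed out in increments of 16, so the low 4 bits of every returned cookie
	// are always zero (successive calls yield 16, 32, 48, ...). How those reserved bits are
	// used is defined elsewhere in the codebase; this comment only documents the arithmetic here.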
#ifdef GRANITE_VULKAN_MT
	return cookie.fetch_add(16, memory_order_relaxed) + 16;
#else
	cookie += 16;
	return cookie;
#endif
}

const RenderPass &Device::request_render_pass(const RenderPassInfo &info, bool compatible)
{
	Hasher h;
	VkFormat formats[VULKAN_NUM_ATTACHMENTS];
	VkFormat depth_stencil;
	uint32_t lazy = 0;
	uint32_t optimal = 0;

	for (unsigned i = 0; i < info.num_color_attachments; i++)
	{
		VK_ASSERT(info.color_attachments[i]);
		formats[i] = info.color_attachments[i]->get_format();
		if (info.color_attachments[i]->get_image().get_create_info().domain == ImageDomain::Transient)
			lazy |= 1u << i;
		if (info.color_attachments[i]->get_image().get_layout_type() == Layout::Optimal)
			optimal |= 1u << i;

		// This can change external subpass dependencies, so it must always be hashed.
		h.u32(info.color_attachments[i]->get_image().get_swapchain_layout());
	}

	if (info.depth_stencil)
	{
		if (info.depth_stencil->get_image().get_create_info().domain == ImageDomain::Transient)
			lazy |= 1u << info.num_color_attachments;
		if (info.depth_stencil->get_image().get_layout_type() == Layout::Optimal)
			optimal |= 1u << info.num_color_attachments;
	}

	// For multiview, the base layer is encoded into the view mask.
	if (info.num_layers > 1)
	{
		h.u32(info.base_layer);
		h.u32(info.num_layers);
	}
	else
	{
		h.u32(0);
		h.u32(info.num_layers);
	}

	h.u32(info.num_subpasses);
	for (unsigned i = 0; i < info.num_subpasses; i++)
	{
		h.u32(info.subpasses[i].num_color_attachments);
		h.u32(info.subpasses[i].num_input_attachments);
		h.u32(info.subpasses[i].num_resolve_attachments);
		h.u32(static_cast<uint32_t>(info.subpasses[i].depth_stencil_mode));
		for (unsigned j = 0; j < info.subpasses[i].num_color_attachments; j++)
			h.u32(info.subpasses[i].color_attachments[j]);
		for (unsigned j = 0; j < info.subpasses[i].num_input_attachments; j++)
			h.u32(info.subpasses[i].input_attachments[j]);
		for (unsigned j = 0; j < info.subpasses[i].num_resolve_attachments; j++)
			h.u32(info.subpasses[i].resolve_attachments[j]);
	}

	depth_stencil = info.depth_stencil ? info.depth_stencil->get_format() : VK_FORMAT_UNDEFINED;
	h.data(formats, info.num_color_attachments * sizeof(VkFormat));
	h.u32(info.num_color_attachments);
	h.u32(depth_stencil);

	// Compatible render passes do not care about load/store ops or image layouts.
	if (!compatible)
	{
		h.u32(info.op_flags);
		h.u32(info.clear_attachments);
		h.u32(info.load_attachments);
		h.u32(info.store_attachments);
		h.u32(optimal);
	}

	// The lazy flag can change external subpass dependencies, which breaks render pass compatibility,
	// so it is always hashed.
	h.u32(lazy);

	auto hash = h.get();

	auto *ret = render_passes.find(hash);
	if (!ret)
		ret = render_passes.emplace_yield(hash, hash, this, info);
	return *ret;
}

const Framebuffer &Device::request_framebuffer(const RenderPassInfo &info)
{
	return framebuffer_allocator.request_framebuffer(info);
}

ImageView &Device::get_transient_attachment(unsigned width, unsigned height, VkFormat format,
                                            unsigned index, unsigned samples, unsigned layers)
{
	return transient_allocator.request_attachment(width, height, format, index, samples, layers);
}

ImageView &Device::get_swapchain_view()
{
	VK_ASSERT(wsi.index < wsi.swapchain.size());
	return wsi.swapchain[wsi.index]->get_view();
}

ImageView &Device::get_swapchain_view(unsigned index)
{
	VK_ASSERT(index < wsi.swapchain.size());
	return wsi.swapchain[index]->get_view();
}

unsigned Device::get_num_frame_contexts() const
{
	return unsigned(per_frame.size());
}

unsigned Device::get_num_swapchain_images() const
{
	return unsigned(wsi.swapchain.size());
}

unsigned Device::get_swapchain_index() const
{
	return wsi.index;
}

unsigned Device::get_current_frame_context() const
{
	return frame_context_index;
}

RenderPassInfo Device::get_swapchain_render_pass(SwapchainRenderPass style)
{
	RenderPassInfo info;
	info.num_color_attachments = 1;
	info.color_attachments[0] = &get_swapchain_view();
	info.clear_attachments = ~0u;
	info.store_attachments = 1u << 0;

	switch (style)
	{
	case SwapchainRenderPass::Depth:
	{
		info.op_flags |= RENDER_PASS_OP_CLEAR_DEPTH_STENCIL_BIT;
		info.depth_stencil =
		    &get_transient_attachment(wsi.swapchain[wsi.index]->get_create_info().width,
		                              wsi.swapchain[wsi.index]->get_create_info().height, get_default_depth_format());
		break;
	}

	case SwapchainRenderPass::DepthStencil:
	{
		info.op_flags |= RENDER_PASS_OP_CLEAR_DEPTH_STENCIL_BIT;
		info.depth_stencil =
		    &get_transient_attachment(wsi.swapchain[wsi.index]->get_create_info().width,
		                              wsi.swapchain[wsi.index]->get_create_info().height, get_default_depth_stencil_format());
		break;
	}

	default:
		break;
	}
	return info;
}
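
// A sketch of typical client-side usage of get_swapchain_render_pass(); this is not a call made
// in this file, and the CommandBuffer calls are assumed from the rest of the Device API:
//
//   auto rp = device.get_swapchain_render_pass(SwapchainRenderPass::Depth);
//   auto cmd = device.request_command_buffer();
//   cmd->begin_render_pass(rp);
//   // ... record draws ...
//   cmd->end_render_pass();
//   device.submit(cmd);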

void Device::set_queue_lock(std::function<void()> lock_callback, std::function<void()> unlock_callback)
{
	queue_lock_callback = move(lock_callback);
	queue_unlock_callback = move(unlock_callback);
}
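
// A sketch of how the queue lock callbacks might be used (the external_mutex below is purely
// hypothetical): interlock queue submission with another component that submits to the same
// VkQueue from a different thread, e.g.
//
//   std::mutex external_mutex;
//   device.set_queue_lock([&]() { external_mutex.lock(); },
//                         [&]() { external_mutex.unlock(); });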

void Device::set_name(const Buffer &buffer, const char *name)
{
	if (ext.supports_debug_utils)
	{
		VkDebugUtilsObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT };
		info.objectType = VK_OBJECT_TYPE_BUFFER;
		info.objectHandle = (uint64_t)buffer.get_buffer();
		info.pObjectName = name;
		if (vkSetDebugUtilsObjectNameEXT)
			vkSetDebugUtilsObjectNameEXT(device, &info);
	}
	else if (ext.supports_debug_marker)
	{
		VkDebugMarkerObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_NAME_INFO_EXT };
		info.objectType = VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT;
		info.object = (uint64_t)buffer.get_buffer();
		info.pObjectName = name;
		table->vkDebugMarkerSetObjectNameEXT(device, &info);
	}
}

void Device::set_name(const Image &image, const char *name)
{
	if (ext.supports_debug_utils)
	{
		VkDebugUtilsObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT };
		info.objectType = VK_OBJECT_TYPE_IMAGE;
		info.objectHandle = (uint64_t)image.get_image();
		info.pObjectName = name;
		if (vkSetDebugUtilsObjectNameEXT)
			vkSetDebugUtilsObjectNameEXT(device, &info);
	}
	else if (ext.supports_debug_marker)
	{
		VkDebugMarkerObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_NAME_INFO_EXT };
		info.objectType = VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT;
		info.object = (uint64_t)image.get_image();
		info.pObjectName = name;
		table->vkDebugMarkerSetObjectNameEXT(device, &info);
	}
}

void Device::set_name(const CommandBuffer &cmd, const char *name)
{
	if (ext.supports_debug_utils)
	{
		VkDebugUtilsObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT };
		info.objectType = VK_OBJECT_TYPE_COMMAND_BUFFER;
		info.objectHandle = (uint64_t)cmd.get_command_buffer();
		info.pObjectName = name;
		if (vkSetDebugUtilsObjectNameEXT)
			vkSetDebugUtilsObjectNameEXT(device, &info);
	}
	else if (ext.supports_debug_marker)
	{
		VkDebugMarkerObjectNameInfoEXT info = { VK_STRUCTURE_TYPE_DEBUG_MARKER_OBJECT_NAME_INFO_EXT };
		info.objectType = VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT;
		info.object = (uint64_t)cmd.get_command_buffer();
		info.pObjectName = name;
		table->vkDebugMarkerSetObjectNameEXT(device, &info);
	}
}

void Device::report_checkpoints()
{
	if (!ext.supports_nv_device_diagnostic_checkpoints)
		return;

	uint32_t graphics_count;
	table->vkGetQueueCheckpointDataNV(graphics_queue, &graphics_count, nullptr);
	vector<VkCheckpointDataNV> graphics_data(graphics_count);
	for (auto &g : graphics_data)
		g.sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV;
	table->vkGetQueueCheckpointDataNV(graphics_queue, &graphics_count, graphics_data.data());

	uint32_t compute_count;
	table->vkGetQueueCheckpointDataNV(compute_queue, &compute_count, nullptr);
	vector<VkCheckpointDataNV> compute_data(compute_count);
	for (auto &g : compute_data)
		g.sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV;
	table->vkGetQueueCheckpointDataNV(compute_queue, &compute_count, compute_data.data());

	uint32_t transfer_count;
	table->vkGetQueueCheckpointDataNV(transfer_queue, &transfer_count, nullptr);
	vector<VkCheckpointDataNV> transfer_data(transfer_count);
	for (auto &g : transfer_data)
		g.sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV;
	table->vkGetQueueCheckpointDataNV(transfer_queue, &transfer_count, transfer_data.data());

	if (!graphics_data.empty())
	{
		LOGI("Checkpoints for graphics queue:\n");
		for (auto &g : graphics_data)
			LOGI("    Stage %u:\n%s\n", g.stage, static_cast<const char *>(g.pCheckpointMarker));
	}

	if (!compute_data.empty())
	{
		LOGI("Checkpoints for compute queue:\n");
		for (auto &g : compute_data)
			LOGI("    Stage %u:\n%s\n", g.stage, static_cast<const char *>(g.pCheckpointMarker));
	}

	if (!transfer_data.empty())
	{
		LOGI("Checkpoints for transfer queue:\n");
		for (auto &g : transfer_data)
			LOGI("    Stage %u:\n%s\n", g.stage, static_cast<const char *>(g.pCheckpointMarker));
	}
}

void Device::query_available_performance_counters(CommandBuffer::Type type, uint32_t *count,
                                                  const VkPerformanceCounterKHR **counters,
                                                  const VkPerformanceCounterDescriptionKHR **desc)
{
	auto &query_pool = get_performance_query_pool(type);
	*count = query_pool.get_num_counters();
	*counters = query_pool.get_available_counters();
	*desc = query_pool.get_available_counter_descs();
}

bool Device::init_performance_counters(const std::vector<std::string> &names)
{
	if (!graphics.performance_query_pool.init_counters(names))
		return false;

	if (compute_queue_family_index != graphics_queue_family_index &&
	    !compute.performance_query_pool.init_counters(names))
	{
		return false;
	}

	if (transfer_queue_family_index != compute_queue_family_index &&
	    transfer_queue_family_index != graphics_queue_family_index &&
	    !transfer.performance_query_pool.init_counters(names))
	{
		return false;
	}

	return true;
}

void Device::release_profiling()
{
	table->vkReleaseProfilingLockKHR(device);
}

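// Per VK_KHR_performance_query, the profiling lock acquired below generally has to be held while
// command buffers that record performance queries are in the recording state; release_profiling()
// drops it again via vkReleaseProfilingLockKHR().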
bool Device::acquire_profiling()
{
	if (!ext.performance_query_features.performanceCounterQueryPools)
		return false;

	VkAcquireProfilingLockInfoKHR info = { VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR };
	info.timeout = UINT64_MAX;
	if (table->vkAcquireProfilingLockKHR(device, &info) != VK_SUCCESS)
	{
		LOGE("Failed to acquire profiling lock.\n");
		return false;
	}

	return true;
}

void Device::add_debug_channel_buffer(DebugChannelInterface *iface, std::string tag, Vulkan::BufferHandle buffer)
{
	buffer->set_internal_sync_object();
	LOCK();
	frame().debug_channels.push_back({ iface, std::move(tag), std::move(buffer) });
}

void Device::parse_debug_channel(const PerFrame::DebugChannel &channel)
{
	if (!channel.iface)
		return;

	auto *words = static_cast<const DebugChannelInterface::Word *>(map_host_buffer(*channel.buffer, MEMORY_ACCESS_READ_BIT));

	size_t size = channel.buffer->get_create_info().size;
	if (size <= sizeof(uint32_t))
	{
		LOGE("Debug channel buffer is too small.\n");
		unmap_host_buffer(*channel.buffer, MEMORY_ACCESS_READ_BIT);
		return;
	}

	// Format for the debug channel.
	// Word 0: Atomic counter used by the shader.
	// Word 1-*: [total message length, code, x, y, z, args]
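	// Example layout, as implied by the parsing loop below: a message carrying two payload words
	// occupies 7 words in the stream: [7, code, x, y, z, arg0, arg1]. The header is always
	// 5 words (length, code, x, y, z), so words[0] - 5 is the number of payload arguments.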

	size -= sizeof(uint32_t);
	size /= sizeof(uint32_t);

	if (words[0].u32 > size)
	{
		LOGW("Debug channel overflowed and messages were dropped. Consider increasing the debug channel size to at least %u bytes.\n",
		     unsigned((words[0].u32 + 1) * sizeof(uint32_t)));
	}

	words++;

	while (size != 0 && words[0].u32 >= 5 && words[0].u32 <= size)
	{
		channel.iface->message(channel.tag, words[1].u32,
		                       words[2].u32, words[3].u32, words[4].u32,
		                       words[0].u32 - 5, &words[5]);
		size -= words[0].u32;
		words += words[0].u32;
	}

	unmap_host_buffer(*channel.buffer, MEMORY_ACCESS_READ_BIT);
}

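// Compute a signed delta between two timestamps that only carry `valid_bits` significant bits:
// shifting both values so the valid bits occupy the top of the 64-bit word, subtracting, and
// arithmetic-shifting back down sign-extends the result, so the delta is correct even across a
// counter wraparound. Worked example with valid_bits = 36: start = 0xFFFFFFFFF, end = 0x10
// yields +17 ticks rather than a huge negative number.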
static int64_t convert_to_signed_delta(uint64_t start_ticks, uint64_t end_ticks, unsigned valid_bits)
{
	unsigned shamt = 64 - valid_bits;
	start_ticks <<= shamt;
	end_ticks <<= shamt;
	auto ticks_delta = int64_t(end_ticks - start_ticks);
	ticks_delta >>= shamt;
	return ticks_delta;
}

double Device::convert_timestamp_delta(uint64_t start_ticks, uint64_t end_ticks) const
{
	int64_t ticks_delta = convert_to_signed_delta(start_ticks, end_ticks, timestamp_valid_bits);
	return double(ticks_delta) * gpu_props.limits.timestampPeriod * 1e-9;
}

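// Fold a possibly wrapped raw timestamp into a monotonically increasing 64-bit base by
// accumulating signed deltas against the previous base. This is what lets the JSON trace below
// use absolute timestamps even when the hardware exposes fewer than 64 valid timestamp bits.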
uint64_t Device::update_wrapped_base_timestamp(uint64_t end_ticks)
{
	json_base_timestamp_value += convert_to_signed_delta(json_base_timestamp_value, end_ticks, timestamp_valid_bits);
	return json_base_timestamp_value;
}

bool Device::init_timestamp_trace(const char *path)
{
	// Use the Chrome tracing format. It's trivial to emit and we get a frontend for free :)
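	// The file is a JSON array of "B"/"E" events; Chrome's trace viewer tolerates a missing
	// closing ']', which is why the array is never terminated. A single range written by
	// write_json_timestamp_range() looks roughly like this (names and numbers are made-up
	// illustrations, not values produced by this file):
	//   { "name": "gpu-pass", "ph": "B", "tid": "gpu", "pid": "0", "ts": 1200 },
	//   { "name": "gpu-pass", "ph": "E", "tid": "gpu", "pid": "0", "ts": 1450 },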
	json_trace_file.reset();
	json_trace_file.reset(fopen(path, "w"));
	if (json_trace_file)
		fprintf(json_trace_file.get(), "[");
	return bool(json_trace_file);
}

int64_t Device::convert_timestamp_to_absolute_usec(uint64_t ts)
{
	// Ensure that we deal with timestamp wraparound correctly.
	// On some hardware, we have < 64 valid bits and the timestamp counters will wrap around at some interval.
	// As long as timestamps come in at a reasonably steady pace, we can deal with wraparound cleanly.
	ts = update_wrapped_base_timestamp(ts);
	if (json_timestamp_origin == 0)
		json_timestamp_origin = ts;

	auto delta_ts = int64_t(ts - json_timestamp_origin);
	auto us = int64_t(double(delta_ts) * gpu_props.limits.timestampPeriod * 1e-3);
	return us;
}

void Device::write_json_timestamp_range(unsigned frame_index, const char *tid,
                                        const char *name, const char *extra,
                                        uint64_t start_ts, uint64_t end_ts,
                                        int64_t &min_us, int64_t &max_us)
{
	if (!json_trace_file)
		return;

	int64_t absolute_start = convert_timestamp_to_absolute_usec(start_ts);
	int64_t absolute_end = convert_timestamp_to_absolute_usec(end_ts);

	VK_ASSERT(absolute_start <= absolute_end);

	min_us = std::min(absolute_start, min_us);
	max_us = std::max(absolute_end, max_us);

	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s%s%s\", \"ph\": \"B\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, *extra != '\0' ? " " : "", extra, tid, frame_index, static_cast<long long>(absolute_start));
	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s%s%s\", \"ph\": \"E\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, *extra != '\0' ? " " : "", extra, tid, frame_index, static_cast<long long>(absolute_end));
}

void Device::write_json_timestamp_range_us(unsigned frame_index, const char *tid, const char *name, int64_t start_us, int64_t end_us)
{
	if (!json_trace_file)
		return;
	if (start_us > end_us)
		return;

	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s\", \"ph\": \"B\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, tid, frame_index, static_cast<long long>(start_us));
	fprintf(json_trace_file.get(), "\t{ \"name\": \"%s\", \"ph\": \"E\", \"tid\": \"%s\", \"pid\": \"%u\", \"ts\": %lld },\n",
	        name, tid, frame_index, static_cast<long long>(end_us));
}

void Device::JSONTraceFileDeleter::operator()(FILE *file)
{
	// Intentionally leave the JSON array unterminated; every element ends with "," and the
	// trace viewer accepts the missing closing ']'.
	if (file)
		fclose(file);
}

#ifdef GRANITE_VULKAN_FILESYSTEM
TextureManager &Device::get_texture_manager()
{
	return texture_manager;
}

ShaderManager &Device::get_shader_manager()
{
	return shader_manager;
}
#endif

#ifdef GRANITE_VULKAN_FILESYSTEM
void Device::init_shader_manager_cache()
{
	//if (!shader_manager.load_shader_cache("assets://shader_cache.json"))
	//	shader_manager.load_shader_cache("cache://shader_cache.json");
	shader_manager.load_shader_cache("assets://shader_cache.json");
}

void Device::flush_shader_manager_cache()
{
	shader_manager.save_shader_cache("cache://shader_cache.json");
}
#endif

const VolkDeviceTable &Device::get_device_table() const
{
	return *table;
}

#ifndef GRANITE_RENDERDOC_CAPTURE
bool Device::init_renderdoc_capture()
{
	LOGE("RenderDoc API capture is not enabled in this build.\n");
	return false;
}

void Device::begin_renderdoc_capture()
{
}

void Device::end_renderdoc_capture()
{
}
#endif

}