1 // Copyright (c) 2012- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include <algorithm>
19 
20 #include "Common/Data/Convert/SmallDataConvert.h"
21 #include "Common/Profiler/Profiler.h"
22 #include "Common/GPU/Vulkan/VulkanRenderManager.h"
23 
24 #include "Common/Log.h"
25 #include "Common/MemoryUtil.h"
26 #include "Common/TimeUtil.h"
27 #include "Core/MemMap.h"
28 #include "Core/System.h"
29 #include "Core/Reporting.h"
30 #include "Core/Config.h"
31 #include "Core/CoreTiming.h"
32 
33 #include "GPU/Math3D.h"
34 #include "GPU/GPUState.h"
35 #include "GPU/ge_constants.h"
36 
37 #include "Common/GPU/Vulkan/VulkanContext.h"
38 #include "Common/GPU/Vulkan/VulkanMemory.h"
39 
40 #include "GPU/Common/SplineCommon.h"
41 #include "GPU/Common/TransformCommon.h"
42 #include "GPU/Common/VertexDecoderCommon.h"
43 #include "GPU/Common/SoftwareTransformCommon.h"
44 #include "GPU/Common/DrawEngineCommon.h"
45 #include "GPU/Debugger/Debugger.h"
46 #include "GPU/Vulkan/DrawEngineVulkan.h"
47 #include "GPU/Vulkan/TextureCacheVulkan.h"
48 #include "GPU/Vulkan/ShaderManagerVulkan.h"
49 #include "GPU/Vulkan/PipelineManagerVulkan.h"
50 #include "GPU/Vulkan/FramebufferManagerVulkan.h"
51 #include "GPU/Vulkan/GPU_Vulkan.h"
52 
53 using namespace PPSSPP_VK;
54 
55 enum {
56 	VERTEX_CACHE_SIZE = 8192 * 1024
57 };
58 
59 #define VERTEXCACHE_DECIMATION_INTERVAL 17
60 #define DESCRIPTORSET_DECIMATION_INTERVAL 1  // Temporarily cut to 1. Handle reuse breaks this when textures get deleted.
61 
62 enum { VAI_KILL_AGE = 120, VAI_UNRELIABLE_KILL_AGE = 240, VAI_UNRELIABLE_KILL_MAX = 4 };
63 
64 enum {
65 	DRAW_BINDING_TEXTURE = 0,
66 	DRAW_BINDING_2ND_TEXTURE = 1,
67 	DRAW_BINDING_DEPAL_TEXTURE = 2,
68 	DRAW_BINDING_DYNUBO_BASE = 3,
69 	DRAW_BINDING_DYNUBO_LIGHT = 4,
70 	DRAW_BINDING_DYNUBO_BONE = 5,
71 	DRAW_BINDING_TESS_STORAGE_BUF = 6,
72 	DRAW_BINDING_TESS_STORAGE_BUF_WU = 7,
73 	DRAW_BINDING_TESS_STORAGE_BUF_WV = 8,
74 };
75 
76 enum {
77 	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex)
78 };
79 
DrawEngineVulkan(VulkanContext * vulkan,Draw::DrawContext * draw)80 DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *draw)
81 	:	vulkan_(vulkan),
82 		draw_(draw),
83 		vai_(1024) {
84 	decOptions_.expandAllWeightsToFloat = false;
85 	decOptions_.expand8BitNormalsToFloat = false;
86 
87 	// Allocate nicely aligned memory. Maybe graphics drivers will appreciate it.
88 	// All this is a LOT of memory, need to see if we can cut down somehow.
89 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
90 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
91 
92 	indexGen.Setup(decIndex);
93 
94 	InitDeviceObjects();
95 }
96 
InitDeviceObjects()97 void DrawEngineVulkan::InitDeviceObjects() {
98 	// All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated.
99 	VkDescriptorSetLayoutBinding bindings[9]{};
100 	bindings[0].descriptorCount = 1;
101 	bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
102 	bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
103 	bindings[0].binding = DRAW_BINDING_TEXTURE;
104 	bindings[1].descriptorCount = 1;
105 	bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
106 	bindings[1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
107 	bindings[1].binding = DRAW_BINDING_2ND_TEXTURE;
108 	bindings[2].descriptorCount = 1;
109 	bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;  // sampler is ignored though.
110 	bindings[2].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
111 	bindings[2].binding = DRAW_BINDING_DEPAL_TEXTURE;
112 	bindings[3].descriptorCount = 1;
113 	bindings[3].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
114 	bindings[3].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
115 	bindings[3].binding = DRAW_BINDING_DYNUBO_BASE;
116 	bindings[4].descriptorCount = 1;
117 	bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
118 	bindings[4].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
119 	bindings[4].binding = DRAW_BINDING_DYNUBO_LIGHT;
120 	bindings[5].descriptorCount = 1;
121 	bindings[5].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
122 	bindings[5].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
123 	bindings[5].binding = DRAW_BINDING_DYNUBO_BONE;
124 	// Used only for hardware tessellation.
125 	bindings[6].descriptorCount = 1;
126 	bindings[6].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
127 	bindings[6].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
128 	bindings[6].binding = DRAW_BINDING_TESS_STORAGE_BUF;
129 	bindings[7].descriptorCount = 1;
130 	bindings[7].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
131 	bindings[7].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
132 	bindings[7].binding = DRAW_BINDING_TESS_STORAGE_BUF_WU;
133 	bindings[8].descriptorCount = 1;
134 	bindings[8].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
135 	bindings[8].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
136 	bindings[8].binding = DRAW_BINDING_TESS_STORAGE_BUF_WV;
137 
138 	VkDevice device = vulkan_->GetDevice();
139 
140 	VkDescriptorSetLayoutCreateInfo dsl{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
141 	dsl.bindingCount = ARRAY_SIZE(bindings);
142 	dsl.pBindings = bindings;
143 	VkResult res = vkCreateDescriptorSetLayout(device, &dsl, nullptr, &descriptorSetLayout_);
144 	_dbg_assert_(VK_SUCCESS == res);
145 
146 	// We are going to use one-shot descriptors in the initial implementation. Might look into caching them
147 	// if creating and updating them turns out to be expensive.
148 	for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
149 		// We now create descriptor pools on demand, so removed from here.
150 		// Note that pushUBO is also used for tessellation data (search for SetPushBuffer), and to upload
151 		// the null texture. This should be cleaned up...
152 		frame_[i].pushUBO = new VulkanPushBuffer(vulkan_, 8 * 1024 * 1024, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
153 		frame_[i].pushVertex = new VulkanPushBuffer(vulkan_, 2 * 1024 * 1024, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
154 		frame_[i].pushIndex = new VulkanPushBuffer(vulkan_, 1 * 1024 * 1024, VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
155 
156 		frame_[i].pushLocal = new VulkanPushBuffer(vulkan_, 1 * 1024 * 1024, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
157 	}
158 
159 	VkPipelineLayoutCreateInfo pl{ VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
160 	pl.pPushConstantRanges = nullptr;
161 	pl.pushConstantRangeCount = 0;
162 	pl.setLayoutCount = 1;
163 	pl.pSetLayouts = &descriptorSetLayout_;
164 	pl.flags = 0;
165 	res = vkCreatePipelineLayout(device, &pl, nullptr, &pipelineLayout_);
166 	_dbg_assert_(VK_SUCCESS == res);
167 
168 	VkSamplerCreateInfo samp{ VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO };
169 	samp.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
170 	samp.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
171 	samp.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
172 	samp.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
173 	samp.flags = 0;
174 	samp.magFilter = VK_FILTER_NEAREST;
175 	samp.minFilter = VK_FILTER_NEAREST;
176 	res = vkCreateSampler(device, &samp, nullptr, &samplerSecondary_);
177 	_dbg_assert_(VK_SUCCESS == res);
178 	res = vkCreateSampler(device, &samp, nullptr, &nullSampler_);
179 	_dbg_assert_(VK_SUCCESS == res);
180 
181 	vertexCache_ = new VulkanPushBuffer(vulkan_, VERTEX_CACHE_SIZE, VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
182 
183 	tessDataTransferVulkan = new TessellationDataTransferVulkan(vulkan_);
184 	tessDataTransfer = tessDataTransferVulkan;
185 }
186 
~DrawEngineVulkan()187 DrawEngineVulkan::~DrawEngineVulkan() {
188 	FreeMemoryPages(decoded, DECODED_VERTEX_BUFFER_SIZE);
189 	FreeMemoryPages(decIndex, DECODED_INDEX_BUFFER_SIZE);
190 
191 	DestroyDeviceObjects();
192 }
193 
Destroy(VulkanContext * vulkan)194 void DrawEngineVulkan::FrameData::Destroy(VulkanContext *vulkan) {
195 	if (descPool != VK_NULL_HANDLE) {
196 		vulkan->Delete().QueueDeleteDescriptorPool(descPool);
197 	}
198 
199 	if (pushUBO) {
200 		pushUBO->Destroy(vulkan);
201 		delete pushUBO;
202 		pushUBO = nullptr;
203 	}
204 	if (pushVertex) {
205 		pushVertex->Destroy(vulkan);
206 		delete pushVertex;
207 		pushVertex = nullptr;
208 	}
209 	if (pushIndex) {
210 		pushIndex->Destroy(vulkan);
211 		delete pushIndex;
212 		pushIndex = nullptr;
213 	}
214 	if (pushLocal) {
215 		pushLocal->Destroy(vulkan);
216 		delete pushLocal;
217 		pushLocal = nullptr;
218 	}
219 }
220 
// Tears down everything InitDeviceObjects() created. Vulkan handles are
// released through the deferred-delete queue so in-flight frames can finish
// with them first.
void DrawEngineVulkan::DestroyDeviceObjects() {
	delete tessDataTransferVulkan;
	tessDataTransfer = nullptr;
	tessDataTransferVulkan = nullptr;

	// Per-frame push buffers and descriptor pools.
	for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
		frame_[i].Destroy(vulkan_);
	}
	if (samplerSecondary_ != VK_NULL_HANDLE)
		vulkan_->Delete().QueueDeleteSampler(samplerSecondary_);
	if (nullSampler_ != VK_NULL_HANDLE)
		vulkan_->Delete().QueueDeleteSampler(nullSampler_);
	if (pipelineLayout_ != VK_NULL_HANDLE)
		vulkan_->Delete().QueueDeletePipelineLayout(pipelineLayout_);
	if (descriptorSetLayout_ != VK_NULL_HANDLE)
		vulkan_->Delete().QueueDeleteDescriptorSetLayout(descriptorSetLayout_);
	if (vertexCache_) {
		vertexCache_->Destroy(vulkan_);
		delete vertexCache_;
		vertexCache_ = nullptr;
	}
	// Need to clear this to get rid of all remaining references to the dead buffers.
	vai_.Iterate([](uint32_t hash, VertexArrayInfoVulkan *vai) {
		delete vai;
	});
	vai_.Clear();
}
248 
// Called when the underlying Vulkan device goes away. Releases all
// device-scope objects; DeviceRestore() rebuilds them later.
void DrawEngineVulkan::DeviceLost() {
	DestroyDeviceObjects();
	DirtyAllUBOs();
}
253 
// Re-attaches to a (possibly new) Vulkan context and draw context after a
// device loss, then recreates everything DeviceLost() destroyed.
void DrawEngineVulkan::DeviceRestore(VulkanContext *vulkan, Draw::DrawContext *draw) {
	vulkan_ = vulkan;
	draw_ = draw;

	InitDeviceObjects();
}
260 
// Per-frame setup: rotates to this frame's push buffers, decimates the
// descriptor-set and vertex caches, and dirties cached binding state.
void DrawEngineVulkan::BeginFrame() {
	lastPipeline_ = nullptr;

	// -1 guarantees the "render step changed" path in DoFlush triggers on the
	// first flush of the frame.
	lastRenderStepId_ = -1;

	int curFrame = vulkan_->GetCurFrame();
	FrameData *frame = &frame_[curFrame];

	// First reset all buffers, then begin. This is so that Reset can free memory and Begin can allocate it,
	// if growing the buffer is needed. Doing it this way will reduce fragmentation if more than one buffer
	// needs to grow in the same frame. The state where many buffers are reset can also be used to
	// defragment memory.
	frame->pushUBO->Reset();
	frame->pushVertex->Reset();
	frame->pushIndex->Reset();
	frame->pushLocal->Reset();

	frame->pushUBO->Begin(vulkan_);
	frame->pushVertex->Begin(vulkan_);
	frame->pushIndex->Begin(vulkan_);
	frame->pushLocal->Begin(vulkan_);

	// TODO: How can we make this nicer...
	// Tessellation data rides in pushUBO as well.
	tessDataTransferVulkan->SetPushBuffer(frame->pushUBO);

	DirtyAllUBOs();

	// Wipe the vertex cache if it's grown too large.
	if (vertexCache_->GetTotalSize() > VERTEX_CACHE_SIZE) {
		vertexCache_->Destroy(vulkan_);
		delete vertexCache_;  // orphans the buffers, they'll get deleted once no longer used by an in-flight frame.
		vertexCache_ = new VulkanPushBuffer(vulkan_, VERTEX_CACHE_SIZE, VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
		// Every cached VertexArrayInfo pointed into the old cache - drop them all.
		vai_.Iterate([&](uint32_t hash, VertexArrayInfoVulkan *vai) {
			delete vai;
		});
		vai_.Clear();
	}

	vertexCache_->BeginNoReset();

	// Periodically reset this frame's descriptor pool and its lookup cache.
	if (--descDecimationCounter_ <= 0) {
		if (frame->descPool != VK_NULL_HANDLE)
			vkResetDescriptorPool(vulkan_->GetDevice(), frame->descPool, 0);
		frame->descSets.Clear();
		frame->descCount = 0;
		descDecimationCounter_ = DESCRIPTORSET_DECIMATION_INTERVAL;
	}

	// Periodically evict stale vertex cache entries. Entries unused for
	// VAI_KILL_AGE flips are evicted; UNRELIABLE entries get a longer grace
	// period but at most VAI_UNRELIABLE_KILL_MAX are evicted per pass.
	if (--decimationCounter_ <= 0) {
		decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL;

		const int threshold = gpuStats.numFlips - VAI_KILL_AGE;
		const int unreliableThreshold = gpuStats.numFlips - VAI_UNRELIABLE_KILL_AGE;
		int unreliableLeft = VAI_UNRELIABLE_KILL_MAX;
		vai_.Iterate([&](uint32_t hash, VertexArrayInfoVulkan *vai) {
			bool kill;
			if (vai->status == VertexArrayInfoVulkan::VAI_UNRELIABLE) {
				// We limit killing unreliable so we don't rehash too often.
				kill = vai->lastFrame < unreliableThreshold && --unreliableLeft >= 0;
			} else {
				kill = vai->lastFrame < threshold;
			}
			if (kill) {
				// This is actually quite safe.
				vai_.Remove(hash);
				delete vai;
			}
		});
	}
	vai_.Maintain();
}
332 
EndFrame()333 void DrawEngineVulkan::EndFrame() {
334 	FrameData *frame = &frame_[vulkan_->GetCurFrame()];
335 	stats_.pushUBOSpaceUsed = (int)frame->pushUBO->GetOffset();
336 	stats_.pushVertexSpaceUsed = (int)frame->pushVertex->GetOffset();
337 	stats_.pushIndexSpaceUsed = (int)frame->pushIndex->GetOffset();
338 	frame->pushUBO->End();
339 	frame->pushVertex->End();
340 	frame->pushIndex->End();
341 	frame->pushLocal->End();
342 	vertexCache_->End();
343 }
344 
DecodeVertsToPushBuffer(VulkanPushBuffer * push,uint32_t * bindOffset,VkBuffer * vkbuf)345 void DrawEngineVulkan::DecodeVertsToPushBuffer(VulkanPushBuffer *push, uint32_t *bindOffset, VkBuffer *vkbuf) {
346 	u8 *dest = decoded;
347 
348 	// Figure out how much pushbuffer space we need to allocate.
349 	if (push) {
350 		int vertsToDecode = ComputeNumVertsToDecode();
351 		dest = (u8 *)push->Push(vertsToDecode * dec_->GetDecVtxFmt().stride, bindOffset, vkbuf);
352 	}
353 	DecodeVerts(dest);
354 }
355 
// Forwards the line width (used when drawing line primitives) to the
// pipeline manager, which owns the corresponding pipeline state.
void DrawEngineVulkan::SetLineWidth(float lineWidth) {
	pipelineManager_->SetLineWidth(lineWidth);
}
359 
RecreateDescriptorPool(FrameData & frame,int newSize)360 VkResult DrawEngineVulkan::RecreateDescriptorPool(FrameData &frame, int newSize) {
361 	// Reallocate this desc pool larger, and "wipe" the cache. We might lose a tiny bit of descriptor set reuse but
362 	// only for this frame.
363 	if (frame.descPool) {
364 		DEBUG_LOG(G3D, "Reallocating desc pool from %d to %d", frame.descPoolSize, newSize);
365 		vulkan_->Delete().QueueDeleteDescriptorPool(frame.descPool);
366 		frame.descSets.Clear();
367 		frame.descCount = 0;
368 	}
369 	frame.descPoolSize = newSize;
370 
371 	VkDescriptorPoolSize dpTypes[3];
372 	dpTypes[0].descriptorCount = frame.descPoolSize * 3;
373 	dpTypes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
374 	dpTypes[1].descriptorCount = frame.descPoolSize * 3;  // Don't use these for tess anymore, need max three per set.
375 	dpTypes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
376 	dpTypes[2].descriptorCount = frame.descPoolSize * 3;  // TODO: Use a separate layout when no spline stuff is needed to reduce the need for these.
377 	dpTypes[2].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
378 
379 	VkDescriptorPoolCreateInfo dp{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
380 	dp.flags = 0;   // Don't want to mess around with individually freeing these.
381 									// We zap the whole pool every few frames.
382 	dp.maxSets = frame.descPoolSize;
383 	dp.pPoolSizes = dpTypes;
384 	dp.poolSizeCount = ARRAY_SIZE(dpTypes);
385 
386 	VkResult res = vkCreateDescriptorPool(vulkan_->GetDevice(), &dp, nullptr, &frame.descPool);
387 	return res;
388 }
389 
// Returns a descriptor set binding the given texture/sampler and the three
// dynamic UBOs, plus the secondary/depal textures (from member state) and the
// tessellation storage buffers when tess is set. Sets are cached per-frame,
// keyed on the bound objects - except when hardware tessellation is active,
// since those sets reference per-draw buffer data.
VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView, VkSampler sampler, VkBuffer base, VkBuffer light, VkBuffer bone, bool tess) {
	_dbg_assert_(base != VK_NULL_HANDLE);
	_dbg_assert_(light != VK_NULL_HANDLE);
	_dbg_assert_(bone != VK_NULL_HANDLE);

	// The cache key covers everything written into the set. Note that
	// boundSecondary_ and boundDepal_ come from member state, not parameters.
	DescriptorSetKey key;
	key.imageView_ = imageView;
	key.sampler_ = sampler;
	key.secondaryImageView_ = boundSecondary_;
	key.depalImageView_ = boundDepal_;
	key.base_ = base;
	key.light_ = light;
	key.bone_ = bone;

	FrameData &frame = frame_[vulkan_->GetCurFrame()];
	// See if we already have this descriptor set cached.
	if (!tess) { // Don't cache descriptors for HW tessellation.
		VkDescriptorSet d = frame.descSets.Get(key);
		if (d != VK_NULL_HANDLE)
			return d;
	}

	// Grow the pool if it can't hold one more set.
	// NOTE(review): if descPoolSize could ever be 0 here, doubling would
	// request a zero-sized pool - verify the initial value in the header.
	if (!frame.descPool || frame.descPoolSize < frame.descCount + 1) {
		VkResult res = RecreateDescriptorPool(frame, frame.descPoolSize * 2);
		_dbg_assert_(res == VK_SUCCESS);
	}

	// Didn't find one in the frame descriptor set cache, let's make a new one.
	// We wipe the cache on every frame.

	VkDescriptorSet desc;
	VkDescriptorSetAllocateInfo descAlloc{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
	descAlloc.pSetLayouts = &descriptorSetLayout_;
	descAlloc.descriptorPool = frame.descPool;
	descAlloc.descriptorSetCount = 1;
	VkResult result = vkAllocateDescriptorSets(vulkan_->GetDevice(), &descAlloc, &desc);

	if (result == VK_ERROR_FRAGMENTED_POOL || result < 0) {
		// There seems to have been a spec revision. Here we should apparently recreate the descriptor pool,
		// so let's do that. See https://www.khronos.org/registry/vulkan/specs/1.0/man/html/vkAllocateDescriptorSets.html
		// Fragmentation shouldn't really happen though since we wipe the pool every frame..
		VkResult res = RecreateDescriptorPool(frame, frame.descPoolSize);
		_assert_msg_(res == VK_SUCCESS, "Ran out of descriptor space (frag?) and failed to recreate a descriptor pool. sz=%d res=%d", (int)frame.descSets.size(), (int)res);
		descAlloc.descriptorPool = frame.descPool;  // Need to update this pointer since we have allocated a new one.
		result = vkAllocateDescriptorSets(vulkan_->GetDevice(), &descAlloc, &desc);
		_assert_msg_(result == VK_SUCCESS, "Ran out of descriptor space (frag?) and failed to allocate after recreating a descriptor pool. res=%d", (int)result);
	}

	// Even in release mode, this is bad.
	_assert_msg_(result == VK_SUCCESS, "Ran out of descriptor space in pool. sz=%d res=%d", (int)frame.descSets.size(), (int)result);

	// We just don't write to the slots we don't care about, which is fine.
	// writes[] is zero-initialized, and tex[]/buf[] stay alive until the
	// vkUpdateDescriptorSets call at the bottom consumes the pointers.
	VkWriteDescriptorSet writes[9]{};
	// Main texture
	int n = 0;
	VkDescriptorImageInfo tex[3]{};
	if (imageView) {
		_dbg_assert_(sampler != VK_NULL_HANDLE);

		tex[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
		tex[0].imageView = imageView;
		tex[0].sampler = sampler;
		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
		writes[n].pNext = nullptr;
		writes[n].dstBinding = DRAW_BINDING_TEXTURE;
		writes[n].pImageInfo = &tex[0];
		writes[n].descriptorCount = 1;
		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
		writes[n].dstSet = desc;
		n++;
	}

	// Secondary texture (framebuffer-as-texture effects).
	if (boundSecondary_) {
		tex[1].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
		tex[1].imageView = boundSecondary_;
		tex[1].sampler = samplerSecondary_;
		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
		writes[n].pNext = nullptr;
		writes[n].dstBinding = DRAW_BINDING_2ND_TEXTURE;
		writes[n].pImageInfo = &tex[1];
		writes[n].descriptorCount = 1;
		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
		writes[n].dstSet = desc;
		n++;
	}

	// Depal (palette lookup) texture.
	if (boundDepal_) {
		tex[2].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
		tex[2].imageView = boundDepal_;
		tex[2].sampler = samplerSecondary_;  // doesn't matter, we use load
		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
		writes[n].pNext = nullptr;
		writes[n].dstBinding = DRAW_BINDING_DEPAL_TEXTURE;
		writes[n].pImageInfo = &tex[2];
		writes[n].descriptorCount = 1;
		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
		writes[n].dstSet = desc;
		n++;
	}

	// Tessellation data buffer.
	if (tess) {
		const VkDescriptorBufferInfo *bufInfo = tessDataTransferVulkan->GetBufferInfo();
		// Control Points
		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
		writes[n].pNext = nullptr;
		writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF;
		writes[n].pBufferInfo = &bufInfo[0];
		writes[n].descriptorCount = 1;
		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
		writes[n].dstSet = desc;
		n++;
		// Weights U
		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
		writes[n].pNext = nullptr;
		writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF_WU;
		writes[n].pBufferInfo = &bufInfo[1];
		writes[n].descriptorCount = 1;
		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
		writes[n].dstSet = desc;
		n++;
		// Weights V
		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
		writes[n].pNext = nullptr;
		writes[n].dstBinding = DRAW_BINDING_TESS_STORAGE_BUF_WV;
		writes[n].pBufferInfo = &bufInfo[2];
		writes[n].descriptorCount = 1;
		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
		writes[n].dstSet = desc;
		n++;
	}

	// Uniform buffer objects. Always bound; offsets are supplied as dynamic
	// offsets at bind time, so range covers just one UBO struct each.
	VkDescriptorBufferInfo buf[3]{};
	int count = 0;
	buf[count].buffer = base;
	buf[count].offset = 0;
	buf[count].range = sizeof(UB_VS_FS_Base);
	count++;
	buf[count].buffer = light;
	buf[count].offset = 0;
	buf[count].range = sizeof(UB_VS_Lights);
	count++;
	buf[count].buffer = bone;
	buf[count].offset = 0;
	buf[count].range = sizeof(UB_VS_Bones);
	count++;
	// Relies on DRAW_BINDING_DYNUBO_BASE/LIGHT/BONE being consecutive.
	for (int i = 0; i < count; i++) {
		writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
		writes[n].pNext = nullptr;
		writes[n].dstBinding = DRAW_BINDING_DYNUBO_BASE + i;
		writes[n].dstArrayElement = 0;
		writes[n].pBufferInfo = &buf[i];
		writes[n].dstSet = desc;
		writes[n].descriptorCount = 1;
		writes[n].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
		n++;
	}

	vkUpdateDescriptorSets(vulkan_->GetDevice(), n, writes, 0, nullptr);

	if (!tess) // Again, avoid caching when HW tessellation.
		frame.descSets.Insert(key, desc);
	frame.descCount++;
	return desc;
}
556 
DirtyAllUBOs()557 void DrawEngineVulkan::DirtyAllUBOs() {
558 	baseUBOOffset = 0;
559 	lightUBOOffset = 0;
560 	boneUBOOffset = 0;
561 	baseBuf = VK_NULL_HANDLE;
562 	lightBuf = VK_NULL_HANDLE;
563 	boneBuf = VK_NULL_HANDLE;
564 	dirtyUniforms_ = DIRTY_BASE_UNIFORMS | DIRTY_LIGHT_UNIFORMS | DIRTY_BONE_UNIFORMS;
565 	imageView = VK_NULL_HANDLE;
566 	sampler = VK_NULL_HANDLE;
567 	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
568 }
569 
// Demotes a cached vertex array to UNRELIABLE - from now on its draws are
// re-decoded each time instead of trusted from the cache.
void MarkUnreliable(VertexArrayInfoVulkan *vai) {
	vai->status = VertexArrayInfoVulkan::VAI_UNRELIABLE;
	// TODO: If we change to a real allocator, free the data here.
	// For now we just leave it in the pushbuffer.
}
575 
576 // The inline wrapper in the header checks for numDrawCalls == 0
DoFlush()577 void DrawEngineVulkan::DoFlush() {
578 	PROFILE_THIS_SCOPE("Flush");
579 	gpuStats.numFlushes++;
580 	// TODO: Should be enough to update this once per frame?
581 	gpuStats.numTrackedVertexArrays = (int)vai_.size();
582 
583 	VulkanRenderManager *renderManager = (VulkanRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER);
584 
585 	// TODO: Needs to be behind a check for changed render pass, at an appropriate time in this function.
586 	// Similar issues as with the lastRenderStepId_ check. Will need a bit of a rethink.
587 	lastPipeline_ = nullptr;
588 	// If have a new render pass, dirty our dynamic state so it gets re-set.
589 	// We have to do this again after the last possible place in DoFlush that can cause a renderpass switch
590 	// like a shader blend blit or similar. But before we actually set the state!
591 	int curRenderStepId = renderManager->GetCurrentStepId();
592 	if (lastRenderStepId_ != curRenderStepId) {
593 		// Dirty everything that has dynamic state that will need re-recording.
594 		gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_BLEND_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
595 		textureCache_->ForgetLastTexture();
596 		lastRenderStepId_ = curRenderStepId;
597 	}
598 
599 	FrameData *frame = &frame_[vulkan_->GetCurFrame()];
600 
601 	bool tess = gstate_c.submitType == SubmitType::HW_BEZIER || gstate_c.submitType == SubmitType::HW_SPLINE;
602 
603 	bool textureNeedsApply = false;
604 	if (gstate_c.IsDirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS) && !gstate.isModeClear() && gstate.isTextureMapEnabled()) {
605 		textureCache_->SetTexture();
606 		gstate_c.Clean(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
607 		textureNeedsApply = true;
608 	} else if (gstate.getTextureAddress(0) == ((gstate.getFrameBufRawAddress() | 0x04000000) & 0x3FFFFFFF)) {
609 		// This catches the case of clearing a texture.
610 		gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
611 	}
612 
613 	GEPrimitiveType prim = prevPrim_;
614 
615 	// Always use software for flat shading to fix the provoking index.
616 	bool useHWTransform = CanUseHardwareTransform(prim) && (tess || gstate.getShadeMode() != GE_SHADE_FLAT);
617 
618 	VulkanVertexShader *vshader = nullptr;
619 	VulkanFragmentShader *fshader = nullptr;
620 
621 	uint32_t ibOffset;
622 	uint32_t vbOffset;
623 
624 	if (useHWTransform) {
625 		// We don't detect clears in this path, so here we can switch framebuffers if necessary.
626 
627 		int vertexCount = 0;
628 		int maxIndex;
629 		bool useElements = true;
630 
631 		// Cannot cache vertex data with morph enabled.
632 		bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK);
633 		// Also avoid caching when software skinning.
634 		VkBuffer vbuf = VK_NULL_HANDLE;
635 		VkBuffer ibuf = VK_NULL_HANDLE;
636 		if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) {
637 			useCache = false;
638 		}
639 
640 		if (useCache) {
641 			PROFILE_THIS_SCOPE("vcache");
642 			u32 id = dcid_ ^ gstate.getUVGenMode();  // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263
643 			VertexArrayInfoVulkan *vai = vai_.Get(id);
644 			if (!vai) {
645 				vai = new VertexArrayInfoVulkan();
646 				vai_.Insert(id, vai);
647 			}
648 
649 			switch (vai->status) {
650 			case VertexArrayInfoVulkan::VAI_NEW:
651 			{
652 				// Haven't seen this one before. We don't actually upload the vertex data yet.
653 				uint64_t dataHash = ComputeHash();
654 				vai->hash = dataHash;
655 				vai->minihash = ComputeMiniHash();
656 				vai->status = VertexArrayInfoVulkan::VAI_HASHING;
657 				vai->drawsUntilNextFullHash = 0;
658 				DecodeVertsToPushBuffer(frame->pushVertex, &vbOffset, &vbuf);  // writes to indexGen
659 				vai->numVerts = indexGen.VertexCount();
660 				vai->prim = indexGen.Prim();
661 				vai->maxIndex = indexGen.MaxIndex();
662 				vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0;
663 				goto rotateVBO;
664 			}
665 
666 			// Hashing - still gaining confidence about the buffer.
667 			// But if we get this far it's likely to be worth uploading the data.
668 			case VertexArrayInfoVulkan::VAI_HASHING:
669 			{
670 				PROFILE_THIS_SCOPE("vcachehash");
671 				vai->numDraws++;
672 				if (vai->lastFrame != gpuStats.numFlips) {
673 					vai->numFrames++;
674 				}
675 				if (vai->drawsUntilNextFullHash == 0) {
676 					// Let's try to skip a full hash if mini would fail.
677 					const u32 newMiniHash = ComputeMiniHash();
678 					uint64_t newHash = vai->hash;
679 					if (newMiniHash == vai->minihash) {
680 						newHash = ComputeHash();
681 					}
682 					if (newMiniHash != vai->minihash || newHash != vai->hash) {
683 						MarkUnreliable(vai);
684 						DecodeVertsToPushBuffer(frame->pushVertex, &vbOffset, &vbuf);
685 						goto rotateVBO;
686 					}
687 					if (vai->numVerts > 64) {
688 						// exponential backoff up to 16 draws, then every 24
689 						vai->drawsUntilNextFullHash = std::min(24, vai->numFrames);
690 					} else {
691 						// Lower numbers seem much more likely to change.
692 						vai->drawsUntilNextFullHash = 0;
693 					}
694 					// TODO: tweak
695 					//if (vai->numFrames > 1000) {
696 					//	vai->status = VertexArrayInfo::VAI_RELIABLE;
697 					//}
698 				} else {
699 					vai->drawsUntilNextFullHash--;
700 					u32 newMiniHash = ComputeMiniHash();
701 					if (newMiniHash != vai->minihash) {
702 						MarkUnreliable(vai);
703 						DecodeVertsToPushBuffer(frame->pushVertex, &vbOffset, &vbuf);
704 						goto rotateVBO;
705 					}
706 				}
707 
708 				if (!vai->vb) {
709 					// Directly push to the vertex cache.
710 					DecodeVertsToPushBuffer(vertexCache_, &vai->vbOffset, &vai->vb);
711 					_dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching.");
712 					vai->numVerts = indexGen.VertexCount();
713 					vai->prim = indexGen.Prim();
714 					vai->maxIndex = indexGen.MaxIndex();
715 					vai->flags = gstate_c.vertexFullAlpha ? VAIVULKAN_FLAG_VERTEXFULLALPHA : 0;
716 					useElements = !indexGen.SeenOnlyPurePrims();
717 					if (!useElements && indexGen.PureCount()) {
718 						vai->numVerts = indexGen.PureCount();
719 					}
720 					if (useElements) {
721 						u32 size = sizeof(uint16_t) * indexGen.VertexCount();
722 						void *dest = vertexCache_->Push(size, &vai->ibOffset, &vai->ib);
723 						memcpy(dest, decIndex, size);
724 					} else {
725 						vai->ib = VK_NULL_HANDLE;
726 						vai->ibOffset = 0;
727 					}
728 				} else {
729 					gpuStats.numCachedDrawCalls++;
730 					useElements = vai->ib ? true : false;
731 					gpuStats.numCachedVertsDrawn += vai->numVerts;
732 					gstate_c.vertexFullAlpha = vai->flags & VAIVULKAN_FLAG_VERTEXFULLALPHA;
733 				}
734 				vbuf = vai->vb;
735 				ibuf = vai->ib;
736 				vbOffset = vai->vbOffset;
737 				ibOffset = vai->ibOffset;
738 				vertexCount = vai->numVerts;
739 				maxIndex = vai->maxIndex;
740 				prim = static_cast<GEPrimitiveType>(vai->prim);
741 				break;
742 			}
743 
744 			// Reliable - we don't even bother hashing anymore. Right now we don't go here until after a very long time.
745 			case VertexArrayInfoVulkan::VAI_RELIABLE:
746 			{
747 				vai->numDraws++;
748 				if (vai->lastFrame != gpuStats.numFlips) {
749 					vai->numFrames++;
750 				}
751 				gpuStats.numCachedDrawCalls++;
752 				gpuStats.numCachedVertsDrawn += vai->numVerts;
753 				vbuf = vai->vb;
754 				ibuf = vai->ib;
755 				vbOffset = vai->vbOffset;
756 				ibOffset = vai->ibOffset;
757 				vertexCount = vai->numVerts;
758 				maxIndex = vai->maxIndex;
759 				prim = static_cast<GEPrimitiveType>(vai->prim);
760 
761 				gstate_c.vertexFullAlpha = vai->flags & VAIVULKAN_FLAG_VERTEXFULLALPHA;
762 				break;
763 			}
764 
765 			case VertexArrayInfoVulkan::VAI_UNRELIABLE:
766 			{
767 				vai->numDraws++;
768 				if (vai->lastFrame != gpuStats.numFlips) {
769 					vai->numFrames++;
770 				}
771 				DecodeVertsToPushBuffer(frame->pushVertex, &vbOffset, &vbuf);
772 				goto rotateVBO;
773 			}
774 			default:
775 				break;
776 			}
777 		} else {
778 			if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) {
779 				// If software skinning, we've already predecoded into "decoded". So push that content.
780 				VkDeviceSize size = decodedVerts_ * dec_->GetDecVtxFmt().stride;
781 				u8 *dest = (u8 *)frame->pushVertex->Push(size, &vbOffset, &vbuf);
782 				memcpy(dest, decoded, size);
783 			} else {
784 				// Decode directly into the pushbuffer
785 				DecodeVertsToPushBuffer(frame->pushVertex, &vbOffset, &vbuf);
786 			}
787 
788 	rotateVBO:
789 			gpuStats.numUncachedVertsDrawn += indexGen.VertexCount();
790 			useElements = !indexGen.SeenOnlyPurePrims();
791 			vertexCount = indexGen.VertexCount();
792 			if (!useElements && indexGen.PureCount()) {
793 				vertexCount = indexGen.PureCount();
794 			}
795 			prim = indexGen.Prim();
796 		}
797 
798 		bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE;
799 		if (gstate.isModeThrough()) {
800 			gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255);
801 		} else {
802 			gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255);
803 		}
804 
805 		PROFILE_THIS_SCOPE("updatestate");
806 
807 		if (textureNeedsApply) {
808 			textureCache_->ApplyTexture();
809 			textureCache_->GetVulkanHandles(imageView, sampler);
810 			if (imageView == VK_NULL_HANDLE)
811 				imageView = (VkImageView)draw_->GetNativeObject(Draw::NativeObject::NULL_IMAGEVIEW);
812 			if (sampler == VK_NULL_HANDLE)
813 				sampler = nullSampler_;
814 		}
815 
816 		if (!lastPipeline_ || gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE) || prim != lastPrim_) {
817 			if (prim != lastPrim_ || gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE)) {
818 				ConvertStateToVulkanKey(*framebufferManager_, shaderManager_, prim, pipelineKey_, dynState_);
819 			}
820 
821 			shaderManager_->GetShaders(prim, lastVType_, &vshader, &fshader, true, useHWTessellation_, decOptions_.expandAllWeightsToFloat);  // usehwtransform
822 			if (!vshader) {
823 				// We're screwed.
824 				return;
825 			}
826 			_dbg_assert_msg_(vshader->UseHWTransform(), "Bad vshader");
827 
828 			Draw::NativeObject object = framebufferManager_->UseBufferedRendering() ? Draw::NativeObject::FRAMEBUFFER_RENDERPASS : Draw::NativeObject::BACKBUFFER_RENDERPASS;
829 			VkRenderPass renderPass = (VkRenderPass)draw_->GetNativeObject(object);
830 			VulkanPipeline *pipeline = pipelineManager_->GetOrCreatePipeline(pipelineLayout_, renderPass, pipelineKey_, &dec_->decFmt, vshader, fshader, true);
831 			if (!pipeline || !pipeline->pipeline) {
832 				// Already logged, let's bail out.
833 				return;
834 			}
835 			BindShaderBlendTex();  // This might cause copies so important to do before BindPipeline.
836 
837 			// If have a new render pass, dirty our dynamic state so it gets re-set.
838 			// WARNING: We have to do this AFTER the last possible place in DoFlush that can cause a renderpass switch
839 			// like a shader blend blit or similar. But before we actually set the state!
840 			int curRenderStepId = renderManager->GetCurrentStepId();
841 			if (lastRenderStepId_ != curRenderStepId) {
842 				// Dirty everything that has dynamic state that will need re-recording.
843 				gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_BLEND_STATE);
844 				lastRenderStepId_ = curRenderStepId;
845 			}
846 
847 			renderManager->BindPipeline(pipeline->pipeline, (PipelineFlags)pipeline->flags);
848 			if (pipeline != lastPipeline_) {
849 				if (lastPipeline_ && !(lastPipeline_->UsesBlendConstant() && pipeline->UsesBlendConstant())) {
850 					gstate_c.Dirty(DIRTY_BLEND_STATE);
851 				}
852 				lastPipeline_ = pipeline;
853 			}
854 			ApplyDrawStateLate(renderManager, false, 0, pipeline->UsesBlendConstant());
855 			gstate_c.Clean(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
856 			lastPipeline_ = pipeline;
857 
858 			// Must dirty blend state here so we re-copy next time.  Example: Lunar's spell effects.
859 			if (fboTexBound_)
860 				gstate_c.Dirty(DIRTY_BLEND_STATE);
861 		}
862 		lastPrim_ = prim;
863 
864 		dirtyUniforms_ |= shaderManager_->UpdateUniforms(framebufferManager_->UseBufferedRendering());
865 		UpdateUBOs(frame);
866 
867 		VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf, tess);
868 
869 		const uint32_t dynamicUBOOffsets[3] = {
870 			baseUBOOffset, lightUBOOffset, boneUBOOffset,
871 		};
872 
873 		if (useElements) {
874 			if (!ibuf) {
875 				ibOffset = (uint32_t)frame->pushIndex->Push(decIndex, sizeof(uint16_t) * indexGen.VertexCount(), &ibuf);
876 			}
877 			renderManager->DrawIndexed(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, vertexCount, 1, VK_INDEX_TYPE_UINT16);
878 		} else {
879 			renderManager->Draw(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount);
880 		}
881 	} else {
882 		PROFILE_THIS_SCOPE("soft");
883 		// Decode to "decoded"
884 		DecodeVertsToPushBuffer(nullptr, nullptr, nullptr);
885 		bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE;
886 		if (gstate.isModeThrough()) {
887 			gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255);
888 		} else {
889 			gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255);
890 		}
891 
892 		gpuStats.numUncachedVertsDrawn += indexGen.VertexCount();
893 		prim = indexGen.Prim();
894 		// Undo the strip optimization, not supported by the SW code yet.
895 		if (prim == GE_PRIM_TRIANGLE_STRIP)
896 			prim = GE_PRIM_TRIANGLES;
897 		VERBOSE_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount());
898 
899 		u16 *inds = decIndex;
900 		SoftwareTransformResult result{};
901 		SoftwareTransformParams params{};
902 		params.decoded = decoded;
903 		params.transformed = transformed;
904 		params.transformedExpanded = transformedExpanded;
905 		params.fbman = framebufferManager_;
906 		params.texCache = textureCache_;
907 		// We have to force drawing of primitives if !framebufferManager_->UseBufferedRendering() because Vulkan clears
908 		// do not respect scissor rects.
909 		params.allowClear = framebufferManager_->UseBufferedRendering();
910 		params.allowSeparateAlphaClear = false;
911 		params.provokeFlatFirst = true;
912 
913 		// We need to update the viewport early because it's checked for flipping in SoftwareTransform.
914 		// We don't have a "DrawStateEarly" in vulkan, so...
915 		// TODO: Probably should eventually refactor this and feed the vp size into SoftwareTransform directly (Unknown's idea).
916 		if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) {
917 			gstate_c.vpWidth = gstate.getViewportXScale() * 2.0f;
918 			gstate_c.vpHeight = gstate.getViewportYScale() * 2.0f;
919 		}
920 
921 		int maxIndex = indexGen.MaxIndex();
922 		SoftwareTransform swTransform(params);
923 		swTransform.Decode(prim, dec_->VertexType(), dec_->GetDecVtxFmt(), maxIndex, &result);
924 		if (result.action == SW_NOT_READY) {
925 			swTransform.DetectOffsetTexture(maxIndex);
926 			swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, maxIndex, &result);
927 		}
928 
929 		if (result.setSafeSize)
930 			framebufferManager_->SetSafeSize(result.safeWidth, result.safeHeight);
931 
932 		// Only here, where we know whether to clear or to draw primitives, should we actually set the current framebuffer! Because that gives use the opportunity
933 		// to use a "pre-clear" render pass, for high efficiency on tilers.
934 		if (result.action == SW_DRAW_PRIMITIVES) {
935 			if (textureNeedsApply) {
936 				textureCache_->ApplyTexture();
937 				textureCache_->GetVulkanHandles(imageView, sampler);
938 				if (imageView == VK_NULL_HANDLE)
939 					imageView = (VkImageView)draw_->GetNativeObject(Draw::NativeObject::NULL_IMAGEVIEW);
940 				if (sampler == VK_NULL_HANDLE)
941 					sampler = nullSampler_;
942 			}
943 			if (!lastPipeline_ || gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE) || prim != lastPrim_) {
944 				shaderManager_->GetShaders(prim, lastVType_, &vshader, &fshader, false, false, decOptions_.expandAllWeightsToFloat);  // usehwtransform
945 				_dbg_assert_msg_(!vshader->UseHWTransform(), "Bad vshader");
946 				if (prim != lastPrim_ || gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE)) {
947 					ConvertStateToVulkanKey(*framebufferManager_, shaderManager_, prim, pipelineKey_, dynState_);
948 				}
949 				Draw::NativeObject object = framebufferManager_->UseBufferedRendering() ? Draw::NativeObject::FRAMEBUFFER_RENDERPASS : Draw::NativeObject::BACKBUFFER_RENDERPASS;
950 				VkRenderPass renderPass = (VkRenderPass)draw_->GetNativeObject(object);
951 				VulkanPipeline *pipeline = pipelineManager_->GetOrCreatePipeline(pipelineLayout_, renderPass, pipelineKey_, &dec_->decFmt, vshader, fshader, false);
952 				if (!pipeline || !pipeline->pipeline) {
953 					// Already logged, let's bail out.
954 					return;
955 				}
956 				BindShaderBlendTex();  // This might cause copies so super important to do before BindPipeline.
957 
958 				// If have a new render pass, dirty our dynamic state so it gets re-set.
959 				// WARNING: We have to do this AFTER the last possible place in DoFlush that can cause a renderpass switch
960 				// like a shader blend blit or similar. But before we actually set the state!
961 				int curRenderStepId = renderManager->GetCurrentStepId();
962 				if (lastRenderStepId_ != curRenderStepId) {
963 					// Dirty everything that has dynamic state that will need re-recording.
964 					gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_BLEND_STATE);
965 					lastRenderStepId_ = curRenderStepId;
966 				}
967 
968 				renderManager->BindPipeline(pipeline->pipeline, (PipelineFlags)pipeline->flags);
969 				if (pipeline != lastPipeline_) {
970 					if (lastPipeline_ && !lastPipeline_->UsesBlendConstant() && pipeline->UsesBlendConstant()) {
971 						gstate_c.Dirty(DIRTY_BLEND_STATE);
972 					}
973 					lastPipeline_ = pipeline;
974 				}
975 				ApplyDrawStateLate(renderManager, result.setStencil, result.stencilValue, pipeline->UsesBlendConstant());
976 				gstate_c.Clean(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
977 				lastPipeline_ = pipeline;
978 
979 				// Must dirty blend state here so we re-copy next time.  Example: Lunar's spell effects.
980 				if (fboTexBound_)
981 					gstate_c.Dirty(DIRTY_BLEND_STATE);
982 			}
983 			lastPrim_ = prim;
984 
985 			dirtyUniforms_ |= shaderManager_->UpdateUniforms(framebufferManager_->UseBufferedRendering());
986 
987 			// Even if the first draw is through-mode, make sure we at least have one copy of these uniforms buffered
988 			UpdateUBOs(frame);
989 
990 			VkDescriptorSet ds = GetOrCreateDescriptorSet(imageView, sampler, baseBuf, lightBuf, boneBuf, tess);
991 			const uint32_t dynamicUBOOffsets[3] = {
992 				baseUBOOffset, lightUBOOffset, boneUBOOffset,
993 			};
994 
995 			PROFILE_THIS_SCOPE("renderman_q");
996 
997 			if (result.drawIndexed) {
998 				VkBuffer vbuf, ibuf;
999 				vbOffset = (uint32_t)frame->pushVertex->Push(result.drawBuffer, maxIndex * sizeof(TransformedVertex), &vbuf);
1000 				ibOffset = (uint32_t)frame->pushIndex->Push(inds, sizeof(short) * result.drawNumTrans, &ibuf);
1001 				renderManager->DrawIndexed(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, ibuf, ibOffset, result.drawNumTrans, 1, VK_INDEX_TYPE_UINT16);
1002 			} else {
1003 				VkBuffer vbuf;
1004 				vbOffset = (uint32_t)frame->pushVertex->Push(result.drawBuffer, result.drawNumTrans * sizeof(TransformedVertex), &vbuf);
1005 				renderManager->Draw(pipelineLayout_, ds, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, result.drawNumTrans);
1006 			}
1007 		} else if (result.action == SW_CLEAR) {
1008 			// Note: we won't get here if the clear is alpha but not color, or color but not alpha.
1009 
1010 			// We let the framebuffer manager handle the clear. It can use renderpasses to optimize on tilers.
1011 			// If non-buffered though, it'll just do a plain clear.
1012 			framebufferManager_->NotifyClear(gstate.isClearModeColorMask(), gstate.isClearModeAlphaMask(), gstate.isClearModeDepthMask(), result.color, result.depth);
1013 
1014 			if (gstate_c.Supports(GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) {
1015 				int scissorX1 = gstate.getScissorX1();
1016 				int scissorY1 = gstate.getScissorY1();
1017 				int scissorX2 = gstate.getScissorX2() + 1;
1018 				int scissorY2 = gstate.getScissorY2() + 1;
1019 				framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, result.color);
1020 			}
1021 		}
1022 	}
1023 
1024 	gpuStats.numDrawCalls += numDrawCalls;
1025 	gpuStats.numVertsSubmitted += vertexCountInDrawCalls_;
1026 
1027 	indexGen.Reset();
1028 	decodedVerts_ = 0;
1029 	numDrawCalls = 0;
1030 	vertexCountInDrawCalls_ = 0;
1031 	decodeCounter_ = 0;
1032 	dcid_ = 0;
1033 	prevPrim_ = GE_PRIM_INVALID;
1034 	gstate_c.vertexFullAlpha = true;
1035 	framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason);
1036 
1037 	// Now seems as good a time as any to reset the min/max coords, which we may examine later.
1038 	gstate_c.vertBounds.minU = 512;
1039 	gstate_c.vertBounds.minV = 512;
1040 	gstate_c.vertBounds.maxU = 0;
1041 	gstate_c.vertBounds.maxV = 0;
1042 
1043 	GPUDebug::NotifyDraw();
1044 }
1045 
UpdateUBOs(FrameData * frame)1046 void DrawEngineVulkan::UpdateUBOs(FrameData *frame) {
1047 	if ((dirtyUniforms_ & DIRTY_BASE_UNIFORMS) || baseBuf == VK_NULL_HANDLE) {
1048 		baseUBOOffset = shaderManager_->PushBaseBuffer(frame->pushUBO, &baseBuf);
1049 		dirtyUniforms_ &= ~DIRTY_BASE_UNIFORMS;
1050 	}
1051 	if ((dirtyUniforms_ & DIRTY_LIGHT_UNIFORMS) || lightBuf == VK_NULL_HANDLE) {
1052 		lightUBOOffset = shaderManager_->PushLightBuffer(frame->pushUBO, &lightBuf);
1053 		dirtyUniforms_ &= ~DIRTY_LIGHT_UNIFORMS;
1054 	}
1055 	if ((dirtyUniforms_ & DIRTY_BONE_UNIFORMS) || boneBuf == VK_NULL_HANDLE) {
1056 		boneUBOOffset = shaderManager_->PushBoneBuffer(frame->pushUBO, &boneBuf);
1057 		dirtyUniforms_ &= ~DIRTY_BONE_UNIFORMS;
1058 	}
1059 }
1060 
SendDataToShader(const SimpleVertex * const * points,int size_u,int size_v,u32 vertType,const Spline::Weight2D & weights)1061 void TessellationDataTransferVulkan::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) {
1062 	// SSBOs that are not simply float1 or float2 need to be padded up to a float4 size. vec3 members
1063 	// also need to be 16-byte aligned, hence the padding.
1064 	struct TessData {
1065 		float pos[3]; float pad1;
1066 		float uv[2]; float pad2[2];
1067 		float color[4];
1068 	};
1069 
1070 	int size = size_u * size_v;
1071 
1072 	int ssboAlignment = vulkan_->GetPhysicalDeviceProperties().properties.limits.minStorageBufferOffsetAlignment;
1073 	uint8_t *data = (uint8_t *)push_->PushAligned(size * sizeof(TessData), (uint32_t *)&bufInfo_[0].offset, &bufInfo_[0].buffer, ssboAlignment);
1074 	bufInfo_[0].range = size * sizeof(TessData);
1075 
1076 	float *pos = (float *)(data);
1077 	float *tex = (float *)(data + offsetof(TessData, uv));
1078 	float *col = (float *)(data + offsetof(TessData, color));
1079 	int stride = sizeof(TessData) / sizeof(float);
1080 
1081 	CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType);
1082 
1083 	using Spline::Weight;
1084 
1085 	// Weights U
1086 	data = (uint8_t *)push_->PushAligned(weights.size_u * sizeof(Weight), (uint32_t *)&bufInfo_[1].offset, &bufInfo_[1].buffer, ssboAlignment);
1087 	memcpy(data, weights.u, weights.size_u * sizeof(Weight));
1088 	bufInfo_[1].range = weights.size_u * sizeof(Weight);
1089 
1090 	// Weights V
1091 	data = (uint8_t *)push_->PushAligned(weights.size_v * sizeof(Weight), (uint32_t *)&bufInfo_[2].offset, &bufInfo_[2].buffer, ssboAlignment);
1092 	memcpy(data, weights.v, weights.size_v * sizeof(Weight));
1093 	bufInfo_[2].range = weights.size_v * sizeof(Weight);
1094 }
1095