1 /* Copyright (c) 2020 Themaister
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining
4  * a copy of this software and associated documentation files (the
5  * "Software"), to deal in the Software without restriction, including
6  * without limitation the rights to use, copy, modify, merge, publish,
7  * distribute, sublicense, and/or sell copies of the Software, and to
8  * permit persons to whom the Software is furnished to do so, subject to
9  * the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "rdp_renderer.hpp"
24 #include "rdp_device.hpp"
25 #include "logging.hpp"
26 #include "bitops.hpp"
27 #include "luts.hpp"
28 #include "timer.hpp"
29 #ifdef PARALLEL_RDP_SHADER_DIR
30 #include "global_managers.hpp"
31 #include "os_filesystem.hpp"
32 #else
33 #include "shaders/slangmosh.hpp"
34 #endif
35 
36 namespace RDP
37 {
38 Renderer::Renderer(CommandProcessor &processor_)
39 	: processor(processor_)
40 {
41 	active_submissions = 0;
42 }
43 
44 Renderer::~Renderer()
45 {
46 }
47 
48 void Renderer::set_shader_bank(const ShaderBank *bank)
49 {
50 	shader_bank = bank;
51 }
52 
53 bool Renderer::init_renderer(const RendererOptions &options)
54 {
55 	if (options.upscaling_factor == 0)
56 		return false;
57 
58 	caps.max_width = options.upscaling_factor * Limits::MaxWidth;
59 	caps.max_height = options.upscaling_factor * Limits::MaxHeight;
60 	caps.max_tiles_x = options.upscaling_factor * ImplementationConstants::MaxTilesX;
61 	caps.max_tiles_y = options.upscaling_factor * ImplementationConstants::MaxTilesY;
62 	caps.max_num_tile_instances = options.upscaling_factor * options.upscaling_factor * Limits::MaxTileInstances;
63 
64 #ifdef PARALLEL_RDP_SHADER_DIR
65 	pipeline_worker.reset(new WorkerThread<Vulkan::DeferredPipelineCompile, PipelineExecutor>(
66 			Granite::Global::create_thread_context(), { device }));
67 #else
68 	pipeline_worker.reset(new WorkerThread<Vulkan::DeferredPipelineCompile, PipelineExecutor>({ device }));
69 #endif
70 
71 #ifdef PARALLEL_RDP_SHADER_DIR
72 	if (!Granite::Global::filesystem()->get_backend("rdp"))
73 		Granite::Global::filesystem()->register_protocol("rdp", std::make_unique<Granite::OSFilesystem>(PARALLEL_RDP_SHADER_DIR));
74 	device->get_shader_manager().add_include_directory("builtin://shaders/inc");
75 #endif
76 
77 	for (auto &buffer : buffer_instances)
78 		buffer.init(*device);
79 
80 	if (const char *env = getenv("RDP_DEBUG"))
81 		debug_channel = strtoul(env, nullptr, 0) != 0;
82 	if (const char *env = getenv("RDP_DEBUG_X"))
83 		filter_debug_channel_x = strtol(env, nullptr, 0);
84 	if (const char *env = getenv("RDP_DEBUG_Y"))
85 		filter_debug_channel_y = strtol(env, nullptr, 0);
86 
87 	{
88 		Vulkan::BufferCreateInfo info = {};
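		// Each TMEM instance is 0x1000 bytes, mirroring the RDP's 4 KiB of on-chip TMEM.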
89 		info.size = Limits::MaxTMEMInstances * 0x1000;
90 		info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
91 		info.domain = Vulkan::BufferDomain::Device;
92 		info.misc = Vulkan::BUFFER_MISC_ZERO_INITIALIZE_BIT;
93 		tmem_instances = device->create_buffer(info);
94 		device->set_name(*tmem_instances, "tmem-instances");
95 		stream.tmem_upload_infos.reserve(Limits::MaxTMEMInstances);
96 	}
97 
98 	{
99 		Vulkan::BufferCreateInfo info = {};
100 		info.size = options.upscaling_factor * Limits::MaxSpanSetups * sizeof(SpanSetup);
101 		info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
102 		info.domain = Vulkan::BufferDomain::Device;
103 		info.misc = Vulkan::BUFFER_MISC_ZERO_INITIALIZE_BIT;
104 		span_setups = device->create_buffer(info);
105 		device->set_name(*span_setups, "span-setups");
106 	}
107 
108 	init_blender_lut();
109 	init_buffers(options);
110 	if (options.upscaling_factor > 1 && !init_internal_upscaling_factor(options))
111 		return false;
112 	return init_caps();
113 }
114 
115 void Renderer::set_device(Vulkan::Device *device_)
116 {
117 	device = device_;
118 }
119 
120 bool Renderer::init_caps()
121 {
122 	auto &features = device->get_device_features();
123 
124 	if (const char *timestamp = getenv("PARALLEL_RDP_BENCH"))
125 	{
126 		caps.timestamp = strtol(timestamp, nullptr, 0);
127 		LOGI("Enabling timestamps = %d\n", caps.timestamp);
128 	}
129 
130 	if (const char *ubershader = getenv("PARALLEL_RDP_UBERSHADER"))
131 	{
132 		caps.ubershader = strtol(ubershader, nullptr, 0) > 0;
133 		LOGI("Overriding ubershader = %d\n", int(caps.ubershader));
134 	}
135 
136 	if (const char *force_sync = getenv("PARALLEL_RDP_FORCE_SYNC_SHADER"))
137 	{
138 		caps.force_sync = strtol(force_sync, nullptr, 0) > 0;
139 		LOGI("Overriding force sync shader = %d\n", int(caps.force_sync));
140 	}
141 
142 	bool allow_subgroup = true;
143 	if (const char *subgroup = getenv("PARALLEL_RDP_SUBGROUP"))
144 	{
145 		allow_subgroup = strtol(subgroup, nullptr, 0) > 0;
146 		LOGI("Allow subgroups = %d\n", int(allow_subgroup));
147 	}
148 
149 	bool allow_small_types = true;
150 	bool forces_small_types = false;
151 	if (const char *small = getenv("PARALLEL_RDP_SMALL_TYPES"))
152 	{
153 		allow_small_types = strtol(small, nullptr, 0) > 0;
154 		forces_small_types = true;
155 		LOGI("Allow small types = %d.\n", int(allow_small_types));
156 	}
157 
158 	if (!features.storage_16bit_features.storageBuffer16BitAccess)
159 	{
160 		LOGE("VK_KHR_16bit_storage for SSBOs is not supported! This is a minimum requirement for paraLLEl-RDP.\n");
161 		return false;
162 	}
163 
164 	if (!features.storage_8bit_features.storageBuffer8BitAccess)
165 	{
166 		LOGE("VK_KHR_8bit_storage for SSBOs is not supported! This is a minimum requirement for paraLLEl-RDP.\n");
167 		return false;
168 	}
169 
170 	// Driver workarounds here for 8/16-bit integer support.
171 	if (features.supports_driver_properties && !forces_small_types)
172 	{
173 		if (features.driver_properties.driverID == VK_DRIVER_ID_AMD_PROPRIETARY_KHR)
174 		{
175 			LOGW("Current proprietary AMD driver is known to be buggy with 8/16-bit integer arithmetic, disabling support for time being.\n");
176 			allow_small_types = false;
177 		}
178 		else if (features.driver_properties.driverID == VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR ||
179 		         features.driver_properties.driverID == VK_DRIVER_ID_MESA_RADV_KHR)
180 		{
181 			LOGW("Current open-source AMD drivers are known to be slightly faster without 8/16-bit integer arithmetic.\n");
182 			allow_small_types = false;
183 		}
184 		else if (features.driver_properties.driverID == VK_DRIVER_ID_NVIDIA_PROPRIETARY_KHR)
185 		{
186 			LOGW("Current NVIDIA driver is known to be slightly faster without 8/16-bit integer arithmetic.\n");
187 			allow_small_types = false;
188 		}
189 		else if (features.driver_properties.driverID == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS_KHR)
190 		{
191 			LOGW("Current proprietary Intel Windows driver is tested to perform much better without 8/16-bit integer support.\n");
192 			allow_small_types = false;
193 		}
194 
195 		// Intel ANV *must* use small integer arithmetic, or it doesn't pass test suite.
196 	}
197 
198 	if (!allow_small_types)
199 	{
200 		caps.supports_small_integer_arithmetic = false;
201 	}
202 	else if (features.enabled_features.shaderInt16 && features.float16_int8_features.shaderInt8)
203 	{
204 		LOGI("Enabling 8 and 16-bit integer arithmetic support for more efficient shaders!\n");
205 		caps.supports_small_integer_arithmetic = true;
206 	}
207 	else
208 	{
209 		LOGW("Device does not support 8 and 16-bit integer arithmetic support. Falling back to 32-bit arithmetic everywhere.\n");
210 		caps.supports_small_integer_arithmetic = false;
211 	}
212 
213 	uint32_t subgroup_size = features.subgroup_properties.subgroupSize;
214 
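	// Subgroup-accelerated tile binning needs ballot, basic, vote and arithmetic ops in compute shaders.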
215 	const VkSubgroupFeatureFlags required =
216 			VK_SUBGROUP_FEATURE_BALLOT_BIT |
217 			VK_SUBGROUP_FEATURE_BASIC_BIT |
218 			VK_SUBGROUP_FEATURE_VOTE_BIT |
219 			VK_SUBGROUP_FEATURE_ARITHMETIC_BIT;
220 
221 	caps.subgroup_tile_binning =
222 			allow_subgroup &&
223 			(features.subgroup_properties.supportedOperations & required) == required &&
224 			(features.subgroup_properties.supportedStages & VK_SHADER_STAGE_COMPUTE_BIT) != 0 &&
225 			can_support_minimum_subgroup_size(32) && subgroup_size <= 64;
226 
227 	return true;
228 }
229 
230 int Renderer::resolve_shader_define(const char *name, const char *define) const
231 {
232 	if (strcmp(define, "DEBUG_ENABLE") == 0)
233 		return int(debug_channel);
234 	else if (strcmp(define, "UBERSHADER") == 0)
235 		return int(caps.ubershader);
236 	else if (strcmp(define, "SMALL_TYPES") == 0)
237 		return int(caps.supports_small_integer_arithmetic);
238 	else if (strcmp(define, "SUBGROUP") == 0)
239 	{
240 		if (strcmp(name, "tile_binning_combined") == 0)
241 			return int(caps.subgroup_tile_binning);
242 		else
243 			return 0;
244 	}
245 	else
246 		return 0;
247 }
248 
249 void Renderer::init_buffers(const RendererOptions &options)
250 {
251 	Vulkan::BufferCreateInfo info = {};
252 	info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
253 	info.domain = Vulkan::BufferDomain::Device;
254 	info.misc = Vulkan::BUFFER_MISC_ZERO_INITIALIZE_BIT;
255 
256 	static_assert((Limits::MaxPrimitives % 32) == 0, "MaxPrimitives must be divisible by 32.");
257 	static_assert(Limits::MaxPrimitives <= (32 * 32), "MaxPrimitives must be less than or equal to 1024.");
258 
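	// One 32-bit bitmask word per group of 32 primitives for every binning tile on screen.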
259 	info.size = sizeof(uint32_t) *
260 	            (Limits::MaxPrimitives / 32) *
261 	            (caps.max_width / ImplementationConstants::TileWidth) *
262 	            (caps.max_height / ImplementationConstants::TileHeight);
263 
264 	tile_binning_buffer = device->create_buffer(info);
265 	device->set_name(*tile_binning_buffer, "tile-binning-buffer");
266 
267 	info.size = sizeof(uint32_t) *
268 	            (caps.max_width / ImplementationConstants::TileWidth) *
269 	            (caps.max_height / ImplementationConstants::TileHeight);
270 
271 	tile_binning_buffer_coarse = device->create_buffer(info);
272 	device->set_name(*tile_binning_buffer_coarse, "tile-binning-buffer-coarse");
273 
274 	if (!caps.ubershader)
275 	{
276 		info.size = sizeof(uint32_t) *
277 		            (Limits::MaxPrimitives / 32) *
278 		            (caps.max_width / ImplementationConstants::TileWidth) *
279 		            (caps.max_height / ImplementationConstants::TileHeight);
280 
281 		per_tile_offsets = device->create_buffer(info);
282 		device->set_name(*per_tile_offsets, "per-tile-offsets");
283 
284 		info.size = sizeof(TileRasterWork) * Limits::MaxStaticRasterizationStates * caps.max_num_tile_instances;
285 		tile_work_list = device->create_buffer(info);
286 		device->set_name(*tile_work_list, "tile-work-list");
287 
288 		info.size = sizeof(uint32_t) *
289 		            caps.max_num_tile_instances *
290 		            ImplementationConstants::TileWidth *
291 		            ImplementationConstants::TileHeight;
292 		per_tile_shaded_color = device->create_buffer(info);
293 		device->set_name(*per_tile_shaded_color, "per-tile-shaded-color");
294 		per_tile_shaded_depth = device->create_buffer(info);
295 		device->set_name(*per_tile_shaded_depth, "per-tile-shaded-depth");
296 
297 		info.size = sizeof(uint8_t) *
298 		            caps.max_num_tile_instances *
299 		            ImplementationConstants::TileWidth *
300 		            ImplementationConstants::TileHeight;
301 		per_tile_shaded_coverage = device->create_buffer(info);
302 		per_tile_shaded_shaded_alpha = device->create_buffer(info);
303 		device->set_name(*per_tile_shaded_coverage, "per-tile-shaded-coverage");
304 		device->set_name(*per_tile_shaded_shaded_alpha, "per-tile-shaded-shaded-alpha");
305 	}
306 }
307 
308 void Renderer::init_blender_lut()
309 {
310 	Vulkan::BufferCreateInfo info = {};
311 	info.size = sizeof(blender_lut);
312 	info.domain = Vulkan::BufferDomain::Device;
313 	info.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
314 
315 	blender_divider_lut_buffer = device->create_buffer(info, blender_lut);
316 	device->set_name(*blender_divider_lut_buffer, "blender-divider-lut-buffer");
317 
318 	Vulkan::BufferViewCreateInfo view = {};
319 	view.buffer = blender_divider_lut_buffer.get();
320 	view.format = VK_FORMAT_R8_UINT;
321 	view.range = info.size;
322 	blender_divider_buffer = device->create_buffer_view(view);
323 }
324 
325 void Renderer::message(const std::string &tag, uint32_t code, uint32_t x, uint32_t y, uint32_t, uint32_t num_words,
326                        const Vulkan::DebugChannelInterface::Word *words)
327 {
328 	if (filter_debug_channel_x >= 0 && x != uint32_t(filter_debug_channel_x))
329 		return;
330 	if (filter_debug_channel_y >= 0 && y != uint32_t(filter_debug_channel_y))
331 		return;
332 
333 	enum Code
334 	{
335 		ASSERT_EQUAL = 0,
336 		ASSERT_NOT_EQUAL = 1,
337 		ASSERT_LESS_THAN = 2,
338 		ASSERT_LESS_THAN_EQUAL = 3,
339 		GENERIC = 4,
340 		HEX = 5
341 	};
342 
343 	switch (Code(code))
344 	{
345 	case ASSERT_EQUAL:
346 		LOGE("ASSERT TRIPPED FOR (%u, %u), line %d, %d == %d failed.\n",
347 		     x, y, words[0].s32, words[1].s32, words[2].s32);
348 		break;
349 
350 	case ASSERT_NOT_EQUAL:
351 		LOGE("ASSERT TRIPPED FOR (%u, %u), line %d, %d != %d failed.\n",
352 		     x, y, words[0].s32, words[1].s32, words[2].s32);
353 		break;
354 
355 	case ASSERT_LESS_THAN:
356 		LOGE("ASSERT TRIPPED FOR (%u, %u), line %d, %d < %d failed.\n",
357 		     x, y, words[0].s32, words[1].s32, words[2].s32);
358 		break;
359 
360 	case ASSERT_LESS_THAN_EQUAL:
361 		LOGE("ASSERT TRIPPED FOR (%u, %u), line %d, %d <= %d failed.\n",
362 		     x, y, words[0].s32, words[1].s32, words[2].s32);
363 		break;
364 
365 	case GENERIC:
366 		switch (num_words)
367 		{
368 		case 1:
369 			LOGI("(%u, %u), line %d.\n", x, y, words[0].s32);
370 			break;
371 
372 		case 2:
373 			LOGI("(%u, %u), line %d: (%d).\n", x, y, words[0].s32, words[1].s32);
374 			break;
375 
376 		case 3:
377 			LOGI("(%u, %u), line %d: (%d, %d).\n", x, y, words[0].s32, words[1].s32, words[2].s32);
378 			break;
379 
380 		case 4:
381 			LOGI("(%u, %u), line %d: (%d, %d, %d).\n", x, y,
382 					words[0].s32, words[1].s32, words[2].s32, words[3].s32);
383 			break;
384 
385 		default:
386 			LOGE("Unknown number of generic parameters: %u\n", num_words);
387 			break;
388 		}
389 		break;
390 
391 	case HEX:
392 		switch (num_words)
393 		{
394 		case 1:
395 			LOGI("(%u, %u), line %d.\n", x, y, words[0].s32);
396 			break;
397 
398 		case 2:
399 			LOGI("(%u, %u), line %d: (0x%x).\n", x, y, words[0].s32, words[1].s32);
400 			break;
401 
402 		case 3:
403 			LOGI("(%u, %u), line %d: (0x%x, 0x%x).\n", x, y, words[0].s32, words[1].s32, words[2].s32);
404 			break;
405 
406 		case 4:
407 			LOGI("(%u, %u), line %d: (0x%x, 0x%x, 0x%x).\n", x, y,
408 			     words[0].s32, words[1].s32, words[2].s32, words[3].s32);
409 			break;
410 
411 		default:
412 			LOGE("Unknown number of generic parameters: %u\n", num_words);
413 			break;
414 		}
415 		break;
416 
417 	default:
418 		LOGE("Unexpected message code: %u\n", code);
419 		break;
420 	}
421 }
422 
423 void Renderer::RenderBuffers::init(Vulkan::Device &device, Vulkan::BufferDomain domain,
424                                    RenderBuffers *borrow)
425 {
426 	triangle_setup = create_buffer(device, domain,
427 	                               sizeof(TriangleSetup) * Limits::MaxPrimitives,
428 	                               borrow ? &borrow->triangle_setup : nullptr);
429 	device.set_name(*triangle_setup.buffer, "triangle-setup");
430 
431 	attribute_setup = create_buffer(device, domain,
432 	                                sizeof(AttributeSetup) * Limits::MaxPrimitives,
433 	                                borrow ? &borrow->attribute_setup : nullptr);
434 	device.set_name(*attribute_setup.buffer, "attribute-setup");
435 
436 	derived_setup = create_buffer(device, domain,
437 	                              sizeof(DerivedSetup) * Limits::MaxPrimitives,
438 	                              borrow ? &borrow->derived_setup : nullptr);
439 	device.set_name(*derived_setup.buffer, "derived-setup");
440 
441 	scissor_setup = create_buffer(device, domain,
442 	                              sizeof(ScissorState) * Limits::MaxPrimitives,
443 	                              borrow ? &borrow->scissor_setup : nullptr);
444 	device.set_name(*scissor_setup.buffer, "scissor-state");
445 
446 	static_raster_state = create_buffer(device, domain,
447 	                                    sizeof(StaticRasterizationState) * Limits::MaxStaticRasterizationStates,
448 	                                    borrow ? &borrow->static_raster_state : nullptr);
449 	device.set_name(*static_raster_state.buffer, "static-raster-state");
450 
451 	depth_blend_state = create_buffer(device, domain,
452 	                                  sizeof(DepthBlendState) * Limits::MaxDepthBlendStates,
453 	                                  borrow ? &borrow->depth_blend_state : nullptr);
454 	device.set_name(*depth_blend_state.buffer, "depth-blend-state");
455 
456 	tile_info_state = create_buffer(device, domain,
457 	                                sizeof(TileInfo) * Limits::MaxTileInfoStates,
458 	                                borrow ? &borrow->tile_info_state : nullptr);
459 	device.set_name(*tile_info_state.buffer, "tile-info-state");
460 
461 	state_indices = create_buffer(device, domain,
462 	                              sizeof(InstanceIndices) * Limits::MaxPrimitives,
463 	                              borrow ? &borrow->state_indices : nullptr);
464 	device.set_name(*state_indices.buffer, "state-indices");
465 
466 	span_info_offsets = create_buffer(device, domain,
467 	                                  sizeof(SpanInfoOffsets) * Limits::MaxPrimitives,
468 	                                  borrow ? &borrow->span_info_offsets : nullptr);
469 	device.set_name(*span_info_offsets.buffer, "span-info-offsets");
470 
471 	span_info_jobs = create_buffer(device, domain,
472 	                               sizeof(SpanInterpolationJob) * Limits::MaxSpanSetups,
473 	                               borrow ? &borrow->span_info_jobs : nullptr);
474 	device.set_name(*span_info_jobs.buffer, "span-info-jobs");
475 
476 	if (!borrow)
477 	{
478 		Vulkan::BufferViewCreateInfo info = {};
479 		info.buffer = span_info_jobs.buffer.get();
480 		info.format = VK_FORMAT_R16G16B16A16_UINT;
481 		info.range = span_info_jobs.buffer->get_create_info().size;
482 		span_info_jobs_view = device.create_buffer_view(info);
483 	}
484 }
485 
486 Renderer::MappedBuffer Renderer::RenderBuffers::create_buffer(
487 		Vulkan::Device &device, Vulkan::BufferDomain domain, VkDeviceSize size,
488 		Renderer::MappedBuffer *borrow)
489 {
490 	Vulkan::BufferCreateInfo info = {};
491 	info.domain = domain;
492 
493 	if (domain == Vulkan::BufferDomain::Device || domain == Vulkan::BufferDomain::LinkedDeviceHostPreferDevice)
494 	{
495 		info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
496 		             VK_BUFFER_USAGE_TRANSFER_DST_BIT |
497 		             VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
498 		             VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
499 	}
500 	else if (borrow && borrow->is_host)
501 	{
502 		return *borrow;
503 	}
504 	else
505 	{
506 		info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
507 	}
508 
509 	info.size = size;
510 	Renderer::MappedBuffer buffer;
511 	buffer.buffer = device.create_buffer(info);
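	// Record whether the allocation ended up host-mappable; a host-visible buffer can be borrowed directly by the CPU-side copy.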
512 	buffer.is_host = device.map_host_buffer(*buffer.buffer, 0) != nullptr;
513 	return buffer;
514 }
515 
516 void Renderer::RenderBuffersUpdater::init(Vulkan::Device &device)
517 {
518 	gpu.init(device, Vulkan::BufferDomain::LinkedDeviceHostPreferDevice, nullptr);
519 	cpu.init(device, Vulkan::BufferDomain::Host, &gpu);
520 }
521 
522 bool Renderer::init_internal_upscaling_factor(const RendererOptions &options)
523 {
524 	unsigned factor = options.upscaling_factor;
525 	if (!device || !rdram || !hidden_rdram)
526 	{
527 		LOGE("Renderer is not initialized.\n");
528 		return false;
529 	}
530 
531 	caps.upscaling = factor;
532 
533 	if (factor == 1)
534 	{
535 		upscaling_multisampled_hidden_rdram.reset();
536 		upscaling_reference_rdram.reset();
537 		upscaling_multisampled_rdram.reset();
538 		return true;
539 	}
540 
541 	Vulkan::BufferCreateInfo info;
542 	info.domain = Vulkan::BufferDomain::Device;
543 	info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
544 	info.misc = Vulkan::BUFFER_MISC_ZERO_INITIALIZE_BIT;
545 
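	// Keep an unscaled reference copy of RDRAM plus factor^2-sized multisampled copies of RDRAM and hidden RDRAM for upscaled rendering.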
546 	info.size = rdram_size;
547 	upscaling_reference_rdram = device->create_buffer(info);
548 	device->set_name(*upscaling_reference_rdram, "reference-rdram");
549 
550 	info.size = rdram_size * factor * factor;
551 	upscaling_multisampled_rdram = device->create_buffer(info);
552 	device->set_name(*upscaling_multisampled_rdram, "multisampled-rdram");
553 
554 	info.size = hidden_rdram->get_create_info().size * factor * factor;
555 	upscaling_multisampled_hidden_rdram = device->create_buffer(info);
556 	device->set_name(*upscaling_multisampled_hidden_rdram, "multisampled-hidden-rdram");
557 
558 	{
559 		auto cmd = device->request_command_buffer();
560 		cmd->fill_buffer(*upscaling_multisampled_hidden_rdram, 0x03030303);
561 		cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
562 		             VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
563 		             VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT);
564 		device->submit(cmd);
565 	}
566 
567 	return true;
568 }
569 
570 void Renderer::set_rdram(Vulkan::Buffer *buffer, uint8_t *host_rdram, size_t offset, size_t size, bool coherent)
571 {
572 	rdram = buffer;
573 	rdram_offset = offset;
574 	rdram_size = size;
575 	is_host_coherent = coherent;
576 	device->set_name(*rdram, "rdram");
577 
578 	if (!is_host_coherent)
579 	{
580 		assert(rdram_offset == 0);
581 		incoherent.host_rdram = host_rdram;
582 
583 		// If we're not host coherent (missing VK_EXT_external_memory_host),
584 		// we need to create a staging RDRAM buffer which is used for the real RDRAM uploads.
585 		// RDRAM may be uploaded in a masked way (if GPU has pending writes), or direct copy (if no pending writes are outstanding).
586 		Vulkan::BufferCreateInfo info = {};
587 		info.size = size;
588 		info.domain = Vulkan::BufferDomain::Host;
589 		info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
590 		incoherent.staging_rdram = device->create_buffer(info);
591 		device->set_name(*incoherent.staging_rdram, "staging-rdram");
592 
593 		const auto div_round_up = [](size_t a, size_t b) -> size_t { return (a + b - 1) / b; };
594 
595 		if (!rdram->get_allocation().is_host_allocation())
596 		{
597 			// If we cannot map RDRAM, we need a staging readback buffer.
598 			Vulkan::BufferCreateInfo readback_info = {};
599 			readback_info.domain = Vulkan::BufferDomain::CachedCoherentHostPreferCached;
600 			readback_info.size = rdram_size * Limits::NumSyncStates;
601 			readback_info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
602 			incoherent.staging_readback = device->create_buffer(readback_info);
603 			device->set_name(*incoherent.staging_readback, "staging-readback");
604 			incoherent.staging_readback_pages = div_round_up(readback_info.size, ImplementationConstants::IncoherentPageSize);
605 		}
606 
607 		incoherent.page_to_direct_copy.clear();
608 		incoherent.page_to_masked_copy.clear();
609 		incoherent.page_to_pending_readback.clear();
610 
611 		auto packed_pages = div_round_up(size, ImplementationConstants::IncoherentPageSize * 32);
612 		incoherent.num_pages = div_round_up(size, ImplementationConstants::IncoherentPageSize);
613 
614 		incoherent.page_to_direct_copy.resize(packed_pages);
615 		incoherent.page_to_masked_copy.resize(packed_pages);
616 		incoherent.page_to_pending_readback.resize(packed_pages);
617 		incoherent.pending_writes_for_page.reset(new std::atomic_uint32_t[incoherent.num_pages]);
618 		for (unsigned i = 0; i < incoherent.num_pages; i++)
619 			incoherent.pending_writes_for_page[i].store(0);
620 	}
621 	else
622 	{
623 		incoherent = {};
624 	}
625 }
626 
627 void Renderer::set_hidden_rdram(Vulkan::Buffer *buffer)
628 {
629 	hidden_rdram = buffer;
630 	device->set_name(*hidden_rdram, "hidden-rdram");
631 }
632 
633 void Renderer::set_tmem(Vulkan::Buffer *buffer)
634 {
635 	tmem = buffer;
636 	device->set_name(*tmem, "tmem");
637 }
638 
639 void Renderer::flush_and_signal()
640 {
641 	flush_queues();
642 	submit_to_queue();
643 	assert(!stream.cmd);
644 }
645 
646 void Renderer::set_color_framebuffer(uint32_t addr, uint32_t width, FBFormat fmt)
647 {
648 	if (fb.addr != addr || fb.width != width || fb.fmt != fmt)
649 		flush_queues();
650 
651 	fb.addr = addr;
652 	fb.width = width;
653 	fb.fmt = fmt;
654 }
655 
656 void Renderer::set_depth_framebuffer(uint32_t addr)
657 {
658 	if (fb.depth_addr != addr)
659 		flush_queues();
660 
661 	fb.depth_addr = addr;
662 }
663 
664 void Renderer::set_scissor_state(const ScissorState &state)
665 {
666 	stream.scissor_state = state;
667 }
668 
669 void Renderer::set_static_rasterization_state(const StaticRasterizationState &state)
670 {
671 	stream.static_raster_state = state;
672 }
673 
674 void Renderer::set_depth_blend_state(const DepthBlendState &state)
675 {
676 	stream.depth_blend_state = state;
677 }
678 
679 void Renderer::draw_flat_primitive(const TriangleSetup &setup)
680 {
681 	draw_shaded_primitive(setup, {});
682 }
683 
684 static int normalize_dzpix(int dz)
685 {
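	// Returns the smallest power of two strictly greater than dz, clamped to [1, 0x8000].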
686 	if (dz >= 0x8000)
687 		return 0x8000;
688 	else if (dz == 0)
689 		return 1;
690 
691 	unsigned bit = 31 - leading_zeroes(dz);
692 	return 1 << (bit + 1);
693 }
694 
695 static uint16_t dz_compress(int dz)
696 {
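	// Encodes the index of the highest set bit of the 16-bit dz value, i.e. a 4-bit floor(log2(dz)) representation.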
697 	int val = 0;
698 	if (dz & 0xff00)
699 		val |= 8;
700 	if (dz & 0xf0f0)
701 		val |= 4;
702 	if (dz & 0xcccc)
703 		val |= 2;
704 	if (dz & 0xaaaa)
705 		val |= 1;
706 	return uint16_t(val);
707 }
708 
709 static void encode_rgb(uint8_t *rgba, uint32_t color)
710 {
711 	rgba[0] = uint8_t(color >> 24);
712 	rgba[1] = uint8_t(color >> 16);
713 	rgba[2] = uint8_t(color >> 8);
714 }
715 
716 static void encode_alpha(uint8_t *rgba, uint32_t color)
717 {
718 	rgba[3] = uint8_t(color);
719 }
720 
721 void Renderer::build_combiner_constants(DerivedSetup &setup, unsigned cycle) const
722 {
723 	auto &comb = stream.static_raster_state.combiner[cycle];
724 	auto &output = setup.constants[cycle];
725 
726 	switch (comb.rgb.muladd)
727 	{
728 	case RGBMulAdd::Env:
729 		encode_rgb(output.muladd, constants.env_color);
730 		break;
731 
732 	case RGBMulAdd::Primitive:
733 		encode_rgb(output.muladd, constants.primitive_color);
734 		break;
735 
736 	default:
737 		break;
738 	}
739 
740 	switch (comb.rgb.mulsub)
741 	{
742 	case RGBMulSub::Env:
743 		encode_rgb(output.mulsub, constants.env_color);
744 		break;
745 
746 	case RGBMulSub::Primitive:
747 		encode_rgb(output.mulsub, constants.primitive_color);
748 		break;
749 
750 	case RGBMulSub::ConvertK4:
751 		// Need to decode this specially since it's a 9-bit value.
752 		encode_rgb(output.mulsub, uint32_t(constants.convert[4]) << 8);
753 		break;
754 
755 	case RGBMulSub::KeyCenter:
756 		output.mulsub[0] = constants.key_center[0];
757 		output.mulsub[1] = constants.key_center[1];
758 		output.mulsub[2] = constants.key_center[2];
759 		break;
760 
761 	default:
762 		break;
763 	}
764 
765 	switch (comb.rgb.mul)
766 	{
767 	case RGBMul::Primitive:
768 		encode_rgb(output.mul, constants.primitive_color);
769 		break;
770 
771 	case RGBMul::Env:
772 		encode_rgb(output.mul, constants.env_color);
773 		break;
774 
775 	case RGBMul::PrimitiveAlpha:
776 		encode_rgb(output.mul, 0x01010101 * ((constants.primitive_color) & 0xff));
777 		break;
778 
779 	case RGBMul::EnvAlpha:
780 		encode_rgb(output.mul, 0x01010101 * ((constants.env_color) & 0xff));
781 		break;
782 
783 	case RGBMul::PrimLODFrac:
784 		encode_rgb(output.mul, 0x01010101 * constants.prim_lod_frac);
785 		break;
786 
787 	case RGBMul::ConvertK5:
788 		// Need to decode this specially since it's a 9-bit value.
789 		encode_rgb(output.mul, uint32_t(constants.convert[5]) << 8);
790 		break;
791 
792 	case RGBMul::KeyScale:
793 		output.mul[0] = constants.key_scale[0];
794 		output.mul[1] = constants.key_scale[1];
795 		output.mul[2] = constants.key_scale[2];
796 		break;
797 
798 	default:
799 		break;
800 	}
801 
802 	switch (comb.rgb.add)
803 	{
804 	case RGBAdd::Primitive:
805 		encode_rgb(output.add, constants.primitive_color);
806 		break;
807 
808 	case RGBAdd::Env:
809 		encode_rgb(output.add, constants.env_color);
810 		break;
811 
812 	default:
813 		break;
814 	}
815 
816 	switch (comb.alpha.muladd)
817 	{
818 	case AlphaAddSub::PrimitiveAlpha:
819 		encode_alpha(output.muladd, constants.primitive_color);
820 		break;
821 
822 	case AlphaAddSub::EnvAlpha:
823 		encode_alpha(output.muladd, constants.env_color);
824 		break;
825 
826 	default:
827 		break;
828 	}
829 
830 	switch (comb.alpha.mulsub)
831 	{
832 	case AlphaAddSub::PrimitiveAlpha:
833 		encode_alpha(output.mulsub, constants.primitive_color);
834 		break;
835 
836 	case AlphaAddSub::EnvAlpha:
837 		encode_alpha(output.mulsub, constants.env_color);
838 		break;
839 
840 	default:
841 		break;
842 	}
843 
844 	switch (comb.alpha.mul)
845 	{
846 	case AlphaMul::PrimitiveAlpha:
847 		encode_alpha(output.mul, constants.primitive_color);
848 		break;
849 
850 	case AlphaMul::EnvAlpha:
851 		encode_alpha(output.mul, constants.env_color);
852 		break;
853 
854 	case AlphaMul::PrimLODFrac:
855 		encode_alpha(output.mul, constants.prim_lod_frac);
856 		break;
857 
858 	default:
859 		break;
860 	}
861 
862 	switch (comb.alpha.add)
863 	{
864 	case AlphaAddSub::PrimitiveAlpha:
865 		encode_alpha(output.add, constants.primitive_color);
866 		break;
867 
868 	case AlphaAddSub::EnvAlpha:
869 		encode_alpha(output.add, constants.env_color);
870 		break;
871 
872 	default:
873 		break;
874 	}
875 }
876 
877 DerivedSetup Renderer::build_derived_attributes(const AttributeSetup &attr) const
878 {
879 	DerivedSetup setup = {};
880 	if (constants.use_prim_depth)
881 	{
882 		setup.dz = constants.prim_dz;
883 		setup.dz_compressed = dz_compress(setup.dz);
884 	}
885 	else
886 	{
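		// Per-pixel delta-Z is the sum of the magnitudes of dz/dx and dz/dy (using one's-complement negation), then normalized.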
887 		int dzdx = attr.dzdx >> 16;
888 		int dzdy = attr.dzdy >> 16;
889 		int dzpix = (dzdx < 0 ? (~dzdx & 0x7fff) : dzdx) + (dzdy < 0 ? (~dzdy & 0x7fff) : dzdy);
890 		dzpix = normalize_dzpix(dzpix);
891 		setup.dz = dzpix;
892 		setup.dz_compressed = dz_compress(dzpix);
893 	}
894 
895 	build_combiner_constants(setup, 0);
896 	build_combiner_constants(setup, 1);
897 
898 	setup.fog_color[0] = uint8_t(constants.fog_color >> 24);
899 	setup.fog_color[1] = uint8_t(constants.fog_color >> 16);
900 	setup.fog_color[2] = uint8_t(constants.fog_color >> 8);
901 	setup.fog_color[3] = uint8_t(constants.fog_color >> 0);
902 
903 	setup.blend_color[0] = uint8_t(constants.blend_color >> 24);
904 	setup.blend_color[1] = uint8_t(constants.blend_color >> 16);
905 	setup.blend_color[2] = uint8_t(constants.blend_color >> 8);
906 	setup.blend_color[3] = uint8_t(constants.blend_color >> 0);
907 
908 	setup.fill_color = constants.fill_color;
909 	setup.min_lod = constants.min_level;
910 
911 	for (unsigned i = 0; i < 4; i++)
912 		setup.convert_factors[i] = int16_t(constants.convert[i]);
913 
914 	return setup;
915 }
916 
917 static constexpr unsigned SUBPIXELS_Y = 4;
918 
919 static std::pair<int, int> interpolate_x(const TriangleSetup &setup, int y, bool flip, int scaling)
920 {
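	// Step the triangle's edge equations (xh/xm/xl and their slopes) to sub-scanline y and return the left/right X extent, honoring the flip bit.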
921 	int yh_interpolation_base = setup.yh & ~(SUBPIXELS_Y - 1);
922 	int ym_interpolation_base = setup.ym;
923 	yh_interpolation_base *= scaling;
924 	ym_interpolation_base *= scaling;
925 
926 	int xh = scaling * setup.xh + (y - yh_interpolation_base) * setup.dxhdy;
927 	int xm = scaling * setup.xm + (y - yh_interpolation_base) * setup.dxmdy;
928 	int xl = scaling * setup.xl + (y - ym_interpolation_base) * setup.dxldy;
929 	if (y < scaling * setup.ym)
930 		xl = xm;
931 
932 	int xh_shifted = xh >> 15;
933 	int xl_shifted = xl >> 15;
934 
935 	int xleft, xright;
936 	if (flip)
937 	{
938 		xleft = xh_shifted;
939 		xright = xl_shifted;
940 	}
941 	else
942 	{
943 		xleft = xl_shifted;
944 		xright = xh_shifted;
945 	}
946 
947 	return { xleft, xright };
948 }
949 
950 unsigned Renderer::compute_conservative_max_num_tiles(const TriangleSetup &setup) const
951 {
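	// Conservatively bound how many binning tiles the primitive can touch after scissoring and upscaling; used to reserve per-tile shading work when the ubershader is disabled.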
952 	if (setup.yl <= setup.yh)
953 		return 0;
954 
955 	int scaling = int(caps.upscaling);
956 	int start_y = setup.yh & ~(SUBPIXELS_Y - 1);
957 	int end_y = (setup.yl - 1) | (SUBPIXELS_Y - 1);
958 
959 	start_y = std::max(int(stream.scissor_state.ylo), start_y);
960 	end_y = std::min(int(stream.scissor_state.yhi) - 1, end_y);
961 	start_y *= scaling;
962 	end_y *= scaling;
963 
964 	// Y is clipped out, exit early.
965 	if (end_y < start_y)
966 		return 0;
967 
968 	bool flip = (setup.flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
969 
970 	auto upper = interpolate_x(setup, start_y, flip, scaling);
971 	auto lower = interpolate_x(setup, end_y, flip, scaling);
972 	auto mid = upper;
973 	auto mid1 = upper;
974 
975 	int ym = scaling * setup.ym;
976 	if (ym > start_y && ym < end_y)
977 	{
978 		mid = interpolate_x(setup, ym, flip, scaling);
979 		mid1 = interpolate_x(setup, ym - 1, flip, scaling);
980 	}
981 
982 	int start_x = std::min(std::min(upper.first, lower.first), std::min(mid.first, mid1.first));
983 	int end_x = std::max(std::max(upper.second, lower.second), std::max(mid.second, mid1.second));
984 
985 	start_x = std::max(start_x, scaling * (int(stream.scissor_state.xlo) >> 2));
986 	end_x = std::min(end_x, scaling * ((int(stream.scissor_state.xhi) + 3) >> 2) - 1);
987 
988 	if (end_x < start_x)
989 		return 0;
990 
991 	start_x /= ImplementationConstants::TileWidth;
992 	end_x /= ImplementationConstants::TileWidth;
993 	start_y /= (SUBPIXELS_Y * ImplementationConstants::TileHeight);
994 	end_y /= (SUBPIXELS_Y * ImplementationConstants::TileHeight);
995 
996 	return (end_x - start_x + 1) * (end_y - start_y + 1);
997 }
998 
999 static bool combiner_accesses_texel0(const CombinerInputs &inputs)
1000 {
1001 	return inputs.rgb.muladd == RGBMulAdd::Texel0 ||
1002 	       inputs.rgb.mulsub == RGBMulSub::Texel0 ||
1003 	       inputs.rgb.mul == RGBMul::Texel0 ||
1004 	       inputs.rgb.add == RGBAdd::Texel0 ||
1005 	       inputs.rgb.mul == RGBMul::Texel0Alpha ||
1006 	       inputs.alpha.muladd == AlphaAddSub::Texel0Alpha ||
1007 	       inputs.alpha.mulsub == AlphaAddSub::Texel0Alpha ||
1008 	       inputs.alpha.mul == AlphaMul::Texel0Alpha ||
1009 	       inputs.alpha.add == AlphaAddSub::Texel0Alpha;
1010 }
1011 
1012 static bool combiner_accesses_lod_frac(const CombinerInputs &inputs)
1013 {
1014 	return inputs.rgb.mul == RGBMul::LODFrac || inputs.alpha.mul == AlphaMul::LODFrac;
1015 }
1016 
1017 static bool combiner_accesses_texel1(const CombinerInputs &inputs)
1018 {
1019 	return inputs.rgb.muladd == RGBMulAdd::Texel1 ||
1020 	       inputs.rgb.mulsub == RGBMulSub::Texel1 ||
1021 	       inputs.rgb.mul == RGBMul::Texel1 ||
1022 	       inputs.rgb.add == RGBAdd::Texel1 ||
1023 	       inputs.rgb.mul == RGBMul::Texel1Alpha ||
1024 	       inputs.alpha.muladd == AlphaAddSub::Texel1Alpha ||
1025 	       inputs.alpha.mulsub == AlphaAddSub::Texel1Alpha ||
1026 	       inputs.alpha.mul == AlphaMul::Texel1Alpha ||
1027 	       inputs.alpha.add == AlphaAddSub::Texel1Alpha;
1028 }
1029 
1030 static bool combiner_uses_texel0(const StaticRasterizationState &state)
1031 {
1032 	// Texel0 can be safely used in cycle0 of CYCLE2 mode, or in cycle1 (only cycle) of CYCLE1 mode.
1033 	if ((state.flags & RASTERIZATION_MULTI_CYCLE_BIT) != 0)
1034 	{
1035 		// In second cycle, Texel0 and Texel1 swap around ...
1036 		return combiner_accesses_texel0(state.combiner[0]) ||
1037 		       combiner_accesses_texel1(state.combiner[1]);
1038 	}
1039 	else
1040 		return combiner_accesses_texel0(state.combiner[1]);
1041 }
1042 
1043 static bool combiner_uses_texel1(const StaticRasterizationState &state)
1044 {
1045 	// Texel1 can be safely used in cycle0 of CYCLE2 mode, and never in cycle1 mode.
1046 	// Texel0 can be safely accessed in cycle1, which is an alias due to pipelining.
1047 	if ((state.flags & RASTERIZATION_MULTI_CYCLE_BIT) != 0)
1048 	{
1049 		return combiner_accesses_texel1(state.combiner[0]) ||
1050 		       combiner_accesses_texel0(state.combiner[1]);
1051 	}
1052 	else
1053 		return false;
1054 }
1055 
1056 static bool combiner_uses_pipelined_texel1(const StaticRasterizationState &state)
1057 {
1058 	// If you access Texel1 in cycle1 mode, you end up reading the next pixel's color for whatever reason.
1059 	if ((state.flags & RASTERIZATION_MULTI_CYCLE_BIT) == 0)
1060 		return combiner_accesses_texel1(state.combiner[1]);
1061 	else
1062 		return false;
1063 }
1064 
1065 static bool combiner_uses_lod_frac(const StaticRasterizationState &state)
1066 {
1067 	if ((state.flags & RASTERIZATION_MULTI_CYCLE_BIT) != 0)
1068 		return combiner_accesses_lod_frac(state.combiner[0]) || combiner_accesses_lod_frac(state.combiner[1]);
1069 	else
1070 		return false;
1071 }
1072 
1073 void Renderer::deduce_noise_state()
1074 {
1075 	auto &state = stream.static_raster_state;
1076 	state.flags &= ~RASTERIZATION_NEED_NOISE_BIT;
1077 
1078 	// Figure out if we need to seed noise variable for this primitive.
1079 	if ((state.dither & 3) == 2 || ((state.dither >> 2) & 3) == 2)
1080 	{
1081 		state.flags |= RASTERIZATION_NEED_NOISE_BIT;
1082 		return;
1083 	}
1084 
1085 	if ((state.flags & (RASTERIZATION_COPY_BIT | RASTERIZATION_FILL_BIT)) != 0)
1086 		return;
1087 
1088 	if ((state.flags & RASTERIZATION_MULTI_CYCLE_BIT) != 0)
1089 	{
1090 		if (state.combiner[0].rgb.muladd == RGBMulAdd::Noise)
1091 			state.flags |= RASTERIZATION_NEED_NOISE_BIT;
1092 	}
1093 	else if (state.combiner[1].rgb.muladd == RGBMulAdd::Noise)
1094 		state.flags |= RASTERIZATION_NEED_NOISE_BIT;
1095 
1096 	if ((state.flags & (RASTERIZATION_ALPHA_TEST_BIT | RASTERIZATION_ALPHA_TEST_DITHER_BIT)) ==
1097 	    (RASTERIZATION_ALPHA_TEST_BIT | RASTERIZATION_ALPHA_TEST_DITHER_BIT))
1098 	{
1099 		state.flags |= RASTERIZATION_NEED_NOISE_BIT;
1100 	}
1101 }
1102 
1103 static RGBMulAdd normalize_combiner(RGBMulAdd muladd)
1104 {
1105 	switch (muladd)
1106 	{
1107 	case RGBMulAdd::Noise:
1108 	case RGBMulAdd::Texel0:
1109 	case RGBMulAdd::Texel1:
1110 	case RGBMulAdd::Combined:
1111 	case RGBMulAdd::One:
1112 	case RGBMulAdd::Shade:
1113 		return muladd;
1114 
1115 	default:
1116 		return RGBMulAdd::Zero;
1117 	}
1118 }
1119 
1120 static RGBMulSub normalize_combiner(RGBMulSub mulsub)
1121 {
1122 	switch (mulsub)
1123 	{
1124 	case RGBMulSub::Combined:
1125 	case RGBMulSub::Texel0:
1126 	case RGBMulSub::Texel1:
1127 	case RGBMulSub::Shade:
1128 	case RGBMulSub::ConvertK4:
1129 		return mulsub;
1130 
1131 	default:
1132 		return RGBMulSub::Zero;
1133 	}
1134 }
1135 
1136 static RGBMul normalize_combiner(RGBMul mul)
1137 {
1138 	switch (mul)
1139 	{
1140 	case RGBMul::Combined:
1141 	case RGBMul::CombinedAlpha:
1142 	case RGBMul::Texel0:
1143 	case RGBMul::Texel1:
1144 	case RGBMul::Texel0Alpha:
1145 	case RGBMul::Texel1Alpha:
1146 	case RGBMul::Shade:
1147 	case RGBMul::ShadeAlpha:
1148 	case RGBMul::LODFrac:
1149 	case RGBMul::ConvertK5:
1150 		return mul;
1151 
1152 	default:
1153 		return RGBMul::Zero;
1154 	}
1155 }
1156 
1157 static RGBAdd normalize_combiner(RGBAdd add)
1158 {
1159 	switch (add)
1160 	{
1161 	case RGBAdd::Texel0:
1162 	case RGBAdd::Texel1:
1163 	case RGBAdd::Combined:
1164 	case RGBAdd::One:
1165 	case RGBAdd::Shade:
1166 		return add;
1167 
1168 	default:
1169 		return RGBAdd::Zero;
1170 	}
1171 }
1172 
1173 static AlphaAddSub normalize_combiner(AlphaAddSub addsub)
1174 {
1175 	switch (addsub)
1176 	{
1177 	case AlphaAddSub::CombinedAlpha:
1178 	case AlphaAddSub::Texel0Alpha:
1179 	case AlphaAddSub::Texel1Alpha:
1180 	case AlphaAddSub::ShadeAlpha:
1181 	case AlphaAddSub::One:
1182 		return addsub;
1183 
1184 	default:
1185 		return AlphaAddSub::Zero;
1186 	}
1187 }
1188 
1189 static AlphaMul normalize_combiner(AlphaMul mul)
1190 {
1191 	switch (mul)
1192 	{
1193 	case AlphaMul::LODFrac:
1194 	case AlphaMul::Texel0Alpha:
1195 	case AlphaMul::Texel1Alpha:
1196 	case AlphaMul::ShadeAlpha:
1197 		return mul;
1198 
1199 	default:
1200 		return AlphaMul::Zero;
1201 	}
1202 }
1203 
1204 static void normalize_combiner(CombinerInputsRGB &comb)
1205 {
1206 	comb.muladd = normalize_combiner(comb.muladd);
1207 	comb.mulsub = normalize_combiner(comb.mulsub);
1208 	comb.mul = normalize_combiner(comb.mul);
1209 	comb.add = normalize_combiner(comb.add);
1210 }
1211 
1212 static void normalize_combiner(CombinerInputsAlpha &comb)
1213 {
1214 	comb.muladd = normalize_combiner(comb.muladd);
1215 	comb.mulsub = normalize_combiner(comb.mulsub);
1216 	comb.mul = normalize_combiner(comb.mul);
1217 	comb.add = normalize_combiner(comb.add);
1218 }
1219 
1220 static void normalize_combiner(CombinerInputs &comb)
1221 {
1222 	normalize_combiner(comb.rgb);
1223 	normalize_combiner(comb.alpha);
1224 }
1225 
1226 StaticRasterizationState Renderer::normalize_static_state(StaticRasterizationState state)
1227 {
1228 	if ((state.flags & RASTERIZATION_FILL_BIT) != 0)
1229 	{
1230 		state = {};
1231 		state.flags = RASTERIZATION_FILL_BIT;
1232 		return state;
1233 	}
1234 
1235 	if ((state.flags & RASTERIZATION_COPY_BIT) != 0)
1236 	{
1237 		auto flags = state.flags &
1238 		             (RASTERIZATION_COPY_BIT |
1239 		              RASTERIZATION_TLUT_BIT |
1240 		              RASTERIZATION_TLUT_TYPE_BIT |
1241 		              RASTERIZATION_USES_TEXEL0_BIT |
1242 		              RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT |
1243 		              RASTERIZATION_TEX_LOD_ENABLE_BIT |
1244 		              RASTERIZATION_DETAIL_LOD_ENABLE_BIT |
1245 		              RASTERIZATION_ALPHA_TEST_BIT);
1246 
1247 		auto fmt = state.texture_fmt;
1248 		auto siz = state.texture_size;
1249 		state = {};
1250 		state.flags = flags;
1251 		state.texture_fmt = fmt;
1252 		state.texture_size = siz;
1253 		return state;
1254 	}
1255 
1256 	if ((state.flags & RASTERIZATION_MULTI_CYCLE_BIT) == 0)
1257 		state.flags &= ~(RASTERIZATION_BILERP_1_BIT | RASTERIZATION_CONVERT_ONE_BIT);
1258 
1259 	normalize_combiner(state.combiner[0]);
1260 	normalize_combiner(state.combiner[1]);
1261 	return state;
1262 }
1263 
1264 void Renderer::deduce_static_texture_state(unsigned tile, unsigned max_lod_level)
1265 {
1266 	auto &state = stream.static_raster_state;
1267 	state.flags &= ~RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT;
1268 	state.texture_size = 0;
1269 	state.texture_fmt = 0;
1270 
1271 	if ((state.flags & RASTERIZATION_FILL_BIT) != 0)
1272 		return;
1273 
1274 	auto fmt = tiles[tile].meta.fmt;
1275 	auto siz = tiles[tile].meta.size;
1276 
1277 	if ((state.flags & RASTERIZATION_COPY_BIT) == 0)
1278 	{
1279 		// If all tiles we sample have the same fmt and size (common case), we can use a static variant.
1280 		bool uses_texel0 = combiner_uses_texel0(state);
1281 		bool uses_texel1 = combiner_uses_texel1(state);
1282 		bool uses_pipelined_texel1 = combiner_uses_pipelined_texel1(state);
1283 		bool uses_lod_frac = combiner_uses_lod_frac(state);
1284 
1285 		if (uses_texel1 && (state.flags & RASTERIZATION_CONVERT_ONE_BIT) != 0)
1286 			uses_texel0 = true;
1287 
1288 		state.flags &= ~(RASTERIZATION_USES_TEXEL0_BIT |
1289 		                 RASTERIZATION_USES_TEXEL1_BIT |
1290 		                 RASTERIZATION_USES_PIPELINED_TEXEL1_BIT |
1291 		                 RASTERIZATION_USES_LOD_BIT);
1292 		if (uses_texel0)
1293 			state.flags |= RASTERIZATION_USES_TEXEL0_BIT;
1294 		if (uses_texel1)
1295 			state.flags |= RASTERIZATION_USES_TEXEL1_BIT;
1296 		if (uses_pipelined_texel1)
1297 			state.flags |= RASTERIZATION_USES_PIPELINED_TEXEL1_BIT;
1298 		if (uses_lod_frac || (state.flags & RASTERIZATION_TEX_LOD_ENABLE_BIT) != 0)
1299 			state.flags |= RASTERIZATION_USES_LOD_BIT;
1300 
1301 		if (!uses_texel0 && !uses_texel1 && !uses_pipelined_texel1)
1302 			return;
1303 
1304 		bool use_lod = (state.flags & RASTERIZATION_TEX_LOD_ENABLE_BIT) != 0;
1305 		bool use_detail = (state.flags & RASTERIZATION_DETAIL_LOD_ENABLE_BIT) != 0;
1306 
1307 		bool uses_physical_texel1 = uses_texel1 &&
1308 		                            ((state.flags & RASTERIZATION_CONVERT_ONE_BIT) == 0 ||
1309 		                             (state.flags & RASTERIZATION_BILERP_1_BIT) != 0);
1310 
1311 		if (!use_lod)
1312 			max_lod_level = uses_physical_texel1 ? 1 : 0;
1313 		if (use_detail)
1314 			max_lod_level++;
1315 		max_lod_level = std::min(max_lod_level, 7u);
1316 
1317 		for (unsigned i = 1; i <= max_lod_level; i++)
1318 		{
1319 			auto &t = tiles[(tile + i) & 7].meta;
1320 			if (t.fmt != fmt)
1321 				return;
1322 			if (t.size != siz)
1323 				return;
1324 		}
1325 	}
1326 
1327 	// We have a static format.
1328 	state.flags |= RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT;
1329 	state.texture_fmt = uint32_t(fmt);
1330 	state.texture_size = uint32_t(siz);
1331 }
1332 
1333 void Renderer::draw_shaded_primitive(const TriangleSetup &setup, const AttributeSetup &attr)
1334 {
1335 	unsigned num_tiles = compute_conservative_max_num_tiles(setup);
1336 
1337 #if 0
1338 	// Don't exit early, throws off seeding of noise channels.
1339 	if (!num_tiles)
1340 		return;
1341 #endif
1342 
1343 	if (!caps.ubershader)
1344 		stream.max_shaded_tiles += num_tiles;
1345 
1346 	update_deduced_height(setup);
1347 	stream.span_info_offsets.add(allocate_span_jobs(setup));
1348 
1349 	if ((stream.static_raster_state.flags & RASTERIZATION_INTERLACE_FIELD_BIT) != 0)
1350 	{
1351 		auto tmp = setup;
1352 		tmp.flags |= (stream.static_raster_state.flags & RASTERIZATION_INTERLACE_FIELD_BIT) ?
1353 				TRIANGLE_SETUP_INTERLACE_FIELD_BIT : 0;
1354 		tmp.flags |= (stream.static_raster_state.flags & RASTERIZATION_INTERLACE_KEEP_ODD_BIT) ?
1355 				TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT : 0;
1356 		stream.triangle_setup.add(tmp);
1357 	}
1358 	else
1359 		stream.triangle_setup.add(setup);
1360 
1361 	if (constants.use_prim_depth)
1362 	{
1363 		auto tmp_attr = attr;
1364 		tmp_attr.z = constants.prim_depth;
1365 		tmp_attr.dzdx = 0;
1366 		tmp_attr.dzde = 0;
1367 		tmp_attr.dzdy = 0;
1368 		stream.attribute_setup.add(tmp_attr);
1369 	}
1370 	else
1371 	{
1372 		stream.attribute_setup.add(attr);
1373 	}
1374 
1375 	stream.derived_setup.add(build_derived_attributes(attr));
1376 	stream.scissor_setup.add(stream.scissor_state);
1377 
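	// setup.tile packs the base tile index in the low 3 bits and the maximum LOD level in the upper bits.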
1378 	deduce_static_texture_state(setup.tile & 7, setup.tile >> 3);
1379 	deduce_noise_state();
1380 
1381 	InstanceIndices indices = {};
1382 	indices.static_index = stream.static_raster_state_cache.add(normalize_static_state(stream.static_raster_state));
1383 	indices.depth_blend_index = stream.depth_blend_state_cache.add(stream.depth_blend_state);
1384 	indices.tile_instance_index = uint8_t(stream.tmem_upload_infos.size());
1385 	for (unsigned i = 0; i < 8; i++)
1386 		indices.tile_indices[i] = stream.tile_info_state_cache.add(tiles[i]);
1387 	stream.state_indices.add(indices);
1388 
1389 	fb.color_write_pending = true;
1390 	if (stream.depth_blend_state.flags & DEPTH_BLEND_DEPTH_UPDATE_BIT)
1391 		fb.depth_write_pending = true;
1392 	pending_primitives++;
1393 
1394 	if (need_flush())
1395 		flush_queues();
1396 }
1397 
1398 SpanInfoOffsets Renderer::allocate_span_jobs(const TriangleSetup &setup)
1399 {
1400 	int min_active_sub_scanline = std::max(int(setup.yh), int(stream.scissor_state.ylo));
1401 	int min_active_line = min_active_sub_scanline >> 2;
1402 
1403 	int max_active_sub_scanline = std::min(setup.yl - 1, int(stream.scissor_state.yhi) - 1);
1404 	int max_active_line = max_active_sub_scanline >> 2;
1405 
1406 	if (max_active_line < min_active_line)
1407 		return { 0, 0, -1, 0 };
1408 
1409 	// Need to poke into next scanline validation for certain workarounds.
1410 	int height = std::max(max_active_line - min_active_line + 2, 0);
1411 	height = std::min(height, 1024);
1412 
1413 	int num_jobs = (height + ImplementationConstants::DefaultWorkgroupSize - 1) / ImplementationConstants::DefaultWorkgroupSize;
1414 
1415 	SpanInfoOffsets offsets = {};
1416 	offsets.offset = uint32_t(stream.span_info_jobs.size()) * ImplementationConstants::DefaultWorkgroupSize;
1417 	offsets.ylo = min_active_line;
1418 	offsets.yhi = max_active_line;
1419 
1420 	for (int i = 0; i < num_jobs; i++)
1421 	{
1422 		SpanInterpolationJob interpolation_job = {};
1423 		interpolation_job.primitive_index = uint32_t(stream.triangle_setup.size());
1424 		interpolation_job.base_y = min_active_line + ImplementationConstants::DefaultWorkgroupSize * i;
1425 		interpolation_job.max_y = max_active_line + 1;
1426 		stream.span_info_jobs.add(interpolation_job);
1427 	}
1428 	return offsets;
1429 }
1430 
1431 void Renderer::update_deduced_height(const TriangleSetup &setup)
1432 {
1433 	int max_active_sub_scanline = std::min(setup.yl - 1, int(stream.scissor_state.yhi) - 1);
1434 	int max_active_line = max_active_sub_scanline >> 2;
1435 	int height = std::max(max_active_line + 1, 0);
1436 	fb.deduced_height = std::max(fb.deduced_height, uint32_t(height));
1437 }
1438 
1439 bool Renderer::need_flush() const
1440 {
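	// Leave headroom for the up to 8 tile descriptors the next primitive may add.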
1441 	bool cache_full =
1442 			stream.static_raster_state_cache.full() ||
1443 			stream.depth_blend_state_cache.full() ||
1444 			(stream.tile_info_state_cache.size() + 8 > Limits::MaxTileInfoStates);
1445 
1446 	bool triangle_full =
1447 			stream.triangle_setup.full();
1448 	bool span_info_full =
1449 			(stream.span_info_jobs.size() * ImplementationConstants::DefaultWorkgroupSize + Limits::MaxHeight > Limits::MaxSpanSetups);
1450 	bool max_shaded_tiles =
1451 			(stream.max_shaded_tiles + caps.max_tiles_x * caps.max_tiles_y > caps.max_num_tile_instances);
1452 
1453 #ifdef VULKAN_DEBUG
1454 	if (cache_full)
1455 		LOGI("Cache is full.\n");
1456 	if (triangle_full)
1457 		LOGI("Triangle is full.\n");
1458 	if (span_info_full)
1459 		LOGI("Span info is full.\n");
1460 	if (max_shaded_tiles)
1461 		LOGI("Shaded tiles is full.\n");
1462 #endif
1463 
1464 	return cache_full || triangle_full || span_info_full || max_shaded_tiles;
1465 }
1466 
1467 template <typename Cache>
1468 void Renderer::RenderBuffersUpdater::upload(Vulkan::CommandBuffer &cmd, Vulkan::Device &device,
1469                                             const MappedBuffer &gpu, const MappedBuffer &cpu, const Cache &cache,
1470                                             bool &did_upload)
1471 {
1472 	if (!cache.empty())
1473 	{
1474 		memcpy(device.map_host_buffer(*cpu.buffer, Vulkan::MEMORY_ACCESS_WRITE_BIT), cache.data(), cache.byte_size());
1475 		device.unmap_host_buffer(*cpu.buffer, Vulkan::MEMORY_ACCESS_WRITE_BIT);
1476 		if (gpu.buffer != cpu.buffer)
1477 		{
1478 			cmd.copy_buffer(*gpu.buffer, 0, *cpu.buffer, 0, cache.byte_size());
1479 			did_upload = true;
1480 		}
1481 	}
1482 }
1483 
1484 void Renderer::RenderBuffersUpdater::upload(Vulkan::Device &device, const Renderer::StreamCaches &caches,
1485                                             Vulkan::CommandBuffer &cmd)
1486 {
1487 	bool did_upload = false;
1488 
1489 	upload(cmd, device, gpu.triangle_setup, cpu.triangle_setup, caches.triangle_setup, did_upload);
1490 	upload(cmd, device, gpu.attribute_setup, cpu.attribute_setup, caches.attribute_setup, did_upload);
1491 	upload(cmd, device, gpu.derived_setup, cpu.derived_setup, caches.derived_setup, did_upload);
1492 	upload(cmd, device, gpu.scissor_setup, cpu.scissor_setup, caches.scissor_setup, did_upload);
1493 
1494 	upload(cmd, device, gpu.static_raster_state, cpu.static_raster_state, caches.static_raster_state_cache, did_upload);
1495 	upload(cmd, device, gpu.depth_blend_state, cpu.depth_blend_state, caches.depth_blend_state_cache, did_upload);
1496 	upload(cmd, device, gpu.tile_info_state, cpu.tile_info_state, caches.tile_info_state_cache, did_upload);
1497 
1498 	upload(cmd, device, gpu.state_indices, cpu.state_indices, caches.state_indices, did_upload);
1499 	upload(cmd, device, gpu.span_info_offsets, cpu.span_info_offsets, caches.span_info_offsets, did_upload);
1500 	upload(cmd, device, gpu.span_info_jobs, cpu.span_info_jobs, caches.span_info_jobs, did_upload);
1501 
1502 	if (did_upload)
1503 	{
1504 		cmd.barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
1505 		            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
1506 	}
1507 }
1508 
1509 void Renderer::update_tmem_instances(Vulkan::CommandBuffer &cmd)
1510 {
1511 	cmd.begin_region("tmem-update");
1512 	cmd.set_storage_buffer(0, 0, *rdram, rdram_offset, rdram_size);
1513 	cmd.set_storage_buffer(0, 1, *tmem);
1514 	cmd.set_storage_buffer(0, 2, *tmem_instances);
1515 
1516 	memcpy(cmd.allocate_typed_constant_data<UploadInfo>(1, 0, stream.tmem_upload_infos.size()),
1517 	       stream.tmem_upload_infos.data(),
1518 	       stream.tmem_upload_infos.size() * sizeof(UploadInfo));
1519 
1520 	auto count = uint32_t(stream.tmem_upload_infos.size());
1521 
1522 #ifdef PARALLEL_RDP_SHADER_DIR
1523 	cmd.set_program("rdp://tmem_update.comp", {{ "DEBUG_ENABLE", debug_channel ? 1 : 0 }});
1524 #else
1525 	cmd.set_program(shader_bank->tmem_update);
1526 #endif
1527 
1528 	cmd.push_constants(&count, 0, sizeof(count));
1529 	cmd.set_specialization_constant_mask(1);
1530 	cmd.set_specialization_constant(0, ImplementationConstants::DefaultWorkgroupSize);
1531 
1532 	Vulkan::QueryPoolHandle start_ts, end_ts;
1533 	if (caps.timestamp >= 2)
1534 		start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1535 	cmd.dispatch(2048 / ImplementationConstants::DefaultWorkgroupSize, 1, 1);
1536 	if (caps.timestamp >= 2)
1537 	{
1538 		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1539 		device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts),
1540 		                               "tmem-update", std::to_string(stream.tmem_upload_infos.size()));
1541 	}
1542 	cmd.end_region();
1543 }
1544 
void Renderer::submit_span_setup_jobs(Vulkan::CommandBuffer &cmd, bool upscale)
1546 {
1547 	cmd.begin_region("span-setup");
1548 	auto &instance = buffer_instances[buffer_instance];
1549 	cmd.set_storage_buffer(0, 0, *instance.gpu.triangle_setup.buffer);
1550 	cmd.set_storage_buffer(0, 1, *instance.gpu.attribute_setup.buffer);
1551 	cmd.set_storage_buffer(0, 2, *instance.gpu.scissor_setup.buffer);
1552 	cmd.set_storage_buffer(0, 3, *span_setups);
1553 
1554 #ifdef PARALLEL_RDP_SHADER_DIR
1555 	cmd.set_program("rdp://span_setup.comp", {{ "DEBUG_ENABLE", debug_channel ? 1 : 0 }});
1556 #else
1557 	cmd.set_program(shader_bank->span_setup);
1558 #endif
1559 
1560 	cmd.set_buffer_view(1, 0, *instance.gpu.span_info_jobs_view);
1561 	cmd.set_specialization_constant_mask(3);
1562 	cmd.set_specialization_constant(0, (upscale ? caps.upscaling : 1) * ImplementationConstants::DefaultWorkgroupSize);
1563 	cmd.set_specialization_constant(1, upscale ? trailing_zeroes(caps.upscaling) : 0u);
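	// Spec constant 0 is the workgroup size scaled by the upscaling factor, and
	// spec constant 1 is log2 of that factor (trailing_zeroes assumes a power-of-two
	// upscaling factor). Presumably this lets the shader derive upscaled span
	// coordinates with shifts instead of multiplies.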
1564 
1565 	Vulkan::QueryPoolHandle begin_ts, end_ts;
1566 	if (caps.timestamp >= 2)
1567 		begin_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1568 	cmd.dispatch(stream.span_info_jobs.size(), 1, 1);
1569 	if (caps.timestamp >= 2)
1570 	{
1571 		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1572 		device->register_time_interval("RDP GPU", std::move(begin_ts), std::move(end_ts), "span-info-jobs");
1573 	}
1574 	cmd.end_region();
1575 }
1576 
void Renderer::clear_indirect_buffer(Vulkan::CommandBuffer &cmd)
1578 {
1579 	cmd.begin_region("clear-indirect-buffer");
1580 
1581 #ifdef PARALLEL_RDP_SHADER_DIR
1582 	cmd.set_program("rdp://clear_indirect_buffer.comp");
1583 #else
1584 	cmd.set_program(shader_bank->clear_indirect_buffer);
1585 #endif
1586 
1587 	cmd.set_storage_buffer(0, 0, *indirect_dispatch_buffer);
1588 
1589 	static_assert((Limits::MaxStaticRasterizationStates % ImplementationConstants::DefaultWorkgroupSize) == 0, "MaxStaticRasterizationStates does not align.");
1590 	cmd.set_specialization_constant_mask(1);
1591 	cmd.set_specialization_constant(0, ImplementationConstants::DefaultWorkgroupSize);
1592 	cmd.dispatch(Limits::MaxStaticRasterizationStates / ImplementationConstants::DefaultWorkgroupSize, 1, 1);
1593 	cmd.end_region();
1594 }
1595 
void Renderer::submit_rasterization(Vulkan::CommandBuffer &cmd, Vulkan::Buffer &tmem, bool upscaling)
1597 {
1598 	cmd.begin_region("rasterization");
1599 	auto &instance = buffer_instances[buffer_instance];
1600 
1601 	cmd.set_storage_buffer(0, 0, *instance.gpu.triangle_setup.buffer);
1602 	cmd.set_storage_buffer(0, 1, *instance.gpu.attribute_setup.buffer);
1603 	cmd.set_storage_buffer(0, 2, *instance.gpu.derived_setup.buffer);
1604 	cmd.set_storage_buffer(0, 3, *instance.gpu.static_raster_state.buffer);
1605 	cmd.set_storage_buffer(0, 4, *instance.gpu.state_indices.buffer);
1606 	cmd.set_storage_buffer(0, 5, *instance.gpu.span_info_offsets.buffer);
1607 	cmd.set_storage_buffer(0, 6, *span_setups);
1608 	cmd.set_storage_buffer(0, 7, tmem);
1609 	cmd.set_storage_buffer(0, 8, *instance.gpu.tile_info_state.buffer);
1610 
1611 	cmd.set_storage_buffer(0, 9, *per_tile_shaded_color);
1612 	cmd.set_storage_buffer(0, 10, *per_tile_shaded_depth);
1613 	cmd.set_storage_buffer(0, 11, *per_tile_shaded_shaded_alpha);
1614 	cmd.set_storage_buffer(0, 12, *per_tile_shaded_coverage);
1615 
1616 	auto *global_fb_info = cmd.allocate_typed_constant_data<GlobalFBInfo>(2, 0, 1);
1617 	switch (fb.fmt)
1618 	{
1619 	case FBFormat::I4:
1620 		global_fb_info->fb_size = 0;
1621 		global_fb_info->dx_mask = 0;
1622 		global_fb_info->dx_shift = 0;
1623 		break;
1624 
1625 	case FBFormat::I8:
1626 		global_fb_info->fb_size = 1;
1627 		global_fb_info->dx_mask = ~7u;
1628 		global_fb_info->dx_shift = 3;
1629 		break;
1630 
1631 	case FBFormat::RGBA5551:
1632 	case FBFormat::IA88:
1633 		global_fb_info->fb_size = 2;
1634 		global_fb_info->dx_mask = ~3u;
1635 		global_fb_info->dx_shift = 2;
1636 		break;
1637 
1638 	case FBFormat::RGBA8888:
1639 		global_fb_info->fb_size = 4;
		global_fb_info->dx_mask = ~1u;
1641 		global_fb_info->dx_shift = 1;
1642 		break;
1643 	}
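	// The dx_mask / dx_shift pairs appear to describe how many pixels share one
	// 64-bit RDRAM word per format: 8 pixels for I8 (shift 3), 4 for the 16-bit
	// formats (shift 2) and 2 for RGBA8888 (shift 1), mirroring the same table in
	// submit_depth_blend() below.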
1644 
1645 	global_fb_info->base_primitive_index = base_primitive_index;
1646 
1647 #ifdef PARALLEL_RDP_SHADER_DIR
1648 	cmd.set_program("rdp://rasterizer.comp", {
1649 		{ "DEBUG_ENABLE", debug_channel ? 1 : 0 },
1650 		{ "SMALL_TYPES", caps.supports_small_integer_arithmetic ? 1 : 0 },
1651 	});
1652 #else
1653 	cmd.set_program(shader_bank->rasterizer);
1654 #endif
1655 
1656 	cmd.set_specialization_constant(0, ImplementationConstants::TileWidth);
1657 	cmd.set_specialization_constant(1, ImplementationConstants::TileHeight);
1658 
1659 	Vulkan::QueryPoolHandle start_ts, end_ts;
1660 	if (caps.timestamp >= 2)
1661 		start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1662 
1663 	uint32_t scale_log2_bit = (upscaling ? trailing_zeroes(caps.upscaling) : 0u) << RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET;
1664 
1665 	for (size_t i = 0; i < stream.static_raster_state_cache.size(); i++)
1666 	{
1667 		cmd.set_storage_buffer(1, 0, *tile_work_list,
1668 		                       i * sizeof(TileRasterWork) * caps.max_num_tile_instances,
1669 		                       sizeof(TileRasterWork) * caps.max_num_tile_instances);
1670 
1671 		auto &state = stream.static_raster_state_cache.data()[i];
1672 		cmd.set_specialization_constant(2, state.flags | RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT | scale_log2_bit);
1673 		cmd.set_specialization_constant(3, state.combiner[0].rgb);
1674 		cmd.set_specialization_constant(4, state.combiner[0].alpha);
1675 		cmd.set_specialization_constant(5, state.combiner[1].rgb);
1676 		cmd.set_specialization_constant(6, state.combiner[1].alpha);
1677 
1678 		cmd.set_specialization_constant(7, state.dither |
1679 		                                   (state.texture_size << 8u) |
1680 		                                   (state.texture_fmt << 16u));
1681 		cmd.set_specialization_constant_mask(0xff);
1682 
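		// If the fully specialized pipeline is not compiled yet (and we are not forced
		// to sync), queue it on the pipeline worker thread and fall back to a generic
		// variant for this dispatch: only spec constants 0-2 stay active (mask 7) and
		// constant 2 drops RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT, so the shader
		// presumably reads the rasterization state from buffers instead. Later dispatches
		// can pick up the specialized pipeline once the async compile completes.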
1683 		if (!caps.force_sync && !cmd.flush_pipeline_state_without_blocking())
1684 		{
1685 			Vulkan::DeferredPipelineCompile compile;
1686 			cmd.extract_pipeline_state(compile);
1687 			if (pending_async_pipelines.count(compile.hash) == 0)
1688 			{
1689 				pending_async_pipelines.insert(compile.hash);
1690 				pipeline_worker->push(std::move(compile));
1691 			}
1692 			cmd.set_specialization_constant_mask(7);
1693 			cmd.set_specialization_constant(2, scale_log2_bit);
1694 		}
1695 
1696 		cmd.dispatch_indirect(*indirect_dispatch_buffer, 4 * sizeof(uint32_t) * i);
1697 	}
1698 
1699 	if (caps.timestamp >= 2)
1700 	{
1701 		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1702 		device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts), "shading");
1703 	}
1704 	cmd.end_region();
1705 }
1706 
void Renderer::submit_tile_binning_combined(Vulkan::CommandBuffer &cmd, bool upscale)
1708 {
1709 	cmd.begin_region("tile-binning-combined");
1710 	auto &instance = buffer_instances[buffer_instance];
1711 	cmd.set_storage_buffer(0, 0, *instance.gpu.triangle_setup.buffer);
1712 	cmd.set_storage_buffer(0, 1, *instance.gpu.scissor_setup.buffer);
1713 	cmd.set_storage_buffer(0, 2, *instance.gpu.state_indices.buffer);
1714 	cmd.set_storage_buffer(0, 3, *tile_binning_buffer);
1715 	cmd.set_storage_buffer(0, 4, *tile_binning_buffer_coarse);
1716 
1717 	if (!caps.ubershader)
1718 	{
1719 		cmd.set_storage_buffer(0, 5, *per_tile_offsets);
1720 		cmd.set_storage_buffer(0, 6, *indirect_dispatch_buffer);
1721 		cmd.set_storage_buffer(0, 7, *tile_work_list);
1722 	}
1723 
1724 	cmd.set_specialization_constant_mask(0x7f);
1725 	cmd.set_specialization_constant(1, ImplementationConstants::TileWidth);
1726 	cmd.set_specialization_constant(2, ImplementationConstants::TileHeight);
1727 	cmd.set_specialization_constant(3, Limits::MaxPrimitives);
1728 	cmd.set_specialization_constant(4, upscale ? caps.max_width : Limits::MaxWidth);
1729 	cmd.set_specialization_constant(5, caps.max_num_tile_instances);
1730 	cmd.set_specialization_constant(6, upscale ? caps.upscaling : 1u);
1731 
1732 	struct PushData
1733 	{
1734 		uint32_t width, height;
1735 		uint32_t num_primitives;
1736 	} push = {};
1737 	push.width = fb.width;
1738 	push.height = fb.deduced_height;
1739 
1740 	if (upscale)
1741 	{
1742 		push.width *= caps.upscaling;
1743 		push.height *= caps.upscaling;
1744 	}
1745 
1746 	push.num_primitives = uint32_t(stream.triangle_setup.size());
1747 	unsigned num_primitives_32 = (push.num_primitives + 31) / 32;
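	// Primitives are binned in groups of 32 so they can be tracked with 32-bit masks;
	// e.g. 100 primitives round up to 4 groups here.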
1748 
1749 	cmd.push_constants(&push, 0, sizeof(push));
1750 
1751 	auto &features = device->get_device_features();
1752 	uint32_t subgroup_size = features.subgroup_properties.subgroupSize;
1753 
1754 	Vulkan::QueryPoolHandle start_ts, end_ts;
1755 	if (caps.timestamp >= 2)
1756 		start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1757 
1758 	if (caps.subgroup_tile_binning)
1759 	{
1760 #ifdef PARALLEL_RDP_SHADER_DIR
1761 		cmd.set_program("rdp://tile_binning_combined.comp", {
1762 			{ "DEBUG_ENABLE", debug_channel ? 1 : 0 },
1763 			{ "SUBGROUP", 1 },
1764 			{ "UBERSHADER", int(caps.ubershader) },
1765 			{ "SMALL_TYPES", caps.supports_small_integer_arithmetic ? 1 : 0 },
1766 		});
1767 #else
1768 		cmd.set_program(shader_bank->tile_binning_combined);
1769 #endif
1770 
1771 		if (supports_subgroup_size_control(32, subgroup_size))
1772 		{
1773 			cmd.enable_subgroup_size_control(true);
1774 			cmd.set_subgroup_size_log2(true, 5, trailing_zeroes(subgroup_size));
1775 		}
1776 	}
1777 	else
1778 	{
1779 #ifdef PARALLEL_RDP_SHADER_DIR
1780 		cmd.set_program("rdp://tile_binning_combined.comp", {
1781 			{ "DEBUG_ENABLE", debug_channel ? 1 : 0 },
1782 			{ "SUBGROUP", 0 },
1783 			{ "UBERSHADER", int(caps.ubershader) },
1784 			{ "SMALL_TYPES", caps.supports_small_integer_arithmetic ? 1 : 0 },
1785 		});
1786 #else
1787 		cmd.set_program(shader_bank->tile_binning_combined);
1788 #endif
1789 
1790 		subgroup_size = 32;
1791 	}
1792 
1793 	cmd.set_specialization_constant(0, subgroup_size);
1794 	unsigned meta_tiles_x = 8;
1795 	unsigned meta_tiles_y = subgroup_size / meta_tiles_x;
1796 	unsigned num_tiles_x = (push.width + ImplementationConstants::TileWidth - 1) / ImplementationConstants::TileWidth;
1797 	unsigned num_tiles_y = (push.height + ImplementationConstants::TileHeight - 1) / ImplementationConstants::TileHeight;
1798 	unsigned num_meta_tiles_x = (num_tiles_x + meta_tiles_x - 1) / meta_tiles_x;
1799 	unsigned num_meta_tiles_y = (num_tiles_y + meta_tiles_y - 1) / meta_tiles_y;
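	// Each meta tile is an 8 x (subgroup_size / 8) block of tiles, presumably so a
	// single subgroup can bin one meta tile. As a rough example, assuming 8x8 pixel
	// tiles and a subgroup size of 32, a 320x240 frame buffer gives 40x30 tiles and
	// therefore a 5x8 grid of meta tiles to dispatch.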
1800 	cmd.dispatch(num_primitives_32, num_meta_tiles_x, num_meta_tiles_y);
1801 
1802 	if (caps.timestamp >= 2)
1803 	{
1804 		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
1805 		device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts), "tile-binning");
1806 	}
1807 
1808 	cmd.enable_subgroup_size_control(false);
1809 	cmd.end_region();
1810 }
1811 
void Renderer::submit_update_upscaled_domain_external(Vulkan::CommandBuffer &cmd,
1813                                                       unsigned addr, unsigned length, unsigned pixel_size_log2)
1814 {
1815 	submit_update_upscaled_domain(cmd, ResolveStage::Pre, addr, addr, length, pixel_size_log2);
1816 }
1817 
void Renderer::submit_update_upscaled_domain(Vulkan::CommandBuffer &cmd, ResolveStage stage,
1819                                              unsigned addr, unsigned depth_addr,
1820                                              unsigned num_pixels, unsigned pixel_size_log2)
1821 {
1822 #ifdef PARALLEL_RDP_SHADER_DIR
1823 	if (stage == ResolveStage::Pre)
1824 		cmd.set_program("rdp://update_upscaled_domain_pre.comp");
1825 	else
1826 		cmd.set_program("rdp://update_upscaled_domain_post.comp");
1827 #else
1828 	if (stage == ResolveStage::Pre)
1829 		cmd.set_program(shader_bank->update_upscaled_domain_pre);
1830 	else
1831 		cmd.set_program(shader_bank->update_upscaled_domain_post);
1832 #endif
1833 
1834 	cmd.set_storage_buffer(0, 0, *rdram, rdram_offset, rdram_size);
1835 	cmd.set_storage_buffer(0, 1, *hidden_rdram);
1836 	cmd.set_storage_buffer(0, 2, *upscaling_reference_rdram);
1837 	cmd.set_storage_buffer(0, 3, *upscaling_multisampled_rdram);
1838 	cmd.set_storage_buffer(0, 4, *upscaling_multisampled_hidden_rdram);
1839 
1840 	cmd.set_specialization_constant_mask(0x1f);
1841 	cmd.set_specialization_constant(0, uint32_t(rdram_size));
1842 	cmd.set_specialization_constant(1, pixel_size_log2);
1843 	cmd.set_specialization_constant(2, int(addr == depth_addr));
1844 	cmd.set_specialization_constant(3, ImplementationConstants::DefaultWorkgroupSize);
1845 	cmd.set_specialization_constant(4, caps.upscaling * caps.upscaling);
1846 
1847 	unsigned num_workgroups =
1848 			(num_pixels + ImplementationConstants::DefaultWorkgroupSize - 1) /
1849 			ImplementationConstants::DefaultWorkgroupSize;
1850 
1851 	struct Push
1852 	{
1853 		uint32_t pixels;
1854 		uint32_t fb_addr, fb_depth_addr;
1855 	} push = {};
1856 	push.pixels = num_pixels;
1857 	push.fb_addr = addr >> pixel_size_log2;
1858 	push.fb_depth_addr = depth_addr >> 1;
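	// fb_addr is converted from a byte address to pixel units via pixel_size_log2,
	// while the depth address is always treated as 16 bits per pixel (hence >> 1).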
1859 
1860 	cmd.push_constants(&push, 0, sizeof(push));
1861 	cmd.dispatch(num_workgroups, 1, 1);
1862 }
1863 
void Renderer::submit_update_upscaled_domain(Vulkan::CommandBuffer &cmd, ResolveStage stage)
1865 {
1866 	unsigned num_pixels = fb.width * fb.deduced_height;
1867 	unsigned pixel_size_log2;
1868 
1869 	switch (fb.fmt)
1870 	{
1871 	case FBFormat::RGBA8888:
1872 		pixel_size_log2 = 2;
1873 		break;
1874 
1875 	case FBFormat::RGBA5551:
1876 	case FBFormat::IA88:
1877 		pixel_size_log2 = 1;
1878 		break;
1879 
1880 	default:
1881 		pixel_size_log2 = 0;
1882 		break;
1883 	}
1884 
1885 	submit_update_upscaled_domain(cmd, stage, fb.addr, fb.depth_addr, num_pixels, pixel_size_log2);
1886 }
1887 
void Renderer::submit_depth_blend(Vulkan::CommandBuffer &cmd, Vulkan::Buffer &tmem, bool upscaled)
1889 {
1890 	cmd.begin_region("render-pass");
1891 	auto &instance = buffer_instances[buffer_instance];
1892 
1893 	cmd.set_specialization_constant_mask(0xff);
1894 	cmd.set_specialization_constant(0, uint32_t(rdram_size));
1895 	cmd.set_specialization_constant(1, uint32_t(fb.fmt));
1896 	cmd.set_specialization_constant(2, int(fb.addr == fb.depth_addr));
1897 	cmd.set_specialization_constant(3, ImplementationConstants::TileWidth);
1898 	cmd.set_specialization_constant(4, ImplementationConstants::TileHeight);
1899 	cmd.set_specialization_constant(5, Limits::MaxPrimitives);
1900 	cmd.set_specialization_constant(6, upscaled ? caps.max_width : Limits::MaxWidth);
1901 	cmd.set_specialization_constant(7, uint32_t(!is_host_coherent && !upscaled) |
1902 	                                   ((upscaled ? trailing_zeroes(caps.upscaling) : 0u) << 1u));
1903 
1904 	if (upscaled)
1905 		cmd.set_storage_buffer(0, 0, *upscaling_multisampled_rdram);
1906 	else
1907 		cmd.set_storage_buffer(0, 0, *rdram, rdram_offset, rdram_size * (is_host_coherent ? 1 : 2));
1908 	cmd.set_storage_buffer(0, 1, upscaled ? *upscaling_multisampled_hidden_rdram : *hidden_rdram);
1909 	cmd.set_storage_buffer(0, 2, tmem);
1910 
1911 	if (!caps.ubershader)
1912 	{
1913 		cmd.set_storage_buffer(0, 3, *per_tile_shaded_color);
1914 		cmd.set_storage_buffer(0, 4, *per_tile_shaded_depth);
1915 		cmd.set_storage_buffer(0, 5, *per_tile_shaded_shaded_alpha);
1916 		cmd.set_storage_buffer(0, 6, *per_tile_shaded_coverage);
1917 		cmd.set_storage_buffer(0, 7, *per_tile_offsets);
1918 	}
1919 
1920 	cmd.set_storage_buffer(1, 0, *instance.gpu.triangle_setup.buffer);
1921 	cmd.set_storage_buffer(1, 1, *instance.gpu.attribute_setup.buffer);
1922 	cmd.set_storage_buffer(1, 2, *instance.gpu.derived_setup.buffer);
1923 	cmd.set_storage_buffer(1, 3, *instance.gpu.scissor_setup.buffer);
1924 	cmd.set_storage_buffer(1, 4, *instance.gpu.static_raster_state.buffer);
1925 	cmd.set_storage_buffer(1, 5, *instance.gpu.depth_blend_state.buffer);
1926 	cmd.set_storage_buffer(1, 6, *instance.gpu.state_indices.buffer);
1927 	cmd.set_storage_buffer(1, 7, *instance.gpu.tile_info_state.buffer);
1928 	cmd.set_storage_buffer(1, 8, *span_setups);
1929 	cmd.set_storage_buffer(1, 9, *instance.gpu.span_info_offsets.buffer);
1930 	cmd.set_buffer_view(1, 10, *blender_divider_buffer);
1931 	cmd.set_storage_buffer(1, 11, *tile_binning_buffer);
1932 	cmd.set_storage_buffer(1, 12, *tile_binning_buffer_coarse);
1933 
1934 	auto *global_fb_info = cmd.allocate_typed_constant_data<GlobalFBInfo>(2, 0, 1);
1935 
1936 	GlobalState push = {};
1937 	push.fb_width = fb.width;
1938 	push.fb_height = fb.deduced_height;
1939 
1940 	if (upscaled)
1941 	{
1942 		push.fb_width *= caps.upscaling;
1943 		push.fb_height *= caps.upscaling;
1944 	}
1945 
1946 	switch (fb.fmt)
1947 	{
1948 	case FBFormat::I4:
1949 		push.addr_index = fb.addr;
1950 		global_fb_info->fb_size = 0;
1951 		global_fb_info->dx_mask = 0;
1952 		global_fb_info->dx_shift = 0;
1953 		break;
1954 
1955 	case FBFormat::I8:
1956 		push.addr_index = fb.addr;
1957 		global_fb_info->fb_size = 1;
1958 		global_fb_info->dx_mask = ~7u;
1959 		global_fb_info->dx_shift = 3;
1960 		break;
1961 
1962 	case FBFormat::RGBA5551:
1963 	case FBFormat::IA88:
1964 		push.addr_index = fb.addr >> 1u;
1965 		global_fb_info->fb_size = 2;
1966 		global_fb_info->dx_mask = ~3u;
1967 		global_fb_info->dx_shift = 2;
1968 		break;
1969 
1970 	case FBFormat::RGBA8888:
1971 		push.addr_index = fb.addr >> 2u;
1972 		global_fb_info->fb_size = 4;
1973 		global_fb_info->dx_mask = ~1u;
1974 		global_fb_info->dx_shift = 1;
1975 		break;
1976 	}
1977 
1978 	global_fb_info->base_primitive_index = base_primitive_index;
1979 
1980 	push.depth_addr_index = fb.depth_addr >> 1;
1981 	unsigned num_primitives_32 = (stream.triangle_setup.size() + 31) / 32;
1982 	push.group_mask = (1u << num_primitives_32) - 1;
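	// group_mask holds one bit per group of 32 primitives in this render pass,
	// e.g. 70 primitives -> 3 groups -> mask 0b111.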
1983 	cmd.push_constants(&push, 0, sizeof(push));
1984 
1985 	if (caps.ubershader)
1986 	{
1987 #ifdef PARALLEL_RDP_SHADER_DIR
1988 		cmd.set_program("rdp://ubershader.comp", {
1989 				{ "DEBUG_ENABLE", debug_channel ? 1 : 0 },
1990 				{ "SMALL_TYPES", caps.supports_small_integer_arithmetic ? 1 : 0 },
1991 		});
1992 #else
1993 		cmd.set_program(shader_bank->ubershader);
1994 #endif
1995 	}
1996 	else
1997 	{
1998 #ifdef PARALLEL_RDP_SHADER_DIR
1999 		cmd.set_program("rdp://depth_blend.comp", {
2000 				{ "DEBUG_ENABLE", debug_channel ? 1 : 0 },
2001 				{ "SMALL_TYPES", caps.supports_small_integer_arithmetic ? 1 : 0 },
2002 		});
2003 #else
2004 		cmd.set_program(shader_bank->depth_blend);
2005 #endif
2006 	}
2007 
2008 	Vulkan::QueryPoolHandle start_ts, end_ts;
2009 	if (caps.timestamp >= 2)
2010 		start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
2011 
2012 	cmd.dispatch((push.fb_width + 7) / 8, (push.fb_height + 7) / 8, 1);
2013 
2014 	if (caps.timestamp >= 2)
2015 	{
2016 		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
2017 		device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts), "depth-blending");
2018 	}
2019 
2020 	cmd.end_region();
2021 }
2022 
void Renderer::submit_render_pass(Vulkan::CommandBuffer &cmd)
2024 {
2025 	bool need_render_pass = fb.width != 0 && fb.deduced_height != 0 && !stream.span_info_jobs.empty();
2026 	bool need_tmem_upload = !stream.tmem_upload_infos.empty();
2027 	bool need_submit = need_render_pass || need_tmem_upload;
2028 	if (!need_submit)
2029 		return;
2030 
2031 	Vulkan::QueryPoolHandle render_pass_start, render_pass_end;
2032 	if (caps.timestamp >= 1)
2033 		render_pass_start = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
2034 
2035 	if (debug_channel)
2036 		cmd.begin_debug_channel(this, "Debug", 16 * 1024 * 1024);
2037 
	// Here we run 3 dispatches in parallel. Span setup and TMEM instances are low-occupancy kinds of jobs,
	// but the binning pass should dominate here unless the workload is trivial.
2040 	if (need_render_pass)
2041 	{
2042 		submit_span_setup_jobs(cmd, false);
2043 		submit_tile_binning_combined(cmd, false);
2044 		if (caps.upscaling > 1)
2045 			submit_update_upscaled_domain(cmd, ResolveStage::Pre);
2046 	}
2047 
2048 	if (need_tmem_upload)
2049 		update_tmem_instances(cmd);
2050 
2051 	cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
2052 	            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | (!caps.ubershader ? VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT : 0),
2053 	            VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT |
2054 	            (!caps.ubershader ? VK_ACCESS_INDIRECT_COMMAND_READ_BIT : 0));
2055 
2056 	if (need_render_pass && !caps.ubershader)
2057 	{
2058 		submit_rasterization(cmd, need_tmem_upload ? *tmem_instances : *tmem, false);
2059 		cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
2060 		            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
2061 	}
2062 
2063 	if (need_render_pass)
2064 		submit_depth_blend(cmd, need_tmem_upload ? *tmem_instances : *tmem, false);
2065 
2066 	if (!caps.ubershader)
2067 		clear_indirect_buffer(cmd);
2068 
2069 	if (render_pass_is_upscaled())
2070 	{
2071 		cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
2072 		            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2073 		            VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
2074 
2075 		// TODO: Could probably do this reference update in the render pass itself,
2076 		// just write output to two buffers ... This is more composable for now.
2077 		submit_update_upscaled_domain(cmd, ResolveStage::Post);
2078 	}
2079 
2080 	if (caps.timestamp >= 1)
2081 	{
2082 		render_pass_end = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
2083 		std::string tag;
2084 		tag = "(" + std::to_string(fb.width) + " x " + std::to_string(fb.deduced_height) + ")";
2085 		tag += " (" + std::to_string(stream.triangle_setup.size()) + " triangles)";
2086 		device->register_time_interval("RDP GPU", std::move(render_pass_start), std::move(render_pass_end), "render-pass", std::move(tag));
2087 	}
2088 }
2089 
void Renderer::submit_render_pass_upscaled(Vulkan::CommandBuffer &cmd)
2091 {
2092 	cmd.begin_region("render-pass-upscaled");
2093 	Vulkan::QueryPoolHandle start_ts, end_ts;
2094 	if (caps.timestamp >= 1)
2095 		start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
2096 
2097 	bool need_tmem_upload = !stream.tmem_upload_infos.empty();
2098 	submit_span_setup_jobs(cmd, true);
2099 	submit_tile_binning_combined(cmd, true);
2100 
2101 	cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
2102 	            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
2103 	            (!caps.ubershader ? VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT : 0),
2104 	            VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT |
2105 	            (!caps.ubershader ? VK_ACCESS_INDIRECT_COMMAND_READ_BIT : 0));
2106 
2107 	if (!caps.ubershader)
2108 	{
2109 		submit_rasterization(cmd, need_tmem_upload ? *tmem_instances : *tmem, true);
2110 		cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2111 		            VK_ACCESS_SHADER_WRITE_BIT,
2112 		            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2113 		            VK_ACCESS_SHADER_READ_BIT);
2114 	}
2115 
2116 	submit_depth_blend(cmd, need_tmem_upload ? *tmem_instances : *tmem, true);
2117 	if (!caps.ubershader)
2118 		clear_indirect_buffer(cmd);
2119 
2120 	if (caps.timestamp >= 1)
2121 	{
2122 		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
2123 		device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts), "render-pass-upscaled");
2124 	}
2125 	cmd.end_region();
2126 }
2127 
void Renderer::submit_render_pass_end(Vulkan::CommandBuffer &cmd)
2129 {
2130 	base_primitive_index += uint32_t(stream.triangle_setup.size());
2131 	cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
2132 	            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2133 	            VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
2134 }
2135 
void Renderer::maintain_queues()
2137 {
2138 	// Some conditions dictate if we should flush a render pass.
	// These heuristics ensure we don't wait too long to flush render passes,
2140 	// and also ensure that we don't spam submissions too often, causing massive bubbles on GPU.
2141 
2142 	// If we get a lot of small render passes in a row, it makes sense to batch them up, e.g. 8 at a time.
2143 	// If we get 2 full render passes of ~256 primitives, that's also a good indication we should flush since we're getting spammed.
2144 	// If we have no pending submissions, the GPU is idle and there is no reason not to submit.
2145 	// If we haven't submitted anything in a while (1.0 ms), it's probably fine to submit again.
2146 	if (pending_render_passes >= ImplementationConstants::MaxPendingRenderPassesBeforeFlush ||
2147 	    pending_primitives >= Limits::MaxPrimitives ||
2148 	    pending_primitives_upscaled >= Limits::MaxPrimitives ||
2149 	    active_submissions.load(std::memory_order_relaxed) == 0 ||
2150 	    int64_t(Util::get_current_time_nsecs() - last_submit_ns) > 1000000)
2151 	{
2152 		submit_to_queue();
2153 	}
2154 }
2155 
void Renderer::lock_command_processing()
2157 {
2158 	idle_lock.lock();
2159 }
2160 
void Renderer::unlock_command_processing()
2162 {
2163 	idle_lock.unlock();
2164 }
2165 
void Renderer::maintain_queues_idle()
2167 {
2168 	std::lock_guard<std::mutex> holder{idle_lock};
2169 	if (pending_primitives >= ImplementationConstants::MinimumPrimitivesForIdleFlush ||
2170 	    pending_render_passes >= ImplementationConstants::MinimumRenderPassesForIdleFlush)
2171 	{
2172 		flush_queues();
2173 		submit_to_queue();
2174 	}
2175 }
2176 
void Renderer::enqueue_fence_wait(Vulkan::Fence fence)
2178 {
2179 	CoherencyOperation op;
2180 	op.fence = std::move(fence);
2181 	op.unlock_cookie = &active_submissions;
2182 	active_submissions.fetch_add(1, std::memory_order_relaxed);
2183 	processor.enqueue_coherency_operation(std::move(op));
2184 	last_submit_ns = Util::get_current_time_nsecs();
2185 }
2186 
void Renderer::submit_to_queue()
2188 {
2189 	bool pending_host_visible_render_passes = pending_render_passes != 0;
2190 	bool pending_upscaled_passes = pending_render_passes_upscaled != 0;
2191 	pending_render_passes = 0;
2192 	pending_render_passes_upscaled = 0;
2193 	pending_primitives = 0;
2194 	pending_primitives_upscaled = 0;
2195 
2196 	if (!stream.cmd)
2197 	{
2198 		if (pending_host_visible_render_passes)
2199 		{
2200 			Vulkan::Fence fence;
2201 			device->submit_empty(Vulkan::CommandBuffer::Type::AsyncCompute, &fence);
2202 			enqueue_fence_wait(fence);
2203 		}
2204 		return;
2205 	}
2206 
2207 	bool need_host_barrier = is_host_coherent || !incoherent.staging_readback;
2208 
2209 	// If we maintain queues in-between doing 1x render pass and upscaled render pass,
2210 	// we haven't flushed memory yet.
2211 	bool need_memory_flush = pending_host_visible_render_passes && !pending_upscaled_passes;
2212 	stream.cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2213 	                    need_memory_flush ? VK_ACCESS_MEMORY_WRITE_BIT : 0,
2214 	                    (need_host_barrier ? VK_PIPELINE_STAGE_HOST_BIT : VK_PIPELINE_STAGE_TRANSFER_BIT),
2215 	                    (need_host_barrier ? VK_ACCESS_HOST_READ_BIT : VK_ACCESS_TRANSFER_READ_BIT));
2216 
2217 	Vulkan::Fence fence;
2218 
2219 	if (is_host_coherent)
2220 	{
2221 		device->submit(stream.cmd, &fence);
2222 		if (pending_host_visible_render_passes)
2223 			enqueue_fence_wait(fence);
2224 	}
2225 	else
2226 	{
2227 		CoherencyOperation op;
2228 		if (pending_host_visible_render_passes)
2229 			resolve_coherency_gpu_to_host(op, *stream.cmd);
2230 
2231 		device->submit(stream.cmd, &fence);
2232 
2233 		if (pending_host_visible_render_passes)
2234 		{
2235 			enqueue_fence_wait(fence);
2236 			op.fence = fence;
2237 			if (!op.copies.empty())
2238 				processor.enqueue_coherency_operation(std::move(op));
2239 		}
2240 	}
2241 
2242 	Util::for_each_bit(sync_indices_needs_flush, [&](unsigned bit) {
2243 		auto &sync = internal_sync[bit];
2244 		sync.fence = fence;
2245 	});
2246 	sync_indices_needs_flush = 0;
2247 	stream.cmd.reset();
2248 }
2249 
void Renderer::reset_context()
2251 {
2252 	stream.scissor_setup.reset();
2253 	stream.static_raster_state_cache.reset();
2254 	stream.depth_blend_state_cache.reset();
2255 	stream.tile_info_state_cache.reset();
2256 	stream.triangle_setup.reset();
2257 	stream.attribute_setup.reset();
2258 	stream.derived_setup.reset();
2259 	stream.state_indices.reset();
2260 	stream.span_info_offsets.reset();
2261 	stream.span_info_jobs.reset();
2262 	stream.max_shaded_tiles = 0;
2263 
2264 	fb.deduced_height = 0;
2265 	fb.color_write_pending = false;
2266 	fb.depth_write_pending = false;
2267 
2268 	stream.tmem_upload_infos.clear();
2269 }
2270 
void Renderer::begin_new_context()
2272 {
2273 	buffer_instance = (buffer_instance + 1) % Limits::NumSyncStates;
2274 	reset_context();
2275 }
2276 
uint32_t Renderer::get_byte_size_for_bound_color_framebuffer() const
2278 {
2279 	unsigned pixel_count = fb.width * fb.deduced_height;
2280 	unsigned byte_count;
2281 	switch (fb.fmt)
2282 	{
2283 	case FBFormat::RGBA8888:
2284 		byte_count = pixel_count * 4;
2285 		break;
2286 
2287 	case FBFormat::RGBA5551:
2288 	case FBFormat::IA88:
2289 		byte_count = pixel_count * 2;
2290 		break;
2291 
2292 	default:
2293 		byte_count = pixel_count;
2294 		break;
2295 	}
2296 
2297 	return byte_count;
2298 }
2299 
uint32_t Renderer::get_byte_size_for_bound_depth_framebuffer() const
2301 {
2302 	return fb.width * fb.deduced_height * 2;
2303 }
2304 
void Renderer::mark_pages_for_gpu_read(uint32_t base_addr, uint32_t byte_count)
2306 {
2307 	if (byte_count == 0)
2308 		return;
2309 
2310 	uint32_t start_page = base_addr / ImplementationConstants::IncoherentPageSize;
2311 	uint32_t end_page = (base_addr + byte_count - 1) / ImplementationConstants::IncoherentPageSize + 1;
2312 	start_page &= incoherent.num_pages - 1;
2313 	end_page &= incoherent.num_pages - 1;
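	// Page indices wrap with a mask, so incoherent.num_pages must be a power of two;
	// the loop below walks [start_page, end_page) modulo the page count.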
2314 
2315 	uint32_t page = start_page;
2316 	while (page != end_page)
2317 	{
2318 		bool pending_writes = (incoherent.page_to_pending_readback[page / 32] & (1u << (page & 31))) != 0 ||
2319 		                      incoherent.pending_writes_for_page[page].load(std::memory_order_relaxed) != 0;
2320 
2321 		// We'll do an acquire memory barrier later before we start memcpy-ing from host memory.
2322 		if (pending_writes)
2323 			incoherent.page_to_masked_copy[page / 32] |= 1u << (page & 31);
2324 		else
2325 			incoherent.page_to_direct_copy[page / 32] |= 1u << (page & 31);
2326 
2327 		page = (page + 1) & (incoherent.num_pages - 1);
2328 	}
2329 }
2330 
void Renderer::lock_pages_for_gpu_write(uint32_t base_addr, uint32_t byte_count)
2332 {
2333 	if (byte_count == 0)
2334 		return;
2335 
2336 	uint32_t start_page = base_addr / ImplementationConstants::IncoherentPageSize;
2337 	uint32_t end_page = (base_addr + byte_count - 1) / ImplementationConstants::IncoherentPageSize + 1;
2338 
2339 	for (uint32_t page = start_page; page < end_page; page++)
2340 	{
2341 		uint32_t wrapped_page = page & (incoherent.num_pages - 1);
2342 		incoherent.page_to_pending_readback[wrapped_page / 32] |= 1u << (wrapped_page & 31);
2343 	}
2344 }
2345 
void Renderer::resolve_coherency_gpu_to_host(CoherencyOperation &op, Vulkan::CommandBuffer &cmd)
2347 {
2348 	cmd.begin_region("resolve-coherency-gpu-to-host");
2349 	if (!incoherent.staging_readback)
2350 	{
2351 		// iGPU path.
2352 		op.src = rdram;
2353 		op.dst = incoherent.host_rdram;
2354 		op.timeline_value = 0;
2355 
2356 		for (auto &readback : incoherent.page_to_pending_readback)
2357 		{
2358 			uint32_t base_index = 32 * uint32_t(&readback - incoherent.page_to_pending_readback.data());
2359 
2360 			Util::for_each_bit_range(readback, [&](unsigned index, unsigned count) {
2361 				index += base_index;
2362 
2363 				for (unsigned i = 0; i < count; i++)
2364 					incoherent.pending_writes_for_page[index + i].fetch_add(1, std::memory_order_relaxed);
2365 
2366 				CoherencyCopy coherent_copy = {};
2367 				coherent_copy.counter_base = &incoherent.pending_writes_for_page[index];
2368 				coherent_copy.counters = count;
2369 				coherent_copy.src_offset = index * ImplementationConstants::IncoherentPageSize;
2370 				coherent_copy.mask_offset = coherent_copy.src_offset + rdram_size;
2371 				coherent_copy.dst_offset = index * ImplementationConstants::IncoherentPageSize;
2372 				coherent_copy.size = ImplementationConstants::IncoherentPageSize * count;
2373 				op.copies.push_back(coherent_copy);
2374 			});
2375 
2376 			readback = 0;
2377 		}
2378 	}
2379 	else
2380 	{
2381 		// Discrete GPU path.
2382 		Util::SmallVector<VkBufferCopy, 1024> copies;
2383 		op.src = incoherent.staging_readback.get();
2384 		op.dst = incoherent.host_rdram;
2385 		op.timeline_value = 0;
2386 
2387 		for (auto &readback : incoherent.page_to_pending_readback)
2388 		{
2389 			uint32_t base_index = 32 * uint32_t(&readback - incoherent.page_to_pending_readback.data());
2390 
2391 			Util::for_each_bit_range(readback, [&](unsigned index, unsigned count) {
2392 				index += base_index;
2393 
2394 				for (unsigned i = 0; i < count; i++)
2395 					incoherent.pending_writes_for_page[index + i].fetch_add(1, std::memory_order_relaxed);
2396 
2397 				VkBufferCopy copy = {};
2398 				copy.srcOffset = index * ImplementationConstants::IncoherentPageSize;
2399 
2400 				unsigned dst_page_index = incoherent.staging_readback_index;
2401 				copy.dstOffset = dst_page_index * ImplementationConstants::IncoherentPageSize;
2402 
2403 				incoherent.staging_readback_index += count;
2404 				incoherent.staging_readback_index &= (incoherent.staging_readback_pages - 1);
2405 				// Unclean wraparound check.
2406 				if (incoherent.staging_readback_index != 0 && incoherent.staging_readback_index < dst_page_index)
2407 				{
2408 					copy.dstOffset = 0;
2409 					incoherent.staging_readback_index = count;
2410 				}
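				// i.e. if the staging ring wrapped in the middle of this range, restart
				// the copy at offset 0 so the readback destination stays contiguous; the
				// pages at the end of the ring are simply left unused for this copy.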
2411 
2412 				copy.size = ImplementationConstants::IncoherentPageSize * count;
2413 				copies.push_back(copy);
2414 
2415 				CoherencyCopy coherent_copy = {};
2416 				coherent_copy.counter_base = &incoherent.pending_writes_for_page[index];
2417 				coherent_copy.counters = count;
2418 				coherent_copy.src_offset = copy.dstOffset;
2419 				coherent_copy.dst_offset = index * ImplementationConstants::IncoherentPageSize;
2420 				coherent_copy.size = ImplementationConstants::IncoherentPageSize * count;
2421 
2422 				VkBufferCopy mask_copy = {};
2423 				mask_copy.srcOffset = index * ImplementationConstants::IncoherentPageSize + rdram_size;
2424 
2425 				dst_page_index = incoherent.staging_readback_index;
2426 				mask_copy.dstOffset = dst_page_index * ImplementationConstants::IncoherentPageSize;
2427 
2428 				incoherent.staging_readback_index += count;
2429 				incoherent.staging_readback_index &= (incoherent.staging_readback_pages - 1);
2430 				// Unclean wraparound check.
2431 				if (incoherent.staging_readback_index != 0 && incoherent.staging_readback_index < dst_page_index)
2432 				{
2433 					mask_copy.dstOffset = 0;
2434 					incoherent.staging_readback_index = count;
2435 				}
2436 
2437 				mask_copy.size = ImplementationConstants::IncoherentPageSize * count;
2438 				copies.push_back(mask_copy);
2439 				coherent_copy.mask_offset = mask_copy.dstOffset;
2440 
2441 				op.copies.push_back(coherent_copy);
2442 			});
2443 
2444 			readback = 0;
2445 		}
2446 
2447 		if (!copies.empty())
2448 		{
2449 //#define COHERENCY_READBACK_TIMESTAMPS
2450 #ifdef COHERENCY_READBACK_TIMESTAMPS
2451 			Vulkan::QueryPoolHandle start_ts, end_ts;
2452 			start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_TRANSFER_BIT);
2453 #endif
2454 			cmd.copy_buffer(*incoherent.staging_readback, *rdram, copies.data(), copies.size());
2455 #ifdef COHERENCY_READBACK_TIMESTAMPS
2456 			end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_TRANSFER_BIT);
			device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts), "coherency-readback");
2458 #endif
2459 			cmd.barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
2460 			            VK_PIPELINE_STAGE_HOST_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2461 			            VK_ACCESS_HOST_READ_BIT);
2462 		}
2463 	}
2464 	cmd.end_region();
2465 }
2466 
void Renderer::resolve_coherency_external(unsigned offset, unsigned length)
2468 {
2469 	mark_pages_for_gpu_read(offset, length);
2470 	ensure_command_buffer();
2471 	resolve_coherency_host_to_gpu(*stream.cmd);
2472 	device->submit(stream.cmd);
2473 	stream.cmd.reset();
2474 }
2475 
unsigned Renderer::get_scaling_factor() const
2477 {
2478 	return caps.upscaling;
2479 }
2480 
const Vulkan::Buffer *Renderer::get_upscaled_rdram_buffer() const
2482 {
2483 	return upscaling_multisampled_rdram.get();
2484 }
2485 
const Vulkan::Buffer *Renderer::get_upscaled_hidden_rdram_buffer() const
2487 {
2488 	return upscaling_multisampled_hidden_rdram.get();
2489 }
2490 
void Renderer::resolve_coherency_host_to_gpu(Vulkan::CommandBuffer &cmd)
2492 {
2493 	// Now, ensure that the GPU sees a coherent view of the CPU memory writes up until now.
2494 	// Writes made by the GPU which are not known to be resolved on the timeline waiter thread will always
2495 	// "win" over writes made by CPU, since CPU is not allowed to meaningfully overwrite data which the GPU
2496 	// is going to touch.
2497 
2498 	cmd.begin_region("resolve-coherency-host-to-gpu");
2499 
2500 	Vulkan::QueryPoolHandle start_ts, end_ts;
2501 	if (caps.timestamp)
2502 		start_ts = device->write_calibrated_timestamp();
2503 
2504 	std::atomic_thread_fence(std::memory_order_acquire);
2505 
2506 	Util::SmallVector<VkBufferCopy, 1024> buffer_copies;
2507 	Util::SmallVector<uint32_t, 1024> masked_page_copies;
2508 	Util::SmallVector<uint32_t, 1024> to_clear_write_mask;
2509 
2510 	// If we're able to map RDRAM directly, we can just memcpy straight into RDRAM if we have an unmasked copy.
2511 	// Important for iGPU.
2512 	if (rdram->get_allocation().is_host_allocation())
2513 	{
2514 		for (auto &direct : incoherent.page_to_direct_copy)
2515 		{
2516 			uint32_t base_index = 32 * (&direct - incoherent.page_to_direct_copy.data());
2517 			Util::for_each_bit_range(direct, [&](unsigned index, unsigned count) {
2518 				index += base_index;
2519 				auto *mapped_rdram = device->map_host_buffer(*rdram, Vulkan::MEMORY_ACCESS_WRITE_BIT,
2520 				                                             ImplementationConstants::IncoherentPageSize * index,
2521 				                                             ImplementationConstants::IncoherentPageSize * count);
2522 				memcpy(mapped_rdram,
2523 				       incoherent.host_rdram + ImplementationConstants::IncoherentPageSize * index,
2524 				       ImplementationConstants::IncoherentPageSize * count);
2525 
2526 				device->unmap_host_buffer(*rdram, Vulkan::MEMORY_ACCESS_WRITE_BIT,
2527 				                          ImplementationConstants::IncoherentPageSize * index,
2528 				                          ImplementationConstants::IncoherentPageSize * count);
2529 
2530 				mapped_rdram = device->map_host_buffer(*rdram, Vulkan::MEMORY_ACCESS_WRITE_BIT,
2531 				                                       ImplementationConstants::IncoherentPageSize * index + rdram_size,
2532 				                                       ImplementationConstants::IncoherentPageSize * count);
2533 
2534 				memset(mapped_rdram, 0, ImplementationConstants::IncoherentPageSize * count);
2535 
2536 				device->unmap_host_buffer(*rdram, Vulkan::MEMORY_ACCESS_WRITE_BIT,
2537 				                          ImplementationConstants::IncoherentPageSize * index + rdram_size,
2538 				                          ImplementationConstants::IncoherentPageSize * count);
2539 			});
2540 			direct = 0;
2541 		}
2542 
2543 		auto *mapped_staging = static_cast<uint8_t *>(device->map_host_buffer(*incoherent.staging_rdram,
2544 		                                                                      Vulkan::MEMORY_ACCESS_WRITE_BIT));
2545 
2546 		for (auto &indirect : incoherent.page_to_masked_copy)
2547 		{
2548 			uint32_t base_index = 32 * (&indirect - incoherent.page_to_masked_copy.data());
2549 			Util::for_each_bit(indirect, [&](unsigned index) {
2550 				index += base_index;
2551 				masked_page_copies.push_back(index);
2552 				memcpy(mapped_staging + ImplementationConstants::IncoherentPageSize * index,
2553 				       incoherent.host_rdram + ImplementationConstants::IncoherentPageSize * index,
2554 				       ImplementationConstants::IncoherentPageSize);
2555 			});
2556 			indirect = 0;
2557 		}
2558 
2559 		device->unmap_host_buffer(*incoherent.staging_rdram, Vulkan::MEMORY_ACCESS_WRITE_BIT);
2560 	}
2561 	else
2562 	{
2563 		auto *mapped_rdram = static_cast<uint8_t *>(device->map_host_buffer(*incoherent.staging_rdram, Vulkan::MEMORY_ACCESS_WRITE_BIT));
2564 
2565 		size_t num_packed_pages = incoherent.page_to_masked_copy.size();
2566 		for (size_t i = 0; i < num_packed_pages; i++)
2567 		{
2568 			uint32_t base_index = 32 * i;
2569 			uint32_t tmp = incoherent.page_to_masked_copy[i] | incoherent.page_to_direct_copy[i];
2570 			Util::for_each_bit(tmp, [&](unsigned index) {
2571 				unsigned bit = index;
2572 				index += base_index;
2573 
2574 				if ((1u << bit) & incoherent.page_to_masked_copy[i])
2575 					masked_page_copies.push_back(index);
2576 				else
2577 				{
2578 					VkBufferCopy copy = {};
2579 					copy.size = ImplementationConstants::IncoherentPageSize;
2580 					copy.dstOffset = copy.srcOffset = index * ImplementationConstants::IncoherentPageSize;
2581 					buffer_copies.push_back(copy);
2582 					to_clear_write_mask.push_back(index);
2583 				}
2584 
2585 				memcpy(mapped_rdram + ImplementationConstants::IncoherentPageSize * index,
2586 				       incoherent.host_rdram + ImplementationConstants::IncoherentPageSize * index,
2587 				       ImplementationConstants::IncoherentPageSize);
2588 			});
2589 
2590 			incoherent.page_to_masked_copy[i] = 0;
2591 			incoherent.page_to_direct_copy[i] = 0;
2592 		}
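		// From this point, masked pages go through the compute-based masked resolve
		// below (which presumably consults the write mask stored past rdram_size so
		// GPU writes win over stale CPU data), while direct pages only need a plain
		// buffer copy plus clearing their write mask.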
2593 
2594 		device->unmap_host_buffer(*incoherent.staging_rdram, Vulkan::MEMORY_ACCESS_WRITE_BIT);
2595 	}
2596 
2597 	if (!masked_page_copies.empty())
2598 	{
2599 #ifdef PARALLEL_RDP_SHADER_DIR
2600 		cmd.set_program("rdp://masked_rdram_resolve.comp");
2601 #else
2602 		cmd.set_program(shader_bank->masked_rdram_resolve);
2603 #endif
2604 		cmd.set_specialization_constant_mask(3);
2605 		cmd.set_specialization_constant(0, ImplementationConstants::IncoherentPageSize / 4);
2606 		cmd.set_specialization_constant(1, ImplementationConstants::IncoherentPageSize / 4);
2607 
2608 		cmd.set_storage_buffer(0, 0, *rdram, rdram_offset, rdram_size);
2609 		cmd.set_storage_buffer(0, 1, *incoherent.staging_rdram);
2610 		cmd.set_storage_buffer(0, 2, *rdram, rdram_offset + rdram_size, rdram_size);
2611 
2612 //#define COHERENCY_MASK_TIMESTAMPS
2613 #ifdef COHERENCY_MASK_TIMESTAMPS
2614 		Vulkan::QueryPoolHandle start_ts, end_ts;
		start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
2616 #endif
2617 
2618 		for (size_t i = 0; i < masked_page_copies.size(); i += 4096)
2619 		{
2620 			size_t to_copy = std::min(masked_page_copies.size() - i, size_t(4096));
2621 			memcpy(cmd.allocate_typed_constant_data<uint32_t>(1, 0, to_copy),
2622 				   masked_page_copies.data() + i,
2623 				   to_copy * sizeof(uint32_t));
2624 			cmd.dispatch(to_copy, 1, 1);
2625 		}
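		// Page indices are fed to the shader 4096 at a time through a per-dispatch UBO
		// (allocate_typed_constant_data), with one workgroup dispatched per masked page.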
2626 
2627 #ifdef COHERENCY_MASK_TIMESTAMPS
		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
		device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts), "coherent-mask-copy");
2630 #endif
2631 	}
2632 
2633 	// Could use FillBuffer here, but would need to use TRANSFER stage, and introduce more barriers than needed.
2634 	if (!to_clear_write_mask.empty())
2635 	{
2636 #ifdef PARALLEL_RDP_SHADER_DIR
2637 		cmd.set_program("rdp://clear_write_mask.comp");
2638 #else
2639 		cmd.set_program(shader_bank->clear_write_mask);
2640 #endif
2641 		cmd.set_specialization_constant_mask(3);
2642 		cmd.set_specialization_constant(0, ImplementationConstants::IncoherentPageSize / 4);
2643 		cmd.set_specialization_constant(1, ImplementationConstants::IncoherentPageSize / 4);
2644 		cmd.set_storage_buffer(0, 0, *rdram, rdram_offset + rdram_size, rdram_size);
2645 		for (size_t i = 0; i < to_clear_write_mask.size(); i += 4096)
2646 		{
2647 			size_t to_copy = std::min(to_clear_write_mask.size() - i, size_t(4096));
2648 			memcpy(cmd.allocate_typed_constant_data<uint32_t>(1, 0, to_copy),
2649 				   to_clear_write_mask.data() + i,
2650 				   to_copy * sizeof(uint32_t));
2651 			cmd.dispatch(to_copy, 1, 1);
2652 		}
2653 	}
2654 
2655 	if (!to_clear_write_mask.empty() || !masked_page_copies.empty())
2656 	{
2657 		cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
2658 		            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
2659 	}
2660 
2661 	// If we cannot map the device memory, use the copy queue.
2662 	if (!buffer_copies.empty())
2663 	{
2664 		cmd.barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
2665 		            VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
2666 
2667 //#define COHERENCY_COPY_TIMESTAMPS
2668 #ifdef COHERENCY_COPY_TIMESTAMPS
2669 		Vulkan::QueryPoolHandle start_ts, end_ts;
		start_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
2671 #endif
2672 		cmd.copy_buffer(*rdram, *incoherent.staging_rdram, buffer_copies.data(), buffer_copies.size());
2673 #ifdef COHERENCY_COPY_TIMESTAMPS
		end_ts = cmd.write_timestamp(VK_PIPELINE_STAGE_TRANSFER_BIT);
		device->register_time_interval("RDP GPU", std::move(start_ts), std::move(end_ts), "coherent-copy");
2676 #endif
2677 
2678 		cmd.barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
2679 		             VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
2680 	}
2681 
2682 	if (caps.timestamp)
2683 	{
2684 		end_ts = device->write_calibrated_timestamp();
2685 		device->register_time_interval("RDP CPU", std::move(start_ts), std::move(end_ts), "coherency-host-to-gpu");
2686 	}
2687 
2688 	cmd.end_region();
2689 }
2690 
void Renderer::flush_queues()
2692 {
2693 	if (stream.tmem_upload_infos.empty() && stream.span_info_jobs.empty())
2694 	{
2695 		base_primitive_index += stream.triangle_setup.size();
2696 		reset_context();
2697 		return;
2698 	}
2699 
2700 	if (!is_host_coherent)
2701 	{
2702 		mark_pages_for_gpu_read(fb.addr, get_byte_size_for_bound_color_framebuffer());
2703 		mark_pages_for_gpu_read(fb.depth_addr, get_byte_size_for_bound_depth_framebuffer());
2704 
2705 		// We're going to write to these pages, so lock them down.
2706 		lock_pages_for_gpu_write(fb.addr, get_byte_size_for_bound_color_framebuffer());
2707 		lock_pages_for_gpu_write(fb.depth_addr, get_byte_size_for_bound_depth_framebuffer());
2708 	}
2709 
2710 	auto &instance = buffer_instances[buffer_instance];
2711 	auto &sync = internal_sync[buffer_instance];
2712 	if (sync_indices_needs_flush & (1u << buffer_instance))
2713 		submit_to_queue();
2714 	sync_indices_needs_flush |= 1u << buffer_instance;
2715 
2716 	if (sync.fence)
2717 	{
2718 		Vulkan::QueryPoolHandle start_ts, end_ts;
2719 		if (caps.timestamp)
2720 			start_ts = device->write_calibrated_timestamp();
2721 		sync.fence->wait();
2722 		if (caps.timestamp)
2723 		{
2724 			end_ts = device->write_calibrated_timestamp();
2725 			device->register_time_interval("RDP CPU", std::move(start_ts), std::move(end_ts), "render-pass-fence");
2726 		}
2727 		sync.fence.reset();
2728 	}
2729 
2730 	ensure_command_buffer();
2731 
2732 	if (!is_host_coherent)
2733 		resolve_coherency_host_to_gpu(*stream.cmd);
2734 	instance.upload(*device, stream, *stream.cmd);
2735 
2736 	stream.cmd->begin_region("render-pass-1x");
2737 	submit_render_pass(*stream.cmd);
2738 	stream.cmd->end_region();
2739 	pending_render_passes++;
2740 
2741 	if (render_pass_is_upscaled())
2742 	{
2743 		maintain_queues();
2744 		ensure_command_buffer();
		// We're going to keep reading the same data structures, so make sure
		// we signal the fence after the upscaled render pass is submitted.
2747 		sync_indices_needs_flush |= 1u << buffer_instance;
2748 		submit_render_pass_upscaled(*stream.cmd);
2749 		pending_render_passes_upscaled++;
2750 		pending_primitives_upscaled += uint32_t(stream.triangle_setup.size());
2751 	}
2752 
2753 	submit_render_pass_end(*stream.cmd);
2754 
2755 	begin_new_context();
2756 	maintain_queues();
2757 }
2758 
bool Renderer::render_pass_is_upscaled() const
2760 {
2761 	bool need_render_pass = fb.width != 0 && fb.deduced_height != 0 && !stream.span_info_jobs.empty();
2762 	return caps.upscaling > 1 && need_render_pass && should_render_upscaled();
2763 }
2764 
bool Renderer::should_render_upscaled() const
2766 {
	// A heuristic. There is no point in rendering upscaled for purely off-screen passes.
	// We should ideally only upscale the final pass which hits the screen.
	// From a heuristic point of view we expect only 16-bit/32-bit frame buffers to be relevant,
	// and only frame buffers that are at least 256 pixels wide.
2771 	return (fb.fmt == FBFormat::RGBA5551 || fb.fmt == FBFormat::RGBA8888) && fb.width >= 256;
2772 }
2773 
void Renderer::ensure_command_buffer()
2775 {
2776 	if (!stream.cmd)
2777 		stream.cmd = device->request_command_buffer(Vulkan::CommandBuffer::Type::AsyncCompute);
2778 
2779 	if (!caps.ubershader && !indirect_dispatch_buffer)
2780 	{
2781 		Vulkan::BufferCreateInfo indirect_info = {};
2782 		indirect_info.size = 4 * sizeof(uint32_t) * Limits::MaxStaticRasterizationStates;
2783 		indirect_info.domain = Vulkan::BufferDomain::Device;
2784 		indirect_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
2785 
2786 		indirect_dispatch_buffer = device->create_buffer(indirect_info);
2787 		device->set_name(*indirect_dispatch_buffer, "indirect-dispatch-buffer");
2788 
2789 		clear_indirect_buffer(*stream.cmd);
2790 		stream.cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT,
2791 		                    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
2792 	}
2793 }
2794 
void Renderer::set_tile(uint32_t tile, const TileMeta &meta)
2796 {
2797 	tiles[tile].meta = meta;
2798 }
2799 
void Renderer::set_tile_size(uint32_t tile, uint32_t slo, uint32_t shi, uint32_t tlo, uint32_t thi)
2801 {
2802 	tiles[tile].size.slo = slo;
2803 	tiles[tile].size.shi = shi;
2804 	tiles[tile].size.tlo = tlo;
2805 	tiles[tile].size.thi = thi;
2806 }
2807 
void Renderer::notify_idle_command_thread()
2809 {
2810 	maintain_queues_idle();
2811 }
2812 
bool Renderer::tmem_upload_needs_flush(uint32_t addr) const
2814 {
2815 	// Not perfect, since TMEM upload could slice into framebuffer,
2816 	// but I doubt this will be an issue (famous last words ...)
2817 
2818 	if (fb.color_write_pending)
2819 	{
2820 		uint32_t offset = (addr - fb.addr) & (rdram_size - 1);
2821 		uint32_t pending_pixels = fb.deduced_height * fb.width;
2822 
2823 		switch (fb.fmt)
2824 		{
2825 		case FBFormat::RGBA5551:
2826 		case FBFormat::I8:
2827 			offset >>= 1;
2828 			break;
2829 
2830 		case FBFormat::RGBA8888:
2831 			offset >>= 2;
2832 			break;
2833 
2834 		default:
2835 			break;
2836 		}
2837 
2838 		if (offset < pending_pixels)
2839 		{
2840 			//LOGI("Flushing render pass due to coherent TMEM fetch from color buffer.\n");
2841 			return true;
2842 		}
2843 	}
2844 
2845 	if (fb.depth_write_pending)
2846 	{
2847 		uint32_t offset = (addr - fb.depth_addr) & (rdram_size - 1);
2848 		uint32_t pending_pixels = fb.deduced_height * fb.width;
2849 		offset >>= 1;
2850 
2851 		if (offset < pending_pixels)
2852 		{
2853 			//LOGI("Flushing render pass due to coherent TMEM fetch from depth buffer.\n");
2854 			return true;
2855 		}
2856 	}
2857 
2858 	return false;
2859 }
2860 
void Renderer::load_tile(uint32_t tile, const LoadTileInfo &info)
2862 {
2863 	if (tmem_upload_needs_flush(info.tex_addr))
2864 		flush_queues();
2865 
2866 	// Detect noop cases.
2867 	if (info.mode != UploadMode::Block)
2868 	{
2869 		if ((info.thi >> 2) < (info.tlo >> 2))
2870 			return;
2871 
2872 		unsigned pixel_count = (((info.shi >> 2) - (info.slo >> 2)) + 1) & 0xfff;
2873 		if (!pixel_count)
2874 			return;
2875 	}
2876 	else
2877 	{
2878 		unsigned pixel_count = ((info.shi - info.slo) + 1) & 0xfff;
2879 		if (!pixel_count)
2880 			return;
2881 	}
2882 
2883 	if (!is_host_coherent)
2884 	{
2885 		unsigned pixel_count;
2886 		unsigned offset_pixels;
2887 		unsigned base_addr = info.tex_addr;
2888 
2889 		if (info.mode == UploadMode::Block)
2890 		{
2891 			pixel_count = (info.shi - info.slo + 1) & 0xfff;
2892 			offset_pixels = info.slo + info.tex_width * info.tlo;
2893 		}
2894 		else
2895 		{
2896 			unsigned max_x = ((info.shi >> 2) - (info.slo >> 2)) & 0xfff;
2897 			unsigned max_y = (info.thi >> 2) - (info.tlo >> 2);
2898 			pixel_count = max_y * info.tex_width + max_x + 1;
2899 			offset_pixels = (info.slo >> 2) + info.tex_width * (info.tlo >> 2);
2900 		}
2901 
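		// Compute a conservative byte range to mark for GPU read. The shift below assumes
		// the TextureSize enum encodes 8/16/32 bpp as 1/2/3, i.e. bytes = pixels << (size - 1);
		// 4 bpp uploads are rejected later in load_tile_iteration().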
2902 		unsigned byte_size = pixel_count << (unsigned(info.size) - 1);
2903 		byte_size = (byte_size + 7) & ~7;
2904 		base_addr += offset_pixels << (unsigned(info.size) - 1);
2905 		mark_pages_for_gpu_read(base_addr, byte_size);
2906 	}
2907 
2908 	if (info.mode == UploadMode::Tile)
2909 	{
2910 		auto &meta = tiles[tile].meta;
		unsigned pixels_covered_per_line = (((info.shi >> 2) - (info.slo >> 2)) + 1) & 0xfff;
2912 
2913 		if (meta.fmt == TextureFormat::YUV)
			pixels_covered_per_line *= 2;
2915 
2916 		// Technically, 32-bpp TMEM upload and YUV upload will work like 16bpp, just split into two halves, but that also means
2917 		// we get 2kB wraparound instead of 4kB wraparound, so this works out just fine for our purposes.
		unsigned quad_words_covered_per_line = ((pixels_covered_per_line << unsigned(meta.size)) + 15) >> 4;
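		// The quad-word count is a rounded-up estimate in 8-byte units per line; it is
		// clamped below to at least meta.stride so the conservative byte estimate never
		// undershoots the per-line TMEM advance.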
2919 
		// Deal with a mismatch in state; there is no reasonable scenario where this should even matter, but you never know ...
2921 		if (unsigned(meta.size) > unsigned(info.size))
2922 			quad_words_covered_per_line <<= unsigned(meta.size) - unsigned(info.size);
2923 		else if (unsigned(meta.size) < unsigned(info.size))
2924 			quad_words_covered_per_line >>= unsigned(info.size) - unsigned(meta.size);
2925 
2926 		// Compute a conservative estimate for how many bytes we're going to splat down into TMEM.
2927 		unsigned bytes_covered_per_line = std::max<unsigned>(quad_words_covered_per_line * 8, meta.stride);
2928 
2929 		unsigned num_lines = ((info.thi >> 2) - (info.tlo >> 2)) + 1;
2930 		unsigned total_bytes_covered = bytes_covered_per_line * num_lines;
2931 
2932 		if (total_bytes_covered > 0x1000)
2933 		{
2934 			// Welp, for whatever reason, the game wants to write more than 4k of texture data to TMEM in one go.
2935 			// We can only handle 4kB in one go due to wrap-around effects,
2936 			// so split up the upload in multiple chunks.
2937 
2938 			unsigned max_lines_per_iteration = 0x1000u / bytes_covered_per_line;
2939 			// Align T-state.
2940 			max_lines_per_iteration &= ~1u;
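			// Example: bytes_covered_per_line = 0x300 gives 0x1000 / 0x300 = 5 lines, aligned down to 4,
			// so a 10-line upload is split into chunks of 4, 4 and 2 lines. The even alignment presumably
			// keeps the odd/even TMEM line interleave consistent across chunks.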

			if (max_lines_per_iteration == 0)
			{
				LOGE("Pure insanity where content is attempting to load more than 2kB of TMEM data in a single line ...\n");
				// Could be supported if we start splitting up the horizontal direction as well, but seriously ...
				return;
			}

			for (unsigned line = 0; line < num_lines; line += max_lines_per_iteration)
			{
				unsigned to_copy_lines = std::min(num_lines - line, max_lines_per_iteration);

				LoadTileInfo tmp_info = info;
				tmp_info.tlo = info.tlo + (line << 2);
				tmp_info.thi = tmp_info.tlo + ((to_copy_lines - 1) << 2);
				load_tile_iteration(tile, tmp_info, line * meta.stride);
			}

			auto &size = tiles[tile].size;
			size.slo = info.slo;
			size.shi = info.shi;
			size.tlo = info.tlo;
			size.thi = info.thi;
		}
		else
			load_tile_iteration(tile, info, 0);
	}
	else
		load_tile_iteration(tile, info, 0);
}

void Renderer::load_tile_iteration(uint32_t tile, const LoadTileInfo &info, uint32_t tmem_offset)
{
	auto &size = tiles[tile].size;
	auto &meta = tiles[tile].meta;
	size.slo = info.slo;
	size.shi = info.shi;
	size.tlo = info.tlo;
	size.thi = info.thi;

	if (meta.fmt == TextureFormat::YUV && ((meta.size != TextureSize::Bpp16) || (info.size != TextureSize::Bpp16)))
	{
		LOGE("Only 16bpp is supported for YUV uploads.\n");
		return;
	}

	// This case does not appear to be supported.
	if (info.size == TextureSize::Bpp4)
	{
		LOGE("4-bit VRAM pointer crashes the RDP.\n");
		return;
	}

	if (meta.size == TextureSize::Bpp32 && meta.fmt != TextureFormat::RGBA)
	{
		LOGE("32bpp tile uploads must use the RGBA texture format; other formats are unsupported.\n");
		return;
	}

	if (info.mode == UploadMode::TLUT && meta.size == TextureSize::Bpp32)
	{
		LOGE("TLUT uploads with 32bpp tiles are unsupported.\n");
		return;
	}

	if (info.mode != UploadMode::TLUT)
	{
		if (info.size == TextureSize::Bpp32 && meta.size == TextureSize::Bpp8)
		{
			LOGE("FIXME: Loading tile with Texture 32-bit and Tile 8-bit. This creates insane results, unsupported.\n");
			return;
		}
		else if (info.size == TextureSize::Bpp16 && meta.size == TextureSize::Bpp4)
		{
			LOGE("FIXME: Loading tile with Texture 16-bit and Tile 4-bit. This creates insane results, unsupported.\n");
			return;
		}
		else if (info.size == TextureSize::Bpp32 && meta.size == TextureSize::Bpp4)
		{
			LOGE("FIXME: Loading tile with Texture 32-bit and Tile 4-bit. This creates insane results, unsupported.\n");
			return;
		}
	}

	UploadInfo upload = {};
	upload.tmem_stride_words = meta.stride >> 1;

	uint32_t upload_x = 0;
	uint32_t upload_y = 0;

	auto upload_mode = info.mode;

	if (upload_mode == UploadMode::Block)
	{
		upload_x = info.slo;
		upload_y = info.tlo;

		// LoadBlock is kinda awkward. Rather than specifying width and height, we get width and dTdx.
		// dTdx will increment and generate a T coordinate based on the S coordinate (T = (S_64bpp_word * dTdx) >> 11).
		// The stride is added on top of this, so the effective stride is stride(T) + stride(tile).
		// Usually it makes sense for stride(tile) to be 0, but it doesn't have to be ...
		// The only reasonable solution is to try to decompose this mess into a normal width/height/stride.
		// In the general dTdx case, we might not be able to deduce a stable value for stride.
		// If dTdx is very weird, we might get variable stride, which is near-impossible to deal with.
		// However, it makes zero sense for content to actually rely on this behavior.
		// Even if there are inaccuracies in the fraction, we always floor it to get T, and thus we'll have to run
		// for quite some time to observe the fractional error accumulate.
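		// Illustrative example (hypothetical values): with 16bpp texels and dTdx = 0x100, T advances by one
		// every 2048 / 0x100 = 8 64-bit words, i.e. every 32 texels, so the block decomposes into 32-texel lines.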

		unsigned pixel_count = (info.shi - info.slo + 1) & 0xfff;

		unsigned dt = info.thi;

		unsigned max_tmem_iteration = (pixel_count - 1) >> (4u - unsigned(info.size));
		unsigned max_t = (max_tmem_iteration * dt) >> 11;

		if (max_t != 0)
		{
			// dT is an inverse which is not necessarily accurate, so we can end up with an uneven number of
			// texels per "line". If we have stride == 0, this is fairly easy to deal with,
			// but for the case where stride != 0, it is very difficult to implement correctly.
			// We will need to solve this kind of equation for X:

			// TMEM word = floor((x * dt) / 2048) * stride + x
			// This equation has no solutions for cases where we stride over TMEM words.
			// The only way I can think of is to test all candidates for the floor() expression, and see if that is a valid solution.
			// We can find a conservative estimate for floor() by:
			// t_min = TMEM word / (max_num_64bpp_elements + stride)
			// t_max = TMEM word / (min_num_64bpp_elements + stride)
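			// (Guessing floor(...) = t for some t in [t_min, t_max] turns the equation into
			// x = TMEM word - t * stride, which can then be validated by checking floor((x * dt) / 2048) == t.)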
			unsigned max_num_64bpp_elements_before_wrap = ((1u << 11u) + dt - 1u) / dt;
			unsigned min_num_64bpp_elements_before_wrap = (1u << 11u) / dt;

			bool uneven_dt = max_num_64bpp_elements_before_wrap != min_num_64bpp_elements_before_wrap;

			if (uneven_dt)
			{
				// If we never get rounding errors, we can handwave this issue away and pretend that min == max iterations.
				// This is by far the common case.

				// Each overflow into the next T adds a certain amount of error.
				unsigned overflow_amt = dt * max_num_64bpp_elements_before_wrap - (1 << 11);

				// Multiply this by the maximum value of T we can observe, and we have a conservative estimate for our T error.
				overflow_amt *= max_t;

				// If this error is less than 1 step of dt, we can be certain that we will get max_num iterations every time,
				// and we can ignore the worst edge cases.
				if (overflow_amt < dt)
				{
					min_num_64bpp_elements_before_wrap = max_num_64bpp_elements_before_wrap;
					uneven_dt = false;
				}
			}

			// Add more precision bits to DXT. We might have to shift it down if we have a meta.size fixup down below.
			// Also makes the right shift nicer (16 vs 11).
			upload.dxt = dt << 5;

			if (meta.size == TextureSize::Bpp32 || meta.fmt == TextureFormat::YUV)
			{
				// We iterate twice for Bpp32 and YUV to complete a 64bpp word.
				upload.tmem_stride_words <<= 1;

				// Pure, utter insanity, but no content should *ever* hit this ...
				if (uneven_dt && meta.size != info.size)
				{
					LOGE("Got uneven_dt, and texture size != tile size.\n");
					return;
				}
			}

			// If TMEM and VRAM bpp misalign, we need to fix this up since we step too fast or too slow.
			if (unsigned(meta.size) > unsigned(info.size))
			{
				unsigned shamt = unsigned(meta.size) - unsigned(info.size);
				max_num_64bpp_elements_before_wrap <<= shamt;
				min_num_64bpp_elements_before_wrap <<= shamt;
				// Need to step slower so we can handle the added striding.
				upload.dxt >>= shamt;
			}
			else if (unsigned(info.size) > unsigned(meta.size))
			{
				// Here we step multiple times over the same pixel, but potentially with different T state,
				// since dTdx applies between the iterations.
				// Horrible, horrible mess ...
				LOGE("LoadBlock: VRAM bpp size is larger than tile bpp. This is unsupported.\n");
				return;
			}

			unsigned max_line_stride_64bpp = max_num_64bpp_elements_before_wrap + (upload.tmem_stride_words >> 2);
			unsigned min_line_stride_64bpp = min_num_64bpp_elements_before_wrap + (upload.tmem_stride_words >> 2);

			// Multiplying the 64bpp TMEM word by these gives us lower and upper bounds for T.
			// These serve as candidate expressions for floor().
			float min_t_mod = 1.0f / float(max_line_stride_64bpp);
			float max_t_mod = 1.0f / float(min_line_stride_64bpp);
			upload.min_t_mod = min_t_mod;
			upload.max_t_mod = max_t_mod;

			upload.width = pixel_count;
			upload.height = 1;
			upload.tmem_stride_words >>= 2; // Stride in 64bpp instead of 16bpp.
		}
		else
		{
			// We never trigger a case where T is non-zero, so this is equivalent to a Tile upload.
			upload.width = pixel_count;
			upload.height = 1;
			upload.tmem_stride_words = 0;
			upload_mode = UploadMode::Tile;
		}
	}
	else
	{
		upload_x = info.slo >> 2;
		upload_y = info.tlo >> 2;
		upload.width = (((info.shi >> 2) - (info.slo >> 2)) + 1) & 0xfff;
		upload.height = ((info.thi >> 2) - (info.tlo >> 2)) + 1;
	}

	if (!upload.width)
		return;

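	// Round the VRAM-side width up to the 64-bit fetch granularity of the source pointer:
	// 8 texels at 8bpp, 4 at 16bpp, 2 at 32bpp. TLUT uploads read one 16-bit entry per iteration instead.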
	switch (info.size)
	{
	case TextureSize::Bpp8:
		upload.vram_effective_width = (upload.width + 7) & ~7;
		break;

	case TextureSize::Bpp16:
		// With a 16-bit VRAM pointer and a TLUT upload, we iterate one texel at a time, not 4.
		if (upload_mode == UploadMode::TLUT)
			upload.vram_effective_width = upload.width;
		else
			upload.vram_effective_width = (upload.width + 3) & ~3;
		break;

	case TextureSize::Bpp32:
		upload.vram_effective_width = (upload.width + 1) & ~1;
		break;

	default:
		break;
	}

	// Uploads happen in chunks of 8 bytes in groups of 4x16-bits.
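	// The rounding below turns upload.width from texels into 16-bit TMEM words written per line
	// (for 32bpp and YUV this counts words per TMEM half, since those formats split across the low and high halves).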
	switch (meta.size)
	{
	case TextureSize::Bpp4:
		upload.width = (upload.width + 15) & ~15;
		upload.width >>= 2;
		break;

	case TextureSize::Bpp8:
		upload.width = (upload.width + 7) & ~7;
		upload.width >>= 1;
		break;

	case TextureSize::Bpp16:
		upload.width = (upload.width + 3) & ~3;
		// Consider YUV uploads to be 32bpp since that's kinda what they are.
		if (meta.fmt == TextureFormat::YUV)
			upload.width >>= 1;
		break;

	case TextureSize::Bpp32:
		upload.width = (upload.width + 1) & ~1;
		break;

	default:
		LOGE("Unimplemented!\n");
		break;
	}

	if (upload.height > 1 && upload_mode == UploadMode::TLUT)
	{
		LOGE("Load TLUT with height > 1 is not supported.\n");
		return;
	}

	upload.vram_addr = info.tex_addr + ((info.tex_width * upload_y + upload_x) << (unsigned(info.size) - 1));
	upload.vram_width = upload_mode == UploadMode::Block ? upload.vram_effective_width : info.tex_width;
	upload.vram_size = int32_t(info.size);

	upload.tmem_offset = (meta.offset + tmem_offset) & 0xfff;
	upload.tmem_size = int32_t(meta.size);
	upload.tmem_fmt = int32_t(meta.fmt);
	upload.mode = int32_t(upload_mode);

	upload.inv_tmem_stride_words = 1.0f / float(upload.tmem_stride_words);

	stream.tmem_upload_infos.push_back(upload);
	if (stream.tmem_upload_infos.size() + 1 >= Limits::MaxTMEMInstances)
		flush_queues();
}

void Renderer::set_blend_color(uint32_t color)
{
	constants.blend_color = color;
}

void Renderer::set_fog_color(uint32_t color)
{
	constants.fog_color = color;
}

void Renderer::set_env_color(uint32_t color)
{
	constants.env_color = color;
}

void Renderer::set_fill_color(uint32_t color)
{
	constants.fill_color = color;
}

void Renderer::set_primitive_depth(uint16_t prim_depth, uint16_t prim_dz)
{
	constants.prim_depth = int32_t(prim_depth & 0x7fff) << 16;
	constants.prim_dz = prim_dz;
}

void Renderer::set_enable_primitive_depth(bool enable)
{
	constants.use_prim_depth = enable;
}

void Renderer::set_convert(uint16_t k0, uint16_t k1, uint16_t k2, uint16_t k3, uint16_t k4, uint16_t k5)
{
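	// K0-K3 are signed 9-bit coefficients for YUV->RGB conversion; hardware effectively applies them
	// as 2 * K + 1, so that form is precomputed here. K4 and K5 are used as-is.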
	constants.convert[0] = 2 * sext<9>(k0) + 1;
	constants.convert[1] = 2 * sext<9>(k1) + 1;
	constants.convert[2] = 2 * sext<9>(k2) + 1;
	constants.convert[3] = 2 * sext<9>(k3) + 1;
	constants.convert[4] = k4;
	constants.convert[5] = k5;
}

void Renderer::set_color_key(unsigned component, uint32_t width, uint32_t center, uint32_t scale)
{
	constants.key_width[component] = width;
	constants.key_center[component] = center;
	constants.key_scale[component] = scale;
}

void Renderer::set_primitive_color(uint8_t min_level, uint8_t prim_lod_frac, uint32_t color)
{
	constants.primitive_color = color;
	constants.min_level = min_level;
	constants.prim_lod_frac = prim_lod_frac;
}

bool Renderer::can_support_minimum_subgroup_size(unsigned size) const
{
	return supports_subgroup_size_control(size, device->get_device_features().subgroup_properties.subgroupSize);
}

bool Renderer::supports_subgroup_size_control(uint32_t minimum_size, uint32_t maximum_size) const
{
	auto &features = device->get_device_features();

	if (!features.subgroup_size_control_features.computeFullSubgroups)
		return false;

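	// Either the device's full [minSubgroupSize, maxSubgroupSize] range fits within the requested range,
	// in which case a varying subgroup size is acceptable, or we must be able to pin the subgroup size
	// with requiredSubgroupSize, which must be supported for compute shaders.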
	bool use_varying = minimum_size <= features.subgroup_size_control_properties.minSubgroupSize &&
	                   maximum_size >= features.subgroup_size_control_properties.maxSubgroupSize;

	if (!use_varying)
	{
		bool outside_range = minimum_size > features.subgroup_size_control_properties.maxSubgroupSize ||
		                     maximum_size < features.subgroup_size_control_properties.minSubgroupSize;
		if (outside_range)
			return false;

		if ((features.subgroup_size_control_properties.requiredSubgroupSizeStages & VK_SHADER_STAGE_COMPUTE_BIT) == 0)
			return false;
	}

	return true;
}

void Renderer::PipelineExecutor::perform_work(const Vulkan::DeferredPipelineCompile &compile) const
{
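	// Runs on the pipeline worker thread: build the compute pipeline up front and record the time taken
	// with calibrated timestamps so compilation cost shows up in the device's time-interval report.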
	auto start_ts = device->write_calibrated_timestamp();
	Vulkan::CommandBuffer::build_compute_pipeline(device, compile);
	auto end_ts = device->write_calibrated_timestamp();
	device->register_time_interval("RDP Pipeline", std::move(start_ts), std::move(end_ts),
	                               "pipeline-compilation", std::to_string(compile.hash));
}

bool Renderer::PipelineExecutor::is_sentinel(const Vulkan::DeferredPipelineCompile &compile) const
{
	return compile.hash == 0;
}

void Renderer::PipelineExecutor::notify_work_locked(const Vulkan::DeferredPipelineCompile &) const
{
}
}