1 /* Copyright (c) 2020 Themaister
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining
4  * a copy of this software and associated documentation files (the
5  * "Software"), to deal in the Software without restriction, including
6  * without limitation the rights to use, copy, modify, merge, publish,
7  * distribute, sublicense, and/or sell copies of the Software, and to
8  * permit persons to whom the Software is furnished to do so, subject to
9  * the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "rdp_device.hpp"
24 #include "rdp_common.hpp"
25 #include <chrono>
26 
27 #ifdef __SSE2__
28 #include <emmintrin.h>
29 #endif
30 
31 #ifndef PARALLEL_RDP_SHADER_DIR
32 #include "shaders/slangmosh.hpp"
33 #endif
34 
35 using namespace Vulkan;
36 
// Clear the `mask` bits in `flag`, then set them again iff `cond` is true.
// Wrapped in do { } while(0) so it acts as a single statement after `if`.
#define STATE_MASK(flag, cond, mask) do { \
    (flag) &= ~(mask); \
    if (cond) (flag) |= (mask); \
} while(0)
41 
42 namespace RDP
43 {
CommandProcessor(Vulkan::Device & device_,void * rdram_ptr,size_t rdram_offset_,size_t rdram_size_,size_t hidden_rdram_size,CommandProcessorFlags flags_)44 CommandProcessor::CommandProcessor(Vulkan::Device &device_, void *rdram_ptr,
45                                    size_t rdram_offset_, size_t rdram_size_, size_t hidden_rdram_size,
46                                    CommandProcessorFlags flags_)
47 	: device(device_), rdram_offset(rdram_offset_), rdram_size(rdram_size_), flags(flags_), renderer(*this),
48 #ifdef PARALLEL_RDP_SHADER_DIR
49 	  timeline_worker(Granite::Global::create_thread_context(), FenceExecutor{&device, &thread_timeline_value})
50 #else
51 	  timeline_worker(FenceExecutor{&device, &thread_timeline_value})
52 #endif
53 {
54 	BufferCreateInfo info = {};
55 	info.size = rdram_size;
56 	info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
57 	info.domain = BufferDomain::CachedCoherentHostPreferCached;
58 	info.misc = BUFFER_MISC_ZERO_INITIALIZE_BIT;
59 
60 	if (rdram_ptr)
61 	{
62 		bool allow_memory_host = true;
63 		if (const char *env = getenv("PARALLEL_RDP_ALLOW_EXTERNAL_HOST"))
64 			allow_memory_host = strtol(env, nullptr, 0) > 0;
65 
66 		if (allow_memory_host && device.get_device_features().supports_external_memory_host)
67 		{
68 			size_t import_size = rdram_size + rdram_offset;
69 			size_t align = device.get_device_features().host_memory_properties.minImportedHostPointerAlignment;
70 			import_size = (import_size + align - 1) & ~(align - 1);
71 			info.size = import_size;
72 			rdram = device.create_imported_host_buffer(info, VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, rdram_ptr);
73 		}
74 		else
75 		{
76 			LOGW("VK_EXT_external_memory_host is not supported on this device. Falling back to a slower path.\n");
77 			is_host_coherent = false;
78 			rdram_offset = 0;
79 			host_rdram = static_cast<uint8_t *>(rdram_ptr) + rdram_offset_;
80 
81 			BufferCreateInfo device_rdram = {};
82 			device_rdram.size = rdram_size * 2; // Need twice the memory amount so we can also store a writemask.
83 			device_rdram.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT |
84 			                     VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
85 			                     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
86 
87 			if (device.get_gpu_properties().deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
88 				device_rdram.domain = BufferDomain::CachedCoherentHostPreferCached;
89 			else
90 				device_rdram.domain = BufferDomain::Device;
91 
92 			device_rdram.misc = BUFFER_MISC_ZERO_INITIALIZE_BIT;
93 			rdram = device.create_buffer(device_rdram);
94 		}
95 	}
96 	else
97 		rdram = device.create_buffer(info);
98 
99 	if (!rdram)
100 		LOGE("Failed to allocate RDRAM.\n");
101 
102 	info.size = hidden_rdram_size;
103 	// Should be CachedHost, but seeing some insane bug on incoherent Arm systems for time being,
104 	// so just forcing coherent memory here for now. Not sure what is going on.
105 	info.domain = (flags & COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_HIDDEN_RDRAM_BIT) != 0 ?
106 	              BufferDomain::CachedCoherentHostPreferCoherent : BufferDomain::Device;
107 	info.misc = 0;
108 	hidden_rdram = device.create_buffer(info);
109 
110 	info.size = 0x1000;
111 	info.domain = (flags & COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_TMEM_BIT) != 0 ?
112 	              BufferDomain::CachedCoherentHostPreferCoherent : BufferDomain::Device;
113 	tmem = device.create_buffer(info);
114 
115 	clear_hidden_rdram();
116 	clear_tmem();
117 	init_renderer();
118 
119 	if (const char *env = getenv("PARALLEL_RDP_BENCH"))
120 	{
121 		measure_stall_time = strtol(env, nullptr, 0) > 0;
122 		if (measure_stall_time)
123 			LOGI("Will measure stall timings.\n");
124 	}
125 
126 	if (const char *env = getenv("PARALLEL_RDP_SINGLE_THREADED_COMMAND"))
127 	{
128 		single_threaded_processing = strtol(env, nullptr, 0) > 0;
129 		if (single_threaded_processing)
130 			LOGI("Will use single threaded command processing.\n");
131 	}
132 
133 	if (!single_threaded_processing)
134 	{
135 		ring.init(
136 #ifdef PARALLEL_RDP_SHADER_DIR
137 				Granite::Global::create_thread_context(),
138 #endif
139 				this, 4 * 1024);
140 	}
141 
142 	if (const char *env = getenv("PARALLEL_RDP_BENCH"))
143 		timestamp = strtol(env, nullptr, 0) > 0;
144 }
145 
CommandProcessor::~CommandProcessor()
{
	// Drain all pending work and wait for completion before members
	// (buffers, renderer, worker threads) are torn down.
	idle();
}
150 
void CommandProcessor::begin_frame_context()
{
	// Flush queued rendering work, wait for the command ring to drain,
	// then advance the device's frame context so per-frame resources recycle.
	flush();
	drain_command_ring();
	device.next_frame_context();
}
157 
init_renderer()158 void CommandProcessor::init_renderer()
159 {
160 	if (!rdram)
161 	{
162 		is_supported = false;
163 		return;
164 	}
165 
166 	renderer.set_device(&device);
167 	renderer.set_rdram(rdram.get(), host_rdram, rdram_offset, rdram_size, is_host_coherent);
168 	renderer.set_hidden_rdram(hidden_rdram.get());
169 	renderer.set_tmem(tmem.get());
170 
171 	unsigned factor = 1;
172 	if (flags & COMMAND_PROCESSOR_FLAG_UPSCALING_8X_BIT)
173 		factor = 8;
174 	else if (flags & COMMAND_PROCESSOR_FLAG_UPSCALING_4X_BIT)
175 		factor = 4;
176 	else if (flags & COMMAND_PROCESSOR_FLAG_UPSCALING_2X_BIT)
177 		factor = 2;
178 
179 	if (factor != 1)
180 		LOGI("Enabling upscaling: %ux.\n", factor);
181 
182 	RendererOptions opts;
183 	opts.upscaling_factor = factor;
184 
185 	is_supported = renderer.init_renderer(opts);
186 
187 	vi.set_device(&device);
188 	vi.set_rdram(rdram.get(), rdram_offset, rdram_size);
189 	vi.set_hidden_rdram(hidden_rdram.get());
190 	vi.set_renderer(&renderer);
191 
192 #ifndef PARALLEL_RDP_SHADER_DIR
193 	shader_bank.reset(new ShaderBank(device, [&](const char *name, const char *define) -> int {
194 		if (strncmp(name, "vi_", 3) == 0)
195 			return vi.resolve_shader_define(name, define);
196 		else
197 			return renderer.resolve_shader_define(name, define);
198 	}));
199 	renderer.set_shader_bank(shader_bank.get());
200 	vi.set_shader_bank(shader_bank.get());
201 #endif
202 }
203 
// Returns whether renderer initialization succeeded (set in init_renderer()).
bool CommandProcessor::device_is_supported() const
{
	return is_supported;
}
208 
void CommandProcessor::clear_hidden_rdram()
{
	// Reset hidden RDRAM to 0x03 in every byte. All four bytes of the fill
	// value are identical, so both clear_buffer paths (GPU fill / memset)
	// produce the same result.
	clear_buffer(*hidden_rdram, 0x03030303);
}
213 
void CommandProcessor::clear_tmem()
{
	// Zero-initialize the 4 KiB TMEM buffer.
	clear_buffer(*tmem, 0);
}
218 
// Fill `buffer` with `value`. Device-local buffers use a GPU fill and block
// on a fence; host-visible buffers are cleared directly with memset.
void CommandProcessor::clear_buffer(Vulkan::Buffer &buffer, uint32_t value)
{
	if (!buffer.get_allocation().is_host_allocation())
	{
		auto cmd = device.request_command_buffer();
		cmd->fill_buffer(buffer, value);
		Fence fence;
		device.submit(cmd, &fence);
		fence->wait();
	}
	else
	{
		// NOTE(review): memset splats only the low byte, so this path matches
		// fill_buffer only when all four bytes of `value` are equal — true for
		// the two callers in this file (0 and 0x03030303).
		auto *mapped = device.map_host_buffer(buffer, MEMORY_ACCESS_WRITE_BIT);
		memset(mapped, value & 0xff, buffer.get_create_info().size);
		device.unmap_host_buffer(buffer, MEMORY_ACCESS_WRITE_BIT);
	}
}
236 
void CommandProcessor::op_sync_full(const uint32_t *)
{
	// SYNC_FULL: flush all outstanding renderer work and signal completion.
	renderer.flush_and_signal();
}
241 
// Decode the 8-word edge-walker portion of a triangle command into `setup`.
// Coordinates are fixed point; X edge values keep one extra fraction bit.
void CommandProcessor::decode_triangle_setup(TriangleSetup &setup, const uint32_t *words) const
{
	bool copy_cycle = (static_state.flags & RASTERIZATION_COPY_BIT) != 0;
	bool flip = (words[0] & 0x800000u) != 0;          // left/right major flag
	bool sign_dxhdy = (words[5] & 0x80000000u) != 0;  // sign of dxhdy before shifting
	bool do_offset = flip == sign_dxhdy;

	setup.flags |= flip ? TRIANGLE_SETUP_FLIP_BIT : 0;
	setup.flags |= do_offset ? TRIANGLE_SETUP_DO_OFFSET_BIT : 0;
	// COPY cycle ignores X fraction; native LOD is a per-game quirk toggle.
	setup.flags |= copy_cycle ? TRIANGLE_SETUP_SKIP_XFRAC_BIT : 0;
	setup.flags |= quirks.u.options.native_texture_lod ? TRIANGLE_SETUP_NATIVE_LOD_BIT : 0;

	setup.tile = (words[0] >> 16) & 63;

	// Y coordinates are signed 14-bit fields.
	setup.yl = sext<14>(words[0]);
	setup.ym = sext<14>(words[1] >> 16);
	setup.yh = sext<14>(words[1]);

	// The lower bit is ignored, so shift here to obtain an extra bit of subpixel precision.
	// This is very useful for upscaling, since we can obtain 8x before we overflow instead of 4x.
	setup.xl = sext<28>(words[2]) >> 1;
	setup.xh = sext<28>(words[4]) >> 1;
	setup.xm = sext<28>(words[6]) >> 1;
	setup.dxldy = sext<28>(words[3] >> 2) >> 1;
	setup.dxhdy = sext<28>(words[5] >> 2) >> 1;
	setup.dxmdy = sext<28>(words[7] >> 2) >> 1;
}
269 
decode_tex_setup(AttributeSetup & attr,const uint32_t * words)270 static void decode_tex_setup(AttributeSetup &attr, const uint32_t *words)
271 {
272 	attr.s = (words[0] & 0xffff0000u) | ((words[4] >> 16) & 0x0000ffffu);
273 	attr.t = ((words[0] << 16) & 0xffff0000u) | (words[4] & 0x0000ffffu);
274 	attr.w = (words[1] & 0xffff0000u) | ((words[5] >> 16) & 0x0000ffffu);
275 
276 	attr.dsdx = (words[2] & 0xffff0000u) | ((words[6] >> 16) & 0x0000ffffu);
277 	attr.dtdx = ((words[2] << 16) & 0xffff0000u) | (words[6] & 0x0000ffffu);
278 	attr.dwdx = (words[3] & 0xffff0000u) | ((words[7] >> 16) & 0x0000ffffu);
279 
280 	attr.dsde = (words[8] & 0xffff0000u) | ((words[12] >> 16) & 0x0000ffffu);
281 	attr.dtde = ((words[8] << 16) & 0xffff0000u) | (words[12] & 0x0000ffffu);
282 	attr.dwde = (words[9] & 0xffff0000u) | ((words[13] >> 16) & 0x0000ffffu);
283 
284 	attr.dsdy = (words[10] & 0xffff0000u) | ((words[14] >> 16) & 0x0000ffffu);
285 	attr.dtdy = ((words[10] << 16) & 0xffff0000u) | (words[14] & 0x0000ffffu);
286 	attr.dwdy = (words[11] & 0xffff0000u) | ((words[15] >> 16) & 0x0000ffffu);
287 }
288 
decode_rgba_setup(AttributeSetup & attr,const uint32_t * words)289 static void decode_rgba_setup(AttributeSetup &attr, const uint32_t *words)
290 {
291 	attr.r = (words[0] & 0xffff0000u) | ((words[4] >> 16) & 0xffff);
292 	attr.g = (words[0] << 16) | (words[4] & 0xffff);
293 	attr.b = (words[1] & 0xffff0000u) | ((words[5] >> 16) & 0xffff);
294 	attr.a = (words[1] << 16) | (words[5] & 0xffff);
295 
296 	attr.drdx = (words[2] & 0xffff0000u) | ((words[6] >> 16) & 0xffff);
297 	attr.dgdx = (words[2] << 16) | (words[6] & 0xffff);
298 	attr.dbdx = (words[3] & 0xffff0000u) | ((words[7] >> 16) & 0xffff);
299 	attr.dadx = (words[3] << 16) | (words[7] & 0xffff);
300 
301 	attr.drde = (words[8] & 0xffff0000u) | ((words[12] >> 16) & 0xffff);
302 	attr.dgde = (words[8] << 16) | (words[12] & 0xffff);
303 	attr.dbde = (words[9] & 0xffff0000u) | ((words[13] >> 16) & 0xffff);
304 	attr.dade = (words[9] << 16) | (words[13] & 0xffff);
305 
306 	attr.drdy = (words[10] & 0xffff0000u) | ((words[14] >> 16) & 0xffff);
307 	attr.dgdy = (words[10] << 16) | (words[14] & 0xffff);
308 	attr.dbdy = (words[11] & 0xffff0000u) | ((words[15] >> 16) & 0xffff);
309 	attr.dady = (words[11] << 16) | (words[15] & 0xffff);
310 }
311 
// Decode the 4-word depth coefficient block: Z and its per-x/edge/y deltas.
static void decode_z_setup(AttributeSetup &attr, const uint32_t *words)
{
	attr.z = words[0];
	attr.dzdx = words[1];
	attr.dzde = words[2];
	attr.dzdy = words[3];
}
319 
op_fill_triangle(const uint32_t * words)320 void CommandProcessor::op_fill_triangle(const uint32_t *words)
321 {
322 	TriangleSetup setup = {};
323 	decode_triangle_setup(setup, words);
324 	renderer.draw_flat_primitive(setup);
325 }
326 
op_shade_triangle(const uint32_t * words)327 void CommandProcessor::op_shade_triangle(const uint32_t *words)
328 {
329 	TriangleSetup setup = {};
330 	AttributeSetup attr = {};
331 	decode_triangle_setup(setup, words);
332 	decode_rgba_setup(attr, words + 8);
333 	renderer.draw_shaded_primitive(setup, attr);
334 }
335 
op_shade_z_buffer_triangle(const uint32_t * words)336 void CommandProcessor::op_shade_z_buffer_triangle(const uint32_t *words)
337 {
338 	TriangleSetup setup = {};
339 	AttributeSetup attr = {};
340 	decode_triangle_setup(setup, words);
341 	decode_rgba_setup(attr, words + 8);
342 	decode_z_setup(attr, words + 24);
343 	renderer.draw_shaded_primitive(setup, attr);
344 }
345 
op_shade_texture_z_buffer_triangle(const uint32_t * words)346 void CommandProcessor::op_shade_texture_z_buffer_triangle(const uint32_t *words)
347 {
348 	TriangleSetup setup = {};
349 	AttributeSetup attr = {};
350 	decode_triangle_setup(setup, words);
351 	decode_rgba_setup(attr, words + 8);
352 	decode_tex_setup(attr, words + 24);
353 	decode_z_setup(attr, words + 40);
354 	renderer.draw_shaded_primitive(setup, attr);
355 }
356 
op_fill_z_buffer_triangle(const uint32_t * words)357 void CommandProcessor::op_fill_z_buffer_triangle(const uint32_t *words)
358 {
359 	TriangleSetup setup = {};
360 	AttributeSetup attr = {};
361 	decode_triangle_setup(setup, words);
362 	decode_z_setup(attr, words + 8);
363 	renderer.draw_shaded_primitive(setup, attr);
364 }
365 
op_texture_triangle(const uint32_t * words)366 void CommandProcessor::op_texture_triangle(const uint32_t *words)
367 {
368 	TriangleSetup setup = {};
369 	AttributeSetup attr = {};
370 	decode_triangle_setup(setup, words);
371 	decode_tex_setup(attr, words + 8);
372 	renderer.draw_shaded_primitive(setup, attr);
373 }
374 
op_texture_z_buffer_triangle(const uint32_t * words)375 void CommandProcessor::op_texture_z_buffer_triangle(const uint32_t *words)
376 {
377 	TriangleSetup setup = {};
378 	AttributeSetup attr = {};
379 	decode_triangle_setup(setup, words);
380 	decode_tex_setup(attr, words + 8);
381 	decode_z_setup(attr, words + 24);
382 	renderer.draw_shaded_primitive(setup, attr);
383 }
384 
op_shade_texture_triangle(const uint32_t * words)385 void CommandProcessor::op_shade_texture_triangle(const uint32_t *words)
386 {
387 	TriangleSetup setup = {};
388 	AttributeSetup attr = {};
389 	decode_triangle_setup(setup, words);
390 	decode_rgba_setup(attr, words + 8);
391 	decode_tex_setup(attr, words + 24);
392 	renderer.draw_shaded_primitive(setup, attr);
393 }
394 
op_set_color_image(const uint32_t * words)395 void CommandProcessor::op_set_color_image(const uint32_t *words)
396 {
397 	unsigned fmt = (words[0] >> 21) & 7;
398 	unsigned size = (words[0] >> 19) & 3;
399 	unsigned width = (words[0] & 1023) + 1;
400 	unsigned addr = words[1] & 0xffffff;
401 
402 	FBFormat fbfmt;
403 	switch (size)
404 	{
405 	case 0:
406 		fbfmt = FBFormat::I4;
407 		break;
408 
409 	case 1:
410 		fbfmt = FBFormat::I8;
411 		break;
412 
413 	case 2:
414 		fbfmt = fmt ? FBFormat::IA88 : FBFormat::RGBA5551;
415 		break;
416 
417 	case 3:
418 		fbfmt = FBFormat::RGBA8888;
419 		break;
420 
421 	default:
422 		LOGE("Invalid pixel size %u.\n", size);
423 		return;
424 	}
425 
426 	renderer.set_color_framebuffer(addr, width, fbfmt);
427 }
428 
op_set_mask_image(const uint32_t * words)429 void CommandProcessor::op_set_mask_image(const uint32_t *words)
430 {
431 	unsigned addr = words[1] & 0xffffff;
432 	renderer.set_depth_framebuffer(addr);
433 }
434 
// SET_SCISSOR: scissor box as four 12-bit fields, plus interlace control bits.
void CommandProcessor::op_set_scissor(const uint32_t *words)
{
	scissor_state.xlo = (words[0] >> 12) & 0xfff;
	scissor_state.xhi = (words[1] >> 12) & 0xfff;
	scissor_state.ylo = (words[0] >> 0) & 0xfff;
	scissor_state.yhi = (words[1] >> 0) & 0xfff;

	// Bits 25/24 of word 1 select field (interlaced) rendering and which
	// field to keep; these live in the rasterization state, so update both.
	STATE_MASK(static_state.flags, bool(words[1] & (1 << 25)), RASTERIZATION_INTERLACE_FIELD_BIT);
	STATE_MASK(static_state.flags, bool(words[1] & (1 << 24)), RASTERIZATION_INTERLACE_KEEP_ODD_BIT);
	renderer.set_scissor_state(scissor_state);
	renderer.set_static_rasterization_state(static_state);
}
447 
// SET_OTHER_MODES: unpack the two command words into the rasterization
// (static_state) and depth/blend (depth_blend) state blocks. Each STATE_MASK
// line maps one mode bit to its corresponding flag bit.
void CommandProcessor::op_set_other_modes(const uint32_t *words)
{
	// Word 0: sampling / LOD / TLUT / combiner-adjacent rasterization modes.
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 19)), RASTERIZATION_PERSPECTIVE_CORRECT_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 18)), RASTERIZATION_DETAIL_LOD_ENABLE_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 17)), RASTERIZATION_SHARPEN_LOD_ENABLE_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 16)), RASTERIZATION_TEX_LOD_ENABLE_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 15)), RASTERIZATION_TLUT_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 14)), RASTERIZATION_TLUT_TYPE_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 13)), RASTERIZATION_SAMPLE_MODE_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 12)), RASTERIZATION_SAMPLE_MID_TEXEL_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 11)), RASTERIZATION_BILERP_0_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 10)), RASTERIZATION_BILERP_1_BIT);
	STATE_MASK(static_state.flags, bool(words[0] & (1 << 9)), RASTERIZATION_CONVERT_ONE_BIT);
	// Word 1: blending, coverage and depth controls.
	STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 14)), DEPTH_BLEND_FORCE_BLEND_BIT);
	STATE_MASK(static_state.flags, bool(words[1] & (1 << 13)), RASTERIZATION_ALPHA_CVG_SELECT_BIT);
	STATE_MASK(static_state.flags, bool(words[1] & (1 << 12)), RASTERIZATION_CVG_TIMES_ALPHA_BIT);
	STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 7)), DEPTH_BLEND_COLOR_ON_COVERAGE_BIT);
	STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 6)), DEPTH_BLEND_IMAGE_READ_ENABLE_BIT);
	STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 5)), DEPTH_BLEND_DEPTH_UPDATE_BIT);
	STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 4)), DEPTH_BLEND_DEPTH_TEST_BIT);
	// The AA bit feeds both state blocks.
	STATE_MASK(static_state.flags, bool(words[1] & (1 << 3)), RASTERIZATION_AA_BIT);
	STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 3)), DEPTH_BLEND_AA_BIT);

	STATE_MASK(static_state.flags, bool(words[1] & (1 << 1)), RASTERIZATION_ALPHA_TEST_DITHER_BIT);
	STATE_MASK(static_state.flags, bool(words[1] & (1 << 0)), RASTERIZATION_ALPHA_TEST_BIT);
	// 4-bit dither field: upper 2 bits are the RGB dither mode, and dithering
	// being enabled at all also matters to the depth/blend pass.
	static_state.dither = (words[0] >> 4) & 0x0f;
	STATE_MASK(depth_blend.flags, RGBDitherMode(static_state.dither >> 2) != RGBDitherMode::Off, DEPTH_BLEND_DITHER_ENABLE_BIT);
	depth_blend.coverage_mode = static_cast<CoverageMode>((words[1] >> 8) & 3);
	depth_blend.z_mode = static_cast<ZMode>((words[1] >> 10) & 3);

	// Cycle type (word 0, bits 20-21) is exclusive: clear all three
	// rasterization mode bits before setting the active one.
	static_state.flags &= ~(RASTERIZATION_MULTI_CYCLE_BIT |
	                        RASTERIZATION_FILL_BIT |
	                        RASTERIZATION_COPY_BIT);
	depth_blend.flags &= ~DEPTH_BLEND_MULTI_CYCLE_BIT;

	switch (CycleType((words[0] >> 20) & 3))
	{
	case CycleType::Cycle2:
		static_state.flags |= RASTERIZATION_MULTI_CYCLE_BIT;
		depth_blend.flags |= DEPTH_BLEND_MULTI_CYCLE_BIT;
		break;

	case CycleType::Fill:
		static_state.flags |= RASTERIZATION_FILL_BIT;
		break;

	case CycleType::Copy:
		static_state.flags |= RASTERIZATION_COPY_BIT;
		break;

	default:
		break;
	}

	// Blender mux selects for both cycles (word 1, bits 16-31).
	depth_blend.blend_cycles[0].blend_1a = static_cast<BlendMode1A>((words[1] >> 30) & 3);
	depth_blend.blend_cycles[1].blend_1a = static_cast<BlendMode1A>((words[1] >> 28) & 3);
	depth_blend.blend_cycles[0].blend_1b = static_cast<BlendMode1B>((words[1] >> 26) & 3);
	depth_blend.blend_cycles[1].blend_1b = static_cast<BlendMode1B>((words[1] >> 24) & 3);
	depth_blend.blend_cycles[0].blend_2a = static_cast<BlendMode2A>((words[1] >> 22) & 3);
	depth_blend.blend_cycles[1].blend_2a = static_cast<BlendMode2A>((words[1] >> 20) & 3);
	depth_blend.blend_cycles[0].blend_2b = static_cast<BlendMode2B>((words[1] >> 18) & 3);
	depth_blend.blend_cycles[1].blend_2b = static_cast<BlendMode2B>((words[1] >> 16) & 3);

	renderer.set_static_rasterization_state(static_state);
	renderer.set_depth_blend_state(depth_blend);
	renderer.set_enable_primitive_depth(bool(words[1] & (1 << 2)));
}
515 
op_set_texture_image(const uint32_t * words)516 void CommandProcessor::op_set_texture_image(const uint32_t *words)
517 {
518 	auto fmt = TextureFormat((words[0] >> 21) & 7);
519 	auto size = TextureSize((words[0] >> 19) & 3);
520 	uint32_t width = (words[0] & 0x3ff) + 1;
521 	uint32_t addr = words[1] & 0x00ffffffu;
522 
523 	texture_image.addr = addr;
524 	texture_image.width = width;
525 	texture_image.size = size;
526 	texture_image.fmt = fmt;
527 }
528 
// SET_TILE: configure one of the 8 tile descriptors — TMEM placement,
// format, palette, and per-axis clamp/mirror/mask/shift controls.
void CommandProcessor::op_set_tile(const uint32_t *words)
{
	uint32_t tile = (words[1] >> 24) & 7;

	TileMeta info = {};
	// TMEM offset and stride are stored in 8-byte units; scale to bytes.
	info.offset = ((words[0] >> 0) & 511) << 3;
	info.stride = ((words[0] >> 9) & 511) << 3;
	info.size = TextureSize((words[0] >> 19) & 3);
	info.fmt = TextureFormat((words[0] >> 21) & 7);

	info.palette = (words[1] >> 20) & 15;

	info.shift_s = (words[1] >> 0) & 15;
	info.mask_s = (words[1] >> 4) & 15;
	info.shift_t = (words[1] >> 10) & 15;
	info.mask_t = (words[1] >> 14) & 15;

	if (words[1] & (1 << 8))
		info.flags |= TILE_INFO_MIRROR_S_BIT;
	if (words[1] & (1 << 9))
		info.flags |= TILE_INFO_CLAMP_S_BIT;
	if (words[1] & (1 << 18))
		info.flags |= TILE_INFO_MIRROR_T_BIT;
	if (words[1] & (1 << 19))
		info.flags |= TILE_INFO_CLAMP_T_BIT;

	// Masks saturate at 10 bits; a zero mask forces clamping on that axis.
	if (info.mask_s > 10)
		info.mask_s = 10;
	else if (info.mask_s == 0)
		info.flags |= TILE_INFO_CLAMP_S_BIT;

	if (info.mask_t > 10)
		info.mask_t = 10;
	else if (info.mask_t == 0)
		info.flags |= TILE_INFO_CLAMP_T_BIT;

	renderer.set_tile(tile, info);
}
567 
op_load_tile(const uint32_t * words)568 void CommandProcessor::op_load_tile(const uint32_t *words)
569 {
570 	uint32_t tile = (words[1] >> 24) & 7;
571 
572 	LoadTileInfo info = {};
573 
574 	info.tex_addr = texture_image.addr;
575 	info.tex_width = texture_image.width;
576 	info.fmt = texture_image.fmt;
577 	info.size = texture_image.size;
578 	info.slo = (words[0] >> 12) & 0xfff;
579 	info.shi = (words[1] >> 12) & 0xfff;
580 	info.tlo = (words[0] >> 0) & 0xfff;
581 	info.thi = (words[1] >> 0) & 0xfff;
582 	info.mode = UploadMode::Tile;
583 
584 	renderer.load_tile(tile, info);
585 }
586 
op_load_tlut(const uint32_t * words)587 void CommandProcessor::op_load_tlut(const uint32_t *words)
588 {
589 	uint32_t tile = (words[1] >> 24) & 7;
590 
591 	LoadTileInfo info = {};
592 
593 	info.tex_addr = texture_image.addr;
594 	info.tex_width = texture_image.width;
595 	info.fmt = texture_image.fmt;
596 	info.size = texture_image.size;
597 	info.slo = (words[0] >> 12) & 0xfff;
598 	info.shi = (words[1] >> 12) & 0xfff;
599 	info.tlo = (words[0] >> 0) & 0xfff;
600 	info.thi = (words[1] >> 0) & 0xfff;
601 	info.mode = UploadMode::TLUT;
602 
603 	renderer.load_tile(tile, info);
604 }
605 
op_load_block(const uint32_t * words)606 void CommandProcessor::op_load_block(const uint32_t *words)
607 {
608 	uint32_t tile = (words[1] >> 24) & 7;
609 
610 	LoadTileInfo info = {};
611 
612 	info.tex_addr = texture_image.addr;
613 	info.tex_width = texture_image.width;
614 	info.fmt = texture_image.fmt;
615 	info.size = texture_image.size;
616 	info.slo = (words[0] >> 12) & 0xfff;
617 	info.shi = (words[1] >> 12) & 0xfff;
618 	info.tlo = (words[0] >> 0) & 0xfff;
619 	info.thi = (words[1] >> 0) & 0xfff;
620 	info.mode = UploadMode::Block;
621 
622 	renderer.load_tile(tile, info);
623 }
624 
op_set_tile_size(const uint32_t * words)625 void CommandProcessor::op_set_tile_size(const uint32_t *words)
626 {
627 	uint32_t tile = (words[1] >> 24) & 7;
628 	auto slo = (words[0] >> 12) & 0xfff;
629 	auto shi = (words[1] >> 12) & 0xfff;
630 	auto tlo = (words[0] >> 0) & 0xfff;
631 	auto thi = (words[1] >> 0) & 0xfff;
632 	renderer.set_tile_size(tile, slo, shi, tlo, thi);
633 }
634 
// SET_COMBINE: unpack the color-combiner mux selects for both cycles.
// Cycle 0 inputs mostly come from word 0, cycle 1 inputs from word 1.
void CommandProcessor::op_set_combine(const uint32_t *words)
{
	// Cycle 0 RGB: (muladd - mul) * mulsub + add.
	static_state.combiner[0].rgb.muladd = static_cast<RGBMulAdd>((words[0] >> 20) & 0xf);
	static_state.combiner[0].rgb.mul = static_cast<RGBMul>((words[0] >> 15) & 0x1f);
	static_state.combiner[0].rgb.mulsub = static_cast<RGBMulSub>((words[1] >> 28) & 0xf);
	// Note: >> binds tighter than &, so this parses as ((words[1] >> 15) & 0x7).
	static_state.combiner[0].rgb.add = static_cast<RGBAdd>(words[1] >> 15 & 0x7);

	// Cycle 0 alpha.
	static_state.combiner[0].alpha.muladd = static_cast<AlphaAddSub>((words[0] >> 12) & 0x7);
	static_state.combiner[0].alpha.mulsub = static_cast<AlphaAddSub>((words[1] >> 12) & 0x7);
	static_state.combiner[0].alpha.mul = static_cast<AlphaMul>((words[0] >> 9) & 0x7);
	static_state.combiner[0].alpha.add = static_cast<AlphaAddSub>((words[1] >> 9) & 0x7);

	// Cycle 1 RGB.
	static_state.combiner[1].rgb.muladd = static_cast<RGBMulAdd>((words[0] >> 5) & 0xf);
	static_state.combiner[1].rgb.mul = static_cast<RGBMul>((words[0] >> 0) & 0x1f);
	static_state.combiner[1].rgb.mulsub = static_cast<RGBMulSub>((words[1] >> 24) & 0xf);
	static_state.combiner[1].rgb.add = static_cast<RGBAdd>(words[1] >> 6 & 0x7);

	// Cycle 1 alpha.
	static_state.combiner[1].alpha.muladd = static_cast<AlphaAddSub>((words[1] >> 21) & 0x7);
	static_state.combiner[1].alpha.mulsub = static_cast<AlphaAddSub>((words[1] >> 3) & 0x7);
	static_state.combiner[1].alpha.mul = static_cast<AlphaMul>((words[1] >> 18) & 0x7);
	static_state.combiner[1].alpha.add = static_cast<AlphaAddSub>((words[1] >> 0) & 0x7);

	renderer.set_static_rasterization_state(static_state);
}
659 
void CommandProcessor::op_set_blend_color(const uint32_t *words)
{
	// SET_BLEND_COLOR: packed RGBA8888 in word 1.
	renderer.set_blend_color(words[1]);
}
664 
void CommandProcessor::op_set_env_color(const uint32_t *words)
{
	// SET_ENV_COLOR: packed RGBA8888 in word 1.
	renderer.set_env_color(words[1]);
}
669 
void CommandProcessor::op_set_fog_color(const uint32_t *words)
{
	// SET_FOG_COLOR: packed RGBA8888 in word 1.
	renderer.set_fog_color(words[1]);
}
674 
op_set_prim_color(const uint32_t * words)675 void CommandProcessor::op_set_prim_color(const uint32_t *words)
676 {
677 	uint8_t prim_min_level = (words[0] >> 8) & 31;
678 	uint8_t prim_level_frac = (words[0] >> 0) & 0xff;
679 	renderer.set_primitive_color(prim_min_level, prim_level_frac, words[1]);
680 }
681 
void CommandProcessor::op_set_fill_color(const uint32_t *words)
{
	// SET_FILL_COLOR: raw 32-bit fill pattern in word 1.
	renderer.set_fill_color(words[1]);
}
686 
// FILL_RECTANGLE: drawn as a flipped, flat "triangle" whose left and right
// edges are vertical (xl == xm), covering the rectangle exactly.
void CommandProcessor::op_fill_rectangle(const uint32_t *words)
{
	// Coordinates are 12-bit fixed-point fields (10.2).
	uint32_t xl = (words[0] >> 12) & 0xfff;
	uint32_t yl = (words[0] >> 0) & 0xfff;
	uint32_t xh = (words[1] >> 12) & 0xfff;
	uint32_t yh = (words[1] >> 0) & 0xfff;

	// In FILL/COPY cycle modes, YL is rounded up to the last sub-scanline
	// (OR with 3) so the bottom row is fully covered.
	if ((static_state.flags & (RASTERIZATION_COPY_BIT | RASTERIZATION_FILL_BIT)) != 0)
		yl |= 3;

	TriangleSetup setup = {};
	// Shift by 13: 12 bits to the edge-walker fixed-point position plus the
	// extra subpixel bit used by decode_triangle_setup.
	setup.xh = xh << 13;
	setup.xl = xl << 13;
	setup.xm = xl << 13;
	setup.ym = yl;
	setup.yl = yl;
	setup.yh = yh;
	setup.flags = TRIANGLE_SETUP_FLIP_BIT | TRIANGLE_SETUP_DISABLE_UPSCALING_BIT;

	renderer.draw_flat_primitive(setup);
}
708 
// TEXTURE_RECTANGLE: a textured rectangle drawn as a flipped flat-edged
// triangle; S steps with X and T steps with Y.
void CommandProcessor::op_texture_rectangle(const uint32_t *words)
{
	uint32_t xl = (words[0] >> 12) & 0xfff;
	uint32_t yl = (words[0] >> 0) & 0xfff;
	uint32_t xh = (words[1] >> 12) & 0xfff;
	uint32_t yh = (words[1] >> 0) & 0xfff;
	uint32_t tile = (words[1] >> 24) & 0x7;

	// S/T start coordinates and per-pixel deltas; deltas are signed 16-bit.
	int32_t s = (words[2] >> 16) & 0xffff;
	int32_t t = (words[2] >> 0) & 0xffff;
	int32_t dsdx = (words[3] >> 16) & 0xffff;
	int32_t dtdy = (words[3] >> 0) & 0xffff;
	dsdx = sext<16>(dsdx);
	dtdy = sext<16>(dtdy);

	// FILL/COPY modes extend YL to the last sub-scanline.
	if ((static_state.flags & (RASTERIZATION_COPY_BIT | RASTERIZATION_FILL_BIT)) != 0)
		yl |= 3;

	TriangleSetup setup = {};
	AttributeSetup attr = {};

	setup.xh = xh << 13;
	setup.xl = xl << 13;
	setup.xm = xl << 13;
	setup.ym = yl;
	setup.yl = yl;
	setup.yh = yh;
	// Upscaling/native-LOD behavior is controlled by per-game quirks.
	setup.flags = TRIANGLE_SETUP_FLIP_BIT |
	              (quirks.u.options.native_resolution_tex_rect ? TRIANGLE_SETUP_DISABLE_UPSCALING_BIT : 0) |
	              (quirks.u.options.native_texture_lod ? TRIANGLE_SETUP_NATIVE_LOD_BIT : 0);
	setup.tile = tile;

	// Scale S/T and the deltas into the attribute fixed-point format.
	attr.s = s << 16;
	attr.t = t << 16;
	attr.dsdx = dsdx << 11;
	// T advances per scanline, so the same delta feeds both dtde and dtdy.
	attr.dtde = dtdy << 11;
	attr.dtdy = dtdy << 11;

	// COPY mode ignores the X fraction.
	if ((static_state.flags & RASTERIZATION_COPY_BIT) != 0)
		setup.flags |= TRIANGLE_SETUP_SKIP_XFRAC_BIT;

	renderer.draw_shaded_primitive(setup, attr);
}
752 
// TEXTURE_RECTANGLE_FLIP: like TEXTURE_RECTANGLE but with S/T swapped —
// T steps with X and S steps with Y. Always drawn at native resolution.
void CommandProcessor::op_texture_rectangle_flip(const uint32_t *words)
{
	uint32_t xl = (words[0] >> 12) & 0xfff;
	uint32_t yl = (words[0] >> 0) & 0xfff;
	uint32_t xh = (words[1] >> 12) & 0xfff;
	uint32_t yh = (words[1] >> 0) & 0xfff;
	uint32_t tile = (words[1] >> 24) & 0x7;

	int32_t s = (words[2] >> 16) & 0xffff;
	int32_t t = (words[2] >> 0) & 0xffff;
	int32_t dsdx = (words[3] >> 16) & 0xffff;
	int32_t dtdy = (words[3] >> 0) & 0xffff;
	dsdx = sext<16>(dsdx);
	dtdy = sext<16>(dtdy);

	// FILL/COPY modes extend YL to the last sub-scanline.
	if ((static_state.flags & (RASTERIZATION_COPY_BIT | RASTERIZATION_FILL_BIT)) != 0)
		yl |= 3;

	TriangleSetup setup = {};
	AttributeSetup attr = {};

	setup.xh = xh << 13;
	setup.xl = xl << 13;
	setup.xm = xl << 13;
	setup.ym = yl;
	setup.yl = yl;
	setup.yh = yh;
	// Upscaling is disabled unconditionally here (quirk bit is redundant but kept).
	setup.flags = TRIANGLE_SETUP_FLIP_BIT | TRIANGLE_SETUP_DISABLE_UPSCALING_BIT |
	              (quirks.u.options.native_resolution_tex_rect ? TRIANGLE_SETUP_DISABLE_UPSCALING_BIT : 0) |
	              (quirks.u.options.native_texture_lod ? TRIANGLE_SETUP_NATIVE_LOD_BIT : 0);
	setup.tile = tile;

	attr.s = s << 16;
	attr.t = t << 16;
	// Flipped mapping: T varies along X, S varies along Y (per edge and line).
	attr.dtdx = dtdy << 11;
	attr.dsde = dsdx << 11;
	attr.dsdy = dsdx << 11;

	// COPY mode ignores the X fraction.
	if ((static_state.flags & RASTERIZATION_COPY_BIT) != 0)
		setup.flags |= TRIANGLE_SETUP_SKIP_XFRAC_BIT;

	renderer.draw_shaded_primitive(setup, attr);
}
796 
void CommandProcessor::op_set_prim_depth(const uint32_t *words)
{
	// SET_PRIM_DEPTH: primitive Z in the upper half of word 1, delta-Z in the lower half.
	renderer.set_primitive_depth((words[1] >> 16) & 0xffff, words[1] & 0xffff);
}
801 
op_set_convert(const uint32_t * words)802 void CommandProcessor::op_set_convert(const uint32_t *words)
803 {
804 	uint64_t merged = (uint64_t(words[0]) << 32) | words[1];
805 
806 	uint16_t k5 = (merged >> 0) & 0x1ff;
807 	uint16_t k4 = (merged >> 9) & 0x1ff;
808 	uint16_t k3 = (merged >> 18) & 0x1ff;
809 	uint16_t k2 = (merged >> 27) & 0x1ff;
810 	uint16_t k1 = (merged >> 36) & 0x1ff;
811 	uint16_t k0 = (merged >> 45) & 0x1ff;
812 	renderer.set_convert(k0, k1, k2, k3, k4, k5);
813 }
814 
op_set_key_gb(const uint32_t * words)815 void CommandProcessor::op_set_key_gb(const uint32_t *words)
816 {
817 	uint32_t g_width = (words[0] >> 12) & 0xfff;
818 	uint32_t b_width = (words[0] >> 0) & 0xfff;
819 	uint32_t g_center = (words[1] >> 24) & 0xff;
820 	uint32_t g_scale = (words[1] >> 16) & 0xff;
821 	uint32_t b_center = (words[1] >> 8) & 0xff;
822 	uint32_t b_scale = (words[1] >> 0) & 0xff;
823 	renderer.set_color_key(1, g_width, g_center, g_scale);
824 	renderer.set_color_key(2, b_width, b_center, b_scale);
825 }
826 
op_set_key_r(const uint32_t * words)827 void CommandProcessor::op_set_key_r(const uint32_t *words)
828 {
829 	uint32_t r_width = (words[1] >> 16) & 0xfff;
830 	uint32_t r_center = (words[1] >> 8) & 0xff;
831 	uint32_t r_scale = (words[1] >> 0) & 0xff;
832 	renderer.set_color_key(0, r_width, r_center, r_scale);
833 }
834 
// Sync commands are implemented as no-ops here: presumably the renderer's
// internal batching already provides the required ordering guarantees, so
// explicit pipeline/load/tile syncs have nothing to do — TODO confirm.
#define OP(x) void CommandProcessor::op_##x(const uint32_t *) {}
OP(sync_load) OP(sync_pipe)
OP(sync_tile)
#undef OP
839 
840 void CommandProcessor::enqueue_command(unsigned num_words, const uint32_t *words)
841 {
842 	if (single_threaded_processing)
843 		enqueue_command_direct(num_words, words);
844 	else
845 		ring.enqueue_command(num_words, words);
846 }
847 
// Decodes and executes one command on the calling thread. The word-count
// parameter is unused: each opcode implies its own fixed length.
void CommandProcessor::enqueue_command_direct(unsigned, const uint32_t *words)
{
	// Member-function dispatch table indexed by the 6-bit RDP opcode.
	// nullptr entries are unused/reserved encodings and are ignored.
#define OP(x) &CommandProcessor::op_##x
	using CommandFunc = void (CommandProcessor::*)(const uint32_t *words);
	static const CommandFunc funcs[64] = {
		/* 0x00 */ nullptr, nullptr, nullptr, nullptr,
		/* 0x04 */ nullptr, nullptr, nullptr, nullptr,
		/* 0x08 */ OP(fill_triangle), OP(fill_z_buffer_triangle), OP(texture_triangle), OP(texture_z_buffer_triangle),
		/* 0x0c */ OP(shade_triangle), OP(shade_z_buffer_triangle), OP(shade_texture_triangle), OP(shade_texture_z_buffer_triangle),
		/* 0x10 */ nullptr, nullptr, nullptr, nullptr,
		/* 0x14 */ nullptr, nullptr, nullptr, nullptr,
		/* 0x18 */ nullptr, nullptr, nullptr, nullptr,
		/* 0x1c */ nullptr, nullptr, nullptr, nullptr,
		/* 0x20 */ nullptr, nullptr, nullptr, nullptr,
		/* 0x24 */ OP(texture_rectangle), OP(texture_rectangle_flip), OP(sync_load), OP(sync_pipe),
		/* 0x28 */ OP(sync_tile), OP(sync_full), OP(set_key_gb), OP(set_key_r),
		/* 0x2c */ OP(set_convert), OP(set_scissor), OP(set_prim_depth), OP(set_other_modes),
		/* 0x30 */ OP(load_tlut), nullptr, OP(set_tile_size), OP(load_block),
		/* 0x34 */ OP(load_tile), OP(set_tile), OP(fill_rectangle), OP(set_fill_color),
		/* 0x38 */ OP(set_fog_color), OP(set_blend_color), OP(set_prim_color), OP(set_env_color),
		/* 0x3c */ OP(set_combine), OP(set_texture_image), OP(set_mask_image), OP(set_color_image),
	};
#undef OP

	// Opcode lives in bits [29:24] of the first command word.
	unsigned op = (words[0] >> 24) & 63;
	switch (Op(op))
	{
	// The Meta* cases below are internal commands injected by this
	// implementation; they are not part of the hardware RDP command set.
	case Op::MetaSignalTimeline:
	{
		renderer.flush_and_signal();
		// 64-bit timeline value split across two command words (low, then high).
		uint64_t val = words[1] | (uint64_t(words[2]) << 32);
		CoherencyOperation signal_op;
		signal_op.timeline_value = val;
		timeline_worker.push(std::move(signal_op));
		break;
	}

	case Op::MetaFlush:
	{
		renderer.flush_and_signal();
		break;
	}

	case Op::MetaIdle:
	{
		renderer.notify_idle_command_thread();
		break;
	}

	case Op::MetaSetQuirks:
	{
		quirks.u.words[0] = words[1];
		break;
	}

	default:
		// Regular RDP opcode: dispatch through the table if implemented.
		if (funcs[op])
			(this->*funcs[op])(words);
		break;
	}
}
909 
set_quirks(const Quirks & quirks_)910 void CommandProcessor::set_quirks(const Quirks &quirks_)
911 {
912 	const uint32_t words[2] = {
913 		uint32_t(Op::MetaSetQuirks) << 24u,
914 		quirks_.u.words[0],
915 	};
916 	enqueue_command(2, words);
917 }
918 
// Forwards a VI register write to the VI unit.
void CommandProcessor::set_vi_register(VIRegister reg, uint32_t value)
{
	vi.set_vi_register(reg, value);
}
923 
begin_read_rdram()924 void *CommandProcessor::begin_read_rdram()
925 {
926 	if (rdram)
927 		return device.map_host_buffer(*rdram, MEMORY_ACCESS_READ_BIT);
928 	else
929 		return nullptr;
930 }
931 
end_write_rdram()932 void CommandProcessor::end_write_rdram()
933 {
934 	if (rdram)
935 		device.unmap_host_buffer(*rdram, MEMORY_ACCESS_WRITE_BIT);
936 }
937 
// Maps the hidden RDRAM buffer (9th-bit storage) for CPU reads.
void *CommandProcessor::begin_read_hidden_rdram()
{
	return device.map_host_buffer(*hidden_rdram, MEMORY_ACCESS_READ_BIT);
}
942 
// Flushes CPU writes to the hidden RDRAM buffer back to the device.
void CommandProcessor::end_write_hidden_rdram()
{
	device.unmap_host_buffer(*hidden_rdram, MEMORY_ACCESS_WRITE_BIT);
}
947 
get_rdram_size() const948 size_t CommandProcessor::get_rdram_size() const
949 {
950 	if (is_host_coherent)
951 		return rdram->get_create_info().size;
952 	else
953 		return rdram->get_create_info().size / 2;
954 }
955 
// Size in bytes of the hidden RDRAM buffer.
size_t CommandProcessor::get_hidden_rdram_size() const
{
	return hidden_rdram->get_create_info().size;
}
960 
// Maps the TMEM buffer for CPU read access and returns the pointer.
void *CommandProcessor::get_tmem()
{
	return device.map_host_buffer(*tmem, MEMORY_ACCESS_READ_BIT);
}
965 
// Fully drains the pipeline: flushes all pending work, then blocks until
// the timeline reaches the newly signalled value.
void CommandProcessor::idle()
{
	flush();
	wait_for_timeline(signal_timeline());
}
971 
flush()972 void CommandProcessor::flush()
973 {
974 	const uint32_t words[1] = {
975 		uint32_t(Op::MetaFlush) << 24,
976 	};
977 	enqueue_command(1, words);
978 }
979 
signal_timeline()980 uint64_t CommandProcessor::signal_timeline()
981 {
982 	timeline_value++;
983 
984 	const uint32_t words[3] = {
985 		uint32_t(Op::MetaSignalTimeline) << 24,
986 		uint32_t(timeline_value),
987 		uint32_t(timeline_value >> 32),
988 	};
989 	enqueue_command(3, words);
990 
991 	return timeline_value;
992 }
993 
wait_for_timeline(uint64_t index)994 void CommandProcessor::wait_for_timeline(uint64_t index)
995 {
996 	Vulkan::QueryPoolHandle start_ts, end_ts;
997 	if (measure_stall_time)
998 		start_ts = device.write_calibrated_timestamp();
999 	timeline_worker.wait([this, index]() -> bool {
1000 		return thread_timeline_value >= index;
1001 	});
1002 	if (measure_stall_time)
1003 	{
1004 		end_ts = device.write_calibrated_timestamp();
1005 		device.register_time_interval("RDP CPU", std::move(start_ts), std::move(end_ts), "wait-for-timeline");
1006 	}
1007 }
1008 
scanout(const ScanoutOptions & opts)1009 Vulkan::ImageHandle CommandProcessor::scanout(const ScanoutOptions &opts)
1010 {
1011 	Vulkan::QueryPoolHandle start_ts, end_ts;
1012 	drain_command_ring();
1013 
1014 	// Block idle callbacks triggering while we're doing this.
1015 	renderer.lock_command_processing();
1016 	{
1017 		renderer.flush_and_signal();
1018 		if (!is_host_coherent)
1019 		{
1020 			unsigned offset, length;
1021 			vi.scanout_memory_range(offset, length);
1022 			renderer.resolve_coherency_external(offset, length);
1023 		}
1024 	}
1025 	renderer.unlock_command_processing();
1026 
1027 	auto scanout = vi.scanout(VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, opts, renderer.get_scaling_factor());
1028 	return scanout;
1029 }
1030 
drain_command_ring()1031 void CommandProcessor::drain_command_ring()
1032 {
1033 	Vulkan::QueryPoolHandle start_ts, end_ts;
1034 	if (timestamp)
1035 		start_ts = device.write_calibrated_timestamp();
1036 	ring.drain();
1037 	if (timestamp)
1038 	{
1039 		end_ts = device.write_calibrated_timestamp();
1040 		device.register_time_interval("RDP CPU", std::move(start_ts), std::move(end_ts), "drain-command-ring");
1041 	}
1042 }
1043 
// Synchronously scans out the current frame into a CPU-side RGBA buffer,
// always downscaled to native 1x resolution. On failure (nothing to scan
// out), width/height are set to 0 and colors is cleared.
void CommandProcessor::scanout_sync(std::vector<RGBA> &colors, unsigned &width, unsigned &height)
{
	drain_command_ring();
	renderer.flush_and_signal();

	if (!is_host_coherent)
	{
		// Explicitly resolve the VI scanout range when RDRAM is not
		// host-coherent, mirroring scanout().
		unsigned offset, length;
		vi.scanout_memory_range(offset, length);
		renderer.resolve_coherency_external(offset, length);
	}

	ScanoutOptions opts = {};
	// Downscale down to 1x, always.
	opts.downscale_steps = 32;
	opts.blend_previous_frame = true;
	opts.upscale_deinterlacing = false;

	// TRANSFER_SRC layout since we immediately copy the image to a buffer.
	auto handle = vi.scanout(VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, opts, renderer.get_scaling_factor());

	if (!handle)
	{
		width = 0;
		height = 0;
		colors.clear();
		return;
	}

	width = handle->get_width();
	height = handle->get_height();

	// Read the image back through a host-cached staging buffer.
	Vulkan::BufferCreateInfo info = {};
	info.size = width * height * sizeof(uint32_t);
	info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
	info.domain = Vulkan::BufferDomain::CachedHost;
	auto readback = device.create_buffer(info);

	auto cmd = device.request_command_buffer();
	cmd->copy_image_to_buffer(*readback, *handle, 0, {}, { width, height, 1 }, 0, 0, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 });
	// Make the transfer visible to host reads before we map the buffer.
	cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
	             VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT);

	// Block on the CPU until the copy completes, then copy out the pixels.
	Vulkan::Fence fence;
	device.submit(cmd, &fence);
	fence->wait();

	colors.resize(width * height);
	memcpy(colors.data(), device.map_host_buffer(*readback, Vulkan::MEMORY_ACCESS_READ_BIT),
	       width * height * sizeof(uint32_t));
	device.unmap_host_buffer(*readback, Vulkan::MEMORY_ACCESS_READ_BIT);
}
1095 
// Called with the worker lock held when new work is queued; publishes the
// operation's timeline value (if any) so waiters can observe progress.
void CommandProcessor::FenceExecutor::notify_work_locked(const CoherencyOperation &work)
{
	if (work.timeline_value)
		*value = work.timeline_value;
}
1101 
// A default-constructed operation (no fence, no timeline bump) serves as
// the shutdown sentinel for the worker thread.
bool CommandProcessor::FenceExecutor::is_sentinel(const CoherencyOperation &work) const
{
	return !work.fence && !work.timeline_value;
}
1106 
// Copies bytes from data_src into dst wherever the corresponding byte of
// masked_src is enabled. NOTE(review): the SSE2 path consumes 16 bytes per
// iteration and tests the high bit of each mask byte, while the scalar path
// consumes 4 bytes per iteration and tests for non-zero; callers appear to
// pass sizes that are multiples of 16 and mask bytes of 0x00/0xff, where
// both paths behave identically — confirm at call sites.
static void masked_memcpy(uint8_t * __restrict dst,
                          const uint8_t * __restrict data_src,
                          const uint8_t * __restrict masked_src,
                          size_t size)
{
#if defined(__SSE2__)
	for (size_t offset = 0; offset < size; offset += 16)
	{
		__m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data_src + offset));
		__m128i m = _mm_loadu_si128(reinterpret_cast<const __m128i *>(masked_src + offset));
		// Non-temporal byte-masked store.
		_mm_maskmoveu_si128(v, m, reinterpret_cast<char *>(dst + offset));
	}
#else
	auto * __restrict src_words = reinterpret_cast<const uint32_t *>(data_src);
	auto * __restrict mask_words = reinterpret_cast<const uint32_t *>(masked_src);
	auto * __restrict dst_words = reinterpret_cast<uint32_t *>(dst);
	size_t num_words = size >> 2;

	for (size_t w = 0; w < num_words; w++)
	{
		uint32_t m = mask_words[w];
		if (m == ~0u)
		{
			// Common case: the whole word is enabled.
			dst_words[w] = src_words[w];
		}
		else if (m != 0)
		{
			// Rare partially-enabled word: copy enabled bytes individually.
			for (unsigned b = 0; b < 4; b++)
				if (masked_src[4 * w + b])
					dst[4 * w + b] = data_src[4 * w + b];
		}
	}
#endif
}
1142 
// Executed on the coherency worker thread for each queued operation:
// waits for the GPU, then writes masked readback data into client RDRAM.
void CommandProcessor::FenceExecutor::perform_work(CoherencyOperation &work)
{
	// Wait for the GPU before touching any readback data.
	if (work.fence)
		work.fence->wait();

	// Release the cookie so the producer knows this operation is in flight.
	if (work.unlock_cookie)
		work.unlock_cookie->fetch_sub(1, std::memory_order_relaxed);

	if (work.src)
	{
		for (auto &copy : work.copies)
		{
			// Data bytes and their write-mask live in the same source buffer
			// at different offsets.
			auto *mapped_data = static_cast<uint8_t *>(device->map_host_buffer(*work.src, MEMORY_ACCESS_READ_BIT, copy.src_offset, copy.size));
			auto *mapped_mask = static_cast<uint8_t *>(device->map_host_buffer(*work.src, MEMORY_ACCESS_READ_BIT, copy.mask_offset, copy.size));
			masked_memcpy(work.dst + copy.dst_offset, mapped_data, mapped_mask, copy.size);
			// Release-decrement each pending-copy counter for this range.
			for (unsigned i = 0; i < copy.counters; i++)
			{
				unsigned val = copy.counter_base[i].fetch_sub(1, std::memory_order_release);
				(void)val;
				assert(val > 0);
			}
		}

#ifdef __SSE2__
		// masked_memcpy issues weakly-ordered non-temporal stores;
		// fence so they are globally visible.
		_mm_mfence();
#endif
	}
}
1171 
// Hands a coherency operation off to the timeline worker thread.
void CommandProcessor::enqueue_coherency_operation(CoherencyOperation &&op)
{
	timeline_worker.push(std::move(op));
}
1176 }
1177