/* Copyright (c) 2020 Themaister
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "rdp_device.hpp"
#include "rdp_common.hpp"
#include <chrono>

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#ifndef PARALLEL_RDP_SHADER_DIR
#include "shaders/slangmosh.hpp"
#endif

using namespace Vulkan;

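// Sets or clears the bits in `mask` within `flag` depending on `cond`.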
#define STATE_MASK(flag, cond, mask) do { \
    (flag) &= ~(mask); \
    if (cond) (flag) |= (mask); \
} while(0)

namespace RDP
{
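// RDRAM setup: when the frontend provides a host RDRAM pointer and the device supports
// VK_EXT_external_memory_host (and it is not disabled via PARALLEL_RDP_ALLOW_EXTERNAL_HOST),
// the host allocation is imported directly and stays host-coherent. Otherwise we fall back to a
// separate GPU-side copy at twice the size (payload plus per-byte writemask) and mark the
// instance as non-host-coherent, which requires explicit coherency resolves later on.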
CommandProcessor::CommandProcessor(Vulkan::Device &device_, void *rdram_ptr,
                                   size_t rdram_offset_, size_t rdram_size_, size_t hidden_rdram_size,
                                   CommandProcessorFlags flags_)
    : device(device_), rdram_offset(rdram_offset_), rdram_size(rdram_size_), flags(flags_), renderer(*this),
#ifdef PARALLEL_RDP_SHADER_DIR
      timeline_worker(Granite::Global::create_thread_context(), FenceExecutor{&device, &thread_timeline_value})
#else
      timeline_worker(FenceExecutor{&device, &thread_timeline_value})
#endif
{
    BufferCreateInfo info = {};
    info.size = rdram_size;
    info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
    info.domain = BufferDomain::CachedCoherentHostPreferCached;
    info.misc = BUFFER_MISC_ZERO_INITIALIZE_BIT;

    if (rdram_ptr)
    {
        bool allow_memory_host = true;
        if (const char *env = getenv("PARALLEL_RDP_ALLOW_EXTERNAL_HOST"))
            allow_memory_host = strtol(env, nullptr, 0) > 0;

        if (allow_memory_host && device.get_device_features().supports_external_memory_host)
        {
            size_t import_size = rdram_size + rdram_offset;
            size_t align = device.get_device_features().host_memory_properties.minImportedHostPointerAlignment;
            import_size = (import_size + align - 1) & ~(align - 1);
            info.size = import_size;
            rdram = device.create_imported_host_buffer(info, VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, rdram_ptr);
        }
        else
        {
            LOGW("VK_EXT_external_memory_host is not supported on this device. Falling back to a slower path.\n");
            is_host_coherent = false;
            rdram_offset = 0;
            host_rdram = static_cast<uint8_t *>(rdram_ptr) + rdram_offset_;

            BufferCreateInfo device_rdram = {};
            device_rdram.size = rdram_size * 2; // Need twice the memory amount so we can also store a writemask.
            device_rdram.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                                 VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
                                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;

            if (device.get_gpu_properties().deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
                device_rdram.domain = BufferDomain::CachedCoherentHostPreferCached;
            else
                device_rdram.domain = BufferDomain::Device;

            device_rdram.misc = BUFFER_MISC_ZERO_INITIALIZE_BIT;
            rdram = device.create_buffer(device_rdram);
        }
    }
    else
        rdram = device.create_buffer(info);

    if (!rdram)
        LOGE("Failed to allocate RDRAM.\n");

    info.size = hidden_rdram_size;
    // Should be CachedHost, but seeing some insane bug on incoherent Arm systems for the time being,
    // so just forcing coherent memory here for now. Not sure what is going on.
    info.domain = (flags & COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_HIDDEN_RDRAM_BIT) != 0 ?
                  BufferDomain::CachedCoherentHostPreferCoherent : BufferDomain::Device;
    info.misc = 0;
    hidden_rdram = device.create_buffer(info);

    info.size = 0x1000;
    info.domain = (flags & COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_TMEM_BIT) != 0 ?
                  BufferDomain::CachedCoherentHostPreferCoherent : BufferDomain::Device;
    tmem = device.create_buffer(info);

    clear_hidden_rdram();
    clear_tmem();
    init_renderer();

    if (const char *env = getenv("PARALLEL_RDP_BENCH"))
    {
        measure_stall_time = strtol(env, nullptr, 0) > 0;
        if (measure_stall_time)
            LOGI("Will measure stall timings.\n");
    }

    if (const char *env = getenv("PARALLEL_RDP_SINGLE_THREADED_COMMAND"))
    {
        single_threaded_processing = strtol(env, nullptr, 0) > 0;
        if (single_threaded_processing)
            LOGI("Will use single threaded command processing.\n");
    }

    if (!single_threaded_processing)
    {
        ring.init(
#ifdef PARALLEL_RDP_SHADER_DIR
            Granite::Global::create_thread_context(),
#endif
            this, 4 * 1024);
    }

    if (const char *env = getenv("PARALLEL_RDP_BENCH"))
        timestamp = strtol(env, nullptr, 0) > 0;
}

CommandProcessor::~CommandProcessor()
{
    idle();
}

void CommandProcessor::begin_frame_context()
{
    flush();
    drain_command_ring();
    device.next_frame_context();
}

void CommandProcessor::init_renderer()
{
    if (!rdram)
    {
        is_supported = false;
        return;
    }

    renderer.set_device(&device);
    renderer.set_rdram(rdram.get(), host_rdram, rdram_offset, rdram_size, is_host_coherent);
    renderer.set_hidden_rdram(hidden_rdram.get());
    renderer.set_tmem(tmem.get());

    unsigned factor = 1;
    if (flags & COMMAND_PROCESSOR_FLAG_UPSCALING_8X_BIT)
        factor = 8;
    else if (flags & COMMAND_PROCESSOR_FLAG_UPSCALING_4X_BIT)
        factor = 4;
    else if (flags & COMMAND_PROCESSOR_FLAG_UPSCALING_2X_BIT)
        factor = 2;

    if (factor != 1)
        LOGI("Enabling upscaling: %ux.\n", factor);

    RendererOptions opts;
    opts.upscaling_factor = factor;

    is_supported = renderer.init_renderer(opts);

    vi.set_device(&device);
    vi.set_rdram(rdram.get(), rdram_offset, rdram_size);
    vi.set_hidden_rdram(hidden_rdram.get());
    vi.set_renderer(&renderer);

#ifndef PARALLEL_RDP_SHADER_DIR
    shader_bank.reset(new ShaderBank(device, [&](const char *name, const char *define) -> int {
        if (strncmp(name, "vi_", 3) == 0)
            return vi.resolve_shader_define(name, define);
        else
            return renderer.resolve_shader_define(name, define);
    }));
    renderer.set_shader_bank(shader_bank.get());
    vi.set_shader_bank(shader_bank.get());
#endif
}

bool CommandProcessor::device_is_supported() const
{
    return is_supported;
}

void CommandProcessor::clear_hidden_rdram()
{
    clear_buffer(*hidden_rdram, 0x03030303);
}

void CommandProcessor::clear_tmem()
{
    clear_buffer(*tmem, 0);
}

void CommandProcessor::clear_buffer(Vulkan::Buffer &buffer, uint32_t value)
{
    if (!buffer.get_allocation().is_host_allocation())
    {
        auto cmd = device.request_command_buffer();
        cmd->fill_buffer(buffer, value);
        Fence fence;
        device.submit(cmd, &fence);
        fence->wait();
    }
    else
    {
        auto *mapped = device.map_host_buffer(buffer, MEMORY_ACCESS_WRITE_BIT);
        memset(mapped, value & 0xff, buffer.get_create_info().size);
        device.unmap_host_buffer(buffer, MEMORY_ACCESS_WRITE_BIT);
    }
}

void CommandProcessor::op_sync_full(const uint32_t *)
{
    renderer.flush_and_signal();
}

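// Decodes the eight-word edge-coefficient block shared by every triangle command:
// words 0-1 carry the flip bit, tile index and the YL/YM/YH scanline spans, while
// words 2-7 carry the XL/XH/XM edge positions and their per-scanline slopes.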
void CommandProcessor::decode_triangle_setup(TriangleSetup &setup, const uint32_t *words) const
{
    bool copy_cycle = (static_state.flags & RASTERIZATION_COPY_BIT) != 0;
    bool flip = (words[0] & 0x800000u) != 0;
    bool sign_dxhdy = (words[5] & 0x80000000u) != 0;
    bool do_offset = flip == sign_dxhdy;

    setup.flags |= flip ? TRIANGLE_SETUP_FLIP_BIT : 0;
    setup.flags |= do_offset ? TRIANGLE_SETUP_DO_OFFSET_BIT : 0;
    setup.flags |= copy_cycle ? TRIANGLE_SETUP_SKIP_XFRAC_BIT : 0;
    setup.flags |= quirks.u.options.native_texture_lod ? TRIANGLE_SETUP_NATIVE_LOD_BIT : 0;

    setup.tile = (words[0] >> 16) & 63;

    setup.yl = sext<14>(words[0]);
    setup.ym = sext<14>(words[1] >> 16);
    setup.yh = sext<14>(words[1]);

    // The lower bit is ignored, so shift here to obtain an extra bit of subpixel precision.
    // This is very useful for upscaling, since we can obtain 8x before we overflow instead of 4x.
    setup.xl = sext<28>(words[2]) >> 1;
    setup.xh = sext<28>(words[4]) >> 1;
    setup.xm = sext<28>(words[6]) >> 1;
    setup.dxldy = sext<28>(words[3] >> 2) >> 1;
    setup.dxhdy = sext<28>(words[5] >> 2) >> 1;
    setup.dxmdy = sext<28>(words[7] >> 2) >> 1;
}

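// Attribute coefficients arrive split: the integer halves live in the high 16 bits of
// words 0-3 / 8-11 and the fractional halves in words 4-7 / 12-15. The decoders below
// recombine them into 32-bit fixed-point values together with their d/dx, d/de and d/dy deltas.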
static void decode_tex_setup(AttributeSetup &attr, const uint32_t *words)
{
    attr.s = (words[0] & 0xffff0000u) | ((words[4] >> 16) & 0x0000ffffu);
    attr.t = ((words[0] << 16) & 0xffff0000u) | (words[4] & 0x0000ffffu);
    attr.w = (words[1] & 0xffff0000u) | ((words[5] >> 16) & 0x0000ffffu);

    attr.dsdx = (words[2] & 0xffff0000u) | ((words[6] >> 16) & 0x0000ffffu);
    attr.dtdx = ((words[2] << 16) & 0xffff0000u) | (words[6] & 0x0000ffffu);
    attr.dwdx = (words[3] & 0xffff0000u) | ((words[7] >> 16) & 0x0000ffffu);

    attr.dsde = (words[8] & 0xffff0000u) | ((words[12] >> 16) & 0x0000ffffu);
    attr.dtde = ((words[8] << 16) & 0xffff0000u) | (words[12] & 0x0000ffffu);
    attr.dwde = (words[9] & 0xffff0000u) | ((words[13] >> 16) & 0x0000ffffu);

    attr.dsdy = (words[10] & 0xffff0000u) | ((words[14] >> 16) & 0x0000ffffu);
    attr.dtdy = ((words[10] << 16) & 0xffff0000u) | (words[14] & 0x0000ffffu);
    attr.dwdy = (words[11] & 0xffff0000u) | ((words[15] >> 16) & 0x0000ffffu);
}

static void decode_rgba_setup(AttributeSetup &attr, const uint32_t *words)
{
    attr.r = (words[0] & 0xffff0000u) | ((words[4] >> 16) & 0xffff);
    attr.g = (words[0] << 16) | (words[4] & 0xffff);
    attr.b = (words[1] & 0xffff0000u) | ((words[5] >> 16) & 0xffff);
    attr.a = (words[1] << 16) | (words[5] & 0xffff);

    attr.drdx = (words[2] & 0xffff0000u) | ((words[6] >> 16) & 0xffff);
    attr.dgdx = (words[2] << 16) | (words[6] & 0xffff);
    attr.dbdx = (words[3] & 0xffff0000u) | ((words[7] >> 16) & 0xffff);
    attr.dadx = (words[3] << 16) | (words[7] & 0xffff);

    attr.drde = (words[8] & 0xffff0000u) | ((words[12] >> 16) & 0xffff);
    attr.dgde = (words[8] << 16) | (words[12] & 0xffff);
    attr.dbde = (words[9] & 0xffff0000u) | ((words[13] >> 16) & 0xffff);
    attr.dade = (words[9] << 16) | (words[13] & 0xffff);

    attr.drdy = (words[10] & 0xffff0000u) | ((words[14] >> 16) & 0xffff);
    attr.dgdy = (words[10] << 16) | (words[14] & 0xffff);
    attr.dbdy = (words[11] & 0xffff0000u) | ((words[15] >> 16) & 0xffff);
    attr.dady = (words[11] << 16) | (words[15] & 0xffff);
}

static void decode_z_setup(AttributeSetup &attr, const uint32_t *words)
{
    attr.z = words[0];
    attr.dzdx = words[1];
    attr.dzde = words[2];
    attr.dzdy = words[3];
}

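// The eight triangle opcodes below only differ in which attribute blocks follow the edge
// coefficients; when present, the blocks always appear in the order shade (RGBA), texture, Z,
// each picked up at the word offsets passed to the decoders.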
void CommandProcessor::op_fill_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    decode_triangle_setup(setup, words);
    renderer.draw_flat_primitive(setup);
}

void CommandProcessor::op_shade_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    AttributeSetup attr = {};
    decode_triangle_setup(setup, words);
    decode_rgba_setup(attr, words + 8);
    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_shade_z_buffer_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    AttributeSetup attr = {};
    decode_triangle_setup(setup, words);
    decode_rgba_setup(attr, words + 8);
    decode_z_setup(attr, words + 24);
    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_shade_texture_z_buffer_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    AttributeSetup attr = {};
    decode_triangle_setup(setup, words);
    decode_rgba_setup(attr, words + 8);
    decode_tex_setup(attr, words + 24);
    decode_z_setup(attr, words + 40);
    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_fill_z_buffer_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    AttributeSetup attr = {};
    decode_triangle_setup(setup, words);
    decode_z_setup(attr, words + 8);
    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_texture_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    AttributeSetup attr = {};
    decode_triangle_setup(setup, words);
    decode_tex_setup(attr, words + 8);
    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_texture_z_buffer_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    AttributeSetup attr = {};
    decode_triangle_setup(setup, words);
    decode_tex_setup(attr, words + 8);
    decode_z_setup(attr, words + 24);
    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_shade_texture_triangle(const uint32_t *words)
{
    TriangleSetup setup = {};
    AttributeSetup attr = {};
    decode_triangle_setup(setup, words);
    decode_rgba_setup(attr, words + 8);
    decode_tex_setup(attr, words + 24);
    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_set_color_image(const uint32_t *words)
{
    unsigned fmt = (words[0] >> 21) & 7;
    unsigned size = (words[0] >> 19) & 3;
    unsigned width = (words[0] & 1023) + 1;
    unsigned addr = words[1] & 0xffffff;

    FBFormat fbfmt;
    switch (size)
    {
    case 0:
        fbfmt = FBFormat::I4;
        break;

    case 1:
        fbfmt = FBFormat::I8;
        break;

    case 2:
        fbfmt = fmt ? FBFormat::IA88 : FBFormat::RGBA5551;
        break;

    case 3:
        fbfmt = FBFormat::RGBA8888;
        break;

    default:
        LOGE("Invalid pixel size %u.\n", size);
        return;
    }

    renderer.set_color_framebuffer(addr, width, fbfmt);
}

void CommandProcessor::op_set_mask_image(const uint32_t *words)
{
    unsigned addr = words[1] & 0xffffff;
    renderer.set_depth_framebuffer(addr);
}

void CommandProcessor::op_set_scissor(const uint32_t *words)
{
    scissor_state.xlo = (words[0] >> 12) & 0xfff;
    scissor_state.xhi = (words[1] >> 12) & 0xfff;
    scissor_state.ylo = (words[0] >> 0) & 0xfff;
    scissor_state.yhi = (words[1] >> 0) & 0xfff;

    STATE_MASK(static_state.flags, bool(words[1] & (1 << 25)), RASTERIZATION_INTERLACE_FIELD_BIT);
    STATE_MASK(static_state.flags, bool(words[1] & (1 << 24)), RASTERIZATION_INTERLACE_KEEP_ODD_BIT);
    renderer.set_scissor_state(scissor_state);
    renderer.set_static_rasterization_state(static_state);
}

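// SET_OTHER_MODES: unpack the mode bits into the static rasterization state and the
// depth/blend state. The cycle type selects between 1-cycle, 2-cycle, fill and copy
// rasterization, tracked as flag bits on both state blocks.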
void CommandProcessor::op_set_other_modes(const uint32_t *words)
{
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 19)), RASTERIZATION_PERSPECTIVE_CORRECT_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 18)), RASTERIZATION_DETAIL_LOD_ENABLE_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 17)), RASTERIZATION_SHARPEN_LOD_ENABLE_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 16)), RASTERIZATION_TEX_LOD_ENABLE_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 15)), RASTERIZATION_TLUT_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 14)), RASTERIZATION_TLUT_TYPE_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 13)), RASTERIZATION_SAMPLE_MODE_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 12)), RASTERIZATION_SAMPLE_MID_TEXEL_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 11)), RASTERIZATION_BILERP_0_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 10)), RASTERIZATION_BILERP_1_BIT);
    STATE_MASK(static_state.flags, bool(words[0] & (1 << 9)), RASTERIZATION_CONVERT_ONE_BIT);
    STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 14)), DEPTH_BLEND_FORCE_BLEND_BIT);
    STATE_MASK(static_state.flags, bool(words[1] & (1 << 13)), RASTERIZATION_ALPHA_CVG_SELECT_BIT);
    STATE_MASK(static_state.flags, bool(words[1] & (1 << 12)), RASTERIZATION_CVG_TIMES_ALPHA_BIT);
    STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 7)), DEPTH_BLEND_COLOR_ON_COVERAGE_BIT);
    STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 6)), DEPTH_BLEND_IMAGE_READ_ENABLE_BIT);
    STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 5)), DEPTH_BLEND_DEPTH_UPDATE_BIT);
    STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 4)), DEPTH_BLEND_DEPTH_TEST_BIT);
    STATE_MASK(static_state.flags, bool(words[1] & (1 << 3)), RASTERIZATION_AA_BIT);
    STATE_MASK(depth_blend.flags, bool(words[1] & (1 << 3)), DEPTH_BLEND_AA_BIT);

    STATE_MASK(static_state.flags, bool(words[1] & (1 << 1)), RASTERIZATION_ALPHA_TEST_DITHER_BIT);
    STATE_MASK(static_state.flags, bool(words[1] & (1 << 0)), RASTERIZATION_ALPHA_TEST_BIT);
    static_state.dither = (words[0] >> 4) & 0x0f;
    STATE_MASK(depth_blend.flags, RGBDitherMode(static_state.dither >> 2) != RGBDitherMode::Off, DEPTH_BLEND_DITHER_ENABLE_BIT);
    depth_blend.coverage_mode = static_cast<CoverageMode>((words[1] >> 8) & 3);
    depth_blend.z_mode = static_cast<ZMode>((words[1] >> 10) & 3);

    static_state.flags &= ~(RASTERIZATION_MULTI_CYCLE_BIT |
                            RASTERIZATION_FILL_BIT |
                            RASTERIZATION_COPY_BIT);
    depth_blend.flags &= ~DEPTH_BLEND_MULTI_CYCLE_BIT;

    switch (CycleType((words[0] >> 20) & 3))
    {
    case CycleType::Cycle2:
        static_state.flags |= RASTERIZATION_MULTI_CYCLE_BIT;
        depth_blend.flags |= DEPTH_BLEND_MULTI_CYCLE_BIT;
        break;

    case CycleType::Fill:
        static_state.flags |= RASTERIZATION_FILL_BIT;
        break;

    case CycleType::Copy:
        static_state.flags |= RASTERIZATION_COPY_BIT;
        break;

    default:
        break;
    }

    depth_blend.blend_cycles[0].blend_1a = static_cast<BlendMode1A>((words[1] >> 30) & 3);
    depth_blend.blend_cycles[1].blend_1a = static_cast<BlendMode1A>((words[1] >> 28) & 3);
    depth_blend.blend_cycles[0].blend_1b = static_cast<BlendMode1B>((words[1] >> 26) & 3);
    depth_blend.blend_cycles[1].blend_1b = static_cast<BlendMode1B>((words[1] >> 24) & 3);
    depth_blend.blend_cycles[0].blend_2a = static_cast<BlendMode2A>((words[1] >> 22) & 3);
    depth_blend.blend_cycles[1].blend_2a = static_cast<BlendMode2A>((words[1] >> 20) & 3);
    depth_blend.blend_cycles[0].blend_2b = static_cast<BlendMode2B>((words[1] >> 18) & 3);
    depth_blend.blend_cycles[1].blend_2b = static_cast<BlendMode2B>((words[1] >> 16) & 3);

    renderer.set_static_rasterization_state(static_state);
    renderer.set_depth_blend_state(depth_blend);
    renderer.set_enable_primitive_depth(bool(words[1] & (1 << 2)));
}

void CommandProcessor::op_set_texture_image(const uint32_t *words)
{
    auto fmt = TextureFormat((words[0] >> 21) & 7);
    auto size = TextureSize((words[0] >> 19) & 3);
    uint32_t width = (words[0] & 0x3ff) + 1;
    uint32_t addr = words[1] & 0x00ffffffu;

    texture_image.addr = addr;
    texture_image.width = width;
    texture_image.size = size;
    texture_image.fmt = fmt;
}

void CommandProcessor::op_set_tile(const uint32_t *words)
{
    uint32_t tile = (words[1] >> 24) & 7;

    TileMeta info = {};
    info.offset = ((words[0] >> 0) & 511) << 3;
    info.stride = ((words[0] >> 9) & 511) << 3;
    info.size = TextureSize((words[0] >> 19) & 3);
    info.fmt = TextureFormat((words[0] >> 21) & 7);

    info.palette = (words[1] >> 20) & 15;

    info.shift_s = (words[1] >> 0) & 15;
    info.mask_s = (words[1] >> 4) & 15;
    info.shift_t = (words[1] >> 10) & 15;
    info.mask_t = (words[1] >> 14) & 15;

    if (words[1] & (1 << 8))
        info.flags |= TILE_INFO_MIRROR_S_BIT;
    if (words[1] & (1 << 9))
        info.flags |= TILE_INFO_CLAMP_S_BIT;
    if (words[1] & (1 << 18))
        info.flags |= TILE_INFO_MIRROR_T_BIT;
    if (words[1] & (1 << 19))
        info.flags |= TILE_INFO_CLAMP_T_BIT;

    if (info.mask_s > 10)
        info.mask_s = 10;
    else if (info.mask_s == 0)
        info.flags |= TILE_INFO_CLAMP_S_BIT;

    if (info.mask_t > 10)
        info.mask_t = 10;
    else if (info.mask_t == 0)
        info.flags |= TILE_INFO_CLAMP_T_BIT;

    renderer.set_tile(tile, info);
}

void CommandProcessor::op_load_tile(const uint32_t *words)
{
    uint32_t tile = (words[1] >> 24) & 7;

    LoadTileInfo info = {};

    info.tex_addr = texture_image.addr;
    info.tex_width = texture_image.width;
    info.fmt = texture_image.fmt;
    info.size = texture_image.size;
    info.slo = (words[0] >> 12) & 0xfff;
    info.shi = (words[1] >> 12) & 0xfff;
    info.tlo = (words[0] >> 0) & 0xfff;
    info.thi = (words[1] >> 0) & 0xfff;
    info.mode = UploadMode::Tile;

    renderer.load_tile(tile, info);
}

void CommandProcessor::op_load_tlut(const uint32_t *words)
{
    uint32_t tile = (words[1] >> 24) & 7;

    LoadTileInfo info = {};

    info.tex_addr = texture_image.addr;
    info.tex_width = texture_image.width;
    info.fmt = texture_image.fmt;
    info.size = texture_image.size;
    info.slo = (words[0] >> 12) & 0xfff;
    info.shi = (words[1] >> 12) & 0xfff;
    info.tlo = (words[0] >> 0) & 0xfff;
    info.thi = (words[1] >> 0) & 0xfff;
    info.mode = UploadMode::TLUT;

    renderer.load_tile(tile, info);
}

void CommandProcessor::op_load_block(const uint32_t *words)
{
    uint32_t tile = (words[1] >> 24) & 7;

    LoadTileInfo info = {};

    info.tex_addr = texture_image.addr;
    info.tex_width = texture_image.width;
    info.fmt = texture_image.fmt;
    info.size = texture_image.size;
    info.slo = (words[0] >> 12) & 0xfff;
    info.shi = (words[1] >> 12) & 0xfff;
    info.tlo = (words[0] >> 0) & 0xfff;
    info.thi = (words[1] >> 0) & 0xfff;
    info.mode = UploadMode::Block;

    renderer.load_tile(tile, info);
}

void CommandProcessor::op_set_tile_size(const uint32_t *words)
{
    uint32_t tile = (words[1] >> 24) & 7;
    auto slo = (words[0] >> 12) & 0xfff;
    auto shi = (words[1] >> 12) & 0xfff;
    auto tlo = (words[0] >> 0) & 0xfff;
    auto thi = (words[1] >> 0) & 0xfff;
    renderer.set_tile_size(tile, slo, shi, tlo, thi);
}

void CommandProcessor::op_set_combine(const uint32_t *words)
{
    static_state.combiner[0].rgb.muladd = static_cast<RGBMulAdd>((words[0] >> 20) & 0xf);
    static_state.combiner[0].rgb.mul = static_cast<RGBMul>((words[0] >> 15) & 0x1f);
    static_state.combiner[0].rgb.mulsub = static_cast<RGBMulSub>((words[1] >> 28) & 0xf);
    static_state.combiner[0].rgb.add = static_cast<RGBAdd>((words[1] >> 15) & 0x7);

    static_state.combiner[0].alpha.muladd = static_cast<AlphaAddSub>((words[0] >> 12) & 0x7);
    static_state.combiner[0].alpha.mulsub = static_cast<AlphaAddSub>((words[1] >> 12) & 0x7);
    static_state.combiner[0].alpha.mul = static_cast<AlphaMul>((words[0] >> 9) & 0x7);
    static_state.combiner[0].alpha.add = static_cast<AlphaAddSub>((words[1] >> 9) & 0x7);

    static_state.combiner[1].rgb.muladd = static_cast<RGBMulAdd>((words[0] >> 5) & 0xf);
    static_state.combiner[1].rgb.mul = static_cast<RGBMul>((words[0] >> 0) & 0x1f);
    static_state.combiner[1].rgb.mulsub = static_cast<RGBMulSub>((words[1] >> 24) & 0xf);
    static_state.combiner[1].rgb.add = static_cast<RGBAdd>((words[1] >> 6) & 0x7);

    static_state.combiner[1].alpha.muladd = static_cast<AlphaAddSub>((words[1] >> 21) & 0x7);
    static_state.combiner[1].alpha.mulsub = static_cast<AlphaAddSub>((words[1] >> 3) & 0x7);
    static_state.combiner[1].alpha.mul = static_cast<AlphaMul>((words[1] >> 18) & 0x7);
    static_state.combiner[1].alpha.add = static_cast<AlphaAddSub>((words[1] >> 0) & 0x7);

    renderer.set_static_rasterization_state(static_state);
}

void CommandProcessor::op_set_blend_color(const uint32_t *words)
{
    renderer.set_blend_color(words[1]);
}

void CommandProcessor::op_set_env_color(const uint32_t *words)
{
    renderer.set_env_color(words[1]);
}

void CommandProcessor::op_set_fog_color(const uint32_t *words)
{
    renderer.set_fog_color(words[1]);
}

void CommandProcessor::op_set_prim_color(const uint32_t *words)
{
    uint8_t prim_min_level = (words[0] >> 8) & 31;
    uint8_t prim_level_frac = (words[0] >> 0) & 0xff;
    renderer.set_primitive_color(prim_min_level, prim_level_frac, words[1]);
}

void CommandProcessor::op_set_fill_color(const uint32_t *words)
{
    renderer.set_fill_color(words[1]);
}

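// Rectangle commands are rasterized as flipped, axis-aligned primitives with XM == XL and zero
// edge slopes. In FILL/COPY cycle types YL is ORed with 3 so the bottom line covers all of its
// sub-scanlines, which mirrors how the hardware treats rectangles in those modes.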
void CommandProcessor::op_fill_rectangle(const uint32_t *words)
{
    uint32_t xl = (words[0] >> 12) & 0xfff;
    uint32_t yl = (words[0] >> 0) & 0xfff;
    uint32_t xh = (words[1] >> 12) & 0xfff;
    uint32_t yh = (words[1] >> 0) & 0xfff;

    if ((static_state.flags & (RASTERIZATION_COPY_BIT | RASTERIZATION_FILL_BIT)) != 0)
        yl |= 3;

    TriangleSetup setup = {};
    setup.xh = xh << 13;
    setup.xl = xl << 13;
    setup.xm = xl << 13;
    setup.ym = yl;
    setup.yl = yl;
    setup.yh = yh;
    setup.flags = TRIANGLE_SETUP_FLIP_BIT | TRIANGLE_SETUP_DISABLE_UPSCALING_BIT;

    renderer.draw_flat_primitive(setup);
}

void CommandProcessor::op_texture_rectangle(const uint32_t *words)
{
    uint32_t xl = (words[0] >> 12) & 0xfff;
    uint32_t yl = (words[0] >> 0) & 0xfff;
    uint32_t xh = (words[1] >> 12) & 0xfff;
    uint32_t yh = (words[1] >> 0) & 0xfff;
    uint32_t tile = (words[1] >> 24) & 0x7;

    int32_t s = (words[2] >> 16) & 0xffff;
    int32_t t = (words[2] >> 0) & 0xffff;
    int32_t dsdx = (words[3] >> 16) & 0xffff;
    int32_t dtdy = (words[3] >> 0) & 0xffff;
    dsdx = sext<16>(dsdx);
    dtdy = sext<16>(dtdy);

    if ((static_state.flags & (RASTERIZATION_COPY_BIT | RASTERIZATION_FILL_BIT)) != 0)
        yl |= 3;

    TriangleSetup setup = {};
    AttributeSetup attr = {};

    setup.xh = xh << 13;
    setup.xl = xl << 13;
    setup.xm = xl << 13;
    setup.ym = yl;
    setup.yl = yl;
    setup.yh = yh;
    setup.flags = TRIANGLE_SETUP_FLIP_BIT |
                  (quirks.u.options.native_resolution_tex_rect ? TRIANGLE_SETUP_DISABLE_UPSCALING_BIT : 0) |
                  (quirks.u.options.native_texture_lod ? TRIANGLE_SETUP_NATIVE_LOD_BIT : 0);
    setup.tile = tile;

    attr.s = s << 16;
    attr.t = t << 16;
    attr.dsdx = dsdx << 11;
    attr.dtde = dtdy << 11;
    attr.dtdy = dtdy << 11;

    if ((static_state.flags & RASTERIZATION_COPY_BIT) != 0)
        setup.flags |= TRIANGLE_SETUP_SKIP_XFRAC_BIT;

    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_texture_rectangle_flip(const uint32_t *words)
{
    uint32_t xl = (words[0] >> 12) & 0xfff;
    uint32_t yl = (words[0] >> 0) & 0xfff;
    uint32_t xh = (words[1] >> 12) & 0xfff;
    uint32_t yh = (words[1] >> 0) & 0xfff;
    uint32_t tile = (words[1] >> 24) & 0x7;

    int32_t s = (words[2] >> 16) & 0xffff;
    int32_t t = (words[2] >> 0) & 0xffff;
    int32_t dsdx = (words[3] >> 16) & 0xffff;
    int32_t dtdy = (words[3] >> 0) & 0xffff;
    dsdx = sext<16>(dsdx);
    dtdy = sext<16>(dtdy);

    if ((static_state.flags & (RASTERIZATION_COPY_BIT | RASTERIZATION_FILL_BIT)) != 0)
        yl |= 3;

    TriangleSetup setup = {};
    AttributeSetup attr = {};

    setup.xh = xh << 13;
    setup.xl = xl << 13;
    setup.xm = xl << 13;
    setup.ym = yl;
    setup.yl = yl;
    setup.yh = yh;
    setup.flags = TRIANGLE_SETUP_FLIP_BIT | TRIANGLE_SETUP_DISABLE_UPSCALING_BIT |
                  (quirks.u.options.native_resolution_tex_rect ? TRIANGLE_SETUP_DISABLE_UPSCALING_BIT : 0) |
                  (quirks.u.options.native_texture_lod ? TRIANGLE_SETUP_NATIVE_LOD_BIT : 0);
    setup.tile = tile;

    attr.s = s << 16;
    attr.t = t << 16;
    attr.dtdx = dtdy << 11;
    attr.dsde = dsdx << 11;
    attr.dsdy = dsdx << 11;

    if ((static_state.flags & RASTERIZATION_COPY_BIT) != 0)
        setup.flags |= TRIANGLE_SETUP_SKIP_XFRAC_BIT;

    renderer.draw_shaded_primitive(setup, attr);
}

void CommandProcessor::op_set_prim_depth(const uint32_t *words)
{
    renderer.set_primitive_depth((words[1] >> 16) & 0xffff, words[1] & 0xffff);
}

void CommandProcessor::op_set_convert(const uint32_t *words)
{
    uint64_t merged = (uint64_t(words[0]) << 32) | words[1];

    uint16_t k5 = (merged >> 0) & 0x1ff;
    uint16_t k4 = (merged >> 9) & 0x1ff;
    uint16_t k3 = (merged >> 18) & 0x1ff;
    uint16_t k2 = (merged >> 27) & 0x1ff;
    uint16_t k1 = (merged >> 36) & 0x1ff;
    uint16_t k0 = (merged >> 45) & 0x1ff;
    renderer.set_convert(k0, k1, k2, k3, k4, k5);
}

void CommandProcessor::op_set_key_gb(const uint32_t *words)
{
    uint32_t g_width = (words[0] >> 12) & 0xfff;
    uint32_t b_width = (words[0] >> 0) & 0xfff;
    uint32_t g_center = (words[1] >> 24) & 0xff;
    uint32_t g_scale = (words[1] >> 16) & 0xff;
    uint32_t b_center = (words[1] >> 8) & 0xff;
    uint32_t b_scale = (words[1] >> 0) & 0xff;
    renderer.set_color_key(1, g_width, g_center, g_scale);
    renderer.set_color_key(2, b_width, b_center, b_scale);
}

void CommandProcessor::op_set_key_r(const uint32_t *words)
{
    uint32_t r_width = (words[1] >> 16) & 0xfff;
    uint32_t r_center = (words[1] >> 8) & 0xff;
    uint32_t r_scale = (words[1] >> 0) & 0xff;
    renderer.set_color_key(0, r_width, r_center, r_scale);
}

#define OP(x) void CommandProcessor::op_##x(const uint32_t *) {}
OP(sync_load) OP(sync_pipe)
OP(sync_tile)
#undef OP

void CommandProcessor::enqueue_command(unsigned num_words, const uint32_t *words)
{
    if (single_threaded_processing)
        enqueue_command_direct(num_words, words);
    else
        ring.enqueue_command(num_words, words);
}

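// Dispatches a single command directly on the calling thread. Hardware RDP opcodes are routed
// through the 64-entry table below (indexed by bits 24..29 of the first word); the Op::Meta*
// values are internal parallel-rdp opcodes used for timeline signalling, flushing, idling and
// quirk configuration.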
void CommandProcessor::enqueue_command_direct(unsigned, const uint32_t *words)
{
#define OP(x) &CommandProcessor::op_##x
    using CommandFunc = void (CommandProcessor::*)(const uint32_t *words);
    static const CommandFunc funcs[64] = {
        /* 0x00 */ nullptr, nullptr, nullptr, nullptr,
        /* 0x04 */ nullptr, nullptr, nullptr, nullptr,
        /* 0x08 */ OP(fill_triangle), OP(fill_z_buffer_triangle), OP(texture_triangle), OP(texture_z_buffer_triangle),
        /* 0x0c */ OP(shade_triangle), OP(shade_z_buffer_triangle), OP(shade_texture_triangle), OP(shade_texture_z_buffer_triangle),
        /* 0x10 */ nullptr, nullptr, nullptr, nullptr,
        /* 0x14 */ nullptr, nullptr, nullptr, nullptr,
        /* 0x18 */ nullptr, nullptr, nullptr, nullptr,
        /* 0x1c */ nullptr, nullptr, nullptr, nullptr,
        /* 0x20 */ nullptr, nullptr, nullptr, nullptr,
        /* 0x24 */ OP(texture_rectangle), OP(texture_rectangle_flip), OP(sync_load), OP(sync_pipe),
        /* 0x28 */ OP(sync_tile), OP(sync_full), OP(set_key_gb), OP(set_key_r),
        /* 0x2c */ OP(set_convert), OP(set_scissor), OP(set_prim_depth), OP(set_other_modes),
        /* 0x30 */ OP(load_tlut), nullptr, OP(set_tile_size), OP(load_block),
        /* 0x34 */ OP(load_tile), OP(set_tile), OP(fill_rectangle), OP(set_fill_color),
        /* 0x38 */ OP(set_fog_color), OP(set_blend_color), OP(set_prim_color), OP(set_env_color),
        /* 0x3c */ OP(set_combine), OP(set_texture_image), OP(set_mask_image), OP(set_color_image),
    };
#undef OP

    unsigned op = (words[0] >> 24) & 63;
    switch (Op(op))
    {
    case Op::MetaSignalTimeline:
    {
        renderer.flush_and_signal();
        uint64_t val = words[1] | (uint64_t(words[2]) << 32);
        CoherencyOperation signal_op;
        signal_op.timeline_value = val;
        timeline_worker.push(std::move(signal_op));
        break;
    }

    case Op::MetaFlush:
    {
        renderer.flush_and_signal();
        break;
    }

    case Op::MetaIdle:
    {
        renderer.notify_idle_command_thread();
        break;
    }

    case Op::MetaSetQuirks:
    {
        quirks.u.words[0] = words[1];
        break;
    }

    default:
        if (funcs[op])
            (this->*funcs[op])(words);
        break;
    }
}

void CommandProcessor::set_quirks(const Quirks &quirks_)
{
    const uint32_t words[2] = {
        uint32_t(Op::MetaSetQuirks) << 24u,
        quirks_.u.words[0],
    };
    enqueue_command(2, words);
}

void CommandProcessor::set_vi_register(VIRegister reg, uint32_t value)
{
    vi.set_vi_register(reg, value);
}

void *CommandProcessor::begin_read_rdram()
{
    if (rdram)
        return device.map_host_buffer(*rdram, MEMORY_ACCESS_READ_BIT);
    else
        return nullptr;
}

void CommandProcessor::end_write_rdram()
{
    if (rdram)
        device.unmap_host_buffer(*rdram, MEMORY_ACCESS_WRITE_BIT);
}

void *CommandProcessor::begin_read_hidden_rdram()
{
    return device.map_host_buffer(*hidden_rdram, MEMORY_ACCESS_READ_BIT);
}

void CommandProcessor::end_write_hidden_rdram()
{
    device.unmap_host_buffer(*hidden_rdram, MEMORY_ACCESS_WRITE_BIT);
}

size_t CommandProcessor::get_rdram_size() const
{
    if (is_host_coherent)
        return rdram->get_create_info().size;
    else
        return rdram->get_create_info().size / 2;
}

size_t CommandProcessor::get_hidden_rdram_size() const
{
    return hidden_rdram->get_create_info().size;
}

void *CommandProcessor::get_tmem()
{
    return device.map_host_buffer(*tmem, MEMORY_ACCESS_READ_BIT);
}

void CommandProcessor::idle()
{
    flush();
    wait_for_timeline(signal_timeline());
}

void CommandProcessor::flush()
{
    const uint32_t words[1] = {
        uint32_t(Op::MetaFlush) << 24,
    };
    enqueue_command(1, words);
}

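// Typical integration sketch (hypothetical frontend code, not part of this file): the emulator
// pushes raw RDP command words and uses the timeline to synchronize CPU-visible RDRAM with GPU
// progress.
//
//     CommandProcessor rdp(device, rdram_ptr, 0, rdram_size, hidden_rdram_size, 0);
//     rdp.enqueue_command(num_words, words);       // raw command words from the DP FIFO
//     uint64_t point = rdp.signal_timeline();      // request a sync point after queued work
//     rdp.wait_for_timeline(point);                // block until GPU work (and writeback) is done
//     Vulkan::ImageHandle frame = rdp.scanout(scanout_opts);
//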
uint64_t CommandProcessor::signal_timeline()
{
    timeline_value++;

    const uint32_t words[3] = {
        uint32_t(Op::MetaSignalTimeline) << 24,
        uint32_t(timeline_value),
        uint32_t(timeline_value >> 32),
    };
    enqueue_command(3, words);

    return timeline_value;
}

void CommandProcessor::wait_for_timeline(uint64_t index)
{
    Vulkan::QueryPoolHandle start_ts, end_ts;
    if (measure_stall_time)
        start_ts = device.write_calibrated_timestamp();
    timeline_worker.wait([this, index]() -> bool {
        return thread_timeline_value >= index;
    });
    if (measure_stall_time)
    {
        end_ts = device.write_calibrated_timestamp();
        device.register_time_interval("RDP CPU", std::move(start_ts), std::move(end_ts), "wait-for-timeline");
    }
}

Vulkan::ImageHandle CommandProcessor::scanout(const ScanoutOptions &opts)
{
    Vulkan::QueryPoolHandle start_ts, end_ts;
    drain_command_ring();

    // Block idle callbacks triggering while we're doing this.
    renderer.lock_command_processing();
    {
        renderer.flush_and_signal();
        if (!is_host_coherent)
        {
            unsigned offset, length;
            vi.scanout_memory_range(offset, length);
            renderer.resolve_coherency_external(offset, length);
        }
    }
    renderer.unlock_command_processing();

    auto scanout = vi.scanout(VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, opts, renderer.get_scaling_factor());
    return scanout;
}

void CommandProcessor::drain_command_ring()
{
    Vulkan::QueryPoolHandle start_ts, end_ts;
    if (timestamp)
        start_ts = device.write_calibrated_timestamp();
    ring.drain();
    if (timestamp)
    {
        end_ts = device.write_calibrated_timestamp();
        device.register_time_interval("RDP CPU", std::move(start_ts), std::move(end_ts), "drain-command-ring");
    }
}

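// Synchronous scanout for CPU readback (e.g. for comparing against a reference rasterizer):
// drains all pending work, resolves coherency when RDRAM is not host-coherent, scans out at
// native 1x resolution and copies the resulting image into a cached host buffer.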
void CommandProcessor::scanout_sync(std::vector<RGBA> &colors, unsigned &width, unsigned &height)
{
    drain_command_ring();
    renderer.flush_and_signal();

    if (!is_host_coherent)
    {
        unsigned offset, length;
        vi.scanout_memory_range(offset, length);
        renderer.resolve_coherency_external(offset, length);
    }

    ScanoutOptions opts = {};
    // Downscale down to 1x, always.
    opts.downscale_steps = 32;
    opts.blend_previous_frame = true;
    opts.upscale_deinterlacing = false;

    auto handle = vi.scanout(VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, opts, renderer.get_scaling_factor());

    if (!handle)
    {
        width = 0;
        height = 0;
        colors.clear();
        return;
    }

    width = handle->get_width();
    height = handle->get_height();

    Vulkan::BufferCreateInfo info = {};
    info.size = width * height * sizeof(uint32_t);
    info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    info.domain = Vulkan::BufferDomain::CachedHost;
    auto readback = device.create_buffer(info);

    auto cmd = device.request_command_buffer();
    cmd->copy_image_to_buffer(*readback, *handle, 0, {}, { width, height, 1 }, 0, 0, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 });
    cmd->barrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
                 VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT);

    Vulkan::Fence fence;
    device.submit(cmd, &fence);
    fence->wait();

    colors.resize(width * height);
    memcpy(colors.data(), device.map_host_buffer(*readback, Vulkan::MEMORY_ACCESS_READ_BIT),
           width * height * sizeof(uint32_t));
    device.unmap_host_buffer(*readback, Vulkan::MEMORY_ACCESS_READ_BIT);
}

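// FenceExecutor runs on the timeline worker thread. For each CoherencyOperation it roughly:
// waits for the associated GPU fence, copies GPU-written bytes back into host RDRAM through the
// writemask (non-coherent mode only), and then publishes the timeline value that
// wait_for_timeline() observes.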
void CommandProcessor::FenceExecutor::notify_work_locked(const CoherencyOperation &work)
{
    if (work.timeline_value)
        *value = work.timeline_value;
}

bool CommandProcessor::FenceExecutor::is_sentinel(const CoherencyOperation &work) const
{
    return !work.fence && !work.timeline_value;
}

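// Copies only the bytes whose writemask byte is non-zero from the GPU staging copy into host
// RDRAM. The SSE2 path uses _mm_maskmoveu_si128 (byte-masked stores with a non-temporal hint,
// which is presumably why perform_work() issues _mm_mfence() afterwards); the portable fallback
// tests one 32-bit word of mask at a time.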
static void masked_memcpy(uint8_t * __restrict dst,
                          const uint8_t * __restrict data_src,
                          const uint8_t * __restrict masked_src,
                          size_t size)
{
#if defined(__SSE2__)
    for (size_t i = 0; i < size; i += 16)
    {
        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data_src + i));
        __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(masked_src + i));
        _mm_maskmoveu_si128(data, mask, reinterpret_cast<char *>(dst + i));
    }
#else
    auto * __restrict data32 = reinterpret_cast<const uint32_t *>(data_src);
    auto * __restrict mask32 = reinterpret_cast<const uint32_t *>(masked_src);
    auto * __restrict dst32 = reinterpret_cast<uint32_t *>(dst);
    auto size32 = size >> 2;

    for (size_t i = 0; i < size32; i++)
    {
        auto mask = mask32[i];
        if (mask == ~0u)
        {
            dst32[i] = data32[i];
        }
        else if (mask)
        {
            // Fairly rare path.
            for (unsigned j = 0; j < 4; j++)
                if (masked_src[4 * i + j])
                    dst[4 * i + j] = data_src[4 * i + j];
        }
    }
#endif
}

void CommandProcessor::FenceExecutor::perform_work(CoherencyOperation &work)
{
    if (work.fence)
        work.fence->wait();

    if (work.unlock_cookie)
        work.unlock_cookie->fetch_sub(1, std::memory_order_relaxed);

    if (work.src)
    {
        for (auto &copy : work.copies)
        {
            auto *mapped_data = static_cast<uint8_t *>(device->map_host_buffer(*work.src, MEMORY_ACCESS_READ_BIT, copy.src_offset, copy.size));
            auto *mapped_mask = static_cast<uint8_t *>(device->map_host_buffer(*work.src, MEMORY_ACCESS_READ_BIT, copy.mask_offset, copy.size));
            masked_memcpy(work.dst + copy.dst_offset, mapped_data, mapped_mask, copy.size);
            for (unsigned i = 0; i < copy.counters; i++)
            {
                unsigned val = copy.counter_base[i].fetch_sub(1, std::memory_order_release);
                (void)val;
                assert(val > 0);
            }
        }

#ifdef __SSE2__
        _mm_mfence();
#endif
    }
}

void CommandProcessor::enqueue_coherency_operation(CoherencyOperation &&op)
{
    timeline_worker.push(std::move(op));
}
}
