1 // Copyright 2014 Citra Emulator Project
2 // Licensed under GPLv2 or any later version
3 // Refer to the license.txt file included.
4
5 #include <cstring>
6 #include <numeric>
7 #include <type_traits>
8 #include "common/alignment.h"
9 #include "common/color.h"
10 #include "common/common_types.h"
11 #include "common/logging/log.h"
12 #include "common/microprofile.h"
13 #include "common/vector_math.h"
14 #include "core/core.h"
15 #include "core/core_timing.h"
16 #include "core/hle/service/gsp/gsp.h"
17 #include "core/hw/gpu.h"
18 #include "core/hw/hw.h"
19 #include "core/memory.h"
20 #include "core/tracer/recorder.h"
21 #include "video_core/command_processor.h"
22 #include "video_core/debug_utils/debug_utils.h"
23 #include "video_core/rasterizer_interface.h"
24 #include "video_core/renderer_base.h"
25 #include "video_core/utils.h"
26 #include "video_core/video_core.h"
27
28 namespace GPU {
29
30 Regs g_regs;
31 Memory::MemorySystem* g_memory;
32
33 /// Event id for CoreTiming
34 static Core::TimingEventType* vblank_event;
35
36 template <typename T>
Read(T & var,const u32 raw_addr)37 inline void Read(T& var, const u32 raw_addr) {
38 u32 addr = raw_addr - HW::VADDR_GPU;
39 u32 index = addr / 4;
40
41 // Reads other than u32 are untested, so I'd rather have them abort than silently fail
42 if (index >= Regs::NumIds() || !std::is_same<T, u32>::value) {
43 LOG_ERROR(HW_GPU, "unknown Read{} @ {:#010X}", sizeof(var) * 8, addr);
44 return;
45 }
46
47 var = g_regs[addr / 4];
48 }
49
DecodePixel(Regs::PixelFormat input_format,const u8 * src_pixel)50 static Common::Vec4<u8> DecodePixel(Regs::PixelFormat input_format, const u8* src_pixel) {
51 switch (input_format) {
52 case Regs::PixelFormat::RGBA8:
53 return Color::DecodeRGBA8(src_pixel);
54
55 case Regs::PixelFormat::RGB8:
56 return Color::DecodeRGB8(src_pixel);
57
58 case Regs::PixelFormat::RGB565:
59 return Color::DecodeRGB565(src_pixel);
60
61 case Regs::PixelFormat::RGB5A1:
62 return Color::DecodeRGB5A1(src_pixel);
63
64 case Regs::PixelFormat::RGBA4:
65 return Color::DecodeRGBA4(src_pixel);
66
67 default:
68 LOG_ERROR(HW_GPU, "Unknown source framebuffer format {:x}", input_format);
69 return {0, 0, 0, 0};
70 }
71 }
72
73 MICROPROFILE_DEFINE(GPU_DisplayTransfer, "GPU", "DisplayTransfer", MP_RGB(100, 100, 255));
74 MICROPROFILE_DEFINE(GPU_CmdlistProcessing, "GPU", "Cmdlist Processing", MP_RGB(100, 255, 100));
75
MemoryFill(const Regs::MemoryFillConfig & config)76 static void MemoryFill(const Regs::MemoryFillConfig& config) {
77 const PAddr start_addr = config.GetStartAddress();
78 const PAddr end_addr = config.GetEndAddress();
79
80 // TODO: do hwtest with these cases
81 if (!g_memory->IsValidPhysicalAddress(start_addr)) {
82 LOG_CRITICAL(HW_GPU, "invalid start address {:#010X}", start_addr);
83 return;
84 }
85
86 if (!g_memory->IsValidPhysicalAddress(end_addr)) {
87 LOG_CRITICAL(HW_GPU, "invalid end address {:#010X}", end_addr);
88 return;
89 }
90
91 if (end_addr <= start_addr) {
92 LOG_CRITICAL(HW_GPU, "invalid memory range from {:#010X} to {:#010X}", start_addr,
93 end_addr);
94 return;
95 }
96
97 u8* start = g_memory->GetPhysicalPointer(start_addr);
98 u8* end = g_memory->GetPhysicalPointer(end_addr);
99
100 if (VideoCore::g_renderer->Rasterizer()->AccelerateFill(config))
101 return;
102
103 Memory::RasterizerInvalidateRegion(config.GetStartAddress(),
104 config.GetEndAddress() - config.GetStartAddress());
105
106 if (config.fill_24bit) {
107 // fill with 24-bit values
108 for (u8* ptr = start; ptr < end; ptr += 3) {
109 ptr[0] = config.value_24bit_r;
110 ptr[1] = config.value_24bit_g;
111 ptr[2] = config.value_24bit_b;
112 }
113 } else if (config.fill_32bit) {
114 // fill with 32-bit values
115 if (end > start) {
116 u32 value = config.value_32bit;
117 std::size_t len = (end - start) / sizeof(u32);
118 for (std::size_t i = 0; i < len; ++i)
119 memcpy(&start[i * sizeof(u32)], &value, sizeof(u32));
120 }
121 } else {
122 // fill with 16-bit values
123 u16 value_16bit = config.value_16bit.Value();
124 for (u8* ptr = start; ptr < end; ptr += sizeof(u16))
125 memcpy(ptr, &value_16bit, sizeof(u16));
126 }
127 }
128
DisplayTransfer(const Regs::DisplayTransferConfig & config)129 static void DisplayTransfer(const Regs::DisplayTransferConfig& config) {
130 const PAddr src_addr = config.GetPhysicalInputAddress();
131 const PAddr dst_addr = config.GetPhysicalOutputAddress();
132
133 // TODO: do hwtest with these cases
134 if (!g_memory->IsValidPhysicalAddress(src_addr)) {
135 LOG_CRITICAL(HW_GPU, "invalid input address {:#010X}", src_addr);
136 return;
137 }
138
139 if (!g_memory->IsValidPhysicalAddress(dst_addr)) {
140 LOG_CRITICAL(HW_GPU, "invalid output address {:#010X}", dst_addr);
141 return;
142 }
143
144 if (config.input_width == 0) {
145 LOG_CRITICAL(HW_GPU, "zero input width");
146 return;
147 }
148
149 if (config.input_height == 0) {
150 LOG_CRITICAL(HW_GPU, "zero input height");
151 return;
152 }
153
154 if (config.output_width == 0) {
155 LOG_CRITICAL(HW_GPU, "zero output width");
156 return;
157 }
158
159 if (config.output_height == 0) {
160 LOG_CRITICAL(HW_GPU, "zero output height");
161 return;
162 }
163
164 if (VideoCore::g_renderer->Rasterizer()->AccelerateDisplayTransfer(config))
165 return;
166
167 u8* src_pointer = g_memory->GetPhysicalPointer(src_addr);
168 u8* dst_pointer = g_memory->GetPhysicalPointer(dst_addr);
169
170 if (config.scaling > config.ScaleXY) {
171 LOG_CRITICAL(HW_GPU, "Unimplemented display transfer scaling mode {}",
172 config.scaling.Value());
173 UNIMPLEMENTED();
174 return;
175 }
176
177 if (config.input_linear && config.scaling != config.NoScale) {
178 LOG_CRITICAL(HW_GPU, "Scaling is only implemented on tiled input");
179 UNIMPLEMENTED();
180 return;
181 }
182
183 int horizontal_scale = config.scaling != config.NoScale ? 1 : 0;
184 int vertical_scale = config.scaling == config.ScaleXY ? 1 : 0;
185
186 u32 output_width = config.output_width >> horizontal_scale;
187 u32 output_height = config.output_height >> vertical_scale;
188
189 u32 input_size =
190 config.input_width * config.input_height * GPU::Regs::BytesPerPixel(config.input_format);
191 u32 output_size = output_width * output_height * GPU::Regs::BytesPerPixel(config.output_format);
192
193 Memory::RasterizerFlushRegion(config.GetPhysicalInputAddress(), input_size);
194 Memory::RasterizerInvalidateRegion(config.GetPhysicalOutputAddress(), output_size);
195
196 for (u32 y = 0; y < output_height; ++y) {
197 for (u32 x = 0; x < output_width; ++x) {
198 Common::Vec4<u8> src_color;
199
200 // Calculate the [x,y] position of the input image
201 // based on the current output position and the scale
202 u32 input_x = x << horizontal_scale;
203 u32 input_y = y << vertical_scale;
204
205 u32 output_y;
206 if (config.flip_vertically) {
207 // Flip the y value of the output data,
208 // we do this after calculating the [x,y] position of the input image
209 // to account for the scaling options.
210 output_y = output_height - y - 1;
211 } else {
212 output_y = y;
213 }
214
215 u32 dst_bytes_per_pixel = GPU::Regs::BytesPerPixel(config.output_format);
216 u32 src_bytes_per_pixel = GPU::Regs::BytesPerPixel(config.input_format);
217 u32 src_offset;
218 u32 dst_offset;
219
220 if (config.input_linear) {
221 if (!config.dont_swizzle) {
222 // Interpret the input as linear and the output as tiled
223 u32 coarse_y = output_y & ~7;
224 u32 stride = output_width * dst_bytes_per_pixel;
225
226 src_offset = (input_x + input_y * config.input_width) * src_bytes_per_pixel;
227 dst_offset = VideoCore::GetMortonOffset(x, output_y, dst_bytes_per_pixel) +
228 coarse_y * stride;
229 } else {
230 // Both input and output are linear
231 src_offset = (input_x + input_y * config.input_width) * src_bytes_per_pixel;
232 dst_offset = (x + output_y * output_width) * dst_bytes_per_pixel;
233 }
234 } else {
235 if (!config.dont_swizzle) {
236 // Interpret the input as tiled and the output as linear
237 u32 coarse_y = input_y & ~7;
238 u32 stride = config.input_width * src_bytes_per_pixel;
239
240 src_offset = VideoCore::GetMortonOffset(input_x, input_y, src_bytes_per_pixel) +
241 coarse_y * stride;
242 dst_offset = (x + output_y * output_width) * dst_bytes_per_pixel;
243 } else {
244 // Both input and output are tiled
245 u32 out_coarse_y = output_y & ~7;
246 u32 out_stride = output_width * dst_bytes_per_pixel;
247
248 u32 in_coarse_y = input_y & ~7;
249 u32 in_stride = config.input_width * src_bytes_per_pixel;
250
251 src_offset = VideoCore::GetMortonOffset(input_x, input_y, src_bytes_per_pixel) +
252 in_coarse_y * in_stride;
253 dst_offset = VideoCore::GetMortonOffset(x, output_y, dst_bytes_per_pixel) +
254 out_coarse_y * out_stride;
255 }
256 }
257
258 const u8* src_pixel = src_pointer + src_offset;
259 src_color = DecodePixel(config.input_format, src_pixel);
260 if (config.scaling == config.ScaleX) {
261 Common::Vec4<u8> pixel =
262 DecodePixel(config.input_format, src_pixel + src_bytes_per_pixel);
263 src_color = ((src_color + pixel) / 2).Cast<u8>();
264 } else if (config.scaling == config.ScaleXY) {
265 Common::Vec4<u8> pixel1 =
266 DecodePixel(config.input_format, src_pixel + 1 * src_bytes_per_pixel);
267 Common::Vec4<u8> pixel2 =
268 DecodePixel(config.input_format, src_pixel + 2 * src_bytes_per_pixel);
269 Common::Vec4<u8> pixel3 =
270 DecodePixel(config.input_format, src_pixel + 3 * src_bytes_per_pixel);
271 src_color = (((src_color + pixel1) + (pixel2 + pixel3)) / 4).Cast<u8>();
272 }
273
274 u8* dst_pixel = dst_pointer + dst_offset;
275 switch (config.output_format) {
276 case Regs::PixelFormat::RGBA8:
277 Color::EncodeRGBA8(src_color, dst_pixel);
278 break;
279
280 case Regs::PixelFormat::RGB8:
281 Color::EncodeRGB8(src_color, dst_pixel);
282 break;
283
284 case Regs::PixelFormat::RGB565:
285 Color::EncodeRGB565(src_color, dst_pixel);
286 break;
287
288 case Regs::PixelFormat::RGB5A1:
289 Color::EncodeRGB5A1(src_color, dst_pixel);
290 break;
291
292 case Regs::PixelFormat::RGBA4:
293 Color::EncodeRGBA4(src_color, dst_pixel);
294 break;
295
296 default:
297 LOG_ERROR(HW_GPU, "Unknown destination framebuffer format {:x}",
298 static_cast<u32>(config.output_format.Value()));
299 break;
300 }
301 }
302 }
303 }
304
TextureCopy(const Regs::DisplayTransferConfig & config)305 static void TextureCopy(const Regs::DisplayTransferConfig& config) {
306 const PAddr src_addr = config.GetPhysicalInputAddress();
307 const PAddr dst_addr = config.GetPhysicalOutputAddress();
308
309 // TODO: do hwtest with invalid addresses
310 if (!g_memory->IsValidPhysicalAddress(src_addr)) {
311 LOG_CRITICAL(HW_GPU, "invalid input address {:#010X}", src_addr);
312 return;
313 }
314
315 if (!g_memory->IsValidPhysicalAddress(dst_addr)) {
316 LOG_CRITICAL(HW_GPU, "invalid output address {:#010X}", dst_addr);
317 return;
318 }
319
320 if (VideoCore::g_renderer->Rasterizer()->AccelerateTextureCopy(config))
321 return;
322
323 u8* src_pointer = g_memory->GetPhysicalPointer(src_addr);
324 u8* dst_pointer = g_memory->GetPhysicalPointer(dst_addr);
325
326 u32 remaining_size = Common::AlignDown(config.texture_copy.size, 16);
327
328 if (remaining_size == 0) {
329 LOG_CRITICAL(HW_GPU, "zero size. Real hardware freezes on this.");
330 return;
331 }
332
333 u32 input_gap = config.texture_copy.input_gap * 16;
334 u32 output_gap = config.texture_copy.output_gap * 16;
335
336 // Zero gap means contiguous input/output even if width = 0. To avoid infinite loop below, width
337 // is assigned with the total size if gap = 0.
338 u32 input_width = input_gap == 0 ? remaining_size : config.texture_copy.input_width * 16;
339 u32 output_width = output_gap == 0 ? remaining_size : config.texture_copy.output_width * 16;
340
341 if (input_width == 0) {
342 LOG_CRITICAL(HW_GPU, "zero input width. Real hardware freezes on this.");
343 return;
344 }
345
346 if (output_width == 0) {
347 LOG_CRITICAL(HW_GPU, "zero output width. Real hardware freezes on this.");
348 return;
349 }
350
351 std::size_t contiguous_input_size =
352 config.texture_copy.size / input_width * (input_width + input_gap);
353 Memory::RasterizerFlushRegion(config.GetPhysicalInputAddress(),
354 static_cast<u32>(contiguous_input_size));
355
356 std::size_t contiguous_output_size =
357 config.texture_copy.size / output_width * (output_width + output_gap);
358 // Only need to flush output if it has a gap
359 const auto FlushInvalidate_fn = (output_gap != 0) ? Memory::RasterizerFlushAndInvalidateRegion
360 : Memory::RasterizerInvalidateRegion;
361 FlushInvalidate_fn(config.GetPhysicalOutputAddress(), static_cast<u32>(contiguous_output_size));
362
363 u32 remaining_input = input_width;
364 u32 remaining_output = output_width;
365 while (remaining_size > 0) {
366 u32 copy_size = std::min({remaining_input, remaining_output, remaining_size});
367
368 std::memcpy(dst_pointer, src_pointer, copy_size);
369 src_pointer += copy_size;
370 dst_pointer += copy_size;
371
372 remaining_input -= copy_size;
373 remaining_output -= copy_size;
374 remaining_size -= copy_size;
375
376 if (remaining_input == 0) {
377 remaining_input = input_width;
378 src_pointer += input_gap;
379 }
380 if (remaining_output == 0) {
381 remaining_output = output_width;
382 dst_pointer += output_gap;
383 }
384 }
385 }
386
/// Writes a value to a GPU hardware register and executes the side effects of
/// registers that act as triggers (memory fills, display transfers / texture
/// copies, and command-list processing).
template <typename T>
inline void Write(u32 addr, const T data) {
    // Translate the virtual MMIO address into a register index.
    addr -= HW::VADDR_GPU;
    u32 index = addr / 4;

    // Writes other than u32 are untested, so I'd rather have them abort than silently fail
    if (index >= Regs::NumIds() || !std::is_same<T, u32>::value) {
        LOG_ERROR(HW_GPU, "unknown Write{} {:#010X} @ {:#010X}", sizeof(data) * 8, (u32)data, addr);
        return;
    }

    // Store the raw value first; trigger handling below reads it back via g_regs.
    g_regs[index] = static_cast<u32>(data);

    switch (index) {

    // Memory fills are triggered once the fill value is written.
    case GPU_REG_INDEX(memory_fill_config[0].trigger):
    case GPU_REG_INDEX(memory_fill_config[1].trigger): {
        // Doubles as the index into the two-element memory_fill_config array.
        const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger));
        auto& config = g_regs.memory_fill_config[is_second_filler];

        if (config.trigger) {
            MemoryFill(config);
            LOG_TRACE(HW_GPU, "MemoryFill from {:#010X} to {:#010X}", config.GetStartAddress(),
                      config.GetEndAddress());

            // It seems that it won't signal interrupt if "address_start" is zero.
            // TODO: hwtest this
            if (config.GetStartAddress() != 0) {
                if (!is_second_filler) {
                    Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PSC0);
                } else {
                    Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PSC1);
                }
            }

            // Reset "trigger" flag and set the "finish" flag
            // NOTE: This was confirmed to happen on hardware even if "address_start" is zero.
            config.trigger.Assign(0);
            config.finished.Assign(1);
        }
        break;
    }

    case GPU_REG_INDEX(display_transfer_config.trigger): {
        MICROPROFILE_SCOPE(GPU_DisplayTransfer);

        const auto& config = g_regs.display_transfer_config;
        if (config.trigger & 1) {

            // Let an attached debugger observe the transfer before it happens.
            if (Pica::g_debug_context)
                Pica::g_debug_context->OnEvent(Pica::DebugContext::Event::IncomingDisplayTransfer,
                                               nullptr);

            // The same register block describes both operations; a flag selects
            // the raw texture copy path vs. the format-converting transfer.
            if (config.is_texture_copy) {
                TextureCopy(config);
                LOG_TRACE(HW_GPU,
                          "TextureCopy: {:#X} bytes from {:#010X}({}+{})-> "
                          "{:#010X}({}+{}), flags {:#010X}",
                          config.texture_copy.size, config.GetPhysicalInputAddress(),
                          config.texture_copy.input_width * 16, config.texture_copy.input_gap * 16,
                          config.GetPhysicalOutputAddress(), config.texture_copy.output_width * 16,
                          config.texture_copy.output_gap * 16, config.flags);
            } else {
                DisplayTransfer(config);
                LOG_TRACE(HW_GPU,
                          "DisplayTransfer: {:#010X}({}x{})-> "
                          "{:#010X}({}x{}), dst format {:x}, flags {:#010X}",
                          config.GetPhysicalInputAddress(), config.input_width.Value(),
                          config.input_height.Value(), config.GetPhysicalOutputAddress(),
                          config.output_width.Value(), config.output_height.Value(),
                          static_cast<u32>(config.output_format.Value()), config.flags);
            }

            g_regs.display_transfer_config.trigger = 0;
            Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PPF);
        }
        break;
    }

    // Seems like writing to this register triggers processing
    case GPU_REG_INDEX(command_processor_config.trigger): {
        const auto& config = g_regs.command_processor_config;
        if (config.trigger & 1) {
            MICROPROFILE_SCOPE(GPU_CmdlistProcessing);

            Pica::CommandProcessor::ProcessCommandList(config.GetPhysicalAddress(), config.size);

            g_regs.command_processor_config.trigger = 0;
        }
        break;
    }

    default:
        break;
    }

    // Notify tracer about the register write
    // This is happening *after* handling the write to make sure we properly catch all memory reads.
    if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
        // addr + GPU VBase - IO VBase + IO PBase
        Pica::g_debug_context->recorder->RegisterWritten<T>(
            addr + 0x1EF00000 - 0x1EC00000 + 0x10100000, data);
    }
}
492
493 // Explicitly instantiate template functions because we aren't defining this in the header:
494
495 template void Read<u64>(u64& var, const u32 addr);
496 template void Read<u32>(u32& var, const u32 addr);
497 template void Read<u16>(u16& var, const u32 addr);
498 template void Read<u8>(u8& var, const u32 addr);
499
500 template void Write<u64>(u32 addr, const u64 data);
501 template void Write<u32>(u32 addr, const u32 data);
502 template void Write<u16>(u32 addr, const u16 data);
503 template void Write<u8>(u32 addr, const u8 data);
504
505 /// Update hardware
VBlankCallback(u64 userdata,s64 cycles_late)506 static void VBlankCallback(u64 userdata, s64 cycles_late) {
507 VideoCore::g_renderer->SwapBuffers();
508
509 // Signal to GSP that GPU interrupt has occurred
510 // TODO(yuriks): hwtest to determine if PDC0 is for the Top screen and PDC1 for the Sub
511 // screen, or if both use the same interrupts and these two instead determine the
512 // beginning and end of the VBlank period. If needed, split the interrupt firing into
513 // two different intervals.
514 Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC0);
515 Service::GSP::SignalInterrupt(Service::GSP::InterruptId::PDC1);
516
517 // Reschedule recurrent event
518 Core::System::GetInstance().CoreTiming().ScheduleEvent(frame_ticks - cycles_late, vblank_event);
519 }
520
521 /// Initialize hardware
Init(Memory::MemorySystem & memory)522 void Init(Memory::MemorySystem& memory) {
523 g_memory = &memory;
524 memset(&g_regs, 0, sizeof(g_regs));
525
526 auto& framebuffer_top = g_regs.framebuffer_config[0];
527 auto& framebuffer_sub = g_regs.framebuffer_config[1];
528
529 // Setup default framebuffer addresses (located in VRAM)
530 // .. or at least these are the ones used by system applets.
531 // There's probably a smarter way to come up with addresses
532 // like this which does not require hardcoding.
533 framebuffer_top.address_left1 = 0x181E6000;
534 framebuffer_top.address_left2 = 0x1822C800;
535 framebuffer_top.address_right1 = 0x18273000;
536 framebuffer_top.address_right2 = 0x182B9800;
537 framebuffer_sub.address_left1 = 0x1848F000;
538 framebuffer_sub.address_left2 = 0x184C7800;
539
540 framebuffer_top.width.Assign(240);
541 framebuffer_top.height.Assign(400);
542 framebuffer_top.stride = 3 * 240;
543 framebuffer_top.color_format.Assign(Regs::PixelFormat::RGB8);
544 framebuffer_top.active_fb = 0;
545
546 framebuffer_sub.width.Assign(240);
547 framebuffer_sub.height.Assign(320);
548 framebuffer_sub.stride = 3 * 240;
549 framebuffer_sub.color_format.Assign(Regs::PixelFormat::RGB8);
550 framebuffer_sub.active_fb = 0;
551
552 Core::Timing& timing = Core::System::GetInstance().CoreTiming();
553 vblank_event = timing.RegisterEvent("GPU::VBlankCallback", VBlankCallback);
554 timing.ScheduleEvent(frame_ticks, vblank_event);
555
556 LOG_DEBUG(HW_GPU, "initialized OK");
557 }
558
559 /// Shutdown hardware
Shutdown()560 void Shutdown() {
561 LOG_DEBUG(HW_GPU, "shutdown OK");
562 }
563
564 } // namespace GPU
565