1 // Copyright 2015 Citra Emulator Project
2 // Licensed under GPLv2 or any later version
3 // Refer to the license.txt file included.
4
5 #include <algorithm>
6 #include <array>
7 #include <cstddef>
8 #include <memory>
9 #include "common/assert.h"
10 #include "common/color.h"
11 #include "common/common_types.h"
12 #include "common/vector_math.h"
13 #include "core/core.h"
14 #include "core/hle/service/y2r_u.h"
15 #include "core/hw/y2r.h"
16 #include "core/memory.h"
17
18 namespace HW::Y2R {
19
20 using namespace Service::Y2R;
21
22 static const std::size_t MAX_TILES = 1024 / 8;
23 static const std::size_t TILE_SIZE = 8 * 8;
24 using ImageTile = std::array<u32, TILE_SIZE>;
25
26 /// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.
ConvertYUVToRGB(InputFormat input_format,const u8 * input_Y,const u8 * input_U,const u8 * input_V,ImageTile output[],unsigned int width,unsigned int height,const CoefficientSet & coefficients)27 static void ConvertYUVToRGB(InputFormat input_format, const u8* input_Y, const u8* input_U,
28 const u8* input_V, ImageTile output[], unsigned int width,
29 unsigned int height, const CoefficientSet& coefficients) {
30
31 for (unsigned int y = 0; y < height; ++y) {
32 for (unsigned int x = 0; x < width; ++x) {
33 s32 Y = 0;
34 s32 U = 0;
35 s32 V = 0;
36 switch (input_format) {
37 case InputFormat::YUV422_Indiv8:
38 case InputFormat::YUV422_Indiv16:
39 Y = input_Y[y * width + x];
40 U = input_U[(y * width + x) / 2];
41 V = input_V[(y * width + x) / 2];
42 break;
43 case InputFormat::YUV420_Indiv8:
44 case InputFormat::YUV420_Indiv16:
45 Y = input_Y[y * width + x];
46 U = input_U[((y / 2) * width + x) / 2];
47 V = input_V[((y / 2) * width + x) / 2];
48 break;
49 case InputFormat::YUYV422_Interleaved:
50 Y = input_Y[(y * width + x) * 2];
51 U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
52 V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
53 break;
54 }
55
56 // This conversion process is bit-exact with hardware, as far as could be tested.
57 auto& c = coefficients;
58 s32 cY = c[0] * Y;
59
60 s32 r = cY + c[1] * V;
61 s32 g = cY - c[2] * V - c[3] * U;
62 s32 b = cY + c[4] * U;
63
64 const s32 rounding_offset = 0x18;
65 r = (r >> 3) + c[5] + rounding_offset;
66 g = (g >> 3) + c[6] + rounding_offset;
67 b = (b >> 3) + c[7] + rounding_offset;
68
69 unsigned int tile = x / 8;
70 unsigned int tile_x = x % 8;
71 u32* out = &output[tile][y * 8 + tile_x];
72 *out = ((u32)std::clamp(r >> 5, 0, 0xFF) << 24) |
73 ((u32)std::clamp(g >> 5, 0, 0xFF) << 16) |
74 ((u32)std::clamp(b >> 5, 0, 0xFF) << 8);
75 }
76 }
77 }
78
79 /// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit
80 /// formats to 8-bit.
81 template <std::size_t N>
ReceiveData(Memory::MemorySystem & memory,u8 * output,ConversionBuffer & buf,std::size_t amount_of_data)82 static void ReceiveData(Memory::MemorySystem& memory, u8* output, ConversionBuffer& buf,
83 std::size_t amount_of_data) {
84 const u8* input = memory.GetPointer(buf.address);
85
86 std::size_t output_unit = buf.transfer_unit / N;
87 ASSERT(amount_of_data % output_unit == 0);
88
89 while (amount_of_data > 0) {
90 for (std::size_t i = 0; i < output_unit; ++i) {
91 output[i] = input[i * N];
92 }
93
94 output += output_unit;
95 input += buf.transfer_unit + buf.gap;
96
97 buf.address += buf.transfer_unit + buf.gap;
98 buf.image_size -= buf.transfer_unit;
99 amount_of_data -= output_unit;
100 }
101 }
102
103 /// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA
104 /// transfer.
SendData(Memory::MemorySystem & memory,const u32 * input,ConversionBuffer & buf,int amount_of_data,OutputFormat output_format,u8 alpha)105 static void SendData(Memory::MemorySystem& memory, const u32* input, ConversionBuffer& buf,
106 int amount_of_data, OutputFormat output_format, u8 alpha) {
107
108 u8* output = memory.GetPointer(buf.address);
109
110 while (amount_of_data > 0) {
111 u8* unit_end = output + buf.transfer_unit;
112 while (output < unit_end) {
113 u32 color = *input++;
114 Common::Vec4<u8> col_vec{(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha};
115
116 switch (output_format) {
117 case OutputFormat::RGBA8:
118 Color::EncodeRGBA8(col_vec, output);
119 output += 4;
120 break;
121 case OutputFormat::RGB8:
122 Color::EncodeRGB8(col_vec, output);
123 output += 3;
124 break;
125 case OutputFormat::RGB5A1:
126 Color::EncodeRGB5A1(col_vec, output);
127 output += 2;
128 break;
129 case OutputFormat::RGB565:
130 Color::EncodeRGB565(col_vec, output);
131 output += 2;
132 break;
133 }
134
135 amount_of_data -= 1;
136 }
137
138 output += buf.gap;
139 buf.address += buf.transfer_unit + buf.gap;
140 buf.image_size -= buf.transfer_unit;
141 }
142 }
143
144 static const u8 linear_lut[TILE_SIZE] = {
145 // clang-format off
146 0, 1, 2, 3, 4, 5, 6, 7,
147 8, 9, 10, 11, 12, 13, 14, 15,
148 16, 17, 18, 19, 20, 21, 22, 23,
149 24, 25, 26, 27, 28, 29, 30, 31,
150 32, 33, 34, 35, 36, 37, 38, 39,
151 40, 41, 42, 43, 44, 45, 46, 47,
152 48, 49, 50, 51, 52, 53, 54, 55,
153 56, 57, 58, 59, 60, 61, 62, 63,
154 // clang-format on
155 };
156
157 static const u8 morton_lut[TILE_SIZE] = {
158 // clang-format off
159 0, 1, 4, 5, 16, 17, 20, 21,
160 2, 3, 6, 7, 18, 19, 22, 23,
161 8, 9, 12, 13, 24, 25, 28, 29,
162 10, 11, 14, 15, 26, 27, 30, 31,
163 32, 33, 36, 37, 48, 49, 52, 53,
164 34, 35, 38, 39, 50, 51, 54, 55,
165 40, 41, 44, 45, 56, 57, 60, 61,
166 42, 43, 46, 47, 58, 59, 62, 63,
167 // clang-format on
168 };
169
RotateTile0(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])170 static void RotateTile0(const ImageTile& input, ImageTile& output, int height,
171 const u8 out_map[64]) {
172 for (int i = 0; i < height * 8; ++i) {
173 output[out_map[i]] = input[i];
174 }
175 }
176
RotateTile90(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])177 static void RotateTile90(const ImageTile& input, ImageTile& output, int height,
178 const u8 out_map[64]) {
179 int out_i = 0;
180 for (int x = 0; x < 8; ++x) {
181 for (int y = height - 1; y >= 0; --y) {
182 output[out_map[out_i++]] = input[y * 8 + x];
183 }
184 }
185 }
186
RotateTile180(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])187 static void RotateTile180(const ImageTile& input, ImageTile& output, int height,
188 const u8 out_map[64]) {
189 int out_i = 0;
190 for (int i = height * 8 - 1; i >= 0; --i) {
191 output[out_map[out_i++]] = input[i];
192 }
193 }
194
RotateTile270(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])195 static void RotateTile270(const ImageTile& input, ImageTile& output, int height,
196 const u8 out_map[64]) {
197 int out_i = 0;
198 for (int x = 8 - 1; x >= 0; --x) {
199 for (int y = 0; y < height; ++y) {
200 output[out_map[out_i++]] = input[y * 8 + x];
201 }
202 }
203 }
204
WriteTileToOutput(u32 * output,const ImageTile & tile,int height,int line_stride)205 static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
206 for (int y = 0; y < height; ++y) {
207 for (int x = 0; x < 8; ++x) {
208 output[y * line_stride + x] = tile[y * 8 + x];
209 }
210 }
211 }
212
213 /**
214 * Performs a Y2R colorspace conversion.
215 *
216 * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
217 * commonly used for video playback or to display camera input to the screen.
218 *
219 * The conversion process is quite configurable, and can be divided in distinct steps. From
220 * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
221 * internally and converts it in one go before writing to the output and loading the next strip.
222 *
223 * The steps taken to convert one strip of image data are:
224 *
225 * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
226 * presumably stored in one or more internal buffers. This process can be done in several separate
227 * transfers, as long as they don't exceed the size of the internal image buffer. This allows
228 * flexibility in input strides.
229 * - The input data is decoded into a YUV tuple. Several formats are suported, see the `InputFormat`
230 * enum.
231 * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
232 * using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
233 * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
234 * independently, this notably rotates each *strip*, not the entire image. This means that for 90
235 * or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
236 * non-zero rotation the strips will have to be re-arranged so that the parts of the image will
237 * not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
238 * or 270 degree rotations in images with non-even height don't seem to work properly.
239 * - The data is converted to the output RGB format. See the `OutputFormat` enum.
240 * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
241 * the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
242 * image must have a height divisible by 8. The image width must always be divisible by 8.
243 * - The final data is then CDMAed out to main memory and the next image strip is processed. This
244 * offers the same flexibility as the input stage.
245 *
246 * In this implementation, to avoid the combinatorial explosion of parameter combinations, common
247 * intermediate formats are used and where possible tables or parameters are used instead of
248 * diverging code paths to keep the amount of branches in check. Some steps are also merged to
249 * increase efficiency.
250 *
251 * Output for all valid settings combinations matches hardware, however output in some edge-cases
252 * differs:
253 *
254 * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
255 * strip, especially when combined with rotation.
256 * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
257 * produces misaligned output on the last strip. This implmentation produces output with the
258 * correct "expected" alignment.
259 *
260 * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
261 * so they are believed to be invalid configurations anyway.
262 */
PerformConversion(Memory::MemorySystem & memory,ConversionConfiguration & cvt)263 void PerformConversion(Memory::MemorySystem& memory, ConversionConfiguration& cvt) {
264 ASSERT(cvt.input_line_width % 8 == 0);
265 ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
266 // Tiles per row
267 std::size_t num_tiles = cvt.input_line_width / 8;
268 ASSERT(num_tiles <= MAX_TILES);
269
270 // Buffer used as a CDMA source/target.
271 std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
272 // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
273 std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
274 ImageTile tmp_tile;
275
276 // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
277 // requiring two different code paths.
278 const u8* tile_remap = nullptr;
279 switch (cvt.block_alignment) {
280 case BlockAlignment::Linear:
281 tile_remap = linear_lut;
282 break;
283 case BlockAlignment::Block8x8:
284 tile_remap = morton_lut;
285 break;
286 }
287
288 for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
289 unsigned int row_height = std::min(cvt.input_lines - y, 8u);
290
291 // Total size in pixels of incoming data required for this strip.
292 const std::size_t row_data_size = row_height * cvt.input_line_width;
293
294 u8* input_Y = data_buffer.get();
295 u8* input_U = input_Y + 8 * cvt.input_line_width;
296 u8* input_V = input_U + 8 * cvt.input_line_width / 2;
297
298 switch (cvt.input_format) {
299 case InputFormat::YUV422_Indiv8:
300 ReceiveData<1>(memory, input_Y, cvt.src_Y, row_data_size);
301 ReceiveData<1>(memory, input_U, cvt.src_U, row_data_size / 2);
302 ReceiveData<1>(memory, input_V, cvt.src_V, row_data_size / 2);
303 break;
304 case InputFormat::YUV420_Indiv8:
305 ReceiveData<1>(memory, input_Y, cvt.src_Y, row_data_size);
306 ReceiveData<1>(memory, input_U, cvt.src_U, row_data_size / 4);
307 ReceiveData<1>(memory, input_V, cvt.src_V, row_data_size / 4);
308 break;
309 case InputFormat::YUV422_Indiv16:
310 ReceiveData<2>(memory, input_Y, cvt.src_Y, row_data_size);
311 ReceiveData<2>(memory, input_U, cvt.src_U, row_data_size / 2);
312 ReceiveData<2>(memory, input_V, cvt.src_V, row_data_size / 2);
313 break;
314 case InputFormat::YUV420_Indiv16:
315 ReceiveData<2>(memory, input_Y, cvt.src_Y, row_data_size);
316 ReceiveData<2>(memory, input_U, cvt.src_U, row_data_size / 4);
317 ReceiveData<2>(memory, input_V, cvt.src_V, row_data_size / 4);
318 break;
319 case InputFormat::YUYV422_Interleaved:
320 input_U = nullptr;
321 input_V = nullptr;
322 ReceiveData<1>(memory, input_Y, cvt.src_YUYV, row_data_size * 2);
323 break;
324 }
325
326 // Note(yuriks): If additional optimization is required, input_format can be moved to a
327 // template parameter, so that its dispatch can be moved to outside the inner loop.
328 ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
329 cvt.input_line_width, row_height, cvt.coefficients);
330
331 u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
332
333 for (std::size_t i = 0; i < num_tiles; ++i) {
334 int image_strip_width = 0;
335 int output_stride = 0;
336
337 switch (cvt.rotation) {
338 case Rotation::None:
339 RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
340 image_strip_width = cvt.input_line_width;
341 output_stride = 8;
342 break;
343 case Rotation::Clockwise_90:
344 RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
345 image_strip_width = 8;
346 output_stride = 8 * row_height;
347 break;
348 case Rotation::Clockwise_180:
349 // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
350 // since the rotates are done individually on each tile.
351 RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
352 image_strip_width = cvt.input_line_width;
353 output_stride = 8;
354 break;
355 case Rotation::Clockwise_270:
356 RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
357 image_strip_width = 8;
358 output_stride = 8 * row_height;
359 break;
360 }
361
362 switch (cvt.block_alignment) {
363 case BlockAlignment::Linear:
364 WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
365 output_buffer += output_stride;
366 break;
367 case BlockAlignment::Block8x8:
368 WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
369 output_buffer += TILE_SIZE;
370 break;
371 }
372 }
373
374 // Note(yuriks): If additional optimization is required, output_format can be moved to a
375 // template parameter, so that its dispatch can be moved to outside the inner loop.
376 SendData(memory, reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size,
377 cvt.output_format, (u8)cvt.alpha);
378 }
379 }
380 } // namespace HW::Y2R
381