1 // Copyright 2015 Citra Emulator Project
2 // Licensed under GPLv2 or any later version
3 // Refer to the license.txt file included.
4 
5 #include <algorithm>
6 #include <array>
7 #include <cstddef>
8 #include <memory>
9 #include "common/assert.h"
10 #include "common/color.h"
11 #include "common/common_types.h"
12 #include "common/vector_math.h"
13 #include "core/core.h"
14 #include "core/hle/service/y2r_u.h"
15 #include "core/hw/y2r.h"
16 #include "core/memory.h"
17 
18 namespace HW::Y2R {
19 
20 using namespace Service::Y2R;
21 
22 static const std::size_t MAX_TILES = 1024 / 8;
23 static const std::size_t TILE_SIZE = 8 * 8;
24 using ImageTile = std::array<u32, TILE_SIZE>;
25 
26 /// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.
ConvertYUVToRGB(InputFormat input_format,const u8 * input_Y,const u8 * input_U,const u8 * input_V,ImageTile output[],unsigned int width,unsigned int height,const CoefficientSet & coefficients)27 static void ConvertYUVToRGB(InputFormat input_format, const u8* input_Y, const u8* input_U,
28                             const u8* input_V, ImageTile output[], unsigned int width,
29                             unsigned int height, const CoefficientSet& coefficients) {
30 
31     for (unsigned int y = 0; y < height; ++y) {
32         for (unsigned int x = 0; x < width; ++x) {
33             s32 Y = 0;
34             s32 U = 0;
35             s32 V = 0;
36             switch (input_format) {
37             case InputFormat::YUV422_Indiv8:
38             case InputFormat::YUV422_Indiv16:
39                 Y = input_Y[y * width + x];
40                 U = input_U[(y * width + x) / 2];
41                 V = input_V[(y * width + x) / 2];
42                 break;
43             case InputFormat::YUV420_Indiv8:
44             case InputFormat::YUV420_Indiv16:
45                 Y = input_Y[y * width + x];
46                 U = input_U[((y / 2) * width + x) / 2];
47                 V = input_V[((y / 2) * width + x) / 2];
48                 break;
49             case InputFormat::YUYV422_Interleaved:
50                 Y = input_Y[(y * width + x) * 2];
51                 U = input_Y[(y * width + (x / 2) * 2) * 2 + 1];
52                 V = input_Y[(y * width + (x / 2) * 2) * 2 + 3];
53                 break;
54             }
55 
56             // This conversion process is bit-exact with hardware, as far as could be tested.
57             auto& c = coefficients;
58             s32 cY = c[0] * Y;
59 
60             s32 r = cY + c[1] * V;
61             s32 g = cY - c[2] * V - c[3] * U;
62             s32 b = cY + c[4] * U;
63 
64             const s32 rounding_offset = 0x18;
65             r = (r >> 3) + c[5] + rounding_offset;
66             g = (g >> 3) + c[6] + rounding_offset;
67             b = (b >> 3) + c[7] + rounding_offset;
68 
69             unsigned int tile = x / 8;
70             unsigned int tile_x = x % 8;
71             u32* out = &output[tile][y * 8 + tile_x];
72             *out = ((u32)std::clamp(r >> 5, 0, 0xFF) << 24) |
73                    ((u32)std::clamp(g >> 5, 0, 0xFF) << 16) |
74                    ((u32)std::clamp(b >> 5, 0, 0xFF) << 8);
75         }
76     }
77 }
78 
79 /// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit
80 /// formats to 8-bit.
81 template <std::size_t N>
ReceiveData(Memory::MemorySystem & memory,u8 * output,ConversionBuffer & buf,std::size_t amount_of_data)82 static void ReceiveData(Memory::MemorySystem& memory, u8* output, ConversionBuffer& buf,
83                         std::size_t amount_of_data) {
84     const u8* input = memory.GetPointer(buf.address);
85 
86     std::size_t output_unit = buf.transfer_unit / N;
87     ASSERT(amount_of_data % output_unit == 0);
88 
89     while (amount_of_data > 0) {
90         for (std::size_t i = 0; i < output_unit; ++i) {
91             output[i] = input[i * N];
92         }
93 
94         output += output_unit;
95         input += buf.transfer_unit + buf.gap;
96 
97         buf.address += buf.transfer_unit + buf.gap;
98         buf.image_size -= buf.transfer_unit;
99         amount_of_data -= output_unit;
100     }
101 }
102 
103 /// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA
104 /// transfer.
SendData(Memory::MemorySystem & memory,const u32 * input,ConversionBuffer & buf,int amount_of_data,OutputFormat output_format,u8 alpha)105 static void SendData(Memory::MemorySystem& memory, const u32* input, ConversionBuffer& buf,
106                      int amount_of_data, OutputFormat output_format, u8 alpha) {
107 
108     u8* output = memory.GetPointer(buf.address);
109 
110     while (amount_of_data > 0) {
111         u8* unit_end = output + buf.transfer_unit;
112         while (output < unit_end) {
113             u32 color = *input++;
114             Common::Vec4<u8> col_vec{(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha};
115 
116             switch (output_format) {
117             case OutputFormat::RGBA8:
118                 Color::EncodeRGBA8(col_vec, output);
119                 output += 4;
120                 break;
121             case OutputFormat::RGB8:
122                 Color::EncodeRGB8(col_vec, output);
123                 output += 3;
124                 break;
125             case OutputFormat::RGB5A1:
126                 Color::EncodeRGB5A1(col_vec, output);
127                 output += 2;
128                 break;
129             case OutputFormat::RGB565:
130                 Color::EncodeRGB565(col_vec, output);
131                 output += 2;
132                 break;
133             }
134 
135             amount_of_data -= 1;
136         }
137 
138         output += buf.gap;
139         buf.address += buf.transfer_unit + buf.gap;
140         buf.image_size -= buf.transfer_unit;
141     }
142 }
143 
144 static const u8 linear_lut[TILE_SIZE] = {
145     // clang-format off
146      0,  1,  2,  3,  4,  5,  6,  7,
147      8,  9, 10, 11, 12, 13, 14, 15,
148     16, 17, 18, 19, 20, 21, 22, 23,
149     24, 25, 26, 27, 28, 29, 30, 31,
150     32, 33, 34, 35, 36, 37, 38, 39,
151     40, 41, 42, 43, 44, 45, 46, 47,
152     48, 49, 50, 51, 52, 53, 54, 55,
153     56, 57, 58, 59, 60, 61, 62, 63,
154     // clang-format on
155 };
156 
157 static const u8 morton_lut[TILE_SIZE] = {
158     // clang-format off
159      0,  1,  4,  5, 16, 17, 20, 21,
160      2,  3,  6,  7, 18, 19, 22, 23,
161      8,  9, 12, 13, 24, 25, 28, 29,
162     10, 11, 14, 15, 26, 27, 30, 31,
163     32, 33, 36, 37, 48, 49, 52, 53,
164     34, 35, 38, 39, 50, 51, 54, 55,
165     40, 41, 44, 45, 56, 57, 60, 61,
166     42, 43, 46, 47, 58, 59, 62, 63,
167     // clang-format on
168 };
169 
RotateTile0(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])170 static void RotateTile0(const ImageTile& input, ImageTile& output, int height,
171                         const u8 out_map[64]) {
172     for (int i = 0; i < height * 8; ++i) {
173         output[out_map[i]] = input[i];
174     }
175 }
176 
RotateTile90(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])177 static void RotateTile90(const ImageTile& input, ImageTile& output, int height,
178                          const u8 out_map[64]) {
179     int out_i = 0;
180     for (int x = 0; x < 8; ++x) {
181         for (int y = height - 1; y >= 0; --y) {
182             output[out_map[out_i++]] = input[y * 8 + x];
183         }
184     }
185 }
186 
RotateTile180(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])187 static void RotateTile180(const ImageTile& input, ImageTile& output, int height,
188                           const u8 out_map[64]) {
189     int out_i = 0;
190     for (int i = height * 8 - 1; i >= 0; --i) {
191         output[out_map[out_i++]] = input[i];
192     }
193 }
194 
RotateTile270(const ImageTile & input,ImageTile & output,int height,const u8 out_map[64])195 static void RotateTile270(const ImageTile& input, ImageTile& output, int height,
196                           const u8 out_map[64]) {
197     int out_i = 0;
198     for (int x = 8 - 1; x >= 0; --x) {
199         for (int y = 0; y < height; ++y) {
200             output[out_map[out_i++]] = input[y * 8 + x];
201         }
202     }
203 }
204 
WriteTileToOutput(u32 * output,const ImageTile & tile,int height,int line_stride)205 static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) {
206     for (int y = 0; y < height; ++y) {
207         for (int x = 0; x < 8; ++x) {
208             output[y * line_stride + x] = tile[y * 8 + x];
209         }
210     }
211 }
212 
213 /**
214  * Performs a Y2R colorspace conversion.
215  *
216  * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most
217  * commonly used for video playback or to display camera input to the screen.
218  *
219  * The conversion process is quite configurable, and can be divided in distinct steps. From
220  * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data
221  * internally and converts it in one go before writing to the output and loading the next strip.
222  *
223  * The steps taken to convert one strip of image data are:
224  *
225  * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
226  *   presumably stored in one or more internal buffers. This process can be done in several separate
227  *   transfers, as long as they don't exceed the size of the internal image buffer. This allows
228  *   flexibility in input strides.
 * - The input data is decoded into a YUV tuple. Several formats are supported, see the
 *   `InputFormat` enum.
231  * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured
232  *   using a set of coefficients to support different colorspace standards. See `CoefficientSet`.
233  * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed
234  *   independently, this notably rotates each *strip*, not the entire image. This means that for 90
235  *   or 270 degree rotations, the output will be in terms of several 8 x height images, and for any
236  *   non-zero rotation the strips will have to be re-arranged so that the parts of the image will
237  *   not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90
238  *   or 270 degree rotations in images with non-even height don't seem to work properly.
239  * - The data is converted to the output RGB format. See the `OutputFormat` enum.
240  * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by
241  *   the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the
242  *   image must have a height divisible by 8. The image width must always be divisible by 8.
243  * - The final data is then CDMAed out to main memory and the next image strip is processed. This
244  *   offers the same flexibility as the input stage.
245  *
246  * In this implementation, to avoid the combinatorial explosion of parameter combinations, common
247  * intermediate formats are used and where possible tables or parameters are used instead of
248  * diverging code paths to keep the amount of branches in check. Some steps are also merged to
249  * increase efficiency.
250  *
251  * Output for all valid settings combinations matches hardware, however output in some edge-cases
252  * differs:
253  *
254  * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last
255  *   strip, especially when combined with rotation.
256  * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation
 *   produces misaligned output on the last strip. This implementation produces output with the
258  *   correct "expected" alignment.
259  *
260  * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases,
261  * so they are believed to be invalid configurations anyway.
262  */
PerformConversion(Memory::MemorySystem & memory,ConversionConfiguration & cvt)263 void PerformConversion(Memory::MemorySystem& memory, ConversionConfiguration& cvt) {
264     ASSERT(cvt.input_line_width % 8 == 0);
265     ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0);
266     // Tiles per row
267     std::size_t num_tiles = cvt.input_line_width / 8;
268     ASSERT(num_tiles <= MAX_TILES);
269 
270     // Buffer used as a CDMA source/target.
271     std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]);
272     // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
273     std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]);
274     ImageTile tmp_tile;
275 
276     // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
277     // requiring two different code paths.
278     const u8* tile_remap = nullptr;
279     switch (cvt.block_alignment) {
280     case BlockAlignment::Linear:
281         tile_remap = linear_lut;
282         break;
283     case BlockAlignment::Block8x8:
284         tile_remap = morton_lut;
285         break;
286     }
287 
288     for (unsigned int y = 0; y < cvt.input_lines; y += 8) {
289         unsigned int row_height = std::min(cvt.input_lines - y, 8u);
290 
291         // Total size in pixels of incoming data required for this strip.
292         const std::size_t row_data_size = row_height * cvt.input_line_width;
293 
294         u8* input_Y = data_buffer.get();
295         u8* input_U = input_Y + 8 * cvt.input_line_width;
296         u8* input_V = input_U + 8 * cvt.input_line_width / 2;
297 
298         switch (cvt.input_format) {
299         case InputFormat::YUV422_Indiv8:
300             ReceiveData<1>(memory, input_Y, cvt.src_Y, row_data_size);
301             ReceiveData<1>(memory, input_U, cvt.src_U, row_data_size / 2);
302             ReceiveData<1>(memory, input_V, cvt.src_V, row_data_size / 2);
303             break;
304         case InputFormat::YUV420_Indiv8:
305             ReceiveData<1>(memory, input_Y, cvt.src_Y, row_data_size);
306             ReceiveData<1>(memory, input_U, cvt.src_U, row_data_size / 4);
307             ReceiveData<1>(memory, input_V, cvt.src_V, row_data_size / 4);
308             break;
309         case InputFormat::YUV422_Indiv16:
310             ReceiveData<2>(memory, input_Y, cvt.src_Y, row_data_size);
311             ReceiveData<2>(memory, input_U, cvt.src_U, row_data_size / 2);
312             ReceiveData<2>(memory, input_V, cvt.src_V, row_data_size / 2);
313             break;
314         case InputFormat::YUV420_Indiv16:
315             ReceiveData<2>(memory, input_Y, cvt.src_Y, row_data_size);
316             ReceiveData<2>(memory, input_U, cvt.src_U, row_data_size / 4);
317             ReceiveData<2>(memory, input_V, cvt.src_V, row_data_size / 4);
318             break;
319         case InputFormat::YUYV422_Interleaved:
320             input_U = nullptr;
321             input_V = nullptr;
322             ReceiveData<1>(memory, input_Y, cvt.src_YUYV, row_data_size * 2);
323             break;
324         }
325 
326         // Note(yuriks): If additional optimization is required, input_format can be moved to a
327         // template parameter, so that its dispatch can be moved to outside the inner loop.
328         ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(),
329                         cvt.input_line_width, row_height, cvt.coefficients);
330 
331         u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get());
332 
333         for (std::size_t i = 0; i < num_tiles; ++i) {
334             int image_strip_width = 0;
335             int output_stride = 0;
336 
337             switch (cvt.rotation) {
338             case Rotation::None:
339                 RotateTile0(tiles[i], tmp_tile, row_height, tile_remap);
340                 image_strip_width = cvt.input_line_width;
341                 output_stride = 8;
342                 break;
343             case Rotation::Clockwise_90:
344                 RotateTile90(tiles[i], tmp_tile, row_height, tile_remap);
345                 image_strip_width = 8;
346                 output_stride = 8 * row_height;
347                 break;
348             case Rotation::Clockwise_180:
349                 // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
350                 // since the rotates are done individually on each tile.
351                 RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
352                 image_strip_width = cvt.input_line_width;
353                 output_stride = 8;
354                 break;
355             case Rotation::Clockwise_270:
356                 RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap);
357                 image_strip_width = 8;
358                 output_stride = 8 * row_height;
359                 break;
360             }
361 
362             switch (cvt.block_alignment) {
363             case BlockAlignment::Linear:
364                 WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width);
365                 output_buffer += output_stride;
366                 break;
367             case BlockAlignment::Block8x8:
368                 WriteTileToOutput(output_buffer, tmp_tile, 8, 8);
369                 output_buffer += TILE_SIZE;
370                 break;
371             }
372         }
373 
374         // Note(yuriks): If additional optimization is required, output_format can be moved to a
375         // template parameter, so that its dispatch can be moved to outside the inner loop.
376         SendData(memory, reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size,
377                  cvt.output_format, (u8)cvt.alpha);
378     }
379 }
380 } // namespace HW::Y2R
381