1 // Copyright 2009 Dolphin Emulator Project
2 // Licensed under GPLv2+
3 // Refer to the license.txt file included.
4 
5 #include "VideoCommon/TextureConversionShader.h"
6 
7 #include <map>
8 #include <sstream>
9 #include <string_view>
10 
11 #include "Common/CommonTypes.h"
12 #include "Common/MathUtil.h"
13 #include "Common/MsgHandler.h"
14 #include "VideoCommon/ShaderGenCommon.h"
15 #include "VideoCommon/TextureCacheBase.h"
16 #include "VideoCommon/VertexManagerBase.h"
17 #include "VideoCommon/VideoCommon.h"
18 #include "VideoCommon/VideoConfig.h"
19 
20 namespace TextureConversionShaderTiled
21 {
// Tracks whether the "IntensityConst" declaration has already been written into the shader that
// is currently being generated. WriteColorToIntensity() sets it when it emits the declaration and
// WriteEncoderEnd() clears it again.
static bool IntensityConstantAdded = false;
23 
GetEncodedSampleCount(EFBCopyFormat format)24 u16 GetEncodedSampleCount(EFBCopyFormat format)
25 {
26   switch (format)
27   {
28   case EFBCopyFormat::R4:
29     return 8;
30   case EFBCopyFormat::RA4:
31     return 4;
32   case EFBCopyFormat::RA8:
33     return 2;
34   case EFBCopyFormat::RGB565:
35     return 2;
36   case EFBCopyFormat::RGB5A3:
37     return 2;
38   case EFBCopyFormat::RGBA8:
39     return 1;
40   case EFBCopyFormat::A8:
41   case EFBCopyFormat::R8_0x1:
42   case EFBCopyFormat::R8:
43   case EFBCopyFormat::G8:
44   case EFBCopyFormat::B8:
45     return 4;
46   case EFBCopyFormat::RG8:
47   case EFBCopyFormat::GB8:
48     return 2;
49   case EFBCopyFormat::XFB:
50     return 2;
51   default:
52     PanicAlert("Invalid EFB Copy Format (0x%X)! (GetEncodedSampleCount)", static_cast<int>(format));
53     return 1;
54   }
55 }
56 
WriteHeader(ShaderCode & code,APIType api_type)57 static void WriteHeader(ShaderCode& code, APIType api_type)
58 {
59   if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
60   {
61     // left, top, of source rectangle within source texture
62     // width of the destination rectangle, scale_factor (1 or 2)
63     code.WriteFmt("UBO_BINDING(std140, 1) uniform PSBlock {{\n"
64                   "  int4 position;\n"
65                   "  float y_scale;\n"
66                   "  float gamma_rcp;\n"
67                   "  float2 clamp_tb;\n"
68                   "  float3 filter_coefficients;\n"
69                   "}};\n");
70     if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
71     {
72       code.WriteFmt("VARYING_LOCATION(0) in VertexData {{\n"
73                     "  float3 v_tex0;\n"
74                     "}};\n");
75     }
76     else
77     {
78       code.WriteFmt("VARYING_LOCATION(0) in float3 v_tex0;\n");
79     }
80     code.WriteFmt("SAMPLER_BINDING(0) uniform sampler2DArray samp0;\n"
81                   "FRAGMENT_OUTPUT_LOCATION(0) out float4 ocol0;\n");
82   }
83   else  // D3D
84   {
85     code.WriteFmt("cbuffer PSBlock : register(b0) {{\n"
86                   "  int4 position;\n"
87                   "  float y_scale;\n"
88                   "  float gamma_rcp;\n"
89                   "  float2 clamp_tb;\n"
90                   "  float3 filter_coefficients;\n"
91                   "}};\n"
92                   "sampler samp0 : register(s0);\n"
93                   "Texture2DArray Tex0 : register(t0);\n");
94   }
95 
96   // D3D does not have roundEven(), only round(), which is specified "to the nearest integer".
97   // This differs from the roundEven() behavior, but to get consistency across drivers in OpenGL
98   // we need to use roundEven().
99   if (api_type == APIType::D3D)
100     code.WriteFmt("#define roundEven(x) round(x)\n");
101 
102   // Alpha channel in the copy is set to 1 the EFB format does not have an alpha channel.
103   code.WriteFmt("float4 RGBA8ToRGB8(float4 src)\n"
104                 "{{\n"
105                 "  return float4(src.xyz, 1.0);\n"
106                 "}}\n"
107 
108                 "float4 RGBA8ToRGBA6(float4 src)\n"
109                 "{{\n"
110                 "  int4 val = int4(roundEven(src * 255.0)) >> 2;\n"
111                 "  return float4(val) / 63.0;\n"
112                 "}}\n"
113 
114                 "float4 RGBA8ToRGB565(float4 src)\n"
115                 "{{\n"
116                 "  int4 val = int4(roundEven(src * 255.0));\n"
117                 "  val = int4(val.r >> 3, val.g >> 2, val.b >> 3, 1);\n"
118                 "  return float4(val) / float4(31.0, 63.0, 31.0, 1.0);\n"
119                 "}}\n");
120 }
121 
WriteSampleFunction(ShaderCode & code,const EFBCopyParams & params,APIType api_type)122 static void WriteSampleFunction(ShaderCode& code, const EFBCopyParams& params, APIType api_type)
123 {
124   const auto WriteSampleOp = [api_type, &code, &params](int yoffset) {
125     if (!params.depth)
126     {
127       switch (params.efb_format)
128       {
129       case PEControl::RGB8_Z24:
130         code.WriteFmt("RGBA8ToRGB8(");
131         break;
132       case PEControl::RGBA6_Z24:
133         code.WriteFmt("RGBA8ToRGBA6(");
134         break;
135       case PEControl::RGB565_Z16:
136         code.WriteFmt("RGBA8ToRGB565(");
137         break;
138       default:
139         code.WriteFmt("(");
140         break;
141       }
142     }
143     else
144     {
145       // Handle D3D depth inversion.
146       if (!g_ActiveConfig.backend_info.bSupportsReversedDepthRange)
147         code.WriteFmt("1.0 - (");
148       else
149         code.WriteFmt("(");
150     }
151 
152     if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
153       code.WriteFmt("texture(samp0, float3(");
154     else
155       code.WriteFmt("Tex0.Sample(samp0, float3(");
156 
157     code.WriteFmt("uv.x + float(xoffset) * pixel_size.x, ");
158 
159     // Reverse the direction for OpenGL, since positive numbers are distance from the bottom row.
160     if (yoffset != 0)
161     {
162       if (api_type == APIType::OpenGL)
163         code.WriteFmt("clamp(uv.y - float({}) * pixel_size.y, clamp_tb.x, clamp_tb.y)", yoffset);
164       else
165         code.WriteFmt("clamp(uv.y + float({}) * pixel_size.y, clamp_tb.x, clamp_tb.y)", yoffset);
166     }
167     else
168     {
169       code.WriteFmt("uv.y");
170     }
171 
172     code.WriteFmt(", 0.0)))");
173   };
174 
175   // The copy filter applies to both color and depth copies. This has been verified on hardware.
176   // The filter is only applied to the RGB channels, the alpha channel is left intact.
177   code.WriteFmt("float4 SampleEFB(float2 uv, float2 pixel_size, int xoffset)\n"
178                 "{{\n");
179   if (params.copy_filter)
180   {
181     code.WriteFmt("  float4 prev_row = ");
182     WriteSampleOp(-1);
183     code.WriteFmt(";\n"
184                   "  float4 current_row = ");
185     WriteSampleOp(0);
186     code.WriteFmt(";\n"
187                   "  float4 next_row = ");
188     WriteSampleOp(1);
189     code.WriteFmt(";\n"
190                   "  return float4(min(prev_row.rgb * filter_coefficients[0] +\n"
191                   "                      current_row.rgb * filter_coefficients[1] +\n"
192                   "                      next_row.rgb * filter_coefficients[2], \n"
193                   "                    float3(1, 1, 1)), current_row.a);\n");
194   }
195   else
196   {
197     code.WriteFmt("  float4 current_row = ");
198     WriteSampleOp(0);
199     code.WriteFmt(";\n"
200                   "return float4(min(current_row.rgb * filter_coefficients[1], float3(1, 1, 1)),\n"
201                   "              current_row.a);\n");
202   }
203   code.WriteFmt("}}\n");
204 }
205 
206 // Block dimensions   : widthStride, heightStride
207 // Texture dimensions : width, height, x offset, y offset
WriteSwizzler(ShaderCode & code,const EFBCopyParams & params,EFBCopyFormat format,APIType api_type)208 static void WriteSwizzler(ShaderCode& code, const EFBCopyParams& params, EFBCopyFormat format,
209                           APIType api_type)
210 {
211   WriteHeader(code, api_type);
212   WriteSampleFunction(code, params, api_type);
213 
214   if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
215   {
216     code.WriteFmt("void main()\n"
217                   "{{\n"
218                   "  int2 sampleUv;\n"
219                   "  int2 uv1 = int2(gl_FragCoord.xy);\n");
220   }
221   else  // D3D
222   {
223     code.WriteFmt("void main(\n"
224                   "  in float3 v_tex0 : TEXCOORD0,\n"
225                   "  in float4 rawpos : SV_Position,\n"
226                   "  out float4 ocol0 : SV_Target)\n"
227                   "{{\n"
228                   "  int2 sampleUv;\n"
229                   "  int2 uv1 = int2(rawpos.xy);\n");
230   }
231 
232   const int blkW = TexDecoder_GetEFBCopyBlockWidthInTexels(format);
233   const int blkH = TexDecoder_GetEFBCopyBlockHeightInTexels(format);
234   int samples = GetEncodedSampleCount(format);
235 
236   code.WriteFmt("  int x_block_position = (uv1.x >> {}) << {};\n", IntLog2(blkH * blkW / samples),
237                 IntLog2(blkW));
238   code.WriteFmt("  int y_block_position = uv1.y << {};\n", IntLog2(blkH));
239   if (samples == 1)
240   {
241     // With samples == 1, we write out pairs of blocks; one A8R8, one G8B8.
242     code.WriteFmt("  bool first = (uv1.x & {}) == 0;\n", blkH * blkW / 2);
243     samples = 2;
244   }
245   code.WriteFmt("  int offset_in_block = uv1.x & {};\n", (blkH * blkW / samples) - 1);
246   code.WriteFmt("  int y_offset_in_block = offset_in_block >> {};\n", IntLog2(blkW / samples));
247   code.WriteFmt("  int x_offset_in_block = (offset_in_block & {}) << {};\n", (blkW / samples) - 1,
248                 IntLog2(samples));
249 
250   code.WriteFmt("  sampleUv.x = x_block_position + x_offset_in_block;\n"
251                 "  sampleUv.y = y_block_position + y_offset_in_block;\n");
252 
253   // sampleUv is the sample position in (int)gx_coords
254   code.WriteFmt("  float2 uv0 = float2(sampleUv);\n");
255   // Move to center of pixel
256   code.WriteFmt("  uv0 += float2(0.5, 0.5);\n");
257   // Scale by two if needed (also move to pixel borders
258   // so that linear filtering will average adjacent
259   // pixel)
260   code.WriteFmt("  uv0 *= float(position.w);\n");
261 
262   // Move to copied rect
263   code.WriteFmt("  uv0 += float2(position.xy);\n");
264   // Normalize to [0:1]
265   code.WriteFmt("  uv0 /= float2({}, {});\n", EFB_WIDTH, EFB_HEIGHT);
266   // Apply the y scaling
267   code.WriteFmt("  uv0 /= float2(1, y_scale);\n");
268   // OGL has to flip up and down
269   if (api_type == APIType::OpenGL)
270   {
271     code.WriteFmt("  uv0.y = 1.0-uv0.y;\n");
272   }
273 
274   code.WriteFmt("  float2 pixel_size = float2(position.w, position.w) / float2({}, {});\n",
275                 EFB_WIDTH, EFB_HEIGHT);
276 }
277 
// Emits a statement that assigns the `color_comp` components of
// SampleEFB(uv0, pixel_size, x_offset) to `dest`.
// `api_type` and `params` are accepted but not referenced by the body.
static void WriteSampleColor(ShaderCode& code, std::string_view color_comp, std::string_view dest,
                             int x_offset, APIType api_type, const EFBCopyParams& params)
{
  code.WriteFmt("  {} = SampleEFB(uv0, pixel_size, {}).{};\n", dest, x_offset, color_comp);
}
283 
WriteColorToIntensity(ShaderCode & code,std::string_view src,std::string_view dest)284 static void WriteColorToIntensity(ShaderCode& code, std::string_view src, std::string_view dest)
285 {
286   if (!IntensityConstantAdded)
287   {
288     code.WriteFmt("  float4 IntensityConst = float4(0.257f,0.504f,0.098f,0.0625f);\n");
289     IntensityConstantAdded = true;
290   }
291   code.WriteFmt("  {} = dot(IntensityConst.rgb, {}.rgb);\n", dest, src);
292   // don't add IntensityConst.a yet, because doing it later is faster and uses less instructions,
293   // due to vectorization
294 }
295 
// Emits `dest = floor(src * 255.0 / exp2(8.0 - depth.0));`, i.e. the shader scales `src` by 255,
// divides by 2^(8 - depth) and floors the result.
// `dest` is inserted verbatim, so it may carry a type to declare the variable in the same
// statement (the RGB565 encoder passes "float2 gInt").
static void WriteToBitDepth(ShaderCode& code, u8 depth, std::string_view src, std::string_view dest)
{
  code.WriteFmt("  {} = floor({} * 255.0 / exp2(8.0 - {}.0));\n", dest, src, depth);
}
300 
// Emits the closing brace of the main() body opened by WriteSwizzler() and clears
// IntensityConstantAdded so the next generated shader declares IntensityConst again when needed.
static void WriteEncoderEnd(ShaderCode& code)
{
  code.WriteFmt("}}\n");
  IntensityConstantAdded = false;
}
306 
WriteI8Encoder(ShaderCode & code,APIType api_type,const EFBCopyParams & params)307 static void WriteI8Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
308 {
309   WriteSwizzler(code, params, EFBCopyFormat::R8, api_type);
310   code.WriteFmt("  float3 texSample;\n");
311 
312   WriteSampleColor(code, "rgb", "texSample", 0, api_type, params);
313   WriteColorToIntensity(code, "texSample", "ocol0.b");
314 
315   WriteSampleColor(code, "rgb", "texSample", 1, api_type, params);
316   WriteColorToIntensity(code, "texSample", "ocol0.g");
317 
318   WriteSampleColor(code, "rgb", "texSample", 2, api_type, params);
319   WriteColorToIntensity(code, "texSample", "ocol0.r");
320 
321   WriteSampleColor(code, "rgb", "texSample", 3, api_type, params);
322   WriteColorToIntensity(code, "texSample", "ocol0.a");
323 
324   // See WriteColorToIntensity
325   code.WriteFmt("  ocol0.rgba += IntensityConst.aaaa;\n");
326 
327   WriteEncoderEnd(code);
328 }
329 
WriteI4Encoder(ShaderCode & code,APIType api_type,const EFBCopyParams & params)330 static void WriteI4Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
331 {
332   WriteSwizzler(code, params, EFBCopyFormat::R4, api_type);
333   code.WriteFmt("  float3 texSample;\n"
334                 "  float4 color0;\n"
335                 "  float4 color1;\n");
336 
337   WriteSampleColor(code, "rgb", "texSample", 0, api_type, params);
338   WriteColorToIntensity(code, "texSample", "color0.b");
339 
340   WriteSampleColor(code, "rgb", "texSample", 1, api_type, params);
341   WriteColorToIntensity(code, "texSample", "color1.b");
342 
343   WriteSampleColor(code, "rgb", "texSample", 2, api_type, params);
344   WriteColorToIntensity(code, "texSample", "color0.g");
345 
346   WriteSampleColor(code, "rgb", "texSample", 3, api_type, params);
347   WriteColorToIntensity(code, "texSample", "color1.g");
348 
349   WriteSampleColor(code, "rgb", "texSample", 4, api_type, params);
350   WriteColorToIntensity(code, "texSample", "color0.r");
351 
352   WriteSampleColor(code, "rgb", "texSample", 5, api_type, params);
353   WriteColorToIntensity(code, "texSample", "color1.r");
354 
355   WriteSampleColor(code, "rgb", "texSample", 6, api_type, params);
356   WriteColorToIntensity(code, "texSample", "color0.a");
357 
358   WriteSampleColor(code, "rgb", "texSample", 7, api_type, params);
359   WriteColorToIntensity(code, "texSample", "color1.a");
360 
361   code.WriteFmt("  color0.rgba += IntensityConst.aaaa;\n"
362                 "  color1.rgba += IntensityConst.aaaa;\n");
363 
364   WriteToBitDepth(code, 4, "color0", "color0");
365   WriteToBitDepth(code, 4, "color1", "color1");
366 
367   code.WriteFmt("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
368   WriteEncoderEnd(code);
369 }
370 
WriteIA8Encoder(ShaderCode & code,APIType api_type,const EFBCopyParams & params)371 static void WriteIA8Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
372 {
373   WriteSwizzler(code, params, EFBCopyFormat::RA8, api_type);
374   code.WriteFmt("  float4 texSample;\n");
375 
376   WriteSampleColor(code, "rgba", "texSample", 0, api_type, params);
377   code.WriteFmt("  ocol0.b = texSample.a;\n");
378   WriteColorToIntensity(code, "texSample", "ocol0.g");
379 
380   WriteSampleColor(code, "rgba", "texSample", 1, api_type, params);
381   code.WriteFmt("  ocol0.r = texSample.a;\n");
382   WriteColorToIntensity(code, "texSample", "ocol0.a");
383 
384   code.WriteFmt("  ocol0.ga += IntensityConst.aa;\n");
385 
386   WriteEncoderEnd(code);
387 }
388 
WriteIA4Encoder(ShaderCode & code,APIType api_type,const EFBCopyParams & params)389 static void WriteIA4Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
390 {
391   WriteSwizzler(code, params, EFBCopyFormat::RA4, api_type);
392   code.WriteFmt("  float4 texSample;\n"
393                 "  float4 color0;\n"
394                 "  float4 color1;\n");
395 
396   WriteSampleColor(code, "rgba", "texSample", 0, api_type, params);
397   code.WriteFmt("  color0.b = texSample.a;\n");
398   WriteColorToIntensity(code, "texSample", "color1.b");
399 
400   WriteSampleColor(code, "rgba", "texSample", 1, api_type, params);
401   code.WriteFmt("  color0.g = texSample.a;\n");
402   WriteColorToIntensity(code, "texSample", "color1.g");
403 
404   WriteSampleColor(code, "rgba", "texSample", 2, api_type, params);
405   code.WriteFmt("  color0.r = texSample.a;\n");
406   WriteColorToIntensity(code, "texSample", "color1.r");
407 
408   WriteSampleColor(code, "rgba", "texSample", 3, api_type, params);
409   code.WriteFmt("  color0.a = texSample.a;\n");
410   WriteColorToIntensity(code, "texSample", "color1.a");
411 
412   code.WriteFmt("  color1.rgba += IntensityConst.aaaa;\n");
413 
414   WriteToBitDepth(code, 4, "color0", "color0");
415   WriteToBitDepth(code, 4, "color1", "color1");
416 
417   code.WriteFmt("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
418   WriteEncoderEnd(code);
419 }
420 
// Emits the encoder that packs two consecutive samples as RGB565. Each sample produces two
// output bytes: red * 8 + upper three green bits, and blue + lower three green bits * 32.
// Sample 0 is written to ocol0.b/ocol0.g and sample 1 to ocol0.r/ocol0.a.
static void WriteRGB565Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
{
  WriteSwizzler(code, params, EFBCopyFormat::RGB565, api_type);
  code.WriteFmt("  float3 texSample0;\n"
                "  float3 texSample1;\n");

  WriteSampleColor(code, "rgb", "texSample0", 0, api_type, params);
  WriteSampleColor(code, "rgb", "texSample1", 1, api_type, params);
  // Regroup per channel so both samples are processed together as float2 values.
  code.WriteFmt("  float2 texRs = float2(texSample0.r, texSample1.r);\n"
                "  float2 texGs = float2(texSample0.g, texSample1.g);\n"
                "  float2 texBs = float2(texSample0.b, texSample1.b);\n");

  // Green is reduced to 6 bits and split into its upper 3 and lower 3 bits. The destination
  // string includes the type, so this statement also declares gInt.
  WriteToBitDepth(code, 6, "texGs", "float2 gInt");
  code.WriteFmt("  float2 gUpper = floor(gInt / 8.0);\n"
                "  float2 gLower = gInt - gUpper * 8.0;\n");

  // First byte of each sample: 5-bit red in the high bits, upper green bits below.
  WriteToBitDepth(code, 5, "texRs", "ocol0.br");
  code.WriteFmt("  ocol0.br = ocol0.br * 8.0 + gUpper;\n");
  // Second byte of each sample: lower green bits in the high bits, 5-bit blue below.
  WriteToBitDepth(code, 5, "texBs", "ocol0.ga");
  code.WriteFmt("  ocol0.ga = ocol0.ga + gLower * 32.0;\n");

  // Bring the byte values back into the [0, 1] range.
  code.WriteFmt("  ocol0 = ocol0 / 255.0;\n");
  WriteEncoderEnd(code);
}
445 
// Emits the encoder that packs two consecutive samples as RGB5A3. For each sample the shader
// picks one of two layouts based on alpha: above the threshold it writes 5 bits per color
// channel with 128 added to the first byte, otherwise 3 bits of alpha and 4 bits per color
// channel. Sample 0 is written to ocol0.b/ocol0.g and sample 1 to ocol0.r/ocol0.a.
static void WriteRGB5A3Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
{
  WriteSwizzler(code, params, EFBCopyFormat::RGB5A3, api_type);

  code.WriteFmt("  float4 texSample;\n"
                "  float color0;\n"
                "  float gUpper;\n"
                "  float gLower;\n");

  // ---- Sample 0 -> ocol0.b (first byte), ocol0.g (second byte) ----
  WriteSampleColor(code, "rgba", "texSample", 0, api_type, params);

  // 0.8784 = 224 / 255 which is the maximum alpha value that can be represented in 3 bits
  code.WriteFmt("if(texSample.a > 0.878f) {{\n");

  // 5 bits per channel: green is split into its upper 2 and lower 3 bits.
  WriteToBitDepth(code, 5, "texSample.g", "color0");
  code.WriteFmt("  gUpper = floor(color0 / 8.0);\n"
                "  gLower = color0 - gUpper * 8.0;\n");

  WriteToBitDepth(code, 5, "texSample.r", "ocol0.b");
  code.WriteFmt("  ocol0.b = ocol0.b * 4.0 + gUpper + 128.0;\n");
  WriteToBitDepth(code, 5, "texSample.b", "ocol0.g");
  code.WriteFmt("  ocol0.g = ocol0.g + gLower * 32.0;\n");

  code.WriteFmt("}} else {{\n");

  // 4 bits per color channel plus 3 bits of alpha.
  WriteToBitDepth(code, 4, "texSample.r", "ocol0.b");
  WriteToBitDepth(code, 4, "texSample.b", "ocol0.g");

  WriteToBitDepth(code, 3, "texSample.a", "color0");
  code.WriteFmt("ocol0.b = ocol0.b + color0 * 16.0;\n");
  WriteToBitDepth(code, 4, "texSample.g", "color0");
  code.WriteFmt("ocol0.g = ocol0.g + color0 * 16.0;\n");

  code.WriteFmt("}}\n");

  // ---- Sample 1 -> ocol0.r (first byte), ocol0.a (second byte) ----
  WriteSampleColor(code, "rgba", "texSample", 1, api_type, params);

  code.WriteFmt("if(texSample.a > 0.878f) {{\n");

  WriteToBitDepth(code, 5, "texSample.g", "color0");
  code.WriteFmt("  gUpper = floor(color0 / 8.0);\n"
                "  gLower = color0 - gUpper * 8.0;\n");

  WriteToBitDepth(code, 5, "texSample.r", "ocol0.r");
  code.WriteFmt("  ocol0.r = ocol0.r * 4.0 + gUpper + 128.0;\n");
  WriteToBitDepth(code, 5, "texSample.b", "ocol0.a");
  code.WriteFmt("  ocol0.a = ocol0.a + gLower * 32.0;\n");

  code.WriteFmt("}} else {{\n");

  WriteToBitDepth(code, 4, "texSample.r", "ocol0.r");
  WriteToBitDepth(code, 4, "texSample.b", "ocol0.a");

  WriteToBitDepth(code, 3, "texSample.a", "color0");
  code.WriteFmt("ocol0.r = ocol0.r + color0 * 16.0;\n");
  WriteToBitDepth(code, 4, "texSample.g", "color0");
  code.WriteFmt("ocol0.a = ocol0.a + color0 * 16.0;\n");

  code.WriteFmt("}}\n");

  // Bring the byte values back into the [0, 1] range.
  code.WriteFmt("  ocol0 = ocol0 / 255.0;\n");
  WriteEncoderEnd(code);
}
509 
// Emits the encoder for RGBA8 copies. Two consecutive samples are split into an alpha/red set
// (color0) and a green/blue set (color1); the shader outputs color0 when `first` is true and
// color1 otherwise. `first` is declared by WriteSwizzler() for formats whose sample count is 1.
static void WriteRGBA8Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
{
  WriteSwizzler(code, params, EFBCopyFormat::RGBA8, api_type);

  code.WriteFmt("  float4 texSample;\n"
                "  float4 color0;\n"
                "  float4 color1;\n");

  // Sample 0 fills the b/g channels of both candidate outputs.
  WriteSampleColor(code, "rgba", "texSample", 0, api_type, params);
  code.WriteFmt("  color0.b = texSample.a;\n"
                "  color0.g = texSample.r;\n"
                "  color1.b = texSample.g;\n"
                "  color1.g = texSample.b;\n");

  // Sample 1 fills the r/a channels of both candidate outputs.
  WriteSampleColor(code, "rgba", "texSample", 1, api_type, params);
  code.WriteFmt("  color0.r = texSample.a;\n"
                "  color0.a = texSample.r;\n"
                "  color1.r = texSample.g;\n"
                "  color1.a = texSample.b;\n");

  code.WriteFmt("  ocol0 = first ? color0 : color1;\n");

  WriteEncoderEnd(code);
}
534 
WriteC4Encoder(ShaderCode & code,std::string_view comp,APIType api_type,const EFBCopyParams & params)535 static void WriteC4Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
536                            const EFBCopyParams& params)
537 {
538   WriteSwizzler(code, params, EFBCopyFormat::R4, api_type);
539   code.WriteFmt("  float4 color0;\n"
540                 "  float4 color1;\n");
541 
542   WriteSampleColor(code, comp, "color0.b", 0, api_type, params);
543   WriteSampleColor(code, comp, "color1.b", 1, api_type, params);
544   WriteSampleColor(code, comp, "color0.g", 2, api_type, params);
545   WriteSampleColor(code, comp, "color1.g", 3, api_type, params);
546   WriteSampleColor(code, comp, "color0.r", 4, api_type, params);
547   WriteSampleColor(code, comp, "color1.r", 5, api_type, params);
548   WriteSampleColor(code, comp, "color0.a", 6, api_type, params);
549   WriteSampleColor(code, comp, "color1.a", 7, api_type, params);
550 
551   WriteToBitDepth(code, 4, "color0", "color0");
552   WriteToBitDepth(code, 4, "color1", "color1");
553 
554   code.WriteFmt("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
555   WriteEncoderEnd(code);
556 }
557 
WriteC8Encoder(ShaderCode & code,std::string_view comp,APIType api_type,const EFBCopyParams & params)558 static void WriteC8Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
559                            const EFBCopyParams& params)
560 {
561   WriteSwizzler(code, params, EFBCopyFormat::R8, api_type);
562 
563   WriteSampleColor(code, comp, "ocol0.b", 0, api_type, params);
564   WriteSampleColor(code, comp, "ocol0.g", 1, api_type, params);
565   WriteSampleColor(code, comp, "ocol0.r", 2, api_type, params);
566   WriteSampleColor(code, comp, "ocol0.a", 3, api_type, params);
567 
568   WriteEncoderEnd(code);
569 }
570 
WriteCC4Encoder(ShaderCode & code,std::string_view comp,APIType api_type,const EFBCopyParams & params)571 static void WriteCC4Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
572                             const EFBCopyParams& params)
573 {
574   WriteSwizzler(code, params, EFBCopyFormat::RA4, api_type);
575   code.WriteFmt("  float2 texSample;\n"
576                 "  float4 color0;\n"
577                 "  float4 color1;\n");
578 
579   WriteSampleColor(code, comp, "texSample", 0, api_type, params);
580   code.WriteFmt("  color0.b = texSample.x;\n"
581                 "  color1.b = texSample.y;\n");
582 
583   WriteSampleColor(code, comp, "texSample", 1, api_type, params);
584   code.WriteFmt("  color0.g = texSample.x;\n"
585                 "  color1.g = texSample.y;\n");
586 
587   WriteSampleColor(code, comp, "texSample", 2, api_type, params);
588   code.WriteFmt("  color0.r = texSample.x;\n"
589                 "  color1.r = texSample.y;\n");
590 
591   WriteSampleColor(code, comp, "texSample", 3, api_type, params);
592   code.WriteFmt("  color0.a = texSample.x;\n"
593                 "  color1.a = texSample.y;\n");
594 
595   WriteToBitDepth(code, 4, "color0", "color0");
596   WriteToBitDepth(code, 4, "color1", "color1");
597 
598   code.WriteFmt("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
599   WriteEncoderEnd(code);
600 }
601 
WriteCC8Encoder(ShaderCode & code,std::string_view comp,APIType api_type,const EFBCopyParams & params)602 static void WriteCC8Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
603                             const EFBCopyParams& params)
604 {
605   WriteSwizzler(code, params, EFBCopyFormat::RA8, api_type);
606 
607   WriteSampleColor(code, comp, "ocol0.bg", 0, api_type, params);
608   WriteSampleColor(code, comp, "ocol0.ra", 1, api_type, params);
609 
610   WriteEncoderEnd(code);
611 }
612 
WriteZ8Encoder(ShaderCode & code,std::string_view multiplier,APIType api_type,const EFBCopyParams & params)613 static void WriteZ8Encoder(ShaderCode& code, std::string_view multiplier, APIType api_type,
614                            const EFBCopyParams& params)
615 {
616   WriteSwizzler(code, params, EFBCopyFormat::G8, api_type);
617 
618   code.WriteFmt(" float depth;\n");
619 
620   WriteSampleColor(code, "r", "depth", 0, api_type, params);
621   code.WriteFmt("ocol0.b = frac(depth * {});\n", multiplier);
622 
623   WriteSampleColor(code, "r", "depth", 1, api_type, params);
624   code.WriteFmt("ocol0.g = frac(depth * {});\n", multiplier);
625 
626   WriteSampleColor(code, "r", "depth", 2, api_type, params);
627   code.WriteFmt("ocol0.r = frac(depth * {});\n", multiplier);
628 
629   WriteSampleColor(code, "r", "depth", 3, api_type, params);
630   code.WriteFmt("ocol0.a = frac(depth * {});\n", multiplier);
631 
632   WriteEncoderEnd(code);
633 }
634 
WriteZ16Encoder(ShaderCode & code,APIType api_type,const EFBCopyParams & params)635 static void WriteZ16Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
636 {
637   WriteSwizzler(code, params, EFBCopyFormat::RA8, api_type);
638 
639   code.WriteFmt("  float depth;\n"
640                 "  float3 expanded;\n");
641 
642   // Byte order is reversed
643 
644   WriteSampleColor(code, "r", "depth", 0, api_type, params);
645 
646   code.WriteFmt("  depth *= 16777216.0;\n"
647                 "  expanded.r = floor(depth / (256.0 * 256.0));\n"
648                 "  depth -= expanded.r * 256.0 * 256.0;\n"
649                 "  expanded.g = floor(depth / 256.0);\n");
650 
651   code.WriteFmt("  ocol0.b = expanded.g / 255.0;\n"
652                 "  ocol0.g = expanded.r / 255.0;\n");
653 
654   WriteSampleColor(code, "r", "depth", 1, api_type, params);
655 
656   code.WriteFmt("  depth *= 16777216.0;\n"
657                 "  expanded.r = floor(depth / (256.0 * 256.0));\n"
658                 "  depth -= expanded.r * 256.0 * 256.0;\n"
659                 "  expanded.g = floor(depth / 256.0);\n");
660 
661   code.WriteFmt("  ocol0.r = expanded.g / 255.0;\n"
662                 "  ocol0.a = expanded.r / 255.0;\n");
663 
664   WriteEncoderEnd(code);
665 }
666 
WriteZ16LEncoder(ShaderCode & code,APIType api_type,const EFBCopyParams & params)667 static void WriteZ16LEncoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
668 {
669   WriteSwizzler(code, params, EFBCopyFormat::GB8, api_type);
670 
671   code.WriteFmt("  float depth;\n"
672                 "  float3 expanded;\n");
673 
674   // Byte order is reversed
675 
676   WriteSampleColor(code, "r", "depth", 0, api_type, params);
677 
678   code.WriteFmt("  depth *= 16777216.0;\n"
679                 "  expanded.r = floor(depth / (256.0 * 256.0));\n"
680                 "  depth -= expanded.r * 256.0 * 256.0;\n"
681                 "  expanded.g = floor(depth / 256.0);\n"
682                 "  depth -= expanded.g * 256.0;\n"
683                 "  expanded.b = depth;\n");
684 
685   code.WriteFmt("  ocol0.b = expanded.b / 255.0;\n"
686                 "  ocol0.g = expanded.g / 255.0;\n");
687 
688   WriteSampleColor(code, "r", "depth", 1, api_type, params);
689 
690   code.WriteFmt("  depth *= 16777216.0;\n"
691                 "  expanded.r = floor(depth / (256.0 * 256.0));\n"
692                 "  depth -= expanded.r * 256.0 * 256.0;\n"
693                 "  expanded.g = floor(depth / 256.0);\n"
694                 "  depth -= expanded.g * 256.0;\n"
695                 "  expanded.b = depth;\n");
696 
697   code.WriteFmt("  ocol0.r = expanded.b / 255.0;\n"
698                 "  ocol0.a = expanded.g / 255.0;\n");
699 
700   WriteEncoderEnd(code);
701 }
702 
WriteZ24Encoder(ShaderCode & code,APIType api_type,const EFBCopyParams & params)703 static void WriteZ24Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
704 {
705   WriteSwizzler(code, params, EFBCopyFormat::RGBA8, api_type);
706 
707   code.WriteFmt("  float depth0;\n"
708                 "  float depth1;\n"
709                 "  float3 expanded0;\n"
710                 "  float3 expanded1;\n");
711 
712   WriteSampleColor(code, "r", "depth0", 0, api_type, params);
713   WriteSampleColor(code, "r", "depth1", 1, api_type, params);
714 
715   for (int i = 0; i < 2; i++)
716   {
717     code.WriteFmt("  depth{} *= 16777216.0;\n", i);
718 
719     code.WriteFmt("  expanded{}.r = floor(depth{} / (256.0 * 256.0));\n", i, i);
720     code.WriteFmt("  depth{} -= expanded{}.r * 256.0 * 256.0;\n", i, i);
721     code.WriteFmt("  expanded{}.g = floor(depth{} / 256.0);\n", i, i);
722     code.WriteFmt("  depth{} -= expanded{}.g * 256.0;\n", i, i);
723     code.WriteFmt("  expanded{}.b = depth{};\n", i, i);
724   }
725 
726   code.WriteFmt("  if (!first) {{\n");
727   // Upper 16
728   code.WriteFmt("     ocol0.b = expanded0.g / 255.0;\n"
729                 "     ocol0.g = expanded0.b / 255.0;\n"
730                 "     ocol0.r = expanded1.g / 255.0;\n"
731                 "     ocol0.a = expanded1.b / 255.0;\n"
732                 "  }} else {{\n");
733   // Lower 8
734   code.WriteFmt("     ocol0.b = 1.0;\n"
735                 "     ocol0.g = expanded0.r / 255.0;\n"
736                 "     ocol0.r = 1.0;\n"
737                 "     ocol0.a = expanded1.r / 255.0;\n"
738                 "  }}\n");
739 
740   WriteEncoderEnd(code);
741 }
742 
// Emits the encoder for XFB copies. Two consecutive samples are gamma-adjusted and converted to
// YUV: each sample gets its own Y value (ocol0.b and ocol0.r), while U (ocol0.g) and V (ocol0.a)
// are computed from the average of the two samples.
static void WriteXFBEncoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
{
  WriteSwizzler(code, params, EFBCopyFormat::XFB, api_type);

  code.WriteFmt("float3 color0, color1;\n");
  WriteSampleColor(code, "rgb", "color0", 0, api_type, params);
  WriteSampleColor(code, "rgb", "color1", 1, api_type, params);

  // Gamma is only applied to XFB copies.
  code.WriteFmt("  color0 = pow(color0, float3(gamma_rcp, gamma_rcp, gamma_rcp));\n"
                "  color1 = pow(color1, float3(gamma_rcp, gamma_rcp, gamma_rcp));\n");

  // Convert to YUV.
  // Y is offset by 0.0625 and U/V by 0.5; the Y weights match the rgb part of the
  // IntensityConst used by WriteColorToIntensity().
  code.WriteFmt("  const float3 y_const = float3(0.257, 0.504, 0.098);\n"
                "  const float3 u_const = float3(-0.148, -0.291, 0.439);\n"
                "  const float3 v_const = float3(0.439, -0.368, -0.071);\n"
                "  float3 average = (color0 + color1) * 0.5;\n"
                "  ocol0.b = dot(color0,  y_const) + 0.0625;\n"
                "  ocol0.g = dot(average, u_const) + 0.5;\n"
                "  ocol0.r = dot(color1,  y_const) + 0.0625;\n"
                "  ocol0.a = dot(average, v_const) + 0.5;\n");

  WriteEncoderEnd(code);
}
767 
GenerateEncodingShader(const EFBCopyParams & params,APIType api_type)768 std::string GenerateEncodingShader(const EFBCopyParams& params, APIType api_type)
769 {
770   ShaderCode code;
771 
772   switch (params.copy_format)
773   {
774   case EFBCopyFormat::R4:
775     if (params.yuv)
776       WriteI4Encoder(code, api_type, params);
777     else
778       WriteC4Encoder(code, "r", api_type, params);
779     break;
780   case EFBCopyFormat::RA4:
781     if (params.yuv)
782       WriteIA4Encoder(code, api_type, params);
783     else
784       WriteCC4Encoder(code, "ar", api_type, params);
785     break;
786   case EFBCopyFormat::RA8:
787     if (params.yuv)
788       WriteIA8Encoder(code, api_type, params);
789     else
790       WriteCC8Encoder(code, "ar", api_type, params);
791     break;
792   case EFBCopyFormat::RGB565:
793     WriteRGB565Encoder(code, api_type, params);
794     break;
795   case EFBCopyFormat::RGB5A3:
796     WriteRGB5A3Encoder(code, api_type, params);
797     break;
798   case EFBCopyFormat::RGBA8:
799     if (params.depth)
800       WriteZ24Encoder(code, api_type, params);
801     else
802       WriteRGBA8Encoder(code, api_type, params);
803     break;
804   case EFBCopyFormat::A8:
805     WriteC8Encoder(code, "a", api_type, params);
806     break;
807   case EFBCopyFormat::R8_0x1:
808   case EFBCopyFormat::R8:
809     if (params.yuv)
810       WriteI8Encoder(code, api_type, params);
811     else
812       WriteC8Encoder(code, "r", api_type, params);
813     break;
814   case EFBCopyFormat::G8:
815     if (params.depth)
816       WriteZ8Encoder(code, "256.0", api_type, params);  // Z8M
817     else
818       WriteC8Encoder(code, "g", api_type, params);
819     break;
820   case EFBCopyFormat::B8:
821     if (params.depth)
822       WriteZ8Encoder(code, "65536.0", api_type, params);  // Z8L
823     else
824       WriteC8Encoder(code, "b", api_type, params);
825     break;
826   case EFBCopyFormat::RG8:
827     if (params.depth)
828       WriteZ16Encoder(code, api_type, params);  // Z16H
829     else
830       WriteCC8Encoder(code, "gr", api_type, params);
831     break;
832   case EFBCopyFormat::GB8:
833     if (params.depth)
834       WriteZ16LEncoder(code, api_type, params);  // Z16L
835     else
836       WriteCC8Encoder(code, "bg", api_type, params);
837     break;
838   case EFBCopyFormat::XFB:
839     WriteXFBEncoder(code, api_type, params);
840     break;
841   default:
842     PanicAlert("Invalid EFB Copy Format (0x%X)! (GenerateEncodingShader)",
843                static_cast<int>(params.copy_format));
844     break;
845   }
846 
847   return code.GetBuffer();
848 }
849 
// Shared preamble that GenerateDecodingShader() emits ahead of every format-specific body.
// It contains, in order:
//  - HAS_PALETTE, defined when any PALETTE_FORMAT_* macro is set;
//  - the uniform block (destination/source size, source offset, row stride, palette offset);
//  - the input/palette texel buffers and the output image, declared separately for API_D3D and
//    for the other APIs, together with macros (texelFetch, imageStore, GROUP_SHARED,
//    GROUP_MEMORY_BARRIER_WITH_SYNC, DEFINE_MAIN) that let the bodies be written once;
//  - a bitfieldExtract() replacement for the API_D3D path;
//  - helpers: Swap16, Convert{3,4,5,6}To8, GetTiledTexelOffset, GetPaletteColor and
//    GetPaletteColorNormalized.
// The text is part of the generated shader, so it must not be edited for cosmetic reasons.
// NOTE: In these uniforms, a row refers to a row of blocks, not texels.
static const char decoding_shader_header[] = R"(
#if defined(PALETTE_FORMAT_IA8) || defined(PALETTE_FORMAT_RGB565) || defined(PALETTE_FORMAT_RGB5A3)
#define HAS_PALETTE 1
#endif

#ifdef API_D3D
cbuffer UBO : register(b0) {
#else
UBO_BINDING(std140, 1) uniform UBO {
#endif
  uint2 u_dst_size;
  uint2 u_src_size;
  uint u_src_offset;
  uint u_src_row_stride;
  uint u_palette_offset;
};

#ifdef API_D3D

Buffer<uint4> s_input_buffer : register(t0);
#ifdef HAS_PALETTE
Buffer<uint4> s_palette_buffer : register(t1);
#endif

RWTexture2DArray<unorm float4> output_image : register(u0);

// Helpers for reading/writing.
#define texelFetch(buffer, pos) buffer.Load(pos)
#define imageStore(image, coords, value) image[coords] = value
#define GROUP_MEMORY_BARRIER_WITH_SYNC GroupMemoryBarrierWithGroupSync();
#define GROUP_SHARED groupshared

#define DEFINE_MAIN(lx, ly) \
  [numthreads(lx, ly, 1)] \
  void main(uint3 gl_WorkGroupID : SV_GroupId, \
            uint3 gl_LocalInvocationID : SV_GroupThreadID, \
            uint3 gl_GlobalInvocationID : SV_DispatchThreadID)

uint bitfieldExtract(uint val, int off, int size)
{
  // This built-in function is only support in OpenGL 4.0+ and ES 3.1+\n"
  // Microsoft's HLSL compiler automatically optimises this to a bitfield extract instruction.
  uint mask = uint((1 << size) - 1);
  return uint(val >> off) & mask;
}

#else

TEXEL_BUFFER_BINDING(0) uniform usamplerBuffer s_input_buffer;
#ifdef HAS_PALETTE
TEXEL_BUFFER_BINDING(1) uniform usamplerBuffer s_palette_buffer;
#endif
IMAGE_BINDING(rgba8, 0) uniform writeonly image2DArray output_image;

#define GROUP_MEMORY_BARRIER_WITH_SYNC memoryBarrierShared(); barrier();
#define GROUP_SHARED shared

#define DEFINE_MAIN(lx, ly) \
  layout(local_size_x = lx, local_size_y = ly) in; \
  void main()

#endif

uint Swap16(uint v)
{
  // Convert BE to LE.
  return ((v >> 8) | (v << 8)) & 0xFFFFu;
}

uint Convert3To8(uint v)
{
  // Swizzle bits: 00000123 -> 12312312
  return (v << 5) | (v << 2) | (v >> 1);
}
uint Convert4To8(uint v)
{
  // Swizzle bits: 00001234 -> 12341234
  return (v << 4) | v;
}
uint Convert5To8(uint v)
{
  // Swizzle bits: 00012345 -> 12345123
  return (v << 3) | (v >> 2);
}
uint Convert6To8(uint v)
{
  // Swizzle bits: 00123456 -> 12345612
  return (v << 2) | (v >> 4);
}

uint GetTiledTexelOffset(uint2 block_size, uint2 coords)
{
  uint2 block = coords / block_size;
  uint2 offset = coords % block_size;
  uint buffer_pos = u_src_offset;
  buffer_pos += block.y * u_src_row_stride;
  buffer_pos += block.x * (block_size.x * block_size.y);
  buffer_pos += offset.y * block_size.x;
  buffer_pos += offset.x;
  return buffer_pos;
}

uint4 GetPaletteColor(uint index)
{
  // Fetch and swap BE to LE.
  uint val = Swap16(texelFetch(s_palette_buffer, int(u_palette_offset + index)).x);

  uint4 color;
#if defined(PALETTE_FORMAT_IA8)
  uint a = bitfieldExtract(val, 8, 8);
  uint i = bitfieldExtract(val, 0, 8);
  color = uint4(i, i, i, a);
#elif defined(PALETTE_FORMAT_RGB565)
  color.x = Convert5To8(bitfieldExtract(val, 11, 5));
  color.y = Convert6To8(bitfieldExtract(val, 5, 6));
  color.z = Convert5To8(bitfieldExtract(val, 0, 5));
  color.a = 255u;

#elif defined(PALETTE_FORMAT_RGB5A3)
  if ((val & 0x8000u) != 0u)
  {
    color.x = Convert5To8(bitfieldExtract(val, 10, 5));
    color.y = Convert5To8(bitfieldExtract(val, 5, 5));
    color.z = Convert5To8(bitfieldExtract(val, 0, 5));
    color.a = 255u;
  }
  else
  {
    color.a = Convert3To8(bitfieldExtract(val, 12, 3));
    color.r = Convert4To8(bitfieldExtract(val, 8, 4));
    color.g = Convert4To8(bitfieldExtract(val, 4, 4));
    color.b = Convert4To8(bitfieldExtract(val, 0, 4));
  }
#else
  // Not used.
  color = uint4(0, 0, 0, 0);
#endif

  return color;
}

float4 GetPaletteColorNormalized(uint index)
{
  uint4 color = GetPaletteColor(index);
  return float4(color) / 255.0;
}

)";
999 
// Table of compute decoders, keyed by texture format and queried via GetDecodingShaderInfo().
// Each value is a DecodingShaderInfo initializer whose last element is the shader body that
// GenerateDecodingShader() appends after decoding_shader_header.
// NOTE(review): the initializer elements appear to be, in order: texel buffer format, palette
// size, group size x, group size y, group_flatten, shader body - confirm against the declaration
// of DecodingShaderInfo, which is not visible in this file.
static const std::map<TextureFormat, DecodingShaderInfo> s_decoding_shader_info{
    // I4: two 4-bit texels share one 8-bit buffer element, so the tiled offset is computed by
    // hand. NOTE(review): the code takes the high nibble when x is even; the comment inside the
    // shader text states the opposite.
    {TextureFormat::I4,
     {TEXEL_BUFFER_FORMAT_R8_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 8x8 blocks, 4 bits per pixel
        // We need to do the tiling manually here because the texel size is smaller than
        // the size of the buffer elements.
        uint2 block = coords.xy / 8u;
        uint2 offset = coords.xy % 8u;
        uint buffer_pos = u_src_offset;
        buffer_pos += block.y * u_src_row_stride;
        buffer_pos += block.x * 32u;
        buffer_pos += offset.y * 4u;
        buffer_pos += offset.x / 2u;

        // Select high nibble for odd texels, low for even.
        uint val = texelFetch(s_input_buffer, int(buffer_pos)).x;
        uint i;
        if ((coords.x & 1u) == 0u)
          i = Convert4To8((val >> 4));
        else
          i = Convert4To8((val & 0x0Fu));

        uint4 color = uint4(i, i, i, i);
        float4 norm_color = float4(color) / 255.0;

        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }

      )"}},
    // IA4: the low nibble is expanded to intensity, the high nibble to alpha.
    {TextureFormat::IA4,
     {TEXEL_BUFFER_FORMAT_R8_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 8x4 blocks, 8 bits per pixel
        uint buffer_pos = GetTiledTexelOffset(uint2(8u, 4u), coords);
        uint val = texelFetch(s_input_buffer, int(buffer_pos)).x;
        uint i = Convert4To8((val & 0x0Fu));
        uint a = Convert4To8((val >> 4));
        uint4 color = uint4(i, i, i, a);
        float4 norm_color = float4(color) / 255.0;

        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }
      )"}},
    // I8: the fetched byte is replicated into all four channels.
    {TextureFormat::I8,
     {TEXEL_BUFFER_FORMAT_R8_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 8x4 blocks, 8 bits per pixel
        uint buffer_pos = GetTiledTexelOffset(uint2(8u, 4u), coords);
        uint i = texelFetch(s_input_buffer, int(buffer_pos)).x;
        uint4 color = uint4(i, i, i, i);
        float4 norm_color = float4(color) / 255.0;

        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }
      )"}},
    // IA8: read without Swap16; the low byte of the fetched value is alpha, the high byte is
    // intensity.
    {TextureFormat::IA8,
     {TEXEL_BUFFER_FORMAT_R16_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 4x4 blocks, 16 bits per pixel
        uint buffer_pos = GetTiledTexelOffset(uint2(4u, 4u), coords);
        uint val = texelFetch(s_input_buffer, int(buffer_pos)).x;
        uint a = (val & 0xFFu);
        uint i = (val >> 8);
        uint4 color = uint4(i, i, i, a);
        float4 norm_color = float4(color) / 255.0;
        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }
      )"}},
    // RGB565: byte-swapped 16-bit value, 5/6/5 bit channels expanded to 8 bits, alpha set to 255.
    {TextureFormat::RGB565,
     {TEXEL_BUFFER_FORMAT_R16_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 4x4 blocks
        uint buffer_pos = GetTiledTexelOffset(uint2(4u, 4u), coords);
        uint val = Swap16(texelFetch(s_input_buffer, int(buffer_pos)).x);

        uint4 color;
        color.x = Convert5To8(bitfieldExtract(val, 11, 5));
        color.y = Convert6To8(bitfieldExtract(val, 5, 6));
        color.z = Convert5To8(bitfieldExtract(val, 0, 5));
        color.a = 255u;

        float4 norm_color = float4(color) / 255.0;
        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }

      )"}},
    // RGB5A3: bit 15 set selects three 5-bit channels with alpha 255; otherwise a 3-bit alpha
    // and three 4-bit channels are decoded.
    {TextureFormat::RGB5A3,
     {TEXEL_BUFFER_FORMAT_R16_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 4x4 blocks
        uint buffer_pos = GetTiledTexelOffset(uint2(4u, 4u), coords);
        uint val = Swap16(texelFetch(s_input_buffer, int(buffer_pos)).x);

        uint4 color;
        if ((val & 0x8000u) != 0u)
        {
          color.x = Convert5To8(bitfieldExtract(val, 10, 5));
          color.y = Convert5To8(bitfieldExtract(val, 5, 5));
          color.z = Convert5To8(bitfieldExtract(val, 0, 5));
          color.a = 255u;
        }
        else
        {
          color.a = Convert3To8(bitfieldExtract(val, 12, 3));
          color.r = Convert4To8(bitfieldExtract(val, 8, 4));
          color.g = Convert4To8(bitfieldExtract(val, 4, 4));
          color.b = Convert4To8(bitfieldExtract(val, 0, 4));
        }

        float4 norm_color = float4(color) / 255.0;
        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }

      )"}},
    // RGBA8: each texel is assembled from two 16-bit elements that sit 16 elements apart
    // (AR first, GB second), which is why GetTiledTexelOffset is not used here.
    {TextureFormat::RGBA8,
     {TEXEL_BUFFER_FORMAT_R16_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 4x4 blocks
        // We can't use the normal calculation function, as these are packed as the AR channels
        // for the entire block, then the GB channels afterwards.
        uint2 block = coords.xy / 4u;
        uint2 offset = coords.xy % 4u;
        uint buffer_pos = u_src_offset;

        // Our buffer has 16-bit elements, so the offsets here are half what they would be in bytes.
        buffer_pos += block.y * u_src_row_stride;
        buffer_pos += block.x * 32u;
        buffer_pos += offset.y * 4u;
        buffer_pos += offset.x;

        // The two GB channels follow after the block's AR channels.
        uint val1 = texelFetch(s_input_buffer, int(buffer_pos + 0u)).x;
        uint val2 = texelFetch(s_input_buffer, int(buffer_pos + 16u)).x;

        uint4 color;
        color.a = (val1 & 0xFFu);
        color.r = (val1 >> 8);
        color.g = (val2 & 0xFFu);
        color.b = (val2 >> 8);

        float4 norm_color = float4(color) / 255.0;
        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }
      )"}},
    // CMPR: the only entry whose fifth element is true (see GetDispatchCount for how the
    // flattened dispatch is sized). A 64-wide group covers four 4x4 blocks; one thread per block
    // loads the block data into shared memory before all threads decode their texel.
    // NOTE(review): the body passes 8 as the second DEFINE_MAIN argument while this entry lists
    // 64, 1 - confirm which local size is intended.
    {TextureFormat::CMPR,
     {TEXEL_BUFFER_FORMAT_R32G32_UINT, 0, 64, 1, true,
      R"(
      // In the compute version of this decoder, we flatten the blocks to a one-dimension array.
      // Each group is subdivided into 16, and the first thread in each group fetches the DXT data.
      // All threads then calculate the possible colors for the block and write to the output image.

      #define GROUP_SIZE 64u
      #define BLOCK_SIZE_X 4u
      #define BLOCK_SIZE_Y 4u
      #define BLOCK_SIZE (BLOCK_SIZE_X * BLOCK_SIZE_Y)
      #define BLOCKS_PER_GROUP (GROUP_SIZE / BLOCK_SIZE)

      uint DXTBlend(uint v1, uint v2)
      {
        // 3/8 blend, which is close to 1/3
        return ((v1 * 3u + v2 * 5u) >> 3);
      }

      GROUP_SHARED uint2 shared_temp[BLOCKS_PER_GROUP];

      DEFINE_MAIN(GROUP_SIZE, 8)
      {
        uint local_thread_id = gl_LocalInvocationID.x;
        uint block_in_group = local_thread_id / BLOCK_SIZE;
        uint thread_in_block = local_thread_id % BLOCK_SIZE;
        uint block_index = gl_WorkGroupID.x * BLOCKS_PER_GROUP + block_in_group;

        // Annoyingly, we can't precalculate this as a uniform because the DXT block size differs
        // from the block size of the overall texture (4 vs 8). We can however use a multiply and
        // subtraction to avoid the modulo for calculating the block's X coordinate.
        uint blocks_wide = u_src_size.x / BLOCK_SIZE_X;
        uint2 block_coords;
        block_coords.y = block_index / blocks_wide;
        block_coords.x = block_index - (block_coords.y * blocks_wide);

        // Only the first thread for each block reads from the texel buffer.
        if (thread_in_block == 0u)
        {
          // Calculate tiled block coordinates.
          uint2 tile_block_coords = block_coords / 2u;
          uint2 subtile_block_coords = block_coords % 2u;
          uint buffer_pos = u_src_offset;
          buffer_pos += tile_block_coords.y * u_src_row_stride;
          buffer_pos += tile_block_coords.x * 4u;
          buffer_pos += subtile_block_coords.y * 2u;
          buffer_pos += subtile_block_coords.x;

          // Read the entire DXT block to shared memory.
          uint2 raw_data = texelFetch(s_input_buffer, int(buffer_pos)).xy;
          shared_temp[block_in_group] = raw_data;
        }

        // Ensure store is completed before the remaining threads in the block continue.
        GROUP_MEMORY_BARRIER_WITH_SYNC;

        // Unpack colors and swap BE to LE.
        uint2 raw_data = shared_temp[block_in_group];
        uint swapped = ((raw_data.x & 0xFF00FF00u) >> 8) | ((raw_data.x & 0x00FF00FFu) << 8);
        uint c1 = swapped & 0xFFFFu;
        uint c2 = swapped >> 16;

        // Expand 5/6 bit channels to 8-bits per channel.
        uint blue1 = Convert5To8(bitfieldExtract(c1, 0, 5));
        uint blue2 = Convert5To8(bitfieldExtract(c2, 0, 5));
        uint green1 = Convert6To8(bitfieldExtract(c1, 5, 6));
        uint green2 = Convert6To8(bitfieldExtract(c2, 5, 6));
        uint red1 = Convert5To8(bitfieldExtract(c1, 11, 5));
        uint red2 = Convert5To8(bitfieldExtract(c2, 11, 5));

        // Determine the four colors the block can use.
        // It's quicker to just precalculate all four colors rather than branching on the index.
        // NOTE: These must be masked with 0xFF. This is done at the normalization stage below.
        uint4 color0, color1, color2, color3;
        color0 = uint4(red1, green1, blue1, 255u);
        color1 = uint4(red2, green2, blue2, 255u);
        if (c1 > c2)
        {
          color2 = uint4(DXTBlend(red2, red1), DXTBlend(green2, green1), DXTBlend(blue2, blue1), 255u);
          color3 = uint4(DXTBlend(red1, red2), DXTBlend(green1, green2), DXTBlend(blue1, blue2), 255u);
        }
        else
        {
          color2 = uint4((red1 + red2) / 2u, (green1 + green2) / 2u, (blue1 + blue2) / 2u, 255u);
          color3 = uint4((red1 + red2) / 2u, (green1 + green2) / 2u, (blue1 + blue2) / 2u, 0u);
        }

        // Calculate the texel coordinates that we will write to.
        // The divides/modulo here should be turned into a shift/binary AND.
        uint local_y = thread_in_block / BLOCK_SIZE_X;
        uint local_x = thread_in_block % BLOCK_SIZE_X;
        uint global_x = block_coords.x * BLOCK_SIZE_X + local_x;
        uint global_y = block_coords.y * BLOCK_SIZE_Y + local_y;

        // Use the coordinates within the block to shift the 32-bit value containing
        // all 16 indices to a single 2-bit index.
        uint index = bitfieldExtract(raw_data.y, int((local_y * 8u) + (6u - local_x * 2u)), 2);

        // Select the un-normalized color from the precalculated color array.
        // Using a switch statement here removes the need for dynamic indexing of an array.
        uint4 color;
        switch (index)
        {
        case 0u:  color = color0;   break;
        case 1u:  color = color1;   break;
        case 2u:  color = color2;   break;
        case 3u:  color = color3;   break;
        default:  color = color0;   break;
        }

        // Normalize and write to the output image.
        float4 norm_color = float4(color & 0xFFu) / 255.0;
        imageStore(output_image, int3(int2(uint2(global_x, global_y)), 0), norm_color);
      }
      )"}},
    // C4: same manual 4-bit tiling as I4, but the nibble is used as a palette index. The second
    // initializer element comes from TexDecoder_GetPaletteSize().
    // NOTE(review): as with I4, the high nibble is taken when x is even, contrary to the comment
    // inside the shader text.
    {TextureFormat::C4,
     {TEXEL_BUFFER_FORMAT_R8_UINT, static_cast<u32>(TexDecoder_GetPaletteSize(TextureFormat::C4)),
      8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 8x8 blocks, 4 bits per pixel
        // We need to do the tiling manually here because the texel size is smaller than
        // the size of the buffer elements.
        uint2 block = coords.xy / 8u;
        uint2 offset = coords.xy % 8u;
        uint buffer_pos = u_src_offset;
        buffer_pos += block.y * u_src_row_stride;
        buffer_pos += block.x * 32u;
        buffer_pos += offset.y * 4u;
        buffer_pos += offset.x / 2u;

        // Select high nibble for odd texels, low for even.
        uint val = texelFetch(s_input_buffer, int(buffer_pos)).x;
        uint index = ((coords.x & 1u) == 0u) ? (val >> 4) : (val & 0x0Fu);
        float4 norm_color = GetPaletteColorNormalized(index);
        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }

      )"}},
    // C8: the fetched byte is the palette index.
    {TextureFormat::C8,
     {TEXEL_BUFFER_FORMAT_R8_UINT, static_cast<u32>(TexDecoder_GetPaletteSize(TextureFormat::C8)),
      8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 8x4 blocks, 8 bits per pixel
        uint buffer_pos = GetTiledTexelOffset(uint2(8u, 4u), coords);
        uint index = texelFetch(s_input_buffer, int(buffer_pos)).x;
        float4 norm_color = GetPaletteColorNormalized(index);
        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }
      )"}},
    // C14X2: byte-swapped 16-bit value masked to its low 14 bits gives the palette index.
    {TextureFormat::C14X2,
     {TEXEL_BUFFER_FORMAT_R16_UINT,
      static_cast<u32>(TexDecoder_GetPaletteSize(TextureFormat::C14X2)), 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 coords = gl_GlobalInvocationID.xy;

        // Tiled in 4x4 blocks, 16 bits per pixel
        uint buffer_pos = GetTiledTexelOffset(uint2(4u, 4u), coords);
        uint index = Swap16(texelFetch(s_input_buffer, int(buffer_pos)).x) & 0x3FFFu;
        float4 norm_color = GetPaletteColorNormalized(index);
        imageStore(output_image, int3(int2(coords), 0), norm_color);
      }
      )"}},

    // XFB: not tiled; each 4-component element holds two pixels (luma in .r/.b, shared chroma
    // in .g/.a), so the element index is x / 2 and the luma is picked by the parity of x.
    // We do the inverse BT.601 conversion for YCbCr to RGB
    // http://www.equasys.de/colorconversion.html#YCbCr-RGBColorFormatConversion
    {TextureFormat::XFB,
     {TEXEL_BUFFER_FORMAT_RGBA8_UINT, 0, 8, 8, false,
      R"(
      DEFINE_MAIN(8, 8)
      {
        uint2 uv = gl_GlobalInvocationID.xy;
        int buffer_pos = int(u_src_offset + (uv.y * u_src_row_stride) + (uv.x / 2u));
        float4 yuyv = float4(texelFetch(s_input_buffer, buffer_pos));

        float y = (uv.x & 1u) != 0u ? yuyv.b : yuyv.r;

        float yComp = 1.164 * (y - 16.0);
        float uComp = yuyv.g - 128.0;
        float vComp = yuyv.a - 128.0;

        float4 rgb = float4(yComp + (1.596 * vComp),
                        yComp - (0.813 * vComp) - (0.391 * uComp),
                        yComp + (2.018 * uComp),
                        255.0);
        float4 rgba_norm = rgb / 255.0;
        imageStore(output_image, int3(int2(uv), 0), rgba_norm);
      }
      )"}}};
1371 
GetDecodingShaderInfo(TextureFormat format)1372 const DecodingShaderInfo* GetDecodingShaderInfo(TextureFormat format)
1373 {
1374   auto iter = s_decoding_shader_info.find(format);
1375   return iter != s_decoding_shader_info.end() ? &iter->second : nullptr;
1376 }
1377 
1378 std::pair<u32, u32> GetDispatchCount(const DecodingShaderInfo* info, u32 width, u32 height)
1379 {
1380   // Flatten to a single dimension?
1381   if (info->group_flatten)
1382     return {(width * height + (info->group_size_x - 1)) / info->group_size_x, 1};
1383 
1384   return {(width + (info->group_size_x - 1)) / info->group_size_x,
1385           (height + (info->group_size_y - 1)) / info->group_size_y};
1386 }
1387 
1388 std::string GenerateDecodingShader(TextureFormat format, TLUTFormat palette_format,
1389                                    APIType api_type)
1390 {
1391   const DecodingShaderInfo* info = GetDecodingShaderInfo(format);
1392   if (!info)
1393     return "";
1394 
1395   std::ostringstream ss;
1396   switch (palette_format)
1397   {
1398   case TLUTFormat::IA8:
1399     ss << "#define PALETTE_FORMAT_IA8 1\n";
1400     break;
1401   case TLUTFormat::RGB565:
1402     ss << "#define PALETTE_FORMAT_RGB565 1\n";
1403     break;
1404   case TLUTFormat::RGB5A3:
1405     ss << "#define PALETTE_FORMAT_RGB5A3 1\n";
1406     break;
1407   }
1408 
1409   ss << decoding_shader_header;
1410   ss << info->shader_body;
1411 
1412   return ss.str();
1413 }
1414 
1415 std::string GeneratePaletteConversionShader(TLUTFormat palette_format, APIType api_type)
1416 {
1417   std::ostringstream ss;
1418 
1419   ss << R"(
1420 int Convert3To8(int v)
1421 {
1422   // Swizzle bits: 00000123 -> 12312312
1423   return (v << 5) | (v << 2) | (v >> 1);
1424 }
1425 int Convert4To8(int v)
1426 {
1427   // Swizzle bits: 00001234 -> 12341234
1428   return (v << 4) | v;
1429 }
1430 int Convert5To8(int v)
1431 {
1432   // Swizzle bits: 00012345 -> 12345123
1433   return (v << 3) | (v >> 2);
1434 }
1435 int Convert6To8(int v)
1436 {
1437   // Swizzle bits: 00123456 -> 12345612
1438   return (v << 2) | (v >> 4);
1439 })";
1440 
1441   switch (palette_format)
1442   {
1443   case TLUTFormat::IA8:
1444     ss << R"(
1445 float4 DecodePixel(int val)
1446 {
1447   int i = val & 0xFF;
1448   int a = val >> 8;
1449   return float4(i, i, i, a) / 255.0;
1450 })";
1451     break;
1452 
1453   case TLUTFormat::RGB565:
1454     ss << R"(
1455 float4 DecodePixel(int val)
1456 {
1457   int r, g, b, a;
1458   r = Convert5To8((val >> 11) & 0x1f);
1459   g = Convert6To8((val >> 5) & 0x3f);
1460   b = Convert5To8((val) & 0x1f);
1461   a = 0xFF;
1462   return float4(r, g, b, a) / 255.0;
1463 })";
1464     break;
1465 
1466   case TLUTFormat::RGB5A3:
1467     ss << R"(
1468 float4 DecodePixel(int val)
1469 {
1470   int r,g,b,a;
1471   if ((val&0x8000) > 0)
1472   {
1473     r=Convert5To8((val>>10) & 0x1f);
1474     g=Convert5To8((val>>5 ) & 0x1f);
1475     b=Convert5To8((val    ) & 0x1f);
1476     a=0xFF;
1477   }
1478   else
1479   {
1480     a=Convert3To8((val>>12) & 0x7);
1481     r=Convert4To8((val>>8 ) & 0xf);
1482     g=Convert4To8((val>>4 ) & 0xf);
1483     b=Convert4To8((val    ) & 0xf);
1484   }
1485   return float4(r, g, b, a) / 255.0;
1486 })";
1487     break;
1488 
1489   default:
1490     PanicAlert("Unknown format");
1491     break;
1492   }
1493 
1494   ss << "\n";
1495 
1496   if (api_type == APIType::D3D)
1497   {
1498     ss << "Buffer<uint> tex0 : register(t0);\n";
1499     ss << "Texture2DArray tex1 : register(t1);\n";
1500     ss << "SamplerState samp1 : register(s1);\n";
1501     ss << "cbuffer PSBlock : register(b0) {\n";
1502   }
1503   else
1504   {
1505     ss << "TEXEL_BUFFER_BINDING(0) uniform usamplerBuffer samp0;\n";
1506     ss << "SAMPLER_BINDING(1) uniform sampler2DArray samp1;\n";
1507     ss << "UBO_BINDING(std140, 1) uniform PSBlock {\n";
1508   }
1509 
1510   ss << "  float multiplier;\n";
1511   ss << "  int texel_buffer_offset;\n";
1512   ss << "};\n";
1513 
1514   if (api_type == APIType::D3D)
1515   {
1516     ss << "void main(in float3 v_tex0 : TEXCOORD0, out float4 ocol0 : SV_Target) {\n";
1517     ss << "  int src = int(round(tex1.Sample(samp1, v_tex0).r * multiplier));\n";
1518     ss << "  src = int(tex0.Load(src + texel_buffer_offset).r);\n";
1519   }
1520   else
1521   {
1522     if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
1523     {
1524       ss << "VARYING_LOCATION(0) in VertexData {\n";
1525       ss << "  float3 v_tex0;\n";
1526       ss << "};\n";
1527     }
1528     else
1529     {
1530       ss << "VARYING_LOCATION(0) in float3 v_tex0;\n";
1531     }
1532     ss << "FRAGMENT_OUTPUT_LOCATION(0) out float4 ocol0;\n";
1533     ss << "void main() {\n";
1534     ss << "  float3 coords = v_tex0;\n";
1535     ss << "  int src = int(round(texture(samp1, coords).r * multiplier));\n";
1536     ss << "  src = int(texelFetch(samp0, src + texel_buffer_offset).r);\n";
1537   }
1538 
1539   ss << "  src = ((src << 8) & 0xFF00) | (src >> 8);\n";
1540   ss << "  ocol0 = DecodePixel(src);\n";
1541   ss << "}\n";
1542 
1543   return ss.str();
1544 }
1545 
1546 }  // namespace TextureConversionShaderTiled
1547