1 // Copyright 2008 Dolphin Emulator Project
2 // Licensed under GPLv2+
3 // Refer to the license.txt file included.
4 
5 #include "VideoCommon/PixelShaderGen.h"
6 
7 #include <cmath>
8 #include <cstdio>
9 
10 #include "Common/Assert.h"
11 #include "Common/CommonTypes.h"
12 #include "Common/Logging/Log.h"
13 #include "VideoCommon/BPMemory.h"
14 #include "VideoCommon/BoundingBox.h"
15 #include "VideoCommon/DriverDetails.h"
16 #include "VideoCommon/LightingShaderGen.h"
17 #include "VideoCommon/NativeVertexFormat.h"
18 #include "VideoCommon/RenderState.h"
19 #include "VideoCommon/VertexLoaderManager.h"
20 #include "VideoCommon/VideoCommon.h"
21 #include "VideoCommon/VideoConfig.h"
22 #include "VideoCommon/XFMemory.h"  // for texture projection mode
23 
// TODO: Get rid of these
// Base indices of the pixel shader constants. Every entry is defined as the previous entry plus
// a count, and the trailing comment on each line is the resulting value. C_COLORMATRIX and
// C_COLORS intentionally share index 0.
// NOTE(review): the counts up to C_FOGI match the array lengths of the leading PSBlock members
// written by WritePixelShaderCommonHeader (I_COLORS[4], I_KCOLORS[4], I_ALPHA, I_TEXDIMS[8],
// I_ZBIAS[2], I_INDTEXSCALE[2], I_INDTEXMTX[6], I_FOGCOLOR, I_FOGI). From C_FOGF onwards they no
// longer line up with that block (which has I_FOGF followed by I_FOGRANGE[3]) - confirm whether
// anything still relies on these values.
enum : u32
{
  C_COLORMATRIX = 0,                //  0
  C_COLORS = 0,                     //  0
  C_KCOLORS = C_COLORS + 4,         //  4
  C_ALPHA = C_KCOLORS + 4,          //  8
  C_TEXDIMS = C_ALPHA + 1,          //  9
  C_ZBIAS = C_TEXDIMS + 8,          // 17
  C_INDTEXSCALE = C_ZBIAS + 2,      // 19
  C_INDTEXMTX = C_INDTEXSCALE + 2,  // 21
  C_FOGCOLOR = C_INDTEXMTX + 6,     // 27
  C_FOGI = C_FOGCOLOR + 1,          // 28
  C_FOGF = C_FOGI + 1,              // 29
  C_ZSLOPE = C_FOGF + 2,            // 31
  C_EFBSCALE = C_ZSLOPE + 1,        // 32
  C_PENVCONST_END = C_EFBSCALE + 1  // 33
};
42 
43 constexpr std::array<const char*, 32> tev_ksel_table_c{
44     "255,255,255",        // 1   = 0x00
45     "223,223,223",        // 7_8 = 0x01
46     "191,191,191",        // 3_4 = 0x02
47     "159,159,159",        // 5_8 = 0x03
48     "128,128,128",        // 1_2 = 0x04
49     "96,96,96",           // 3_8 = 0x05
50     "64,64,64",           // 1_4 = 0x06
51     "32,32,32",           // 1_8 = 0x07
52     "0,0,0",              // INVALID = 0x08
53     "0,0,0",              // INVALID = 0x09
54     "0,0,0",              // INVALID = 0x0a
55     "0,0,0",              // INVALID = 0x0b
56     I_KCOLORS "[0].rgb",  // K0 = 0x0C
57     I_KCOLORS "[1].rgb",  // K1 = 0x0D
58     I_KCOLORS "[2].rgb",  // K2 = 0x0E
59     I_KCOLORS "[3].rgb",  // K3 = 0x0F
60     I_KCOLORS "[0].rrr",  // K0_R = 0x10
61     I_KCOLORS "[1].rrr",  // K1_R = 0x11
62     I_KCOLORS "[2].rrr",  // K2_R = 0x12
63     I_KCOLORS "[3].rrr",  // K3_R = 0x13
64     I_KCOLORS "[0].ggg",  // K0_G = 0x14
65     I_KCOLORS "[1].ggg",  // K1_G = 0x15
66     I_KCOLORS "[2].ggg",  // K2_G = 0x16
67     I_KCOLORS "[3].ggg",  // K3_G = 0x17
68     I_KCOLORS "[0].bbb",  // K0_B = 0x18
69     I_KCOLORS "[1].bbb",  // K1_B = 0x19
70     I_KCOLORS "[2].bbb",  // K2_B = 0x1A
71     I_KCOLORS "[3].bbb",  // K3_B = 0x1B
72     I_KCOLORS "[0].aaa",  // K0_A = 0x1C
73     I_KCOLORS "[1].aaa",  // K1_A = 0x1D
74     I_KCOLORS "[2].aaa",  // K2_A = 0x1E
75     I_KCOLORS "[3].aaa",  // K3_A = 0x1F
76 };
77 
78 constexpr std::array<const char*, 32> tev_ksel_table_a{
79     "255",              // 1   = 0x00
80     "223",              // 7_8 = 0x01
81     "191",              // 3_4 = 0x02
82     "159",              // 5_8 = 0x03
83     "128",              // 1_2 = 0x04
84     "96",               // 3_8 = 0x05
85     "64",               // 1_4 = 0x06
86     "32",               // 1_8 = 0x07
87     "0",                // INVALID = 0x08
88     "0",                // INVALID = 0x09
89     "0",                // INVALID = 0x0a
90     "0",                // INVALID = 0x0b
91     "0",                // INVALID = 0x0c
92     "0",                // INVALID = 0x0d
93     "0",                // INVALID = 0x0e
94     "0",                // INVALID = 0x0f
95     I_KCOLORS "[0].r",  // K0_R = 0x10
96     I_KCOLORS "[1].r",  // K1_R = 0x11
97     I_KCOLORS "[2].r",  // K2_R = 0x12
98     I_KCOLORS "[3].r",  // K3_R = 0x13
99     I_KCOLORS "[0].g",  // K0_G = 0x14
100     I_KCOLORS "[1].g",  // K1_G = 0x15
101     I_KCOLORS "[2].g",  // K2_G = 0x16
102     I_KCOLORS "[3].g",  // K3_G = 0x17
103     I_KCOLORS "[0].b",  // K0_B = 0x18
104     I_KCOLORS "[1].b",  // K1_B = 0x19
105     I_KCOLORS "[2].b",  // K2_B = 0x1A
106     I_KCOLORS "[3].b",  // K3_B = 0x1B
107     I_KCOLORS "[0].a",  // K0_A = 0x1C
108     I_KCOLORS "[1].a",  // K1_A = 0x1D
109     I_KCOLORS "[2].a",  // K2_A = 0x1E
110     I_KCOLORS "[3].a",  // K3_A = 0x1F
111 };
112 
// Shader source expressions for the 16 color combiner inputs. The entry order follows the input
// names given in the per-entry comments (CPREV first, ZERO last).
constexpr std::array<const char*, 16> tev_c_input_table = {
    "prev.rgb",           //  0: CPREV
    "prev.aaa",           //  1: APREV
    "c0.rgb",             //  2: C0
    "c0.aaa",             //  3: A0
    "c1.rgb",             //  4: C1
    "c1.aaa",             //  5: A1
    "c2.rgb",             //  6: C2
    "c2.aaa",             //  7: A2
    "textemp.rgb",        //  8: TEXC
    "textemp.aaa",        //  9: TEXA
    "rastemp.rgb",        // 10: RASC
    "rastemp.aaa",        // 11: RASA
    "int3(255,255,255)",  // 12: ONE
    "int3(128,128,128)",  // 13: HALF
    "konsttemp.rgb",      // 14: KONST
    "int3(0,0,0)",        // 15: ZERO
};
131 
// Shader source expressions for the 8 alpha combiner inputs. The entry order follows the input
// names given in the per-entry comments (APREV first, ZERO last).
constexpr std::array<const char*, 8> tev_a_input_table = {
    "prev.a",       // 0: APREV
    "c0.a",         // 1: A0
    "c1.a",         // 2: A1
    "c2.a",         // 3: A2
    "textemp.a",    // 4: TEXA
    "rastemp.a",    // 5: RASA
    "konsttemp.a",  // 6: KONST (hw1 had quarter)
    "0",            // 7: ZERO
};
142 
// Shader source expressions for the 8 rasterized color channel selections. Entries 2-4 are
// placeholder strings ("ERROR13".."ERROR15") rather than valid shader expressions.
constexpr std::array<const char*, 8> tev_ras_table = {
    "iround(col0 * 255.0)",                                 // 0: first vertex color
    "iround(col1 * 255.0)",                                 // 1: second vertex color
    "ERROR13",                                              // 2
    "ERROR14",                                              // 3
    "ERROR15",                                              // 4
    "(int4(1, 1, 1, 1) * alphabump)",                       // 5: bump alpha (0..248)
    "(int4(1, 1, 1, 1) * (alphabump | (alphabump >> 5)))",  // 6: normalized bump alpha (0..255)
    "int4(0, 0, 0, 0)",                                     // 7: zero
};
153 
// Shader variables (rgb part) that can receive a color combiner result: prev, then c0..c2.
constexpr std::array<const char*, 4> tev_c_output_table = {
    "prev.rgb",  // 0
    "c0.rgb",    // 1
    "c1.rgb",    // 2
    "c2.rgb",    // 3
};
160 
// Shader variables (alpha part) that can receive an alpha combiner result: prev, then c0..c2.
constexpr std::array<const char*, 4> tev_a_output_table = {
    "prev.a",  // 0
    "c0.a",    // 1
    "c1.a",    // 2
    "c2.a",    // 3
};
167 
168 // FIXME: Some of the video card's capabilities (BBox support, EarlyZ support, dstAlpha support)
169 //        leak into this UID; This is really unhelpful if these UIDs ever move from one machine to
170 //        another.
GetPixelShaderUid()171 PixelShaderUid GetPixelShaderUid()
172 {
173   PixelShaderUid out;
174 
175   pixel_shader_uid_data* const uid_data = out.GetUidData();
176   uid_data->useDstAlpha = bpmem.dstalpha.enable && bpmem.blendmode.alphaupdate &&
177                           bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24;
178 
179   uid_data->genMode_numindstages = bpmem.genMode.numindstages;
180   uid_data->genMode_numtevstages = bpmem.genMode.numtevstages;
181   uid_data->genMode_numtexgens = bpmem.genMode.numtexgens;
182   uid_data->bounding_box = g_ActiveConfig.bBBoxEnable && BoundingBox::IsEnabled();
183   uid_data->rgba6_format =
184       bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24 && !g_ActiveConfig.bForceTrueColor;
185   uid_data->dither = bpmem.blendmode.dither && uid_data->rgba6_format;
186   uid_data->uint_output = bpmem.blendmode.UseLogicOp();
187 
188   u32 numStages = uid_data->genMode_numtevstages + 1;
189 
190   const bool forced_early_z =
191       bpmem.UseEarlyDepthTest() &&
192       (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED)
193       // We can't allow early_ztest for zfreeze because depth is overridden per-pixel.
194       // This means it's impossible for zcomploc to be emulated on a zfrozen polygon.
195       && !(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
196   const bool per_pixel_depth =
197       (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) ||
198       (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) ||
199       (bpmem.zmode.testenable && bpmem.genMode.zfreeze);
200 
201   uid_data->per_pixel_depth = per_pixel_depth;
202   uid_data->forced_early_z = forced_early_z;
203 
204   if (g_ActiveConfig.bEnablePixelLighting)
205   {
206     // The lighting shader only needs the two color bits of the 23bit component bit array.
207     uid_data->components =
208         (VertexLoaderManager::g_current_components & (VB_HAS_COL0 | VB_HAS_COL1)) >> VB_COL_SHIFT;
209     uid_data->numColorChans = xfmem.numChan.numColorChans;
210     GetLightingShaderUid(uid_data->lighting);
211   }
212 
213   if (uid_data->genMode_numtexgens > 0)
214   {
215     for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
216     {
217       // optional perspective divides
218       uid_data->texMtxInfo_n_projection |= xfmem.texMtxInfo[i].projection << i;
219     }
220   }
221 
222   // indirect texture map lookup
223   int nIndirectStagesUsed = 0;
224   if (uid_data->genMode_numindstages > 0)
225   {
226     for (unsigned int i = 0; i < numStages; ++i)
227     {
228       if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < uid_data->genMode_numindstages)
229         nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
230     }
231   }
232 
233   uid_data->nIndirectStagesUsed = nIndirectStagesUsed;
234   for (u32 i = 0; i < uid_data->genMode_numindstages; ++i)
235   {
236     if (uid_data->nIndirectStagesUsed & (1 << i))
237       uid_data->SetTevindrefValues(i, bpmem.tevindref.getTexCoord(i), bpmem.tevindref.getTexMap(i));
238   }
239 
240   for (unsigned int n = 0; n < numStages; n++)
241   {
242     int texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
243     bool bHasTexCoord = (u32)texcoord < bpmem.genMode.numtexgens;
244     // HACK to handle cases where the tex gen is not enabled
245     if (!bHasTexCoord)
246       texcoord = bpmem.genMode.numtexgens;
247 
248     uid_data->stagehash[n].hasindstage = bpmem.tevind[n].bt < bpmem.genMode.numindstages;
249     uid_data->stagehash[n].tevorders_texcoord = texcoord;
250     if (uid_data->stagehash[n].hasindstage)
251       uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
252 
253     TevStageCombiner::ColorCombiner& cc = bpmem.combiners[n].colorC;
254     TevStageCombiner::AlphaCombiner& ac = bpmem.combiners[n].alphaC;
255     uid_data->stagehash[n].cc = cc.hex & 0xFFFFFF;
256     uid_data->stagehash[n].ac = ac.hex & 0xFFFFF0;  // Storing rswap and tswap later
257 
258     if (cc.a == TEVCOLORARG_RASA || cc.a == TEVCOLORARG_RASC || cc.b == TEVCOLORARG_RASA ||
259         cc.b == TEVCOLORARG_RASC || cc.c == TEVCOLORARG_RASA || cc.c == TEVCOLORARG_RASC ||
260         cc.d == TEVCOLORARG_RASA || cc.d == TEVCOLORARG_RASC || ac.a == TEVALPHAARG_RASA ||
261         ac.b == TEVALPHAARG_RASA || ac.c == TEVALPHAARG_RASA || ac.d == TEVALPHAARG_RASA)
262     {
263       const int i = bpmem.combiners[n].alphaC.rswap;
264       uid_data->stagehash[n].tevksel_swap1a = bpmem.tevksel[i * 2].swap1;
265       uid_data->stagehash[n].tevksel_swap2a = bpmem.tevksel[i * 2].swap2;
266       uid_data->stagehash[n].tevksel_swap1b = bpmem.tevksel[i * 2 + 1].swap1;
267       uid_data->stagehash[n].tevksel_swap2b = bpmem.tevksel[i * 2 + 1].swap2;
268       uid_data->stagehash[n].tevorders_colorchan = bpmem.tevorders[n / 2].getColorChan(n & 1);
269     }
270 
271     uid_data->stagehash[n].tevorders_enable = bpmem.tevorders[n / 2].getEnable(n & 1);
272     if (uid_data->stagehash[n].tevorders_enable)
273     {
274       const int i = bpmem.combiners[n].alphaC.tswap;
275       uid_data->stagehash[n].tevksel_swap1c = bpmem.tevksel[i * 2].swap1;
276       uid_data->stagehash[n].tevksel_swap2c = bpmem.tevksel[i * 2].swap2;
277       uid_data->stagehash[n].tevksel_swap1d = bpmem.tevksel[i * 2 + 1].swap1;
278       uid_data->stagehash[n].tevksel_swap2d = bpmem.tevksel[i * 2 + 1].swap2;
279       uid_data->stagehash[n].tevorders_texmap = bpmem.tevorders[n / 2].getTexMap(n & 1);
280     }
281 
282     if (cc.a == TEVCOLORARG_KONST || cc.b == TEVCOLORARG_KONST || cc.c == TEVCOLORARG_KONST ||
283         cc.d == TEVCOLORARG_KONST || ac.a == TEVALPHAARG_KONST || ac.b == TEVALPHAARG_KONST ||
284         ac.c == TEVALPHAARG_KONST || ac.d == TEVALPHAARG_KONST)
285     {
286       uid_data->stagehash[n].tevksel_kc = bpmem.tevksel[n / 2].getKC(n & 1);
287       uid_data->stagehash[n].tevksel_ka = bpmem.tevksel[n / 2].getKA(n & 1);
288     }
289   }
290 
291 #define MY_STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
292   uid_data->num_values = (g_ActiveConfig.bEnablePixelLighting) ?
293                              sizeof(*uid_data) :
294                              MY_STRUCT_OFFSET(*uid_data, stagehash[numStages]);
295 
296   AlphaTest::TEST_RESULT Pretest = bpmem.alpha_test.TestResult();
297   uid_data->Pretest = Pretest;
298   uid_data->late_ztest = bpmem.UseLateDepthTest();
299 
300   // NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
301   // (in this case we need to write a depth value if depth test passes regardless of the alpha
302   // testing result)
303   if (uid_data->Pretest == AlphaTest::UNDETERMINED ||
304       (uid_data->Pretest == AlphaTest::FAIL && uid_data->late_ztest))
305   {
306     uid_data->alpha_test_comp0 = bpmem.alpha_test.comp0;
307     uid_data->alpha_test_comp1 = bpmem.alpha_test.comp1;
308     uid_data->alpha_test_logic = bpmem.alpha_test.logic;
309 
310     // ZCOMPLOC HACK:
311     // The only way to emulate alpha test + early-z is to force early-z in the shader.
312     // As this isn't available on all drivers and as we can't emulate this feature otherwise,
313     // we are only able to choose which one we want to respect more.
314     // Tests seem to have proven that writing depth even when the alpha test fails is more
315     // important that a reliable alpha test, so we just force the alpha test to always succeed.
316     // At least this seems to be less buggy.
317     uid_data->alpha_test_use_zcomploc_hack =
318         bpmem.UseEarlyDepthTest() && bpmem.zmode.updateenable &&
319         !g_ActiveConfig.backend_info.bSupportsEarlyZ && !bpmem.genMode.zfreeze;
320   }
321 
322   uid_data->zfreeze = bpmem.genMode.zfreeze;
323   uid_data->ztex_op = bpmem.ztex2.op;
324   uid_data->early_ztest = bpmem.UseEarlyDepthTest();
325   uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
326   uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
327   uid_data->fog_proj = bpmem.fog.c_proj_fsel.proj;
328   uid_data->fog_RangeBaseEnabled = bpmem.fogRange.Base.Enabled;
329 
330   BlendingState state = {};
331   state.Generate(bpmem);
332 
333   if (state.usedualsrc && state.dstalpha && g_ActiveConfig.backend_info.bSupportsFramebufferFetch &&
334       !g_ActiveConfig.backend_info.bSupportsDualSourceBlend)
335   {
336     uid_data->blend_enable = state.blendenable;
337     uid_data->blend_src_factor = state.srcfactor;
338     uid_data->blend_src_factor_alpha = state.srcfactoralpha;
339     uid_data->blend_dst_factor = state.dstfactor;
340     uid_data->blend_dst_factor_alpha = state.dstfactoralpha;
341     uid_data->blend_subtract = state.subtract;
342     uid_data->blend_subtract_alpha = state.subtractAlpha;
343   }
344 
345   return out;
346 }
347 
ClearUnusedPixelShaderUidBits(APIType ApiType,const ShaderHostConfig & host_config,PixelShaderUid * uid)348 void ClearUnusedPixelShaderUidBits(APIType ApiType, const ShaderHostConfig& host_config,
349                                    PixelShaderUid* uid)
350 {
351   pixel_shader_uid_data* const uid_data = uid->GetUidData();
352 
353   // OpenGL and Vulkan convert implicitly normalized color outputs to their uint representation.
354   // Therefore, it is not necessary to use a uint output on these backends. We also disable the
355   // uint output when logic op is not supported (i.e. driver/device does not support D3D11.1).
356   if (ApiType != APIType::D3D || !host_config.backend_logic_op)
357     uid_data->uint_output = 0;
358 
359   // If bounding box is enabled when a UID cache is created, then later disabled, we shouldn't
360   // emit the bounding box portion of the shader.
361   uid_data->bounding_box &= host_config.bounding_box & host_config.backend_bbox;
362 }
363 
WritePixelShaderCommonHeader(ShaderCode & out,APIType ApiType,u32 num_texgens,const ShaderHostConfig & host_config,bool bounding_box)364 void WritePixelShaderCommonHeader(ShaderCode& out, APIType ApiType, u32 num_texgens,
365                                   const ShaderHostConfig& host_config, bool bounding_box)
366 {
367   // dot product for integer vectors
368   out.Write("int idot(int3 x, int3 y)\n"
369             "{\n"
370             "\tint3 tmp = x * y;\n"
371             "\treturn tmp.x + tmp.y + tmp.z;\n"
372             "}\n");
373 
374   out.Write("int idot(int4 x, int4 y)\n"
375             "{\n"
376             "\tint4 tmp = x * y;\n"
377             "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n"
378             "}\n\n");
379 
380   // rounding + casting to integer at once in a single function
381   out.Write("int  iround(float  x) { return int (round(x)); }\n"
382             "int2 iround(float2 x) { return int2(round(x)); }\n"
383             "int3 iround(float3 x) { return int3(round(x)); }\n"
384             "int4 iround(float4 x) { return int4(round(x)); }\n\n");
385 
386   if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
387   {
388     out.Write("SAMPLER_BINDING(0) uniform sampler2DArray samp[8];\n");
389   }
390   else  // D3D
391   {
392     // Declare samplers
393     out.Write("SamplerState samp[8] : register(s0);\n");
394     out.Write("\n");
395     out.Write("Texture2DArray Tex[8] : register(t0);\n");
396   }
397   out.Write("\n");
398 
399   if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
400     out.Write("UBO_BINDING(std140, 1) uniform PSBlock {\n");
401   else
402     out.Write("cbuffer PSBlock : register(b0) {\n");
403 
404   out.Write("\tint4 " I_COLORS "[4];\n"
405             "\tint4 " I_KCOLORS "[4];\n"
406             "\tint4 " I_ALPHA ";\n"
407             "\tfloat4 " I_TEXDIMS "[8];\n"
408             "\tint4 " I_ZBIAS "[2];\n"
409             "\tint4 " I_INDTEXSCALE "[2];\n"
410             "\tint4 " I_INDTEXMTX "[6];\n"
411             "\tint4 " I_FOGCOLOR ";\n"
412             "\tint4 " I_FOGI ";\n"
413             "\tfloat4 " I_FOGF ";\n"
414             "\tfloat4 " I_FOGRANGE "[3];\n"
415             "\tfloat4 " I_ZSLOPE ";\n"
416             "\tfloat2 " I_EFBSCALE ";\n"
417             "\tuint  bpmem_genmode;\n"
418             "\tuint  bpmem_alphaTest;\n"
419             "\tuint  bpmem_fogParam3;\n"
420             "\tuint  bpmem_fogRangeBase;\n"
421             "\tuint  bpmem_dstalpha;\n"
422             "\tuint  bpmem_ztex_op;\n"
423             "\tbool  bpmem_late_ztest;\n"
424             "\tbool  bpmem_rgba6_format;\n"
425             "\tbool  bpmem_dither;\n"
426             "\tbool  bpmem_bounding_box;\n"
427             "\tuint4 bpmem_pack1[16];\n"  // .xy - combiners, .z - tevind
428             "\tuint4 bpmem_pack2[8];\n"   // .x - tevorder, .y - tevksel
429             "\tint4  konstLookup[32];\n"
430             "\tbool  blend_enable;\n"
431             "\tuint  blend_src_factor;\n"
432             "\tuint  blend_src_factor_alpha;\n"
433             "\tuint  blend_dst_factor;\n"
434             "\tuint  blend_dst_factor_alpha;\n"
435             "\tbool  blend_subtract;\n"
436             "\tbool  blend_subtract_alpha;\n"
437             "};\n\n");
438   out.Write("#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)\n"
439             "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n"
440             "#define bpmem_iref(i) (bpmem_pack1[(i)].w)\n"
441             "#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)\n"
442             "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n\n");
443 
444   if (host_config.per_pixel_lighting)
445   {
446     out.Write("%s", s_lighting_struct);
447 
448     if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
449       out.Write("UBO_BINDING(std140, 2) uniform VSBlock {\n");
450     else
451       out.Write("cbuffer VSBlock : register(b1) {\n");
452 
453     out.Write(s_shader_uniforms);
454     out.Write("};\n");
455   }
456 
457   if (bounding_box)
458   {
459     out.Write(R"(
460 #ifdef API_D3D
461 globallycoherent RWBuffer<int> bbox_data : register(u2);
462 #define atomicMin InterlockedMin
463 #define atomicMax InterlockedMax
464 #define bbox_left bbox_data[0]
465 #define bbox_right bbox_data[1]
466 #define bbox_top bbox_data[2]
467 #define bbox_bottom bbox_data[3]
468 #else
469 SSBO_BINDING(0) buffer BBox {
470   int bbox_left, bbox_right, bbox_top, bbox_bottom;
471 };
472 #endif
473 
474 void UpdateBoundingBoxBuffer(int2 min_pos, int2 max_pos) {
475   if (bbox_left > min_pos.x)
476     atomicMin(bbox_left, min_pos.x);
477   if (bbox_right < max_pos.x)
478     atomicMax(bbox_right, max_pos.x);
479   if (bbox_top > min_pos.y)
480     atomicMin(bbox_top, min_pos.y);
481   if (bbox_bottom < max_pos.y)
482     atomicMax(bbox_bottom, max_pos.y);
483 }
484 
485 void UpdateBoundingBox(float2 rawpos) {
486   // The pixel center in the GameCube GPU is 7/12, not 0.5 (see VertexShaderGen.cpp)
487   // Adjust for this by unapplying the offset we added in the vertex shader.
488   const float PIXEL_CENTER_OFFSET = 7.0 / 12.0 - 0.5;
489   float2 offset = float2(PIXEL_CENTER_OFFSET, -PIXEL_CENTER_OFFSET);
490 
491 #ifdef API_OPENGL
492   // OpenGL lower-left origin means that Y goes in the opposite direction.
493   offset.y = -offset.y;
494 #endif
495 
496   // The rightmost shaded pixel is not included in the right bounding box register,
497   // such that width = right - left + 1. This has been verified on hardware.
498   int2 pos = iround(rawpos * cefbscale + offset);
499 
500 #ifdef SUPPORTS_SUBGROUP_REDUCTION
501   if (CAN_USE_SUBGROUP_REDUCTION) {
502     int2 min_pos = IS_HELPER_INVOCATION ? int2(2147483647, 2147483647) : pos;
503     int2 max_pos = IS_HELPER_INVOCATION ? int2(-2147483648, -2147483648) : pos;
504     SUBGROUP_MIN(min_pos);
505     SUBGROUP_MAX(max_pos);
506     if (IS_FIRST_ACTIVE_INVOCATION)
507       UpdateBoundingBoxBuffer(min_pos, max_pos);
508   } else {
509     UpdateBoundingBoxBuffer(pos, pos);
510   }
511 #else
512   UpdateBoundingBoxBuffer(pos, pos);
513 #endif
514 }
515 
516 )");
517   }
518 }
519 
// Forward declarations of the file-local (static) helpers. Each one takes the ShaderCode it
// appends to as its first parameter. Their definitions are not part of this section of the file
// - presumably they follow further down; verify before relying on any ordering.
static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n,
                       APIType ApiType, bool stereo);
static void WriteTevRegular(ShaderCode& out, const char* components, int bias, int op, int clamp,
                            int shift, bool alpha);
static void SampleTexture(ShaderCode& out, const char* texcoords, const char* texswap, int texmap,
                          bool stereo, APIType ApiType);
static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType ApiType,
                           bool per_pixel_depth, bool use_dual_source);
static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data);
static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
                       bool use_dual_source);
static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data);
532 
GeneratePixelShaderCode(APIType ApiType,const ShaderHostConfig & host_config,const pixel_shader_uid_data * uid_data)533 ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host_config,
534                                    const pixel_shader_uid_data* uid_data)
535 {
536   ShaderCode out;
537 
538   const bool per_pixel_lighting = g_ActiveConfig.bEnablePixelLighting;
539   const bool msaa = host_config.msaa;
540   const bool ssaa = host_config.ssaa;
541   const bool stereo = host_config.stereo;
542   const u32 numStages = uid_data->genMode_numtevstages + 1;
543 
544   out.Write("//Pixel Shader for TEV stages\n");
545   out.Write("//%i TEV stages, %i texgens, %i IND stages\n", numStages, uid_data->genMode_numtexgens,
546             uid_data->genMode_numindstages);
547 
548   // Stuff that is shared between ubershaders and pixelgen.
549   WritePixelShaderCommonHeader(out, ApiType, uid_data->genMode_numtexgens, host_config,
550                                uid_data->bounding_box);
551 
552   if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ)
553   {
554     // Zcomploc (aka early_ztest) is a way to control whether depth test is done before
555     // or after texturing and alpha test. PC graphics APIs used to provide no way to emulate
556     // this feature properly until 2012: Depth tests were always done after alpha testing.
557     // Most importantly, it was not possible to write to the depth buffer without also writing
558     // a color value (unless color writing was disabled altogether).
559 
560     // OpenGL 4.2 actually provides two extensions which can force an early z test:
561     //  * ARB_image_load_store has 'layout(early_fragment_tests)' which forces the driver to do z
562     //  and stencil tests early.
563     //  * ARB_conservative_depth has 'layout(depth_unchanged) which signals to the driver that it
564     //  can make optimisations
565     //    which assume the pixel shader won't update the depth buffer.
566 
567     // early_fragment_tests is the best option, as it requires the driver to do early-z and defines
568     // early-z exactly as
569     // we expect, with discard causing the shader to exit with only the depth buffer updated.
570 
571     // Conservative depth's 'depth_unchanged' only hints to the driver that an early-z optimisation
572     // can be made and
573     // doesn't define what will happen if we discard the fragment. But the way modern graphics
574     // hardware is implemented
575     // means it is not unreasonable to expect the same behaviour as early_fragment_tests.
576     // We can also assume that if a driver has gone out of its way to support conservative depth and
577     // not image_load_store
578     // as required by OpenGL 4.2 that it will be doing the optimisation.
579     // If the driver doesn't actually do an early z optimisation, ZCompLoc will be broken and depth
580     // will only be written
581     // if the alpha test passes.
582 
583     // We support Conservative as a fallback, because many drivers based on Mesa haven't implemented
584     // all of the
585     // ARB_image_load_store extension yet.
586 
587     // D3D11 also has a way to force the driver to enable early-z, so we're fine here.
588     if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
589     {
590       // This is a #define which signals whatever early-z method the driver supports.
591       out.Write("FORCE_EARLY_Z; \n");
592     }
593     else
594     {
595       out.Write("[earlydepthstencil]\n");
596     }
597   }
598 
599   // Only use dual-source blending when required on drivers that don't support it very well.
600   const bool use_dual_source =
601       host_config.backend_dual_source_blend &&
602       (!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING) ||
603        uid_data->useDstAlpha);
604   const bool use_shader_blend =
605       !use_dual_source && (uid_data->useDstAlpha && host_config.backend_shader_framebuffer_fetch);
606 
607   if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
608   {
609     if (use_dual_source)
610     {
611       if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
612       {
613         out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
614         out.Write("FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n");
615       }
616       else
617       {
618         out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n");
619         out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n");
620       }
621     }
622     else if (use_shader_blend)
623     {
624       // QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an
625       // intermediate value with multiple reads & modifications, so pull out the "real" output value
626       // and use a temporary for calculations, then set the output value once at the end of the
627       // shader
628       if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
629       {
630         out.Write("FRAGMENT_OUTPUT_LOCATION(0) FRAGMENT_INOUT vec4 real_ocol0;\n");
631       }
632       else
633       {
634         out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) FRAGMENT_INOUT vec4 real_ocol0;\n");
635       }
636     }
637     else
638     {
639       out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
640     }
641 
642     if (uid_data->per_pixel_depth)
643       out.Write("#define depth gl_FragDepth\n");
644 
645     if (host_config.backend_geometry_shaders)
646     {
647       out.Write("VARYING_LOCATION(0) in VertexData {\n");
648       GenerateVSOutputMembers(out, ApiType, uid_data->genMode_numtexgens, host_config,
649                               GetInterpolationQualifier(msaa, ssaa, true, true));
650 
651       if (stereo)
652         out.Write("\tflat int layer;\n");
653 
654       out.Write("};\n");
655     }
656     else
657     {
658       // Let's set up attributes
659       u32 counter = 0;
660       out.Write("VARYING_LOCATION(%u) %s in float4 colors_0;\n", counter++,
661                 GetInterpolationQualifier(msaa, ssaa));
662       out.Write("VARYING_LOCATION(%u) %s in float4 colors_1;\n", counter++,
663                 GetInterpolationQualifier(msaa, ssaa));
664       for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
665       {
666         out.Write("VARYING_LOCATION(%u) %s in float3 tex%d;\n", counter++,
667                   GetInterpolationQualifier(msaa, ssaa), i);
668       }
669       if (!host_config.fast_depth_calc)
670         out.Write("VARYING_LOCATION(%u) %s in float4 clipPos;\n", counter++,
671                   GetInterpolationQualifier(msaa, ssaa));
672       if (per_pixel_lighting)
673       {
674         out.Write("VARYING_LOCATION(%u) %s in float3 Normal;\n", counter++,
675                   GetInterpolationQualifier(msaa, ssaa));
676         out.Write("VARYING_LOCATION(%u) %s in float3 WorldPos;\n", counter++,
677                   GetInterpolationQualifier(msaa, ssaa));
678       }
679     }
680 
681     out.Write("void main()\n{\n");
682     out.Write("\tfloat4 rawpos = gl_FragCoord;\n");
683     if (use_shader_blend)
684     {
685       // Store off a copy of the initial fb value for blending
686       out.Write("\tfloat4 initial_ocol0 = FB_FETCH_VALUE;\n");
687       out.Write("\tfloat4 ocol0;\n");
688       out.Write("\tfloat4 ocol1;\n");
689     }
690   }
691   else  // D3D
692   {
693     out.Write("void main(\n");
694     if (uid_data->uint_output)
695     {
696       out.Write("  out uint4 ocol0 : SV_Target,\n");
697     }
698     else
699     {
700       out.Write("  out float4 ocol0 : SV_Target0,\n"
701                 "  out float4 ocol1 : SV_Target1,\n");
702     }
703     out.Write("%s"
704               "  in float4 rawpos : SV_Position,\n",
705               uid_data->per_pixel_depth ? "  out float depth : SV_Depth,\n" : "");
706 
707     out.Write("  in %s float4 colors_0 : COLOR0,\n", GetInterpolationQualifier(msaa, ssaa));
708     out.Write("  in %s float4 colors_1 : COLOR1\n", GetInterpolationQualifier(msaa, ssaa));
709 
710     // compute window position if needed because binding semantic WPOS is not widely supported
711     for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
712     {
713       out.Write(",\n  in %s float3 tex%d : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa), i,
714                 i);
715     }
716     if (!host_config.fast_depth_calc)
717     {
718       out.Write(",\n  in %s float4 clipPos : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa),
719                 uid_data->genMode_numtexgens);
720     }
721     if (per_pixel_lighting)
722     {
723       out.Write(",\n  in %s float3 Normal : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa),
724                 uid_data->genMode_numtexgens + 1);
725       out.Write(",\n  in %s float3 WorldPos : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa),
726                 uid_data->genMode_numtexgens + 2);
727     }
728     if (host_config.backend_geometry_shaders)
729     {
730       out.Write(",\n  in float clipDist0 : SV_ClipDistance0\n");
731       out.Write(",\n  in float clipDist1 : SV_ClipDistance1\n");
732     }
733     if (stereo)
734       out.Write(",\n  in uint layer : SV_RenderTargetArrayIndex\n");
735     out.Write("        ) {\n");
736   }
737 
738   out.Write("\tint4 c0 = " I_COLORS "[1], c1 = " I_COLORS "[2], c2 = " I_COLORS
739             "[3], prev = " I_COLORS "[0];\n"
740             "\tint4 rastemp = int4(0, 0, 0, 0), textemp = int4(0, 0, 0, 0), konsttemp = int4(0, 0, "
741             "0, 0);\n"
742             "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
743             "\tint alphabump=0;\n"
744             "\tint3 tevcoord=int3(0, 0, 0);\n"
745             "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
746             "\tint4 "
747             "tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,"
748             "0);\n\n");  // tev combiner inputs
749 
750   // On GLSL, input variables must not be assigned to.
751   // This is why we declare these variables locally instead.
752   out.Write("\tfloat4 col0 = colors_0;\n");
753   out.Write("\tfloat4 col1 = colors_1;\n");
754 
755   if (per_pixel_lighting)
756   {
757     out.Write("\tfloat3 _norm0 = normalize(Normal.xyz);\n\n");
758     out.Write("\tfloat3 pos = WorldPos;\n");
759 
760     out.Write("\tint4 lacc;\n"
761               "\tfloat3 ldir, h, cosAttn, distAttn;\n"
762               "\tfloat dist, dist2, attn;\n");
763 
764     // TODO: Our current constant usage code isn't able to handle more than one buffer.
765     //       So we can't mark the VS constant as used here. But keep them here as reference.
766     // out.SetConstantsUsed(C_PLIGHT_COLORS, C_PLIGHT_COLORS+7); // TODO: Can be optimized further
767     // out.SetConstantsUsed(C_PLIGHTS, C_PLIGHTS+31); // TODO: Can be optimized further
768     // out.SetConstantsUsed(C_PMATERIALS, C_PMATERIALS+3);
769     GenerateLightingShaderCode(out, uid_data->lighting, uid_data->components << VB_COL_SHIFT,
770                                "colors_", "col");
771   }
772 
773   // HACK to handle cases where the tex gen is not enabled
774   if (uid_data->genMode_numtexgens == 0)
775   {
776     out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
777   }
778   else
779   {
780     out.SetConstantsUsed(C_TEXDIMS, C_TEXDIMS + uid_data->genMode_numtexgens - 1);
781     for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
782     {
783       out.Write("\tint2 fixpoint_uv%d = int2(", i);
784       out.Write("(tex%d.z == 0.0 ? tex%d.xy : tex%d.xy / tex%d.z)", i, i, i, i);
785       out.Write(" * " I_TEXDIMS "[%d].zw);\n", i);
786       // TODO: S24 overflows here?
787     }
788   }
789 
790   for (u32 i = 0; i < uid_data->genMode_numindstages; ++i)
791   {
792     if (uid_data->nIndirectStagesUsed & (1 << i))
793     {
794       unsigned int texcoord = uid_data->GetTevindirefCoord(i);
795       unsigned int texmap = uid_data->GetTevindirefMap(i);
796 
797       if (texcoord < uid_data->genMode_numtexgens)
798       {
799         out.SetConstantsUsed(C_INDTEXSCALE + i / 2, C_INDTEXSCALE + i / 2);
800         out.Write("\ttempcoord = fixpoint_uv%d >> " I_INDTEXSCALE "[%d].%s;\n", texcoord, i / 2,
801                   (i & 1) ? "zw" : "xy");
802       }
803       else
804       {
805         out.Write("\ttempcoord = int2(0, 0);\n");
806       }
807 
808       out.Write("\tint3 iindtex%d = ", i);
809       SampleTexture(out, "float2(tempcoord)", "abg", texmap, stereo, ApiType);
810     }
811   }
812 
813   for (u32 i = 0; i < numStages; i++)
814   {
815     // Build the equation for this stage
816     WriteStage(out, uid_data, i, ApiType, stereo);
817   }
818 
819   {
820     // The results of the last texenv stage are put onto the screen,
821     // regardless of the used destination register
822     TevStageCombiner::ColorCombiner last_cc;
823     TevStageCombiner::AlphaCombiner last_ac;
824     last_cc.hex = uid_data->stagehash[uid_data->genMode_numtevstages].cc;
825     last_ac.hex = uid_data->stagehash[uid_data->genMode_numtevstages].ac;
826     if (last_cc.dest != 0)
827     {
828       out.Write("\tprev.rgb = %s;\n", tev_c_output_table[last_cc.dest]);
829     }
830     if (last_ac.dest != 0)
831     {
832       out.Write("\tprev.a = %s;\n", tev_a_output_table[last_ac.dest]);
833     }
834   }
835   out.Write("\tprev = prev & 255;\n");
836 
837   // NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
838   // (in this case we need to write a depth value if depth test passes regardless of the alpha
839   // testing result)
840   if (uid_data->Pretest == AlphaTest::UNDETERMINED ||
841       (uid_data->Pretest == AlphaTest::FAIL && uid_data->late_ztest))
842   {
843     WriteAlphaTest(out, uid_data, ApiType, uid_data->per_pixel_depth,
844                    use_dual_source || use_shader_blend);
845   }
846 
847   if (uid_data->zfreeze)
848   {
849     out.SetConstantsUsed(C_ZSLOPE, C_ZSLOPE);
850     out.SetConstantsUsed(C_EFBSCALE, C_EFBSCALE);
851 
852     out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE ".xy;\n");
853 
854     // Opengl has reversed vertical screenspace coordinates
855     if (ApiType == APIType::OpenGL)
856       out.Write("\tscreenpos.y = %i.0 - screenpos.y;\n", EFB_HEIGHT);
857 
858     out.Write("\tint zCoord = int(" I_ZSLOPE ".z + " I_ZSLOPE ".x * screenpos.x + " I_ZSLOPE
859               ".y * screenpos.y);\n");
860   }
861   else if (!host_config.fast_depth_calc)
862   {
863     // FastDepth means to trust the depth generated in perspective division.
864     // It should be correct, but it seems not to be as accurate as required. TODO: Find out why!
865     // For disabled FastDepth we just calculate the depth value again.
866     // The performance impact of this additional calculation doesn't matter, but it prevents
867     // the host GPU driver from performing any early depth test optimizations.
868     out.SetConstantsUsed(C_ZBIAS + 1, C_ZBIAS + 1);
869     // the screen space depth value = far z + (clip z / clip w) * z range
870     out.Write("\tint zCoord = " I_ZBIAS "[1].x + int((clipPos.z / clipPos.w) * float(" I_ZBIAS
871               "[1].y));\n");
872   }
873   else
874   {
875     if (!host_config.backend_reversed_depth_range)
876       out.Write("\tint zCoord = int((1.0 - rawpos.z) * 16777216.0);\n");
877     else
878       out.Write("\tint zCoord = int(rawpos.z * 16777216.0);\n");
879   }
880   out.Write("\tzCoord = clamp(zCoord, 0, 0xFFFFFF);\n");
881 
882   // depth texture can safely be ignored if the result won't be written to the depth buffer
883   // (early_ztest) and isn't used for fog either
884   const bool skip_ztexture = !uid_data->per_pixel_depth && !uid_data->fog_fsel;
885 
886   // Note: z-textures are not written to depth buffer if early depth test is used
887   if (uid_data->per_pixel_depth && uid_data->early_ztest)
888   {
889     if (!host_config.backend_reversed_depth_range)
890       out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
891     else
892       out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
893   }
894 
895   // Note: depth texture output is only written to depth buffer if late depth test is used
896   // theoretical final depth value is used for fog calculation, though, so we have to emulate
897   // ztextures anyway
898   if (uid_data->ztex_op != ZTEXTURE_DISABLE && !skip_ztexture)
899   {
900     // use the texture input of the last texture stage (textemp), hopefully this has been read and
901     // is in correct format...
902     out.SetConstantsUsed(C_ZBIAS, C_ZBIAS + 1);
903     out.Write("\tzCoord = idot(" I_ZBIAS "[0].xyzw, textemp.xyzw) + " I_ZBIAS "[1].w %s;\n",
904               (uid_data->ztex_op == ZTEXTURE_ADD) ? "+ zCoord" : "");
905     out.Write("\tzCoord = zCoord & 0xFFFFFF;\n");
906   }
907 
908   if (uid_data->per_pixel_depth && uid_data->late_ztest)
909   {
910     if (!host_config.backend_reversed_depth_range)
911       out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
912     else
913       out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
914   }
915 
916   // No dithering for RGB8 mode
917   if (uid_data->dither)
918   {
919     // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering
920     // Here the matrix is encoded into the two factor constants
921     out.Write("\tint2 dither = int2(rawpos.xy) & 1;\n");
922     out.Write("\tprev.rgb = (prev.rgb - (prev.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2);\n");
923   }
924 
925   WriteFog(out, uid_data);
926 
927   // Write the color and alpha values to the framebuffer
928   // If using shader blend, we still use the separate alpha
929   WriteColor(out, ApiType, uid_data, use_dual_source || use_shader_blend);
930 
931   if (use_shader_blend)
932     WriteBlend(out, uid_data);
933 
934   if (uid_data->bounding_box)
935     out.Write("\tUpdateBoundingBox(rawpos.xy);\n");
936 
937   out.Write("}\n");
938 
939   return out;
940 }
941 
WriteStage(ShaderCode & out,const pixel_shader_uid_data * uid_data,int n,APIType ApiType,bool stereo)942 static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n,
943                        APIType ApiType, bool stereo)
944 {
945   auto& stage = uid_data->stagehash[n];
946   out.Write("\n\t// TEV stage %d\n", n);
947 
948   // HACK to handle cases where the tex gen is not enabled
949   u32 texcoord = stage.tevorders_texcoord;
950   bool bHasTexCoord = texcoord < uid_data->genMode_numtexgens;
951   if (!bHasTexCoord)
952     texcoord = 0;
953 
954   if (stage.hasindstage)
955   {
956     TevStageIndirect tevind;
957     tevind.hex = stage.tevind;
958 
959     out.Write("\t// indirect op\n");
960     // perform the indirect op on the incoming regular coordinates using iindtex%d as the offset
961     // coords
962     if (tevind.bs != ITBA_OFF)
963     {
964       constexpr std::array<const char*, 4> tev_ind_alpha_sel{
965           "",
966           "x",
967           "y",
968           "z",
969       };
970 
971       // 0b11111000, 0b11100000, 0b11110000, 0b11111000
972       constexpr std::array<const char*, 4> tev_ind_alpha_mask{
973           "248",
974           "224",
975           "240",
976           "248",
977       };
978 
979       out.Write("alphabump = iindtex%d.%s & %s;\n", tevind.bt.Value(), tev_ind_alpha_sel[tevind.bs],
980                 tev_ind_alpha_mask[tevind.fmt]);
981     }
982     else
983     {
984       // TODO: Should we reset alphabump to 0 here?
985     }
986 
987     if (tevind.mid != 0)
988     {
989       // format
990       constexpr std::array<const char*, 4> tev_ind_fmt_mask{
991           "255",
992           "31",
993           "15",
994           "7",
995       };
996       out.Write("\tint3 iindtevcrd%d = iindtex%d & %s;\n", n, tevind.bt.Value(),
997                 tev_ind_fmt_mask[tevind.fmt]);
998 
999       // bias - TODO: Check if this needs to be this complicated...
1000       // indexed by bias
1001       constexpr std::array<const char*, 8> tev_ind_bias_field{
1002           "", "x", "y", "xy", "z", "xz", "yz", "xyz",
1003       };
1004 
1005       // indexed by fmt
1006       constexpr std::array<const char*, 4> tev_ind_bias_add{
1007           "-128",
1008           "1",
1009           "1",
1010           "1",
1011       };
1012 
1013       if (tevind.bias == ITB_S || tevind.bias == ITB_T || tevind.bias == ITB_U)
1014       {
1015         out.Write("\tiindtevcrd%d.%s += int(%s);\n", n, tev_ind_bias_field[tevind.bias],
1016                   tev_ind_bias_add[tevind.fmt]);
1017       }
1018       else if (tevind.bias == ITB_ST || tevind.bias == ITB_SU || tevind.bias == ITB_TU)
1019       {
1020         out.Write("\tiindtevcrd%d.%s += int2(%s, %s);\n", n, tev_ind_bias_field[tevind.bias],
1021                   tev_ind_bias_add[tevind.fmt], tev_ind_bias_add[tevind.fmt]);
1022       }
1023       else if (tevind.bias == ITB_STU)
1024       {
1025         out.Write("\tiindtevcrd%d.%s += int3(%s, %s, %s);\n", n, tev_ind_bias_field[tevind.bias],
1026                   tev_ind_bias_add[tevind.fmt], tev_ind_bias_add[tevind.fmt],
1027                   tev_ind_bias_add[tevind.fmt]);
1028       }
1029 
1030       // multiply by offset matrix and scale - calculations are likely to overflow badly,
1031       // yet it works out since we only care about the lower 23 bits (+1 sign bit) of the result
1032       if (tevind.mid <= 3)
1033       {
1034         int mtxidx = 2 * (tevind.mid - 1);
1035         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
1036 
1037         out.Write("\tint2 indtevtrans%d = int2(idot(" I_INDTEXMTX
1038                   "[%d].xyz, iindtevcrd%d), idot(" I_INDTEXMTX "[%d].xyz, iindtevcrd%d)) >> 3;\n",
1039                   n, mtxidx, n, mtxidx + 1, n);
1040 
1041         // TODO: should use a shader uid branch for this for better performance
1042         if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
1043         {
1044           out.Write("\tint indtexmtx_w_inverse_%d = -" I_INDTEXMTX "[%d].w;\n", n, mtxidx);
1045           out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1046                     mtxidx, n, mtxidx);
1047           out.Write("\telse indtevtrans%d <<= indtexmtx_w_inverse_%d;\n", n, n);
1048         }
1049         else
1050         {
1051           out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1052                     mtxidx, n, mtxidx);
1053           out.Write("\telse indtevtrans%d <<= (-" I_INDTEXMTX "[%d].w);\n", n, mtxidx);
1054         }
1055       }
1056       else if (tevind.mid <= 7 && bHasTexCoord)
1057       {  // s matrix
1058         ASSERT(tevind.mid >= 5);
1059         int mtxidx = 2 * (tevind.mid - 5);
1060         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
1061 
1062         out.Write("\tint2 indtevtrans%d = int2(fixpoint_uv%d * iindtevcrd%d.xx) >> 8;\n", n,
1063                   texcoord, n);
1064         if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
1065         {
1066           out.Write("\tint  indtexmtx_w_inverse_%d = -" I_INDTEXMTX "[%d].w;\n", n, mtxidx);
1067           out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1068                     mtxidx, n, mtxidx);
1069           out.Write("\telse indtevtrans%d <<= (indtexmtx_w_inverse_%d);\n", n, n);
1070         }
1071         else
1072         {
1073           out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1074                     mtxidx, n, mtxidx);
1075           out.Write("\telse indtevtrans%d <<= (-" I_INDTEXMTX "[%d].w);\n", n, mtxidx);
1076         }
1077       }
1078       else if (tevind.mid <= 11 && bHasTexCoord)
1079       {  // t matrix
1080         ASSERT(tevind.mid >= 9);
1081         int mtxidx = 2 * (tevind.mid - 9);
1082         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
1083 
1084         out.Write("\tint2 indtevtrans%d = int2(fixpoint_uv%d * iindtevcrd%d.yy) >> 8;\n", n,
1085                   texcoord, n);
1086 
1087         if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
1088         {
1089           out.Write("\tint  indtexmtx_w_inverse_%d = -" I_INDTEXMTX "[%d].w;\n", n, mtxidx);
1090           out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1091                     mtxidx, n, mtxidx);
1092           out.Write("\telse indtevtrans%d <<= (indtexmtx_w_inverse_%d);\n", n, n);
1093         }
1094         else
1095         {
1096           out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1097                     mtxidx, n, mtxidx);
1098           out.Write("\telse indtevtrans%d <<= (-" I_INDTEXMTX "[%d].w);\n", n, mtxidx);
1099         }
1100       }
1101       else
1102       {
1103         out.Write("\tint2 indtevtrans%d = int2(0, 0);\n", n);
1104       }
1105     }
1106     else
1107     {
1108       out.Write("\tint2 indtevtrans%d = int2(0, 0);\n", n);
1109     }
1110 
1111     // ---------
1112     // Wrapping
1113     // ---------
1114 
1115     // TODO: Should the last element be 1 or (1<<7)?
1116     constexpr std::array<const char*, 7> tev_ind_wrap_start{
1117         "0", "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)", "1",
1118     };
1119 
1120     // wrap S
1121     if (tevind.sw == ITW_OFF)
1122     {
1123       out.Write("\twrappedcoord.x = fixpoint_uv%d.x;\n", texcoord);
1124     }
1125     else if (tevind.sw == ITW_0)
1126     {
1127       out.Write("\twrappedcoord.x = 0;\n");
1128     }
1129     else
1130     {
1131       out.Write("\twrappedcoord.x = fixpoint_uv%d.x & (%s - 1);\n", texcoord,
1132                 tev_ind_wrap_start[tevind.sw]);
1133     }
1134 
1135     // wrap T
1136     if (tevind.tw == ITW_OFF)
1137     {
1138       out.Write("\twrappedcoord.y = fixpoint_uv%d.y;\n", texcoord);
1139     }
1140     else if (tevind.tw == ITW_0)
1141     {
1142       out.Write("\twrappedcoord.y = 0;\n");
1143     }
1144     else
1145     {
1146       out.Write("\twrappedcoord.y = fixpoint_uv%d.y & (%s - 1);\n", texcoord,
1147                 tev_ind_wrap_start[tevind.tw]);
1148     }
1149 
1150     if (tevind.fb_addprev)  // add previous tevcoord
1151       out.Write("\ttevcoord.xy += wrappedcoord + indtevtrans%d;\n", n);
1152     else
1153       out.Write("\ttevcoord.xy = wrappedcoord + indtevtrans%d;\n", n);
1154 
1155     // Emulate s24 overflows
1156     out.Write("\ttevcoord.xy = (tevcoord.xy << 8) >> 8;\n");
1157   }
1158 
1159   TevStageCombiner::ColorCombiner cc;
1160   TevStageCombiner::AlphaCombiner ac;
1161   cc.hex = stage.cc;
1162   ac.hex = stage.ac;
1163 
1164   if (cc.a == TEVCOLORARG_RASA || cc.a == TEVCOLORARG_RASC || cc.b == TEVCOLORARG_RASA ||
1165       cc.b == TEVCOLORARG_RASC || cc.c == TEVCOLORARG_RASA || cc.c == TEVCOLORARG_RASC ||
1166       cc.d == TEVCOLORARG_RASA || cc.d == TEVCOLORARG_RASC || ac.a == TEVALPHAARG_RASA ||
1167       ac.b == TEVALPHAARG_RASA || ac.c == TEVALPHAARG_RASA || ac.d == TEVALPHAARG_RASA)
1168   {
1169     // Generate swizzle string to represent the Ras color channel swapping
1170     const char rasswap[5] = {
1171         "rgba"[stage.tevksel_swap1a],
1172         "rgba"[stage.tevksel_swap2a],
1173         "rgba"[stage.tevksel_swap1b],
1174         "rgba"[stage.tevksel_swap2b],
1175         '\0',
1176     };
1177 
1178     out.Write("\trastemp = %s.%s;\n", tev_ras_table[stage.tevorders_colorchan], rasswap);
1179   }
1180 
1181   if (stage.tevorders_enable)
1182   {
1183     // Generate swizzle string to represent the texture color channel swapping
1184     const char texswap[5] = {
1185         "rgba"[stage.tevksel_swap1c],
1186         "rgba"[stage.tevksel_swap2c],
1187         "rgba"[stage.tevksel_swap1d],
1188         "rgba"[stage.tevksel_swap2d],
1189         '\0',
1190     };
1191 
1192     if (!stage.hasindstage)
1193     {
1194       // calc tevcord
1195       if (bHasTexCoord)
1196         out.Write("\ttevcoord.xy = fixpoint_uv%d;\n", texcoord);
1197       else
1198         out.Write("\ttevcoord.xy = int2(0, 0);\n");
1199     }
1200     out.Write("\ttextemp = ");
1201     SampleTexture(out, "float2(tevcoord.xy)", texswap, stage.tevorders_texmap, stereo, ApiType);
1202   }
1203   else
1204   {
1205     out.Write("\ttextemp = int4(255, 255, 255, 255);\n");
1206   }
1207 
1208   if (cc.a == TEVCOLORARG_KONST || cc.b == TEVCOLORARG_KONST || cc.c == TEVCOLORARG_KONST ||
1209       cc.d == TEVCOLORARG_KONST || ac.a == TEVALPHAARG_KONST || ac.b == TEVALPHAARG_KONST ||
1210       ac.c == TEVALPHAARG_KONST || ac.d == TEVALPHAARG_KONST)
1211   {
1212     out.Write("\tkonsttemp = int4(%s, %s);\n", tev_ksel_table_c[stage.tevksel_kc],
1213               tev_ksel_table_a[stage.tevksel_ka]);
1214 
1215     if (stage.tevksel_kc > 7)
1216     {
1217       out.SetConstantsUsed(C_KCOLORS + ((stage.tevksel_kc - 0xc) % 4),
1218                            C_KCOLORS + ((stage.tevksel_kc - 0xc) % 4));
1219     }
1220     if (stage.tevksel_ka > 7)
1221     {
1222       out.SetConstantsUsed(C_KCOLORS + ((stage.tevksel_ka - 0xc) % 4),
1223                            C_KCOLORS + ((stage.tevksel_ka - 0xc) % 4));
1224     }
1225   }
1226 
1227   if (cc.d == TEVCOLORARG_C0 || cc.d == TEVCOLORARG_A0 || ac.d == TEVALPHAARG_A0)
1228     out.SetConstantsUsed(C_COLORS + 1, C_COLORS + 1);
1229 
1230   if (cc.d == TEVCOLORARG_C1 || cc.d == TEVCOLORARG_A1 || ac.d == TEVALPHAARG_A1)
1231     out.SetConstantsUsed(C_COLORS + 2, C_COLORS + 2);
1232 
1233   if (cc.d == TEVCOLORARG_C2 || cc.d == TEVCOLORARG_A2 || ac.d == TEVALPHAARG_A2)
1234     out.SetConstantsUsed(C_COLORS + 3, C_COLORS + 3);
1235 
1236   if (cc.dest >= GX_TEVREG0)
1237     out.SetConstantsUsed(C_COLORS + cc.dest, C_COLORS + cc.dest);
1238 
1239   if (ac.dest >= GX_TEVREG0)
1240     out.SetConstantsUsed(C_COLORS + ac.dest, C_COLORS + ac.dest);
1241 
1242   out.Write("\ttevin_a = int4(%s, %s)&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.a],
1243             tev_a_input_table[ac.a]);
1244   out.Write("\ttevin_b = int4(%s, %s)&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.b],
1245             tev_a_input_table[ac.b]);
1246   out.Write("\ttevin_c = int4(%s, %s)&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.c],
1247             tev_a_input_table[ac.c]);
1248   out.Write("\ttevin_d = int4(%s, %s);\n", tev_c_input_table[cc.d], tev_a_input_table[ac.d]);
1249 
1250   out.Write("\t// color combine\n");
1251   out.Write("\t%s = clamp(", tev_c_output_table[cc.dest]);
1252   if (cc.bias != TEVBIAS_COMPARE)
1253   {
1254     WriteTevRegular(out, "rgb", cc.bias, cc.op, cc.clamp, cc.shift, false);
1255   }
1256   else
1257   {
1258     constexpr std::array<const char*, 8> function_table{
1259         "((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))",   // TEVCMP_R8_GT
1260         "((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))",  // TEVCMP_R8_EQ
1261         "((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : "
1262         "int3(0,0,0))",  // TEVCMP_GR16_GT
1263         "((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : "
1264         "int3(0,0,0))",  // TEVCMP_GR16_EQ
1265         "((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : "
1266         "int3(0,0,0))",  // TEVCMP_BGR24_GT
1267         "((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : "
1268         "int3(0,0,0))",                                                         // TEVCMP_BGR24_EQ
1269         "(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)",    // TEVCMP_RGB8_GT
1270         "((int3(1,1,1) - sign(abs(tevin_a.rgb - tevin_b.rgb))) * tevin_c.rgb)"  // TEVCMP_RGB8_EQ
1271     };
1272 
1273     const int mode = (cc.shift << 1) | cc.op;
1274     out.Write("   tevin_d.rgb + ");
1275     out.Write("%s", function_table[mode]);
1276   }
1277   if (cc.clamp)
1278     out.Write(", int3(0,0,0), int3(255,255,255))");
1279   else
1280     out.Write(", int3(-1024,-1024,-1024), int3(1023,1023,1023))");
1281   out.Write(";\n");
1282 
1283   out.Write("\t// alpha combine\n");
1284   out.Write("\t%s = clamp(", tev_a_output_table[ac.dest]);
1285   if (ac.bias != TEVBIAS_COMPARE)
1286   {
1287     WriteTevRegular(out, "a", ac.bias, ac.op, ac.clamp, ac.shift, true);
1288   }
1289   else
1290   {
1291     constexpr std::array<const char*, 8> function_table{
1292         "((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)",   // TEVCMP_R8_GT
1293         "((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)",  // TEVCMP_R8_EQ
1294         "((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)",  // TEVCMP_GR16_GT
1295         "((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)",  // TEVCMP_GR16_EQ
1296         "((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)",  // TEVCMP_BGR24_GT
1297         "((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)",  // TEVCMP_BGR24_EQ
1298         "((tevin_a.a >  tevin_b.a) ? tevin_c.a : 0)",  // TEVCMP_A8_GT
1299         "((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)"   // TEVCMP_A8_EQ
1300     };
1301 
1302     const int mode = (ac.shift << 1) | ac.op;
1303     out.Write("   tevin_d.a + ");
1304     out.Write("%s", function_table[mode]);
1305   }
1306   if (ac.clamp)
1307     out.Write(", 0, 255)");
1308   else
1309     out.Write(", -1024, 1023)");
1310 
1311   out.Write(";\n");
1312 }
1313 
WriteTevRegular(ShaderCode & out,const char * components,int bias,int op,int clamp,int shift,bool alpha)1314 static void WriteTevRegular(ShaderCode& out, const char* components, int bias, int op, int clamp,
1315                             int shift, bool alpha)
1316 {
1317   constexpr std::array<const char*, 4> tev_scale_table_left{
1318       "",       // SCALE_1
1319       " << 1",  // SCALE_2
1320       " << 2",  // SCALE_4
1321       "",       // DIVIDE_2
1322   };
1323 
1324   constexpr std::array<const char*, 4> tev_scale_table_right{
1325       "",       // SCALE_1
1326       "",       // SCALE_2
1327       "",       // SCALE_4
1328       " >> 1",  // DIVIDE_2
1329   };
1330 
1331   // indexed by 2*op+(shift==3)
1332   constexpr std::array<const char*, 4> tev_lerp_bias{
1333       "",
1334       " + 128",
1335       "",
1336       " + 127",
1337   };
1338 
1339   constexpr std::array<const char*, 4> tev_bias_table{
1340       "",        // ZERO,
1341       " + 128",  // ADDHALF,
1342       " - 128",  // SUBHALF,
1343       "",
1344   };
1345 
1346   constexpr std::array<char, 2> tev_op_table{
1347       '+',  // TEVOP_ADD = 0,
1348       '-',  // TEVOP_SUB = 1,
1349   };
1350 
1351   // Regular TEV stage: (d + bias + lerp(a,b,c)) * scale
1352   // The GameCube/Wii GPU uses a very sophisticated algorithm for scale-lerping:
1353   // - c is scaled from 0..255 to 0..256, which allows dividing the result by 256 instead of 255
1354   // - if scale is bigger than one, it is moved inside the lerp calculation for increased accuracy
1355   // - a rounding bias is added before dividing by 256
1356   out.Write("(((tevin_d.%s%s)%s)", components, tev_bias_table[bias], tev_scale_table_left[shift]);
1357   out.Write(" %c ", tev_op_table[op]);
1358   out.Write("(((((tevin_a.%s<<8) + (tevin_b.%s-tevin_a.%s)*(tevin_c.%s+(tevin_c.%s>>7)))%s)%s)>>8)",
1359             components, components, components, components, components, tev_scale_table_left[shift],
1360             tev_lerp_bias[2 * op + ((shift == 3) == alpha)]);
1361   out.Write(")%s", tev_scale_table_right[shift]);
1362 }
1363 
SampleTexture(ShaderCode & out,const char * texcoords,const char * texswap,int texmap,bool stereo,APIType ApiType)1364 static void SampleTexture(ShaderCode& out, const char* texcoords, const char* texswap, int texmap,
1365                           bool stereo, APIType ApiType)
1366 {
1367   out.SetConstantsUsed(C_TEXDIMS + texmap, C_TEXDIMS + texmap);
1368 
1369   if (ApiType == APIType::D3D)
1370   {
1371     out.Write("iround(255.0 * Tex[%d].Sample(samp[%d], float3(%s.xy * " I_TEXDIMS
1372               "[%d].xy, %s))).%s;\n",
1373               texmap, texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap);
1374   }
1375   else
1376   {
1377     out.Write("iround(255.0 * texture(samp[%d], float3(%s.xy * " I_TEXDIMS "[%d].xy, %s))).%s;\n",
1378               texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap);
1379   }
1380 }
1381 
// Alpha test comparison expressions, indexed by the compare function.
// Each entry is used as a format string by WriteAlphaTest; the "%s" placeholder receives the
// alpha reference value the fragment alpha (prev.a) is compared against.
constexpr std::array<const char*, 8> tev_alpha_funcs_table = {
    "(false)",         // 0: NEVER
    "(prev.a <  %s)",  // 1: LESS
    "(prev.a == %s)",  // 2: EQUAL
    "(prev.a <= %s)",  // 3: LEQUAL
    "(prev.a >  %s)",  // 4: GREATER
    "(prev.a != %s)",  // 5: NEQUAL
    "(prev.a >= %s)",  // 6: GEQUAL
    "(true)",          // 7: ALWAYS
};
1392 
// Operator that joins the two alpha test comparisons, indexed by the alpha test logic op.
// On boolean operands "!=" acts as xor and "==" as xnor.
constexpr std::array<const char*, 4> tev_alpha_funclogic_table = {
    " && ",  // 0: and
    " || ",  // 1: or
    " != ",  // 2: xor
    " == ",  // 3: xnor
};
1399 
WriteAlphaTest(ShaderCode & out,const pixel_shader_uid_data * uid_data,APIType ApiType,bool per_pixel_depth,bool use_dual_source)1400 static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType ApiType,
1401                            bool per_pixel_depth, bool use_dual_source)
1402 {
1403   static constexpr std::array<const char*, 2> alpha_ref{I_ALPHA ".r", I_ALPHA ".g"};
1404 
1405   out.SetConstantsUsed(C_ALPHA, C_ALPHA);
1406 
1407   if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_NEGATED_BOOLEAN))
1408     out.Write("\tif(( ");
1409   else
1410     out.Write("\tif(!( ");
1411 
1412   // Lookup the first component from the alpha function table
1413   int compindex = uid_data->alpha_test_comp0;
1414   out.Write(tev_alpha_funcs_table[compindex], alpha_ref[0]);
1415 
1416   // Lookup the logic op
1417   out.Write("%s", tev_alpha_funclogic_table[uid_data->alpha_test_logic]);
1418 
1419   // Lookup the second component from the alpha function table
1420   compindex = uid_data->alpha_test_comp1;
1421   out.Write(tev_alpha_funcs_table[compindex], alpha_ref[1]);
1422 
1423   if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_NEGATED_BOOLEAN))
1424     out.Write(") == false) {\n");
1425   else
1426     out.Write(")) {\n");
1427 
1428   out.Write("\t\tocol0 = float4(0.0, 0.0, 0.0, 0.0);\n");
1429   if (use_dual_source && !(ApiType == APIType::D3D && uid_data->uint_output))
1430     out.Write("\t\tocol1 = float4(0.0, 0.0, 0.0, 0.0);\n");
1431   if (per_pixel_depth)
1432   {
1433     out.Write("\t\tdepth = %s;\n",
1434               !g_ActiveConfig.backend_info.bSupportsReversedDepthRange ? "0.0" : "1.0");
1435   }
1436 
1437   // ZCOMPLOC HACK:
1438   if (!uid_data->alpha_test_use_zcomploc_hack)
1439   {
1440     out.Write("\t\tdiscard;\n");
1441     if (ApiType == APIType::D3D)
1442       out.Write("\t\treturn;\n");
1443   }
1444 
1445   out.Write("\t}\n");
1446 }
1447 
// Shader statements that reshape the linear "fog" factor, indexed by the fog type (fog_fsel).
// WriteFog only emits entries 4..7; the lower entries are empty.
constexpr std::array<const char*, 8> tev_fog_funcs_table = {
    "",                                                        // 0: No Fog
    "",                                                        // 1: ?
    "",                                                        // 2: Linear
    "",                                                        // 3: ?
    "\tfog = 1.0 - exp2(-8.0 * fog);\n",                       // 4: exp
    "\tfog = 1.0 - exp2(-8.0 * fog * fog);\n",                 // 5: exp2
    "\tfog = exp2(-8.0 * (1.0 - fog));\n",                     // 6: backward exp
    "\tfog = 1.0 - fog;\n   fog = exp2(-8.0 * fog * fog);\n",  // 7: backward exp2
};
1458 
WriteFog(ShaderCode & out,const pixel_shader_uid_data * uid_data)1459 static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data)
1460 {
1461   if (uid_data->fog_fsel == 0)
1462     return;  // no Fog
1463 
1464   out.SetConstantsUsed(C_FOGCOLOR, C_FOGCOLOR);
1465   out.SetConstantsUsed(C_FOGI, C_FOGI);
1466   out.SetConstantsUsed(C_FOGF, C_FOGF + 1);
1467   if (uid_data->fog_proj == 0)
1468   {
1469     // perspective
1470     // ze = A/(B - (Zs >> B_SHF)
1471     // TODO: Verify that we want to drop lower bits here! (currently taken over from software
1472     // renderer)
1473     //       Maybe we want to use "ze = (A << B_SHF)/((B << B_SHF) - Zs)" instead?
1474     //       That's equivalent, but keeps the lower bits of Zs.
1475     out.Write("\tfloat ze = (" I_FOGF ".x * 16777216.0) / float(" I_FOGI ".y - (zCoord >> " I_FOGI
1476               ".w));\n");
1477   }
1478   else
1479   {
1480     // orthographic
1481     // ze = a*Zs    (here, no B_SHF)
1482     out.Write("\tfloat ze = " I_FOGF ".x * float(zCoord) / 16777216.0;\n");
1483   }
1484 
1485   // x_adjust = sqrt((x-center)^2 + k^2)/k
1486   // ze *= x_adjust
1487   if (uid_data->fog_RangeBaseEnabled)
1488   {
1489     out.SetConstantsUsed(C_FOGF, C_FOGF);
1490     out.Write("\tfloat offset = (2.0 * (rawpos.x / " I_FOGF ".w)) - 1.0 - " I_FOGF ".z;\n");
1491     out.Write("\tfloat floatindex = clamp(9.0 - abs(offset) * 9.0, 0.0, 9.0);\n");
1492     out.Write("\tuint indexlower = uint(floatindex);\n");
1493     out.Write("\tuint indexupper = indexlower + 1u;\n");
1494     out.Write("\tfloat klower = " I_FOGRANGE "[indexlower >> 2u][indexlower & 3u];\n");
1495     out.Write("\tfloat kupper = " I_FOGRANGE "[indexupper >> 2u][indexupper & 3u];\n");
1496     out.Write("\tfloat k = lerp(klower, kupper, frac(floatindex));\n");
1497     out.Write("\tfloat x_adjust = sqrt(offset * offset + k * k) / k;\n");
1498     out.Write("\tze *= x_adjust;\n");
1499   }
1500 
1501   out.Write("\tfloat fog = clamp(ze - " I_FOGF ".y, 0.0, 1.0);\n");
1502 
1503   if (uid_data->fog_fsel > 3)
1504   {
1505     out.Write("%s", tev_fog_funcs_table[uid_data->fog_fsel]);
1506   }
1507   else
1508   {
1509     if (uid_data->fog_fsel != 2)
1510       WARN_LOG(VIDEO, "Unknown Fog Type! %08x", uid_data->fog_fsel);
1511   }
1512 
1513   out.Write("\tint ifog = iround(fog * 256.0);\n");
1514   out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n");
1515 }
1516 
WriteColor(ShaderCode & out,APIType api_type,const pixel_shader_uid_data * uid_data,bool use_dual_source)1517 static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
1518                        bool use_dual_source)
1519 {
1520   // D3D requires that the shader outputs be uint when writing to a uint render target for logic op.
1521   if (api_type == APIType::D3D && uid_data->uint_output)
1522   {
1523     if (uid_data->rgba6_format)
1524       out.Write("\tocol0 = uint4(prev & 0xFC);\n");
1525     else
1526       out.Write("\tocol0 = uint4(prev);\n");
1527     return;
1528   }
1529 
1530   if (uid_data->rgba6_format)
1531     out.Write("\tocol0.rgb = float3(prev.rgb >> 2) / 63.0;\n");
1532   else
1533     out.Write("\tocol0.rgb = float3(prev.rgb) / 255.0;\n");
1534 
1535   // Colors will be blended against the 8-bit alpha from ocol1 and
1536   // the 6-bit alpha from ocol0 will be written to the framebuffer
1537   if (uid_data->useDstAlpha)
1538   {
1539     out.SetConstantsUsed(C_ALPHA, C_ALPHA);
1540     out.Write("\tocol0.a = float(" I_ALPHA ".a >> 2) / 63.0;\n");
1541 
1542     // Use dual-source color blending to perform dst alpha in a single pass
1543     if (use_dual_source)
1544       out.Write("\tocol1 = float4(0.0, 0.0, 0.0, float(prev.a) / 255.0);\n");
1545   }
1546   else
1547   {
1548     out.Write("\tocol0.a = float(prev.a >> 2) / 63.0;\n");
1549     if (use_dual_source)
1550       out.Write("\tocol1 = float4(0.0, 0.0, 0.0, float(prev.a) / 255.0);\n");
1551   }
1552 }
1553 
WriteBlend(ShaderCode & out,const pixel_shader_uid_data * uid_data)1554 static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data)
1555 {
1556   if (uid_data->blend_enable)
1557   {
1558     static constexpr std::array<const char*, 8> blend_src_factor{
1559         "float3(0,0,0);",                      // ZERO
1560         "float3(1,1,1);",                      // ONE
1561         "initial_ocol0.rgb;",                  // DSTCLR
1562         "float3(1,1,1) - initial_ocol0.rgb;",  // INVDSTCLR
1563         "ocol1.aaa;",                          // SRCALPHA
1564         "float3(1,1,1) - ocol1.aaa;",          // INVSRCALPHA
1565         "initial_ocol0.aaa;",                  // DSTALPHA
1566         "float3(1,1,1) - initial_ocol0.aaa;",  // INVDSTALPHA
1567     };
1568     static constexpr std::array<const char*, 8> blend_src_factor_alpha{
1569         "0.0;",                    // ZERO
1570         "1.0;",                    // ONE
1571         "initial_ocol0.a;",        // DSTCLR
1572         "1.0 - initial_ocol0.a;",  // INVDSTCLR
1573         "ocol1.a;",                // SRCALPHA
1574         "1.0 - ocol1.a;",          // INVSRCALPHA
1575         "initial_ocol0.a;",        // DSTALPHA
1576         "1.0 - initial_ocol0.a;",  // INVDSTALPHA
1577     };
1578     static constexpr std::array<const char*, 8> blend_dst_factor{
1579         "float3(0,0,0);",                      // ZERO
1580         "float3(1,1,1);",                      // ONE
1581         "ocol0.rgb;",                          // SRCCLR
1582         "float3(1,1,1) - ocol0.rgb;",          // INVSRCCLR
1583         "ocol1.aaa;",                          // SRCALHA
1584         "float3(1,1,1) - ocol1.aaa;",          // INVSRCALPHA
1585         "initial_ocol0.aaa;",                  // DSTALPHA
1586         "float3(1,1,1) - initial_ocol0.aaa;",  // INVDSTALPHA
1587     };
1588     static constexpr std::array<const char*, 8> blend_dst_factor_alpha{
1589         "0.0;",                    // ZERO
1590         "1.0;",                    // ONE
1591         "ocol0.a;",                // SRCCLR
1592         "1.0 - ocol0.a;",          // INVSRCCLR
1593         "ocol1.a;",                // SRCALPHA
1594         "1.0 - ocol1.a;",          // INVSRCALPHA
1595         "initial_ocol0.a;",        // DSTALPHA
1596         "1.0 - initial_ocol0.a;",  // INVDSTALPHA
1597     };
1598     out.Write("\tfloat4 blend_src;\n");
1599     out.Write("\tblend_src.rgb = %s\n", blend_src_factor[uid_data->blend_src_factor]);
1600     out.Write("\tblend_src.a = %s\n", blend_src_factor_alpha[uid_data->blend_src_factor_alpha]);
1601     out.Write("\tfloat4 blend_dst;\n");
1602     out.Write("\tblend_dst.rgb = %s\n", blend_dst_factor[uid_data->blend_dst_factor]);
1603     out.Write("\tblend_dst.a = %s\n", blend_dst_factor_alpha[uid_data->blend_dst_factor_alpha]);
1604 
1605     out.Write("\tfloat4 blend_result;\n");
1606     if (uid_data->blend_subtract)
1607     {
1608       out.Write("\tblend_result.rgb = initial_ocol0.rgb * blend_dst.rgb - ocol0.rgb * "
1609                 "blend_src.rgb;\n");
1610     }
1611     else
1612     {
1613       out.Write(
1614           "\tblend_result.rgb = initial_ocol0.rgb * blend_dst.rgb + ocol0.rgb * blend_src.rgb;\n");
1615     }
1616 
1617     if (uid_data->blend_subtract_alpha)
1618       out.Write("\tblend_result.a = initial_ocol0.a * blend_dst.a - ocol0.a * blend_src.a;\n");
1619     else
1620       out.Write("\tblend_result.a = initial_ocol0.a * blend_dst.a + ocol0.a * blend_src.a;\n");
1621   }
1622   else
1623   {
1624     out.Write("\tfloat4 blend_result = ocol0;\n");
1625   }
1626 
1627   out.Write("\treal_ocol0 = blend_result;\n");
1628 }
1629