1 // Copyright 2008 Dolphin Emulator Project
2 // Licensed under GPLv2+
3 // Refer to the license.txt file included.
4
5 #include "VideoCommon/PixelShaderGen.h"
6
7 #include <cmath>
8 #include <cstdio>
9
10 #include "Common/Assert.h"
11 #include "Common/CommonTypes.h"
12 #include "Common/Logging/Log.h"
13 #include "VideoCommon/BPMemory.h"
14 #include "VideoCommon/BoundingBox.h"
15 #include "VideoCommon/DriverDetails.h"
16 #include "VideoCommon/LightingShaderGen.h"
17 #include "VideoCommon/NativeVertexFormat.h"
18 #include "VideoCommon/RenderState.h"
19 #include "VideoCommon/VertexLoaderManager.h"
20 #include "VideoCommon/VideoCommon.h"
21 #include "VideoCommon/VideoConfig.h"
22 #include "VideoCommon/XFMemory.h" // for texture projection mode
23
24 // TODO: Get rid of these
25 enum : u32
26 {
27 C_COLORMATRIX = 0, // 0
28 C_COLORS = 0, // 0
29 C_KCOLORS = C_COLORS + 4, // 4
30 C_ALPHA = C_KCOLORS + 4, // 8
31 C_TEXDIMS = C_ALPHA + 1, // 9
32 C_ZBIAS = C_TEXDIMS + 8, // 17
33 C_INDTEXSCALE = C_ZBIAS + 2, // 19
34 C_INDTEXMTX = C_INDTEXSCALE + 2, // 21
35 C_FOGCOLOR = C_INDTEXMTX + 6, // 27
36 C_FOGI = C_FOGCOLOR + 1, // 28
37 C_FOGF = C_FOGI + 1, // 29
38 C_ZSLOPE = C_FOGF + 2, // 31
39 C_EFBSCALE = C_ZSLOPE + 1, // 32
40 C_PENVCONST_END = C_EFBSCALE + 1
41 };
42
43 constexpr std::array<const char*, 32> tev_ksel_table_c{
44 "255,255,255", // 1 = 0x00
45 "223,223,223", // 7_8 = 0x01
46 "191,191,191", // 3_4 = 0x02
47 "159,159,159", // 5_8 = 0x03
48 "128,128,128", // 1_2 = 0x04
49 "96,96,96", // 3_8 = 0x05
50 "64,64,64", // 1_4 = 0x06
51 "32,32,32", // 1_8 = 0x07
52 "0,0,0", // INVALID = 0x08
53 "0,0,0", // INVALID = 0x09
54 "0,0,0", // INVALID = 0x0a
55 "0,0,0", // INVALID = 0x0b
56 I_KCOLORS "[0].rgb", // K0 = 0x0C
57 I_KCOLORS "[1].rgb", // K1 = 0x0D
58 I_KCOLORS "[2].rgb", // K2 = 0x0E
59 I_KCOLORS "[3].rgb", // K3 = 0x0F
60 I_KCOLORS "[0].rrr", // K0_R = 0x10
61 I_KCOLORS "[1].rrr", // K1_R = 0x11
62 I_KCOLORS "[2].rrr", // K2_R = 0x12
63 I_KCOLORS "[3].rrr", // K3_R = 0x13
64 I_KCOLORS "[0].ggg", // K0_G = 0x14
65 I_KCOLORS "[1].ggg", // K1_G = 0x15
66 I_KCOLORS "[2].ggg", // K2_G = 0x16
67 I_KCOLORS "[3].ggg", // K3_G = 0x17
68 I_KCOLORS "[0].bbb", // K0_B = 0x18
69 I_KCOLORS "[1].bbb", // K1_B = 0x19
70 I_KCOLORS "[2].bbb", // K2_B = 0x1A
71 I_KCOLORS "[3].bbb", // K3_B = 0x1B
72 I_KCOLORS "[0].aaa", // K0_A = 0x1C
73 I_KCOLORS "[1].aaa", // K1_A = 0x1D
74 I_KCOLORS "[2].aaa", // K2_A = 0x1E
75 I_KCOLORS "[3].aaa", // K3_A = 0x1F
76 };
77
78 constexpr std::array<const char*, 32> tev_ksel_table_a{
79 "255", // 1 = 0x00
80 "223", // 7_8 = 0x01
81 "191", // 3_4 = 0x02
82 "159", // 5_8 = 0x03
83 "128", // 1_2 = 0x04
84 "96", // 3_8 = 0x05
85 "64", // 1_4 = 0x06
86 "32", // 1_8 = 0x07
87 "0", // INVALID = 0x08
88 "0", // INVALID = 0x09
89 "0", // INVALID = 0x0a
90 "0", // INVALID = 0x0b
91 "0", // INVALID = 0x0c
92 "0", // INVALID = 0x0d
93 "0", // INVALID = 0x0e
94 "0", // INVALID = 0x0f
95 I_KCOLORS "[0].r", // K0_R = 0x10
96 I_KCOLORS "[1].r", // K1_R = 0x11
97 I_KCOLORS "[2].r", // K2_R = 0x12
98 I_KCOLORS "[3].r", // K3_R = 0x13
99 I_KCOLORS "[0].g", // K0_G = 0x14
100 I_KCOLORS "[1].g", // K1_G = 0x15
101 I_KCOLORS "[2].g", // K2_G = 0x16
102 I_KCOLORS "[3].g", // K3_G = 0x17
103 I_KCOLORS "[0].b", // K0_B = 0x18
104 I_KCOLORS "[1].b", // K1_B = 0x19
105 I_KCOLORS "[2].b", // K2_B = 0x1A
106 I_KCOLORS "[3].b", // K3_B = 0x1B
107 I_KCOLORS "[0].a", // K0_A = 0x1C
108 I_KCOLORS "[1].a", // K1_A = 0x1D
109 I_KCOLORS "[2].a", // K2_A = 0x1E
110 I_KCOLORS "[3].a", // K3_A = 0x1F
111 };
112
// Shader expressions for the 16 selectable TEV color combiner inputs, in selector order.
// Alpha sources are broadcast to all three channels with an ".aaa" swizzle.
constexpr std::array<const char*, 16> tev_c_input_table = {
    "prev.rgb",           // 0: CPREV
    "prev.aaa",           // 1: APREV
    "c0.rgb",             // 2: C0
    "c0.aaa",             // 3: A0
    "c1.rgb",             // 4: C1
    "c1.aaa",             // 5: A1
    "c2.rgb",             // 6: C2
    "c2.aaa",             // 7: A2
    "textemp.rgb",        // 8: TEXC
    "textemp.aaa",        // 9: TEXA
    "rastemp.rgb",        // 10: RASC
    "rastemp.aaa",        // 11: RASA
    "int3(255,255,255)",  // 12: ONE
    "int3(128,128,128)",  // 13: HALF
    "konsttemp.rgb",      // 14: KONST
    "int3(0,0,0)",        // 15: ZERO
};
131
// Shader expressions for the 8 selectable TEV alpha combiner inputs, in selector order.
constexpr std::array<const char*, 8> tev_a_input_table = {
    "prev.a",       // 0: APREV
    "c0.a",         // 1: A0
    "c1.a",         // 2: A1
    "c2.a",         // 3: A2
    "textemp.a",    // 4: TEXA
    "rastemp.a",    // 5: RASA
    "konsttemp.a",  // 6: KONST (hw1 had quarter)
    "0",            // 7: ZERO
};
142
// Shader expressions for the rasterized color input of a TEV stage, indexed by the
// stage's color channel selection. Entries 2-4 are placeholder strings, not shader code.
constexpr std::array<const char*, 8> tev_ras_table = {
    "iround(col0 * 255.0)",                                  // 0: color channel 0
    "iround(col1 * 255.0)",                                  // 1: color channel 1
    "ERROR13",                                               // 2
    "ERROR14",                                               // 3
    "ERROR15",                                               // 4
    "(int4(1, 1, 1, 1) * alphabump)",                        // 5: bump alpha (0..248)
    "(int4(1, 1, 1, 1) * (alphabump | (alphabump >> 5)))",   // 6: normalized bump alpha (0..255)
    "int4(0, 0, 0, 0)",                                      // 7: zero
};
153
// Shader lvalues/rvalues naming the RGB part of the four TEV registers, indexed by the
// color combiner's destination field (0 = prev, 1..3 = c0..c2).
constexpr std::array<const char*, 4> tev_c_output_table = {
    "prev.rgb",  // 0
    "c0.rgb",    // 1
    "c1.rgb",    // 2
    "c2.rgb",    // 3
};
160
// Shader lvalues/rvalues naming the alpha part of the four TEV registers, indexed by the
// alpha combiner's destination field (0 = prev, 1..3 = c0..c2).
constexpr std::array<const char*, 4> tev_a_output_table = {
    "prev.a",  // 0
    "c0.a",    // 1
    "c1.a",    // 2
    "c2.a",    // 3
};
167
168 // FIXME: Some of the video card's capabilities (BBox support, EarlyZ support, dstAlpha support)
169 // leak into this UID; This is really unhelpful if these UIDs ever move from one machine to
170 // another.
GetPixelShaderUid()171 PixelShaderUid GetPixelShaderUid()
172 {
173 PixelShaderUid out;
174
175 pixel_shader_uid_data* const uid_data = out.GetUidData();
176 uid_data->useDstAlpha = bpmem.dstalpha.enable && bpmem.blendmode.alphaupdate &&
177 bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24;
178
179 uid_data->genMode_numindstages = bpmem.genMode.numindstages;
180 uid_data->genMode_numtevstages = bpmem.genMode.numtevstages;
181 uid_data->genMode_numtexgens = bpmem.genMode.numtexgens;
182 uid_data->bounding_box = g_ActiveConfig.bBBoxEnable && BoundingBox::IsEnabled();
183 uid_data->rgba6_format =
184 bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24 && !g_ActiveConfig.bForceTrueColor;
185 uid_data->dither = bpmem.blendmode.dither && uid_data->rgba6_format;
186 uid_data->uint_output = bpmem.blendmode.UseLogicOp();
187
188 u32 numStages = uid_data->genMode_numtevstages + 1;
189
190 const bool forced_early_z =
191 bpmem.UseEarlyDepthTest() &&
192 (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED)
193 // We can't allow early_ztest for zfreeze because depth is overridden per-pixel.
194 // This means it's impossible for zcomploc to be emulated on a zfrozen polygon.
195 && !(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
196 const bool per_pixel_depth =
197 (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) ||
198 (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) ||
199 (bpmem.zmode.testenable && bpmem.genMode.zfreeze);
200
201 uid_data->per_pixel_depth = per_pixel_depth;
202 uid_data->forced_early_z = forced_early_z;
203
204 if (g_ActiveConfig.bEnablePixelLighting)
205 {
206 // The lighting shader only needs the two color bits of the 23bit component bit array.
207 uid_data->components =
208 (VertexLoaderManager::g_current_components & (VB_HAS_COL0 | VB_HAS_COL1)) >> VB_COL_SHIFT;
209 uid_data->numColorChans = xfmem.numChan.numColorChans;
210 GetLightingShaderUid(uid_data->lighting);
211 }
212
213 if (uid_data->genMode_numtexgens > 0)
214 {
215 for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
216 {
217 // optional perspective divides
218 uid_data->texMtxInfo_n_projection |= xfmem.texMtxInfo[i].projection << i;
219 }
220 }
221
222 // indirect texture map lookup
223 int nIndirectStagesUsed = 0;
224 if (uid_data->genMode_numindstages > 0)
225 {
226 for (unsigned int i = 0; i < numStages; ++i)
227 {
228 if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < uid_data->genMode_numindstages)
229 nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
230 }
231 }
232
233 uid_data->nIndirectStagesUsed = nIndirectStagesUsed;
234 for (u32 i = 0; i < uid_data->genMode_numindstages; ++i)
235 {
236 if (uid_data->nIndirectStagesUsed & (1 << i))
237 uid_data->SetTevindrefValues(i, bpmem.tevindref.getTexCoord(i), bpmem.tevindref.getTexMap(i));
238 }
239
240 for (unsigned int n = 0; n < numStages; n++)
241 {
242 int texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
243 bool bHasTexCoord = (u32)texcoord < bpmem.genMode.numtexgens;
244 // HACK to handle cases where the tex gen is not enabled
245 if (!bHasTexCoord)
246 texcoord = bpmem.genMode.numtexgens;
247
248 uid_data->stagehash[n].hasindstage = bpmem.tevind[n].bt < bpmem.genMode.numindstages;
249 uid_data->stagehash[n].tevorders_texcoord = texcoord;
250 if (uid_data->stagehash[n].hasindstage)
251 uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
252
253 TevStageCombiner::ColorCombiner& cc = bpmem.combiners[n].colorC;
254 TevStageCombiner::AlphaCombiner& ac = bpmem.combiners[n].alphaC;
255 uid_data->stagehash[n].cc = cc.hex & 0xFFFFFF;
256 uid_data->stagehash[n].ac = ac.hex & 0xFFFFF0; // Storing rswap and tswap later
257
258 if (cc.a == TEVCOLORARG_RASA || cc.a == TEVCOLORARG_RASC || cc.b == TEVCOLORARG_RASA ||
259 cc.b == TEVCOLORARG_RASC || cc.c == TEVCOLORARG_RASA || cc.c == TEVCOLORARG_RASC ||
260 cc.d == TEVCOLORARG_RASA || cc.d == TEVCOLORARG_RASC || ac.a == TEVALPHAARG_RASA ||
261 ac.b == TEVALPHAARG_RASA || ac.c == TEVALPHAARG_RASA || ac.d == TEVALPHAARG_RASA)
262 {
263 const int i = bpmem.combiners[n].alphaC.rswap;
264 uid_data->stagehash[n].tevksel_swap1a = bpmem.tevksel[i * 2].swap1;
265 uid_data->stagehash[n].tevksel_swap2a = bpmem.tevksel[i * 2].swap2;
266 uid_data->stagehash[n].tevksel_swap1b = bpmem.tevksel[i * 2 + 1].swap1;
267 uid_data->stagehash[n].tevksel_swap2b = bpmem.tevksel[i * 2 + 1].swap2;
268 uid_data->stagehash[n].tevorders_colorchan = bpmem.tevorders[n / 2].getColorChan(n & 1);
269 }
270
271 uid_data->stagehash[n].tevorders_enable = bpmem.tevorders[n / 2].getEnable(n & 1);
272 if (uid_data->stagehash[n].tevorders_enable)
273 {
274 const int i = bpmem.combiners[n].alphaC.tswap;
275 uid_data->stagehash[n].tevksel_swap1c = bpmem.tevksel[i * 2].swap1;
276 uid_data->stagehash[n].tevksel_swap2c = bpmem.tevksel[i * 2].swap2;
277 uid_data->stagehash[n].tevksel_swap1d = bpmem.tevksel[i * 2 + 1].swap1;
278 uid_data->stagehash[n].tevksel_swap2d = bpmem.tevksel[i * 2 + 1].swap2;
279 uid_data->stagehash[n].tevorders_texmap = bpmem.tevorders[n / 2].getTexMap(n & 1);
280 }
281
282 if (cc.a == TEVCOLORARG_KONST || cc.b == TEVCOLORARG_KONST || cc.c == TEVCOLORARG_KONST ||
283 cc.d == TEVCOLORARG_KONST || ac.a == TEVALPHAARG_KONST || ac.b == TEVALPHAARG_KONST ||
284 ac.c == TEVALPHAARG_KONST || ac.d == TEVALPHAARG_KONST)
285 {
286 uid_data->stagehash[n].tevksel_kc = bpmem.tevksel[n / 2].getKC(n & 1);
287 uid_data->stagehash[n].tevksel_ka = bpmem.tevksel[n / 2].getKA(n & 1);
288 }
289 }
290
291 #define MY_STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
292 uid_data->num_values = (g_ActiveConfig.bEnablePixelLighting) ?
293 sizeof(*uid_data) :
294 MY_STRUCT_OFFSET(*uid_data, stagehash[numStages]);
295
296 AlphaTest::TEST_RESULT Pretest = bpmem.alpha_test.TestResult();
297 uid_data->Pretest = Pretest;
298 uid_data->late_ztest = bpmem.UseLateDepthTest();
299
300 // NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
301 // (in this case we need to write a depth value if depth test passes regardless of the alpha
302 // testing result)
303 if (uid_data->Pretest == AlphaTest::UNDETERMINED ||
304 (uid_data->Pretest == AlphaTest::FAIL && uid_data->late_ztest))
305 {
306 uid_data->alpha_test_comp0 = bpmem.alpha_test.comp0;
307 uid_data->alpha_test_comp1 = bpmem.alpha_test.comp1;
308 uid_data->alpha_test_logic = bpmem.alpha_test.logic;
309
310 // ZCOMPLOC HACK:
311 // The only way to emulate alpha test + early-z is to force early-z in the shader.
312 // As this isn't available on all drivers and as we can't emulate this feature otherwise,
313 // we are only able to choose which one we want to respect more.
314 // Tests seem to have proven that writing depth even when the alpha test fails is more
315 // important that a reliable alpha test, so we just force the alpha test to always succeed.
316 // At least this seems to be less buggy.
317 uid_data->alpha_test_use_zcomploc_hack =
318 bpmem.UseEarlyDepthTest() && bpmem.zmode.updateenable &&
319 !g_ActiveConfig.backend_info.bSupportsEarlyZ && !bpmem.genMode.zfreeze;
320 }
321
322 uid_data->zfreeze = bpmem.genMode.zfreeze;
323 uid_data->ztex_op = bpmem.ztex2.op;
324 uid_data->early_ztest = bpmem.UseEarlyDepthTest();
325 uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
326 uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
327 uid_data->fog_proj = bpmem.fog.c_proj_fsel.proj;
328 uid_data->fog_RangeBaseEnabled = bpmem.fogRange.Base.Enabled;
329
330 BlendingState state = {};
331 state.Generate(bpmem);
332
333 if (state.usedualsrc && state.dstalpha && g_ActiveConfig.backend_info.bSupportsFramebufferFetch &&
334 !g_ActiveConfig.backend_info.bSupportsDualSourceBlend)
335 {
336 uid_data->blend_enable = state.blendenable;
337 uid_data->blend_src_factor = state.srcfactor;
338 uid_data->blend_src_factor_alpha = state.srcfactoralpha;
339 uid_data->blend_dst_factor = state.dstfactor;
340 uid_data->blend_dst_factor_alpha = state.dstfactoralpha;
341 uid_data->blend_subtract = state.subtract;
342 uid_data->blend_subtract_alpha = state.subtractAlpha;
343 }
344
345 return out;
346 }
347
ClearUnusedPixelShaderUidBits(APIType ApiType,const ShaderHostConfig & host_config,PixelShaderUid * uid)348 void ClearUnusedPixelShaderUidBits(APIType ApiType, const ShaderHostConfig& host_config,
349 PixelShaderUid* uid)
350 {
351 pixel_shader_uid_data* const uid_data = uid->GetUidData();
352
353 // OpenGL and Vulkan convert implicitly normalized color outputs to their uint representation.
354 // Therefore, it is not necessary to use a uint output on these backends. We also disable the
355 // uint output when logic op is not supported (i.e. driver/device does not support D3D11.1).
356 if (ApiType != APIType::D3D || !host_config.backend_logic_op)
357 uid_data->uint_output = 0;
358
359 // If bounding box is enabled when a UID cache is created, then later disabled, we shouldn't
360 // emit the bounding box portion of the shader.
361 uid_data->bounding_box &= host_config.bounding_box & host_config.backend_bbox;
362 }
363
WritePixelShaderCommonHeader(ShaderCode & out,APIType ApiType,u32 num_texgens,const ShaderHostConfig & host_config,bool bounding_box)364 void WritePixelShaderCommonHeader(ShaderCode& out, APIType ApiType, u32 num_texgens,
365 const ShaderHostConfig& host_config, bool bounding_box)
366 {
367 // dot product for integer vectors
368 out.Write("int idot(int3 x, int3 y)\n"
369 "{\n"
370 "\tint3 tmp = x * y;\n"
371 "\treturn tmp.x + tmp.y + tmp.z;\n"
372 "}\n");
373
374 out.Write("int idot(int4 x, int4 y)\n"
375 "{\n"
376 "\tint4 tmp = x * y;\n"
377 "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n"
378 "}\n\n");
379
380 // rounding + casting to integer at once in a single function
381 out.Write("int iround(float x) { return int (round(x)); }\n"
382 "int2 iround(float2 x) { return int2(round(x)); }\n"
383 "int3 iround(float3 x) { return int3(round(x)); }\n"
384 "int4 iround(float4 x) { return int4(round(x)); }\n\n");
385
386 if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
387 {
388 out.Write("SAMPLER_BINDING(0) uniform sampler2DArray samp[8];\n");
389 }
390 else // D3D
391 {
392 // Declare samplers
393 out.Write("SamplerState samp[8] : register(s0);\n");
394 out.Write("\n");
395 out.Write("Texture2DArray Tex[8] : register(t0);\n");
396 }
397 out.Write("\n");
398
399 if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
400 out.Write("UBO_BINDING(std140, 1) uniform PSBlock {\n");
401 else
402 out.Write("cbuffer PSBlock : register(b0) {\n");
403
404 out.Write("\tint4 " I_COLORS "[4];\n"
405 "\tint4 " I_KCOLORS "[4];\n"
406 "\tint4 " I_ALPHA ";\n"
407 "\tfloat4 " I_TEXDIMS "[8];\n"
408 "\tint4 " I_ZBIAS "[2];\n"
409 "\tint4 " I_INDTEXSCALE "[2];\n"
410 "\tint4 " I_INDTEXMTX "[6];\n"
411 "\tint4 " I_FOGCOLOR ";\n"
412 "\tint4 " I_FOGI ";\n"
413 "\tfloat4 " I_FOGF ";\n"
414 "\tfloat4 " I_FOGRANGE "[3];\n"
415 "\tfloat4 " I_ZSLOPE ";\n"
416 "\tfloat2 " I_EFBSCALE ";\n"
417 "\tuint bpmem_genmode;\n"
418 "\tuint bpmem_alphaTest;\n"
419 "\tuint bpmem_fogParam3;\n"
420 "\tuint bpmem_fogRangeBase;\n"
421 "\tuint bpmem_dstalpha;\n"
422 "\tuint bpmem_ztex_op;\n"
423 "\tbool bpmem_late_ztest;\n"
424 "\tbool bpmem_rgba6_format;\n"
425 "\tbool bpmem_dither;\n"
426 "\tbool bpmem_bounding_box;\n"
427 "\tuint4 bpmem_pack1[16];\n" // .xy - combiners, .z - tevind
428 "\tuint4 bpmem_pack2[8];\n" // .x - tevorder, .y - tevksel
429 "\tint4 konstLookup[32];\n"
430 "\tbool blend_enable;\n"
431 "\tuint blend_src_factor;\n"
432 "\tuint blend_src_factor_alpha;\n"
433 "\tuint blend_dst_factor;\n"
434 "\tuint blend_dst_factor_alpha;\n"
435 "\tbool blend_subtract;\n"
436 "\tbool blend_subtract_alpha;\n"
437 "};\n\n");
438 out.Write("#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)\n"
439 "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n"
440 "#define bpmem_iref(i) (bpmem_pack1[(i)].w)\n"
441 "#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)\n"
442 "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n\n");
443
444 if (host_config.per_pixel_lighting)
445 {
446 out.Write("%s", s_lighting_struct);
447
448 if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
449 out.Write("UBO_BINDING(std140, 2) uniform VSBlock {\n");
450 else
451 out.Write("cbuffer VSBlock : register(b1) {\n");
452
453 out.Write(s_shader_uniforms);
454 out.Write("};\n");
455 }
456
457 if (bounding_box)
458 {
459 out.Write(R"(
460 #ifdef API_D3D
461 globallycoherent RWBuffer<int> bbox_data : register(u2);
462 #define atomicMin InterlockedMin
463 #define atomicMax InterlockedMax
464 #define bbox_left bbox_data[0]
465 #define bbox_right bbox_data[1]
466 #define bbox_top bbox_data[2]
467 #define bbox_bottom bbox_data[3]
468 #else
469 SSBO_BINDING(0) buffer BBox {
470 int bbox_left, bbox_right, bbox_top, bbox_bottom;
471 };
472 #endif
473
474 void UpdateBoundingBoxBuffer(int2 min_pos, int2 max_pos) {
475 if (bbox_left > min_pos.x)
476 atomicMin(bbox_left, min_pos.x);
477 if (bbox_right < max_pos.x)
478 atomicMax(bbox_right, max_pos.x);
479 if (bbox_top > min_pos.y)
480 atomicMin(bbox_top, min_pos.y);
481 if (bbox_bottom < max_pos.y)
482 atomicMax(bbox_bottom, max_pos.y);
483 }
484
485 void UpdateBoundingBox(float2 rawpos) {
486 // The pixel center in the GameCube GPU is 7/12, not 0.5 (see VertexShaderGen.cpp)
487 // Adjust for this by unapplying the offset we added in the vertex shader.
488 const float PIXEL_CENTER_OFFSET = 7.0 / 12.0 - 0.5;
489 float2 offset = float2(PIXEL_CENTER_OFFSET, -PIXEL_CENTER_OFFSET);
490
491 #ifdef API_OPENGL
492 // OpenGL lower-left origin means that Y goes in the opposite direction.
493 offset.y = -offset.y;
494 #endif
495
496 // The rightmost shaded pixel is not included in the right bounding box register,
497 // such that width = right - left + 1. This has been verified on hardware.
498 int2 pos = iround(rawpos * cefbscale + offset);
499
500 #ifdef SUPPORTS_SUBGROUP_REDUCTION
501 if (CAN_USE_SUBGROUP_REDUCTION) {
502 int2 min_pos = IS_HELPER_INVOCATION ? int2(2147483647, 2147483647) : pos;
503 int2 max_pos = IS_HELPER_INVOCATION ? int2(-2147483648, -2147483648) : pos;
504 SUBGROUP_MIN(min_pos);
505 SUBGROUP_MAX(max_pos);
506 if (IS_FIRST_ACTIVE_INVOCATION)
507 UpdateBoundingBoxBuffer(min_pos, max_pos);
508 } else {
509 UpdateBoundingBoxBuffer(pos, pos);
510 }
511 #else
512 UpdateBoundingBoxBuffer(pos, pos);
513 #endif
514 }
515
516 )");
517 }
518 }
519
// Forward declarations of the file-local helpers used by GeneratePixelShaderCode().
// Each one appends shader source text to `out`; their definitions are further down in
// this file.

// Emits the combiner code for TEV stage `n`.
static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n,
                       APIType ApiType, bool stereo);
// NOTE(review): presumably emits the regular (non-compare) TEV combiner equation for
// either the color or the alpha channel - confirm against the definition.
static void WriteTevRegular(ShaderCode& out, const char* components, int bias, int op, int clamp,
                            int shift, bool alpha);
// Emits a texture sample of texture map `texmap` at the coordinate expression
// `texcoords`, swizzled by `texswap`.
static void SampleTexture(ShaderCode& out, const char* texcoords, const char* texswap, int texmap,
                          bool stereo, APIType ApiType);
// Emits the alpha test; only called when the alpha pretest result requires it.
static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType ApiType,
                           bool per_pixel_depth, bool use_dual_source);
// NOTE(review): fog, final color output and shader-side blending respectively, judging
// by the names - the call sites are outside the visible part of this file.
static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data);
static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
                       bool use_dual_source);
static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data);
532
GeneratePixelShaderCode(APIType ApiType,const ShaderHostConfig & host_config,const pixel_shader_uid_data * uid_data)533 ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host_config,
534 const pixel_shader_uid_data* uid_data)
535 {
536 ShaderCode out;
537
538 const bool per_pixel_lighting = g_ActiveConfig.bEnablePixelLighting;
539 const bool msaa = host_config.msaa;
540 const bool ssaa = host_config.ssaa;
541 const bool stereo = host_config.stereo;
542 const u32 numStages = uid_data->genMode_numtevstages + 1;
543
544 out.Write("//Pixel Shader for TEV stages\n");
545 out.Write("//%i TEV stages, %i texgens, %i IND stages\n", numStages, uid_data->genMode_numtexgens,
546 uid_data->genMode_numindstages);
547
548 // Stuff that is shared between ubershaders and pixelgen.
549 WritePixelShaderCommonHeader(out, ApiType, uid_data->genMode_numtexgens, host_config,
550 uid_data->bounding_box);
551
552 if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ)
553 {
554 // Zcomploc (aka early_ztest) is a way to control whether depth test is done before
555 // or after texturing and alpha test. PC graphics APIs used to provide no way to emulate
556 // this feature properly until 2012: Depth tests were always done after alpha testing.
557 // Most importantly, it was not possible to write to the depth buffer without also writing
558 // a color value (unless color writing was disabled altogether).
559
560 // OpenGL 4.2 actually provides two extensions which can force an early z test:
561 // * ARB_image_load_store has 'layout(early_fragment_tests)' which forces the driver to do z
562 // and stencil tests early.
563 // * ARB_conservative_depth has 'layout(depth_unchanged) which signals to the driver that it
564 // can make optimisations
565 // which assume the pixel shader won't update the depth buffer.
566
567 // early_fragment_tests is the best option, as it requires the driver to do early-z and defines
568 // early-z exactly as
569 // we expect, with discard causing the shader to exit with only the depth buffer updated.
570
571 // Conservative depth's 'depth_unchanged' only hints to the driver that an early-z optimisation
572 // can be made and
573 // doesn't define what will happen if we discard the fragment. But the way modern graphics
574 // hardware is implemented
575 // means it is not unreasonable to expect the same behaviour as early_fragment_tests.
576 // We can also assume that if a driver has gone out of its way to support conservative depth and
577 // not image_load_store
578 // as required by OpenGL 4.2 that it will be doing the optimisation.
579 // If the driver doesn't actually do an early z optimisation, ZCompLoc will be broken and depth
580 // will only be written
581 // if the alpha test passes.
582
583 // We support Conservative as a fallback, because many drivers based on Mesa haven't implemented
584 // all of the
585 // ARB_image_load_store extension yet.
586
587 // D3D11 also has a way to force the driver to enable early-z, so we're fine here.
588 if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
589 {
590 // This is a #define which signals whatever early-z method the driver supports.
591 out.Write("FORCE_EARLY_Z; \n");
592 }
593 else
594 {
595 out.Write("[earlydepthstencil]\n");
596 }
597 }
598
599 // Only use dual-source blending when required on drivers that don't support it very well.
600 const bool use_dual_source =
601 host_config.backend_dual_source_blend &&
602 (!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING) ||
603 uid_data->useDstAlpha);
604 const bool use_shader_blend =
605 !use_dual_source && (uid_data->useDstAlpha && host_config.backend_shader_framebuffer_fetch);
606
607 if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
608 {
609 if (use_dual_source)
610 {
611 if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
612 {
613 out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
614 out.Write("FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n");
615 }
616 else
617 {
618 out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n");
619 out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n");
620 }
621 }
622 else if (use_shader_blend)
623 {
624 // QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an
625 // intermediate value with multiple reads & modifications, so pull out the "real" output value
626 // and use a temporary for calculations, then set the output value once at the end of the
627 // shader
628 if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
629 {
630 out.Write("FRAGMENT_OUTPUT_LOCATION(0) FRAGMENT_INOUT vec4 real_ocol0;\n");
631 }
632 else
633 {
634 out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) FRAGMENT_INOUT vec4 real_ocol0;\n");
635 }
636 }
637 else
638 {
639 out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
640 }
641
642 if (uid_data->per_pixel_depth)
643 out.Write("#define depth gl_FragDepth\n");
644
645 if (host_config.backend_geometry_shaders)
646 {
647 out.Write("VARYING_LOCATION(0) in VertexData {\n");
648 GenerateVSOutputMembers(out, ApiType, uid_data->genMode_numtexgens, host_config,
649 GetInterpolationQualifier(msaa, ssaa, true, true));
650
651 if (stereo)
652 out.Write("\tflat int layer;\n");
653
654 out.Write("};\n");
655 }
656 else
657 {
658 // Let's set up attributes
659 u32 counter = 0;
660 out.Write("VARYING_LOCATION(%u) %s in float4 colors_0;\n", counter++,
661 GetInterpolationQualifier(msaa, ssaa));
662 out.Write("VARYING_LOCATION(%u) %s in float4 colors_1;\n", counter++,
663 GetInterpolationQualifier(msaa, ssaa));
664 for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
665 {
666 out.Write("VARYING_LOCATION(%u) %s in float3 tex%d;\n", counter++,
667 GetInterpolationQualifier(msaa, ssaa), i);
668 }
669 if (!host_config.fast_depth_calc)
670 out.Write("VARYING_LOCATION(%u) %s in float4 clipPos;\n", counter++,
671 GetInterpolationQualifier(msaa, ssaa));
672 if (per_pixel_lighting)
673 {
674 out.Write("VARYING_LOCATION(%u) %s in float3 Normal;\n", counter++,
675 GetInterpolationQualifier(msaa, ssaa));
676 out.Write("VARYING_LOCATION(%u) %s in float3 WorldPos;\n", counter++,
677 GetInterpolationQualifier(msaa, ssaa));
678 }
679 }
680
681 out.Write("void main()\n{\n");
682 out.Write("\tfloat4 rawpos = gl_FragCoord;\n");
683 if (use_shader_blend)
684 {
685 // Store off a copy of the initial fb value for blending
686 out.Write("\tfloat4 initial_ocol0 = FB_FETCH_VALUE;\n");
687 out.Write("\tfloat4 ocol0;\n");
688 out.Write("\tfloat4 ocol1;\n");
689 }
690 }
691 else // D3D
692 {
693 out.Write("void main(\n");
694 if (uid_data->uint_output)
695 {
696 out.Write(" out uint4 ocol0 : SV_Target,\n");
697 }
698 else
699 {
700 out.Write(" out float4 ocol0 : SV_Target0,\n"
701 " out float4 ocol1 : SV_Target1,\n");
702 }
703 out.Write("%s"
704 " in float4 rawpos : SV_Position,\n",
705 uid_data->per_pixel_depth ? " out float depth : SV_Depth,\n" : "");
706
707 out.Write(" in %s float4 colors_0 : COLOR0,\n", GetInterpolationQualifier(msaa, ssaa));
708 out.Write(" in %s float4 colors_1 : COLOR1\n", GetInterpolationQualifier(msaa, ssaa));
709
710 // compute window position if needed because binding semantic WPOS is not widely supported
711 for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
712 {
713 out.Write(",\n in %s float3 tex%d : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa), i,
714 i);
715 }
716 if (!host_config.fast_depth_calc)
717 {
718 out.Write(",\n in %s float4 clipPos : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa),
719 uid_data->genMode_numtexgens);
720 }
721 if (per_pixel_lighting)
722 {
723 out.Write(",\n in %s float3 Normal : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa),
724 uid_data->genMode_numtexgens + 1);
725 out.Write(",\n in %s float3 WorldPos : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa),
726 uid_data->genMode_numtexgens + 2);
727 }
728 if (host_config.backend_geometry_shaders)
729 {
730 out.Write(",\n in float clipDist0 : SV_ClipDistance0\n");
731 out.Write(",\n in float clipDist1 : SV_ClipDistance1\n");
732 }
733 if (stereo)
734 out.Write(",\n in uint layer : SV_RenderTargetArrayIndex\n");
735 out.Write(" ) {\n");
736 }
737
738 out.Write("\tint4 c0 = " I_COLORS "[1], c1 = " I_COLORS "[2], c2 = " I_COLORS
739 "[3], prev = " I_COLORS "[0];\n"
740 "\tint4 rastemp = int4(0, 0, 0, 0), textemp = int4(0, 0, 0, 0), konsttemp = int4(0, 0, "
741 "0, 0);\n"
742 "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
743 "\tint alphabump=0;\n"
744 "\tint3 tevcoord=int3(0, 0, 0);\n"
745 "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
746 "\tint4 "
747 "tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,"
748 "0);\n\n"); // tev combiner inputs
749
750 // On GLSL, input variables must not be assigned to.
751 // This is why we declare these variables locally instead.
752 out.Write("\tfloat4 col0 = colors_0;\n");
753 out.Write("\tfloat4 col1 = colors_1;\n");
754
755 if (per_pixel_lighting)
756 {
757 out.Write("\tfloat3 _norm0 = normalize(Normal.xyz);\n\n");
758 out.Write("\tfloat3 pos = WorldPos;\n");
759
760 out.Write("\tint4 lacc;\n"
761 "\tfloat3 ldir, h, cosAttn, distAttn;\n"
762 "\tfloat dist, dist2, attn;\n");
763
764 // TODO: Our current constant usage code isn't able to handle more than one buffer.
765 // So we can't mark the VS constant as used here. But keep them here as reference.
766 // out.SetConstantsUsed(C_PLIGHT_COLORS, C_PLIGHT_COLORS+7); // TODO: Can be optimized further
767 // out.SetConstantsUsed(C_PLIGHTS, C_PLIGHTS+31); // TODO: Can be optimized further
768 // out.SetConstantsUsed(C_PMATERIALS, C_PMATERIALS+3);
769 GenerateLightingShaderCode(out, uid_data->lighting, uid_data->components << VB_COL_SHIFT,
770 "colors_", "col");
771 }
772
773 // HACK to handle cases where the tex gen is not enabled
774 if (uid_data->genMode_numtexgens == 0)
775 {
776 out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
777 }
778 else
779 {
780 out.SetConstantsUsed(C_TEXDIMS, C_TEXDIMS + uid_data->genMode_numtexgens - 1);
781 for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i)
782 {
783 out.Write("\tint2 fixpoint_uv%d = int2(", i);
784 out.Write("(tex%d.z == 0.0 ? tex%d.xy : tex%d.xy / tex%d.z)", i, i, i, i);
785 out.Write(" * " I_TEXDIMS "[%d].zw);\n", i);
786 // TODO: S24 overflows here?
787 }
788 }
789
790 for (u32 i = 0; i < uid_data->genMode_numindstages; ++i)
791 {
792 if (uid_data->nIndirectStagesUsed & (1 << i))
793 {
794 unsigned int texcoord = uid_data->GetTevindirefCoord(i);
795 unsigned int texmap = uid_data->GetTevindirefMap(i);
796
797 if (texcoord < uid_data->genMode_numtexgens)
798 {
799 out.SetConstantsUsed(C_INDTEXSCALE + i / 2, C_INDTEXSCALE + i / 2);
800 out.Write("\ttempcoord = fixpoint_uv%d >> " I_INDTEXSCALE "[%d].%s;\n", texcoord, i / 2,
801 (i & 1) ? "zw" : "xy");
802 }
803 else
804 {
805 out.Write("\ttempcoord = int2(0, 0);\n");
806 }
807
808 out.Write("\tint3 iindtex%d = ", i);
809 SampleTexture(out, "float2(tempcoord)", "abg", texmap, stereo, ApiType);
810 }
811 }
812
813 for (u32 i = 0; i < numStages; i++)
814 {
815 // Build the equation for this stage
816 WriteStage(out, uid_data, i, ApiType, stereo);
817 }
818
819 {
820 // The results of the last texenv stage are put onto the screen,
821 // regardless of the used destination register
822 TevStageCombiner::ColorCombiner last_cc;
823 TevStageCombiner::AlphaCombiner last_ac;
824 last_cc.hex = uid_data->stagehash[uid_data->genMode_numtevstages].cc;
825 last_ac.hex = uid_data->stagehash[uid_data->genMode_numtevstages].ac;
826 if (last_cc.dest != 0)
827 {
828 out.Write("\tprev.rgb = %s;\n", tev_c_output_table[last_cc.dest]);
829 }
830 if (last_ac.dest != 0)
831 {
832 out.Write("\tprev.a = %s;\n", tev_a_output_table[last_ac.dest]);
833 }
834 }
835 out.Write("\tprev = prev & 255;\n");
836
837 // NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
838 // (in this case we need to write a depth value if depth test passes regardless of the alpha
839 // testing result)
840 if (uid_data->Pretest == AlphaTest::UNDETERMINED ||
841 (uid_data->Pretest == AlphaTest::FAIL && uid_data->late_ztest))
842 {
843 WriteAlphaTest(out, uid_data, ApiType, uid_data->per_pixel_depth,
844 use_dual_source || use_shader_blend);
845 }
846
847 if (uid_data->zfreeze)
848 {
849 out.SetConstantsUsed(C_ZSLOPE, C_ZSLOPE);
850 out.SetConstantsUsed(C_EFBSCALE, C_EFBSCALE);
851
852 out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE ".xy;\n");
853
854 // Opengl has reversed vertical screenspace coordinates
855 if (ApiType == APIType::OpenGL)
856 out.Write("\tscreenpos.y = %i.0 - screenpos.y;\n", EFB_HEIGHT);
857
858 out.Write("\tint zCoord = int(" I_ZSLOPE ".z + " I_ZSLOPE ".x * screenpos.x + " I_ZSLOPE
859 ".y * screenpos.y);\n");
860 }
861 else if (!host_config.fast_depth_calc)
862 {
863 // FastDepth means to trust the depth generated in perspective division.
864 // It should be correct, but it seems not to be as accurate as required. TODO: Find out why!
865 // For disabled FastDepth we just calculate the depth value again.
866 // The performance impact of this additional calculation doesn't matter, but it prevents
867 // the host GPU driver from performing any early depth test optimizations.
868 out.SetConstantsUsed(C_ZBIAS + 1, C_ZBIAS + 1);
869 // the screen space depth value = far z + (clip z / clip w) * z range
870 out.Write("\tint zCoord = " I_ZBIAS "[1].x + int((clipPos.z / clipPos.w) * float(" I_ZBIAS
871 "[1].y));\n");
872 }
873 else
874 {
875 if (!host_config.backend_reversed_depth_range)
876 out.Write("\tint zCoord = int((1.0 - rawpos.z) * 16777216.0);\n");
877 else
878 out.Write("\tint zCoord = int(rawpos.z * 16777216.0);\n");
879 }
880 out.Write("\tzCoord = clamp(zCoord, 0, 0xFFFFFF);\n");
881
882 // depth texture can safely be ignored if the result won't be written to the depth buffer
883 // (early_ztest) and isn't used for fog either
884 const bool skip_ztexture = !uid_data->per_pixel_depth && !uid_data->fog_fsel;
885
886 // Note: z-textures are not written to depth buffer if early depth test is used
887 if (uid_data->per_pixel_depth && uid_data->early_ztest)
888 {
889 if (!host_config.backend_reversed_depth_range)
890 out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
891 else
892 out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
893 }
894
895 // Note: depth texture output is only written to depth buffer if late depth test is used
896 // theoretical final depth value is used for fog calculation, though, so we have to emulate
897 // ztextures anyway
898 if (uid_data->ztex_op != ZTEXTURE_DISABLE && !skip_ztexture)
899 {
900 // use the texture input of the last texture stage (textemp), hopefully this has been read and
901 // is in correct format...
902 out.SetConstantsUsed(C_ZBIAS, C_ZBIAS + 1);
903 out.Write("\tzCoord = idot(" I_ZBIAS "[0].xyzw, textemp.xyzw) + " I_ZBIAS "[1].w %s;\n",
904 (uid_data->ztex_op == ZTEXTURE_ADD) ? "+ zCoord" : "");
905 out.Write("\tzCoord = zCoord & 0xFFFFFF;\n");
906 }
907
908 if (uid_data->per_pixel_depth && uid_data->late_ztest)
909 {
910 if (!host_config.backend_reversed_depth_range)
911 out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
912 else
913 out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
914 }
915
916 // No dithering for RGB8 mode
917 if (uid_data->dither)
918 {
919 // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering
920 // Here the matrix is encoded into the two factor constants
921 out.Write("\tint2 dither = int2(rawpos.xy) & 1;\n");
922 out.Write("\tprev.rgb = (prev.rgb - (prev.rgb >> 6)) + abs(dither.y * 3 - dither.x * 2);\n");
923 }
924
925 WriteFog(out, uid_data);
926
927 // Write the color and alpha values to the framebuffer
928 // If using shader blend, we still use the separate alpha
929 WriteColor(out, ApiType, uid_data, use_dual_source || use_shader_blend);
930
931 if (use_shader_blend)
932 WriteBlend(out, uid_data);
933
934 if (uid_data->bounding_box)
935 out.Write("\tUpdateBoundingBox(rawpos.xy);\n");
936
937 out.Write("}\n");
938
939 return out;
940 }
941
WriteStage(ShaderCode & out,const pixel_shader_uid_data * uid_data,int n,APIType ApiType,bool stereo)942 static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n,
943 APIType ApiType, bool stereo)
944 {
945 auto& stage = uid_data->stagehash[n];
946 out.Write("\n\t// TEV stage %d\n", n);
947
948 // HACK to handle cases where the tex gen is not enabled
949 u32 texcoord = stage.tevorders_texcoord;
950 bool bHasTexCoord = texcoord < uid_data->genMode_numtexgens;
951 if (!bHasTexCoord)
952 texcoord = 0;
953
954 if (stage.hasindstage)
955 {
956 TevStageIndirect tevind;
957 tevind.hex = stage.tevind;
958
959 out.Write("\t// indirect op\n");
960 // perform the indirect op on the incoming regular coordinates using iindtex%d as the offset
961 // coords
962 if (tevind.bs != ITBA_OFF)
963 {
964 constexpr std::array<const char*, 4> tev_ind_alpha_sel{
965 "",
966 "x",
967 "y",
968 "z",
969 };
970
971 // 0b11111000, 0b11100000, 0b11110000, 0b11111000
972 constexpr std::array<const char*, 4> tev_ind_alpha_mask{
973 "248",
974 "224",
975 "240",
976 "248",
977 };
978
979 out.Write("alphabump = iindtex%d.%s & %s;\n", tevind.bt.Value(), tev_ind_alpha_sel[tevind.bs],
980 tev_ind_alpha_mask[tevind.fmt]);
981 }
982 else
983 {
984 // TODO: Should we reset alphabump to 0 here?
985 }
986
987 if (tevind.mid != 0)
988 {
989 // format
990 constexpr std::array<const char*, 4> tev_ind_fmt_mask{
991 "255",
992 "31",
993 "15",
994 "7",
995 };
996 out.Write("\tint3 iindtevcrd%d = iindtex%d & %s;\n", n, tevind.bt.Value(),
997 tev_ind_fmt_mask[tevind.fmt]);
998
999 // bias - TODO: Check if this needs to be this complicated...
1000 // indexed by bias
1001 constexpr std::array<const char*, 8> tev_ind_bias_field{
1002 "", "x", "y", "xy", "z", "xz", "yz", "xyz",
1003 };
1004
1005 // indexed by fmt
1006 constexpr std::array<const char*, 4> tev_ind_bias_add{
1007 "-128",
1008 "1",
1009 "1",
1010 "1",
1011 };
1012
1013 if (tevind.bias == ITB_S || tevind.bias == ITB_T || tevind.bias == ITB_U)
1014 {
1015 out.Write("\tiindtevcrd%d.%s += int(%s);\n", n, tev_ind_bias_field[tevind.bias],
1016 tev_ind_bias_add[tevind.fmt]);
1017 }
1018 else if (tevind.bias == ITB_ST || tevind.bias == ITB_SU || tevind.bias == ITB_TU)
1019 {
1020 out.Write("\tiindtevcrd%d.%s += int2(%s, %s);\n", n, tev_ind_bias_field[tevind.bias],
1021 tev_ind_bias_add[tevind.fmt], tev_ind_bias_add[tevind.fmt]);
1022 }
1023 else if (tevind.bias == ITB_STU)
1024 {
1025 out.Write("\tiindtevcrd%d.%s += int3(%s, %s, %s);\n", n, tev_ind_bias_field[tevind.bias],
1026 tev_ind_bias_add[tevind.fmt], tev_ind_bias_add[tevind.fmt],
1027 tev_ind_bias_add[tevind.fmt]);
1028 }
1029
1030 // multiply by offset matrix and scale - calculations are likely to overflow badly,
1031 // yet it works out since we only care about the lower 23 bits (+1 sign bit) of the result
1032 if (tevind.mid <= 3)
1033 {
1034 int mtxidx = 2 * (tevind.mid - 1);
1035 out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
1036
1037 out.Write("\tint2 indtevtrans%d = int2(idot(" I_INDTEXMTX
1038 "[%d].xyz, iindtevcrd%d), idot(" I_INDTEXMTX "[%d].xyz, iindtevcrd%d)) >> 3;\n",
1039 n, mtxidx, n, mtxidx + 1, n);
1040
1041 // TODO: should use a shader uid branch for this for better performance
1042 if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
1043 {
1044 out.Write("\tint indtexmtx_w_inverse_%d = -" I_INDTEXMTX "[%d].w;\n", n, mtxidx);
1045 out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1046 mtxidx, n, mtxidx);
1047 out.Write("\telse indtevtrans%d <<= indtexmtx_w_inverse_%d;\n", n, n);
1048 }
1049 else
1050 {
1051 out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1052 mtxidx, n, mtxidx);
1053 out.Write("\telse indtevtrans%d <<= (-" I_INDTEXMTX "[%d].w);\n", n, mtxidx);
1054 }
1055 }
1056 else if (tevind.mid <= 7 && bHasTexCoord)
1057 { // s matrix
1058 ASSERT(tevind.mid >= 5);
1059 int mtxidx = 2 * (tevind.mid - 5);
1060 out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
1061
1062 out.Write("\tint2 indtevtrans%d = int2(fixpoint_uv%d * iindtevcrd%d.xx) >> 8;\n", n,
1063 texcoord, n);
1064 if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
1065 {
1066 out.Write("\tint indtexmtx_w_inverse_%d = -" I_INDTEXMTX "[%d].w;\n", n, mtxidx);
1067 out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1068 mtxidx, n, mtxidx);
1069 out.Write("\telse indtevtrans%d <<= (indtexmtx_w_inverse_%d);\n", n, n);
1070 }
1071 else
1072 {
1073 out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1074 mtxidx, n, mtxidx);
1075 out.Write("\telse indtevtrans%d <<= (-" I_INDTEXMTX "[%d].w);\n", n, mtxidx);
1076 }
1077 }
1078 else if (tevind.mid <= 11 && bHasTexCoord)
1079 { // t matrix
1080 ASSERT(tevind.mid >= 9);
1081 int mtxidx = 2 * (tevind.mid - 9);
1082 out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
1083
1084 out.Write("\tint2 indtevtrans%d = int2(fixpoint_uv%d * iindtevcrd%d.yy) >> 8;\n", n,
1085 texcoord, n);
1086
1087 if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BITWISE_OP_NEGATION))
1088 {
1089 out.Write("\tint indtexmtx_w_inverse_%d = -" I_INDTEXMTX "[%d].w;\n", n, mtxidx);
1090 out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1091 mtxidx, n, mtxidx);
1092 out.Write("\telse indtevtrans%d <<= (indtexmtx_w_inverse_%d);\n", n, n);
1093 }
1094 else
1095 {
1096 out.Write("\tif (" I_INDTEXMTX "[%d].w >= 0) indtevtrans%d >>= " I_INDTEXMTX "[%d].w;\n",
1097 mtxidx, n, mtxidx);
1098 out.Write("\telse indtevtrans%d <<= (-" I_INDTEXMTX "[%d].w);\n", n, mtxidx);
1099 }
1100 }
1101 else
1102 {
1103 out.Write("\tint2 indtevtrans%d = int2(0, 0);\n", n);
1104 }
1105 }
1106 else
1107 {
1108 out.Write("\tint2 indtevtrans%d = int2(0, 0);\n", n);
1109 }
1110
1111 // ---------
1112 // Wrapping
1113 // ---------
1114
1115 // TODO: Should the last element be 1 or (1<<7)?
1116 constexpr std::array<const char*, 7> tev_ind_wrap_start{
1117 "0", "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)", "1",
1118 };
1119
1120 // wrap S
1121 if (tevind.sw == ITW_OFF)
1122 {
1123 out.Write("\twrappedcoord.x = fixpoint_uv%d.x;\n", texcoord);
1124 }
1125 else if (tevind.sw == ITW_0)
1126 {
1127 out.Write("\twrappedcoord.x = 0;\n");
1128 }
1129 else
1130 {
1131 out.Write("\twrappedcoord.x = fixpoint_uv%d.x & (%s - 1);\n", texcoord,
1132 tev_ind_wrap_start[tevind.sw]);
1133 }
1134
1135 // wrap T
1136 if (tevind.tw == ITW_OFF)
1137 {
1138 out.Write("\twrappedcoord.y = fixpoint_uv%d.y;\n", texcoord);
1139 }
1140 else if (tevind.tw == ITW_0)
1141 {
1142 out.Write("\twrappedcoord.y = 0;\n");
1143 }
1144 else
1145 {
1146 out.Write("\twrappedcoord.y = fixpoint_uv%d.y & (%s - 1);\n", texcoord,
1147 tev_ind_wrap_start[tevind.tw]);
1148 }
1149
1150 if (tevind.fb_addprev) // add previous tevcoord
1151 out.Write("\ttevcoord.xy += wrappedcoord + indtevtrans%d;\n", n);
1152 else
1153 out.Write("\ttevcoord.xy = wrappedcoord + indtevtrans%d;\n", n);
1154
1155 // Emulate s24 overflows
1156 out.Write("\ttevcoord.xy = (tevcoord.xy << 8) >> 8;\n");
1157 }
1158
1159 TevStageCombiner::ColorCombiner cc;
1160 TevStageCombiner::AlphaCombiner ac;
1161 cc.hex = stage.cc;
1162 ac.hex = stage.ac;
1163
1164 if (cc.a == TEVCOLORARG_RASA || cc.a == TEVCOLORARG_RASC || cc.b == TEVCOLORARG_RASA ||
1165 cc.b == TEVCOLORARG_RASC || cc.c == TEVCOLORARG_RASA || cc.c == TEVCOLORARG_RASC ||
1166 cc.d == TEVCOLORARG_RASA || cc.d == TEVCOLORARG_RASC || ac.a == TEVALPHAARG_RASA ||
1167 ac.b == TEVALPHAARG_RASA || ac.c == TEVALPHAARG_RASA || ac.d == TEVALPHAARG_RASA)
1168 {
1169 // Generate swizzle string to represent the Ras color channel swapping
1170 const char rasswap[5] = {
1171 "rgba"[stage.tevksel_swap1a],
1172 "rgba"[stage.tevksel_swap2a],
1173 "rgba"[stage.tevksel_swap1b],
1174 "rgba"[stage.tevksel_swap2b],
1175 '\0',
1176 };
1177
1178 out.Write("\trastemp = %s.%s;\n", tev_ras_table[stage.tevorders_colorchan], rasswap);
1179 }
1180
1181 if (stage.tevorders_enable)
1182 {
1183 // Generate swizzle string to represent the texture color channel swapping
1184 const char texswap[5] = {
1185 "rgba"[stage.tevksel_swap1c],
1186 "rgba"[stage.tevksel_swap2c],
1187 "rgba"[stage.tevksel_swap1d],
1188 "rgba"[stage.tevksel_swap2d],
1189 '\0',
1190 };
1191
1192 if (!stage.hasindstage)
1193 {
1194 // calc tevcord
1195 if (bHasTexCoord)
1196 out.Write("\ttevcoord.xy = fixpoint_uv%d;\n", texcoord);
1197 else
1198 out.Write("\ttevcoord.xy = int2(0, 0);\n");
1199 }
1200 out.Write("\ttextemp = ");
1201 SampleTexture(out, "float2(tevcoord.xy)", texswap, stage.tevorders_texmap, stereo, ApiType);
1202 }
1203 else
1204 {
1205 out.Write("\ttextemp = int4(255, 255, 255, 255);\n");
1206 }
1207
1208 if (cc.a == TEVCOLORARG_KONST || cc.b == TEVCOLORARG_KONST || cc.c == TEVCOLORARG_KONST ||
1209 cc.d == TEVCOLORARG_KONST || ac.a == TEVALPHAARG_KONST || ac.b == TEVALPHAARG_KONST ||
1210 ac.c == TEVALPHAARG_KONST || ac.d == TEVALPHAARG_KONST)
1211 {
1212 out.Write("\tkonsttemp = int4(%s, %s);\n", tev_ksel_table_c[stage.tevksel_kc],
1213 tev_ksel_table_a[stage.tevksel_ka]);
1214
1215 if (stage.tevksel_kc > 7)
1216 {
1217 out.SetConstantsUsed(C_KCOLORS + ((stage.tevksel_kc - 0xc) % 4),
1218 C_KCOLORS + ((stage.tevksel_kc - 0xc) % 4));
1219 }
1220 if (stage.tevksel_ka > 7)
1221 {
1222 out.SetConstantsUsed(C_KCOLORS + ((stage.tevksel_ka - 0xc) % 4),
1223 C_KCOLORS + ((stage.tevksel_ka - 0xc) % 4));
1224 }
1225 }
1226
1227 if (cc.d == TEVCOLORARG_C0 || cc.d == TEVCOLORARG_A0 || ac.d == TEVALPHAARG_A0)
1228 out.SetConstantsUsed(C_COLORS + 1, C_COLORS + 1);
1229
1230 if (cc.d == TEVCOLORARG_C1 || cc.d == TEVCOLORARG_A1 || ac.d == TEVALPHAARG_A1)
1231 out.SetConstantsUsed(C_COLORS + 2, C_COLORS + 2);
1232
1233 if (cc.d == TEVCOLORARG_C2 || cc.d == TEVCOLORARG_A2 || ac.d == TEVALPHAARG_A2)
1234 out.SetConstantsUsed(C_COLORS + 3, C_COLORS + 3);
1235
1236 if (cc.dest >= GX_TEVREG0)
1237 out.SetConstantsUsed(C_COLORS + cc.dest, C_COLORS + cc.dest);
1238
1239 if (ac.dest >= GX_TEVREG0)
1240 out.SetConstantsUsed(C_COLORS + ac.dest, C_COLORS + ac.dest);
1241
1242 out.Write("\ttevin_a = int4(%s, %s)&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.a],
1243 tev_a_input_table[ac.a]);
1244 out.Write("\ttevin_b = int4(%s, %s)&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.b],
1245 tev_a_input_table[ac.b]);
1246 out.Write("\ttevin_c = int4(%s, %s)&int4(255, 255, 255, 255);\n", tev_c_input_table[cc.c],
1247 tev_a_input_table[ac.c]);
1248 out.Write("\ttevin_d = int4(%s, %s);\n", tev_c_input_table[cc.d], tev_a_input_table[ac.d]);
1249
1250 out.Write("\t// color combine\n");
1251 out.Write("\t%s = clamp(", tev_c_output_table[cc.dest]);
1252 if (cc.bias != TEVBIAS_COMPARE)
1253 {
1254 WriteTevRegular(out, "rgb", cc.bias, cc.op, cc.clamp, cc.shift, false);
1255 }
1256 else
1257 {
1258 constexpr std::array<const char*, 8> function_table{
1259 "((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_GT
1260 "((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_EQ
1261 "((idot(tevin_a.rgb, comp16) > idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : "
1262 "int3(0,0,0))", // TEVCMP_GR16_GT
1263 "((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : "
1264 "int3(0,0,0))", // TEVCMP_GR16_EQ
1265 "((idot(tevin_a.rgb, comp24) > idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : "
1266 "int3(0,0,0))", // TEVCMP_BGR24_GT
1267 "((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : "
1268 "int3(0,0,0))", // TEVCMP_BGR24_EQ
1269 "(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)", // TEVCMP_RGB8_GT
1270 "((int3(1,1,1) - sign(abs(tevin_a.rgb - tevin_b.rgb))) * tevin_c.rgb)" // TEVCMP_RGB8_EQ
1271 };
1272
1273 const int mode = (cc.shift << 1) | cc.op;
1274 out.Write(" tevin_d.rgb + ");
1275 out.Write("%s", function_table[mode]);
1276 }
1277 if (cc.clamp)
1278 out.Write(", int3(0,0,0), int3(255,255,255))");
1279 else
1280 out.Write(", int3(-1024,-1024,-1024), int3(1023,1023,1023))");
1281 out.Write(";\n");
1282
1283 out.Write("\t// alpha combine\n");
1284 out.Write("\t%s = clamp(", tev_a_output_table[ac.dest]);
1285 if (ac.bias != TEVBIAS_COMPARE)
1286 {
1287 WriteTevRegular(out, "a", ac.bias, ac.op, ac.clamp, ac.shift, true);
1288 }
1289 else
1290 {
1291 constexpr std::array<const char*, 8> function_table{
1292 "((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_GT
1293 "((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_EQ
1294 "((idot(tevin_a.rgb, comp16) > idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_GT
1295 "((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_EQ
1296 "((idot(tevin_a.rgb, comp24) > idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_GT
1297 "((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_EQ
1298 "((tevin_a.a > tevin_b.a) ? tevin_c.a : 0)", // TEVCMP_A8_GT
1299 "((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)" // TEVCMP_A8_EQ
1300 };
1301
1302 const int mode = (ac.shift << 1) | ac.op;
1303 out.Write(" tevin_d.a + ");
1304 out.Write("%s", function_table[mode]);
1305 }
1306 if (ac.clamp)
1307 out.Write(", 0, 255)");
1308 else
1309 out.Write(", -1024, 1023)");
1310
1311 out.Write(";\n");
1312 }
1313
WriteTevRegular(ShaderCode & out,const char * components,int bias,int op,int clamp,int shift,bool alpha)1314 static void WriteTevRegular(ShaderCode& out, const char* components, int bias, int op, int clamp,
1315 int shift, bool alpha)
1316 {
1317 constexpr std::array<const char*, 4> tev_scale_table_left{
1318 "", // SCALE_1
1319 " << 1", // SCALE_2
1320 " << 2", // SCALE_4
1321 "", // DIVIDE_2
1322 };
1323
1324 constexpr std::array<const char*, 4> tev_scale_table_right{
1325 "", // SCALE_1
1326 "", // SCALE_2
1327 "", // SCALE_4
1328 " >> 1", // DIVIDE_2
1329 };
1330
1331 // indexed by 2*op+(shift==3)
1332 constexpr std::array<const char*, 4> tev_lerp_bias{
1333 "",
1334 " + 128",
1335 "",
1336 " + 127",
1337 };
1338
1339 constexpr std::array<const char*, 4> tev_bias_table{
1340 "", // ZERO,
1341 " + 128", // ADDHALF,
1342 " - 128", // SUBHALF,
1343 "",
1344 };
1345
1346 constexpr std::array<char, 2> tev_op_table{
1347 '+', // TEVOP_ADD = 0,
1348 '-', // TEVOP_SUB = 1,
1349 };
1350
1351 // Regular TEV stage: (d + bias + lerp(a,b,c)) * scale
1352 // The GameCube/Wii GPU uses a very sophisticated algorithm for scale-lerping:
1353 // - c is scaled from 0..255 to 0..256, which allows dividing the result by 256 instead of 255
1354 // - if scale is bigger than one, it is moved inside the lerp calculation for increased accuracy
1355 // - a rounding bias is added before dividing by 256
1356 out.Write("(((tevin_d.%s%s)%s)", components, tev_bias_table[bias], tev_scale_table_left[shift]);
1357 out.Write(" %c ", tev_op_table[op]);
1358 out.Write("(((((tevin_a.%s<<8) + (tevin_b.%s-tevin_a.%s)*(tevin_c.%s+(tevin_c.%s>>7)))%s)%s)>>8)",
1359 components, components, components, components, components, tev_scale_table_left[shift],
1360 tev_lerp_bias[2 * op + ((shift == 3) == alpha)]);
1361 out.Write(")%s", tev_scale_table_right[shift]);
1362 }
1363
SampleTexture(ShaderCode & out,const char * texcoords,const char * texswap,int texmap,bool stereo,APIType ApiType)1364 static void SampleTexture(ShaderCode& out, const char* texcoords, const char* texswap, int texmap,
1365 bool stereo, APIType ApiType)
1366 {
1367 out.SetConstantsUsed(C_TEXDIMS + texmap, C_TEXDIMS + texmap);
1368
1369 if (ApiType == APIType::D3D)
1370 {
1371 out.Write("iround(255.0 * Tex[%d].Sample(samp[%d], float3(%s.xy * " I_TEXDIMS
1372 "[%d].xy, %s))).%s;\n",
1373 texmap, texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap);
1374 }
1375 else
1376 {
1377 out.Write("iround(255.0 * texture(samp[%d], float3(%s.xy * " I_TEXDIMS "[%d].xy, %s))).%s;\n",
1378 texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap);
1379 }
1380 }
1381
// printf-style templates for one half of the alpha test, indexed by compare function.
// The single %s is replaced with the reference value; NEVER and ALWAYS take no argument.
constexpr std::array<const char*, 8> tev_alpha_funcs_table = {{
    "(false)",         // 0: NEVER
    "(prev.a <  %s)",  // 1: LESS
    "(prev.a == %s)",  // 2: EQUAL
    "(prev.a <= %s)",  // 3: LEQUAL
    "(prev.a >  %s)",  // 4: GREATER
    "(prev.a != %s)",  // 5: NEQUAL
    "(prev.a >= %s)",  // 6: GEQUAL
    "(true)",          // 7: ALWAYS
}};
1392
// Operator joining the two alpha comparisons, indexed by the alpha test logic op.
// Both operands are booleans, so != acts as xor and == as xnor.
constexpr std::array<const char*, 4> tev_alpha_funclogic_table = {{
    " && ",  // 0: and
    " || ",  // 1: or
    " != ",  // 2: xor
    " == ",  // 3: xnor
}};
1399
WriteAlphaTest(ShaderCode & out,const pixel_shader_uid_data * uid_data,APIType ApiType,bool per_pixel_depth,bool use_dual_source)1400 static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType ApiType,
1401 bool per_pixel_depth, bool use_dual_source)
1402 {
1403 static constexpr std::array<const char*, 2> alpha_ref{I_ALPHA ".r", I_ALPHA ".g"};
1404
1405 out.SetConstantsUsed(C_ALPHA, C_ALPHA);
1406
1407 if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_NEGATED_BOOLEAN))
1408 out.Write("\tif(( ");
1409 else
1410 out.Write("\tif(!( ");
1411
1412 // Lookup the first component from the alpha function table
1413 int compindex = uid_data->alpha_test_comp0;
1414 out.Write(tev_alpha_funcs_table[compindex], alpha_ref[0]);
1415
1416 // Lookup the logic op
1417 out.Write("%s", tev_alpha_funclogic_table[uid_data->alpha_test_logic]);
1418
1419 // Lookup the second component from the alpha function table
1420 compindex = uid_data->alpha_test_comp1;
1421 out.Write(tev_alpha_funcs_table[compindex], alpha_ref[1]);
1422
1423 if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_NEGATED_BOOLEAN))
1424 out.Write(") == false) {\n");
1425 else
1426 out.Write(")) {\n");
1427
1428 out.Write("\t\tocol0 = float4(0.0, 0.0, 0.0, 0.0);\n");
1429 if (use_dual_source && !(ApiType == APIType::D3D && uid_data->uint_output))
1430 out.Write("\t\tocol1 = float4(0.0, 0.0, 0.0, 0.0);\n");
1431 if (per_pixel_depth)
1432 {
1433 out.Write("\t\tdepth = %s;\n",
1434 !g_ActiveConfig.backend_info.bSupportsReversedDepthRange ? "0.0" : "1.0");
1435 }
1436
1437 // ZCOMPLOC HACK:
1438 if (!uid_data->alpha_test_use_zcomploc_hack)
1439 {
1440 out.Write("\t\tdiscard;\n");
1441 if (ApiType == APIType::D3D)
1442 out.Write("\t\treturn;\n");
1443 }
1444
1445 out.Write("\t}\n");
1446 }
1447
// Shader statements that reshape the linear `fog` factor, indexed by fog type.
// Entries 0-3 are empty: those types need no additional statement.
constexpr std::array<const char*, 8> tev_fog_funcs_table = {{
    "",                                                        // 0: No Fog
    "",                                                        // 1: ?
    "",                                                        // 2: Linear
    "",                                                        // 3: ?
    "\tfog = 1.0 - exp2(-8.0 * fog);\n",                       // 4: exp
    "\tfog = 1.0 - exp2(-8.0 * fog * fog);\n",                 // 5: exp2
    "\tfog = exp2(-8.0 * (1.0 - fog));\n",                     // 6: backward exp
    "\tfog = 1.0 - fog;\n   fog = exp2(-8.0 * fog * fog);\n",  // 7: backward exp2
}};
1458
WriteFog(ShaderCode & out,const pixel_shader_uid_data * uid_data)1459 static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data)
1460 {
1461 if (uid_data->fog_fsel == 0)
1462 return; // no Fog
1463
1464 out.SetConstantsUsed(C_FOGCOLOR, C_FOGCOLOR);
1465 out.SetConstantsUsed(C_FOGI, C_FOGI);
1466 out.SetConstantsUsed(C_FOGF, C_FOGF + 1);
1467 if (uid_data->fog_proj == 0)
1468 {
1469 // perspective
1470 // ze = A/(B - (Zs >> B_SHF)
1471 // TODO: Verify that we want to drop lower bits here! (currently taken over from software
1472 // renderer)
1473 // Maybe we want to use "ze = (A << B_SHF)/((B << B_SHF) - Zs)" instead?
1474 // That's equivalent, but keeps the lower bits of Zs.
1475 out.Write("\tfloat ze = (" I_FOGF ".x * 16777216.0) / float(" I_FOGI ".y - (zCoord >> " I_FOGI
1476 ".w));\n");
1477 }
1478 else
1479 {
1480 // orthographic
1481 // ze = a*Zs (here, no B_SHF)
1482 out.Write("\tfloat ze = " I_FOGF ".x * float(zCoord) / 16777216.0;\n");
1483 }
1484
1485 // x_adjust = sqrt((x-center)^2 + k^2)/k
1486 // ze *= x_adjust
1487 if (uid_data->fog_RangeBaseEnabled)
1488 {
1489 out.SetConstantsUsed(C_FOGF, C_FOGF);
1490 out.Write("\tfloat offset = (2.0 * (rawpos.x / " I_FOGF ".w)) - 1.0 - " I_FOGF ".z;\n");
1491 out.Write("\tfloat floatindex = clamp(9.0 - abs(offset) * 9.0, 0.0, 9.0);\n");
1492 out.Write("\tuint indexlower = uint(floatindex);\n");
1493 out.Write("\tuint indexupper = indexlower + 1u;\n");
1494 out.Write("\tfloat klower = " I_FOGRANGE "[indexlower >> 2u][indexlower & 3u];\n");
1495 out.Write("\tfloat kupper = " I_FOGRANGE "[indexupper >> 2u][indexupper & 3u];\n");
1496 out.Write("\tfloat k = lerp(klower, kupper, frac(floatindex));\n");
1497 out.Write("\tfloat x_adjust = sqrt(offset * offset + k * k) / k;\n");
1498 out.Write("\tze *= x_adjust;\n");
1499 }
1500
1501 out.Write("\tfloat fog = clamp(ze - " I_FOGF ".y, 0.0, 1.0);\n");
1502
1503 if (uid_data->fog_fsel > 3)
1504 {
1505 out.Write("%s", tev_fog_funcs_table[uid_data->fog_fsel]);
1506 }
1507 else
1508 {
1509 if (uid_data->fog_fsel != 2)
1510 WARN_LOG(VIDEO, "Unknown Fog Type! %08x", uid_data->fog_fsel);
1511 }
1512
1513 out.Write("\tint ifog = iround(fog * 256.0);\n");
1514 out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n");
1515 }
1516
WriteColor(ShaderCode & out,APIType api_type,const pixel_shader_uid_data * uid_data,bool use_dual_source)1517 static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
1518 bool use_dual_source)
1519 {
1520 // D3D requires that the shader outputs be uint when writing to a uint render target for logic op.
1521 if (api_type == APIType::D3D && uid_data->uint_output)
1522 {
1523 if (uid_data->rgba6_format)
1524 out.Write("\tocol0 = uint4(prev & 0xFC);\n");
1525 else
1526 out.Write("\tocol0 = uint4(prev);\n");
1527 return;
1528 }
1529
1530 if (uid_data->rgba6_format)
1531 out.Write("\tocol0.rgb = float3(prev.rgb >> 2) / 63.0;\n");
1532 else
1533 out.Write("\tocol0.rgb = float3(prev.rgb) / 255.0;\n");
1534
1535 // Colors will be blended against the 8-bit alpha from ocol1 and
1536 // the 6-bit alpha from ocol0 will be written to the framebuffer
1537 if (uid_data->useDstAlpha)
1538 {
1539 out.SetConstantsUsed(C_ALPHA, C_ALPHA);
1540 out.Write("\tocol0.a = float(" I_ALPHA ".a >> 2) / 63.0;\n");
1541
1542 // Use dual-source color blending to perform dst alpha in a single pass
1543 if (use_dual_source)
1544 out.Write("\tocol1 = float4(0.0, 0.0, 0.0, float(prev.a) / 255.0);\n");
1545 }
1546 else
1547 {
1548 out.Write("\tocol0.a = float(prev.a >> 2) / 63.0;\n");
1549 if (use_dual_source)
1550 out.Write("\tocol1 = float4(0.0, 0.0, 0.0, float(prev.a) / 255.0);\n");
1551 }
1552 }
1553
WriteBlend(ShaderCode & out,const pixel_shader_uid_data * uid_data)1554 static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data)
1555 {
1556 if (uid_data->blend_enable)
1557 {
1558 static constexpr std::array<const char*, 8> blend_src_factor{
1559 "float3(0,0,0);", // ZERO
1560 "float3(1,1,1);", // ONE
1561 "initial_ocol0.rgb;", // DSTCLR
1562 "float3(1,1,1) - initial_ocol0.rgb;", // INVDSTCLR
1563 "ocol1.aaa;", // SRCALPHA
1564 "float3(1,1,1) - ocol1.aaa;", // INVSRCALPHA
1565 "initial_ocol0.aaa;", // DSTALPHA
1566 "float3(1,1,1) - initial_ocol0.aaa;", // INVDSTALPHA
1567 };
1568 static constexpr std::array<const char*, 8> blend_src_factor_alpha{
1569 "0.0;", // ZERO
1570 "1.0;", // ONE
1571 "initial_ocol0.a;", // DSTCLR
1572 "1.0 - initial_ocol0.a;", // INVDSTCLR
1573 "ocol1.a;", // SRCALPHA
1574 "1.0 - ocol1.a;", // INVSRCALPHA
1575 "initial_ocol0.a;", // DSTALPHA
1576 "1.0 - initial_ocol0.a;", // INVDSTALPHA
1577 };
1578 static constexpr std::array<const char*, 8> blend_dst_factor{
1579 "float3(0,0,0);", // ZERO
1580 "float3(1,1,1);", // ONE
1581 "ocol0.rgb;", // SRCCLR
1582 "float3(1,1,1) - ocol0.rgb;", // INVSRCCLR
1583 "ocol1.aaa;", // SRCALHA
1584 "float3(1,1,1) - ocol1.aaa;", // INVSRCALPHA
1585 "initial_ocol0.aaa;", // DSTALPHA
1586 "float3(1,1,1) - initial_ocol0.aaa;", // INVDSTALPHA
1587 };
1588 static constexpr std::array<const char*, 8> blend_dst_factor_alpha{
1589 "0.0;", // ZERO
1590 "1.0;", // ONE
1591 "ocol0.a;", // SRCCLR
1592 "1.0 - ocol0.a;", // INVSRCCLR
1593 "ocol1.a;", // SRCALPHA
1594 "1.0 - ocol1.a;", // INVSRCALPHA
1595 "initial_ocol0.a;", // DSTALPHA
1596 "1.0 - initial_ocol0.a;", // INVDSTALPHA
1597 };
1598 out.Write("\tfloat4 blend_src;\n");
1599 out.Write("\tblend_src.rgb = %s\n", blend_src_factor[uid_data->blend_src_factor]);
1600 out.Write("\tblend_src.a = %s\n", blend_src_factor_alpha[uid_data->blend_src_factor_alpha]);
1601 out.Write("\tfloat4 blend_dst;\n");
1602 out.Write("\tblend_dst.rgb = %s\n", blend_dst_factor[uid_data->blend_dst_factor]);
1603 out.Write("\tblend_dst.a = %s\n", blend_dst_factor_alpha[uid_data->blend_dst_factor_alpha]);
1604
1605 out.Write("\tfloat4 blend_result;\n");
1606 if (uid_data->blend_subtract)
1607 {
1608 out.Write("\tblend_result.rgb = initial_ocol0.rgb * blend_dst.rgb - ocol0.rgb * "
1609 "blend_src.rgb;\n");
1610 }
1611 else
1612 {
1613 out.Write(
1614 "\tblend_result.rgb = initial_ocol0.rgb * blend_dst.rgb + ocol0.rgb * blend_src.rgb;\n");
1615 }
1616
1617 if (uid_data->blend_subtract_alpha)
1618 out.Write("\tblend_result.a = initial_ocol0.a * blend_dst.a - ocol0.a * blend_src.a;\n");
1619 else
1620 out.Write("\tblend_result.a = initial_ocol0.a * blend_dst.a + ocol0.a * blend_src.a;\n");
1621 }
1622 else
1623 {
1624 out.Write("\tfloat4 blend_result = ocol0;\n");
1625 }
1626
1627 out.Write("\treal_ocol0 = blend_result;\n");
1628 }
1629