1 // Copyright (c) 2012- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include <cstdio>
19 #include <sstream>
20 
21 #include "Common/Log.h"
22 #include "Common/StringUtils.h"
23 #include "Common/GPU/OpenGL/GLFeatures.h"
24 #include "Common/GPU/ShaderWriter.h"
25 #include "Common/GPU/thin3d.h"
26 #include "Core/Reporting.h"
27 #include "Core/Config.h"
28 #include "GPU/Common/GPUStateUtils.h"
29 #include "GPU/Common/ShaderId.h"
30 #include "GPU/Common/ShaderUniforms.h"
31 #include "GPU/Common/FragmentShaderGenerator.h"
32 #include "GPU/ge_constants.h"
33 #include "GPU/GPUState.h"
34 
// Convenience wrapper for emitting shader source text: WRITE(p, ...) expands to
// p.F(...), forwarding every argument unchanged. Call sites in this file pass a
// printf-style format string followed by its arguments.
35 #define WRITE(p, ...) p.F(__VA_ARGS__)
36 
GenerateFragmentShader(const FShaderID & id,char * buffer,const ShaderLanguageDesc & compat,Draw::Bugs bugs,uint64_t * uniformMask,std::string * errorString)37 bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLanguageDesc &compat, Draw::Bugs bugs, uint64_t *uniformMask, std::string *errorString) {
38 	*uniformMask = 0;
39 	errorString->clear();
40 
41 	bool highpFog = false;
42 	bool highpTexcoord = false;
43 	bool enableFragmentTestCache = g_Config.bFragmentTestCache && ShaderLanguageIsOpenGL(compat.shaderLanguage);
44 
45 	if (compat.gles) {
46 		// PowerVR needs highp to do the fog in MHU correctly.
47 		// Others don't, and some can't handle highp in the fragment shader.
48 		highpFog = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) ? true : false;
49 		highpTexcoord = highpFog;
50 	}
51 
52 	ReplaceAlphaType stencilToAlpha = static_cast<ReplaceAlphaType>(id.Bits(FS_BIT_STENCIL_TO_ALPHA, 2));
53 
54 	std::vector<const char*> gl_exts;
55 	if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
56 		if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE && gl_extensions.EXT_blend_func_extended) {
57 			gl_exts.push_back("#extension GL_EXT_blend_func_extended : require");
58 		}
59 		if (gl_extensions.EXT_gpu_shader4) {
60 			gl_exts.push_back("#extension GL_EXT_gpu_shader4 : enable");
61 		}
62 		if (compat.framebufferFetchExtension) {
63 			gl_exts.push_back(compat.framebufferFetchExtension);
64 		}
65 	}
66 
67 	ShaderWriter p(buffer, compat, ShaderStage::Fragment, gl_exts.data(), gl_exts.size());
68 
69 	bool lmode = id.Bit(FS_BIT_LMODE);
70 	bool doTexture = id.Bit(FS_BIT_DO_TEXTURE);
71 	bool enableFog = id.Bit(FS_BIT_ENABLE_FOG);
72 	bool enableAlphaTest = id.Bit(FS_BIT_ALPHA_TEST);
73 
74 	bool alphaTestAgainstZero = id.Bit(FS_BIT_ALPHA_AGAINST_ZERO);
75 	bool testForceToZero = id.Bit(FS_BIT_TEST_DISCARD_TO_ZERO);
76 	bool enableColorTest = id.Bit(FS_BIT_COLOR_TEST);
77 	bool colorTestAgainstZero = id.Bit(FS_BIT_COLOR_AGAINST_ZERO);
78 	bool enableColorDoubling = id.Bit(FS_BIT_COLOR_DOUBLE);
79 	bool doTextureProjection = id.Bit(FS_BIT_DO_TEXTURE_PROJ);
80 	bool doTextureAlpha = id.Bit(FS_BIT_TEXALPHA);
81 
82 	bool flatBug = bugs.Has(Draw::Bugs::BROKEN_FLAT_IN_SHADER) && g_Config.bVendorBugChecksEnabled;
83 
84 	bool doFlatShading = id.Bit(FS_BIT_FLATSHADE) && !flatBug;
85 	bool shaderDepal = id.Bit(FS_BIT_SHADER_DEPAL);
86 	bool bgraTexture = id.Bit(FS_BIT_BGRA_TEXTURE);
87 	bool colorWriteMask = id.Bit(FS_BIT_COLOR_WRITEMASK) && compat.bitwiseOps;
88 
89 	GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);
90 	GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);
91 	bool needShaderTexClamp = id.Bit(FS_BIT_SHADER_TEX_CLAMP);
92 
93 	GETexFunc texFunc = (GETexFunc)id.Bits(FS_BIT_TEXFUNC, 3);
94 	bool textureAtOffset = id.Bit(FS_BIT_TEXTURE_AT_OFFSET);
95 
96 	ReplaceBlendType replaceBlend = static_cast<ReplaceBlendType>(id.Bits(FS_BIT_REPLACE_BLEND, 3));
97 
98 	GEBlendSrcFactor replaceBlendFuncA = (GEBlendSrcFactor)id.Bits(FS_BIT_BLENDFUNC_A, 4);
99 	GEBlendDstFactor replaceBlendFuncB = (GEBlendDstFactor)id.Bits(FS_BIT_BLENDFUNC_B, 4);
100 	GEBlendMode replaceBlendEq = (GEBlendMode)id.Bits(FS_BIT_BLENDEQ, 3);
101 	StencilValueType replaceAlphaWithStencilType = (StencilValueType)id.Bits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4);
102 
103 	bool isModeClear = id.Bit(FS_BIT_CLEARMODE);
104 
105 	const char *shading = "";
106 	if (compat.glslES30 || compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN)
107 		shading = doFlatShading ? "flat" : "";
108 
109 	bool earlyFragmentTests = ((!enableAlphaTest && !enableColorTest) || testForceToZero) && !gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
110 	bool useAdrenoBugWorkaround = id.Bit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL);
111 
112 	bool readFramebuffer = replaceBlend == REPLACE_BLEND_COPY_FBO || colorWriteMask;
113 	bool readFramebufferTex = readFramebuffer && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);
114 
115 	bool needFragCoord = readFramebuffer || gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
116 	bool writeDepth = gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
117 
118 	if (shaderDepal && !doTexture) {
119 		*errorString = "depal requires a texture";
120 		return false;
121 	}
122 
123 	if (readFramebuffer && compat.shaderLanguage == HLSL_D3D9) {
124 		*errorString = "Framebuffer read not yet supported in HLSL D3D9";
125 		return false;
126 	}
127 
128 	if (compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN) {
129 		if (earlyFragmentTests) {
130 			WRITE(p, "layout (early_fragment_tests) in;\n");
131 		} else if (useAdrenoBugWorkaround && !gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {
132 			WRITE(p, "layout (depth_unchanged) out float gl_FragDepth;\n");
133 		}
134 
135 		WRITE(p, "layout (std140, set = 0, binding = 3) uniform baseUBO {\n%s};\n", ub_baseStr);
136 		if (doTexture) {
137 			WRITE(p, "layout (binding = 0) uniform sampler2D tex;\n");
138 		}
139 
140 		if (readFramebufferTex) {
141 			WRITE(p, "layout (binding = 1) uniform sampler2D fbotex;\n");
142 		}
143 
144 		if (shaderDepal) {
145 			WRITE(p, "layout (binding = 2) uniform sampler2D pal;\n");
146 		}
147 
148 		// Note: the precision qualifiers must match the vertex shader!
149 		WRITE(p, "layout (location = 1) %s in lowp vec4 v_color0;\n", shading);
150 		if (lmode)
151 			WRITE(p, "layout (location = 2) %s in lowp vec3 v_color1;\n", shading);
152 		if (enableFog) {
153 			WRITE(p, "layout (location = 3) in highp float v_fogdepth;\n");
154 		}
155 		if (doTexture) {
156 			WRITE(p, "layout (location = 0) in highp vec3 v_texcoord;\n");
157 		}
158 
159 		if (enableAlphaTest && !alphaTestAgainstZero) {
160 			WRITE(p, "int roundAndScaleTo255i(in highp float x) { return int(floor(x * 255.0 + 0.5)); }\n");
161 		}
162 		if (enableColorTest && !colorTestAgainstZero) {
163 			WRITE(p, "ivec3 roundAndScaleTo255iv(in highp vec3 x) { return ivec3(floor(x * 255.0 + 0.5)); }\n");
164 		}
165 
166 		WRITE(p, "layout (location = 0, index = 0) out vec4 fragColor0;\n");
167 		if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {
168 			WRITE(p, "layout (location = 0, index = 1) out vec4 fragColor1;\n");
169 		}
170 	} else if (compat.shaderLanguage == HLSL_D3D11 || compat.shaderLanguage == HLSL_D3D9) {
171 		if (compat.shaderLanguage == HLSL_D3D9) {
172 			if (doTexture)
173 				WRITE(p, "sampler tex : register(s0);\n");
174 
175 			if (readFramebufferTex) {
176 				WRITE(p, "vec2 u_fbotexSize : register(c%i);\n", CONST_PS_FBOTEXSIZE);
177 				WRITE(p, "sampler fbotex : register(s1);\n");
178 			}
179 
180 			if (replaceBlend > REPLACE_BLEND_STANDARD) {
181 				if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {
182 					WRITE(p, "float3 u_blendFixA : register(c%i);\n", CONST_PS_BLENDFIXA);
183 				}
184 				if (replaceBlendFuncB >= GE_DSTBLEND_FIXB) {
185 					WRITE(p, "float3 u_blendFixB : register(c%i);\n", CONST_PS_BLENDFIXB);
186 				}
187 			}
188 			if (needShaderTexClamp && doTexture) {
189 				WRITE(p, "vec4 u_texclamp : register(c%i);\n", CONST_PS_TEXCLAMP);
190 				if (textureAtOffset) {
191 					WRITE(p, "vec2 u_texclampoff : register(c%i);\n", CONST_PS_TEXCLAMPOFF);
192 				}
193 			}
194 
195 			if (enableAlphaTest || enableColorTest) {
196 				WRITE(p, "vec4 u_alphacolorref : register(c%i);\n", CONST_PS_ALPHACOLORREF);
197 				WRITE(p, "vec4 u_alphacolormask : register(c%i);\n", CONST_PS_ALPHACOLORMASK);
198 			}
199 			if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {
200 				WRITE(p, "float u_stencilReplaceValue : register(c%i);\n", CONST_PS_STENCILREPLACE);
201 			}
202 			if (doTexture && texFunc == GE_TEXFUNC_BLEND) {
203 				WRITE(p, "float3 u_texenv : register(c%i);\n", CONST_PS_TEXENV);
204 			}
205 			if (enableFog) {
206 				WRITE(p, "float3 u_fogcolor : register(c%i);\n", CONST_PS_FOGCOLOR);
207 			}
208 		} else {
209 			WRITE(p, "SamplerState samp : register(s0);\n");
210 			WRITE(p, "Texture2D<vec4> tex : register(t0);\n");
211 			if (readFramebufferTex) {
212 				// No sampler required, we Load
213 				WRITE(p, "Texture2D<vec4> fboTex : register(t1);\n");
214 			}
215 			WRITE(p, "cbuffer base : register(b0) {\n%s};\n", ub_baseStr);
216 		}
217 
218 		if (enableAlphaTest) {
219 			if (compat.shaderLanguage == HLSL_D3D11) {
220 				WRITE(p, "int roundAndScaleTo255i(float x) { return int(floor(x * 255.0f + 0.5f)); }\n");
221 			} else {
222 				// D3D11 level 9 gets to take this path.
223 				WRITE(p, "float roundAndScaleTo255f(float x) { return floor(x * 255.0f + 0.5f); }\n");
224 			}
225 		}
226 		if (enableColorTest) {
227 			if (compat.shaderLanguage == HLSL_D3D11) {
228 				WRITE(p, "uvec3 roundAndScaleTo255iv(float3 x) { return (floor(x * 255.0f + 0.5f)); }\n");
229 			} else {
230 				WRITE(p, "vec3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");
231 			}
232 		}
233 
234 		WRITE(p, "struct PS_IN {\n");
235 		if (doTexture) {
236 			WRITE(p, "  vec3 v_texcoord: TEXCOORD0;\n");
237 		}
238 		const char *colorInterpolation = doFlatShading && compat.shaderLanguage == HLSL_D3D11 ? "nointerpolation " : "";
239 		WRITE(p, "  %svec4 v_color0: COLOR0;\n", colorInterpolation);
240 		if (lmode) {
241 			WRITE(p, "  vec3 v_color1: COLOR1;\n");
242 		}
243 		if (enableFog) {
244 			WRITE(p, "  float v_fogdepth: TEXCOORD1;\n");
245 		}
246 		if (compat.shaderLanguage == HLSL_D3D11 && needFragCoord) {
247 			WRITE(p, "  vec4 pixelPos : SV_POSITION;\n");
248 		}
249 		WRITE(p, "};\n");
250 
251 		if (compat.shaderLanguage == HLSL_D3D11) {
252 			WRITE(p, "struct PS_OUT {\n");
253 			if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {
254 				WRITE(p, "  vec4 target : SV_Target0;\n");
255 				WRITE(p, "  vec4 target1 : SV_Target1;\n");
256 			} else {
257 				WRITE(p, "  vec4 target : SV_Target;\n");
258 			}
259 			if (writeDepth) {
260 				WRITE(p, "  float depth : SV_Depth;\n");
261 			}
262 			WRITE(p, "};\n");
263 		}
264 	} else if (compat.shaderLanguage == HLSL_D3D9) {
265 		if (doTexture)
266 			WRITE(p, "sampler tex : register(s0);\n");
267 		if (readFramebufferTex) {
268 			WRITE(p, "vec2 u_fbotexSize : register(c%i);\n", CONST_PS_FBOTEXSIZE);
269 			WRITE(p, "sampler fbotex : register(s1);\n");
270 		}
271 		if (replaceBlend > REPLACE_BLEND_STANDARD) {
272 			if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {
273 				WRITE(p, "float3 u_blendFixA : register(c%i);\n", CONST_PS_BLENDFIXA);
274 			}
275 			if (replaceBlendFuncB >= GE_DSTBLEND_FIXB) {
276 				WRITE(p, "float3 u_blendFixB : register(c%i);\n", CONST_PS_BLENDFIXB);
277 			}
278 		}
279 		if (needShaderTexClamp && doTexture) {
280 			WRITE(p, "vec4 u_texclamp : register(c%i);\n", CONST_PS_TEXCLAMP);
281 			if (textureAtOffset) {
282 				WRITE(p, "vec2 u_texclampoff : register(c%i);\n", CONST_PS_TEXCLAMPOFF);
283 			}
284 		}
285 
286 		if (enableAlphaTest || enableColorTest) {
287 			WRITE(p, "vec4 u_alphacolorref : register(c%i);\n", CONST_PS_ALPHACOLORREF);
288 			WRITE(p, "vec4 u_alphacolormask : register(c%i);\n", CONST_PS_ALPHACOLORMASK);
289 		}
290 		if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {
291 			WRITE(p, "float u_stencilReplaceValue : register(c%i);\n", CONST_PS_STENCILREPLACE);
292 		}
293 		if (doTexture && texFunc == GE_TEXFUNC_BLEND) {
294 			WRITE(p, "float3 u_texenv : register(c%i);\n", CONST_PS_TEXENV);
295 		}
296 		if (enableFog) {
297 			WRITE(p, "float3 u_fogcolor : register(c%i);\n", CONST_PS_FOGCOLOR);
298 		}
299 	} else if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
300 		if ((shaderDepal || colorWriteMask) && gl_extensions.IsGLES) {
301 			WRITE(p, "precision highp int;\n");
302 		}
303 
304 		if (doTexture)
305 			WRITE(p, "uniform sampler2D tex;\n");
306 
307 		if (readFramebufferTex) {
308 			if (!compat.texelFetch) {
309 				WRITE(p, "uniform vec2 u_fbotexSize;\n");
310 			}
311 			WRITE(p, "uniform sampler2D fbotex;\n");
312 		}
313 
314 		if (!isModeClear && replaceBlend > REPLACE_BLEND_STANDARD) {
315 			*uniformMask |= DIRTY_SHADERBLEND;
316 			if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {
317 				WRITE(p, "uniform vec3 u_blendFixA;\n");
318 			}
319 			if (replaceBlendFuncB >= GE_DSTBLEND_FIXB) {
320 				WRITE(p, "uniform vec3 u_blendFixB;\n");
321 			}
322 		}
323 
324 		if (needShaderTexClamp && doTexture) {
325 			*uniformMask |= DIRTY_TEXCLAMP;
326 			WRITE(p, "uniform vec4 u_texclamp;\n");
327 			if (id.Bit(FS_BIT_TEXTURE_AT_OFFSET)) {
328 				WRITE(p, "uniform vec2 u_texclampoff;\n");
329 			}
330 		}
331 
332 		if (enableAlphaTest || enableColorTest) {
333 			if (enableFragmentTestCache) {
334 				WRITE(p, "uniform sampler2D testtex;\n");
335 			} else {
336 				*uniformMask |= DIRTY_ALPHACOLORREF;
337 				WRITE(p, "uniform vec4 u_alphacolorref;\n");
338 				if (compat.bitwiseOps && ((enableColorTest && !colorTestAgainstZero) || (enableAlphaTest && !alphaTestAgainstZero))) {
339 					*uniformMask |= DIRTY_ALPHACOLORMASK;
340 					WRITE(p, "uniform ivec4 u_alphacolormask;\n");
341 				}
342 			}
343 		}
344 
345 		if (shaderDepal) {
346 			WRITE(p, "uniform sampler2D pal;\n");
347 			WRITE(p, "uniform uint u_depal_mask_shift_off_fmt;\n");
348 			*uniformMask |= DIRTY_DEPAL;
349 		}
350 
351 		if (colorWriteMask) {
352 			WRITE(p, "uniform uint u_colorWriteMask;\n");
353 			*uniformMask |= DIRTY_COLORWRITEMASK;
354 		}
355 
356 		if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {
357 			*uniformMask |= DIRTY_STENCILREPLACEVALUE;
358 			WRITE(p, "uniform float u_stencilReplaceValue;\n");
359 		}
360 		if (doTexture && texFunc == GE_TEXFUNC_BLEND) {
361 			*uniformMask |= DIRTY_TEXENV;
362 			WRITE(p, "uniform vec3 u_texenv;\n");
363 		}
364 
365 		WRITE(p, "%s %s lowp vec4 v_color0;\n", shading, compat.varying_fs);
366 		if (lmode)
367 			WRITE(p, "%s %s lowp vec3 v_color1;\n", shading, compat.varying_fs);
368 		if (enableFog) {
369 			*uniformMask |= DIRTY_FOGCOLOR;
370 			WRITE(p, "uniform vec3 u_fogcolor;\n");
371 			WRITE(p, "%s %s float v_fogdepth;\n", compat.varying_fs, highpFog ? "highp" : "mediump");
372 		}
373 		if (doTexture) {
374 			WRITE(p, "%s %s vec3 v_texcoord;\n", compat.varying_fs, highpTexcoord ? "highp" : "mediump");
375 		}
376 
377 		if (!enableFragmentTestCache) {
378 			if (enableAlphaTest && !alphaTestAgainstZero) {
379 				if (compat.bitwiseOps) {
380 					WRITE(p, "int roundAndScaleTo255i(in float x) { return int(floor(x * 255.0 + 0.5)); }\n");
381 				} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
382 					WRITE(p, "float roundTo255thf(in mediump float x) { mediump float y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");
383 				} else {
384 					WRITE(p, "float roundAndScaleTo255f(in float x) { return floor(x * 255.0 + 0.5); }\n");
385 				}
386 			}
387 			if (enableColorTest && !colorTestAgainstZero) {
388 				if (compat.bitwiseOps) {
389 					WRITE(p, "ivec3 roundAndScaleTo255iv(in vec3 x) { return ivec3(floor(x * 255.0 + 0.5)); }\n");
390 				} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
391 					WRITE(p, "vec3 roundTo255thv(in vec3 x) { vec3 y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");
392 				} else {
393 					WRITE(p, "vec3 roundAndScaleTo255v(in vec3 x) { return floor(x * 255.0 + 0.5); }\n");
394 				}
395 			}
396 		}
397 
398 		if (!strcmp(compat.fragColor0, "fragColor0")) {
399 			const char *qualifierColor0 = "out";
400 			if (readFramebuffer && compat.lastFragData && !strcmp(compat.lastFragData, compat.fragColor0)) {
401 				qualifierColor0 = "inout";
402 			}
403 			// Output the output color definitions.
404 			if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {
405 				WRITE(p, "%s vec4 fragColor0;\n", qualifierColor0);
406 				WRITE(p, "out vec4 fragColor1;\n");
407 			} else {
408 				WRITE(p, "%s vec4 fragColor0;\n", qualifierColor0);
409 			}
410 		}
411 	}
412 
413 	bool hasPackUnorm4x8 = false;
414 	if (compat.shaderLanguage == GLSL_VULKAN) {
415 		hasPackUnorm4x8 = true;
416 	} else if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
417 		if (compat.gles) {
418 			hasPackUnorm4x8 = compat.glslVersionNumber >= 310;
419 		} else {
420 			hasPackUnorm4x8 = compat.glslVersionNumber >= 400;
421 		}
422 	}
423 
424 	// Provide implementations of packUnorm4x8 and unpackUnorm4x8 if not available.
425 	if (colorWriteMask && !hasPackUnorm4x8) {
426 		WRITE(p, "uint packUnorm4x8(%svec4 v) {\n", compat.shaderLanguage == GLSL_VULKAN ? "highp " : "");
427 		WRITE(p, "  highp vec4 f = clamp(v, 0.0, 1.0);\n");
428 		WRITE(p, "  uvec4 u = uvec4(255.0 * f);\n");
429 		WRITE(p, "  return u.x | (u.y << 8) | (u.z << 16) | (u.w << 24);\n");
430 		WRITE(p, "}\n");
431 
432 		WRITE(p, "vec4 unpackUnorm4x8(highp uint x) {\n");
433 		WRITE(p, "  highp uvec4 u = uvec4(x & 0xFFU, (x >> 8) & 0xFFU, (x >> 16) & 0xFFU, (x >> 24) & 0xFFU);\n");
434 		WRITE(p, "  highp vec4 f = vec4(u);\n");
435 		WRITE(p, "  return f * (1.0 / 255.0);\n");
436 		WRITE(p, "}\n");
437 	}
438 
439 	// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one.
440 	if ((gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) && needShaderTexClamp) {
441 		WRITE(p, "float mymod(float a, float b) { return a - b * floor(a / b); }\n");
442 	}
443 
444 	if (compat.shaderLanguage == HLSL_D3D11) {
445 		WRITE(p, "PS_OUT main( PS_IN In ) {\n");
446 		WRITE(p, "  PS_OUT outfragment;\n");
447 		if (needFragCoord) {
448 			WRITE(p, "  vec4 gl_FragCoord = In.pixelPos;\n");
449 		}
450 		if (writeDepth) {
451 			WRITE(p, "  float gl_FragDepth;\n");
452 		}
453 	} else if (compat.shaderLanguage == HLSL_D3D9) {
454 		WRITE(p, "vec4 main( PS_IN In ) : COLOR {\n");
455 		WRITE(p, "  vec4 target;\n");
456 	} else {
457 		WRITE(p, "void main() {\n");
458 	}
459 
460 	if (compat.shaderLanguage == HLSL_D3D11 || compat.shaderLanguage == HLSL_D3D9) {
461 		WRITE(p, "  vec4 v_color0 = In.v_color0;\n");
462 		if (lmode)
463 			WRITE(p, "  vec3 v_color1 = In.v_color1;\n");
464 		if (enableFog) {
465 			WRITE(p, "  float v_fogdepth = In.v_fogdepth;\n");
466 		}
467 		if (doTexture) {
468 			WRITE(p, "  vec3 v_texcoord = In.v_texcoord;\n");
469 		}
470 	}
471 
472 	// Two things read from the old framebuffer - shader replacement blending and bit-level masking.
473 	if (readFramebuffer) {
474 		if (compat.shaderLanguage == HLSL_D3D11) {
475 			WRITE(p, "  vec4 destColor = fboTex.Load(int3((int)gl_FragCoord.x, (int)gl_FragCoord.y, 0));\n");
476 		} else if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) {
477 			// If we have EXT_shader_framebuffer_fetch / ARM_shader_framebuffer_fetch, we skip the blit.
478 			// We can just read the prev value more directly.
479 			WRITE(p, "  lowp vec4 destColor = %s;\n", compat.lastFragData);
480 		} else if (!compat.texelFetch) {
481 			WRITE(p, "  lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
482 		} else {
483 			WRITE(p, "  lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);
484 		}
485 	}
486 
487 	if (isModeClear) {
488 		// Clear mode does not allow any fancy shading.
489 		WRITE(p, "  vec4 v = v_color0;\n");
490 	} else {
491 		const char *secondary = "";
492 		// Secondary color for specular on top of texture
493 		if (lmode) {
494 			WRITE(p, "  vec4 s = vec4(v_color1, 0.0);\n");
495 			secondary = " + s";
496 		} else {
497 			secondary = "";
498 		}
499 
500 		if (doTexture) {
501 			char texcoord[64] = "v_texcoord";
502 			// TODO: Not sure the right way to do this for projection.
503 			// This path destroys resolution on older PowerVR no matter what I do if projection is needed,
504 			// so we disable it on SGX 540 and lesser, and live with the consequences.
505 			bool terriblePrecision = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_TERRIBLE) != 0;
506 			bool clampDisabled = doTextureProjection && terriblePrecision;
507 			// Also with terrible precision we can't do wrapping without destroying the image. See #9189
508 			if (terriblePrecision && (!id.Bit(FS_BIT_CLAMP_S) || !id.Bit(FS_BIT_CLAMP_T))) {
509 				clampDisabled = true;
510 			}
511 			if (needShaderTexClamp && !clampDisabled) {
512 				// We may be clamping inside a larger surface (tex = 64x64, buffer=480x272).
513 				// We may also be wrapping in such a surface, or either one in a too-small surface.
514 				// Obviously, clamping to a smaller surface won't work.  But better to clamp to something.
515 				std::string ucoord = "v_texcoord.x";
516 				std::string vcoord = "v_texcoord.y";
517 				if (doTextureProjection) {
518 					ucoord = "(v_texcoord.x / v_texcoord.z)";
519 					vcoord = "(v_texcoord.y / v_texcoord.z)";
520 				}
521 
522 				std::string modulo = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) ? "mymod" : "mod";
523 
524 				if (id.Bit(FS_BIT_CLAMP_S)) {
525 					ucoord = "clamp(" + ucoord + ", u_texclamp.z, u_texclamp.x - u_texclamp.z)";
526 				} else {
527 					ucoord = modulo + "(" + ucoord + ", u_texclamp.x)";
528 				}
529 				if (id.Bit(FS_BIT_CLAMP_T)) {
530 					vcoord = "clamp(" + vcoord + ", u_texclamp.w, u_texclamp.y - u_texclamp.w)";
531 				} else {
532 					vcoord = modulo + "(" + vcoord + ", u_texclamp.y)";
533 				}
534 				if (textureAtOffset) {
535 					ucoord = "(" + ucoord + " + u_texclampoff.x)";
536 					vcoord = "(" + vcoord + " + u_texclampoff.y)";
537 				}
538 
539 				WRITE(p, "  vec2 fixedcoord = vec2(%s, %s);\n", ucoord.c_str(), vcoord.c_str());
540 				truncate_cpy(texcoord, "fixedcoord");
541 				// We already projected it.
542 				doTextureProjection = false;
543 			}
544 
545 			if (!shaderDepal) {
546 				if (compat.shaderLanguage == HLSL_D3D11) {
547 					if (doTextureProjection) {
548 						WRITE(p, "  vec4 t = tex.Sample(samp, v_texcoord.xy / v_texcoord.z)%s;\n", bgraTexture ? ".bgra" : "");
549 					} else {
550 						WRITE(p, "  vec4 t = tex.Sample(samp, %s.xy)%s;\n", texcoord, bgraTexture ? ".bgra" : "");
551 					}
552 				} else if (compat.shaderLanguage == HLSL_D3D9) {
553 					if (doTextureProjection) {
554 						WRITE(p, "  vec4 t = tex2Dproj(tex, vec4(v_texcoord.x, v_texcoord.y, 0, v_texcoord.z))%s;\n", bgraTexture ? ".bgra" : "");
555 					} else {
556 						WRITE(p, "  vec4 t = tex2D(tex, %s.xy)%s;\n", texcoord, bgraTexture ? ".bgra" : "");
557 					}
558 				} else {
559 					if (doTextureProjection) {
560 						WRITE(p, "  vec4 t = %sProj(tex, %s);\n", compat.texture, texcoord);
561 					} else {
562 						WRITE(p, "  vec4 t = %s(tex, %s.xy);\n", compat.texture, texcoord);
563 					}
564 				}
565 			} else {
566 				if (doTextureProjection) {
567 					// We don't use textureProj because we need better control and it's probably not much of a savings anyway.
568 					// However it is good for precision on older hardware like PowerVR.
569 					WRITE(p, "  vec2 uv = %s.xy/%s.z;\n  vec2 uv_round;\n", texcoord, texcoord);
570 				} else {
571 					WRITE(p, "  vec2 uv = %s.xy;\n  vec2 uv_round;\n", texcoord);
572 				}
573 				WRITE(p, "  vec2 tsize = vec2(textureSize(tex, 0));\n");
574 				WRITE(p, "  vec2 fraction;\n");
575 				WRITE(p, "  bool bilinear = (u_depal_mask_shift_off_fmt >> 31) != 0U;\n");
576 				WRITE(p, "  if (bilinear) {\n");
577 				WRITE(p, "    uv_round = uv * tsize - vec2(0.5, 0.5);\n");
578 				WRITE(p, "    fraction = fract(uv_round);\n");
579 				WRITE(p, "    uv_round = (uv_round - fraction + vec2(0.5, 0.5)) / tsize;\n");  // We want to take our four point samples at pixel centers.
580 				WRITE(p, "  } else {\n");
581 				WRITE(p, "    uv_round = uv;\n");
582 				WRITE(p, "  }\n");
583 				WRITE(p, "  highp vec4 t = %s(tex, uv_round);\n", compat.texture);
584 				WRITE(p, "  highp vec4 t1 = %sOffset(tex, uv_round, ivec2(1, 0));\n", compat.texture);
585 				WRITE(p, "  highp vec4 t2 = %sOffset(tex, uv_round, ivec2(0, 1));\n", compat.texture);
586 				WRITE(p, "  highp vec4 t3 = %sOffset(tex, uv_round, ivec2(1, 1));\n", compat.texture);
587 				WRITE(p, "  uint depalMask = (u_depal_mask_shift_off_fmt & 0xFFU);\n");
588 				WRITE(p, "  uint depalShift = (u_depal_mask_shift_off_fmt >> 8) & 0xFFU;\n");
589 				WRITE(p, "  uint depalOffset = ((u_depal_mask_shift_off_fmt >> 16) & 0xFFU) << 4;\n");
590 				WRITE(p, "  uint depalFmt = (u_depal_mask_shift_off_fmt >> 24) & 0x3U;\n");
591 				WRITE(p, "  uvec4 col; uint index0; uint index1; uint index2; uint index3;\n");
592 				WRITE(p, "  switch (int(depalFmt)) {\n");  // We might want to include fmt in the shader ID if this is a performance issue.
593 				WRITE(p, "  case 0:\n");  // 565
594 				WRITE(p, "    col = uvec4(t.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
595 				WRITE(p, "    index0 = (col.b << 11) | (col.g << 5) | (col.r);\n");
596 				WRITE(p, "    if (bilinear) {\n");
597 				WRITE(p, "      col = uvec4(t1.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
598 				WRITE(p, "      index1 = (col.b << 11) | (col.g << 5) | (col.r);\n");
599 				WRITE(p, "      col = uvec4(t2.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
600 				WRITE(p, "      index2 = (col.b << 11) | (col.g << 5) | (col.r);\n");
601 				WRITE(p, "      col = uvec4(t3.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
602 				WRITE(p, "      index3 = (col.b << 11) | (col.g << 5) | (col.r);\n");
603 				WRITE(p, "    }\n");
604 				WRITE(p, "    break;\n");
605 				WRITE(p, "  case 1:\n");  // 5551
606 				WRITE(p, "    col = uvec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
607 				WRITE(p, "    index0 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
608 				WRITE(p, "    if (bilinear) {\n");
609 				WRITE(p, "      col = uvec4(t1.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
610 				WRITE(p, "      index1 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
611 				WRITE(p, "      col = uvec4(t2.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
612 				WRITE(p, "      index2 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
613 				WRITE(p, "      col = uvec4(t3.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
614 				WRITE(p, "      index3 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
615 				WRITE(p, "    }\n");
616 				WRITE(p, "    break;\n");
617 				WRITE(p, "  case 2:\n");  // 4444
618 				WRITE(p, "    col = uvec4(t.rgba * 15.99);\n");
619 				WRITE(p, "    index0 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
620 				WRITE(p, "    if (bilinear) {\n");
621 				WRITE(p, "      col = uvec4(t1.rgba * 15.99);\n");
622 				WRITE(p, "      index1 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
623 				WRITE(p, "      col = uvec4(t2.rgba * 15.99);\n");
624 				WRITE(p, "      index2 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
625 				WRITE(p, "      col = uvec4(t3.rgba * 15.99);\n");
626 				WRITE(p, "      index3 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
627 				WRITE(p, "    }\n");
628 				WRITE(p, "    break;\n");
629 				WRITE(p, "  case 3:\n");  // 8888
630 				WRITE(p, "    col = uvec4(t.rgba * 255.99);\n");
631 				WRITE(p, "    index0 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
632 				WRITE(p, "    if (bilinear) {\n");
633 				WRITE(p, "      col = uvec4(t1.rgba * 255.99);\n");
634 				WRITE(p, "      index1 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
635 				WRITE(p, "      col = uvec4(t2.rgba * 255.99);\n");
636 				WRITE(p, "      index2 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
637 				WRITE(p, "      col = uvec4(t3.rgba * 255.99);\n");
638 				WRITE(p, "      index3 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
639 				WRITE(p, "    }\n");
640 				WRITE(p, "    break;\n");
641 				WRITE(p, "  };\n");
642 				WRITE(p, "  index0 = ((index0 >> depalShift) & depalMask) | depalOffset;\n");
643 				WRITE(p, "  t = texelFetch(pal, ivec2(index0, 0), 0);\n");
644 				WRITE(p, "  if (bilinear && !(index0 == index1 && index1 == index2 && index2 == index3)) {\n");
645 				WRITE(p, "    index1 = ((index1 >> depalShift) & depalMask) | depalOffset;\n");
646 				WRITE(p, "    index2 = ((index2 >> depalShift) & depalMask) | depalOffset;\n");
647 				WRITE(p, "    index3 = ((index3 >> depalShift) & depalMask) | depalOffset;\n");
648 				WRITE(p, "    t1 = texelFetch(pal, ivec2(index1, 0), 0);\n");
649 				WRITE(p, "    t2 = texelFetch(pal, ivec2(index2, 0), 0);\n");
650 				WRITE(p, "    t3 = texelFetch(pal, ivec2(index3, 0), 0);\n");
651 				WRITE(p, "    t = mix(t, t1, fraction.x);\n");
652 				WRITE(p, "    t2 = mix(t2, t3, fraction.x);\n");
653 				WRITE(p, "    t = mix(t, t2, fraction.y);\n");
654 				WRITE(p, "  }\n");
655 			}
656 
657 			if (texFunc != GE_TEXFUNC_REPLACE || !doTextureAlpha)
658 				WRITE(p, "  vec4 p = v_color0;\n");
659 
660 			if (doTextureAlpha) { // texfmt == RGBA
661 				switch (texFunc) {
662 				case GE_TEXFUNC_MODULATE:
663 					WRITE(p, "  vec4 v = p * t%s;\n", secondary);
664 					break;
665 
666 				case GE_TEXFUNC_DECAL:
667 					WRITE(p, "  vec4 v = vec4(mix(p.rgb, t.rgb, t.a), p.a)%s;\n", secondary);
668 					break;
669 
670 				case GE_TEXFUNC_BLEND:
671 					WRITE(p, "  vec4 v = vec4(mix(p.rgb, u_texenv.rgb, t.rgb), p.a * t.a)%s;\n", secondary);
672 					break;
673 
674 				case GE_TEXFUNC_REPLACE:
675 					WRITE(p, "  vec4 v = t%s;\n", secondary);
676 					break;
677 
678 				case GE_TEXFUNC_ADD:
679 				case GE_TEXFUNC_UNKNOWN1:
680 				case GE_TEXFUNC_UNKNOWN2:
681 				case GE_TEXFUNC_UNKNOWN3:
682 					WRITE(p, "  vec4 v = vec4(p.rgb + t.rgb, p.a * t.a)%s;\n", secondary);
683 					break;
684 				default:
685 					WRITE(p, "  vec4 v = p;\n"); break;
686 				}
687 			} else { // texfmt == RGB
688 				switch (texFunc) {
689 				case GE_TEXFUNC_MODULATE:
690 					WRITE(p, "  vec4 v = vec4(t.rgb * p.rgb, p.a)%s;\n", secondary);
691 					break;
692 
693 				case GE_TEXFUNC_DECAL:
694 					WRITE(p, "  vec4 v = vec4(t.rgb, p.a)%s;\n", secondary);
695 					break;
696 
697 				case GE_TEXFUNC_BLEND:
698 					WRITE(p, "  vec4 v = vec4(mix(p.rgb, u_texenv.rgb, t.rgb), p.a)%s;\n", secondary);
699 					break;
700 
701 				case GE_TEXFUNC_REPLACE:
702 					WRITE(p, "  vec4 v = vec4(t.rgb, p.a)%s;\n", secondary);
703 					break;
704 
705 				case GE_TEXFUNC_ADD:
706 				case GE_TEXFUNC_UNKNOWN1:
707 				case GE_TEXFUNC_UNKNOWN2:
708 				case GE_TEXFUNC_UNKNOWN3:
709 					WRITE(p, "  vec4 v = vec4(p.rgb + t.rgb, p.a)%s;\n", secondary); break;
710 				default:
711 					WRITE(p, "  vec4 v = p;\n"); break;
712 				}
713 			}
714 
715 			if (enableColorDoubling) {
716 				// This happens before fog is applied.
717 				WRITE(p, "  v.rgb = clamp(v.rgb * 2.0, 0.0, 1.0);\n");
718 			}
719 		} else {
720 			// No texture mapping
721 			WRITE(p, "  vec4 v = v_color0 %s;\n", secondary);
722 		}
723 
724 		if (enableFog) {
725 			WRITE(p, "  float fogCoef = clamp(v_fogdepth, 0.0, 1.0);\n");
726 			WRITE(p, "  v = mix(vec4(u_fogcolor, v.a), v, fogCoef);\n");
727 		}
728 
729 		// Texture access is at half texels [0.5/256, 255.5/256], but colors are normalized [0, 255].
730 		// So we have to scale to account for the difference.
731 		char alphaTestXCoord[64] = "0";
732 		if (enableFragmentTestCache) {
733 			if (enableColorTest && !colorTestAgainstZero) {
734 				WRITE(p, "  vec4 vScale256 = v * %f + %f;\n", 255.0 / 256.0, 0.5 / 256.0);
735 				truncate_cpy(alphaTestXCoord, "vScale256.a");
736 			} else if (enableAlphaTest && !alphaTestAgainstZero) {
737 				snprintf(alphaTestXCoord, sizeof(alphaTestXCoord), "v.a * %f + %f", 255.0 / 256.0, 0.5 / 256.0);
738 			}
739 		}
740 
741 		const char *discardStatement = testForceToZero ? "v.a = 0.0;" : "DISCARD;";
742 		if (enableAlphaTest) {
743 			if (alphaTestAgainstZero) {
744 				// When testing against 0 (extremely common), we can avoid some math.
745 				// 0.002 is approximately half of 1.0 / 255.0.
746 				if (alphaTestFunc == GE_COMP_NOTEQUAL || alphaTestFunc == GE_COMP_GREATER) {
747 					WRITE(p, "  if (v.a < 0.002) %s\n", discardStatement);
748 				} else if (alphaTestFunc != GE_COMP_NEVER) {
749 					// Anything else is a test for == 0.  Happens sometimes, actually...
750 					WRITE(p, "  if (v.a > 0.002) %s\n", discardStatement);
751 				} else {
752 					// NEVER has been logged as used by games, although it makes little sense - statically failing.
753 					// Maybe we could discard the drawcall, but it's pretty rare.  Let's just statically discard here.
754 					WRITE(p, "  %s\n", discardStatement);
755 				}
756 			} else if (enableFragmentTestCache) {
757 				WRITE(p, "  float aResult = %s(testtex, vec2(%s, 0)).a;\n", compat.texture, alphaTestXCoord);
758 				WRITE(p, "  if (aResult < 0.5) %s\n", discardStatement);
759 			} else {
760 				const char *alphaTestFuncs[] = { "#", "#", " != ", " == ", " >= ", " > ", " <= ", " < " };
761 				if (alphaTestFuncs[alphaTestFunc][0] != '#') {
762 					if (compat.bitwiseOps) {
763 						WRITE(p, "  if ((roundAndScaleTo255i(v.a) & u_alphacolormask.a) %s int(u_alphacolorref.a)) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
764 					} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
765 						// Work around bad PVR driver problem where equality check + discard just doesn't work.
766 						if (alphaTestFunc != GE_COMP_NOTEQUAL) {
767 							WRITE(p, "  if (roundTo255thf(v.a) %s u_alphacolorref.a) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
768 						}
769 					} else {
770 						WRITE(p, "  if (roundAndScaleTo255f(v.a) %s u_alphacolorref.a) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
771 					}
772 				} else {
773 					// This means NEVER.  See above.
774 					WRITE(p, "  %s\n", discardStatement);
775 				}
776 			}
777 		}
778 
779 		if (enableColorTest) {
780 			if (colorTestAgainstZero) {
781 				// When testing against 0 (common), we can avoid some math.
782 				// 0.002 is approximately half of 1.0 / 255.0.
783 				if (colorTestFunc == GE_COMP_NOTEQUAL) {
784 					if (compat.shaderLanguage == GLSL_VULKAN) {
785 						// Old workaround for Adreno driver bug. We could make this the main path actually
786 						// since the math is roughly equivalent given the non-negative inputs.
787 						WRITE(p, "  if (v.r + v.g + v.b < 0.002) %s\n", discardStatement);
788 					} else {
789 						WRITE(p, "  if (v.r < 0.002 && v.g < 0.002 && v.b < 0.002) %s\n", discardStatement);
790 					}
791 				} else if (colorTestFunc != GE_COMP_NEVER) {
792 					if (compat.shaderLanguage == GLSL_VULKAN) {
793 						// See the GE_COMP_NOTEQUAL case.
794 						WRITE(p, "  if (v.r + v.g + v.b > 0.002) %s\n", discardStatement);
795 					} else {
796 						// Anything else is a test for == 0.
797 						WRITE(p, "  if (v.r > 0.002 || v.g > 0.002 || v.b > 0.002) %s\n", discardStatement);
798 					}
799 				} else {
800 					// NEVER has been logged as used by games, although it makes little sense - statically failing.
801 					// Maybe we could discard the drawcall, but it's pretty rare.  Let's just statically discard here.
802 					WRITE(p, "  %s\n", discardStatement);
803 				}
804 			} else if (enableFragmentTestCache) {
805 				WRITE(p, "  float rResult = %s(testtex, vec2(vScale256.r, 0)).r;\n", compat.texture);
806 				WRITE(p, "  float gResult = %s(testtex, vec2(vScale256.g, 0)).g;\n", compat.texture);
807 				WRITE(p, "  float bResult = %s(testtex, vec2(vScale256.b, 0)).b;\n", compat.texture);
808 				if (colorTestFunc == GE_COMP_EQUAL) {
809 					// Equal means all parts must be equal (so discard if any is not.)
810 					WRITE(p, "  if (rResult < 0.5 || gResult < 0.5 || bResult < 0.5) %s\n", discardStatement);
811 				} else {
812 					// Not equal means any part must be not equal.
813 					WRITE(p, "  if (rResult < 0.5 && gResult < 0.5 && bResult < 0.5) %s\n", discardStatement);
814 				}
815 			} else {
816 				const char *colorTestFuncs[] = { "#", "#", " != ", " == " };
817 				if (colorTestFuncs[colorTestFunc][0] != '#') {
818 					// TODO: Unify these paths better.
819 					if (compat.shaderLanguage == HLSL_D3D11) {
820 						const char *test = colorTestFuncs[colorTestFunc];
821 						WRITE(p, "  uvec3 v_scaled = roundAndScaleTo255iv(v.rgb);\n");
822 						WRITE(p, "  uvec3 v_masked = v_scaled & u_alphacolormask.rgb;\n");
823 						WRITE(p, "  uvec3 colorTestRef = u_alphacolorref.rgb & u_alphacolormask.rgb;\n");
824 						// We have to test the components separately, or we get incorrect results.  See #10629.
825 						WRITE(p, "  if (v_masked.r %s colorTestRef.r && v_masked.g %s colorTestRef.g && v_masked.b %s colorTestRef.b) %s\n", test, test, test, discardStatement);
826 					} else if (compat.shaderLanguage == HLSL_D3D9) {
827 						const char *test = colorTestFuncs[colorTestFunc];
828 						// TODO: Use a texture to lookup bitwise ops instead?
829 						WRITE(p, "  vec3 colortest = roundAndScaleTo255v(v.rgb);\n");
830 						WRITE(p, "  if ((colortest.r %s u_alphacolorref.r) && (colortest.g %s u_alphacolorref.g) && (colortest.b %s u_alphacolorref.b)) %s\n", test, test, test, discardStatement);
831 					} else if (compat.bitwiseOps) {
832 						WRITE(p, "  ivec3 v_scaled = roundAndScaleTo255iv(v.rgb);\n");
833 						if (compat.shaderLanguage == GLSL_VULKAN) {
834 							// Apparently GLES3 does not support vector bitwise ops, but Vulkan does?
835 							WRITE(p, "  if ((v_scaled & u_alphacolormask.rgb) %s (u_alphacolorref.rgb & u_alphacolormask.rgb)) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
836 						} else {
837 							const char *maskedFragColor = "ivec3(v_scaled.r & u_alphacolormask.r, v_scaled.g & u_alphacolormask.g, v_scaled.b & u_alphacolormask.b)";
838 							const char *maskedColorRef = "ivec3(int(u_alphacolorref.r) & u_alphacolormask.r, int(u_alphacolorref.g) & u_alphacolormask.g, int(u_alphacolorref.b) & u_alphacolormask.b)";
839 							WRITE(p, "  if (%s %s %s) %s\n", maskedFragColor, colorTestFuncs[colorTestFunc], maskedColorRef, discardStatement);
840 						}
841 					} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
842 						WRITE(p, "  if (roundTo255thv(v.rgb) %s u_alphacolorref.rgb) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
843 					} else {
844 						WRITE(p, "  if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
845 					}
846 				} else {
847 					WRITE(p, "  %s\n", discardStatement);
848 				}
849 			}
850 		}
851 
852 		if (replaceBlend == REPLACE_BLEND_2X_SRC) {
853 			WRITE(p, "  v.rgb = v.rgb * 2.0;\n");
854 		}
855 
856 		if (replaceBlend == REPLACE_BLEND_PRE_SRC || replaceBlend == REPLACE_BLEND_PRE_SRC_2X_ALPHA) {
857 			const char *srcFactor = "ERROR";
858 			switch (replaceBlendFuncA) {
859 			case GE_SRCBLEND_DSTCOLOR:          srcFactor = "ERROR"; break;
860 			case GE_SRCBLEND_INVDSTCOLOR:       srcFactor = "ERROR"; break;
861 			case GE_SRCBLEND_SRCALPHA:          srcFactor = "splat3(v.a)"; break;
862 			case GE_SRCBLEND_INVSRCALPHA:       srcFactor = "splat3(1.0 - v.a)"; break;
863 			case GE_SRCBLEND_DSTALPHA:          srcFactor = "ERROR"; break;
864 			case GE_SRCBLEND_INVDSTALPHA:       srcFactor = "ERROR"; break;
865 			case GE_SRCBLEND_DOUBLESRCALPHA:    srcFactor = "splat3(v.a * 2.0)"; break;
866 			case GE_SRCBLEND_DOUBLEINVSRCALPHA: srcFactor = "splat3(1.0 - v.a * 2.0)"; break;
867 			// PRE_SRC for REPLACE_BLEND_PRE_SRC_2X_ALPHA means "double the src."
868 			// It's close to the same, but clamping can still be an issue.
869 			case GE_SRCBLEND_DOUBLEDSTALPHA:    srcFactor = "splat3(2.0)"; break;
870 			case GE_SRCBLEND_DOUBLEINVDSTALPHA: srcFactor = "ERROR"; break;
871 			case GE_SRCBLEND_FIXA:              srcFactor = "u_blendFixA"; break;
872 			default:                            srcFactor = "u_blendFixA"; break;
873 			}
874 
875 			if (!strcmp(srcFactor, "ERROR")) {
876 				*errorString = "Bad replaceblend src factor";
877 				return false;
878 			}
879 
880 			WRITE(p, "  v.rgb = v.rgb * %s;\n", srcFactor);
881 		}
882 
883 		if (replaceBlend == REPLACE_BLEND_COPY_FBO) {
884 			const char *srcFactor = nullptr;
885 			const char *dstFactor = nullptr;
886 
887 			switch (replaceBlendFuncA) {
888 			case GE_SRCBLEND_DSTCOLOR:          srcFactor = "destColor.rgb"; break;
889 			case GE_SRCBLEND_INVDSTCOLOR:       srcFactor = "(splat3(1.0) - destColor.rgb)"; break;
890 			case GE_SRCBLEND_SRCALPHA:          srcFactor = "v.aaa"; break;
891 			case GE_SRCBLEND_INVSRCALPHA:       srcFactor = "splat3(1.0 - v.a)"; break;
892 			case GE_SRCBLEND_DSTALPHA:          srcFactor = "destColor.aaa"; break;
893 			case GE_SRCBLEND_INVDSTALPHA:       srcFactor = "(splat3(1.0) - destColor.aaa)"; break;
894 			case GE_SRCBLEND_DOUBLESRCALPHA:    srcFactor = "v.aaa * 2.0"; break;
895 			case GE_SRCBLEND_DOUBLEINVSRCALPHA: srcFactor = "(splat3(1.0) - v.aaa * 2.0)"; break;
896 			case GE_SRCBLEND_DOUBLEDSTALPHA:    srcFactor = "destColor.aaa * 2.0"; break;
897 			case GE_SRCBLEND_DOUBLEINVDSTALPHA: srcFactor = "(splat3(1.0) - destColor.aaa * 2.0)"; break;
898 			case GE_SRCBLEND_FIXA:              srcFactor = "u_blendFixA"; break;
899 			default:                            srcFactor = "u_blendFixA"; break;
900 			}
901 			switch (replaceBlendFuncB) {
902 			case GE_DSTBLEND_SRCCOLOR:          dstFactor = "v.rgb"; break;
903 			case GE_DSTBLEND_INVSRCCOLOR:       dstFactor = "(splat3(1.0) - v.rgb)"; break;
904 			case GE_DSTBLEND_SRCALPHA:          dstFactor = "v.aaa"; break;
905 			case GE_DSTBLEND_INVSRCALPHA:       dstFactor = "(splat3(1.0) - v.aaa)"; break;
906 			case GE_DSTBLEND_DSTALPHA:          dstFactor = "destColor.aaa"; break;
907 			case GE_DSTBLEND_INVDSTALPHA:       dstFactor = "(splat3(1.0) - destColor.aaa)"; break;
908 			case GE_DSTBLEND_DOUBLESRCALPHA:    dstFactor = "v.aaa * 2.0"; break;
909 			case GE_DSTBLEND_DOUBLEINVSRCALPHA: dstFactor = "(splat3(1.0) - v.aaa * 2.0)"; break;
910 			case GE_DSTBLEND_DOUBLEDSTALPHA:    dstFactor = "destColor.aaa * 2.0"; break;
911 			case GE_DSTBLEND_DOUBLEINVDSTALPHA: dstFactor = "(splat3(1.0) - destColor.aaa * 2.0)"; break;
912 			case GE_DSTBLEND_FIXB:              dstFactor = "u_blendFixB"; break;
913 			default:                            dstFactor = "u_blendFixB"; break;
914 			}
915 
916 			switch (replaceBlendEq) {
917 			case GE_BLENDMODE_MUL_AND_ADD:
918 				WRITE(p, "  v.rgb = v.rgb * %s + destColor.rgb * %s;\n", srcFactor, dstFactor);
919 				break;
920 			case GE_BLENDMODE_MUL_AND_SUBTRACT:
921 				WRITE(p, "  v.rgb = v.rgb * %s - destColor.rgb * %s;\n", srcFactor, dstFactor);
922 				break;
923 			case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
924 				WRITE(p, "  v.rgb = destColor.rgb * %s - v.rgb * %s;\n", dstFactor, srcFactor);
925 				break;
926 			case GE_BLENDMODE_MIN:
927 				WRITE(p, "  v.rgb = min(v.rgb, destColor.rgb);\n");
928 				break;
929 			case GE_BLENDMODE_MAX:
930 				WRITE(p, "  v.rgb = max(v.rgb, destColor.rgb);\n");
931 				break;
932 			case GE_BLENDMODE_ABSDIFF:
933 				WRITE(p, "  v.rgb = abs(v.rgb - destColor.rgb);\n");
934 				break;
935 			default:
936 				*errorString = "Bad replace blend eq";
937 				return false;
938 			}
939 		}
940 
941 		if (replaceBlend == REPLACE_BLEND_2X_ALPHA || replaceBlend == REPLACE_BLEND_PRE_SRC_2X_ALPHA) {
942 			WRITE(p, "  v.a *= 2.0;\n");
943 		}
944 	}
945 
946 	char replacedAlpha[64] = "0.0";
947 	if (stencilToAlpha != REPLACE_ALPHA_NO) {
948 		switch (replaceAlphaWithStencilType) {
949 		case STENCIL_VALUE_UNIFORM:
950 			truncate_cpy(replacedAlpha, "u_stencilReplaceValue");
951 			break;
952 
953 		case STENCIL_VALUE_ZERO:
954 			truncate_cpy(replacedAlpha, "0.0");
955 			break;
956 
957 		case STENCIL_VALUE_ONE:
958 		case STENCIL_VALUE_INVERT:
959 			// In invert, we subtract by one, but we want to output one here.
960 			truncate_cpy(replacedAlpha, "1.0");
961 			break;
962 
963 		case STENCIL_VALUE_INCR_4:
964 		case STENCIL_VALUE_DECR_4:
965 			// We're adding/subtracting, just by the smallest value in 4-bit.
966 			snprintf(replacedAlpha, sizeof(replacedAlpha), "%f", 1.0 / 15.0);
967 			break;
968 
969 		case STENCIL_VALUE_INCR_8:
970 		case STENCIL_VALUE_DECR_8:
971 			// We're adding/subtracting, just by the smallest value in 8-bit.
972 			snprintf(replacedAlpha, sizeof(replacedAlpha), "%f", 1.0 / 255.0);
973 			break;
974 
975 		case STENCIL_VALUE_KEEP:
976 			// Do nothing. We'll mask out the alpha using color mask.
977 			break;
978 		}
979 	}
980 
981 	switch (stencilToAlpha) {
982 	case REPLACE_ALPHA_DUALSOURCE:
983 		WRITE(p, "  %s = vec4(v.rgb, %s);\n", compat.fragColor0, replacedAlpha);
984 		WRITE(p, "  %s = vec4(0.0, 0.0, 0.0, v.a);\n", compat.fragColor1);
985 		break;
986 
987 	case REPLACE_ALPHA_YES:
988 		WRITE(p, "  %s = vec4(v.rgb, %s);\n", compat.fragColor0, replacedAlpha);
989 		break;
990 
991 	case REPLACE_ALPHA_NO:
992 		WRITE(p, "  %s = v;\n", compat.fragColor0);
993 		break;
994 
995 	default:
996 		*errorString = "Bad stencil-to-alpha type, corrupt ID?";
997 		return false;
998 	}
999 
1000 	// TODO: This could support more ops using the shader blending mechanism.
1001 	LogicOpReplaceType replaceLogicOpType = (LogicOpReplaceType)id.Bits(FS_BIT_REPLACE_LOGIC_OP_TYPE, 2);
1002 	switch (replaceLogicOpType) {
1003 	case LOGICOPTYPE_ONE:
1004 		WRITE(p, "  %s.rgb = splat3(1.0);\n", compat.fragColor0);
1005 		break;
1006 	case LOGICOPTYPE_INVERT:
1007 		WRITE(p, "  %s.rgb = splat3(1.0) - %s.rgb;\n", compat.fragColor0, compat.fragColor0);
1008 		break;
1009 	case LOGICOPTYPE_NORMAL:
1010 		break;
1011 
1012 	default:
1013 		*errorString = "Bad logic op type, corrupt ID?";
1014 		return false;
1015 	}
1016 
1017 	// Final color computed - apply color write mask.
1018 	// TODO: Maybe optimize to only do math on the affected channels?
1019 	// Or .. meh.
1020 	if (colorWriteMask) {
1021 		WRITE(p, "  highp uint v32 = packUnorm4x8(%s);\n", compat.fragColor0);
1022 		WRITE(p, "  highp uint d32 = packUnorm4x8(destColor);\n");
1023 		// Note that the mask has been flipped to the PC way - 1 means write.
1024 		WRITE(p, "  v32 = (v32 & u_colorWriteMask) | (d32 & ~u_colorWriteMask);\n");
1025 		WRITE(p, "  %s = unpackUnorm4x8(v32);\n", compat.fragColor0);
1026 	}
1027 
1028 	if (gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {
1029 		const double scale = DepthSliceFactor() * 65535.0;
1030 
1031 		WRITE(p, "  highp float z = gl_FragCoord.z;\n");
1032 		if (gstate_c.Supports(GPU_SUPPORTS_ACCURATE_DEPTH)) {
1033 			// We center the depth with an offset, but only its fraction matters.
1034 			// When (DepthSliceFactor() - 1) is odd, it will be 0.5, otherwise 0.
1035 			if (((int)(DepthSliceFactor() - 1.0f) & 1) == 1) {
1036 				WRITE(p, "  z = (floor((z * %f) - (1.0 / 2.0)) + (1.0 / 2.0)) * (1.0 / %f);\n", scale, scale);
1037 			} else {
1038 				WRITE(p, "  z = floor(z * %f) * (1.0 / %f);\n", scale, scale);
1039 			}
1040 		} else {
1041 			WRITE(p, "  z = (1.0/65535.0) * floor(z * 65535.0);\n");
1042 		}
1043 		WRITE(p, "  gl_FragDepth = z;\n");
1044 	} else if (!earlyFragmentTests && useAdrenoBugWorkaround) {
1045 		// Adreno (and possibly MESA/others) apply early frag tests even with discard in the shader.
1046 		// Writing depth prevents the bug, even with depth_unchanged specified.
1047 		WRITE(p, "  gl_FragDepth = gl_FragCoord.z;\n");
1048 	}
1049 
1050 	if (compat.shaderLanguage == HLSL_D3D11) {
1051 		if (writeDepth) {
1052 			WRITE(p, "  outfragment.depth = gl_FragDepth;\n");
1053 		}
1054 		WRITE(p, "  return outfragment;\n");
1055 	} else if (compat.shaderLanguage == HLSL_D3D9) {
1056 		WRITE(p, "  return target;\n");
1057 	}
1058 
1059 	WRITE(p, "}\n");
1060 
1061 	return true;
1062 }
1063 
1064