1 // Copyright (c) 2014- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include <cstdio>
19 
20 #include "Common/GPU/OpenGL/GLFeatures.h"
21 
22 #include "GPU/Common/ShaderId.h"
23 #include "GPU/Common/ShaderCommon.h"
24 #include "Common/StringUtils.h"
25 #include "Common/Log.h"
26 #include "Core/Reporting.h"
27 #include "GPU/GPUState.h"
28 #include "GPU/Common/GPUStateUtils.h"
29 #include "GPU/Common/DepalettizeShaderCommon.h"
30 
// Appends printf-style text at a write cursor: `WRITE(p, fmt, ...)` expands to
// `p += sprintf(p, fmt, ...)`, so `p` ends up on the terminating NUL of what was written.
// No bounds checking is performed on the destination buffer.
#define WRITE p+=sprintf
32 
33 // TODO: Add a compute shader path. Complete waste of time to set up a graphics state.
34 
35 // Uses integer instructions available since OpenGL 3.0. Suitable for ES 3.0 as well.
GenerateDepalShader300(char * buffer,GEBufferFormat pixelFormat,ShaderLanguage language)36 void GenerateDepalShader300(char *buffer, GEBufferFormat pixelFormat, ShaderLanguage language) {
37 	char *p = buffer;
38 	if (language == HLSL_D3D11) {
39 		WRITE(p, "SamplerState texSamp : register(s0);\n");
40 		WRITE(p, "Texture2D<float4> tex : register(t0);\n");
41 		WRITE(p, "Texture2D<float4> pal : register(t3);\n");
42 		// Support for depth.
43 		if (pixelFormat == GE_FORMAT_DEPTH16) {
44 			WRITE(p, "cbuffer params : register(b0) {\n");
45 			WRITE(p, "  float z_scale; float z_offset;\n");
46 			WRITE(p, "};\n");
47 		}
48 	} else if (language == GLSL_VULKAN) {
49 		WRITE(p, "#version 450\n");
50 		WRITE(p, "#extension GL_ARB_separate_shader_objects : enable\n");
51 		WRITE(p, "#extension GL_ARB_shading_language_420pack : enable\n");
52 		WRITE(p, "layout(set = 0, binding = 0) uniform sampler2D tex;\n");
53 		WRITE(p, "layout(set = 0, binding = 1) uniform sampler2D pal;\n");
54 		WRITE(p, "layout(location = 0) in vec2 v_texcoord0;\n");
55 		WRITE(p, "layout(location = 0) out vec4 fragColor0;\n");
56 
57 		// Support for depth.
58 		if (pixelFormat == GE_FORMAT_DEPTH16) {
59 			WRITE(p, "layout (push_constant) uniform params {\n");
60 			WRITE(p, "  float z_scale; float z_offset;\n");
61 			WRITE(p, "};\n");
62 		}
63 	} else {
64 		if (gl_extensions.IsGLES) {
65 			WRITE(p, "#version 300 es\n");
66 			WRITE(p, "precision mediump float;\n");
67 			WRITE(p, "precision highp int;\n");
68 		} else {
69 			WRITE(p, "#version %d\n", gl_extensions.GLSLVersion());
70 		}
71 		WRITE(p, "in vec2 v_texcoord0;\n");
72 		WRITE(p, "out vec4 fragColor0;\n");
73 		WRITE(p, "uniform sampler2D tex;\n");
74 		WRITE(p, "uniform sampler2D pal;\n");
75 
76 		if (pixelFormat == GE_FORMAT_DEPTH16) {
77 			DepthScaleFactors factors = GetDepthScaleFactors();
78 			WRITE(p, "const float z_scale = %f;\n", factors.scale);
79 			WRITE(p, "const float z_offset = %f;\n", factors.offset);
80 		}
81 	}
82 
83 	if (language == HLSL_D3D11) {
84 		WRITE(p, "float4 main(in float2 v_texcoord0 : TEXCOORD0) : SV_Target {\n");
85 		WRITE(p, "  float4 color = tex.Sample(texSamp, v_texcoord0);\n");
86 	} else {
87 		WRITE(p, "void main() {\n");
88 		WRITE(p, "  vec4 color = texture(tex, v_texcoord0);\n");
89 	}
90 
91 	int mask = gstate.getClutIndexMask();
92 	int shift = gstate.getClutIndexShift();
93 	int offset = gstate.getClutIndexStartPos();
94 	GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
95 
96 	// Sampling turns our texture into floating point. To avoid this, might be able
97 	// to declare them as isampler2D objects, but these require integer textures, which needs more work.
98 	// Anyhow, we simply work around this by converting back to integer, which is fine.
99 	// Use the mask to skip reading some components.
100 
101 	// TODO: Since we actually have higher precision color data here, we might want to apply a dithering pattern here
102 	// in the 5551, 565 and 4444 modes. This would benefit Test Drive which renders at 16-bit on the real hardware
103 	// and dithers immediately, while we render at higher color depth and thus don't dither resulting in banding
104 	// when we sample it at low color depth like this.
105 
106 	// An alternative would be to have a special mode where we keep some extra precision here and sample the CLUT linearly - works for ramps such
107 	// as those that Test Drive uses for its color remapping. But would need game specific flagging.
108 
109 	int shiftedMask = mask << shift;
110 	switch (pixelFormat) {
111 	case GE_FORMAT_8888:
112 		if (shiftedMask & 0xFF) WRITE(p, "  int r = int(color.r * 255.99);\n"); else WRITE(p, "  int r = 0;\n");
113 		if (shiftedMask & 0xFF00) WRITE(p, "  int g = int(color.g * 255.99);\n"); else WRITE(p, "  int g = 0;\n");
114 		if (shiftedMask & 0xFF0000) WRITE(p, "  int b = int(color.b * 255.99);\n"); else WRITE(p, "  int b = 0;\n");
115 		if (shiftedMask & 0xFF000000) WRITE(p, "  int a = int(color.a * 255.99);\n"); else WRITE(p, "  int a = 0;\n");
116 		WRITE(p, "  int index = (a << 24) | (b << 16) | (g << 8) | (r);\n");
117 		break;
118 	case GE_FORMAT_4444:
119 		if (shiftedMask & 0xF) WRITE(p, "  int r = int(color.r * 15.99);\n"); else WRITE(p, "  int r = 0;\n");
120 		if (shiftedMask & 0xF0) WRITE(p, "  int g = int(color.g * 15.99);\n"); else WRITE(p, "  int g = 0;\n");
121 		if (shiftedMask & 0xF00) WRITE(p, "  int b = int(color.b * 15.99);\n"); else WRITE(p, "  int b = 0;\n");
122 		if (shiftedMask & 0xF000) WRITE(p, "  int a = int(color.a * 15.99);\n"); else WRITE(p, "  int a = 0;\n");
123 		WRITE(p, "  int index = (a << 12) | (b << 8) | (g << 4) | (r);\n");
124 		break;
125 	case GE_FORMAT_565:
126 		if (shiftedMask & 0x1F) WRITE(p, "  int r = int(color.r * 31.99);\n"); else WRITE(p, "  int r = 0;\n");
127 		if (shiftedMask & 0x7E0) WRITE(p, "  int g = int(color.g * 63.99);\n"); else WRITE(p, "  int g = 0;\n");
128 		if (shiftedMask & 0xF800) WRITE(p, "  int b = int(color.b * 31.99);\n"); else WRITE(p, "  int b = 0;\n");
129 		WRITE(p, "  int index = (b << 11) | (g << 5) | (r);\n");
130 		break;
131 	case GE_FORMAT_5551:
132 		if (shiftedMask & 0x1F) WRITE(p, "  int r = int(color.r * 31.99);\n"); else WRITE(p, "  int r = 0;\n");
133 		if (shiftedMask & 0x3E0) WRITE(p, "  int g = int(color.g * 31.99);\n"); else WRITE(p, "  int g = 0;\n");
134 		if (shiftedMask & 0x7C00) WRITE(p, "  int b = int(color.b * 31.99);\n"); else WRITE(p, "  int b = 0;\n");
135 		if (shiftedMask & 0x8000) WRITE(p, "  int a = int(color.a);\n"); else WRITE(p, "  int a = 0;\n");
136 		WRITE(p, "  int index = (a << 15) | (b << 10) | (g << 5) | (r);\n");
137 		break;
138 	case GE_FORMAT_DEPTH16:
139 		// Remap depth buffer.
140 		WRITE(p, "  float depth = (color.x - z_offset) * z_scale;\n");
141 		WRITE(p, "  int index = int(clamp(depth, 0.0, 65535.0));\n");
142 		break;
143 	default:
144 		break;
145 	}
146 
147 	float texturePixels = 256;
148 	if (clutFormat != GE_CMODE_32BIT_ABGR8888)
149 		texturePixels = 512;
150 
151 	if (shift) {
152 		WRITE(p, "  index = (int(uint(index) >> uint(%i)) & 0x%02x)", shift, mask);
153 	} else {
154 		WRITE(p, "  index = (index & 0x%02x)", mask);
155 	}
156 	if (offset) {
157 		WRITE(p, " | %i;\n", offset);  // '|' matches what we have in gstate.h
158 	} else {
159 		WRITE(p, ";\n");
160 	}
161 
162 	if (language == HLSL_D3D11) {
163 		WRITE(p, "  return pal.Load(int3(index, 0, 0)).bgra;\n");
164 	} else {
165 		WRITE(p, "  fragColor0 = texture(pal, vec2((float(index) + 0.5) * (1.0 / %f), 0.0));\n", texturePixels);
166 	}
167 	WRITE(p, "}\n");
168 }
169 
170 // FP only, to suit GL(ES) 2.0
GenerateDepalShaderFloat(char * buffer,GEBufferFormat pixelFormat,ShaderLanguage lang)171 void GenerateDepalShaderFloat(char *buffer, GEBufferFormat pixelFormat, ShaderLanguage lang) {
172 	char *p = buffer;
173 
174 	const char *modFunc = lang == HLSL_D3D9 ? "fmod" : "mod";
175 
176 	char lookupMethod[128] = "index.r";
177 	char offset[128] = "";
178 
179 	const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
180 	const u32 clutBase = gstate.getClutIndexStartPos();
181 
182 	const int shift = gstate.getClutIndexShift();
183 	const int mask = gstate.getClutIndexMask();
184 
185 	float index_multiplier = 1.0f;
186 	// pixelformat is the format of the texture we are sampling.
187 	bool formatOK = true;
188 	switch (pixelFormat) {
189 	case GE_FORMAT_8888:
190 		if ((mask & (mask + 1)) == 0) {
191 			// If the value has all bits contiguous (bitmask check above), we can mod by it + 1.
192 			const char *rgba = "rrrrrrrrggggggggbbbbbbbbaaaaaaaa";
193 			const u8 rgba_shift = shift & 7;
194 			if (rgba_shift == 0 && mask == 0xFF) {
195 				sprintf(lookupMethod, "index.%c", rgba[shift]);
196 			} else {
197 				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], 255.99f / (1 << rgba_shift), mask + 1);
198 				index_multiplier = 1.0f / 256.0f;
199 				// Format was OK if there weren't bits from another component.
200 				formatOK = mask <= 255 - (1 << rgba_shift);
201 			}
202 		} else {
203 			formatOK = false;
204 		}
205 		break;
206 	case GE_FORMAT_4444:
207 		if ((mask & (mask + 1)) == 0 && shift < 16) {
208 			const char *rgba = "rrrrggggbbbbaaaa";
209 			const u8 rgba_shift = shift & 3;
210 			if (rgba_shift == 0 && mask == 0xF) {
211 				sprintf(lookupMethod, "index.%c", rgba[shift]);
212 				index_multiplier = 15.0f / 256.0f;
213 			} else {
214 				// Let's divide and mod to get the right bits.  A common case is shift=0, mask=01.
215 				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], 15.99f / (1 << rgba_shift), mask + 1);
216 				index_multiplier = 1.0f / 256.0f;
217 				formatOK = mask <= 15 - (1 << rgba_shift);
218 			}
219 		} else {
220 			formatOK = false;
221 		}
222 		break;
223 	case GE_FORMAT_565:
224 		if ((mask & (mask + 1)) == 0 && shift < 16) {
225 			const u8 shifts[16] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4 };
226 			const int multipliers[16] = { 31, 31, 31, 31, 31, 63, 63, 63, 63, 63, 63, 31, 31, 31, 31, 31 };
227 			const char *rgba = "rrrrrggggggbbbbb";
228 			const u8 rgba_shift = shifts[shift];
229 			if (rgba_shift == 0 && mask == multipliers[shift]) {
230 				sprintf(lookupMethod, "index.%c", rgba[shift]);
231 				index_multiplier = multipliers[shift] / 256.0f;
232 			} else {
233 				// We just need to divide the right component by the right value, and then mod against the mask.
234 				// A common case is shift=1, mask=0f.
235 				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], ((float)multipliers[shift] + 0.99f) / (1 << rgba_shift), mask + 1);
236 				index_multiplier = 1.0f / 256.0f;
237 				formatOK = mask <= multipliers[shift] - (1 << rgba_shift);
238 			}
239 		} else {
240 			formatOK = false;
241 		}
242 		break;
243 	case GE_FORMAT_5551:
244 		if ((mask & (mask + 1)) == 0 && shift < 16) {
245 			const char *rgba = "rrrrrgggggbbbbba";
246 			const u8 rgba_shift = shift % 5;
247 			if (rgba_shift == 0 && mask == 0x1F) {
248 				sprintf(lookupMethod, "index.%c", rgba[shift]);
249 				index_multiplier = 31.0f / 256.0f;
250 			} else if (shift == 15 && mask == 1) {
251 				sprintf(lookupMethod, "index.%c", rgba[shift]);
252 				index_multiplier = 1.0f / 256.0f;
253 			} else {
254 				// A isn't possible here.
255 				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], 31.99f / (1 << rgba_shift), mask + 1);
256 				index_multiplier = 1.0f / 256.0f;
257 				formatOK = mask <= 31 - (1 << rgba_shift);
258 			}
259 		} else {
260 			formatOK = false;
261 		}
262 		break;
263 	case GE_FORMAT_DEPTH16:
264 	{
265 		// TODO: I think we can handle most scenarios here, but texturing from depth buffers requires an extension on ES 2.0 anyway.
266 		if ((mask & (mask + 1)) == 0 && shift < 16) {
267 			index_multiplier = 1.0f / (float)(1 << shift);
268 			truncate_cpy(lookupMethod, "index.r");
269 		} else {
270 			formatOK = false;
271 		}
272 		break;
273 	}
274 	default:
275 		break;
276 	}
277 
278 	float texturePixels = 256.f;
279 	if (clutFormat != GE_CMODE_32BIT_ABGR8888) {
280 		texturePixels = 512.f;
281 		index_multiplier *= 0.5f;
282 	}
283 
284 	// Adjust index_multiplier, similar to the use of 15.99 instead of 16 in the ES 3 path.
285 	// index_multiplier -= 0.01f / texturePixels;
286 
287 	if (!formatOK) {
288 		ERROR_LOG_REPORT_ONCE(depal, G3D, "%i depal unsupported: shift=%i mask=%02x offset=%d", pixelFormat, shift, mask, clutBase);
289 	}
290 
291 	// Offset by half a texel (plus clutBase) to turn NEAREST filtering into FLOOR.
292 	// Technically, the clutBase should be |'d, not added, but that's hard with floats.
293 	float texel_offset = ((float)clutBase + 0.5f) / texturePixels;
294 	sprintf(offset, " + %f", texel_offset);
295 
296 	if (lang == GLSL_1xx) {
297 		if (gl_extensions.IsGLES) {
298 			WRITE(p, "#version 100\n");
299 			WRITE(p, "precision mediump float;\n");
300 		} else {
301 			WRITE(p, "#version %d\n", gl_extensions.GLSLVersion());
302 			if (gl_extensions.VersionGEThan(3, 0, 0)) {
303 				WRITE(p, "#define gl_FragColor fragColor0\n");
304 				WRITE(p, "out vec4 fragColor0;\n");
305 			}
306 		}
307 		WRITE(p, "varying vec2 v_texcoord0;\n");
308 		WRITE(p, "uniform sampler2D tex;\n");
309 		WRITE(p, "uniform sampler2D pal;\n");
310 		WRITE(p, "void main() {\n");
311 		WRITE(p, "  vec4 index = texture2D(tex, v_texcoord0);\n");
312 		WRITE(p, "  float coord = (%s * %f)%s;\n", lookupMethod, index_multiplier, offset);
313 		WRITE(p, "  gl_FragColor = texture2D(pal, vec2(coord, 0.0));\n");
314 		WRITE(p, "}\n");
315 	} else if (lang == HLSL_D3D9) {
316 		WRITE(p, "sampler tex: register(s0);\n");
317 		WRITE(p, "sampler pal: register(s1);\n");
318 		WRITE(p, "float4 main(float2 v_texcoord0 : TEXCOORD0) : COLOR0 {\n");
319 		WRITE(p, "  float4 index = tex2D(tex, v_texcoord0);\n");
320 		WRITE(p, "  float coord = (%s * %f)%s;\n", lookupMethod, index_multiplier, offset);
321 		WRITE(p, "  return tex2D(pal, float2(coord, 0.0)).bgra;\n");
322 		WRITE(p, "}\n");
323 	}
324 }
325 
GenerateDepalShader(char * buffer,GEBufferFormat pixelFormat,ShaderLanguage language)326 void GenerateDepalShader(char *buffer, GEBufferFormat pixelFormat, ShaderLanguage language) {
327 	switch (language) {
328 	case GLSL_1xx:
329 		GenerateDepalShaderFloat(buffer, pixelFormat, language);
330 		break;
331 	case GLSL_3xx:
332 	case GLSL_VULKAN:
333 	case HLSL_D3D11:
334 		GenerateDepalShader300(buffer, pixelFormat, language);
335 		break;
336 	case HLSL_D3D9:
337 		GenerateDepalShaderFloat(buffer, pixelFormat, language);
338 		break;
339 	default:
340 		_assert_msg_(false, "Depal shader language not supported: %d", (int)language);
341 	}
342 }
343 
GenerateShaderID(uint32_t clutMode,GEBufferFormat pixelFormat) const344 uint32_t DepalShaderCacheCommon::GenerateShaderID(uint32_t clutMode, GEBufferFormat pixelFormat) const {
345 	return (clutMode & 0xFFFFFF) | (pixelFormat << 24);
346 }
347 
GetClutID(GEPaletteFormat clutFormat,uint32_t clutHash) const348 uint32_t DepalShaderCacheCommon::GetClutID(GEPaletteFormat clutFormat, uint32_t clutHash) const {
349 	// Simplistic.
350 	return clutHash ^ (uint32_t)clutFormat;
351 }
352 
// WRITE is only a helper for the shader generators above; drop it so it can't leak into later code.
#undef WRITE
354