// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#include "ext/xxhash.h"
#include "Common/Data/Convert/ColorConv.h"
#include "Common/CPUDetect.h"
#include "Common/Log.h"

#include "GPU/GPU.h"
#include "GPU/GPUState.h"
#include "GPU/Common/TextureDecoder.h"
// NEON is in a separate file so that it can be compiled with a runtime check.
#include "GPU/Common/TextureDecoderNEON.h"

#ifdef _M_SSE
#include <emmintrin.h>
#if _M_SSE >= 0x401
#include <smmintrin.h>
#endif

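// Quick change-detection hash over texture memory. The fast path below eats
// 64 bytes per iteration (four 16-byte vectors), mixing each into the
// accumulator with mullo/add/xor against a rolling constant vector; it
// requires checkp to be 16-byte aligned and size to be a multiple of 64.
// Anything else takes the simple add/xor fallback over pairs of u32s.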
u32 QuickTexHashSSE2(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
		__m128i cursor = _mm_set1_epi32(0);
		__m128i cursor2 = _mm_set_epi16(0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU);
		__m128i update = _mm_set1_epi16(0x2455U);
		const __m128i *p = (const __m128i *)checkp;
		for (u32 i = 0; i < size / 16; i += 4) {
			__m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);
			cursor = _mm_add_epi16(cursor, chunk);
			cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));
			cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));
			chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);
			cursor = _mm_xor_si128(cursor, chunk);
			cursor2 = _mm_add_epi16(cursor2, update);
		}
		cursor = _mm_add_epi32(cursor, cursor2);
		// Add the four parts into the low i32.
		cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));
		cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));
		check = _mm_cvtsi128_si32(cursor);
	} else {
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size / 8; ++i) {
			check += *p++;
			check ^= *p++;
		}
	}

	return check;
}
#endif

// Masks to downalign bufw to 16 bytes, and wrap at 2048.
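// Each entry below is 0x7FF & ~(((8 * 16) / bpp) - 1): clamp bufw below 2048
// and clear any bits addressing less than one 16-byte row. For example,
// CLUT4 is 4 bits per pixel, so 16 bytes hold 32 pixels and the low 5 bits
// of bufw are dropped.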
static const u32 textureAlignMask16[16] = {
	0x7FF & ~(((8 * 16) / 16) - 1),  //GE_TFMT_5650,
	0x7FF & ~(((8 * 16) / 16) - 1),  //GE_TFMT_5551,
	0x7FF & ~(((8 * 16) / 16) - 1),  //GE_TFMT_4444,
	0x7FF & ~(((8 * 16) / 32) - 1),  //GE_TFMT_8888,
	0x7FF & ~(((8 * 16) / 4) - 1),   //GE_TFMT_CLUT4,
	0x7FF & ~(((8 * 16) / 8) - 1),   //GE_TFMT_CLUT8,
	0x7FF & ~(((8 * 16) / 16) - 1),  //GE_TFMT_CLUT16,
	0x7FF & ~(((8 * 16) / 32) - 1),  //GE_TFMT_CLUT32,
	0x7FF, //GE_TFMT_DXT1,
	0x7FF, //GE_TFMT_DXT3,
	0x7FF, //GE_TFMT_DXT5,
	0,   // INVALID,
	0,   // INVALID,
	0,   // INVALID,
	0,   // INVALID,
	0,   // INVALID,
};

u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format) {
	// This is a hack to allow us to draw the huge PPGe texture, which is always in kernel ram.
	if (texaddr >= PSP_GetKernelMemoryBase() && texaddr < PSP_GetKernelMemoryEnd())
		return gstate.texbufwidth[level] & 0x1FFF;

	u32 bufw = gstate.texbufwidth[level] & textureAlignMask16[format];
	if (bufw == 0 && format <= GE_TFMT_DXT5) {
		// If it's less than 16 bytes, use 16 bytes.
		bufw = (8 * 16) / textureBitsPerPixel[format];
	}
	return bufw;
}

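// Scalar mirror of QuickTexHashSSE2, kept lane-for-lane compatible so the
// hash is stable across platforms. Note that cursor2_initial holds the same
// constants as the _mm_set_epi16() call above, just reversed: that intrinsic
// takes its arguments from the highest lane down to the lowest.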
u32 QuickTexHashNonSSE(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
		static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};
		union u32x4_u16x8 {
			u32 x32[4];
			u16 x16[8];
		};
		u32x4_u16x8 cursor{};
		u32x4_u16x8 cursor2;
		static const u16 update[8] = {0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U};

		for (u32 j = 0; j < 8; ++j) {
			cursor2.x16[j] = cursor2_initial[j];
		}

		const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;
		for (u32 i = 0; i < size / 16; i += 4) {
			for (u32 j = 0; j < 8; ++j) {
				const u16 temp = p[i + 0].x16[j] * cursor2.x16[j];
				cursor.x16[j] += temp;
			}
			for (u32 j = 0; j < 4; ++j) {
				cursor.x32[j] ^= p[i + 1].x32[j];
				cursor.x32[j] += p[i + 2].x32[j];
			}
			for (u32 j = 0; j < 8; ++j) {
				const u16 temp = p[i + 3].x16[j] * cursor2.x16[j];
				cursor.x16[j] ^= temp;
			}
			for (u32 j = 0; j < 8; ++j) {
				cursor2.x16[j] += update[j];
			}
		}

		for (u32 j = 0; j < 4; ++j) {
			cursor.x32[j] += cursor2.x32[j];
		}
		check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];
	} else {
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size / 8; ++i) {
			check += *p++;
			check ^= *p++;
		}
	}

	return check;
}

#if !PPSSPP_ARCH(ARM64) && !defined(_M_SSE)
static u32 QuickTexHashBasic(const void *checkp, u32 size) {
#if PPSSPP_ARCH(ARM) && defined(__GNUC__)
	__builtin_prefetch(checkp, 0, 0);

	u32 check;
	asm volatile (
		// Let's change size to the end address.
		"add %1, %1, %2\n"
		"mov r6, #0\n"

		".align 2\n"

		// If we have zero-sized input, we'll return garbage.  Oh well, shouldn't happen.
		"QuickTexHashBasic_next:\n"
		"ldmia %2!, {r2-r5}\n"
		"add r6, r6, r2\n"
		"eor r6, r6, r3\n"
		"cmp %2, %1\n"
		"add r6, r6, r4\n"
		"eor r6, r6, r5\n"
		"blo QuickTexHashBasic_next\n"

		".align 2\n"

		"QuickTexHashBasic_done:\n"
		"mov %0, r6\n"

		: "=r"(check)
		: "r"(size), "r"(checkp)
		: "r2", "r3", "r4", "r5", "r6"
	);
#else
	u32 check = 0;
	const u32 size_u32 = size / 4;
	const u32 *p = (const u32 *)checkp;
	for (u32 i = 0; i < size_u32; i += 4) {
		check += p[i + 0];
		check ^= p[i + 1];
		check += p[i + 2];
		check ^= p[i + 3];
	}
#endif

	return check;
}
#endif

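// Swizzle a linear texture into the PSP's tiled layout: texels are stored in
// blocks 16 bytes wide by 8 rows tall, laid out left to right, then top to
// bottom. bxc/byc are the block counts and pitch is the source pitch in bytes.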
void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
	// ysrcp is a u32 pointer, so this is convenient.
	const u32 pitchBy32 = pitch >> 2;
#ifdef _M_SSE
	if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {
		__m128i *dest = (__m128i *)texptr;
		// The pitch parameter is in bytes, so shift down for 128-bit.
		// Note: it's always aligned to 16 bytes, so this is safe.
		const u32 pitchBy128 = pitch >> 4;
		for (int by = 0; by < byc; by++) {
			const __m128i *xsrc = (const __m128i *)ysrcp;
			for (int bx = 0; bx < bxc; bx++) {
				const __m128i *src = xsrc;
				for (int n = 0; n < 2; n++) {
					// Textures are always 16-byte aligned so this is fine.
					__m128i temp1 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp2 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp3 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp4 = _mm_load_si128(src);
					src += pitchBy128;

					_mm_store_si128(dest, temp1);
					_mm_store_si128(dest + 1, temp2);
					_mm_store_si128(dest + 2, temp3);
					_mm_store_si128(dest + 3, temp4);
					dest += 4;
				}
				xsrc++;
			}
			ysrcp += pitchBy32 * 8;
		}
	} else
#endif
	{
		u32 *dest = (u32 *)texptr;
		for (int by = 0; by < byc; by++) {
			const u32 *xsrc = ysrcp;
			for (int bx = 0; bx < bxc; bx++) {
				const u32 *src = xsrc;
				for (int n = 0; n < 8; n++) {
					memcpy(dest, src, 16);
					src += pitchBy32;
					dest += 4;
				}
				xsrc += 4;
			}
			ysrcp += pitchBy32 * 8;
		}
	}
}

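// Inverse of DoSwizzleTex16: copy the 16x8-byte tiles back out into a linear
// destination buffer. Same block geometry, opposite copy direction.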
void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
	// ydestp is a u32 pointer, so this is convenient.
	const u32 pitchBy32 = pitch >> 2;

#ifdef _M_SSE
	if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
		const __m128i *src = (const __m128i *)texptr;
		// The pitch parameter is in bytes, so shift down for 128-bit.
		// Note: it's always aligned to 16 bytes, so this is safe.
		const u32 pitchBy128 = pitch >> 4;
		for (int by = 0; by < byc; by++) {
			__m128i *xdest = (__m128i *)ydestp;
			for (int bx = 0; bx < bxc; bx++) {
				__m128i *dest = xdest;
				for (int n = 0; n < 2; n++) {
					// Textures are always 16-byte aligned so this is fine.
					__m128i temp1 = _mm_load_si128(src);
					__m128i temp2 = _mm_load_si128(src + 1);
					__m128i temp3 = _mm_load_si128(src + 2);
					__m128i temp4 = _mm_load_si128(src + 3);
					_mm_store_si128(dest, temp1);
					dest += pitchBy128;
					_mm_store_si128(dest, temp2);
					dest += pitchBy128;
					_mm_store_si128(dest, temp3);
					dest += pitchBy128;
					_mm_store_si128(dest, temp4);
					dest += pitchBy128;
					src += 4;
				}
				xdest++;
			}
			ydestp += pitchBy32 * 8;
		}
	} else
#endif
	{
		const u32 *src = (const u32 *)texptr;
		for (int by = 0; by < byc; by++) {
			u32 *xdest = ydestp;
			for (int bx = 0; bx < bxc; bx++) {
				u32 *dest = xdest;
				for (int n = 0; n < 8; n++) {
					memcpy(dest, src, 16);
					dest += pitchBy32;
					src += 4;
				}
				xdest += 4;
			}
			ydestp += pitchBy32 * 8;
		}
	}
}

#if !PPSSPP_ARCH(ARM64) && !defined(_M_SSE)
QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
QuickTexHashFunc StableQuickTexHash = &QuickTexHashNonSSE;
UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
#endif

// This has to be done after CPUDetect has done its magic.
void SetupTextureDecoder() {
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
	if (cpu_info.bNEON) {
		DoQuickTexHash = &QuickTexHashNEON;
		StableQuickTexHash = &QuickTexHashNEON;
		DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
	}
#endif
}

// S3TC / DXT Decoder
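// As used here, a DXT block covers 4x4 texels: two RGB565 endpoint colors
// plus a 2-bit palette index per texel (DXT1Block). DXT3 adds a 4-bit
// explicit alpha per texel, and DXT5 adds two alpha endpoints with 3-bit
// interpolation indices.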
class DXTDecoder {
public:
	inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);
	inline void DecodeAlphaDXT5(const DXT5Block *src);
	inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height);
	inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height);
	inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height);

protected:
	u32 colors_[4];
	u8 alpha_[8];
};

static inline u32 makecol(int r, int g, int b, int a) {
	return (a << 24) | (b << 16) | (g << 8) | r;
}

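// (c1 + c1 + c2) / 3, i.e. the one-third point between the two endpoints,
// which is what the third and fourth DXT palette entries need.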
static inline int mix_2_3(int c1, int c2) {
	return (c1 + c1 + c2) / 3;
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {
	u16 c1 = src->color1;
	u16 c2 = src->color2;
	int blue1 = (c1 << 3) & 0xF8;
	int blue2 = (c2 << 3) & 0xF8;
	int green1 = (c1 >> 3) & 0xFC;
	int green2 = (c2 >> 3) & 0xFC;
	int red1 = (c1 >> 8) & 0xF8;
	int red2 = (c2 >> 8) & 0xF8;

	// Keep alpha zero for non-DXT1 to skip masking the colors.
	int alpha = ignore1bitAlpha ? 0 : 255;

	colors_[0] = makecol(red1, green1, blue1, alpha);
	colors_[1] = makecol(red2, green2, blue2, alpha);
	if (c1 > c2) {
		colors_[2] = makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
		colors_[3] = makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
	} else {
		// Average - these are always left shifted, so no need to worry about ties.
		int red3 = (red1 + red2) / 2;
		int green3 = (green1 + green2) / 2;
		int blue3 = (blue1 + blue2) / 2;
		colors_[2] = makecol(red3, green3, blue3, alpha);
		colors_[3] = makecol(0, 0, 0, 0);
	}
}

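// DXT5 interpolated alpha: roughly alpha = ((7 - n) * alpha1 + n * alpha2) / 7,
// computed in 8.8 fixed point with a small rounding bias before shifting back
// down. lerp6 is the 6-entry variant with a divisor of 5.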
static inline u8 lerp8(const DXT5Block *src, int n) {
	// These weights multiply alpha1/alpha2 into 8.8 fixed point.
	int alpha1 = (src->alpha1 * ((7 - n) << 8)) / 7;
	int alpha2 = (src->alpha2 * (n << 8)) / 7;
	return (u8)((alpha1 + alpha2 + 31) >> 8);
}

static inline u8 lerp6(const DXT5Block *src, int n) {
	int alpha1 = (src->alpha1 * ((5 - n) << 8)) / 5;
	int alpha2 = (src->alpha2 * (n << 8)) / 5;
	return (u8)((alpha1 + alpha2 + 31) >> 8);
}

void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {
	alpha_[0] = src->alpha1;
	alpha_[1] = src->alpha2;
	if (alpha_[0] > alpha_[1]) {
		alpha_[2] = lerp8(src, 1);
		alpha_[3] = lerp8(src, 2);
		alpha_[4] = lerp8(src, 3);
		alpha_[5] = lerp8(src, 4);
		alpha_[6] = lerp8(src, 5);
		alpha_[7] = lerp8(src, 6);
	} else {
		alpha_[2] = lerp6(src, 1);
		alpha_[3] = lerp6(src, 2);
		alpha_[4] = lerp6(src, 3);
		alpha_[5] = lerp6(src, 4);
		alpha_[6] = 0;
		alpha_[7] = 255;
	}
}

void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height) {
	for (int y = 0; y < height; y++) {
		int colordata = src->lines[y];
		for (int x = 0; x < 4; x++) {
			dst[x] = colors_[colordata & 3];
			colordata >>= 2;
		}
		dst += pitch;
	}
}

void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height) {
	for (int y = 0; y < height; y++) {
		int colordata = src->color.lines[y];
		u32 alphadata = src->alphaLines[y];
		for (int x = 0; x < 4; x++) {
			dst[x] = colors_[colordata & 3] | (alphadata << 28);
			colordata >>= 2;
			alphadata >>= 4;
		}
		dst += pitch;
	}
}

void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height) {
	// 48 bits, 3 bit index per pixel, 12 bits per line.
	u64 alphadata = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;

	for (int y = 0; y < height; y++) {
		int colordata = src->color.lines[y];
		for (int x = 0; x < 4; x++) {
			dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);
			colordata >>= 2;
			alphadata >>= 3;
		}
		dst += pitch;
	}
}

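// Single-texel variants of the block decoders above, for callers that only
// need one texel instead of a whole 4x4 block.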
uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {
	_dbg_assert_(x >= 0 && x < 4);
	_dbg_assert_(y >= 0 && y < 4);

	uint16_t c1 = src->color1;
	uint16_t c2 = src->color2;
	int blue1 = (c1 << 3) & 0xF8;
	int blue2 = (c2 << 3) & 0xF8;
	int green1 = (c1 >> 3) & 0xFC;
	int green2 = (c2 >> 3) & 0xFC;
	int red1 = (c1 >> 8) & 0xF8;
	int red2 = (c2 >> 8) & 0xF8;

	int colorIndex = (src->lines[y] >> (x * 2)) & 3;
	if (colorIndex == 0) {
		return makecol(red1, green1, blue1, alpha);
	} else if (colorIndex == 1) {
		return makecol(red2, green2, blue2, alpha);
	} else if (c1 > c2) {
		if (colorIndex == 2) {
			return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
		}
		return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
	} else if (colorIndex == 3) {
		return makecol(0, 0, 0, 0);
	}

	// Average - these are always left shifted, so no need to worry about ties.
	int red3 = (red1 + red2) / 2;
	int green3 = (green1 + green2) / 2;
	int blue3 = (blue1 + blue2) / 2;
	return makecol(red3, green3, blue3, alpha);
}

uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {
	return GetDXTTexelColor(src, x, y, 255);
}

uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {
	uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
	u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF;
	return color | (alpha << 28);
}

uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {
	uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
	uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;
	int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7;

	if (alphaIndex == 0) {
		return color | (src->alpha1 << 24);
	} else if (alphaIndex == 1) {
		return color | (src->alpha2 << 24);
	} else if (src->alpha1 > src->alpha2) {
		return color | (lerp8(src, alphaIndex - 1) << 24);
	} else if (alphaIndex == 6) {
		return color;
	} else if (alphaIndex == 7) {
		return color | 0xFF000000;
	}
	return color | (lerp6(src, alphaIndex - 1) << 24);
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) {
	DXTDecoder dxt;
	dxt.DecodeColors(src, ignore1bitAlpha);
	dxt.WriteColorsDXT1(dst, src, pitch, height);
}

void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height) {
	DXTDecoder dxt;
	dxt.DecodeColors(&src->color, true);
	dxt.WriteColorsDXT3(dst, src, pitch, height);
}

void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height) {
	DXTDecoder dxt;
	dxt.DecodeColors(&src->color, true);
	dxt.DecodeAlphaDXT5(src);
	dxt.WriteColorsDXT5(dst, src, pitch, height);
}

#ifdef _M_SSE
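// OR the four 32-bit lanes together so that any set bit anywhere in the
// vector shows up in the returned dword.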
static inline u32 CombineSSEBitsToDWORD(const __m128i &v) {
	__m128i temp;
	temp = _mm_or_si128(v, _mm_srli_si128(v, 8));
	temp = _mm_or_si128(temp, _mm_srli_si128(temp, 4));
	return _mm_cvtsi128_si32(temp);
}

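// The CheckAlpha*SSE2 functions AND the alpha bits of all pixels together;
// the moment the accumulated bits differ from the full-alpha mask (checked
// once per row), some pixel has less than maximum alpha and we can return
// CHECKALPHA_ANY early. Surviving every row means alpha is fully opaque.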
CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi32(0xFF000000);

	const __m128i *p = (const __m128i *)pixelData;
	const int w4 = w / 4;
	const int stride4 = stride / 4;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w4; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride4;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0x000F);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0x0001);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0xF000);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0x8000);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}
#endif

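// Dispatchers: take the SSE2/NEON path when width and stride are multiples
// of the vector width, otherwise fall back to a scalar AND over each row.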
CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.)
	if ((w & 3) == 0 && (stride & 3) == 0) {
#ifdef _M_SSE
		return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	for (int y = 0; y < h; ++y) {
		u32 bits = 0xFF000000;
		for (int i = 0; i < w; ++i) {
			bits &= p[i];
		}

		if (bits != 0xFF000000) {
			// We're done, we hit non-full alpha.
			return CHECKALPHA_ANY;
		}

		p += stride;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0x000F000F;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0x000F000F) {
			// We're done, we hit non-full alpha.
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0x00010001;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0x00010001) {
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0xF000F000;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0xF000F000) {
			// We're done, we hit non-full alpha.
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0x80008000;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0x80008000) {
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}