// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#include "ext/xxhash.h"
#include "Common/Data/Convert/ColorConv.h"
#include "Common/CPUDetect.h"
#include "Common/Log.h"

#include "GPU/GPU.h"
#include "GPU/GPUState.h"
#include "GPU/Common/TextureDecoder.h"
// NEON is in a separate file so that it can be compiled with a runtime check.
#include "GPU/Common/TextureDecoderNEON.h"

#ifdef _M_SSE
#include <emmintrin.h>
#if _M_SSE >= 0x401
#include <smmintrin.h>
#endif

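// Quick, non-cryptographic hash of texture data, used by the texture cache to
// detect when a texture in PSP RAM has been modified. The SSE2 path consumes
// 64 bytes per loop iteration; the scalar fallback below the alignment check
// just mixes the data 8 bytes at a time.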
u32 QuickTexHashSSE2(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
		__m128i cursor = _mm_set1_epi32(0);
		__m128i cursor2 = _mm_set_epi16(0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU);
		__m128i update = _mm_set1_epi16(0x2455U);
		const __m128i *p = (const __m128i *)checkp;
		for (u32 i = 0; i < size / 16; i += 4) {
			__m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);
			cursor = _mm_add_epi16(cursor, chunk);
			cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));
			cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));
			chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);
			cursor = _mm_xor_si128(cursor, chunk);
			cursor2 = _mm_add_epi16(cursor2, update);
		}
		cursor = _mm_add_epi32(cursor, cursor2);
		// Add the four parts into the low i32.
		cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));
		cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));
		check = _mm_cvtsi128_si32(cursor);
	} else {
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size / 8; ++i) {
			check += *p++;
			check ^= *p++;
		}
	}

	return check;
}
#endif

// Masks to downalign bufw to 16 bytes, and wrap at 2048.
static const u32 textureAlignMask16[16] = {
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5650,
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5551,
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_4444,
	0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_8888,
	0x7FF & ~(((8 * 16) / 4) - 1), //GE_TFMT_CLUT4,
	0x7FF & ~(((8 * 16) / 8) - 1), //GE_TFMT_CLUT8,
	0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_CLUT16,
	0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_CLUT32,
	0x7FF, //GE_TFMT_DXT1,
	0x7FF, //GE_TFMT_DXT3,
	0x7FF, //GE_TFMT_DXT5,
	0, // INVALID,
	0, // INVALID,
	0, // INVALID,
	0, // INVALID,
	0, // INVALID,
};
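// Worked example of the table above: each expression is "pixels per 16 bytes"
// minus one. For GE_TFMT_CLUT4 (4 bits per pixel), 16 bytes hold
// (8 * 16) / 4 = 32 pixels, so the mask is 0x7FF & ~31 = 0x7E0: bufw is rounded
// down to a multiple of 32 pixels and wrapped at 2048.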

u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format) {
	// This is a hack to allow us to draw the huge PPGe texture, which is always in kernel RAM.
	if (texaddr >= PSP_GetKernelMemoryBase() && texaddr < PSP_GetKernelMemoryEnd())
		return gstate.texbufwidth[level] & 0x1FFF;

	u32 bufw = gstate.texbufwidth[level] & textureAlignMask16[format];
	if (bufw == 0 && format <= GE_TFMT_DXT5) {
		// If it's less than 16 bytes, use 16 bytes.
		bufw = (8 * 16) / textureBitsPerPixel[format];
	}
	return bufw;
}

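// Scalar equivalent of QuickTexHashSSE2: same constants, same per-lane adds,
// xors and 16-bit multiplies, so it produces the same hash values. It serves as
// the StableQuickTexHash fallback when SSE is unavailable (see below).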
u32 QuickTexHashNonSSE(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
		static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};
		union u32x4_u16x8 {
			u32 x32[4];
			u16 x16[8];
		};
		u32x4_u16x8 cursor{};
		u32x4_u16x8 cursor2;
		static const u16 update[8] = {0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U};

		for (u32 j = 0; j < 8; ++j) {
			cursor2.x16[j] = cursor2_initial[j];
		}

		const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;
		for (u32 i = 0; i < size / 16; i += 4) {
			for (u32 j = 0; j < 8; ++j) {
				const u16 temp = p[i + 0].x16[j] * cursor2.x16[j];
				cursor.x16[j] += temp;
			}
			for (u32 j = 0; j < 4; ++j) {
				cursor.x32[j] ^= p[i + 1].x32[j];
				cursor.x32[j] += p[i + 2].x32[j];
			}
			for (u32 j = 0; j < 8; ++j) {
				const u16 temp = p[i + 3].x16[j] * cursor2.x16[j];
				cursor.x16[j] ^= temp;
			}
			for (u32 j = 0; j < 8; ++j) {
				cursor2.x16[j] += update[j];
			}
		}

		for (u32 j = 0; j < 4; ++j) {
			cursor.x32[j] += cursor2.x32[j];
		}
		check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];
	} else {
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size / 8; ++i) {
			check += *p++;
			check ^= *p++;
		}
	}

	return check;
}

#if !PPSSPP_ARCH(ARM64) && !defined(_M_SSE)
static u32 QuickTexHashBasic(const void *checkp, u32 size) {
#if PPSSPP_ARCH(ARM) && defined(__GNUC__)
	__builtin_prefetch(checkp, 0, 0);

	u32 check;
	asm volatile (
		// Let's change size to the end address.
		"add %1, %1, %2\n"
		"mov r6, #0\n"

		".align 2\n"

		// If we have zero sized input, we'll return garbage. Oh well, shouldn't happen.
		"QuickTexHashBasic_next:\n"
		"ldmia %2!, {r2-r5}\n"
		"add r6, r6, r2\n"
		"eor r6, r6, r3\n"
		"cmp %2, %1\n"
		"add r6, r6, r4\n"
		"eor r6, r6, r5\n"
		"blo QuickTexHashBasic_next\n"

		".align 2\n"

		"QuickTexHashBasic_done:\n"
		"mov %0, r6\n"

		: "=r"(check)
		: "r"(size), "r"(checkp)
		: "r2", "r3", "r4", "r5", "r6"
	);
#else
	u32 check = 0;
	const u32 size_u32 = size / 4;
	const u32 *p = (const u32 *)checkp;
	for (u32 i = 0; i < size_u32; i += 4) {
		check += p[i + 0];
		check ^= p[i + 1];
		check += p[i + 2];
		check ^= p[i + 3];
	}
#endif

	return check;
}
#endif

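// PSP textures are swizzled in blocks of 16 bytes x 8 rows. DoSwizzleTex16
// copies linear rows into that block layout; DoUnswizzleTex16Basic below does
// the reverse. bxc/byc are the horizontal/vertical block counts, and pitch is
// the row pitch in bytes.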
void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
	// ysrcp is in 32-bits, so this is convenient.
	const u32 pitchBy32 = pitch >> 2;
#ifdef _M_SSE
	if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {
		__m128i *dest = (__m128i *)texptr;
		// The pitch parameter is in bytes, so shift down for 128-bit.
		// Note: it's always aligned to 16 bytes, so this is safe.
		const u32 pitchBy128 = pitch >> 4;
		for (int by = 0; by < byc; by++) {
			const __m128i *xsrc = (const __m128i *)ysrcp;
			for (int bx = 0; bx < bxc; bx++) {
				const __m128i *src = xsrc;
				for (int n = 0; n < 2; n++) {
					// Textures are always 16-byte aligned so this is fine.
					__m128i temp1 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp2 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp3 = _mm_load_si128(src);
					src += pitchBy128;
					__m128i temp4 = _mm_load_si128(src);
					src += pitchBy128;

					_mm_store_si128(dest, temp1);
					_mm_store_si128(dest + 1, temp2);
					_mm_store_si128(dest + 2, temp3);
					_mm_store_si128(dest + 3, temp4);
					dest += 4;
				}
				xsrc++;
			}
			ysrcp += pitchBy32 * 8;
		}
	} else
#endif
	{
		u32 *dest = (u32 *)texptr;
		for (int by = 0; by < byc; by++) {
			const u32 *xsrc = ysrcp;
			for (int bx = 0; bx < bxc; bx++) {
				const u32 *src = xsrc;
				for (int n = 0; n < 8; n++) {
					memcpy(dest, src, 16);
					src += pitchBy32;
					dest += 4;
				}
				xsrc += 4;
			}
			ysrcp += pitchBy32 * 8;
		}
	}
}

void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
	// ydestp is in 32-bits, so this is convenient.
	const u32 pitchBy32 = pitch >> 2;

#ifdef _M_SSE
	if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
		const __m128i *src = (const __m128i *)texptr;
		// The pitch parameter is in bytes, so shift down for 128-bit.
		// Note: it's always aligned to 16 bytes, so this is safe.
		const u32 pitchBy128 = pitch >> 4;
		for (int by = 0; by < byc; by++) {
			__m128i *xdest = (__m128i *)ydestp;
			for (int bx = 0; bx < bxc; bx++) {
				__m128i *dest = xdest;
				for (int n = 0; n < 2; n++) {
					// Textures are always 16-byte aligned so this is fine.
					__m128i temp1 = _mm_load_si128(src);
					__m128i temp2 = _mm_load_si128(src + 1);
					__m128i temp3 = _mm_load_si128(src + 2);
					__m128i temp4 = _mm_load_si128(src + 3);
					_mm_store_si128(dest, temp1);
					dest += pitchBy128;
					_mm_store_si128(dest, temp2);
					dest += pitchBy128;
					_mm_store_si128(dest, temp3);
					dest += pitchBy128;
					_mm_store_si128(dest, temp4);
					dest += pitchBy128;
					src += 4;
				}
				xdest++;
			}
			ydestp += pitchBy32 * 8;
		}
	} else
#endif
	{
		const u32 *src = (const u32 *)texptr;
		for (int by = 0; by < byc; by++) {
			u32 *xdest = ydestp;
			for (int bx = 0; bx < bxc; bx++) {
				u32 *dest = xdest;
				for (int n = 0; n < 8; n++) {
					memcpy(dest, src, 16);
					dest += pitchBy32;
					src += 4;
				}
				xdest += 4;
			}
			ydestp += pitchBy32 * 8;
		}
	}
}

#if !PPSSPP_ARCH(ARM64) && !defined(_M_SSE)
QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
QuickTexHashFunc StableQuickTexHash = &QuickTexHashNonSSE;
UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
#endif

// This has to be done after CPUDetect has done its magic.
void SetupTextureDecoder() {
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
	if (cpu_info.bNEON) {
		DoQuickTexHash = &QuickTexHashNEON;
		StableQuickTexHash = &QuickTexHashNEON;
		DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
	}
#endif
}

// S3TC / DXT Decoder
class DXTDecoder {
public:
	inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);
	inline void DecodeAlphaDXT5(const DXT5Block *src);
	inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height);
	inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height);
	inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height);

protected:
	u32 colors_[4];
	u8 alpha_[8];
};

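// makecol packs the channels into the output layout used here: R in the low
// byte, then G, B, and A in the high byte (0xAABBGGRR when read as a u32).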
static inline u32 makecol(int r, int g, int b, int a) {
	return (a << 24) | (b << 16) | (g << 8) | r;
}

static inline int mix_2_3(int c1, int c2) {
	return (c1 + c1 + c2) / 3;
}

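// DXT1 color decoding in a nutshell: the two base colors are RGB565, expanded
// to 8 bits by shifting each channel into its high bits (so 5-bit red 0x1F
// becomes 0xF8, not 0xFF). If color1 > color2 as raw 16-bit values, the two
// remaining palette entries are 2:1 blends of the bases; otherwise entry 2 is
// their average and entry 3 is transparent black.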
// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {
	u16 c1 = src->color1;
	u16 c2 = src->color2;
	int blue1 = (c1 << 3) & 0xF8;
	int blue2 = (c2 << 3) & 0xF8;
	int green1 = (c1 >> 3) & 0xFC;
	int green2 = (c2 >> 3) & 0xFC;
	int red1 = (c1 >> 8) & 0xF8;
	int red2 = (c2 >> 8) & 0xF8;

	// Keep alpha zero for non-DXT1 to skip masking the colors.
	int alpha = ignore1bitAlpha ? 0 : 255;

	colors_[0] = makecol(red1, green1, blue1, alpha);
	colors_[1] = makecol(red2, green2, blue2, alpha);
	if (c1 > c2) {
		colors_[2] = makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
		colors_[3] = makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
	} else {
		// Average - these are always left shifted, so no need to worry about ties.
		int red3 = (red1 + red2) / 2;
		int green3 = (green1 + green2) / 2;
		int blue3 = (blue1 + blue2) / 2;
		colors_[2] = makecol(red3, green3, blue3, alpha);
		colors_[3] = makecol(0, 0, 0, 0);
	}
}
377
lerp8(const DXT5Block * src,int n)378 static inline u8 lerp8(const DXT5Block *src, int n) {
379 // These weights multiple alpha1/alpha2 to fixed 8.8 point.
380 int alpha1 = (src->alpha1 * ((7 - n) << 8)) / 7;
381 int alpha2 = (src->alpha2 * (n << 8)) / 7;
382 return (u8)((alpha1 + alpha2 + 31) >> 8);
383 }
384
lerp6(const DXT5Block * src,int n)385 static inline u8 lerp6(const DXT5Block *src, int n) {
386 int alpha1 = (src->alpha1 * ((5 - n) << 8)) / 5;
387 int alpha2 = (src->alpha2 * (n << 8)) / 5;
388 return (u8)((alpha1 + alpha2 + 31) >> 8);
389 }
390
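// DXT5 alpha has two modes, selected by comparing the two reference alphas:
// if alpha1 > alpha2, indices 2-7 are six interpolated steps between them
// (lerp8); otherwise indices 2-5 are four interpolated steps (lerp6) and
// indices 6 and 7 are hard-coded to 0 and 255.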
void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {
	alpha_[0] = src->alpha1;
	alpha_[1] = src->alpha2;
	if (alpha_[0] > alpha_[1]) {
		alpha_[2] = lerp8(src, 1);
		alpha_[3] = lerp8(src, 2);
		alpha_[4] = lerp8(src, 3);
		alpha_[5] = lerp8(src, 4);
		alpha_[6] = lerp8(src, 5);
		alpha_[7] = lerp8(src, 6);
	} else {
		alpha_[2] = lerp6(src, 1);
		alpha_[3] = lerp6(src, 2);
		alpha_[4] = lerp6(src, 3);
		alpha_[5] = lerp6(src, 4);
		alpha_[6] = 0;
		alpha_[7] = 255;
	}
}

void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height) {
	for (int y = 0; y < height; y++) {
		int colordata = src->lines[y];
		for (int x = 0; x < 4; x++) {
			dst[x] = colors_[colordata & 3];
			colordata >>= 2;
		}
		dst += pitch;
	}
}

void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height) {
	for (int y = 0; y < height; y++) {
		int colordata = src->color.lines[y];
		u32 alphadata = src->alphaLines[y];
		for (int x = 0; x < 4; x++) {
			dst[x] = colors_[colordata & 3] | (alphadata << 28);
			colordata >>= 2;
			alphadata >>= 4;
		}
		dst += pitch;
	}
}

void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height) {
	// 48 bits, 3 bit index per pixel, 12 bits per line.
	u64 alphadata = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;

	for (int y = 0; y < height; y++) {
		int colordata = src->color.lines[y];
		for (int x = 0; x < 4; x++) {
			dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);
			colordata >>= 2;
			alphadata >>= 3;
		}
		dst += pitch;
	}
}

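// Per-texel variants: decode a single pixel out of a block without filling a
// whole 4x4 tile. Same math as the block decoders above, just re-derived per
// call, so they are slower but handy when only individual texels are needed.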
uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {
	_dbg_assert_(x >= 0 && x < 4);
	_dbg_assert_(y >= 0 && y < 4);

	uint16_t c1 = src->color1;
	uint16_t c2 = src->color2;
	int blue1 = (c1 << 3) & 0xF8;
	int blue2 = (c2 << 3) & 0xF8;
	int green1 = (c1 >> 3) & 0xFC;
	int green2 = (c2 >> 3) & 0xFC;
	int red1 = (c1 >> 8) & 0xF8;
	int red2 = (c2 >> 8) & 0xF8;

	int colorIndex = (src->lines[y] >> (x * 2)) & 3;
	if (colorIndex == 0) {
		return makecol(red1, green1, blue1, alpha);
	} else if (colorIndex == 1) {
		return makecol(red2, green2, blue2, alpha);
	} else if (c1 > c2) {
		if (colorIndex == 2) {
			return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
		}
		return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
	} else if (colorIndex == 3) {
		return makecol(0, 0, 0, 0);
	}

	// Average - these are always left shifted, so no need to worry about ties.
	int red3 = (red1 + red2) / 2;
	int green3 = (green1 + green2) / 2;
	int blue3 = (blue1 + blue2) / 2;
	return makecol(red3, green3, blue3, alpha);
}

uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {
	return GetDXTTexelColor(src, x, y, 255);
}

uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {
	uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
	u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF;
	return color | (alpha << 28);
}

uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {
	uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
	uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;
	int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7;

	if (alphaIndex == 0) {
		return color | (src->alpha1 << 24);
	} else if (alphaIndex == 1) {
		return color | (src->alpha2 << 24);
	} else if (src->alpha1 > src->alpha2) {
		return color | (lerp8(src, alphaIndex - 1) << 24);
	} else if (alphaIndex == 6) {
		return color;
	} else if (alphaIndex == 7) {
		return color | 0xFF000000;
	}
	return color | (lerp6(src, alphaIndex - 1) << 24);
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) {
	DXTDecoder dxt;
	dxt.DecodeColors(src, ignore1bitAlpha);
	dxt.WriteColorsDXT1(dst, src, pitch, height);
}

void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height) {
	DXTDecoder dxt;
	dxt.DecodeColors(&src->color, true);
	dxt.WriteColorsDXT3(dst, src, pitch, height);
}

void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height) {
	DXTDecoder dxt;
	dxt.DecodeColors(&src->color, true);
	dxt.DecodeAlphaDXT5(src);
	dxt.WriteColorsDXT5(dst, src, pitch, height);
}
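
// Rough usage sketch (illustrative only, not how the real call sites are
// written): decoding a w x h DXT1 mip into a buffer with a pitch of
// pitchInWords u32s walks the compressed data block by block, something like:
//   for (int y = 0; y < h; y += 4) {
//     for (int x = 0; x < w; x += 4) {
//       DecodeDXT1Block(dst + y * pitchInWords + x, blocks++, pitchInWords, std::min(4, h - y), false);
//     }
//   }
// where `dst`, `blocks` and `pitchInWords` are hypothetical names; the height
// argument clamps the last row of blocks for non-multiple-of-4 textures.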

#ifdef _M_SSE
static inline u32 CombineSSEBitsToDWORD(const __m128i &v) {
	__m128i temp;
	temp = _mm_or_si128(v, _mm_srli_si128(v, 8));
	temp = _mm_or_si128(temp, _mm_srli_si128(temp, 4));
	return _mm_cvtsi128_si32(temp);
}
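
// The CheckAlpha*SSE2 functions below all use the same trick: AND the alpha
// bits of every pixel in a row together, then XOR the result against the
// "fully opaque" mask. If anything is left over, at least one pixel had less
// than full alpha and we can bail out with CHECKALPHA_ANY immediately.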

CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi32(0xFF000000);

	const __m128i *p = (const __m128i *)pixelData;
	const int w4 = w / 4;
	const int stride4 = stride / 4;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w4; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride4;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0x000F);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0x0001);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0xF000);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
	const __m128i mask = _mm_set1_epi16((short)0x8000);

	const __m128i *p = (const __m128i *)pixelData;
	const int w8 = w / 8;
	const int stride8 = stride / 8;

	__m128i bits = mask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w8; ++i) {
			const __m128i a = _mm_load_si128(&p[i]);
			bits = _mm_and_si128(bits, a);
		}

		__m128i result = _mm_xor_si128(bits, mask);
		if (CombineSSEBitsToDWORD(result) != 0) {
			return CHECKALPHA_ANY;
		}

		p += stride8;
	}

	return CHECKALPHA_FULL;
}
#endif
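
// The *Basic variants are the generic entry points: they dispatch to the SIMD
// versions when the width and stride are suitably aligned, and otherwise fall
// back to scalar loops that AND two 16-bit pixels per u32 load (or one 32-bit
// pixel for RGBA8888).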

CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.)
	if ((w & 3) == 0 && (stride & 3) == 0) {
#ifdef _M_SSE
		return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	for (int y = 0; y < h; ++y) {
		u32 bits = 0xFF000000;
		for (int i = 0; i < w; ++i) {
			bits &= p[i];
		}

		if (bits != 0xFF000000) {
			// We're done, we hit non-full alpha.
			return CHECKALPHA_ANY;
		}

		p += stride;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0x000F000F;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0x000F000F) {
			// We're done, we hit non-full alpha.
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0x00010001;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0x00010001) {
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaRGBA4444NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0xF000F000;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0xF000F000) {
			// We're done, we hit non-full alpha.
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}

CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) {
	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
	if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
		return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h);
#elif PPSSPP_ARCH(ARM_NEON)
		if (cpu_info.bNEON) {
			return CheckAlphaRGBA5551NEON(pixelData, stride, w, h);
		}
#endif
	}

	const u32 *p = pixelData;
	const int w2 = (w + 1) / 2;
	const int stride2 = (stride + 1) / 2;

	for (int y = 0; y < h; ++y) {
		u32 bits = 0x80008000;
		for (int i = 0; i < w2; ++i) {
			bits &= p[i];
		}

		if (bits != 0x80008000) {
			return CHECKALPHA_ANY;
		}

		p += stride2;
	}

	return CHECKALPHA_FULL;
}