1 // Copyright (c) 2015- PPSSPP Project.
2
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 // GNU General Public License 2.0 for more details.
11
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18 #include "ppsspp_config.h"
19 #include "Common/Data/Convert/ColorConv.h"
20 #include "Common/Data/Convert/SmallDataConvert.h"
21 // NEON is in a separate file so that it can be compiled with a runtime check.
22 #include "Common/Data/Convert/ColorConvNEON.h"
23 #include "Common/Common.h"
24 #include "Common/CPUDetect.h"
25
26 #ifdef _M_SSE
27 #include <emmintrin.h>
28 #endif
29
30 #if _M_SSE >= 0x401
31 #include <smmintrin.h>
32 #endif
33
// Packs one RGBA8888 pixel into RGB565 by truncating each channel (alpha dropped).
// File-local helper: static gives it internal linkage.
static inline u16 RGBA8888toRGB565(u32 px) {
	return ((px >> 3) & 0x001F) | ((px >> 5) & 0x07E0) | ((px >> 8) & 0xF800);
}
37
// Packs one RGBA8888 pixel into RGBA4444 by truncating each channel to 4 bits.
// File-local helper: static gives it internal linkage.
static inline u16 RGBA8888toRGBA4444(u32 px) {
	return ((px >> 4) & 0x000F) | ((px >> 8) & 0x00F0) | ((px >> 12) & 0x0F00) | ((px >> 16) & 0xF000);
}
41
// Packs one BGRA8888 pixel into RGB565; red/blue come from the opposite ends
// compared to the RGBA variant. File-local helper: static internal linkage.
static inline u16 BGRA8888toRGB565(u32 px) {
	return ((px >> 19) & 0x001F) | ((px >> 5) & 0x07E0) | ((px << 8) & 0xF800);
}
45
// Packs one BGRA8888 pixel into RGBA4444 (red/blue swapped relative to the
// RGBA variant). File-local helper: static internal linkage.
static inline u16 BGRA8888toRGBA4444(u32 px) {
	return ((px >> 20) & 0x000F) | ((px >> 8) & 0x00F0) | ((px << 4) & 0x0F00) | ((px >> 16) & 0xF000);
}
49
// Packs one BGRA8888 pixel into RGBA5551 (1-bit alpha taken from the top bit).
// File-local helper: static internal linkage.
static inline u16 BGRA8888toRGBA5551(u32 px) {
	return ((px >> 19) & 0x001F) | ((px >> 6) & 0x03E0) | ((px << 7) & 0x7C00) | ((px >> 16) & 0x8000);
}
53
// Packs one RGBA8888 pixel into RGBA5551 (1-bit alpha taken from the top bit).
// File-local helper: static internal linkage.
static inline u16 RGBA8888toRGBA5551(u32 px) {
	return ((px >> 3) & 0x001F) | ((px >> 6) & 0x03E0) | ((px >> 9) & 0x7C00) | ((px >> 16) & 0x8000);
}
57
58 // convert 4444 image to 8888, parallelizable
// Expand a 4444 image to 8888. Only rows [l, u) are converted, so the work
// can be split across threads.
void convert4444_gl(u16* data, u32* out, int width, int l, int u) {
	for (int row = l; row < u; ++row) {
		const u16 *srcRow = data + row * width;
		u32 *dstRow = out + row * width;
		for (int col = 0; col < width; ++col) {
			const u32 px = srcRow[col];
			// N * 17 replicates a nibble into a full byte (0xN -> 0xNN).
			const u32 r = ((px >> 12) & 0xF) * 17;
			const u32 g = ((px >> 8) & 0xF) * 17;
			const u32 b = ((px >> 4) & 0xF) * 17;
			const u32 a = (px & 0xF) * 17;
			dstRow[col] = (a << 24) | (b << 16) | (g << 8) | r;
		}
	}
}
71
72 // convert 565 image to 8888, parallelizable
convert565_gl(u16 * data,u32 * out,int width,int l,int u)73 void convert565_gl(u16* data, u32* out, int width, int l, int u) {
74 for (int y = l; y < u; ++y) {
75 for (int x = 0; x < width; ++x) {
76 u32 val = data[y*width + x];
77 u32 r = Convert5To8((val >> 11) & 0x1F);
78 u32 g = Convert6To8((val >> 5) & 0x3F);
79 u32 b = Convert5To8((val)& 0x1F);
80 out[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
81 }
82 }
83 }
84
85 // convert 5551 image to 8888, parallelizable
convert5551_gl(u16 * data,u32 * out,int width,int l,int u)86 void convert5551_gl(u16* data, u32* out, int width, int l, int u) {
87 for (int y = l; y < u; ++y) {
88 for (int x = 0; x < width; ++x) {
89 u32 val = data[y*width + x];
90 u32 r = Convert5To8((val >> 11) & 0x1F);
91 u32 g = Convert5To8((val >> 6) & 0x1F);
92 u32 b = Convert5To8((val >> 1) & 0x1F);
93 u32 a = (val & 0x1) * 255;
94 out[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
95 }
96 }
97 }
98
99 // convert 4444 image to 8888, parallelizable
// Expand a 4444 image to 8888, with the channel order the D3D9 path expects.
// Only rows [l, u) are converted, so the work can be split across threads.
void convert4444_dx9(u16* data, u32* out, int width, int l, int u) {
	for (int row = l; row < u; ++row) {
		const u16 *srcRow = data + row * width;
		u32 *dstRow = out + row * width;
		for (int col = 0; col < width; ++col) {
			const u32 px = srcRow[col];
			// N * 17 replicates a nibble into a full byte (0xN -> 0xNN).
			const u32 r = (px & 0xF) * 17;
			const u32 g = ((px >> 4) & 0xF) * 17;
			const u32 b = ((px >> 8) & 0xF) * 17;
			const u32 a = ((px >> 12) & 0xF) * 17;
			dstRow[col] = (a << 24) | (b << 16) | (g << 8) | r;
		}
	}
}
112
113 // convert 565 image to 8888, parallelizable
convert565_dx9(u16 * data,u32 * out,int width,int l,int u)114 void convert565_dx9(u16* data, u32* out, int width, int l, int u) {
115 for (int y = l; y < u; ++y) {
116 for (int x = 0; x < width; ++x) {
117 u32 val = data[y*width + x];
118 u32 r = Convert5To8((val)& 0x1F);
119 u32 g = Convert6To8((val >> 5) & 0x3F);
120 u32 b = Convert5To8((val >> 11) & 0x1F);
121 out[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
122 }
123 }
124 }
125
126 // convert 5551 image to 8888, parallelizable
convert5551_dx9(u16 * data,u32 * out,int width,int l,int u)127 void convert5551_dx9(u16* data, u32* out, int width, int l, int u) {
128 for (int y = l; y < u; ++y) {
129 for (int x = 0; x < width; ++x) {
130 u32 val = data[y*width + x];
131 u32 r = Convert5To8((val >> 0) & 0x1F);
132 u32 g = Convert5To8((val >> 5) & 0x1F);
133 u32 b = Convert5To8((val >> 10) & 0x1F);
134 u32 a = ((val >> 15) & 0x1) * 255;
135 out[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
136 }
137 }
138 }
139
140
141
ConvertBGRA8888ToRGBA8888(u32 * dst,const u32 * src,u32 numPixels)142 void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels) {
143 #ifdef _M_SSE
144 const __m128i maskGA = _mm_set1_epi32(0xFF00FF00);
145
146 const __m128i *srcp = (const __m128i *)src;
147 __m128i *dstp = (__m128i *)dst;
148 u32 sseChunks = numPixels / 4;
149 if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
150 sseChunks = 0;
151 }
152 for (u32 i = 0; i < sseChunks; ++i) {
153 __m128i c = _mm_load_si128(&srcp[i]);
154 __m128i rb = _mm_andnot_si128(maskGA, c);
155 c = _mm_and_si128(c, maskGA);
156
157 __m128i b = _mm_srli_epi32(rb, 16);
158 __m128i r = _mm_slli_epi32(rb, 16);
159 c = _mm_or_si128(_mm_or_si128(c, r), b);
160 _mm_store_si128(&dstp[i], c);
161 }
162 // The remainder starts right after those done via SSE.
163 u32 i = sseChunks * 4;
164 #else
165 u32 i = 0;
166 #endif
167 for (; i < numPixels; i++) {
168 const u32 c = src[i];
169 dst[i] = ((c >> 16) & 0x000000FF) |
170 (c & 0xFF00FF00) |
171 ((c << 16) & 0x00FF0000);
172 }
173 }
174
// Drop the alpha channel: BGRA8888 -> tightly packed RGB888 bytes.
void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
	u8 *p = dst;
	for (u32 i = 0; i < numPixels; ++i) {
		const u32 c = src[i];
		*p++ = (u8)(c >> 16);  // R
		*p++ = (u8)(c >> 8);   // G
		*p++ = (u8)(c >> 0);   // B
	}
}
183
// Pack RGBA8888 down to RGBA5551 (alpha keeps only its top bit).
void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
#if _M_SSE >= 0x401
	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i mask = _mm_set1_epi32(0x0000FFFF);

	const __m128i *srcVec = (const __m128i *)src;
	__m128i *dstVec = (__m128i *)dst;
	// Process two source vectors per iteration, hence the even chunk count.
	u32 chunks = (numPixels / 4) & ~1;
	// _mm_packus_epi32 needs SSE 4.1.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
		chunks = 0;
	}
	for (u32 n = 0; n < chunks; n += 2) {
		__m128i c1 = _mm_load_si128(&srcVec[n + 0]);
		__m128i c2 = _mm_load_si128(&srcVec[n + 1]);
		__m128i ag, rb;

		ag = _mm_and_si128(c1, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c1, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));
		c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		ag = _mm_and_si128(c2, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c2, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));
		c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		_mm_store_si128(&dstVec[n / 2], _mm_packus_epi32(c1, c2));
	}
	// Scalar tail begins after the SSE-processed pixels.
	u32 i = chunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		const u32 c = src[i];
		dst[i] = (u16)(((c >> 3) & 0x001F) | ((c >> 6) & 0x03E0) | ((c >> 9) & 0x7C00) | ((c >> 16) & 0x8000));
	}
}
225
// Pack BGRA8888 down to RGBA5551 (red/blue swapped; alpha keeps its top bit).
void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
#if _M_SSE >= 0x401
	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i mask = _mm_set1_epi32(0x0000FFFF);

	const __m128i *srcVec = (const __m128i *)src;
	__m128i *dstVec = (__m128i *)dst;
	// Process two source vectors per iteration, hence the even chunk count.
	u32 chunks = (numPixels / 4) & ~1;
	// _mm_packus_epi32 needs SSE 4.1.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
		chunks = 0;
	}
	for (u32 n = 0; n < chunks; n += 2) {
		__m128i c1 = _mm_load_si128(&srcVec[n + 0]);
		__m128i c2 = _mm_load_si128(&srcVec[n + 1]);
		__m128i ag, rb;

		ag = _mm_and_si128(c1, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c1, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
		c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		ag = _mm_and_si128(c2, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c2, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
		c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		_mm_store_si128(&dstVec[n / 2], _mm_packus_epi32(c1, c2));
	}
	// Scalar tail begins after the SSE-processed pixels.
	u32 i = chunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		const u32 c = src[i];
		dst[i] = (u16)(((c >> 19) & 0x001F) | ((c >> 6) & 0x03E0) | ((c << 7) & 0x7C00) | ((c >> 16) & 0x8000));
	}
}
267
// Pack BGRA8888 down to RGB565, one pixel at a time (alpha dropped).
void ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 i = 0; i < numPixels; i++) {
		const u32 c = src[i];
		dst[i] = (u16)(((c >> 19) & 0x001F) | ((c >> 5) & 0x07E0) | ((c << 8) & 0xF800));
	}
}
273
// Pack BGRA8888 down to RGBA4444, one pixel at a time.
void ConvertBGRA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 i = 0; i < numPixels; i++) {
		const u32 c = src[i];
		dst[i] = (u16)(((c >> 20) & 0x000F) | ((c >> 8) & 0x00F0) | ((c << 4) & 0x0F00) | ((c >> 16) & 0xF000));
	}
}
279
// Pack RGBA8888 down to RGB565, one pixel at a time (alpha dropped).
void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 i = 0; i < numPixels; ++i) {
		const u32 c = src[i];
		dst[i] = (u16)(((c >> 3) & 0x001F) | ((c >> 5) & 0x07E0) | ((c >> 8) & 0xF800));
	}
}
285
// Pack RGBA8888 down to RGBA4444, one pixel at a time.
void ConvertRGBA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 i = 0; i < numPixels; ++i) {
		const u32 c = src[i];
		dst[i] = (u16)(((c >> 4) & 0x000F) | ((c >> 8) & 0x00F0) | ((c >> 12) & 0x0F00) | ((c >> 16) & 0xF000));
	}
}
291
// Drop the alpha byte: RGBA8888 -> tightly packed RGB888. Copies the low
// three bytes of each pixel as they sit in memory (R, G, B on little-endian).
void ConvertRGBA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
	u8 *out = dst;
	for (u32 i = 0; i < numPixels; ++i, out += 3) {
		memcpy(out, &src[i], 3);
	}
}
297
ConvertRGB565ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)298 void ConvertRGB565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
299 #ifdef _M_SSE
300 const __m128i mask5 = _mm_set1_epi16(0x001f);
301 const __m128i mask6 = _mm_set1_epi16(0x003f);
302 const __m128i mask8 = _mm_set1_epi16(0x00ff);
303
304 const __m128i *srcp = (const __m128i *)src;
305 __m128i *dstp = (__m128i *)dst32;
306 u32 sseChunks = numPixels / 8;
307 if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
308 sseChunks = 0;
309 }
310 for (u32 i = 0; i < sseChunks; ++i) {
311 const __m128i c = _mm_load_si128(&srcp[i]);
312
313 // Swizzle, resulting in RR00 RR00.
314 __m128i r = _mm_and_si128(c, mask5);
315 r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
316 r = _mm_and_si128(r, mask8);
317
318 // This one becomes 00GG 00GG.
319 __m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask6);
320 g = _mm_or_si128(_mm_slli_epi16(g, 2), _mm_srli_epi16(g, 4));
321 g = _mm_slli_epi16(g, 8);
322
323 // Almost done, we aim for BB00 BB00 again here.
324 __m128i b = _mm_and_si128(_mm_srli_epi16(c, 11), mask5);
325 b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
326 b = _mm_and_si128(b, mask8);
327
328 // Always set alpha to 00FF 00FF.
329 __m128i a = _mm_slli_epi16(mask8, 8);
330
331 // Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
332 const __m128i rg = _mm_or_si128(r, g);
333 const __m128i ba = _mm_or_si128(b, a);
334 _mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
335 _mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
336 }
337 u32 i = sseChunks * 8;
338 #else
339 u32 i = 0;
340 #endif
341
342 u8 *dst = (u8 *)dst32;
343 for (u32 x = i; x < numPixels; x++) {
344 u16 col = src[x];
345 dst[x * 4] = Convert5To8((col) & 0x1f);
346 dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
347 dst[x * 4 + 2] = Convert5To8((col >> 11) & 0x1f);
348 dst[x * 4 + 3] = 255;
349 }
350 }
351
ConvertRGBA5551ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)352 void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
353 #ifdef _M_SSE
354 const __m128i mask5 = _mm_set1_epi16(0x001f);
355 const __m128i mask8 = _mm_set1_epi16(0x00ff);
356
357 const __m128i *srcp = (const __m128i *)src;
358 __m128i *dstp = (__m128i *)dst32;
359 u32 sseChunks = numPixels / 8;
360 if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
361 sseChunks = 0;
362 }
363 for (u32 i = 0; i < sseChunks; ++i) {
364 const __m128i c = _mm_load_si128(&srcp[i]);
365
366 // Swizzle, resulting in RR00 RR00.
367 __m128i r = _mm_and_si128(c, mask5);
368 r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
369 r = _mm_and_si128(r, mask8);
370
371 // This one becomes 00GG 00GG.
372 __m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask5);
373 g = _mm_or_si128(_mm_slli_epi16(g, 3), _mm_srli_epi16(g, 2));
374 g = _mm_slli_epi16(g, 8);
375
376 // Almost done, we aim for BB00 BB00 again here.
377 __m128i b = _mm_and_si128(_mm_srli_epi16(c, 10), mask5);
378 b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
379 b = _mm_and_si128(b, mask8);
380
381 // 1 bit A to 00AA 00AA.
382 __m128i a = _mm_srai_epi16(c, 15);
383 a = _mm_slli_epi16(a, 8);
384
385 // Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
386 const __m128i rg = _mm_or_si128(r, g);
387 const __m128i ba = _mm_or_si128(b, a);
388 _mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
389 _mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
390 }
391 u32 i = sseChunks * 8;
392 #else
393 u32 i = 0;
394 #endif
395
396 u8 *dst = (u8 *)dst32;
397 for (u32 x = i; x < numPixels; x++) {
398 u16 col = src[x];
399 dst[x * 4] = Convert5To8((col) & 0x1f);
400 dst[x * 4 + 1] = Convert5To8((col >> 5) & 0x1f);
401 dst[x * 4 + 2] = Convert5To8((col >> 10) & 0x1f);
402 dst[x * 4 + 3] = (col >> 15) ? 255 : 0;
403 }
404 }
405
ConvertRGBA4444ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)406 void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
407 #ifdef _M_SSE
408 const __m128i mask4 = _mm_set1_epi16(0x000f);
409
410 const __m128i *srcp = (const __m128i *)src;
411 __m128i *dstp = (__m128i *)dst32;
412 u32 sseChunks = numPixels / 8;
413 if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
414 sseChunks = 0;
415 }
416 for (u32 i = 0; i < sseChunks; ++i) {
417 const __m128i c = _mm_load_si128(&srcp[i]);
418
419 // Let's just grab R000 R000, without swizzling yet.
420 __m128i r = _mm_and_si128(c, mask4);
421 // And then 00G0 00G0.
422 __m128i g = _mm_and_si128(_mm_srli_epi16(c, 4), mask4);
423 g = _mm_slli_epi16(g, 8);
424 // Now B000 B000.
425 __m128i b = _mm_and_si128(_mm_srli_epi16(c, 8), mask4);
426 // And lastly 00A0 00A0. No mask needed, we have a wall.
427 __m128i a = _mm_srli_epi16(c, 12);
428 a = _mm_slli_epi16(a, 8);
429
430 // We swizzle after combining - R0G0 R0G0 and B0A0 B0A0 -> RRGG RRGG and BBAA BBAA.
431 __m128i rg = _mm_or_si128(r, g);
432 __m128i ba = _mm_or_si128(b, a);
433 rg = _mm_or_si128(rg, _mm_slli_epi16(rg, 4));
434 ba = _mm_or_si128(ba, _mm_slli_epi16(ba, 4));
435
436 // And then we can store.
437 _mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
438 _mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
439 }
440 u32 i = sseChunks * 8;
441 #else
442 u32 i = 0;
443 #endif
444
445 u8 *dst = (u8 *)dst32;
446 for (u32 x = i; x < numPixels; x++) {
447 u16 col = src[x];
448 dst[x * 4] = Convert4To8(col & 0xf);
449 dst[x * 4 + 1] = Convert4To8((col >> 4) & 0xf);
450 dst[x * 4 + 2] = Convert4To8((col >> 8) & 0xf);
451 dst[x * 4 + 3] = Convert4To8(col >> 12);
452 }
453 }
454
ConvertBGR565ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)455 void ConvertBGR565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
456 u8 *dst = (u8 *)dst32;
457 for (u32 x = 0; x < numPixels; x++) {
458 u16 col = src[x];
459 dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
460 dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
461 dst[x * 4 + 2] = Convert5To8((col) & 0x1f);
462 dst[x * 4 + 3] = 255;
463 }
464 }
465
ConvertABGR1555ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)466 void ConvertABGR1555ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
467 u8 *dst = (u8 *)dst32;
468 for (u32 x = 0; x < numPixels; x++) {
469 u16 col = src[x];
470 dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
471 dst[x * 4 + 1] = Convert5To8((col >> 6) & 0x1f);
472 dst[x * 4 + 2] = Convert5To8((col >> 1) & 0x1f);
473 dst[x * 4 + 3] = (col & 1) ? 255 : 0;
474 }
475 }
476
ConvertABGR4444ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)477 void ConvertABGR4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
478 u8 *dst = (u8 *)dst32;
479 for (u32 x = 0; x < numPixels; x++) {
480 u16 col = src[x];
481 dst[x * 4] = Convert4To8(col >> 12);
482 dst[x * 4 + 1] = Convert4To8((col >> 8) & 0xf);
483 dst[x * 4 + 2] = Convert4To8((col >> 4) & 0xf);
484 dst[x * 4 + 3] = Convert4To8(col & 0xf);
485 }
486 }
487
ConvertRGBA4444ToBGRA8888(u32 * dst32,const u16 * src,u32 numPixels)488 void ConvertRGBA4444ToBGRA8888(u32 *dst32, const u16 *src, u32 numPixels) {
489 u8 *dst = (u8 *)dst32;
490 for (u32 x = 0; x < numPixels; x++) {
491 u16 c = src[x];
492 u32 r = Convert4To8(c & 0x000f);
493 u32 g = Convert4To8((c >> 4) & 0x000f);
494 u32 b = Convert4To8((c >> 8) & 0x000f);
495 u32 a = Convert4To8((c >> 12) & 0x000f);
496
497 dst[x] = (a << 24) | (r << 16) | (g << 8) | b;
498 }
499 }
500
ConvertRGBA5551ToBGRA8888(u32 * dst,const u16 * src,u32 numPixels)501 void ConvertRGBA5551ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
502 for (u32 x = 0; x < numPixels; x++) {
503 u16 c = src[x];
504 u32 r = Convert5To8(c & 0x001f);
505 u32 g = Convert5To8((c >> 5) & 0x001f);
506 u32 b = Convert5To8((c >> 10) & 0x001f);
507 // We force an arithmetic shift to get the sign bits.
508 u32 a = SignExtend16ToU32(c) & 0xff000000;
509
510 dst[x] = a | (r << 16) | (g << 8) | b;
511 }
512 }
513
ConvertRGB565ToBGRA8888(u32 * dst,const u16 * src,u32 numPixels)514 void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
515 for (u32 x = 0; x < numPixels; x++) {
516 u16 c = src[x];
517 u32 r = Convert5To8(c & 0x001f);
518 u32 g = Convert6To8((c >> 5) & 0x003f);
519 u32 b = Convert5To8((c >> 11) & 0x001f);
520
521 dst[x] = 0xFF000000 | (r << 16) | (g << 8) | b;
522 }
523 }
524
// Reverse the nibble order of each 16-bit pixel: RGBA4444 -> ABGR4444.
void ConvertRGBA4444ToABGR4444Basic(u16 *dst, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i mask00F0 = _mm_set1_epi16(0x00F0);

	const __m128i *srcVec = (const __m128i *)src;
	__m128i *dstVec = (__m128i *)dst;
	u32 chunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		chunks = 0;  // Unaligned: scalar path handles everything.
	}
	for (u32 n = 0; n < chunks; ++n) {
		const __m128i c = _mm_load_si128(&srcVec[n]);
		__m128i swapped = _mm_srli_epi16(c, 12);
		swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi16(c, 4), mask00F0));
		swapped = _mm_or_si128(swapped, _mm_slli_epi16(_mm_and_si128(c, mask00F0), 4));
		swapped = _mm_or_si128(swapped, _mm_slli_epi16(c, 12));
		_mm_store_si128(&dstVec[n], swapped);
	}
	// The tail loop works on pixel pairs; SSE handled 8 pixels at a time.
	u32 i = chunks * 8 / 2;
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 12) & 0x000F000F) |
			((c >> 4) & 0x00F000F0) |
			((c << 4) & 0x0F000F00) |
			((c << 12) & 0xF000F000);
	}

	// An odd pixel count leaves one final u16 to convert on its own.
	if (numPixels & 1) {
		const u32 last = numPixels - 1;
		const u16 c = src[last];
		dst[last] = ((c >> 12) & 0x000F) |
			((c >> 4) & 0x00F0) |
			((c << 4) & 0x0F00) |
			((c << 12) & 0xF000);
	}
}
568
// Reverse the channel order of each 16-bit pixel: RGBA5551 -> ABGR1555.
void ConvertRGBA5551ToABGR1555Basic(u16 *dst, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i maskBlue = _mm_set1_epi16(0x003E);
	const __m128i maskGreen = _mm_set1_epi16(0x07C0);

	const __m128i *srcVec = (const __m128i *)src;
	__m128i *dstVec = (__m128i *)dst;
	u32 chunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		chunks = 0;  // Unaligned: scalar path handles everything.
	}
	for (u32 n = 0; n < chunks; ++n) {
		const __m128i c = _mm_load_si128(&srcVec[n]);
		__m128i swapped = _mm_srli_epi16(c, 15);
		swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_srli_epi16(c, 9), maskBlue));
		swapped = _mm_or_si128(swapped, _mm_and_si128(_mm_slli_epi16(c, 1), maskGreen));
		swapped = _mm_or_si128(swapped, _mm_slli_epi16(c, 11));
		_mm_store_si128(&dstVec[n], swapped);
	}
	// The tail loop works on pixel pairs; SSE handled 8 pixels at a time.
	u32 i = chunks * 8 / 2;
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 15) & 0x00010001) |
			((c >> 9) & 0x003E003E) |
			((c << 1) & 0x07C007C0) |
			((c << 11) & 0xF800F800);
	}

	// An odd pixel count leaves one final u16 to convert on its own.
	if (numPixels & 1) {
		const u32 last = numPixels - 1;
		const u16 c = src[last];
		dst[last] = ((c >> 15) & 0x0001) |
			((c >> 9) & 0x003E) |
			((c << 1) & 0x07C0) |
			((c << 11) & 0xF800);
	}
}
613
// Swap the two 5-bit channels of each 16-bit pixel: RGB565 -> BGR565
// (green stays in the middle).
void ConvertRGB565ToBGR565Basic(u16 *dst, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i maskGreen = _mm_set1_epi16(0x07E0);

	const __m128i *srcVec = (const __m128i *)src;
	__m128i *dstVec = (__m128i *)dst;
	u32 chunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		chunks = 0;  // Unaligned: scalar path handles everything.
	}
	for (u32 n = 0; n < chunks; ++n) {
		const __m128i c = _mm_load_si128(&srcVec[n]);
		__m128i swapped = _mm_srli_epi16(c, 11);
		swapped = _mm_or_si128(swapped, _mm_and_si128(c, maskGreen));
		swapped = _mm_or_si128(swapped, _mm_slli_epi16(c, 11));
		_mm_store_si128(&dstVec[n], swapped);
	}
	// The tail loop works on pixel pairs; SSE handled 8 pixels at a time.
	u32 i = chunks * 8 / 2;
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 11) & 0x001F001F) |
			(c & 0x07E007E0) |
			((c << 11) & 0xF800F800);
	}

	// An odd pixel count leaves one final u16 to convert on its own.
	if (numPixels & 1) {
		const u32 last = numPixels - 1;
		const u16 c = src[last];
		dst[last] = ((c >> 11) & 0x001F) |
			(c & 0x07E0) |
			((c << 11) & 0xF800);
	}
}
654
// BGRA5551 -> ABGR1555: rotate each 16-bit pixel left by one bit, moving the
// alpha bit from the top to the bottom.
void ConvertBGRA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {
	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	const u32 pairs = numPixels / 2;
	for (u32 i = 0; i < pairs; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 15) & 0x00010001) | ((c << 1) & 0xFFFEFFFE);
	}

	// An odd pixel count leaves one final u16 to rotate on its own.
	if (numPixels & 1) {
		const u32 last = numPixels - 1;
		const u16 c = src[last];
		dst[last] = (c >> 15) | (c << 1);
	}
}
669
670 // Reuse the logic from the header - if these aren't defined, we need externs.
#ifndef ConvertRGBA4444ToABGR4444
// Function-pointer defaults: start with the portable Basic implementations.
// SetupColorConv() below may swap in NEON versions at runtime on 32-bit ARM.
Convert16bppTo16bppFunc ConvertRGBA4444ToABGR4444 = &ConvertRGBA4444ToABGR4444Basic;
Convert16bppTo16bppFunc ConvertRGBA5551ToABGR1555 = &ConvertRGBA5551ToABGR1555Basic;
Convert16bppTo16bppFunc ConvertRGB565ToBGR565 = &ConvertRGB565ToBGR565Basic;
#endif
676
SetupColorConv()677 void SetupColorConv() {
678 #if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
679 if (cpu_info.bNEON) {
680 ConvertRGBA4444ToABGR4444 = &ConvertRGBA4444ToABGR4444NEON;
681 ConvertRGBA5551ToABGR1555 = &ConvertRGBA5551ToABGR1555NEON;
682 ConvertRGB565ToBGR565 = &ConvertRGB565ToBGR565NEON;
683 }
684 #endif
685 }
686