1 // Copyright (c) 2015- PPSSPP Project.
2 
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0 or later versions.
6 
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 // GNU General Public License 2.0 for more details.
11 
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
14 
15 // Official git repository and contact information can be found at
16 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17 
18 #include "ppsspp_config.h"
19 #include "Common/Data/Convert/ColorConv.h"
20 #include "Common/Data/Convert/SmallDataConvert.h"
21 // NEON is in a separate file so that it can be compiled with a runtime check.
22 #include "Common/Data/Convert/ColorConvNEON.h"
23 #include "Common/Common.h"
24 #include "Common/CPUDetect.h"
25 
26 #ifdef _M_SSE
27 #include <emmintrin.h>
28 #endif
29 
30 #if _M_SSE >= 0x401
31 #include <smmintrin.h>
32 #endif
33 
RGBA8888toRGB565(u32 px)34 inline u16 RGBA8888toRGB565(u32 px) {
35 	return ((px >> 3) & 0x001F) | ((px >> 5) & 0x07E0) | ((px >> 8) & 0xF800);
36 }
37 
RGBA8888toRGBA4444(u32 px)38 inline u16 RGBA8888toRGBA4444(u32 px) {
39 	return ((px >> 4) & 0x000F) | ((px >> 8) & 0x00F0) | ((px >> 12) & 0x0F00) | ((px >> 16) & 0xF000);
40 }
41 
BGRA8888toRGB565(u32 px)42 inline u16 BGRA8888toRGB565(u32 px) {
43 	return ((px >> 19) & 0x001F) | ((px >> 5) & 0x07E0) | ((px << 8) & 0xF800);
44 }
45 
BGRA8888toRGBA4444(u32 px)46 inline u16 BGRA8888toRGBA4444(u32 px) {
47 	return ((px >> 20) & 0x000F) | ((px >> 8) & 0x00F0) | ((px << 4) & 0x0F00) | ((px >> 16) & 0xF000);
48 }
49 
BGRA8888toRGBA5551(u32 px)50 inline u16 BGRA8888toRGBA5551(u32 px) {
51 	return ((px >> 19) & 0x001F) | ((px >> 6) & 0x03E0) | ((px << 7) & 0x7C00) | ((px >> 16) & 0x8000);
52 }
53 
RGBA8888toRGBA5551(u32 px)54 inline u16 RGBA8888toRGBA5551(u32 px) {
55 	return ((px >> 3) & 0x001F) | ((px >> 6) & 0x03E0) | ((px >> 9) & 0x7C00) | ((px >> 16) & 0x8000);
56 }
57 
58 // convert 4444 image to 8888, parallelizable
convert4444_gl(u16 * data,u32 * out,int width,int l,int u)59 void convert4444_gl(u16* data, u32* out, int width, int l, int u) {
60 	for (int y = l; y < u; ++y) {
61 		for (int x = 0; x < width; ++x) {
62 			u32 val = data[y*width + x];
63 			u32 r = ((val >> 12) & 0xF) * 17;
64 			u32 g = ((val >> 8) & 0xF) * 17;
65 			u32 b = ((val >> 4) & 0xF) * 17;
66 			u32 a = ((val >> 0) & 0xF) * 17;
67 			out[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
68 		}
69 	}
70 }
71 
72 // convert 565 image to 8888, parallelizable
convert565_gl(u16 * data,u32 * out,int width,int l,int u)73 void convert565_gl(u16* data, u32* out, int width, int l, int u) {
74 	for (int y = l; y < u; ++y) {
75 		for (int x = 0; x < width; ++x) {
76 			u32 val = data[y*width + x];
77 			u32 r = Convert5To8((val >> 11) & 0x1F);
78 			u32 g = Convert6To8((val >> 5) & 0x3F);
79 			u32 b = Convert5To8((val)& 0x1F);
80 			out[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
81 		}
82 	}
83 }
84 
85 // convert 5551 image to 8888, parallelizable
convert5551_gl(u16 * data,u32 * out,int width,int l,int u)86 void convert5551_gl(u16* data, u32* out, int width, int l, int u) {
87 	for (int y = l; y < u; ++y) {
88 		for (int x = 0; x < width; ++x) {
89 			u32 val = data[y*width + x];
90 			u32 r = Convert5To8((val >> 11) & 0x1F);
91 			u32 g = Convert5To8((val >> 6) & 0x1F);
92 			u32 b = Convert5To8((val >> 1) & 0x1F);
93 			u32 a = (val & 0x1) * 255;
94 			out[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
95 		}
96 	}
97 }
98 
99 // convert 4444 image to 8888, parallelizable
convert4444_dx9(u16 * data,u32 * out,int width,int l,int u)100 void convert4444_dx9(u16* data, u32* out, int width, int l, int u) {
101 	for (int y = l; y < u; ++y) {
102 		for (int x = 0; x < width; ++x) {
103 			u32 val = data[y*width + x];
104 			u32 r = ((val >> 0) & 0xF) * 17;
105 			u32 g = ((val >> 4) & 0xF) * 17;
106 			u32 b = ((val >> 8) & 0xF) * 17;
107 			u32 a = ((val >> 12) & 0xF) * 17;
108 			out[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
109 		}
110 	}
111 }
112 
113 // convert 565 image to 8888, parallelizable
convert565_dx9(u16 * data,u32 * out,int width,int l,int u)114 void convert565_dx9(u16* data, u32* out, int width, int l, int u) {
115 	for (int y = l; y < u; ++y) {
116 		for (int x = 0; x < width; ++x) {
117 			u32 val = data[y*width + x];
118 			u32 r = Convert5To8((val)& 0x1F);
119 			u32 g = Convert6To8((val >> 5) & 0x3F);
120 			u32 b = Convert5To8((val >> 11) & 0x1F);
121 			out[y*width + x] = (0xFF << 24) | (b << 16) | (g << 8) | r;
122 		}
123 	}
124 }
125 
126 // convert 5551 image to 8888, parallelizable
convert5551_dx9(u16 * data,u32 * out,int width,int l,int u)127 void convert5551_dx9(u16* data, u32* out, int width, int l, int u) {
128 	for (int y = l; y < u; ++y) {
129 		for (int x = 0; x < width; ++x) {
130 			u32 val = data[y*width + x];
131 			u32 r = Convert5To8((val >> 0) & 0x1F);
132 			u32 g = Convert5To8((val >> 5) & 0x1F);
133 			u32 b = Convert5To8((val >> 10) & 0x1F);
134 			u32 a = ((val >> 15) & 0x1) * 255;
135 			out[y*width + x] = (a << 24) | (b << 16) | (g << 8) | r;
136 		}
137 	}
138 }
139 
140 
141 
// Swaps the R and B bytes of each 32-bit pixel (BGRA <-> RGBA); G and A stay
// in place. Uses 16-byte SSE chunks when both pointers are aligned; the
// remainder (or everything, when unaligned) goes through the scalar loop.
void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i maskGA = _mm_set1_epi32(0xFF00FF00);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 4;
	// _mm_load/store_si128 need 16-byte alignment; fall back fully if not met.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		__m128i c = _mm_load_si128(&srcp[i]);
		// rb keeps only the R and B bytes; c keeps G and A.
		__m128i rb = _mm_andnot_si128(maskGA, c);
		c = _mm_and_si128(c, maskGA);

		// Swap the two color bytes within each 32-bit lane, then recombine.
		__m128i b = _mm_srli_epi32(rb, 16);
		__m128i r = _mm_slli_epi32(rb, 16);
		c = _mm_or_si128(_mm_or_si128(c, r), b);
		_mm_store_si128(&dstp[i], c);
	}
	// The remainder starts right after those done via SSE.
	u32 i = sseChunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		const u32 c = src[i];
		dst[i] = ((c >> 16) & 0x000000FF) |
			(c & 0xFF00FF00) |
			((c << 16) & 0x00FF0000);
	}
}
174 
ConvertBGRA8888ToRGB888(u8 * dst,const u32 * src,u32 numPixels)175 void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
176 	for (uint32_t x = 0; x < numPixels; ++x) {
177 		uint32_t c = src[x];
178 		dst[x * 3 + 0] = (c >> 16) & 0xFF;
179 		dst[x * 3 + 1] = (c >> 8) & 0xFF;
180 		dst[x * 3 + 2] = (c >> 0) & 0xFF;
181 	}
182 }
183 
// RGBA8888 -> RGBA5551 (alpha in the top bit). Uses an SSE4.1 path when the
// CPU supports it and both pointers are 16-byte aligned; otherwise (and for
// the tail) falls back to the scalar helper.
void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
#if _M_SSE >= 0x401
	// maskAG keeps the alpha MSB (bit 31) and the top 5 G bits (15..11);
	// maskRB keeps the top 5 B bits (23..19) and top 5 R bits (7..3).
	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i mask = _mm_set1_epi32(0x0000FFFF);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	// Chunks are consumed in pairs: two input vectors pack into one output.
	u32 sseChunks = (numPixels / 4) & ~1;
	// SSE 4.1 required for _mm_packus_epi32.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; i += 2) {
		__m128i c1 = _mm_load_si128(&srcp[i + 0]);
		__m128i c2 = _mm_load_si128(&srcp[i + 1]);
		__m128i ag, rb;

		// A31 -> bit 15, G15..11 -> bits 9..5.
		ag = _mm_and_si128(c1, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		// R7..3 -> bits 4..0, B23..19 -> bits 14..10. Stray bits die under 'mask'.
		rb = _mm_and_si128(c1, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));
		c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		ag = _mm_and_si128(c2, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c2, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));
		c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		// Pack eight 32-bit results down to eight 16-bit pixels.
		_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
	}
	// The remainder starts right after those done via SSE.
	u32 i = sseChunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		dst[i] = RGBA8888toRGBA5551(src[i]);
	}
}
225 
// BGRA8888 -> RGBA5551. Same structure as ConvertRGBA8888ToRGBA5551 above,
// but the R/B shift amounts differ because R and B are swapped in the source.
void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
#if _M_SSE >= 0x401
	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i mask = _mm_set1_epi32(0x0000FFFF);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	// Chunks are consumed in pairs: two input vectors pack into one output.
	u32 sseChunks = (numPixels / 4) & ~1;
	// SSE 4.1 required for _mm_packus_epi32.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; i += 2) {
		__m128i c1 = _mm_load_si128(&srcp[i + 0]);
		__m128i c2 = _mm_load_si128(&srcp[i + 1]);
		__m128i ag, rb;

		// A31 -> bit 15, G15..11 -> bits 9..5.
		ag = _mm_and_si128(c1, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		// R23..19 -> bits 4..0, B7..3 -> bits 14..10. Stray bits die under 'mask'.
		rb = _mm_and_si128(c1, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
		c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		ag = _mm_and_si128(c2, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c2, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
		c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		// Pack eight 32-bit results down to eight 16-bit pixels.
		_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
	}
	// The remainder starts right after those done via SSE.
	u32 i = sseChunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		dst[i] = BGRA8888toRGBA5551(src[i]);
	}
}
267 
ConvertBGRA8888ToRGB565(u16 * dst,const u32 * src,u32 numPixels)268 void ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {
269 	for (u32 i = 0; i < numPixels; i++) {
270 		dst[i] = BGRA8888toRGB565(src[i]);
271 	}
272 }
273 
ConvertBGRA8888ToRGBA4444(u16 * dst,const u32 * src,u32 numPixels)274 void ConvertBGRA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {
275 	for (u32 i = 0; i < numPixels; i++) {
276 		dst[i] = BGRA8888toRGBA4444(src[i]);
277 	}
278 }
279 
ConvertRGBA8888ToRGB565(u16 * dst,const u32 * src,u32 numPixels)280 void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {
281 	for (u32 x = 0; x < numPixels; ++x) {
282 		dst[x] = RGBA8888toRGB565(src[x]);
283 	}
284 }
285 
ConvertRGBA8888ToRGBA4444(u16 * dst,const u32 * src,u32 numPixels)286 void ConvertRGBA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {
287 	for (u32 x = 0; x < numPixels; ++x) {
288 		dst[x] = RGBA8888toRGBA4444(src[x]);
289 	}
290 }
291 
ConvertRGBA8888ToRGB888(u8 * dst,const u32 * src,u32 numPixels)292 void ConvertRGBA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
293 	for (uint32_t x = 0; x < numPixels; ++x) {
294 		memcpy(dst + x * 3, src + x, 3);
295 	}
296 }
297 
// RGB565 -> RGBA8888, replicating each channel's high bits into its low bits
// and forcing alpha to 255. SSE path converts 8 pixels per iteration when
// both pointers are 16-byte aligned.
void ConvertRGB565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i mask5 = _mm_set1_epi16(0x001f);
	const __m128i mask6 = _mm_set1_epi16(0x003f);
	const __m128i mask8 = _mm_set1_epi16(0x00ff);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst32;
	u32 sseChunks = numPixels / 8;
	// Aligned loads/stores only; otherwise everything goes scalar.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);

		// Swizzle, resulting in RR00 RR00.
		__m128i r = _mm_and_si128(c, mask5);
		r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
		r = _mm_and_si128(r, mask8);

		// This one becomes 00GG 00GG.
		__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask6);
		g = _mm_or_si128(_mm_slli_epi16(g, 2), _mm_srli_epi16(g, 4));
		g = _mm_slli_epi16(g, 8);

		// Almost done, we aim for BB00 BB00 again here.
		__m128i b = _mm_and_si128(_mm_srli_epi16(c, 11), mask5);
		b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
		b = _mm_and_si128(b, mask8);

		// Always set alpha to 00FF 00FF.
		__m128i a = _mm_slli_epi16(mask8, 8);

		// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
		const __m128i rg = _mm_or_si128(r, g);
		const __m128i ba = _mm_or_si128(b, a);
		_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
		_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
	}
	u32 i = sseChunks * 8;
#else
	u32 i = 0;
#endif

	// Scalar tail (and full fallback when SSE can't be used).
	u8 *dst = (u8 *)dst32;
	for (u32 x = i; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert5To8((col) & 0x1f);
		dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
		dst[x * 4 + 2] = Convert5To8((col >> 11) & 0x1f);
		dst[x * 4 + 3] = 255;
	}
}
351 
// RGBA5551 -> RGBA8888. Color channels replicate their top bits into the low
// bits; the single alpha bit expands to 0x00 or 0xFF. SSE path converts 8
// pixels per iteration when both pointers are 16-byte aligned.
void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i mask5 = _mm_set1_epi16(0x001f);
	const __m128i mask8 = _mm_set1_epi16(0x00ff);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst32;
	u32 sseChunks = numPixels / 8;
	// Aligned loads/stores only; otherwise everything goes scalar.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);

		// Swizzle, resulting in RR00 RR00.
		__m128i r = _mm_and_si128(c, mask5);
		r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
		r = _mm_and_si128(r, mask8);

		// This one becomes 00GG 00GG.
		__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask5);
		g = _mm_or_si128(_mm_slli_epi16(g, 3), _mm_srli_epi16(g, 2));
		g = _mm_slli_epi16(g, 8);

		// Almost done, we aim for BB00 BB00 again here.
		__m128i b = _mm_and_si128(_mm_srli_epi16(c, 10), mask5);
		b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
		b = _mm_and_si128(b, mask8);

		// 1 bit A to 00AA 00AA: the arithmetic shift smears bit 15 across the lane.
		__m128i a = _mm_srai_epi16(c, 15);
		a = _mm_slli_epi16(a, 8);

		// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
		const __m128i rg = _mm_or_si128(r, g);
		const __m128i ba = _mm_or_si128(b, a);
		_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
		_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
	}
	u32 i = sseChunks * 8;
#else
	u32 i = 0;
#endif

	// Scalar tail (and full fallback when SSE can't be used).
	u8 *dst = (u8 *)dst32;
	for (u32 x = i; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert5To8((col) & 0x1f);
		dst[x * 4 + 1] = Convert5To8((col >> 5) & 0x1f);
		dst[x * 4 + 2] = Convert5To8((col >> 10) & 0x1f);
		dst[x * 4 + 3] = (col >> 15) ? 255 : 0;
	}
}
405 
// RGBA4444 -> RGBA8888, replicating each nibble into both halves of its byte
// (0xN -> 0xNN). SSE path converts 8 pixels per iteration when both pointers
// are 16-byte aligned.
void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i mask4 = _mm_set1_epi16(0x000f);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst32;
	u32 sseChunks = numPixels / 8;
	// Aligned loads/stores only; otherwise everything goes scalar.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);

		// Let's just grab R000 R000, without swizzling yet.
		__m128i r = _mm_and_si128(c, mask4);
		// And then 00G0 00G0.
		__m128i g = _mm_and_si128(_mm_srli_epi16(c, 4), mask4);
		g = _mm_slli_epi16(g, 8);
		// Now B000 B000.
		__m128i b = _mm_and_si128(_mm_srli_epi16(c, 8), mask4);
		// And lastly 00A0 00A0.  No mask needed, we have a wall.
		__m128i a = _mm_srli_epi16(c, 12);
		a = _mm_slli_epi16(a, 8);

		// We swizzle after combining - R0G0 R0G0 and B0A0 B0A0 -> RRGG RRGG and BBAA BBAA.
		__m128i rg = _mm_or_si128(r, g);
		__m128i ba = _mm_or_si128(b, a);
		rg = _mm_or_si128(rg, _mm_slli_epi16(rg, 4));
		ba = _mm_or_si128(ba, _mm_slli_epi16(ba, 4));

		// And then we can store.
		_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
		_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
	}
	u32 i = sseChunks * 8;
#else
	u32 i = 0;
#endif

	// Scalar tail (and full fallback when SSE can't be used).
	u8 *dst = (u8 *)dst32;
	for (u32 x = i; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert4To8(col & 0xf);
		dst[x * 4 + 1] = Convert4To8((col >> 4) & 0xf);
		dst[x * 4 + 2] = Convert4To8((col >> 8) & 0xf);
		dst[x * 4 + 3] = Convert4To8(col >> 12);
	}
}
454 
ConvertBGR565ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)455 void ConvertBGR565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
456 	u8 *dst = (u8 *)dst32;
457 	for (u32 x = 0; x < numPixels; x++) {
458 		u16 col = src[x];
459 		dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
460 		dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
461 		dst[x * 4 + 2] = Convert5To8((col) & 0x1f);
462 		dst[x * 4 + 3] = 255;
463 	}
464 }
465 
ConvertABGR1555ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)466 void ConvertABGR1555ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
467 	u8 *dst = (u8 *)dst32;
468 	for (u32 x = 0; x < numPixels; x++) {
469 		u16 col = src[x];
470 		dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
471 		dst[x * 4 + 1] = Convert5To8((col >> 6) & 0x1f);
472 		dst[x * 4 + 2] = Convert5To8((col >> 1) & 0x1f);
473 		dst[x * 4 + 3] = (col & 1) ? 255 : 0;
474 	}
475 }
476 
ConvertABGR4444ToRGBA8888(u32 * dst32,const u16 * src,u32 numPixels)477 void ConvertABGR4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
478 	u8 *dst = (u8 *)dst32;
479 	for (u32 x = 0; x < numPixels; x++) {
480 		u16 col = src[x];
481 		dst[x * 4] = Convert4To8(col >> 12);
482 		dst[x * 4 + 1] = Convert4To8((col >> 8) & 0xf);
483 		dst[x * 4 + 2] = Convert4To8((col >> 4) & 0xf);
484 		dst[x * 4 + 3] = Convert4To8(col & 0xf);
485 	}
486 }
487 
ConvertRGBA4444ToBGRA8888(u32 * dst32,const u16 * src,u32 numPixels)488 void ConvertRGBA4444ToBGRA8888(u32 *dst32, const u16 *src, u32 numPixels) {
489 	u8 *dst = (u8 *)dst32;
490 	for (u32 x = 0; x < numPixels; x++) {
491 		u16 c = src[x];
492 		u32 r = Convert4To8(c & 0x000f);
493 		u32 g = Convert4To8((c >> 4) & 0x000f);
494 		u32 b = Convert4To8((c >> 8) & 0x000f);
495 		u32 a = Convert4To8((c >> 12) & 0x000f);
496 
497 		dst[x] = (a << 24) | (r << 16) | (g << 8) | b;
498 	}
499 }
500 
// RGBA5551 -> BGRA8888. The single alpha bit (bit 15) expands to 0x00 or 0xFF.
void ConvertRGBA5551ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
	for (u32 x = 0; x < numPixels; x++) {
		u16 c = src[x];
		u32 r = Convert5To8(c & 0x001f);
		u32 g = Convert5To8((c >> 5) & 0x001f);
		u32 b = Convert5To8((c >> 10) & 0x001f);
		// We force an arithmetic shift to get the sign bits.
		// Sign-extending bit 15 yields 0xFFFF8000.. or 0, so masking with
		// 0xff000000 gives an alpha byte of 0xFF or 0x00 in one step.
		u32 a = SignExtend16ToU32(c) & 0xff000000;

		dst[x] = a | (r << 16) | (g << 8) | b;
	}
}
513 
ConvertRGB565ToBGRA8888(u32 * dst,const u16 * src,u32 numPixels)514 void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
515 	for (u32 x = 0; x < numPixels; x++) {
516 		u16 c = src[x];
517 		u32 r = Convert5To8(c & 0x001f);
518 		u32 g = Convert6To8((c >> 5) & 0x003f);
519 		u32 b = Convert5To8((c >> 11) & 0x001f);
520 
521 		dst[x] = 0xFF000000 | (r << 16) | (g << 8) | b;
522 	}
523 }
524 
// Reverses the nibble order of each 4444 pixel (RGBA -> ABGR). SSE handles 8
// pixels per iteration when aligned; the tail processes two pixels at a time
// through 32-bit loads, with a final single-pixel fixup for odd counts.
void ConvertRGBA4444ToABGR4444Basic(u16 *dst, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	// Mask for a nibble in the 0x00F0 position, applied after shifting.
	const __m128i mask0040 = _mm_set1_epi16(0x00F0);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 8;
	// Aligned loads/stores only; otherwise everything goes scalar.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);
		// Move nibble 3 -> 0, 2 -> 1, 1 -> 2, 0 -> 3 within each 16-bit lane.
		__m128i v = _mm_srli_epi16(c, 12);
		v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 4), mask0040));
		v = _mm_or_si128(v, _mm_slli_epi16(_mm_and_si128(c, mask0040), 4));
		v = _mm_or_si128(v, _mm_slli_epi16(c, 12));
		_mm_store_si128(&dstp[i], v);
	}
	// The remainder is done in chunks of 2, SSE was chunks of 8.
	u32 i = sseChunks * 8 / 2;
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 12) & 0x000F000F) |
		           ((c >> 4)  & 0x00F000F0) |
		           ((c << 4)  & 0x0F000F00) |
		           ((c << 12) & 0xF000F000);
	}

	// Odd pixel count: swizzle the last pixel on its own.
	if (numPixels & 1) {
		const u32 i = numPixels - 1;
		const u16 c = src[i];
		dst[i] = ((c >> 12) & 0x000F) |
		         ((c >> 4)  & 0x00F0) |
		         ((c << 4)  & 0x0F00) |
		         ((c << 12) & 0xF000);
	}
}
568 
// RGBA5551 -> ABGR1555: the alpha bit moves from the top to the bottom and the
// color fields reverse order. SSE handles 8 pixels per iteration when aligned;
// the tail does two pixels at a time, plus a single-pixel fixup for odd counts.
void ConvertRGBA5551ToABGR1555Basic(u16 *dst, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	// Destination-position masks for the B (bits 5..1) and G (bits 10..6) fields.
	const __m128i maskB = _mm_set1_epi16(0x003E);
	const __m128i maskG = _mm_set1_epi16(0x07C0);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 8;
	// Aligned loads/stores only; otherwise everything goes scalar.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);
		__m128i v = _mm_srli_epi16(c, 15);
		v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 9), maskB));
		v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 1), maskG));
		v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
		_mm_store_si128(&dstp[i], v);
	}
	// The remainder is done in chunks of 2, SSE was chunks of 8.
	u32 i = sseChunks * 8 / 2;
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 15) & 0x00010001) |
		           ((c >> 9)  & 0x003E003E) |
		           ((c << 1)  & 0x07C007C0) |
		           ((c << 11) & 0xF800F800);
	}

	// Odd pixel count: swizzle the last pixel on its own.
	if (numPixels & 1) {
		const u32 i = numPixels - 1;
		const u16 c = src[i];
		dst[i] = ((c >> 15) & 0x0001) |
		         ((c >> 9)  & 0x003E) |
		         ((c << 1)  & 0x07C0) |
		         ((c << 11) & 0xF800);
	}
}
613 
// Swaps the R and B fields of each 565 pixel; the 6-bit G field stays put.
// SSE handles 8 pixels per iteration when aligned; the tail does two pixels
// at a time, plus a single-pixel fixup for odd counts.
void ConvertRGB565ToBGR565Basic(u16 *dst, const u16 *src, u32 numPixels) {
#ifdef _M_SSE
	const __m128i maskG = _mm_set1_epi16(0x07E0);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 8;
	// Aligned loads/stores only; otherwise everything goes scalar.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);
		__m128i v = _mm_srli_epi16(c, 11);
		v = _mm_or_si128(v, _mm_and_si128(c, maskG));
		v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
		_mm_store_si128(&dstp[i], v);
	}
	// The remainder is done in chunks of 2, SSE was chunks of 8.
	u32 i = sseChunks * 8 / 2;
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 11) & 0x001F001F) |
		           ((c >> 0)  & 0x07E007E0) |
		           ((c << 11) & 0xF800F800);
	}

	// Odd pixel count: swizzle the last pixel on its own.
	if (numPixels & 1) {
		const u32 i = numPixels - 1;
		const u16 c = src[i];
		dst[i] = ((c >> 11) & 0x001F) |
		         ((c >> 0)  & 0x07E0) |
		         ((c << 11) & 0xF800);
	}
}
654 
ConvertBGRA5551ToABGR1555(u16 * dst,const u16 * src,u32 numPixels)655 void ConvertBGRA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {
656 	const u32 *src32 = (const u32 *)src;
657 	u32 *dst32 = (u32 *)dst;
658 	for (u32 i = 0; i < numPixels / 2; i++) {
659 		const u32 c = src32[i];
660 		dst32[i] = ((c >> 15) & 0x00010001) | ((c << 1) & 0xFFFEFFFE);
661 	}
662 
663 	if (numPixels & 1) {
664 		const u32 i = numPixels - 1;
665 		const u16 c = src[i];
666 		dst[i] = (c >> 15) | (c << 1);
667 	}
668 }
669 
// Reuse the logic from the header - if these aren't defined, we need externs.
#ifndef ConvertRGBA4444ToABGR4444
// Runtime-dispatched function pointers, defaulting to the portable Basic
// implementations above. SetupColorConv() may repoint them to NEON versions.
Convert16bppTo16bppFunc ConvertRGBA4444ToABGR4444 = &ConvertRGBA4444ToABGR4444Basic;
Convert16bppTo16bppFunc ConvertRGBA5551ToABGR1555 = &ConvertRGBA5551ToABGR1555Basic;
Convert16bppTo16bppFunc ConvertRGB565ToBGR565 = &ConvertRGB565ToBGR565Basic;
#endif
676 
// One-time init: on 32-bit ARM with runtime NEON support, repoints the
// 16bpp swizzle function pointers at their NEON implementations. On all
// other platforms this is a no-op and the Basic defaults remain in effect.
void SetupColorConv() {
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
	if (cpu_info.bNEON) {
		ConvertRGBA4444ToABGR4444 = &ConvertRGBA4444ToABGR4444NEON;
		ConvertRGBA5551ToABGR1555 = &ConvertRGBA5551ToABGR1555NEON;
		ConvertRGB565ToBGR565 = &ConvertRGB565ToBGR565NEON;
	}
#endif
}
686