1 /* Copyright (C) 2010-2020 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (pixconv.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include <stdio.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
27
28 #include <retro_inline.h>
29
30 #include <gfx/scaler/pixconv.h>
31
32 #if _MSC_VER && _MSC_VER <= 1800
33 #define SCALER_NO_SIMD
34 #endif
35
36 #ifdef SCALER_NO_SIMD
37 #undef __SSE2__
38 #endif
39
40 #if defined(__SSE2__)
41 #include <emmintrin.h>
42 #elif defined(__MMX__)
43 #include <mmintrin.h>
44 #endif
45
/* Convert RGB565 to 0RGB1555.
 *
 * The 5 red bits and the top 5 green bits shift right by one to land
 * in the 1555 layout; the low green bit is discarded and blue passes
 * through unchanged. Strides are in bytes; width/height in pixels. */
void conv_rgb565_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output = (uint16_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 7;
   const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
   const __m128i lo_mask = _mm_set1_epi16(0x1f);
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      /* 8 pixels per iteration. */
      for (; w < max_width; w += 8)
      {
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         /* Fix: shift right (not left) so R/G line up exactly like the
          * scalar tail below: hi = (col >> 1) & 0x7fe0. */
         __m128i hi = _mm_and_si128(_mm_srli_epi16(in, 1), hi_mask);
         __m128i lo = _mm_and_si128(in, lo_mask);
         _mm_storeu_si128((__m128i*)(output + w), _mm_or_si128(hi, lo));
      }
#endif

      for (; w < width; w++)
      {
         uint16_t col = input[w];
         uint16_t hi = (col >> 1) & 0x7fe0;
         uint16_t lo = col & 0x1f;
         output[w] = hi | lo;
      }
   }
}
83
/* Convert 0RGB1555 to RGB565. R and G move up one bit; the new low
 * (6th) green bit is filled by replicating green's top bit, and blue
 * passes through. Strides are in bytes; width/height in pixels. */
void conv_0rgb1555_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int y;
   const uint16_t *src = (const uint16_t*)input_;
   uint16_t *dst = (uint16_t*)output_;

#if defined(__SSE2__)
   const __m128i rg_mask = _mm_set1_epi16(
         (int16_t)((0x1f << 11) | (0x1f << 6)));
   const __m128i b_mask = _mm_set1_epi16(0x1f);
   const __m128i g_fill = _mm_set1_epi16(1 << 5);
   int vec_end = width - 7;
#endif

   for (y = 0; y < height;
         y++, dst += out_stride >> 1, src += in_stride >> 1)
   {
      int x = 0;
#if defined(__SSE2__)
      /* 8 pixels per iteration. */
      for (; x < vec_end; x += 8)
      {
         const __m128i pix = _mm_loadu_si128((const __m128i*)(src + x));
         const __m128i rg = _mm_and_si128(_mm_slli_epi16(pix, 1), rg_mask);
         const __m128i blue = _mm_and_si128(pix, b_mask);
         const __m128i fill = _mm_and_si128(_mm_srli_epi16(pix, 4), g_fill);
         const __m128i res = _mm_or_si128(_mm_or_si128(rg, blue), fill);
         _mm_storeu_si128((__m128i*)(dst + x), res);
      }
#endif

      /* Scalar tail (or full row without SSE2). */
      for (; x < width; x++)
      {
         const uint16_t c = src[x];
         const uint16_t rg = (uint16_t)((c << 1) & ((0x1f << 11) | (0x1f << 6)));
         const uint16_t fill = (uint16_t)((c >> 4) & (1 << 5));
         dst[x] = (uint16_t)(rg | (c & 0x1f) | fill);
      }
   }
}
127
/* Convert 0RGB1555 to ARGB8888. Each 5-bit channel expands to 8 bits
 * as (v << 3) | (v >> 2); alpha is forced to 0xff.
 * Strides are in bytes; width/height in pixels. */
void conv_0rgb1555_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int y;
   const uint16_t *src = (const uint16_t*)input_;
   uint32_t *dst = (uint32_t*)output_;

#ifdef __SSE2__
   /* mulhi by 0x0210 (channel at bits 10..14) or 0x4200 (bits 5..9)
    * computes v * 8.25, which equals (v << 3) | (v >> 2) for 5-bit v. */
   const __m128i mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i mask_gb = _mm_set1_epi16(0x1f << 5);
   const __m128i rep_mid = _mm_set1_epi16(0x4200);
   const __m128i rep_hi = _mm_set1_epi16(0x0210);
   const __m128i alpha = _mm_set1_epi16(0x00ff);

   int vec_end = width - 7;
#endif

   for (y = 0; y < height;
         y++, dst += out_stride >> 2, src += in_stride >> 1)
   {
      int x = 0;
#ifdef __SSE2__
      /* 8 pixels per iteration. */
      for (; x < vec_end; x += 8)
      {
         const __m128i pix = _mm_loadu_si128((const __m128i*)(src + x));
         __m128i r = _mm_and_si128(pix, mask_r);
         __m128i g = _mm_and_si128(pix, mask_gb);
         __m128i b = _mm_and_si128(_mm_slli_epi16(pix, 5), mask_gb);
         __m128i bg_lo, bg_hi, ra_lo, ra_hi;

         r = _mm_mulhi_epi16(r, rep_hi);
         g = _mm_mulhi_epi16(g, rep_mid);
         b = _mm_mulhi_epi16(b, rep_mid);

         /* Interleave to little-endian byte order B, G, R, A. */
         bg_lo = _mm_unpacklo_epi8(b, g);
         bg_hi = _mm_unpackhi_epi8(b, g);
         ra_lo = _mm_unpacklo_epi8(r, alpha);
         ra_hi = _mm_unpackhi_epi8(r, alpha);

         _mm_storeu_si128((__m128i*)(dst + x + 0),
               _mm_or_si128(bg_lo, _mm_slli_si128(ra_lo, 2)));
         _mm_storeu_si128((__m128i*)(dst + x + 4),
               _mm_or_si128(bg_hi, _mm_slli_si128(ra_hi, 2)));
      }
#endif

      /* Scalar tail (or full row without SSE2). */
      for (; x < width; x++)
      {
         const uint32_t c = src[x];
         uint32_t r = (c >> 10) & 0x1f;
         uint32_t g = (c >> 5) & 0x1f;
         uint32_t b = c & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 3) | (g >> 2);
         b = (b << 3) | (b >> 2);
         dst[x] = (0xffu << 24) | (r << 16) | (g << 8) | b;
      }
   }
}
194
/* Convert RGB565 to ARGB8888.
 * 5-bit channels expand to 8 bits as (v << 3) | (v >> 2), the 6-bit
 * green as (v << 2) | (v >> 4); alpha is forced to 0xff.
 * Strides are in bytes; width/height in pixels. */
void conv_rgb565_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;

#if defined(__SSE2__)
   /* mulhi by these constants on a channel pre-shifted to the top of a
    * 16-bit lane reproduces the scalar bit-replication exactly
    * (e.g. v<<10 times 0x0210 then >>16 == (v << 3) | (v >> 2)). */
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f << 5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f << 5);
   const __m128i mul16_r = _mm_set1_epi16(0x0210);
   const __m128i mul16_g = _mm_set1_epi16(0x2080);
   const __m128i mul16_b = _mm_set1_epi16(0x4200);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 7;
#elif defined(__MMX__)
   /* Same scheme as the SSE2 path, 4 pixels at a time. */
   const __m64 pix_mask_r = _mm_set1_pi16(0x1f << 10);
   const __m64 pix_mask_g = _mm_set1_pi16(0x3f << 5);
   const __m64 pix_mask_b = _mm_set1_pi16(0x1f << 5);
   const __m64 mul16_r = _mm_set1_pi16(0x0210);
   const __m64 mul16_g = _mm_set1_pi16(0x2080);
   const __m64 mul16_b = _mm_set1_pi16(0x4200);
   const __m64 a = _mm_set1_pi16(0x00ff);

   int max_width = width - 3;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      /* 8 pixels per iteration. */
      for (; w < max_width; w += 8)
      {
         __m128i res_lo, res_hi;
         __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
         __m128i g = _mm_and_si128(in, pix_mask_g);
         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);

         r = _mm_mulhi_epi16(r, mul16_r);
         g = _mm_mulhi_epi16(g, mul16_g);
         b = _mm_mulhi_epi16(b, mul16_b);

         /* Interleave to little-endian byte order B, G, R, A. */
         res_lo_bg = _mm_unpacklo_epi8(b, g);
         res_hi_bg = _mm_unpackhi_epi8(b, g);
         res_lo_ra = _mm_unpacklo_epi8(r, a);
         res_hi_ra = _mm_unpackhi_epi8(r, a);

         res_lo = _mm_or_si128(res_lo_bg,
               _mm_slli_si128(res_lo_ra, 2));
         res_hi = _mm_or_si128(res_hi_bg,
               _mm_slli_si128(res_hi_ra, 2));

         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#elif defined(__MMX__)
      /* 4 pixels per iteration. */
      for (; w < max_width; w += 4)
      {
         __m64 res_lo, res_hi;
         __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m64 in = *((__m64*)(input + w));
         __m64 r = _mm_and_si64(_mm_srli_pi16(in, 1), pix_mask_r);
         __m64 g = _mm_and_si64(in, pix_mask_g);
         __m64 b = _mm_and_si64(_mm_slli_pi16(in, 5), pix_mask_b);

         r = _mm_mulhi_pi16(r, mul16_r);
         g = _mm_mulhi_pi16(g, mul16_g);
         b = _mm_mulhi_pi16(b, mul16_b);

         res_lo_bg = _mm_unpacklo_pi8(b, g);
         res_hi_bg = _mm_unpackhi_pi8(b, g);
         res_lo_ra = _mm_unpacklo_pi8(r, a);
         res_hi_ra = _mm_unpackhi_pi8(r, a);

         res_lo = _mm_or_si64(res_lo_bg,
               _mm_slli_si64(res_lo_ra, 16));
         res_hi = _mm_or_si64(res_hi_bg,
               _mm_slli_si64(res_hi_ra, 16));

         *((__m64*)(output + w + 0)) = res_lo;
         *((__m64*)(output + w + 2)) = res_hi;
      }

      /* Clear MMX state so x87 FP is usable again. */
      _mm_empty();
#endif

      /* Scalar tail (or full row without SIMD). */
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 11) & 0x1f;
         uint32_t g = (col >> 5) & 0x3f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);

         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
301
/* Convert RGB565 to ABGR8888 (little-endian byte order R, G, B, A).
 * 5-bit channels expand to 8 bits as (v << 3) | (v >> 2), the 6-bit
 * green as (v << 2) | (v >> 4); alpha is forced to 0xff.
 * Strides are in bytes; width/height in pixels. */
void conv_rgb565_abgr8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;
#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f << 5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f << 5);
   const __m128i mul16_r = _mm_set1_epi16(0x0210);
   const __m128i mul16_g = _mm_set1_epi16(0x2080);
   const __m128i mul16_b = _mm_set1_epi16(0x4200);
   const __m128i a = _mm_set1_epi16(0x00ff);
   int max_width = width - 7;
#endif
   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         __m128i res_lo, res_hi;
         __m128i res_lo_rg, res_hi_rg, res_lo_ba, res_hi_ba;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
         __m128i g = _mm_and_si128(in, pix_mask_g);
         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
         r = _mm_mulhi_epi16(r, mul16_r);
         g = _mm_mulhi_epi16(g, mul16_g);
         b = _mm_mulhi_epi16(b, mul16_b);
         /* Fix: interleave as R,G then B,A so the vector path emits
          * the same ABGR byte order (R, G, B, A) as the scalar tail
          * below; the old code reused the ARGB interleave from
          * conv_rgb565_argb8888 and produced swapped R/B. */
         res_lo_rg = _mm_unpacklo_epi8(r, g);
         res_hi_rg = _mm_unpackhi_epi8(r, g);
         res_lo_ba = _mm_unpacklo_epi8(b, a);
         res_hi_ba = _mm_unpackhi_epi8(b, a);
         res_lo = _mm_or_si128(res_lo_rg,
               _mm_slli_si128(res_lo_ba, 2));
         res_hi = _mm_or_si128(res_hi_rg,
               _mm_slli_si128(res_hi_ba, 2));
         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#endif
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 11) & 0x1f;
         uint32_t g = (col >> 5) & 0x3f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);
         output[w] = (0xffu << 24) | (b << 16) | (g << 8) | (r << 0);
      }
   }
}
360
/* Convert ARGB8888 to RGBA4444, keeping the top 4 bits of each 8-bit
 * channel. Strides are in bytes; width/height in pixels.
 *
 * Fixes over the previous version:
 * - Row strides were swapped: out_stride (uint16_t rows) was shifted
 *   by 2 and in_stride (uint32_t rows) by 1.
 * - Channels were extracted from the LOW nibble of each byte
 *   (>>16/>>8/>>0/>>24) instead of the top nibble, and the follow-up
 *   "(v >> 4) | v" folds were no-ops on 4-bit values. */
void conv_argb8888_rgba4444(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint32_t *input = (const uint32_t*)input_;
   uint16_t *output = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 2)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 20) & 0xf;
         uint32_t g = (col >> 12) & 0xf;
         uint32_t b = (col >> 4) & 0xf;
         uint32_t a = (col >> 28) & 0xf;

         output[w] = (r << 12) | (g << 8) | (b << 4) | a;
      }
   }
}
388
/* Convert RGBA4444 to ARGB8888. Each 4-bit channel expands to 8 bits
 * by nibble replication (v -> v * 0x11). Strides are in bytes. */
void conv_rgba4444_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output = (uint32_t*)output_;

#if defined(__MMX__)
   /* mulhi by 0x0440 (nibble at bits 10..13) or 0x1100 (bits 8..11)
    * yields v * 17, i.e. (v << 4) | v. */
   const __m64 pix_mask_r = _mm_set1_pi16(0xf << 10);
   const __m64 pix_mask_gba = _mm_set1_pi16(0xf << 8);
   const __m64 mul16_r = _mm_set1_pi16(0x0440);
   const __m64 mul16_gba = _mm_set1_pi16(0x1100);

   int max_width = width - 3;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__MMX__)
      /* 4 pixels per iteration. */
      for (; w < max_width; w += 4)
      {
         __m64 res_lo, res_hi;
         __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m64 in = *((__m64*)(input + w));
         __m64 r = _mm_and_si64(_mm_srli_pi16(in, 2), pix_mask_r);
         __m64 g = _mm_and_si64(in, pix_mask_gba);
         __m64 b = _mm_and_si64(_mm_slli_pi16(in, 4), pix_mask_gba);
         /* Fix: expand the real alpha nibble like the scalar tail
          * does; the old vector path forced alpha to 0xff. */
         __m64 a = _mm_and_si64(_mm_slli_pi16(in, 8), pix_mask_gba);

         r = _mm_mulhi_pi16(r, mul16_r);
         g = _mm_mulhi_pi16(g, mul16_gba);
         b = _mm_mulhi_pi16(b, mul16_gba);
         a = _mm_mulhi_pi16(a, mul16_gba);

         res_lo_bg = _mm_unpacklo_pi8(b, g);
         res_hi_bg = _mm_unpackhi_pi8(b, g);
         res_lo_ra = _mm_unpacklo_pi8(r, a);
         res_hi_ra = _mm_unpackhi_pi8(r, a);

         res_lo = _mm_or_si64(res_lo_bg,
               _mm_slli_si64(res_lo_ra, 16));
         res_hi = _mm_or_si64(res_hi_bg,
               _mm_slli_si64(res_hi_ra, 16));

         *((__m64*)(output + w + 0)) = res_lo;
         *((__m64*)(output + w + 2)) = res_hi;
      }

      /* Clear MMX state so x87 FP is usable again. */
      _mm_empty();
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 12) & 0xf;
         uint32_t g = (col >> 8) & 0xf;
         uint32_t b = (col >> 4) & 0xf;
         uint32_t a = (col >> 0) & 0xf;
         r = (r << 4) | r;
         g = (g << 4) | g;
         b = (b << 4) | b;
         a = (a << 4) | a;

         output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
460
/* Convert RGBA4444 to RGB565 (lossy): each 4-bit channel lands in the
 * top of its 5/6/5 field and alpha is dropped. Strides are in bytes. */
void conv_rgba4444_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int y, x;
   const uint16_t *src = (const uint16_t*)input_;
   uint16_t *dst = (uint16_t*)output_;

   for (y = 0; y < height;
         y++, dst += out_stride >> 1, src += in_stride >> 1)
   {
      for (x = 0; x < width; x++)
      {
         const uint32_t c = src[x];
         const uint32_t r = (c >> 12) & 0xf;
         const uint32_t g = (c >> 8) & 0xf;
         const uint32_t b = (c >> 4) & 0xf;
         dst[x] = (uint16_t)((r << 12) | (g << 7) | (b << 1));
      }
   }
}
483
484 #if defined(__SSE2__)
485 /* :( TODO: Make this saner. */
/* Pack 16 pixels of 32-bit xRGB (four vectors a..d, little-endian
 * bytes B,G,R,x per pixel) into 48 contiguous bytes of BGR24 at
 * output, via three unaligned 16-byte stores.
 *
 * mask_N isolates the low 3 bytes (B,G,R) of pixel N inside a vector;
 * each shift then slides that 3-byte group into its packed position.
 * The top (x/alpha) byte of every pixel is discarded. */
static INLINE void store_bgr24_sse2(void *output, __m128i a,
      __m128i b, __m128i c, __m128i d)
{
   const __m128i mask_0 = _mm_set_epi32(0, 0, 0, 0x00ffffff);
   const __m128i mask_1 = _mm_set_epi32(0, 0, 0x00ffffff, 0);
   const __m128i mask_2 = _mm_set_epi32(0, 0x00ffffff, 0, 0);
   const __m128i mask_3 = _mm_set_epi32(0x00ffffff, 0, 0, 0);

   /* First 16 output bytes: all of a, plus the start of b. */
   __m128i a0 = _mm_and_si128(a, mask_0);
   __m128i a1 = _mm_srli_si128(_mm_and_si128(a, mask_1), 1);
   __m128i a2 = _mm_srli_si128(_mm_and_si128(a, mask_2), 2);
   __m128i a3 = _mm_srli_si128(_mm_and_si128(a, mask_3), 3);
   __m128i a4 = _mm_slli_si128(_mm_and_si128(b, mask_0), 12);
   __m128i a5 = _mm_slli_si128(_mm_and_si128(b, mask_1), 11);

   /* Second 16 bytes: remainder of b, plus the start of c. */
   __m128i b0 = _mm_srli_si128(_mm_and_si128(b, mask_1), 5);
   __m128i b1 = _mm_srli_si128(_mm_and_si128(b, mask_2), 6);
   __m128i b2 = _mm_srli_si128(_mm_and_si128(b, mask_3), 7);
   __m128i b3 = _mm_slli_si128(_mm_and_si128(c, mask_0), 8);
   __m128i b4 = _mm_slli_si128(_mm_and_si128(c, mask_1), 7);
   __m128i b5 = _mm_slli_si128(_mm_and_si128(c, mask_2), 6);

   /* Third 16 bytes: remainder of c, plus all of d. */
   __m128i c0 = _mm_srli_si128(_mm_and_si128(c, mask_2), 10);
   __m128i c1 = _mm_srli_si128(_mm_and_si128(c, mask_3), 11);
   __m128i c2 = _mm_slli_si128(_mm_and_si128(d, mask_0), 4);
   __m128i c3 = _mm_slli_si128(_mm_and_si128(d, mask_1), 3);
   __m128i c4 = _mm_slli_si128(_mm_and_si128(d, mask_2), 2);
   __m128i c5 = _mm_slli_si128(_mm_and_si128(d, mask_3), 1);

   __m128i *out = (__m128i*)output;

   _mm_storeu_si128(out + 0,
         _mm_or_si128(a0, _mm_or_si128(a1, _mm_or_si128(a2,
                  _mm_or_si128(a3, _mm_or_si128(a4, a5))))));

   _mm_storeu_si128(out + 1,
         _mm_or_si128(b0, _mm_or_si128(b1, _mm_or_si128(b2,
                  _mm_or_si128(b3, _mm_or_si128(b4, b5))))));

   _mm_storeu_si128(out + 2,
         _mm_or_si128(c0, _mm_or_si128(c1, _mm_or_si128(c2,
                  _mm_or_si128(c3, _mm_or_si128(c4, c5))))));
}
529 #endif
530
/* Convert 0RGB1555 to tightly packed BGR24 (3 bytes per pixel).
 * 5-bit channels expand to 8 bits as (v << 3) | (v >> 2).
 * Strides are in bytes; width/height in pixels. */
void conv_0rgb1555_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_gb = _mm_set1_epi16(0x1f << 5);
   /* mulhi on a channel pre-shifted to the top of a 16-bit lane
    * computes v * 8.25 == (v << 3) | (v >> 2) for 5-bit v. */
   const __m128i mul15_mid = _mm_set1_epi16(0x4200);
   const __m128i mul15_hi = _mm_set1_epi16(0x0210);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 1)
   {
      uint8_t *out = output;
      int w = 0;

#if defined(__SSE2__)
      /* 16 pixels (48 output bytes) per iteration. */
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i res_lo_bg0, res_lo_bg1, res_hi_bg0, res_hi_bg1,
                 res_lo_ra0, res_lo_ra1, res_hi_ra0, res_hi_ra1,
                 res_lo0, res_lo1, res_hi0, res_hi1;
         const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w + 0));
         const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i r0 = _mm_and_si128(in0, pix_mask_r);
         __m128i r1 = _mm_and_si128(in1, pix_mask_r);
         __m128i g0 = _mm_and_si128(in0, pix_mask_gb);
         __m128i g1 = _mm_and_si128(in1, pix_mask_gb);
         __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);

         r0 = _mm_mulhi_epi16(r0, mul15_hi);
         r1 = _mm_mulhi_epi16(r1, mul15_hi);
         g0 = _mm_mulhi_epi16(g0, mul15_mid);
         g1 = _mm_mulhi_epi16(g1, mul15_mid);
         b0 = _mm_mulhi_epi16(b0, mul15_mid);
         b1 = _mm_mulhi_epi16(b1, mul15_mid);

         /* Build intermediate xRGB vectors (bytes B, G, R, A). */
         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
         res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
         res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
         res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
         res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);

         res_lo0 = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_lo1 = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi0 = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_hi1 = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

         /* Non-POT pixel size for the loss: pack 16 pixels to 48 bytes. */
         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
      }
#endif

      /* Scalar tail: emit bytes in B, G, R order. */
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t b = (col >> 0) & 0x1f;
         uint32_t g = (col >> 5) & 0x1f;
         uint32_t r = (col >> 10) & 0x1f;
         b = (b << 3) | (b >> 2);
         g = (g << 3) | (g >> 2);
         r = (r << 3) | (r >> 2);

         *out++ = b;
         *out++ = g;
         *out++ = r;
      }
   }
}
616
/* Convert RGB565 to tightly packed BGR24 (3 bytes per pixel).
 * 5-bit channels expand as (v << 3) | (v >> 2), 6-bit green as
 * (v << 2) | (v >> 4). Strides are in bytes; width/height in pixels. */
void conv_rgb565_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   /* mulhi on a channel pre-shifted to the top of a 16-bit lane
    * reproduces the scalar bit-replication exactly. */
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f << 5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f << 5);
   const __m128i mul16_r = _mm_set1_epi16(0x0210);
   const __m128i mul16_g = _mm_set1_epi16(0x2080);
   const __m128i mul16_b = _mm_set1_epi16(0x4200);
   const __m128i a = _mm_set1_epi16(0x00ff);

   int max_width = width - 15;
#endif

   for (h = 0; h < height; h++, output += out_stride, input += in_stride >> 1)
   {
      uint8_t *out = output;
      int w = 0;
#if defined(__SSE2__)
      /* 16 pixels (48 output bytes) per iteration. */
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i res_lo_bg0, res_hi_bg0, res_lo_ra0, res_hi_ra0;
         __m128i res_lo_bg1, res_hi_bg1, res_lo_ra1, res_hi_ra1;
         __m128i res_lo0, res_hi0, res_lo1, res_hi1;
         const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w));
         const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i r0 = _mm_and_si128(_mm_srli_epi16(in0, 1), pix_mask_r);
         __m128i g0 = _mm_and_si128(in0, pix_mask_g);
         __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_b);
         __m128i r1 = _mm_and_si128(_mm_srli_epi16(in1, 1), pix_mask_r);
         __m128i g1 = _mm_and_si128(in1, pix_mask_g);
         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);

         r0 = _mm_mulhi_epi16(r0, mul16_r);
         g0 = _mm_mulhi_epi16(g0, mul16_g);
         b0 = _mm_mulhi_epi16(b0, mul16_b);
         r1 = _mm_mulhi_epi16(r1, mul16_r);
         g1 = _mm_mulhi_epi16(g1, mul16_g);
         b1 = _mm_mulhi_epi16(b1, mul16_b);

         /* Build intermediate xRGB vectors (bytes B, G, R, A). */
         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
         res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
         res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
         res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
         res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);

         res_lo0 = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_hi0 = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_lo1 = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi1 = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

         /* Pack 16 pixels into 48 bytes of BGR24. */
         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
      }
#endif

      /* Scalar tail: emit bytes in B, G, R order. */
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 11) & 0x1f;
         uint32_t g = (col >> 5) & 0x3f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);

         *out++ = b;
         *out++ = g;
         *out++ = r;
      }
   }
}
701
/* Convert tightly packed BGR24 (3 bytes per pixel) to ARGB8888 with
 * alpha forced to 0xff. Strides are in bytes; width/height in pixels. */
void conv_bgr24_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int y, x;
   const uint8_t *src = (const uint8_t*)input_;
   uint32_t *dst = (uint32_t*)output_;

   for (y = 0; y < height;
         y++, dst += out_stride >> 2, src += in_stride)
   {
      const uint8_t *pix = src;
      for (x = 0; x < width; x++)
      {
         const uint32_t b = *pix++;
         const uint32_t g = *pix++;
         const uint32_t r = *pix++;
         dst[x] = 0xff000000u | (r << 16) | (g << 8) | b;
      }
   }
}
723
/* Convert tightly packed BGR24 (3 bytes per pixel) to RGB565,
 * truncating each channel to its top 5/6/5 bits.
 * Strides are in bytes; width/height in pixels. */
void conv_bgr24_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint8_t *input = (const uint8_t*)input_;
   uint16_t *output = (uint16_t*)output_;
   /* Fix: out_stride is in bytes, so advance the uint16_t row pointer
    * by out_stride >> 1; the old code advanced by out_stride elements,
    * i.e. twice the intended distance. */
   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride)
   {
      const uint8_t *inp = input;
      for (w = 0; w < width; w++)
      {
         uint16_t b = *inp++;
         uint16_t g = *inp++;
         uint16_t r = *inp++;

         output[w] = ((r & 0x00F8) << 8) | ((g&0x00FC) << 3) | ((b&0x00F8) >> 3);
      }
   }
}
745
/* Convert ARGB8888 to 0RGB1555: keep the top 5 bits of each colour
 * channel and discard alpha. Strides are in bytes. */
void conv_argb8888_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int y, x;
   const uint32_t *src = (const uint32_t*)input_;
   uint16_t *dst = (uint16_t*)output_;

   for (y = 0; y < height;
         y++, dst += out_stride >> 1, src += in_stride >> 2)
   {
      for (x = 0; x < width; x++)
      {
         const uint32_t c = src[x];
         const uint16_t r = (uint16_t)((c >> 19) & 0x1f);
         const uint16_t g = (uint16_t)((c >> 11) & 0x1f);
         const uint16_t b = (uint16_t)((c >> 3) & 0x1f);
         dst[x] = (uint16_t)((r << 10) | (g << 5) | b);
      }
   }
}
767
/* Convert ARGB8888 to tightly packed BGR24 by dropping the top
 * (alpha) byte of each pixel. Strides are in bytes. */
void conv_argb8888_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint32_t *input = (const uint32_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 2)
   {
      uint8_t *out = output;
      int w = 0;
#if defined(__SSE2__)
      /* 16 pixels (48 output bytes) per iteration. */
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i l0 = _mm_loadu_si128((const __m128i*)(input + w + 0));
         __m128i l1 = _mm_loadu_si128((const __m128i*)(input + w + 4));
         __m128i l2 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i l3 = _mm_loadu_si128((const __m128i*)(input + w + 12));
         store_bgr24_sse2(out, l0, l1, l2, l3);
      }
#endif

      /* Scalar tail: emit bytes in B, G, R order. */
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         *out++ = (uint8_t)(col >> 0);
         *out++ = (uint8_t)(col >> 8);
         *out++ = (uint8_t)(col >> 16);
      }
   }
}
805
806 #if defined(__SSE2__)
conv_shuffle_rb_epi32(__m128i c)807 static INLINE __m128i conv_shuffle_rb_epi32(__m128i c)
808 {
809 /* SSSE3 plz */
810 const __m128i b_mask = _mm_set1_epi32(0x000000ff);
811 const __m128i g_mask = _mm_set1_epi32(0x0000ff00);
812 const __m128i r_mask = _mm_set1_epi32(0x00ff0000);
813 __m128i sl = _mm_and_si128(_mm_slli_epi32(c, 16), r_mask);
814 __m128i sr = _mm_and_si128(_mm_srli_epi32(c, 16), b_mask);
815 __m128i g = _mm_and_si128(c, g_mask);
816 __m128i rb = _mm_or_si128(sl, sr);
817 return _mm_or_si128(g, rb);
818 }
819 #endif
820
/* Convert ABGR8888 to tightly packed BGR24: the vector path first
 * swaps R and B into xRGB order so store_bgr24_sse2 can pack it; the
 * scalar tail emits B, G, R directly. Strides are in bytes. */
void conv_abgr8888_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint32_t *input = (const uint32_t*)input_;
   uint8_t *output = (uint8_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 2)
   {
      uint8_t *out = output;
      int w = 0;
#if defined(__SSE2__)
      /* 16 pixels (48 output bytes) per iteration. */
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i a = _mm_loadu_si128((const __m128i*)(input + w + 0));
         __m128i b = _mm_loadu_si128((const __m128i*)(input + w + 4));
         __m128i c = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i d = _mm_loadu_si128((const __m128i*)(input + w + 12));
         a = conv_shuffle_rb_epi32(a);
         b = conv_shuffle_rb_epi32(b);
         c = conv_shuffle_rb_epi32(c);
         d = conv_shuffle_rb_epi32(d);
         store_bgr24_sse2(out, a, b, c, d);
      }
#endif

      /* Scalar tail: ABGR pixel has B at bits 16..23, R at 0..7. */
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         *out++ = (uint8_t)(col >> 16);
         *out++ = (uint8_t)(col >> 8);
         *out++ = (uint8_t)(col >> 0);
      }
   }
}
862
/* Swap the R and B channels of 32-bit pixels (ARGB8888 <-> ABGR8888);
 * alpha and green are left untouched. Strides are in bytes. */
void conv_argb8888_abgr8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int y, x;
   const uint32_t *src = (const uint32_t*)input_;
   uint32_t *dst = (uint32_t*)output_;

   for (y = 0; y < height;
         y++, dst += out_stride >> 2, src += in_stride >> 2)
   {
      for (x = 0; x < width; x++)
      {
         const uint32_t c = src[x];
         const uint32_t swapped = ((c & 0x000000ff) << 16)
               | ((c & 0x00ff0000) >> 16);
         dst[x] = swapped | (c & 0xff00ff00);
      }
   }
}
882
883 #define YUV_SHIFT 6
884 #define YUV_OFFSET (1 << (YUV_SHIFT - 1))
885 #define YUV_MAT_Y (1 << 6)
886 #define YUV_MAT_U_G (-22)
887 #define YUV_MAT_U_B (113)
888 #define YUV_MAT_V_R (90)
889 #define YUV_MAT_V_G (-46)
890
conv_yuyv_argb8888(void * output_,const void * input_,int width,int height,int out_stride,int in_stride)891 void conv_yuyv_argb8888(void *output_, const void *input_,
892 int width, int height,
893 int out_stride, int in_stride)
894 {
895 int h;
896 const uint8_t *input = (const uint8_t*)input_;
897 uint32_t *output = (uint32_t*)output_;
898
899 #if defined(__SSE2__)
900 const __m128i mask_y = _mm_set1_epi16(0xffu);
901 const __m128i mask_u = _mm_set1_epi32(0xffu << 8);
902 const __m128i mask_v = _mm_set1_epi32(0xffu << 24);
903 const __m128i chroma_offset = _mm_set1_epi16(128);
904 const __m128i round_offset = _mm_set1_epi16(YUV_OFFSET);
905
906 const __m128i yuv_mul = _mm_set1_epi16(YUV_MAT_Y);
907 const __m128i u_g_mul = _mm_set1_epi16(YUV_MAT_U_G);
908 const __m128i u_b_mul = _mm_set1_epi16(YUV_MAT_U_B);
909 const __m128i v_r_mul = _mm_set1_epi16(YUV_MAT_V_R);
910 const __m128i v_g_mul = _mm_set1_epi16(YUV_MAT_V_G);
911 const __m128i a = _mm_cmpeq_epi16(
912 _mm_setzero_si128(), _mm_setzero_si128());
913 #endif
914
915 for (h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
916 {
917 const uint8_t *src = input;
918 uint32_t *dst = output;
919 int w = 0;
920
921 #if defined(__SSE2__)
922 /* Each loop processes 16 pixels. */
923 for (; w + 16 <= width; w += 16, src += 32, dst += 16)
924 {
925 __m128i u, v, u0_g, u1_g, u0_b, u1_b, v0_r, v1_r, v0_g, v1_g,
926 r0, g0, b0, r1, g1, b1;
927 __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
928 __m128i res0, res1, res2, res3;
929 __m128i yuv0 = _mm_loadu_si128((const __m128i*)(src + 0)); /* [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...] */
930 __m128i yuv1 = _mm_loadu_si128((const __m128i*)(src + 16)); /* [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...] */
931
932 __m128i _y0 = _mm_and_si128(yuv0, mask_y); /* [Y0, Y1, Y2, ...] (16-bit) */
933 __m128i u0 = _mm_and_si128(yuv0, mask_u); /* [0, U0, 0, 0, 0, U1, 0, 0, ...] */
934 __m128i v0 = _mm_and_si128(yuv0, mask_v); /* [0, 0, 0, V1, 0, , 0, V1, ...] */
935 __m128i _y1 = _mm_and_si128(yuv1, mask_y); /* [Y0, Y1, Y2, ...] (16-bit) */
936 __m128i u1 = _mm_and_si128(yuv1, mask_u); /* [0, U0, 0, 0, 0, U1, 0, 0, ...] */
937 __m128i v1 = _mm_and_si128(yuv1, mask_v); /* [0, 0, 0, V1, 0, , 0, V1, ...] */
938
939 /* Juggle around to get U and V in the same 16-bit format as Y. */
940 u0 = _mm_srli_si128(u0, 1);
941 v0 = _mm_srli_si128(v0, 3);
942 u1 = _mm_srli_si128(u1, 1);
943 v1 = _mm_srli_si128(v1, 3);
944 u = _mm_packs_epi32(u0, u1);
945 v = _mm_packs_epi32(v0, v1);
946
947 /* Apply YUV offsets (U, V) -= (-128, -128). */
948 u = _mm_sub_epi16(u, chroma_offset);
949 v = _mm_sub_epi16(v, chroma_offset);
950
951 /* Upscale chroma horizontally (nearest). */
952 u0 = _mm_unpacklo_epi16(u, u);
953 u1 = _mm_unpackhi_epi16(u, u);
954 v0 = _mm_unpacklo_epi16(v, v);
955 v1 = _mm_unpackhi_epi16(v, v);
956
957 /* Apply transformations. */
958 _y0 = _mm_mullo_epi16(_y0, yuv_mul);
959 _y1 = _mm_mullo_epi16(_y1, yuv_mul);
960 u0_g = _mm_mullo_epi16(u0, u_g_mul);
961 u1_g = _mm_mullo_epi16(u1, u_g_mul);
962 u0_b = _mm_mullo_epi16(u0, u_b_mul);
963 u1_b = _mm_mullo_epi16(u1, u_b_mul);
964 v0_r = _mm_mullo_epi16(v0, v_r_mul);
965 v1_r = _mm_mullo_epi16(v1, v_r_mul);
966 v0_g = _mm_mullo_epi16(v0, v_g_mul);
967 v1_g = _mm_mullo_epi16(v1, v_g_mul);
968
969 /* Add contibutions from the transformed components. */
970 r0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_y0, v0_r),
971 round_offset), YUV_SHIFT);
972 g0 = _mm_srai_epi16(_mm_adds_epi16(
973 _mm_adds_epi16(_mm_adds_epi16(_y0, v0_g), u0_g), round_offset), YUV_SHIFT);
974 b0 = _mm_srai_epi16(_mm_adds_epi16(
975 _mm_adds_epi16(_y0, u0_b), round_offset), YUV_SHIFT);
976
977 r1 = _mm_srai_epi16(_mm_adds_epi16(
978 _mm_adds_epi16(_y1, v1_r), round_offset), YUV_SHIFT);
979 g1 = _mm_srai_epi16(_mm_adds_epi16(
980 _mm_adds_epi16(_mm_adds_epi16(_y1, v1_g), u1_g), round_offset), YUV_SHIFT);
981 b1 = _mm_srai_epi16(_mm_adds_epi16(
982 _mm_adds_epi16(_y1, u1_b), round_offset), YUV_SHIFT);
983
984 /* Saturate into 8-bit. */
985 r0 = _mm_packus_epi16(r0, r1);
986 g0 = _mm_packus_epi16(g0, g1);
987 b0 = _mm_packus_epi16(b0, b1);
988
989 /* Interleave into ARGB. */
990 res_lo_bg = _mm_unpacklo_epi8(b0, g0);
991 res_hi_bg = _mm_unpackhi_epi8(b0, g0);
992 res_lo_ra = _mm_unpacklo_epi8(r0, a);
993 res_hi_ra = _mm_unpackhi_epi8(r0, a);
994 res0 = _mm_unpacklo_epi16(res_lo_bg, res_lo_ra);
995 res1 = _mm_unpackhi_epi16(res_lo_bg, res_lo_ra);
996 res2 = _mm_unpacklo_epi16(res_hi_bg, res_hi_ra);
997 res3 = _mm_unpackhi_epi16(res_hi_bg, res_hi_ra);
998
999 _mm_storeu_si128((__m128i*)(dst + 0), res0);
1000 _mm_storeu_si128((__m128i*)(dst + 4), res1);
1001 _mm_storeu_si128((__m128i*)(dst + 8), res2);
1002 _mm_storeu_si128((__m128i*)(dst + 12), res3);
1003 }
1004 #endif
1005
1006 /* Finish off the rest (if any) in C. */
1007 for (; w < width; w += 2, src += 4, dst += 2)
1008 {
1009 int _y0 = src[0];
1010 int u = src[1] - 128;
1011 int _y1 = src[2];
1012 int v = src[3] - 128;
1013
1014 uint8_t r0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
1015 uint8_t g0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
1016 uint8_t b0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
1017
1018 uint8_t r1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
1019 uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
1020 uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
1021
1022 dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
1023 dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
1024 }
1025 }
1026 }
1027
/* Straight row-by-row copy between two packed pixel buffers.
 *
 * Strides may be negative (bottom-up images), so the magnitude of
 * each stride gives the usable bytes per row. Each row copies
 * min(|in_stride|, |out_stride|) bytes so that neither buffer is
 * over-read nor over-written. The width parameter is unused; it is
 * kept for signature parity with the other conv_* routines.
 */
void conv_copy(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int y;
   const uint8_t *src = (const uint8_t*)input_;
   uint8_t *dst       = (uint8_t*)output_;
   int in_len         = abs(in_stride);
   int out_len        = abs(out_stride);
   /* Per-row byte count is bounded by the smaller stride. */
   int row_len        = (in_len < out_len) ? in_len : out_len;

   for (y = 0; y < height; y++)
   {
      memcpy(dst, src, row_len);
      dst += out_stride;
      src += in_stride;
   }
}
1044