/* Copyright (C) 2010-2020 The RetroArch team
 *
 * ---------------------------------------------------------------------------------------
 * The following license statement only applies to this file (pixconv.c).
 * ---------------------------------------------------------------------------------------
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <retro_inline.h>

#include <gfx/scaler/pixconv.h>

#if _MSC_VER && _MSC_VER <= 1800
#define SCALER_NO_SIMD
#endif

#ifdef SCALER_NO_SIMD
#undef __SSE2__
#endif

#if defined(__SSE2__)
#include <emmintrin.h>
#elif defined(__MMX__)
#include <mmintrin.h>
#endif

void conv_rgb565_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output = (uint16_t*)output_;

#if defined(__SSE2__)
   int max_width           = width - 7;
   const __m128i hi_mask   = _mm_set1_epi16(0x7fe0);
   const __m128i lo_mask   = _mm_set1_epi16(0x1f);
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i hi = _mm_and_si128(_mm_srli_epi16(in, 1), hi_mask);
         __m128i lo = _mm_and_si128(in, lo_mask);
         _mm_storeu_si128((__m128i*)(output + w), _mm_or_si128(hi, lo));
      }
#endif

      for (; w < width; w++)
      {
         uint16_t col = input[w];
         uint16_t hi  = (col >> 1) & 0x7fe0;
         uint16_t lo  = col & 0x1f;
         output[w]    = hi | lo;
      }
   }
}
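
/* RGB565 keeps 6 bits of green where 0RGB1555 keeps 5, so the conversion
 * above shifts red and green down by one and drops the green LSB; e.g.
 * 0xFFFF (white in RGB565) becomes 0x7FFF (white in 0RGB1555). */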

void conv_0rgb1555_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input   = (const uint16_t*)input_;
   uint16_t *output        = (uint16_t*)output_;

#if defined(__SSE2__)
   int max_width           = width - 7;

   const __m128i hi_mask   = _mm_set1_epi16(
         (int16_t)((0x1f << 11) | (0x1f << 6)));
   const __m128i lo_mask   = _mm_set1_epi16(0x1f);
   const __m128i glow_mask = _mm_set1_epi16(1 << 5);
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i rg   = _mm_and_si128(_mm_slli_epi16(in, 1), hi_mask);
         __m128i b    = _mm_and_si128(in, lo_mask);
         __m128i glow = _mm_and_si128(_mm_srli_epi16(in, 4), glow_mask);
         _mm_storeu_si128((__m128i*)(output + w),
               _mm_or_si128(rg, _mm_or_si128(b, glow)));
      }
#endif

      for (; w < width; w++)
      {
         uint16_t col  = input[w];
         uint16_t rg   = (col << 1) & ((0x1f << 11) | (0x1f << 6));
         uint16_t b    = col & 0x1f;
         uint16_t glow = (col >> 4) & (1 << 5);
         output[w]     = rg | b | glow;
      }
   }
}
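
/* Widening back from 5-bit to 6-bit green replicates the old MSB into the
 * new LSB (the "glow" bit above): g6 = (g5 << 1) | (g5 >> 4), so full
 * intensity 0x1f maps back to full intensity 0x3f. */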

void conv_0rgb1555_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output      = (uint32_t*)output_;

#ifdef __SSE2__
   const __m128i pix_mask_r  = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_gb = _mm_set1_epi16(0x1f <<  5);
   const __m128i mul15_mid   = _mm_set1_epi16(0x4200);
   const __m128i mul15_hi    = _mm_set1_epi16(0x0210);
   const __m128i a           = _mm_set1_epi16(0x00ff);

   int max_width = width - 7;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#ifdef __SSE2__
      for (; w < max_width; w += 8)
      {
         __m128i res_lo_bg, res_hi_bg;
         __m128i res_lo_ra, res_hi_ra;
         __m128i res_lo, res_hi;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i r = _mm_and_si128(in, pix_mask_r);
         __m128i g = _mm_and_si128(in, pix_mask_gb);
         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_gb);

         r = _mm_mulhi_epi16(r, mul15_hi);
         g = _mm_mulhi_epi16(g, mul15_mid);
         b = _mm_mulhi_epi16(b, mul15_mid);

         res_lo_bg = _mm_unpacklo_epi8(b, g);
         res_hi_bg = _mm_unpackhi_epi8(b, g);
         res_lo_ra = _mm_unpacklo_epi8(r, a);
         res_hi_ra = _mm_unpackhi_epi8(r, a);

         res_lo = _mm_or_si128(res_lo_bg,
               _mm_slli_si128(res_lo_ra, 2));
         res_hi = _mm_or_si128(res_hi_bg,
               _mm_slli_si128(res_hi_ra, 2));

         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r   = (col >> 10) & 0x1f;
         uint32_t g   = (col >>  5) & 0x1f;
         uint32_t b   = (col >>  0) & 0x1f;
         r            = (r << 3) | (r >> 2);
         g            = (g << 3) | (g >> 2);
         b            = (b << 3) | (b >> 2);

         output[w]    = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
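
/* The _mm_mulhi_epi16 constants above are a fixed-point form of the
 * 5-bit-to-8-bit expansion used by the scalar tail: with a channel sitting
 * at bits 10..14 of a 16-bit lane, multiplying by 0x0210 (2^9 + 2^4) and
 * keeping the high half yields (x << 3) | (x >> 2); 0x4200 (2^14 + 2^9)
 * does the same for a channel at bits 5..9. */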

void conv_rgb565_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input    = (const uint16_t*)input_;
   uint32_t *output         = (uint32_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5);
   const __m128i mul16_r    = _mm_set1_epi16(0x0210);
   const __m128i mul16_g    = _mm_set1_epi16(0x2080);
   const __m128i mul16_b    = _mm_set1_epi16(0x4200);
   const __m128i a          = _mm_set1_epi16(0x00ff);

   int max_width            = width - 7;
#elif defined(__MMX__)
   const __m64 pix_mask_r = _mm_set1_pi16(0x1f << 10);
   const __m64 pix_mask_g = _mm_set1_pi16(0x3f << 5);
   const __m64 pix_mask_b = _mm_set1_pi16(0x1f << 5);
   const __m64 mul16_r    = _mm_set1_pi16(0x0210);
   const __m64 mul16_g    = _mm_set1_pi16(0x2080);
   const __m64 mul16_b    = _mm_set1_pi16(0x4200);
   const __m64 a          = _mm_set1_pi16(0x00ff);

   int max_width          = width - 3;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         __m128i res_lo, res_hi;
         __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i        r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
         __m128i        g = _mm_and_si128(in, pix_mask_g);
         __m128i        b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);

         r                = _mm_mulhi_epi16(r, mul16_r);
         g                = _mm_mulhi_epi16(g, mul16_g);
         b                = _mm_mulhi_epi16(b, mul16_b);

         res_lo_bg        = _mm_unpacklo_epi8(b, g);
         res_hi_bg        = _mm_unpackhi_epi8(b, g);
         res_lo_ra        = _mm_unpacklo_epi8(r, a);
         res_hi_ra        = _mm_unpackhi_epi8(r, a);

         res_lo           = _mm_or_si128(res_lo_bg,
               _mm_slli_si128(res_lo_ra, 2));
         res_hi           = _mm_or_si128(res_hi_bg,
               _mm_slli_si128(res_hi_ra, 2));

         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#elif defined(__MMX__)
      for (; w < max_width; w += 4)
      {
         __m64 res_lo, res_hi;
         __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m64 in = *((__m64*)(input + w));
         __m64          r = _mm_and_si64(_mm_srli_pi16(in, 1), pix_mask_r);
         __m64          g = _mm_and_si64(in, pix_mask_g);
         __m64          b = _mm_and_si64(_mm_slli_pi16(in, 5), pix_mask_b);

         r                = _mm_mulhi_pi16(r, mul16_r);
         g                = _mm_mulhi_pi16(g, mul16_g);
         b                = _mm_mulhi_pi16(b, mul16_b);

         res_lo_bg        = _mm_unpacklo_pi8(b, g);
         res_hi_bg        = _mm_unpackhi_pi8(b, g);
         res_lo_ra        = _mm_unpacklo_pi8(r, a);
         res_hi_ra        = _mm_unpackhi_pi8(r, a);

         res_lo           = _mm_or_si64(res_lo_bg,
               _mm_slli_si64(res_lo_ra, 16));
         res_hi           = _mm_or_si64(res_hi_bg,
               _mm_slli_si64(res_hi_ra, 16));

         *((__m64*)(output + w + 0)) = res_lo;
         *((__m64*)(output + w + 2)) = res_hi;
      }

      _mm_empty();
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r   = (col >> 11) & 0x1f;
         uint32_t g   = (col >>  5) & 0x3f;
         uint32_t b   = (col >>  0) & 0x1f;
         r            = (r << 3) | (r >> 2);
         g            = (g << 2) | (g >> 4);
         b            = (b << 3) | (b >> 2);

         output[w]    = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
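
/* RGB565 green is 6 bits wide, so it gets its own multiplier: with green at
 * bits 5..10, mulhi by 0x2080 (2^13 + 2^7) computes (g << 2) | (g >> 4),
 * matching the scalar fallback above. */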

void conv_rgb565_abgr8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input    = (const uint16_t*)input_;
   uint32_t *output         = (uint32_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5);
   const __m128i mul16_r    = _mm_set1_epi16(0x0210);
   const __m128i mul16_g    = _mm_set1_epi16(0x2080);
   const __m128i mul16_b    = _mm_set1_epi16(0x4200);
   const __m128i a          = _mm_set1_epi16(0x00ff);

   int max_width            = width - 7;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 8)
      {
         __m128i res_lo, res_hi;
         __m128i res_lo_rg, res_hi_rg, res_lo_ba, res_hi_ba;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i        r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
         __m128i        g = _mm_and_si128(in, pix_mask_g);
         __m128i        b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
         r                = _mm_mulhi_epi16(r, mul16_r);
         g                = _mm_mulhi_epi16(g, mul16_g);
         b                = _mm_mulhi_epi16(b, mul16_b);
         /* Interleave as R, G, B, A in memory so each 32-bit word reads as
          * ABGR8888, matching the scalar tail below. */
         res_lo_rg        = _mm_unpacklo_epi8(r, g);
         res_hi_rg        = _mm_unpackhi_epi8(r, g);
         res_lo_ba        = _mm_unpacklo_epi8(b, a);
         res_hi_ba        = _mm_unpackhi_epi8(b, a);
         res_lo           = _mm_or_si128(res_lo_rg,
               _mm_slli_si128(res_lo_ba, 2));
         res_hi           = _mm_or_si128(res_hi_rg,
               _mm_slli_si128(res_hi_ba, 2));
         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
         _mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r   = (col >> 11) & 0x1f;
         uint32_t g   = (col >>  5) & 0x3f;
         uint32_t b   = (col >>  0) & 0x1f;
         r            = (r << 3) | (r >> 2);
         g            = (g << 2) | (g >> 4);
         b            = (b << 3) | (b >> 2);
         output[w]    = (0xffu << 24) | (b << 16) | (g << 8) | (r << 0);
      }
   }
}

void conv_argb8888_rgba4444(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint32_t *input = (const uint32_t*)input_;
   uint16_t *output      = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 2)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         /* Keep the top 4 bits of each 8-bit channel. */
         uint32_t r   = (col >> 20) & 0xf;
         uint32_t g   = (col >> 12) & 0xf;
         uint32_t b   = (col >>  4) & 0xf;
         uint32_t a   = (col >> 28) & 0xf;

         output[w]    = (r << 12) | (g << 8) | (b << 4) | a;
      }
   }
}

void conv_rgba4444_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input = (const uint16_t*)input_;
   uint32_t *output      = (uint32_t*)output_;

#if defined(__MMX__)
   const __m64 pix_mask_r = _mm_set1_pi16(0xf << 10);
   const __m64 pix_mask_g = _mm_set1_pi16(0xf << 8);
   const __m64 pix_mask_b = _mm_set1_pi16(0xf << 8);
   const __m64 mul16_r    = _mm_set1_pi16(0x0440);
   const __m64 mul16_g    = _mm_set1_pi16(0x1100);
   const __m64 mul16_b    = _mm_set1_pi16(0x1100);
   const __m64 a          = _mm_set1_pi16(0x00ff);

   int max_width          = width - 3;
#endif

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 1)
   {
      int w = 0;
#if defined(__MMX__)
      for (; w < max_width; w += 4)
      {
         __m64 res_lo, res_hi;
         __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m64 in = *((__m64*)(input + w));
         __m64          r = _mm_and_si64(_mm_srli_pi16(in, 2), pix_mask_r);
         __m64          g = _mm_and_si64(in, pix_mask_g);
         __m64          b = _mm_and_si64(_mm_slli_pi16(in, 4), pix_mask_b);

         r                = _mm_mulhi_pi16(r, mul16_r);
         g                = _mm_mulhi_pi16(g, mul16_g);
         b                = _mm_mulhi_pi16(b, mul16_b);

         res_lo_bg        = _mm_unpacklo_pi8(b, g);
         res_hi_bg        = _mm_unpackhi_pi8(b, g);
         res_lo_ra        = _mm_unpacklo_pi8(r, a);
         res_hi_ra        = _mm_unpackhi_pi8(r, a);

         res_lo           = _mm_or_si64(res_lo_bg,
               _mm_slli_si64(res_lo_ra, 16));
         res_hi           = _mm_or_si64(res_hi_bg,
               _mm_slli_si64(res_hi_ra, 16));

         *((__m64*)(output + w + 0)) = res_lo;
         *((__m64*)(output + w + 2)) = res_hi;
      }

      _mm_empty();
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r   = (col >> 12) & 0xf;
         uint32_t g   = (col >>  8) & 0xf;
         uint32_t b   = (col >>  4) & 0xf;
         uint32_t a   = (col >>  0) & 0xf;
         r            = (r << 4) | r;
         g            = (g << 4) | g;
         b            = (b << 4) | b;
         a            = (a << 4) | a;

         output[w]    = (a << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
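
/* A 4-bit channel expands exactly to 8 bits by nibble replication,
 * (x << 4) | x, i.e. multiplication by 17; the MMX constants 0x0440 and
 * 0x1100 above perform that multiply through the high half of a 16-bit
 * product. */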

void conv_rgba4444_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output      = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r   = (col >> 12) & 0xf;
         uint32_t g   = (col >>  8) & 0xf;
         uint32_t b   = (col >>  4) & 0xf;

         output[w]    = (r << 12) | (g << 7) | (b << 1);
      }
   }
}

#if defined(__SSE2__)
/* :( TODO: Make this saner. */
static INLINE void store_bgr24_sse2(void *output, __m128i a,
      __m128i b, __m128i c, __m128i d)
{
   const __m128i mask_0 = _mm_set_epi32(0, 0, 0, 0x00ffffff);
   const __m128i mask_1 = _mm_set_epi32(0, 0, 0x00ffffff, 0);
   const __m128i mask_2 = _mm_set_epi32(0, 0x00ffffff, 0, 0);
   const __m128i mask_3 = _mm_set_epi32(0x00ffffff, 0, 0, 0);

   __m128i a0 = _mm_and_si128(a, mask_0);
   __m128i a1 = _mm_srli_si128(_mm_and_si128(a, mask_1),  1);
   __m128i a2 = _mm_srli_si128(_mm_and_si128(a, mask_2),  2);
   __m128i a3 = _mm_srli_si128(_mm_and_si128(a, mask_3),  3);
   __m128i a4 = _mm_slli_si128(_mm_and_si128(b, mask_0), 12);
   __m128i a5 = _mm_slli_si128(_mm_and_si128(b, mask_1), 11);

   __m128i b0 = _mm_srli_si128(_mm_and_si128(b, mask_1), 5);
   __m128i b1 = _mm_srli_si128(_mm_and_si128(b, mask_2), 6);
   __m128i b2 = _mm_srli_si128(_mm_and_si128(b, mask_3), 7);
   __m128i b3 = _mm_slli_si128(_mm_and_si128(c, mask_0), 8);
   __m128i b4 = _mm_slli_si128(_mm_and_si128(c, mask_1), 7);
   __m128i b5 = _mm_slli_si128(_mm_and_si128(c, mask_2), 6);

   __m128i c0 = _mm_srli_si128(_mm_and_si128(c, mask_2), 10);
   __m128i c1 = _mm_srli_si128(_mm_and_si128(c, mask_3), 11);
   __m128i c2 = _mm_slli_si128(_mm_and_si128(d, mask_0),  4);
   __m128i c3 = _mm_slli_si128(_mm_and_si128(d, mask_1),  3);
   __m128i c4 = _mm_slli_si128(_mm_and_si128(d, mask_2),  2);
   __m128i c5 = _mm_slli_si128(_mm_and_si128(d, mask_3),  1);

   __m128i *out = (__m128i*)output;

   _mm_storeu_si128(out + 0,
         _mm_or_si128(a0, _mm_or_si128(a1, _mm_or_si128(a2,
                  _mm_or_si128(a3, _mm_or_si128(a4, a5))))));

   _mm_storeu_si128(out + 1,
         _mm_or_si128(b0, _mm_or_si128(b1, _mm_or_si128(b2,
                  _mm_or_si128(b3, _mm_or_si128(b4, b5))))));

   _mm_storeu_si128(out + 2,
         _mm_or_si128(c0, _mm_or_si128(c1, _mm_or_si128(c2,
                  _mm_or_si128(c3, _mm_or_si128(c4, c5))))));
}
#endif
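
/* store_bgr24_sse2() takes four registers holding 16 pixels laid out as
 * B, G, R, X bytes and emits 48 tightly packed B, G, R bytes: each pixel is
 * masked down to its low 24 bits and slid into place across three 16-byte
 * stores. */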

void conv_0rgb1555_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input     = (const uint16_t*)input_;
   uint8_t *output           = (uint8_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r  = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_gb = _mm_set1_epi16(0x1f <<  5);
   const __m128i mul15_mid   = _mm_set1_epi16(0x4200);
   const __m128i mul15_hi    = _mm_set1_epi16(0x0210);
   const __m128i a           = _mm_set1_epi16(0x00ff);

   int max_width             = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 1)
   {
      uint8_t *out = output;
      int   w = 0;

#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i res_lo_bg0, res_lo_bg1, res_hi_bg0, res_hi_bg1,
                 res_lo_ra0, res_lo_ra1, res_hi_ra0, res_hi_ra1,
                 res_lo0, res_lo1, res_hi0, res_hi1;
         const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w + 0));
         const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i r0        = _mm_and_si128(in0, pix_mask_r);
         __m128i r1        = _mm_and_si128(in1, pix_mask_r);
         __m128i g0        = _mm_and_si128(in0, pix_mask_gb);
         __m128i g1        = _mm_and_si128(in1, pix_mask_gb);
         __m128i b0        = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
         __m128i b1        = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);

         r0                = _mm_mulhi_epi16(r0, mul15_hi);
         r1                = _mm_mulhi_epi16(r1, mul15_hi);
         g0                = _mm_mulhi_epi16(g0, mul15_mid);
         g1                = _mm_mulhi_epi16(g1, mul15_mid);
         b0                = _mm_mulhi_epi16(b0, mul15_mid);
         b1                = _mm_mulhi_epi16(b1, mul15_mid);

         res_lo_bg0        = _mm_unpacklo_epi8(b0, g0);
         res_lo_bg1        = _mm_unpacklo_epi8(b1, g1);
         res_hi_bg0        = _mm_unpackhi_epi8(b0, g0);
         res_hi_bg1        = _mm_unpackhi_epi8(b1, g1);
         res_lo_ra0        = _mm_unpacklo_epi8(r0, a);
         res_lo_ra1        = _mm_unpacklo_epi8(r1, a);
         res_hi_ra0        = _mm_unpackhi_epi8(r0, a);
         res_hi_ra1        = _mm_unpackhi_epi8(r1, a);

         res_lo0           = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_lo1           = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi0           = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_hi1           = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

         /* Non-POT pixel sizes for the loss */
         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t b   = (col >>  0) & 0x1f;
         uint32_t g   = (col >>  5) & 0x1f;
         uint32_t r   = (col >> 10) & 0x1f;
         b            = (b << 3) | (b >> 2);
         g            = (g << 3) | (g >> 2);
         r            = (r << 3) | (r >> 2);

         *out++       = b;
         *out++       = g;
         *out++       = r;
      }
   }
}

void conv_rgb565_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint16_t *input    = (const uint16_t*)input_;
   uint8_t *output          = (uint8_t*)output_;

#if defined(__SSE2__)
   const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
   const __m128i pix_mask_g = _mm_set1_epi16(0x3f <<  5);
   const __m128i pix_mask_b = _mm_set1_epi16(0x1f <<  5);
   const __m128i mul16_r    = _mm_set1_epi16(0x0210);
   const __m128i mul16_g    = _mm_set1_epi16(0x2080);
   const __m128i mul16_b    = _mm_set1_epi16(0x4200);
   const __m128i a          = _mm_set1_epi16(0x00ff);

   int max_width            = width - 15;
#endif

   for (h = 0; h < height; h++, output += out_stride, input += in_stride >> 1)
   {
      uint8_t *out = output;
      int        w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i res_lo_bg0, res_hi_bg0, res_lo_ra0, res_hi_ra0;
         __m128i res_lo_bg1, res_hi_bg1, res_lo_ra1, res_hi_ra1;
         __m128i res_lo0, res_hi0, res_lo1, res_hi1;
         const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w));
         const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
         __m128i r0 = _mm_and_si128(_mm_srli_epi16(in0, 1), pix_mask_r);
         __m128i g0 = _mm_and_si128(in0, pix_mask_g);
         __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_b);
         __m128i r1 = _mm_and_si128(_mm_srli_epi16(in1, 1), pix_mask_r);
         __m128i g1 = _mm_and_si128(in1, pix_mask_g);
         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);

         r0         = _mm_mulhi_epi16(r0, mul16_r);
         g0         = _mm_mulhi_epi16(g0, mul16_g);
         b0         = _mm_mulhi_epi16(b0, mul16_b);
         r1         = _mm_mulhi_epi16(r1, mul16_r);
         g1         = _mm_mulhi_epi16(g1, mul16_g);
         b1         = _mm_mulhi_epi16(b1, mul16_b);

         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
         res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
         res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
         res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
         res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);

         res_lo0    = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_hi0    = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_lo1    = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi1    = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r   = (col >> 11) & 0x1f;
         uint32_t g   = (col >>  5) & 0x3f;
         uint32_t b   = (col >>  0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);

         *out++ = b;
         *out++ = g;
         *out++ = r;
      }
   }
}

void conv_bgr24_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint8_t *input = (const uint8_t*)input_;
   uint32_t *output     = (uint32_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride)
   {
      const uint8_t *inp = input;
      for (w = 0; w < width; w++)
      {
         uint32_t b = *inp++;
         uint32_t g = *inp++;
         uint32_t r = *inp++;
         output[w]  = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}

void conv_bgr24_rgb565(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint8_t *input = (const uint8_t*)input_;
   uint16_t *output     = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride)
   {
      const uint8_t *inp = input;
      for (w = 0; w < width; w++)
      {
         uint16_t b = *inp++;
         uint16_t g = *inp++;
         uint16_t r = *inp++;

         output[w] = ((r & 0x00F8) << 8) | ((g & 0x00FC) << 3) | ((b & 0x00F8) >> 3);
      }
   }
}

void conv_argb8888_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint32_t *input = (const uint32_t*)input_;
   uint16_t *output      = (uint16_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 1, input += in_stride >> 2)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint16_t r   = (col >> 19) & 0x1f;
         uint16_t g   = (col >> 11) & 0x1f;
         uint16_t b   = (col >>  3) & 0x1f;
         output[w]    = (r << 10) | (g << 5) | (b << 0);
      }
   }
}

void conv_argb8888_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint32_t *input = (const uint32_t*)input_;
   uint8_t *output       = (uint8_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 2)
   {
      uint8_t *out = output;
      int        w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i l0 = _mm_loadu_si128((const __m128i*)(input + w +  0));
         __m128i l1 = _mm_loadu_si128((const __m128i*)(input + w +  4));
         __m128i l2 = _mm_loadu_si128((const __m128i*)(input + w +  8));
         __m128i l3 = _mm_loadu_si128((const __m128i*)(input + w + 12));
         store_bgr24_sse2(out, l0, l1, l2, l3);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         *out++       = (uint8_t)(col >>  0);
         *out++       = (uint8_t)(col >>  8);
         *out++       = (uint8_t)(col >> 16);
      }
   }
}

#if defined(__SSE2__)
static INLINE __m128i conv_shuffle_rb_epi32(__m128i c)
{
   /* SSSE3 plz */
   const __m128i b_mask = _mm_set1_epi32(0x000000ff);
   const __m128i g_mask = _mm_set1_epi32(0x0000ff00);
   const __m128i r_mask = _mm_set1_epi32(0x00ff0000);
   __m128i sl = _mm_and_si128(_mm_slli_epi32(c, 16), r_mask);
   __m128i sr = _mm_and_si128(_mm_srli_epi32(c, 16), b_mask);
   __m128i g  = _mm_and_si128(c, g_mask);
   __m128i rb = _mm_or_si128(sl, sr);
   return _mm_or_si128(g, rb);
}
#endif
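
/* On SSSE3 the red/blue swap above collapses to a single byte shuffle; a
 * sketch of that variant (not used here, name hypothetical) would be:
 *
 *    static INLINE __m128i conv_shuffle_rb_ssse3(__m128i c)
 *    {
 *       const __m128i swap = _mm_set_epi8(15, 12, 13, 14, 11,  8,  9, 10,
 *                                          7,  4,  5,  6,  3,  0,  1,  2);
 *       return _mm_shuffle_epi8(c, swap);
 *    }
 *
 * requiring <tmmintrin.h>; the SSE2-only version keeps the baseline lower. */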

void conv_abgr8888_bgr24(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint32_t *input = (const uint32_t*)input_;
   uint8_t *output       = (uint8_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 15;
#endif

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride >> 2)
   {
      uint8_t *out = output;
      int        w = 0;
#if defined(__SSE2__)
      for (; w < max_width; w += 16, out += 48)
      {
         __m128i a = _mm_loadu_si128((const __m128i*)(input + w +  0));
         __m128i b = _mm_loadu_si128((const __m128i*)(input + w +  4));
         __m128i c = _mm_loadu_si128((const __m128i*)(input + w +  8));
         __m128i d = _mm_loadu_si128((const __m128i*)(input + w + 12));
         a = conv_shuffle_rb_epi32(a);
         b = conv_shuffle_rb_epi32(b);
         c = conv_shuffle_rb_epi32(c);
         d = conv_shuffle_rb_epi32(d);
         store_bgr24_sse2(out, a, b, c, d);
      }
#endif

      for (; w < width; w++)
      {
         uint32_t col = input[w];
         *out++       = (uint8_t)(col >> 16);
         *out++       = (uint8_t)(col >>  8);
         *out++       = (uint8_t)(col >>  0);
      }
   }
}

void conv_argb8888_abgr8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h, w;
   const uint32_t *input = (const uint32_t*)input_;
   uint32_t *output      = (uint32_t*)output_;

   for (h = 0; h < height;
         h++, output += out_stride >> 2, input += in_stride >> 2)
   {
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         output[w]    = ((col << 16) & 0xff0000) |
            ((col >> 16) & 0xff) | (col & 0xff00ff00);
      }
   }
}

#define YUV_SHIFT 6
#define YUV_OFFSET (1 << (YUV_SHIFT - 1))
#define YUV_MAT_Y (1 << 6)
#define YUV_MAT_U_G (-22)
#define YUV_MAT_U_B (113)
#define YUV_MAT_V_R (90)
#define YUV_MAT_V_G (-46)
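
/* These constants appear to be the BT.601 full-range YCbCr-to-RGB matrix in
 * Q6 fixed point (divide by 64):
 *    R = Y + 1.402 * V                 (~  90 / 64)
 *    G = Y - 0.344 * U - 0.714 * V     (~ -22 / 64 and -46 / 64)
 *    B = Y + 1.772 * U                 (~ 113 / 64)
 * with U and V re-centred around zero by subtracting 128 first. */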

void conv_yuyv_argb8888(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   const uint8_t *input        = (const uint8_t*)input_;
   uint32_t *output            = (uint32_t*)output_;

#if defined(__SSE2__)
   const __m128i mask_y        = _mm_set1_epi16(0xffu);
   const __m128i mask_u        = _mm_set1_epi32(0xffu << 8);
   const __m128i mask_v        = _mm_set1_epi32(0xffu << 24);
   const __m128i chroma_offset = _mm_set1_epi16(128);
   const __m128i round_offset  = _mm_set1_epi16(YUV_OFFSET);

   const __m128i yuv_mul       = _mm_set1_epi16(YUV_MAT_Y);
   const __m128i u_g_mul       = _mm_set1_epi16(YUV_MAT_U_G);
   const __m128i u_b_mul       = _mm_set1_epi16(YUV_MAT_U_B);
   const __m128i v_r_mul       = _mm_set1_epi16(YUV_MAT_V_R);
   const __m128i v_g_mul       = _mm_set1_epi16(YUV_MAT_V_G);
   const __m128i a             = _mm_cmpeq_epi16(
         _mm_setzero_si128(), _mm_setzero_si128());
#endif

   for (h = 0; h < height; h++, output += out_stride >> 2, input += in_stride)
   {
      const uint8_t *src = input;
      uint32_t      *dst = output;
      int              w = 0;

#if defined(__SSE2__)
      /* Each iteration processes 16 pixels. */
      for (; w + 16 <= width; w += 16, src += 32, dst += 16)
      {
         __m128i u, v, u0_g, u1_g, u0_b, u1_b, v0_r, v1_r, v0_g, v1_g,
                 r0, g0, b0, r1, g1, b1;
         __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         __m128i res0, res1, res2, res3;
         __m128i yuv0 = _mm_loadu_si128((const __m128i*)(src +  0)); /* [Y0, U0, Y1, V0, Y2, U1, Y3, V1, ...] */
         __m128i yuv1 = _mm_loadu_si128((const __m128i*)(src + 16)); /* [Y8, U4, Y9, V4, Y10, U5, Y11, V5, ...] */

         __m128i _y0 = _mm_and_si128(yuv0, mask_y); /* [Y0, Y1, Y2, ...] (16-bit) */
         __m128i u0 = _mm_and_si128(yuv0, mask_u); /* [0, U0, 0, 0, 0, U1, 0, 0, ...] */
         __m128i v0 = _mm_and_si128(yuv0, mask_v); /* [0, 0, 0, V0, 0, 0, 0, V1, ...] */
         __m128i _y1 = _mm_and_si128(yuv1, mask_y); /* [Y8, Y9, Y10, ...] (16-bit) */
         __m128i u1 = _mm_and_si128(yuv1, mask_u); /* [0, U4, 0, 0, 0, U5, 0, 0, ...] */
         __m128i v1 = _mm_and_si128(yuv1, mask_v); /* [0, 0, 0, V4, 0, 0, 0, V5, ...] */

         /* Juggle around to get U and V in the same 16-bit format as Y. */
         u0 = _mm_srli_si128(u0, 1);
         v0 = _mm_srli_si128(v0, 3);
         u1 = _mm_srli_si128(u1, 1);
         v1 = _mm_srli_si128(v1, 3);
         u = _mm_packs_epi32(u0, u1);
         v = _mm_packs_epi32(v0, v1);

         /* Apply the chroma offset: (U, V) -= (128, 128). */
         u = _mm_sub_epi16(u, chroma_offset);
         v = _mm_sub_epi16(v, chroma_offset);

         /* Upscale chroma horizontally (nearest). */
         u0 = _mm_unpacklo_epi16(u, u);
         u1 = _mm_unpackhi_epi16(u, u);
         v0 = _mm_unpacklo_epi16(v, v);
         v1 = _mm_unpackhi_epi16(v, v);

         /* Apply transformations. */
         _y0 = _mm_mullo_epi16(_y0, yuv_mul);
         _y1 = _mm_mullo_epi16(_y1, yuv_mul);
         u0_g   = _mm_mullo_epi16(u0, u_g_mul);
         u1_g   = _mm_mullo_epi16(u1, u_g_mul);
         u0_b   = _mm_mullo_epi16(u0, u_b_mul);
         u1_b   = _mm_mullo_epi16(u1, u_b_mul);
         v0_r   = _mm_mullo_epi16(v0, v_r_mul);
         v1_r   = _mm_mullo_epi16(v1, v_r_mul);
         v0_g   = _mm_mullo_epi16(v0, v_g_mul);
         v1_g   = _mm_mullo_epi16(v1, v_g_mul);

         /* Add contributions from the transformed components. */
         r0 = _mm_srai_epi16(_mm_adds_epi16(_mm_adds_epi16(_y0, v0_r),
                  round_offset), YUV_SHIFT);
         g0 = _mm_srai_epi16(_mm_adds_epi16(
                  _mm_adds_epi16(_mm_adds_epi16(_y0, v0_g), u0_g), round_offset), YUV_SHIFT);
         b0 = _mm_srai_epi16(_mm_adds_epi16(
                  _mm_adds_epi16(_y0, u0_b), round_offset), YUV_SHIFT);

         r1 = _mm_srai_epi16(_mm_adds_epi16(
                  _mm_adds_epi16(_y1, v1_r), round_offset), YUV_SHIFT);
         g1 = _mm_srai_epi16(_mm_adds_epi16(
                  _mm_adds_epi16(_mm_adds_epi16(_y1, v1_g), u1_g), round_offset), YUV_SHIFT);
         b1 = _mm_srai_epi16(_mm_adds_epi16(
                  _mm_adds_epi16(_y1, u1_b), round_offset), YUV_SHIFT);

         /* Saturate into 8-bit. */
         r0 = _mm_packus_epi16(r0, r1);
         g0 = _mm_packus_epi16(g0, g1);
         b0 = _mm_packus_epi16(b0, b1);

         /* Interleave into ARGB. */
         res_lo_bg = _mm_unpacklo_epi8(b0, g0);
         res_hi_bg = _mm_unpackhi_epi8(b0, g0);
         res_lo_ra = _mm_unpacklo_epi8(r0, a);
         res_hi_ra = _mm_unpackhi_epi8(r0, a);
         res0 = _mm_unpacklo_epi16(res_lo_bg, res_lo_ra);
         res1 = _mm_unpackhi_epi16(res_lo_bg, res_lo_ra);
         res2 = _mm_unpacklo_epi16(res_hi_bg, res_hi_ra);
         res3 = _mm_unpackhi_epi16(res_hi_bg, res_hi_ra);

         _mm_storeu_si128((__m128i*)(dst +  0), res0);
         _mm_storeu_si128((__m128i*)(dst +  4), res1);
         _mm_storeu_si128((__m128i*)(dst +  8), res2);
         _mm_storeu_si128((__m128i*)(dst + 12), res3);
      }
#endif

      /* Finish off the rest (if any) in C. */
      for (; w < width; w += 2, src += 4, dst += 2)
      {
         int _y0    = src[0];
         int  u     = src[1] - 128;
         int _y1    = src[2];
         int  v     = src[3] - 128;

         uint8_t r0 = clamp_8bit((YUV_MAT_Y * _y0 +                   YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t g0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t b0 = clamp_8bit((YUV_MAT_Y * _y0 + YUV_MAT_U_B * u                   + YUV_OFFSET) >> YUV_SHIFT);

         uint8_t r1 = clamp_8bit((YUV_MAT_Y * _y1 +                   YUV_MAT_V_R * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
         uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u                   + YUV_OFFSET) >> YUV_SHIFT);

         dst[0]     = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
         dst[1]     = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
      }
   }
}

void conv_copy(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   int h;
   int copy_len         = abs(out_stride);
   const uint8_t *input = (const uint8_t*)input_;
   uint8_t *output      = (uint8_t*)output_;

   if (abs(in_stride) < copy_len)
      copy_len          = abs(in_stride);

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride)
      memcpy(output, input, copy_len);
}
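
/* conv_copy() clamps the copied length to the smaller stride magnitude, so
 * it also accepts negative strides.  A hypothetical caller could flip an
 * image vertically by starting at the last row of the destination:
 *
 *    conv_copy((uint8_t*)dst + (size_t)(height - 1) * pitch, src,
 *          width, height, -pitch, pitch);
 */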