1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #include <assert.h>
14 #include <stdio.h>
15 #include <string.h>  // For memcpy and memset.
16 
17 #include "libyuv/basic_types.h"
18 #include "libyuv/convert_argb.h"  // For kYuvI601Constants
19 
20 #ifdef __cplusplus
21 namespace libyuv {
22 extern "C" {
23 #endif
24 
// This macro controls YUV to RGB conversion using unsigned math, extending
// the range of the YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for
// more accuracy on B:
27 // LIBYUV_UNLIMITED_DATA
28 
29 // The following macro from row_win makes the C code match the row_win code,
30 // which is 7 bit fixed point for ARGBToI420:
// When defined, LIBYUV_RGB7 selects the 7 bit RGBToY / RGBToYJ variants
// defined later in this file instead of the 8 bit ones.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define LIBYUV_RGB7 1
#endif
35 
// x86 / x64 targets:
// - LIBYUV_ARGBTOUV_PAVGB selects the row generators below that average
//   pixels with AVGB before converting to U/V.
// - LIBYUV_RGBTOU_TRUNCATE selects the RGBToU / RGBToV variants that add
//   0x8000 (no rounding term) rather than 0x8080.
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
    defined(_M_IX86)
#define LIBYUV_ARGBTOUV_PAVGB 1
#define LIBYUV_RGBTOU_TRUNCATE 1
#endif
41 
42 // llvm x86 is poor at ternary operator, so use branchless min/max.
43 
44 #define USE_BRANCHLESS 1
45 #if USE_BRANCHLESS
// Branchless max(v, 0): the comparison yields 0 or 1, and negating it gives
// an all-zeros or all-ones mask that either clears v or passes it through.
static __inline int32_t clamp0(int32_t v) {
  const int32_t keep_mask = -(int32_t)(v >= 0);
  return v & keep_mask;
}
49 // TODO(fbarchard): make clamp255 preserve negative values.
// Branchless upper clamp to 255. Values >= 255 produce an all-ones mask that
// saturates the result; the final "& 255" means negative inputs are NOT
// preserved (they come back as their low 8 bits).
static __inline int32_t clamp255(int32_t v) {
  const int32_t saturate = -(int32_t)(v >= 255);
  return (v | saturate) & 255;
}
53 
// Branchless upper clamp to 1023 (10 bit maximum). Values >= 1023 saturate;
// the final "& 1023" means negative inputs are masked to their low 10 bits
// rather than preserved.
static __inline int32_t clamp1023(int32_t v) {
  const int32_t saturate = -(int32_t)(v >= 1023);
  return (v | saturate) & 1023;
}
57 
// Clamp to max: returns max when v >= max, otherwise v unchanged (negative
// values included), matching the non-branchless ClampMax.
// The result is selected with an all-ones / all-zeros mask, so the code stays
// branch free and, unlike masking with max itself, is correct for any max and
// not only for values of the form 2^n - 1.
static __inline int32_t ClampMax(int32_t v, int32_t max) {
  const int32_t use_max = -(int32_t)(v >= max);  // -1 when v >= max, else 0.
  return (v & ~use_max) | (max & use_max);
}
62 
// Branchless absolute value, returned as unsigned.
// All arithmetic is done on uint32_t so that INT32_MIN is handled without
// signed overflow: Abs(INT32_MIN) yields 0x80000000.
static __inline uint32_t Abs(int32_t v) {
  // 0xffffffff when v is negative, 0 otherwise.
  const uint32_t sign = 0u - (uint32_t)(v < 0);
  // Two's complement negate when negative: (v - 1) ^ ~0 == -v.
  return ((uint32_t)v + sign) ^ sign;
}
67 #else   // USE_BRANCHLESS
// Returns v with negative values raised to 0.
static __inline int32_t clamp0(int32_t v) {
  if (v < 0) {
    return 0;
  }
  return v;
}
71 
// Returns v limited to at most 255; values below that (including negative
// ones) pass through unchanged.
static __inline int32_t clamp255(int32_t v) {
  if (v > 255) {
    return 255;
  }
  return v;
}
75 
// Returns v limited to at most 1023; values below that (including negative
// ones) pass through unchanged.
static __inline int32_t clamp1023(int32_t v) {
  if (v > 1023) {
    return 1023;
  }
  return v;
}
79 
// Returns v limited to at most max; smaller values pass through unchanged.
static __inline int32_t ClampMax(int32_t v, int32_t max) {
  if (v > max) {
    return max;
  }
  return v;
}
83 
// Absolute value, returned as unsigned.
// The negation is performed on uint32_t so that INT32_MIN is handled without
// signed overflow: Abs(INT32_MIN) yields 0x80000000.
static __inline uint32_t Abs(int32_t v) {
  const uint32_t bits = (uint32_t)v;
  return (v < 0) ? (0u - bits) : bits;
}
87 #endif  // USE_BRANCHLESS
// Clamps val for 8 bit output: the low side is limited with clamp0 first,
// then the high side with clamp255.
static __inline uint32_t Clamp(int32_t val) {
  return (uint32_t)clamp255(clamp0(val));
}
92 
// Clamps val for 10 bit output: the low side is limited with clamp0 first,
// then the high side with clamp1023.
static __inline uint32_t Clamp10(int32_t val) {
  return (uint32_t)clamp1023(clamp0(val));
}
97 
98 // Little Endian
// WRITEWORD(p, v) stores the 32 bit value v at p with the least significant
// byte first. p does not need to be 4 byte aligned.
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
    defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) ||     \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
// Little endian target: the native representation already has the required
// byte order, so copy it out. memcpy avoids the misaligned / aliasing store
// that a cast to uint32_t* would perform.
static __inline void WRITEWORD(uint8_t* p, uint32_t v) {
  memcpy(p, &v, sizeof(v));
}
#else
// Other targets: emit the bytes explicitly, least significant first.
static inline void WRITEWORD(uint8_t* p, uint32_t v) {
  p[0] = (uint8_t)(v & 255);
  p[1] = (uint8_t)((v >> 8) & 255);
  p[2] = (uint8_t)((v >> 16) & 255);
  p[3] = (uint8_t)((v >> 24) & 255);
}
#endif
111 
// Expands a row of 3 byte pixels (bytes read as B, G, R) into 4 byte pixels
// (bytes written as B, G, R, A) with alpha set to 255.
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Read the whole source pixel before writing anything.
    const uint8_t blue = src_rgb24[0];
    const uint8_t green = src_rgb24[1];
    const uint8_t red = src_rgb24[2];
    src_rgb24 += 3;
    dst_argb[0] = blue;
    dst_argb[1] = green;
    dst_argb[2] = red;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
126 
// Expands a row of 3 byte pixels (bytes read as R, G, B) into 4 byte pixels
// (bytes written as B, G, R, A) with alpha set to 255.
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Read the whole source pixel before writing anything.
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    src_raw += 3;
    dst_argb[0] = blue;
    dst_argb[1] = green;
    dst_argb[2] = red;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
141 
// Expands a row of 3 byte pixels (bytes read as R, G, B) into 4 byte pixels
// (bytes written as A, B, G, R) with alpha set to 255.
void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Read the whole source pixel before writing anything.
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    src_raw += 3;
    dst_rgba[0] = 255u;
    dst_rgba[1] = blue;
    dst_rgba[2] = green;
    dst_rgba[3] = red;
    dst_rgba += 4;
  }
}
156 
// Swaps the first and third byte of every 3 byte pixel in a row
// (R, G, B in -> B, G, R out).
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // All three bytes are read before any is written.
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    src_raw += 3;
    dst_rgb24[0] = blue;
    dst_rgb24[1] = green;
    dst_rgb24[2] = red;
    dst_rgb24 += 3;
  }
}
170 
// Expands a row of 16 bit 5:6:5 pixels (low byte first; blue in the low
// 5 bits, green in the middle 6, red in the top 5) into 4 byte B, G, R, A
// pixels with alpha 255. Each field is widened to 8 bits by replicating its
// top bits into the low bits.
void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
                       uint8_t* dst_argb,
                       int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const unsigned packed =
        (unsigned)src_rgb565[0] | ((unsigned)src_rgb565[1] << 8);
    const unsigned blue5 = packed & 0x1f;
    const unsigned green6 = (packed >> 5) & 0x3f;
    const unsigned red5 = packed >> 11;
    src_rgb565 += 2;
    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
    dst_argb[1] = (uint8_t)((green6 << 2) | (green6 >> 4));
    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
187 
// Expands a row of 16 bit 1:5:5:5 pixels (low byte first; blue in bits 0-4,
// green in bits 5-9, red in bits 10-14, alpha in bit 15) into 4 byte
// B, G, R, A pixels. Colour fields are widened to 8 bits by bit replication;
// the single alpha bit becomes 0 or 255.
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
                         uint8_t* dst_argb,
                         int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const unsigned packed =
        (unsigned)src_argb1555[0] | ((unsigned)src_argb1555[1] << 8);
    const unsigned blue5 = packed & 0x1f;
    const unsigned green5 = (packed >> 5) & 0x1f;
    const unsigned red5 = (packed >> 10) & 0x1f;
    const unsigned alpha1 = packed >> 15;
    src_argb1555 += 2;
    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
    dst_argb[1] = (uint8_t)((green5 << 3) | (green5 >> 2));
    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
    dst_argb[3] = (uint8_t)(alpha1 ? 255u : 0u);
    dst_argb += 4;
  }
}
205 
// Expands a row of 16 bit 4:4:4:4 pixels into 4 byte B, G, R, A pixels.
// Byte 0 holds blue (low nibble) and green (high nibble); byte 1 holds red
// (low nibble) and alpha (high nibble). Each nibble n becomes the byte
// (n << 4) | n, i.e. n * 0x11.
void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
                         uint8_t* dst_argb,
                         int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const unsigned lo = src_argb4444[0];
    const unsigned hi = src_argb4444[1];
    src_argb4444 += 2;
    dst_argb[0] = (uint8_t)((lo & 0x0f) * 0x11);
    dst_argb[1] = (uint8_t)((lo >> 4) * 0x11);
    dst_argb[2] = (uint8_t)((hi & 0x0f) * 0x11);
    dst_argb[3] = (uint8_t)((hi >> 4) * 0x11);
    dst_argb += 4;
  }
}
223 
// Converts a row of 32 bit 2:10:10:10 pixels to 32 bit 8:8:8:8 pixels.
// Each source pixel is loaded as a native uint32_t with 10 bit fields at
// bits 0-9, 10-19 and 20-29 and a 2 bit field at bits 30-31. The top 8 bits
// of every 10 bit field are kept, and the 2 bit field is scaled to 8 bits.
// Both the load and the store go through memcpy, so neither pointer needs to
// be 4 byte aligned.
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t argb;
    memcpy(&ar30, src_ar30, sizeof ar30);
    {
      const uint32_t b = (ar30 >> 2) & 0xff;
      const uint32_t g = (ar30 >> 12) & 0xff;
      const uint32_t r = (ar30 >> 22) & 0xff;
      const uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
      argb = b | (g << 8) | (r << 16) | (a << 24);
    }
    memcpy(dst_argb, &argb, sizeof argb);
    dst_argb += 4;
    src_ar30 += 4;
  }
}
238 
// Converts a row of 32 bit 2:10:10:10 pixels to 32 bit 8:8:8:8 pixels,
// swapping the first and third colour fields relative to AR30ToARGBRow_C:
// the field at source bits 20-29 lands in the low output byte and the field
// at source bits 0-9 lands in bits 16-23.
// Both the load and the store go through memcpy, so neither pointer needs to
// be 4 byte aligned.
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t abgr;
    memcpy(&ar30, src_ar30, sizeof ar30);
    {
      const uint32_t b = (ar30 >> 2) & 0xff;
      const uint32_t g = (ar30 >> 12) & 0xff;
      const uint32_t r = (ar30 >> 22) & 0xff;
      const uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
      abgr = r | (g << 8) | (b << 16) | (a << 24);
    }
    memcpy(dst_abgr, &abgr, sizeof abgr);
    dst_abgr += 4;
    src_ar30 += 4;
  }
}
253 
// Swaps the 10 bit fields at bits 0-9 and bits 20-29 of every 32 bit pixel
// in a row; the field at bits 10-19 and the 2 bit field at bits 30-31 are
// kept in place (mask 0xc00ffc00).
// Both the load and the store go through memcpy, so neither pointer needs to
// be 4 byte aligned.
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t ab30;
    memcpy(&ar30, src_ar30, sizeof ar30);
    {
      const uint32_t b = ar30 & 0x3ff;
      const uint32_t ga = ar30 & 0xc00ffc00;
      const uint32_t r = (ar30 >> 20) & 0x3ff;
      ab30 = r | ga | (b << 20);
    }
    memcpy(dst_ab30, &ab30, sizeof ab30);
    dst_ab30 += 4;
    src_ar30 += 4;
  }
}
267 
// Packs a row of 4 byte pixels into 3 byte pixels by copying the first three
// bytes (B, G, R) of each pixel and dropping the fourth.
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // All three bytes are read before any is written.
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    src_argb += 4;
    dst_rgb[0] = blue;
    dst_rgb[1] = green;
    dst_rgb[2] = red;
    dst_rgb += 3;
  }
}
281 
// Packs a row of 4 byte pixels into 3 byte pixels: the first three source
// bytes (B, G, R) are written in reverse order (R, G, B) and the fourth
// source byte is dropped.
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // All three bytes are read before any is written.
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    src_argb += 4;
    dst_rgb[0] = red;
    dst_rgb[1] = green;
    dst_rgb[2] = blue;
    dst_rgb += 3;
  }
}
295 
// Packs a row of 4 byte B, G, R, A pixels into 16 bit 5:6:5 pixels, keeping
// the top 5 / 6 / 5 bits of blue / green / red and dropping alpha.
// Each 16 bit pixel is written low byte first, one byte at a time, so the
// output layout does not depend on host endianness and dst_rgb does not need
// to be aligned. The packing is done in uint32_t to avoid shifting into the
// sign bit of int.
void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b = src_argb[0] >> 3;
    const uint32_t g = src_argb[1] >> 2;
    const uint32_t r = src_argb[2] >> 3;
    const uint32_t rgb565 = b | (g << 5) | (r << 11);
    dst_rgb[0] = (uint8_t)(rgb565 & 0xff);
    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
317 
318 // dither4 is a row of 4 values from 4x4 dither matrix.
319 // The 4x4 matrix contains values to increase RGB.  When converting to
320 // fewer bits (565) this provides an ordered dither.
321 // The order in the 4x4 matrix in first byte is upper left.
322 // The 4 values are passed as an int, then referenced as an array, so
323 // endian will not affect order of the original matrix.  But the dither4
// will contain the first pixel in the lower byte for little endian
325 // or the upper byte for big endian.
// Packs a row of 4 byte B, G, R, A pixels into 16 bit 5:6:5 pixels, adding a
// per-column dither value to every colour channel (saturating at 255 via
// clamp255) before the low bits are dropped. dither4 carries 4 dither bytes;
// the byte used for column x is byte (x & 3) of dither4 in memory order.
// Pixels are processed in pairs and stored with WRITEWORD; an odd trailing
// pixel is written low byte first, matching the byte order WRITEWORD
// produces, without requiring dst_rgb to be aligned. Packing is done in
// uint32_t to avoid shifting into the sign bit of int.
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
                             uint8_t* dst_rgb,
                             const uint32_t dither4,
                             int width) {
  const unsigned char* dither = (const unsigned char*)(&dither4);
  int x;
  for (x = 0; x < width - 1; x += 2) {
    const int dither0 = dither[x & 3];
    const int dither1 = dither[(x + 1) & 3];
    const uint32_t b0 = (uint32_t)clamp255(src_argb[0] + dither0) >> 3;
    const uint32_t g0 = (uint32_t)clamp255(src_argb[1] + dither0) >> 2;
    const uint32_t r0 = (uint32_t)clamp255(src_argb[2] + dither0) >> 3;
    const uint32_t b1 = (uint32_t)clamp255(src_argb[4] + dither1) >> 3;
    const uint32_t g1 = (uint32_t)clamp255(src_argb[5] + dither1) >> 2;
    const uint32_t r1 = (uint32_t)clamp255(src_argb[6] + dither1) >> 3;
    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
                           (r1 << 27));
    dst_rgb += 4;
    src_argb += 8;
  }
  if (width & 1) {
    const int dither0 = dither[(width - 1) & 3];
    const uint32_t b0 = (uint32_t)clamp255(src_argb[0] + dither0) >> 3;
    const uint32_t g0 = (uint32_t)clamp255(src_argb[1] + dither0) >> 2;
    const uint32_t r0 = (uint32_t)clamp255(src_argb[2] + dither0) >> 3;
    const uint32_t rgb565 = b0 | (g0 << 5) | (r0 << 11);
    dst_rgb[0] = (uint8_t)(rgb565 & 0xff);
    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
  }
}
353 
// Packs a row of 4 byte B, G, R, A pixels into 16 bit 1:5:5:5 pixels: the
// top 5 bits of blue, green and red go to bits 0-4, 5-9 and 10-14, and the
// top bit of alpha goes to bit 15.
// Each 16 bit pixel is written low byte first, one byte at a time, which is
// the layout ARGB1555ToARGBRow_C reads; the output therefore does not depend
// on host endianness and dst_rgb does not need to be aligned. Packing is done
// in uint32_t to avoid shifting into the sign bit of int.
void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b = src_argb[0] >> 3;
    const uint32_t g = src_argb[1] >> 3;
    const uint32_t r = src_argb[2] >> 3;
    const uint32_t a = src_argb[3] >> 7;
    const uint32_t argb1555 = b | (g << 5) | (r << 10) | (a << 15);
    dst_rgb[0] = (uint8_t)(argb1555 & 0xff);
    dst_rgb[1] = (uint8_t)(argb1555 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
378 
// Packs a row of 4 byte B, G, R, A pixels into 16 bit 4:4:4:4 pixels by
// keeping the top nibble of every channel: blue in bits 0-3, green in 4-7,
// red in 8-11 and alpha in 12-15.
// Each 16 bit pixel is written low byte first, one byte at a time, which is
// the layout ARGB4444ToARGBRow_C reads; the output therefore does not depend
// on host endianness and dst_rgb does not need to be aligned. Packing is done
// in uint32_t to avoid overflowing int.
void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b = src_argb[0] >> 4;
    const uint32_t g = src_argb[1] >> 4;
    const uint32_t r = src_argb[2] >> 4;
    const uint32_t a = src_argb[3] >> 4;
    const uint32_t argb4444 = b | (g << 4) | (r << 8) | (a << 12);
    dst_rgb[0] = (uint8_t)(argb4444 & 0xff);
    dst_rgb[1] = (uint8_t)(argb4444 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
403 
// Converts a row of 4 byte pixels to 32 bit 2:10:10:10 pixels.
// Source byte 2 goes to bits 0-9, byte 1 to bits 10-19, byte 0 to bits 20-29
// and the top 2 bits of byte 3 to bits 30-31. Each 8 bit channel is widened
// to 10 bits by appending its own top 2 bits.
// The result is stored as a native uint32_t through memcpy, so dst_ar30 does
// not need to be 4 byte aligned.
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t c0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
    const uint32_t c1 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
    const uint32_t c2 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
    const uint32_t a = (uint32_t)(src_abgr[3] >> 6);
    const uint32_t ar30 = c2 | (c1 << 10) | (c0 << 20) | (a << 30);
    memcpy(dst_ar30, &ar30, sizeof ar30);
    dst_ar30 += 4;
    src_abgr += 4;
  }
}
416 
// Converts a row of 4 byte pixels to 32 bit 2:10:10:10 pixels.
// Source byte 0 goes to bits 0-9, byte 1 to bits 10-19, byte 2 to bits 20-29
// and the top 2 bits of byte 3 to bits 30-31. Each 8 bit channel is widened
// to 10 bits by appending its own top 2 bits.
// The result is stored as a native uint32_t through memcpy, so dst_ar30 does
// not need to be 4 byte aligned.
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t c0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
    const uint32_t c1 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
    const uint32_t c2 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
    const uint32_t a = (uint32_t)(src_argb[3] >> 6);
    const uint32_t ar30 = c0 | (c1 << 10) | (c2 << 20) | (a << 30);
    memcpy(dst_ar30, &ar30, sizeof ar30);
    dst_ar30 += 4;
    src_argb += 4;
  }
}
429 
// Widens a row of 4 x 8 bit pixels to 4 x 16 bit pixels, keeping channel
// order. Each byte v becomes the 16 bit value with v in both halves
// (v * 0x0101).
void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    int c;
    for (c = 0; c < 4; ++c) {
      const unsigned v = src_argb[c];
      dst_ar64[c] = (uint16_t)((v << 8) | v);
    }
    dst_ar64 += 4;
    src_argb += 4;
  }
}
441 
// Widens a row of 4 x 8 bit pixels to 4 x 16 bit pixels while swapping
// channels 0 and 2. Each byte v becomes the 16 bit value with v in both
// halves (v * 0x0101).
void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
  // Source channel feeding each destination channel.
  static const int kSrcChannel[4] = {2, 1, 0, 3};
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    int c;
    for (c = 0; c < 4; ++c) {
      const unsigned v = src_argb[kSrcChannel[c]];
      dst_ab64[c] = (uint16_t)((v << 8) | v);
    }
    dst_ab64 += 4;
    src_argb += 4;
  }
}
453 
// Narrows a row of 4 x 16 bit pixels to 4 x 8 bit pixels, keeping channel
// order and the high byte of every channel.
void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = (uint8_t)(src_ar64[c] >> 8);
    }
    dst_argb += 4;
    src_ar64 += 4;
  }
}
465 
// Narrows a row of 4 x 16 bit pixels to 4 x 8 bit pixels while swapping
// channels 0 and 2; the high byte of every channel is kept.
void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
  // Source channel feeding each destination channel.
  static const int kSrcChannel[4] = {2, 1, 0, 3};
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = (uint8_t)(src_ab64[kSrcChannel[c]] >> 8);
    }
    dst_argb += 4;
    src_ab64 += 4;
  }
}
477 
478 // TODO(fbarchard): Make shuffle compatible with SIMD versions
// Reorders the four 16 bit channels of each pixel in a row.
// shuffler holds byte offsets; entries 0, 2, 4 and 6 are halved to get the
// source channel index for destination channels 0..3.
// Note that the loop runs width / 2 times, each iteration handling one group
// of four 16 bit values.
void AR64ShuffleRow_C(const uint8_t* src_ar64,
                      uint8_t* dst_ar64,
                      const uint8_t* shuffler,
                      int width) {
  const uint16_t* in = (const uint16_t*)src_ar64;
  uint16_t* out = (uint16_t*)dst_ar64;
  const int pick0 = shuffler[0] / 2;
  const int pick1 = shuffler[2] / 2;
  const int pick2 = shuffler[4] / 2;
  const int pick3 = shuffler[6] / 2;
  const int groups = width / 2;
  int i;
  for (i = 0; i < groups; ++i) {
    // Gather all four channels before storing any, so that src and dst may
    // be the same buffer.
    const uint16_t c0 = in[pick0];
    const uint16_t c1 = in[pick1];
    const uint16_t c2 = in[pick2];
    const uint16_t c3 = in[pick3];
    in += 4;
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
    out[3] = c3;
    out += 4;
  }
}
505 
506 #ifdef LIBYUV_RGB7
507 // Old 7 bit math for compatibility on unsupported platforms.
// 7 bit fixed point luma: weights 33 / 65 / 13 for r / g / b, scaled down by
// 128, plus an offset of 16.
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 33 * r + 65 * g + 13 * b;
  return 16 + (weighted >> 7);
}
511 #else
512 // 8 bit
513 // Intel SSE/AVX uses the following equivalent formula
514 // 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round.
515 //  return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
516 //  0x7e80) >> 8;
517 
// 8 bit fixed point luma: weights 66 / 129 / 25 for r / g / b. The constant
// 0x1080 combines the +16 offset (0x1000 before the shift) with 0x80 for
// rounding.
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 66 * r + 129 * g + 25 * b;
  return (weighted + 0x1080) >> 8;
}
521 #endif
522 
// Average of a and b with ties rounded up (1 is added before halving).
// Each argument is evaluated exactly once.
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
524 
525 #ifdef LIBYUV_RGBTOU_TRUNCATE
// 8 bit fixed point U: weights -38 / -74 / +112 for r / g / b, biased by
// 0x8000 (128 after the shift) with no rounding term.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * b - 74 * g - 38 * r;
  return (weighted + 0x8000) >> 8;
}
// 8 bit fixed point V: weights +112 / -94 / -18 for r / g / b, biased by
// 0x8000 (128 after the shift) with no rounding term.
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return (weighted + 0x8000) >> 8;
}
532 #else
533 // TODO(fbarchard): Add rounding to SIMD and use this
// 8 bit fixed point U: weights -38 / -74 / +112 for r / g / b, biased by
// 0x8080 (128 after the shift, plus 0x80 for rounding).
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * b - 74 * g - 38 * r;
  return (weighted + 0x8080) >> 8;
}
// 8 bit fixed point V: weights +112 / -94 / -18 for r / g / b, biased by
// 0x8080 (128 after the shift, plus 0x80 for rounding).
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return (weighted + 0x8080) >> 8;
}
540 #endif
541 
542 #if !defined(LIBYUV_ARGBTOUV_PAVGB)
// U from channel values that are twice the averaged pixel value: the
// RGBToU weights (112 / 74 / 38) are halved to compensate. Biased by 0x8080.
static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
  const int kBlue = 112 / 2;
  const int kGreen = 74 / 2;
  const int kRed = 38 / 2;
  const int weighted = kBlue * b - kGreen * g - kRed * r;
  return (weighted + 0x8080) >> 8;
}
// V from channel values that are twice the averaged pixel value: the
// RGBToV weights (112 / 94 / 18) are halved to compensate. Biased by 0x8080.
static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
  const int kRed = 112 / 2;
  const int kGreen = 94 / 2;
  const int kBlue = 18 / 2;
  const int weighted = kRed * r - kGreen * g - kBlue * b;
  return (weighted + 0x8080) >> 8;
}
549 #endif
550 
551 // ARGBToY_C and ARGBToUV_C
// The Intel version mimics SSE/AVX, which does 2 pavgb
553 #if LIBYUV_ARGBTOUV_PAVGB
554 
// MAKEROWY(NAME, R, G, B, BPP) defines two row functions for a packed RGB
// layout, where R, G and B are the byte offsets of the channels inside a
// pixel and BPP is the number of bytes per pixel:
//   NAME##ToYRow_C   writes one RGBToY value per pixel.
//   NAME##ToUVRow_C  reads two rows (src_rgb and src_rgb + src_stride_rgb)
//                    and writes one U and one V per pair of columns. Each
//                    2x2 block is reduced with nested AVGB calls: vertical
//                    pairs first, then the two results, rounding up at each
//                    step. When width is odd the last column is averaged
//                    vertically only.
#define MAKEROWY(NAME, R, G, B, BPP)                                       \
  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int x;                                                                 \
    for (x = 0; x < width; ++x) {                                          \
      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                      \
      dst_y += 1;                                                          \
    }                                                                      \
  }                                                                        \
  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                       uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                    \
    int x;                                                                 \
    for (x = 0; x < width - 1; x += 2) {                                   \
      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]),                     \
                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP]));        \
      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]),                     \
                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP]));        \
      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]),                     \
                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP]));        \
      dst_u[0] = RGBToU(ar, ag, ab);                                       \
      dst_v[0] = RGBToV(ar, ag, ab);                                       \
      src_rgb += BPP * 2;                                                  \
      src_rgb1 += BPP * 2;                                                 \
      dst_u += 1;                                                          \
      dst_v += 1;                                                          \
    }                                                                      \
    if (width & 1) {                                                       \
      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]);                          \
      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]);                          \
      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]);                          \
      dst_u[0] = RGBToU(ar, ag, ab);                                       \
      dst_v[0] = RGBToV(ar, ag, ab);                                       \
    }                                                                      \
  }
590 #else
591 // ARM version does sum / 2 then multiply by 2x smaller coefficients
// MAKEROWY(NAME, R, G, B, BPP) defines two row functions for a packed RGB
// layout, where R, G and B are the byte offsets of the channels inside a
// pixel and BPP is the number of bytes per pixel:
//   NAME##ToYRow_C   writes one RGBToY value per pixel.
//   NAME##ToUVRow_C  reads two rows (src_rgb and src_rgb + src_stride_rgb)
//                    and writes one U and one V per pair of columns. Each
//                    channel of a 2x2 block is summed, 1 is added and the
//                    total is halved, giving twice the average; RGB2xToU /
//                    RGB2xToV use halved coefficients to compensate. When
//                    width is odd the last column uses the plain sum of its
//                    two vertical samples, which is also twice the average.
#define MAKEROWY(NAME, R, G, B, BPP)                                       \
  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int x;                                                                 \
    for (x = 0; x < width; ++x) {                                          \
      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                      \
      dst_y += 1;                                                          \
    }                                                                      \
  }                                                                        \
  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                       uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                    \
    int x;                                                                 \
    for (x = 0; x < width - 1; x += 2) {                                   \
      uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] +         \
                     src_rgb1[B + BPP] + 1) >>                             \
                    1;                                                     \
      uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] +         \
                     src_rgb1[G + BPP] + 1) >>                             \
                    1;                                                     \
      uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] +         \
                     src_rgb1[R + BPP] + 1) >>                             \
                    1;                                                     \
      dst_u[0] = RGB2xToU(ar, ag, ab);                                     \
      dst_v[0] = RGB2xToV(ar, ag, ab);                                     \
      src_rgb += BPP * 2;                                                  \
      src_rgb1 += BPP * 2;                                                 \
      dst_u += 1;                                                          \
      dst_v += 1;                                                          \
    }                                                                      \
    if (width & 1) {                                                       \
      uint16_t ab = src_rgb[B] + src_rgb1[B];                              \
      uint16_t ag = src_rgb[G] + src_rgb1[G];                              \
      uint16_t ar = src_rgb[R] + src_rgb1[R];                              \
      dst_u[0] = RGB2xToU(ar, ag, ab);                                     \
      dst_v[0] = RGB2xToV(ar, ag, ab);                                     \
    }                                                                      \
  }
630 #endif
631 
// Instantiate the ToYRow_C / ToUVRow_C pair for each packed layout.
// Arguments: name prefix, byte offset of R, of G, of B, bytes per pixel.
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
MAKEROWY(ABGR, 0, 1, 2, 4)
MAKEROWY(RGBA, 3, 2, 1, 4)
MAKEROWY(RGB24, 2, 1, 0, 3)
MAKEROWY(RAW, 0, 1, 2, 3)
#undef MAKEROWY
639 
640 // JPeg uses a variation on BT.601-1 full range
641 // y =  0.29900 * r + 0.58700 * g + 0.11400 * b
642 // u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
643 // v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
644 // BT.601 Mpeg range uses:
645 // b 0.1016 * 255 = 25.908 = 25
646 // g 0.5078 * 255 = 129.489 = 129
647 // r 0.2578 * 255 = 65.739 = 66
648 // JPeg 7 bit Y (deprecated)
649 // b 0.11400 * 128 = 14.592 = 15
650 // g 0.58700 * 128 = 75.136 = 75
651 // r 0.29900 * 128 = 38.272 = 38
652 // JPeg 8 bit Y:
653 // b 0.11400 * 256 = 29.184 = 29
654 // g 0.58700 * 256 = 150.272 = 150
655 // r 0.29900 * 256 = 76.544 = 77
656 // JPeg 8 bit U:
657 // b  0.50000 * 255 = 127.5 = 127
658 // g -0.33126 * 255 = -84.4713 = -84
659 // r -0.16874 * 255 = -43.0287 = -43
660 // JPeg 8 bit V:
661 // b -0.08131 * 255 = -20.73405 = -20
662 // g -0.41869 * 255 = -106.76595 = -107
663 // r  0.50000 * 255 = 127.5 = 127
664 
665 #ifdef LIBYUV_RGB7
666 // Old 7 bit math for compatibility on unsupported platforms.
// 7 bit fixed point full range luma: weights 38 / 75 / 15 for r / g / b,
// with 64 added for rounding before scaling down by 128.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 38 * r + 75 * g + 15 * b;
  return (weighted + 64) >> 7;
}
670 #else
671 // 8 bit
// 8 bit fixed point full range luma: weights 77 / 150 / 29 for r / g / b,
// with 128 added for rounding before scaling down by 256.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 77 * r + 150 * g + 29 * b;
  return (weighted + 128) >> 8;
}
675 #endif
676 
677 #if defined(LIBYUV_ARGBTOUV_PAVGB)
// 8 bit fixed point full range U: weights -43 / -84 / +127 for r / g / b,
// biased by 0x8080 (128 after the shift, plus 0x80 for rounding).
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 127 * b - 84 * g - 43 * r;
  return (weighted + 0x8080) >> 8;
}
// 8 bit fixed point full range V: weights +127 / -107 / -20 for r / g / b,
// biased by 0x8080 (128 after the shift, plus 0x80 for rounding).
static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 127 * r - 107 * g - 20 * b;
  return (weighted + 0x8080) >> 8;
}
684 #else
// Full range U from channel values that are twice the averaged pixel value.
// The weights 127 / 84 / 43 are halved with integer division (63 / 42 / 21)
// to compensate. Biased by 0x8080.
static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
  const int kBlue = 127 / 2;
  const int kGreen = 84 / 2;
  const int kRed = 43 / 2;
  const int weighted = kBlue * b - kGreen * g - kRed * r;
  return (weighted + 0x8080) >> 8;
}
// Full range V from channel values that are twice the averaged pixel value.
// The weights 127 / 107 / 20 are halved with integer division (63 / 53 / 10)
// to compensate. Biased by 0x8080.
static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
  const int kRed = 127 / 2;
  const int kGreen = 107 / 2;
  const int kBlue = 20 / 2;
  const int weighted = kRed * r - kGreen * g - kBlue * b;
  return (weighted + 0x8080) >> 8;
}
691 #endif
692 
693 // ARGBToYJ_C and ARGBToUVJ_C
// The Intel version mimics SSE/AVX, which does 2 pavgb
695 #if LIBYUV_ARGBTOUV_PAVGB
// MAKEROWYJ(NAME, R, G, B, BPP) defines two row functions for a packed RGB
// layout, where R, G and B are the byte offsets of the channels inside a
// pixel and BPP is the number of bytes per pixel:
//   NAME##ToYJRow_C   writes one RGBToYJ value per pixel.
//   NAME##ToUVJRow_C  reads two rows (src_rgb and src_rgb + src_stride_rgb)
//                     and writes one U and one V per pair of columns. Each
//                     2x2 block is reduced with nested AVGB calls: vertical
//                     pairs first, then the two results, rounding up at each
//                     step. When width is odd the last column is averaged
//                     vertically only.
#define MAKEROWYJ(NAME, R, G, B, BPP)                                       \
  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int x;                                                                  \
    for (x = 0; x < width; ++x) {                                           \
      dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                       \
      dst_y += 1;                                                           \
    }                                                                       \
  }                                                                         \
  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                        uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                     \
    int x;                                                                  \
    for (x = 0; x < width - 1; x += 2) {                                    \
      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]),                      \
                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP]));         \
      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]),                      \
                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP]));         \
      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]),                      \
                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP]));         \
      dst_u[0] = RGBToUJ(ar, ag, ab);                                       \
      dst_v[0] = RGBToVJ(ar, ag, ab);                                       \
      src_rgb += BPP * 2;                                                   \
      src_rgb1 += BPP * 2;                                                  \
      dst_u += 1;                                                           \
      dst_v += 1;                                                           \
    }                                                                       \
    if (width & 1) {                                                        \
      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]);                           \
      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]);                           \
      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]);                           \
      dst_u[0] = RGBToUJ(ar, ag, ab);                                       \
      dst_v[0] = RGBToVJ(ar, ag, ab);                                       \
    }                                                                       \
  }
731 #else
// ARM version: each chroma input is the 2x2 channel sum halved with rounding
// ((sum + 1) >> 1), i.e. twice the average, which the RGB2xToUJ / RGB2xToVJ
// helpers are expected to scale with 2x smaller coefficients. For an odd
// width the last column passes the plain sum of its two vertical pixels.
// Macro arguments: NAME is the function-name prefix, R/G/B are the byte
// offsets of the channels within one pixel and BPP is the pixel size in bytes.
// Generates NAME##ToYJRow_C (one Y per pixel) and NAME##ToUVJRow_C (one U/V
// pair per 2x2 block, using the row src_stride_rgb bytes after src_rgb).
#define MAKEROWYJ(NAME, R, G, B, BPP)                                       \
  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int remaining;                                                          \
    for (remaining = width; remaining > 0; --remaining) {                   \
      *dst_y++ = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                       \
    }                                                                       \
  }                                                                         \
  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                        uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* row0 = src_rgb;                                          \
    const uint8_t* row1 = src_rgb + src_stride_rgb;                         \
    int x = 0;                                                              \
    while (x < width - 1) {                                                 \
      uint16_t half_b =                                                     \
          (row0[B] + row0[B + BPP] + row1[B] + row1[B + BPP] + 1) >> 1;     \
      uint16_t half_g =                                                     \
          (row0[G] + row0[G + BPP] + row1[G] + row1[G + BPP] + 1) >> 1;     \
      uint16_t half_r =                                                     \
          (row0[R] + row0[R + BPP] + row1[R] + row1[R + BPP] + 1) >> 1;     \
      *dst_u++ = RGB2xToUJ(half_r, half_g, half_b);                         \
      *dst_v++ = RGB2xToVJ(half_r, half_g, half_b);                         \
      row0 += BPP * 2;                                                      \
      row1 += BPP * 2;                                                      \
      x += 2;                                                               \
    }                                                                       \
    if (width & 1) {                                                        \
      uint16_t pair_b = (row0[B] + row1[B]);                                \
      uint16_t pair_g = (row0[G] + row1[G]);                                \
      uint16_t pair_r = (row0[R] + row1[R]);                                \
      *dst_u = RGB2xToUJ(pair_r, pair_g, pair_b);                           \
      *dst_v = RGB2xToVJ(pair_r, pair_g, pair_b);                           \
    }                                                                       \
  }
771 
772 #endif
773 
// Instantiate the ToYJRow_C / ToUVJRow_C converters for each packed RGB
// layout. Arguments are (NAME, R, G, B, BPP): the byte offset of each channel
// within a pixel, followed by the pixel size in bytes.
MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
MAKEROWYJ(RGB24, 2, 1, 0, 3)
MAKEROWYJ(RAW, 0, 1, 2, 3)
// The generator macro is not needed past this point.
#undef MAKEROWYJ
779 
// Convert a row of 2-byte RGB565 pixels to Y, writing one byte per pixel.
// Each channel is unpacked from the two source bytes, widened to 8 bits by
// copying its top bits into the vacated low bits, then passed to RGBToY.
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t lo = src_rgb565[0];
    const uint8_t hi = src_rgb565[1];
    // 5 bits blue, 6 bits green (split across both bytes), 5 bits red.
    const uint8_t b5 = lo & 0x1f;
    const uint8_t g6 = (lo >> 5) | ((hi & 0x07) << 3);
    const uint8_t r5 = hi >> 3;
    const uint8_t b8 = (b5 << 3) | (b5 >> 2);
    const uint8_t g8 = (g6 << 2) | (g6 >> 4);
    const uint8_t r8 = (r5 << 3) | (r5 >> 2);
    *dst_y++ = RGBToY(r8, g8, b8);
    src_rgb565 += 2;
  }
}
794 
// Convert a row of 2-byte ARGB1555 pixels to Y, writing one byte per pixel.
// The three 5-bit colour channels are unpacked, widened to 8 bits by copying
// their top bits into the low bits, then passed to RGBToY. The top bit of the
// second byte is not used.
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t lo = src_argb1555[0];
    const uint8_t hi = src_argb1555[1];
    const uint8_t b5 = lo & 0x1f;
    const uint8_t g5 = (lo >> 5) | ((hi & 0x03) << 3);
    const uint8_t r5 = (hi & 0x7c) >> 2;
    const uint8_t b8 = (b5 << 3) | (b5 >> 2);
    const uint8_t g8 = (g5 << 3) | (g5 >> 2);
    const uint8_t r8 = (r5 << 3) | (r5 >> 2);
    *dst_y++ = RGBToY(r8, g8, b8);
    src_argb1555 += 2;
  }
}
809 
// Convert a row of 2-byte ARGB4444 pixels to Y, writing one byte per pixel.
// Each 4-bit colour channel is widened to 8 bits by duplicating the nibble,
// then passed to RGBToY. The high nibble of the second byte is not used.
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t b4 = src_argb4444[0] & 0x0f;
    const uint8_t g4 = src_argb4444[0] >> 4;
    const uint8_t r4 = src_argb4444[1] & 0x0f;
    const uint8_t b8 = (b4 << 4) | b4;
    const uint8_t g8 = (g4 << 4) | g4;
    const uint8_t r8 = (r4 << 4) | r4;
    *dst_y++ = RGBToY(r8, g8, b8);
    src_argb4444 += 2;
  }
}
824 
// Compute one U and one V sample for every 2x2 block of RGB565 pixels taken
// from the row at src_rgb565 and the row src_stride_rgb565 bytes after it.
// When width is odd, the last column is reduced from its two vertical pixels
// only. Channel reduction depends on LIBYUV_ARGBTOUV_PAVGB: nested AVGB
// averages fed to RGBToU/RGBToV when set, otherwise channel sums fed to
// RGB2xToU/RGB2xToV.
void RGB565ToUVRow_C(const uint8_t* src_rgb565,
                     int src_stride_rgb565,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* row0 = src_rgb565;
  const uint8_t* row1 = src_rgb565 + src_stride_rgb565;
  // Slots 0,1: left/right pixel of the top row; slots 2,3: bottom row.
  uint8_t blue[4];
  uint8_t green[4];
  uint8_t red[4];
  int x = 0;
  int k;
  while (x < width - 1) {
    for (k = 0; k < 4; ++k) {
      const uint8_t* px = ((k < 2) ? row0 : row1) + (k & 1) * 2;
      const uint8_t b5 = px[0] & 0x1f;
      const uint8_t g6 = (px[0] >> 5) | ((px[1] & 0x07) << 3);
      const uint8_t r5 = px[1] >> 3;
      // Widen to 8 bits by replicating the top bits into the low bits.
      blue[k] = (b5 << 3) | (b5 >> 2);
      green[k] = (g6 << 2) | (g6 >> 4);
      red[k] = (r5 << 3) | (r5 >> 2);
    }

#if LIBYUV_ARGBTOUV_PAVGB
    {
      uint8_t ab = AVGB(AVGB(blue[0], blue[2]), AVGB(blue[1], blue[3]));
      uint8_t ag = AVGB(AVGB(green[0], green[2]), AVGB(green[1], green[3]));
      uint8_t ar = AVGB(AVGB(red[0], red[2]), AVGB(red[1], red[3]));
      *dst_u = RGBToU(ar, ag, ab);
      *dst_v = RGBToV(ar, ag, ab);
    }
#else
    {
      uint16_t sum_b = (blue[0] + blue[1] + blue[2] + blue[3] + 1) >> 1;
      uint16_t sum_g = (green[0] + green[1] + green[2] + green[3] + 1) >> 1;
      uint16_t sum_r = (red[0] + red[1] + red[2] + red[3] + 1) >> 1;
      *dst_u = RGB2xToU(sum_r, sum_g, sum_b);
      *dst_v = RGB2xToV(sum_r, sum_g, sum_b);
    }
#endif

    row0 += 4;
    row1 += 4;
    ++dst_u;
    ++dst_v;
    x += 2;
  }
  if (width & 1) {
    // Only the left column exists: fill slots 0 (top) and 2 (bottom).
    for (k = 0; k < 4; k += 2) {
      const uint8_t* px = k ? row1 : row0;
      const uint8_t b5 = px[0] & 0x1f;
      const uint8_t g6 = (px[0] >> 5) | ((px[1] & 0x07) << 3);
      const uint8_t r5 = px[1] >> 3;
      blue[k] = (b5 << 3) | (b5 >> 2);
      green[k] = (g6 << 2) | (g6 >> 4);
      red[k] = (r5 << 3) | (r5 >> 2);
    }

#if LIBYUV_ARGBTOUV_PAVGB
    {
      uint8_t ab = AVGB(blue[0], blue[2]);
      uint8_t ag = AVGB(green[0], green[2]);
      uint8_t ar = AVGB(red[0], red[2]);
      *dst_u = RGBToU(ar, ag, ab);
      *dst_v = RGBToV(ar, ag, ab);
    }
#else
    {
      uint16_t sum_b = blue[0] + blue[2];
      uint16_t sum_g = green[0] + green[2];
      uint16_t sum_r = red[0] + red[2];
      *dst_u = RGB2xToU(sum_r, sum_g, sum_b);
      *dst_v = RGB2xToV(sum_r, sum_g, sum_b);
    }
#endif
  }
}
908 
// Compute one U and one V sample for every 2x2 block of ARGB1555 pixels taken
// from the row at src_argb1555 and the row src_stride_argb1555 bytes after
// it. When width is odd, the last column is reduced from its two vertical
// pixels only. The top bit of each pixel's second byte is not used.
// Channel reduction depends on LIBYUV_ARGBTOUV_PAVGB: nested AVGB averages
// fed to RGBToU/RGBToV when set, otherwise channel sums fed to
// RGB2xToU/RGB2xToV.
void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
                       int src_stride_argb1555,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* row0 = src_argb1555;
  const uint8_t* row1 = src_argb1555 + src_stride_argb1555;
  // Slots 0,1: left/right pixel of the top row; slots 2,3: bottom row.
  uint8_t blue[4];
  uint8_t green[4];
  uint8_t red[4];
  int x = 0;
  int k;
  while (x < width - 1) {
    for (k = 0; k < 4; ++k) {
      const uint8_t* px = ((k < 2) ? row0 : row1) + (k & 1) * 2;
      const uint8_t b5 = px[0] & 0x1f;
      const uint8_t g5 = (px[0] >> 5) | ((px[1] & 0x03) << 3);
      const uint8_t r5 = (px[1] & 0x7c) >> 2;
      // Widen 5 bits to 8 by replicating the top bits into the low bits.
      blue[k] = (b5 << 3) | (b5 >> 2);
      green[k] = (g5 << 3) | (g5 >> 2);
      red[k] = (r5 << 3) | (r5 >> 2);
    }

#if LIBYUV_ARGBTOUV_PAVGB
    {
      uint8_t ab = AVGB(AVGB(blue[0], blue[2]), AVGB(blue[1], blue[3]));
      uint8_t ag = AVGB(AVGB(green[0], green[2]), AVGB(green[1], green[3]));
      uint8_t ar = AVGB(AVGB(red[0], red[2]), AVGB(red[1], red[3]));
      *dst_u = RGBToU(ar, ag, ab);
      *dst_v = RGBToV(ar, ag, ab);
    }
#else
    {
      uint16_t sum_b = (blue[0] + blue[1] + blue[2] + blue[3] + 1) >> 1;
      uint16_t sum_g = (green[0] + green[1] + green[2] + green[3] + 1) >> 1;
      uint16_t sum_r = (red[0] + red[1] + red[2] + red[3] + 1) >> 1;
      *dst_u = RGB2xToU(sum_r, sum_g, sum_b);
      *dst_v = RGB2xToV(sum_r, sum_g, sum_b);
    }
#endif

    row0 += 4;
    row1 += 4;
    ++dst_u;
    ++dst_v;
    x += 2;
  }
  if (width & 1) {
    // Only the left column exists: fill slots 0 (top) and 2 (bottom).
    for (k = 0; k < 4; k += 2) {
      const uint8_t* px = k ? row1 : row0;
      const uint8_t b5 = px[0] & 0x1f;
      const uint8_t g5 = (px[0] >> 5) | ((px[1] & 0x03) << 3);
      const uint8_t r5 = (px[1] & 0x7c) >> 2;
      blue[k] = (b5 << 3) | (b5 >> 2);
      green[k] = (g5 << 3) | (g5 >> 2);
      red[k] = (r5 << 3) | (r5 >> 2);
    }

#if LIBYUV_ARGBTOUV_PAVGB
    {
      uint8_t ab = AVGB(blue[0], blue[2]);
      uint8_t ag = AVGB(green[0], green[2]);
      uint8_t ar = AVGB(red[0], red[2]);
      *dst_u = RGBToU(ar, ag, ab);
      *dst_v = RGBToV(ar, ag, ab);
    }
#else
    {
      uint16_t sum_b = blue[0] + blue[2];
      uint16_t sum_g = green[0] + green[2];
      uint16_t sum_r = red[0] + red[2];
      *dst_u = RGB2xToU(sum_r, sum_g, sum_b);
      *dst_v = RGB2xToV(sum_r, sum_g, sum_b);
    }
#endif
  }
}
992 
// Compute one U and one V sample for every 2x2 block of ARGB4444 pixels taken
// from the row at src_argb4444 and the row src_stride_argb4444 bytes after
// it. When width is odd, the last column is reduced from its two vertical
// pixels only. The high nibble of each pixel's second byte is not used.
// Channel reduction depends on LIBYUV_ARGBTOUV_PAVGB: nested AVGB averages
// fed to RGBToU/RGBToV when set, otherwise channel sums fed to
// RGB2xToU/RGB2xToV.
void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
                       int src_stride_argb4444,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* row0 = src_argb4444;
  const uint8_t* row1 = src_argb4444 + src_stride_argb4444;
  // Slots 0,1: left/right pixel of the top row; slots 2,3: bottom row.
  uint8_t blue[4];
  uint8_t green[4];
  uint8_t red[4];
  int x = 0;
  int k;
  while (x < width - 1) {
    for (k = 0; k < 4; ++k) {
      const uint8_t* px = ((k < 2) ? row0 : row1) + (k & 1) * 2;
      const uint8_t b4 = px[0] & 0x0f;
      const uint8_t g4 = px[0] >> 4;
      const uint8_t r4 = px[1] & 0x0f;
      // Widen 4 bits to 8 by duplicating the nibble.
      blue[k] = (b4 << 4) | b4;
      green[k] = (g4 << 4) | g4;
      red[k] = (r4 << 4) | r4;
    }

#if LIBYUV_ARGBTOUV_PAVGB
    {
      uint8_t ab = AVGB(AVGB(blue[0], blue[2]), AVGB(blue[1], blue[3]));
      uint8_t ag = AVGB(AVGB(green[0], green[2]), AVGB(green[1], green[3]));
      uint8_t ar = AVGB(AVGB(red[0], red[2]), AVGB(red[1], red[3]));
      *dst_u = RGBToU(ar, ag, ab);
      *dst_v = RGBToV(ar, ag, ab);
    }
#else
    {
      uint16_t sum_b = (blue[0] + blue[1] + blue[2] + blue[3] + 1) >> 1;
      uint16_t sum_g = (green[0] + green[1] + green[2] + green[3] + 1) >> 1;
      uint16_t sum_r = (red[0] + red[1] + red[2] + red[3] + 1) >> 1;
      *dst_u = RGB2xToU(sum_r, sum_g, sum_b);
      *dst_v = RGB2xToV(sum_r, sum_g, sum_b);
    }
#endif

    row0 += 4;
    row1 += 4;
    ++dst_u;
    ++dst_v;
    x += 2;
  }
  if (width & 1) {
    // Only the left column exists: fill slots 0 (top) and 2 (bottom).
    for (k = 0; k < 4; k += 2) {
      const uint8_t* px = k ? row1 : row0;
      const uint8_t b4 = px[0] & 0x0f;
      const uint8_t g4 = px[0] >> 4;
      const uint8_t r4 = px[1] & 0x0f;
      blue[k] = (b4 << 4) | b4;
      green[k] = (g4 << 4) | g4;
      red[k] = (r4 << 4) | r4;
    }

#if LIBYUV_ARGBTOUV_PAVGB
    {
      uint8_t ab = AVGB(blue[0], blue[2]);
      uint8_t ag = AVGB(green[0], green[2]);
      uint8_t ar = AVGB(red[0], red[2]);
      *dst_u = RGBToU(ar, ag, ab);
      *dst_v = RGBToV(ar, ag, ab);
    }
#else
    {
      uint16_t sum_b = blue[0] + blue[2];
      uint16_t sum_g = green[0] + green[2];
      uint16_t sum_r = red[0] + red[2];
      *dst_u = RGB2xToU(sum_r, sum_g, sum_b);
      *dst_v = RGB2xToV(sum_r, sum_g, sum_b);
    }
#endif
  }
}
1076 
// Produce one U and one V sample for every ARGB pixel (no subsampling).
// Bytes 0, 1 and 2 of each 4-byte pixel are read as B, G and R; byte 3 is
// not read.
void ARGBToUV444Row_C(const uint8_t* src_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    *dst_u++ = RGBToU(red, green, blue);
    *dst_v++ = RGBToV(red, green, blue);
    src_argb += 4;
  }
}
1093 
// Convert a row of ARGB pixels to gray: the RGBToYJ value of each pixel is
// written to its B, G and R bytes and the source alpha byte is copied over.
void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t luma = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
    dst_argb[0] = luma;
    dst_argb[1] = luma;
    dst_argb[2] = luma;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
1104 
// Convert a row of ARGB pixels to sepia tone in place. Each output colour is
// a weighted sum of the input B, G, R in 7-bit fixed point; the alpha byte is
// left untouched.
void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
  uint8_t* px = dst_argb;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const int blue = px[0];
    const int green = px[1];
    const int red = px[2];
    const int sepia_b = (blue * 17 + green * 68 + red * 35) >> 7;
    const int sepia_g = (blue * 22 + green * 88 + red * 45) >> 7;
    const int sepia_r = (blue * 24 + green * 98 + red * 50) >> 7;
    // The blue weights sum to 120 (< 128), so blue cannot exceed 255 and
    // needs no clamp; the green and red weights sum to more than 128.
    px[0] = sepia_b;
    px[1] = clamp255(sepia_g);
    px[2] = clamp255(sepia_r);
    px += 4;
  }
}
1122 
// Apply a signed 4x4 colour matrix to a row of ARGB pixels.
// matrix_argb holds 16 coefficients in 6-bit fixed point: entries 0-3 produce
// the output B, 4-7 the output G, 8-11 the output R and 12-15 the output A,
// each as a dot product with the input (B, G, R, A). Results are shifted
// right by 6 and passed through Clamp before being stored.
// TODO(fbarchard): Consider adding rounding (+32).
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const int8_t* matrix_argb,
                          int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Read the whole input pixel before any output byte is written.
    const int b = src_argb[0];
    const int g = src_argb[1];
    const int r = src_argb[2];
    const int a = src_argb[3];
    int channel;
    for (channel = 0; channel < 4; ++channel) {
      const int8_t* coeff = matrix_argb + channel * 4;
      const int scaled =
          (b * coeff[0] + g * coeff[1] + r * coeff[2] + a * coeff[3]) >> 6;
      dst_argb[channel] = Clamp(scaled);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
1155 
// Apply a colour lookup table to a row of ARGB pixels in place.
// table_argb is indexed as 4 bytes per input value: channel c of a pixel
// with value v is replaced by table_argb[v * 4 + c], for all four channels.
void ARGBColorTableRow_C(uint8_t* dst_argb,
                         const uint8_t* table_argb,
                         int width) {
  uint8_t* px = dst_argb;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Resolve all four table entries from the original pixel values first.
    const uint8_t* entry_b = table_argb + px[0] * 4;
    const uint8_t* entry_g = table_argb + px[1] * 4;
    const uint8_t* entry_r = table_argb + px[2] * 4;
    const uint8_t* entry_a = table_argb + px[3] * 4;
    px[0] = entry_b[0];
    px[1] = entry_g[1];
    px[2] = entry_r[2];
    px[3] = entry_a[3];
    px += 4;
  }
}
1173 
// Apply a colour lookup table to the B, G and R bytes of a row of 4-byte
// pixels in place; byte 3 of each pixel is left untouched.
// table_argb is indexed as 4 bytes per input value: channel c of a pixel
// with value v is replaced by table_argb[v * 4 + c].
void RGBColorTableRow_C(uint8_t* dst_argb,
                        const uint8_t* table_argb,
                        int width) {
  uint8_t* px = dst_argb;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Resolve the three table entries from the original pixel values first.
    const uint8_t* entry_b = table_argb + px[0] * 4;
    const uint8_t* entry_g = table_argb + px[1] * 4;
    const uint8_t* entry_r = table_argb + px[2] * 4;
    px[0] = entry_b[0];
    px[1] = entry_g[1];
    px[2] = entry_r[2];
    px += 4;
  }
}
1189 
// Quantize the B, G and R bytes of a row of 4-byte pixels in place.
// Each channel value v becomes ((v * scale) >> 16) * interval_size +
// interval_offset, stored back as a byte with no clamping; byte 3 of each
// pixel is left untouched.
void ARGBQuantizeRow_C(uint8_t* dst_argb,
                       int scale,
                       int interval_size,
                       int interval_offset,
                       int width) {
  uint8_t* px = dst_argb;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    int channel;
    for (channel = 0; channel < 3; ++channel) {
      const int level = (px[channel] * scale) >> 16;
      px[channel] = level * interval_size + interval_offset;
    }
    px += 4;
  }
}
1206 
// Replicate an 8-bit value into both bytes of a 16-bit value (0xAB -> 0xABAB).
// Note: evaluates its argument twice; pass side-effect-free expressions only.
#define REPEAT8(v) ((v) | ((v) << 8))
// Multiply two 16-bit replicated values and keep the top byte of the 32-bit
// product. Fully parenthesized so it is safe inside larger expressions.
#define SHADE(f, v) (((v) * (f)) >> 24)

// Scale every channel of a row of ARGB pixels by the matching channel of
// `value`.
//   src_argb: source row, 4 bytes per pixel.
//   dst_argb: destination row, 4 bytes per pixel.
//   width:    number of pixels to process.
//   value:    per-channel scale packed as 4 bytes; bits 0-7 scale byte 0 of
//             each pixel, bits 8-15 byte 1, bits 16-23 byte 2 and bits 24-31
//             byte 3.
// Both the pixel byte and the scale byte are widened to 16 bits by byte
// replication, multiplied as unsigned 32-bit values, and the top 8 bits of
// the product are stored.
void ARGBShadeRow_C(const uint8_t* src_argb,
                    uint8_t* dst_argb,
                    int width,
                    uint32_t value) {
  // The scales are loop invariant, so expand them once up front.
  const uint32_t b_scale = REPEAT8(value & 0xff);
  const uint32_t g_scale = REPEAT8((value >> 8) & 0xff);
  const uint32_t r_scale = REPEAT8((value >> 16) & 0xff);
  const uint32_t a_scale = REPEAT8(value >> 24);

  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t b = REPEAT8(src_argb[0]);
    const uint32_t g = REPEAT8(src_argb[1]);
    const uint32_t r = REPEAT8(src_argb[2]);
    const uint32_t a = REPEAT8(src_argb[3]);
    dst_argb[0] = SHADE(b, b_scale);
    dst_argb[1] = SHADE(g, g_scale);
    dst_argb[2] = SHADE(r, r_scale);
    dst_argb[3] = SHADE(a, a_scale);
    src_argb += 4;
    dst_argb += 4;
  }
}
#undef REPEAT8
#undef SHADE
1235 
// Replicate an 8-bit value into both bytes of a 16-bit value (0xAB -> 0xABAB).
// Note: evaluates its argument twice; pass side-effect-free expressions only.
#define REPEAT8(v) ((v) | ((v) << 8))
// Multiply a 16-bit replicated value by an 8-bit scale and keep bits 16-23 of
// the product. Fully parenthesized so it is safe inside larger expressions.
#define SHADE(f, v) (((v) * (f)) >> 16)

// Multiply two rows of ARGB pixels channel by channel.
//   src_argb:  first source row, 4 bytes per pixel; each byte is widened to
//              16 bits by byte replication before the multiply.
//   src_argb1: second source row, 4 bytes per pixel; each byte is used as-is.
//   dst_argb:  destination row, 4 bytes per pixel.
//   width:     number of pixels to process.
// The stored value is (replicated_src * src1) >> 16, so 255 * 255 yields 254.
void ARGBMultiplyRow_C(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t b = REPEAT8(src_argb[0]);
    const uint32_t g = REPEAT8(src_argb[1]);
    const uint32_t r = REPEAT8(src_argb[2]);
    const uint32_t a = REPEAT8(src_argb[3]);
    const uint32_t b_scale = src_argb1[0];
    const uint32_t g_scale = src_argb1[1];
    const uint32_t r_scale = src_argb1[2];
    const uint32_t a_scale = src_argb1[3];
    dst_argb[0] = SHADE(b, b_scale);
    dst_argb[1] = SHADE(g, g_scale);
    dst_argb[2] = SHADE(r, r_scale);
    dst_argb[3] = SHADE(a, a_scale);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#undef REPEAT8
#undef SHADE
1264 
// Sum two channel values and pass the result through clamp255.
// Arguments are parenthesized so expression arguments expand safely.
#define SHADE(f, v) clamp255((v) + (f))

// Add two rows of ARGB pixels channel by channel, passing each sum through
// clamp255 before it is stored.
//   src_argb:  first source row, 4 bytes per pixel.
//   src_argb1: second source row, 4 bytes per pixel.
//   dst_argb:  destination row, 4 bytes per pixel.
//   width:     number of pixels to process.
void ARGBAddRow_C(const uint8_t* src_argb,
                  const uint8_t* src_argb1,
                  uint8_t* dst_argb,
                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // Load both pixels completely before writing any output byte.
    const int b = src_argb[0];
    const int g = src_argb[1];
    const int r = src_argb[2];
    const int a = src_argb[3];
    const int b_add = src_argb1[0];
    const int g_add = src_argb1[1];
    const int r_add = src_argb1[2];
    const int a_add = src_argb1[3];
    dst_argb[0] = SHADE(b, b_add);
    dst_argb[1] = SHADE(g, g_add);
    dst_argb[2] = SHADE(r, r_add);
    dst_argb[3] = SHADE(a, a_add);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#undef SHADE
1291 
// Subtract v from f and pass the result through clamp0.
// Arguments are parenthesized so expression arguments expand safely
// (without them, SHADE(a, b - c) would compute a - b - c).
#define SHADE(f, v) clamp0((f) - (v))

// Subtract the second row of ARGB pixels from the first channel by channel,
// passing each difference through clamp0 before it is stored.
//   src_argb:  minuend row, 4 bytes per pixel.
//   src_argb1: subtrahend row, 4 bytes per pixel.
//   dst_argb:  destination row, 4 bytes per pixel.
//   width:     number of pixels to process.
void ARGBSubtractRow_C(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // Load both pixels completely before writing any output byte.
    const int b = src_argb[0];
    const int g = src_argb[1];
    const int r = src_argb[2];
    const int a = src_argb[3];
    const int b_sub = src_argb1[0];
    const int g_sub = src_argb1[1];
    const int r_sub = src_argb1[2];
    const int a_sub = src_argb1[3];
    dst_argb[0] = SHADE(b, b_sub);
    dst_argb[1] = SHADE(g, g_sub);
    dst_argb[2] = SHADE(r, r_sub);
    dst_argb[3] = SHADE(a, a_sub);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#undef SHADE
1318 
// Sobel functions which mimic SSSE3.

// Sobel X for one output row from three input rows. For each output i the
// difference between column i and column i + 2 is taken on every row and
// combined with weights 1, 2, 1; the absolute value is passed through
// clamp255. Reads columns up to width + 1 of each source row.
void SobelXRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 const uint8_t* src_y2,
                 uint8_t* dst_sobelx,
                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const int top_diff = src_y0[i] - src_y0[i + 2];
    const int mid_diff = src_y1[i] - src_y1[i + 2];
    const int bot_diff = src_y2[i] - src_y2[i + 2];
    const int magnitude = Abs(top_diff + mid_diff * 2 + bot_diff);
    dst_sobelx[i] = (uint8_t)(clamp255(magnitude));
  }
}
1340 
// Sobel Y for one output row from two input rows. For each output i the
// difference between src_y0 and src_y1 is taken at columns i, i + 1 and
// i + 2 and combined with weights 1, 2, 1; the absolute value is passed
// through clamp255. Reads columns up to width + 1 of each source row.
void SobelYRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 uint8_t* dst_sobely,
                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const int left_diff = src_y0[i + 0] - src_y1[i + 0];
    const int mid_diff = src_y0[i + 1] - src_y1[i + 1];
    const int right_diff = src_y0[i + 2] - src_y1[i + 2];
    const int magnitude = Abs(left_diff + mid_diff * 2 + right_diff);
    dst_sobely[i] = (uint8_t)(clamp255(magnitude));
  }
}
1360 
// Combine Sobel X and Sobel Y rows into a gray ARGB row: the sum of the two
// inputs, passed through clamp255, is written to the B, G and R bytes and
// the fourth byte is set to 255.
void SobelRow_C(const uint8_t* src_sobelx,
                const uint8_t* src_sobely,
                uint8_t* dst_argb,
                int width) {
  uint8_t* out = dst_argb;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    const int sx = src_sobelx[i];
    const int sy = src_sobely[i];
    const int sum = clamp255(sx + sy);
    out[0] = (uint8_t)(sum);
    out[1] = (uint8_t)(sum);
    out[2] = (uint8_t)(sum);
    out[3] = (uint8_t)(255u);
  }
}
1377 
// Combine Sobel X and Sobel Y rows into a single-byte-per-pixel row: each
// output is the sum of the two inputs passed through clamp255.
void SobelToPlaneRow_C(const uint8_t* src_sobelx,
                       const uint8_t* src_sobely,
                       uint8_t* dst_y,
                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const int sx = src_sobelx[i];
    const int sy = src_sobely[i];
    const int sum = clamp255(sx + sy);
    dst_y[i] = (uint8_t)(sum);
  }
}
1390 
// Pack Sobel X and Sobel Y rows into an ARGB row: byte 0 receives Sobel Y,
// byte 2 receives Sobel X, byte 1 receives their sum passed through clamp255
// and byte 3 is set to 255.
void SobelXYRow_C(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  uint8_t* out = dst_argb;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    const int sx = src_sobelx[i];
    const int sy = src_sobely[i];
    const int sum = clamp255(sx + sy);
    out[0] = (uint8_t)(sy);
    out[1] = (uint8_t)(sum);
    out[2] = (uint8_t)(sx);
    out[3] = (uint8_t)(255u);
  }
}
1407 
// Expand a row of single-byte Y samples to ARGB: each Y value is copied
// unchanged into the B, G and R bytes and the fourth byte is set to 255.
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t luma = *src_y++;
    dst_argb[0] = luma;
    dst_argb[1] = luma;
    dst_argb[2] = luma;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
1419 
// Macros to create SIMD specific yuv to rgb conversion constants.
// YUVCONSTANTSBODY expands to a brace-enclosed initializer whose layout
// depends on the target architecture.

// clang-format off

#if defined(__aarch64__) || defined(__arm__)
// ARM layout: a 16-entry group holding {UB, VR, UG, VG} followed by zero
// padding, then an 8-entry group holding YG, three precomputed bias terms and
// YB, followed by zero padding.
// Bias values include subtract 128 from U and V, bias from Y and rounding.
// For B and R bias is negative. For G bias is positive.
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                             \
  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},                     \
   {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
    0, 0}}
#else
// Non-ARM layout: five groups. UB interleaved with zeros (32 entries), UG/VG
// pairs (32 entries), zeros interleaved with VR (32 entries), then YG
// repeated 16 times and YB repeated 16 times.
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                     \
  {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,          \
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},         \
   {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,  \
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,          \
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},         \
   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
#endif

// clang-format on
1444 
// Define two SIMD-aligned constant tables for the colour space `name`:
//   kYuv<name>Constants - built from the coefficients as given.
//   kYvu<name>Constants - built with the U and V coefficients swapped
//                         (UB <-> VR and UG <-> VG).
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR)            \
  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR);                   \
  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
1450 
1451 // TODO(fbarchard): Generate SIMD structures from float matrix.
1452 
// BT.601 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.596
//  G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
//  B = (Y - 16) * 1.164 + U * 2.018
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B.
// Coefficients are scaled by 64. Unless LIBYUV_UNLIMITED_DATA is defined, UB
// is capped at 128 (i.e. 2.0) rather than using the rounded value 129.
#ifdef LIBYUV_UNLIMITED_DATA
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* min(128, round(2.018 * 64)) */
#endif
#define UG 25  /* round(0.391 * 64) */
#define VG 52  /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)

// Release the short coefficient names so the next colour space can reuse them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1481 
// BT.601 full range YUV to RGB reference (aka JPEG)
//  R = Y               + V * 1.40200
//  G = Y - U * 0.34414 - V * 0.71414
//  B = Y + U * 1.77200
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B.
// Coefficients are scaled by 64.
#define UB 113 /* round(1.77200 * 64) */
#define UG 22  /* round(0.34414 * 64) */
#define VG 46  /* round(0.71414 * 64) */
#define VR 90  /* round(1.40200 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)

// Release the short coefficient names so the next colour space can reuse them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1506 
// BT.709 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.793
//  G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
//  B = (Y - 16) * 1.164 + U * 2.112
//  KR = 0.2126, KB = 0.0722

// U and V contributions to R,G,B.
// Coefficients are scaled by 64. Unless LIBYUV_UNLIMITED_DATA is defined, UB
// is capped at 128 (i.e. 2.0) rather than using the rounded value 135.
#ifdef LIBYUV_UNLIMITED_DATA
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* min(128, round(2.112 * 64)) */
#endif
#define UG 14  /* round(0.213 * 64) */
#define VG 34  /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)

// Release the short coefficient names so the next colour space can reuse them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1535 
// BT.709 full range YUV to RGB reference
//  R = Y               + V * 1.5748
//  G = Y - U * 0.18732 - V * 0.46812
//  B = Y + U * 1.8556
//  KR = 0.2126, KB = 0.0722

// U and V contributions to R,G,B.
// Coefficients are scaled by 64.
#define UB 119 /* round(1.8556 * 64) */
#define UG 12  /* round(0.18732 * 64) */
#define VG 30  /* round(0.46812 * 64) */
#define VR 101 /* round(1.5748 * 64) */

// Y contribution to R,G,B.  Scale and bias.  (same as jpeg)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)

// Release the short coefficient names so the next colour space can reuse them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1560 
// BT.2020 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164384                + V * 1.67867
//  G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
//  B = (Y - 16) * 1.164384 + U * 2.14177
// KR = 0.2627; KB = 0.0593

// U and V contributions to R,G,B, stored as 6 bit fixed point
// (reference coefficient * 64, rounded).
#ifdef LIBYUV_UNLIMITED_DATA
#define UB 137 /* round(2.142 * 64) */
#else
// round(2.142 * 64) is 137, but without LIBYUV_UNLIMITED_DATA the value is
// capped at 128 (2.0 * 64).
#define UB 128 /* min(128, round(2.142 * 64)) */
#endif
#define UG 12  /* round(0.187326 * 64) */
#define VG 42  /* round(0.65042 * 64) */
#define VR 107 /* round(1.67867 * 64) */

// Y contribution to R,G,B.  Scale and bias.
// YB folds the -16 luma offset and the rounding term (64 / 2) together.
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */

// Instantiate the 2020 constants through MAKEYUVCONSTANTS (the macro is
// defined outside this section).
MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)

// Release the coefficient names so the next colorspace can redefine them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1589 
// BT.2020 full range YUV to RGB reference
//  R = Y                + V * 1.474600
//  G = Y - U * 0.164553 - V * 0.571353
//  B = Y + U * 1.881400
// KR = 0.2627; KB = 0.0593

// U and V contributions to R,G,B, stored as 6 bit fixed point
// (reference coefficient * 64, rounded).
#define UB 120 /* round(1.881400 * 64) */
#define UG 11  /* round(0.164553 * 64) */
#define VG 37  /* round(0.571353 * 64) */
#define VR 94  /* round(1.474600 * 64) */

// Y contribution to R,G,B.  Scale and bias.  (same as jpeg)
// Full range has no luma offset, so YB is only the rounding term.
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

// Instantiate the V2020 constants through MAKEYUVCONSTANTS (the macro is
// defined outside this section).
MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)

// Release the coefficient names now that the last table has been emitted.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1613 
// All constant tables are emitted; drop the helper macros used to build them.
// NOTE(review): BB, BG and BR are not defined in this section - presumably
// they are set up next to MAKEYUVCONSTANTS earlier in the file; verify.
#undef BB
#undef BG
#undef BR

#undef MAKEYUVCONSTANTS
1619 
// LOAD_YUV_CONSTANTS and CALC_RGB16 are statement macros expanded inside the
// YuvPixel* helpers below.
//  - LOAD_YUV_CONSTANTS expects a `yuvconstants` pointer in scope and declares
//    the local coefficient variables.
//  - CALC_RGB16 expects `y32`, `u` and `v` in scope and declares `b16`, `g16`
//    and `r16`, which the callers shift and clamp (or store) themselves.
#if defined(__aarch64__) || defined(__arm__)
// ARM: coefficients are read from kUVCoeff[0..3] and kRGBCoeffBias[0..3].
#define LOAD_YUV_CONSTANTS                 \
  int ub = yuvconstants->kUVCoeff[0];      \
  int vr = yuvconstants->kUVCoeff[1];      \
  int ug = yuvconstants->kUVCoeff[2];      \
  int vg = yuvconstants->kUVCoeff[3];      \
  int yg = yuvconstants->kRGBCoeffBias[0]; \
  int bb = yuvconstants->kRGBCoeffBias[1]; \
  int bg = yuvconstants->kRGBCoeffBias[2]; \
  int br = yuvconstants->kRGBCoeffBias[3]

// ARM: u and v are used as given; the per-channel biases bb and br are
// subtracted and bg is added.
#define CALC_RGB16                         \
  int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
  int b16 = y1 + (u * ub) - bb;            \
  int g16 = y1 + bg - (u * ug + v * vg);   \
  int r16 = y1 + (v * vr) - br
#else
// Other targets: coefficients are read from kUVToB, kUVToG, kUVToR, kYToRgb
// and kYBiasToRgb.
#define LOAD_YUV_CONSTANTS           \
  int ub = yuvconstants->kUVToB[0];  \
  int ug = yuvconstants->kUVToG[0];  \
  int vg = yuvconstants->kUVToG[1];  \
  int vr = yuvconstants->kUVToR[1];  \
  int yg = yuvconstants->kYToRgb[0]; \
  int yb = yuvconstants->kYBiasToRgb[0]

// Other targets: u and v are copied into int8_t and have 0x80 subtracted
// before being multiplied; a single bias yb is added to the scaled luma.
#define CALC_RGB16                                \
  int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
  int8_t ui = u;                                  \
  int8_t vi = v;                                  \
  ui -= 0x80;                                     \
  vi -= 0x80;                                     \
  int b16 = y1 + (ui * ub);                       \
  int g16 = y1 - (ui * ug + vi * vg);             \
  int r16 = y1 + (vi * vr)
#endif
1655 
1656 // C reference code that mimics the YUV assembly.
1657 // Reads 8 bit YUV and leaves result as 16 bit.
1658 static __inline void YuvPixel(uint8_t y,
1659                               uint8_t u,
1660                               uint8_t v,
1661                               uint8_t* b,
1662                               uint8_t* g,
1663                               uint8_t* r,
1664                               const struct YuvConstants* yuvconstants) {
1665   LOAD_YUV_CONSTANTS;
1666   uint32_t y32 = y * 0x0101;
1667   CALC_RGB16;
1668   *b = Clamp((int32_t)(b16) >> 6);
1669   *g = Clamp((int32_t)(g16) >> 6);
1670   *r = Clamp((int32_t)(r16) >> 6);
1671 }
1672 
1673 // Reads 8 bit YUV and leaves result as 16 bit.
YuvPixel8_16(uint8_t y,uint8_t u,uint8_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1674 static __inline void YuvPixel8_16(uint8_t y,
1675                                   uint8_t u,
1676                                   uint8_t v,
1677                                   int* b,
1678                                   int* g,
1679                                   int* r,
1680                                   const struct YuvConstants* yuvconstants) {
1681   LOAD_YUV_CONSTANTS;
1682   uint32_t y32 = y * 0x0101;
1683   CALC_RGB16;
1684   *b = b16;
1685   *g = g16;
1686   *r = r16;
1687 }
1688 
1689 // C reference code that mimics the YUV 16 bit assembly.
1690 // Reads 10 bit YUV and leaves result as 16 bit.
YuvPixel10_16(uint16_t y,uint16_t u,uint16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1691 static __inline void YuvPixel10_16(uint16_t y,
1692                                    uint16_t u,
1693                                    uint16_t v,
1694                                    int* b,
1695                                    int* g,
1696                                    int* r,
1697                                    const struct YuvConstants* yuvconstants) {
1698   LOAD_YUV_CONSTANTS;
1699   uint32_t y32 = y << 6;
1700   u = clamp255(u >> 2);
1701   v = clamp255(v >> 2);
1702   CALC_RGB16;
1703   *b = b16;
1704   *g = g16;
1705   *r = r16;
1706 }
1707 
1708 // C reference code that mimics the YUV 16 bit assembly.
1709 // Reads 12 bit YUV and leaves result as 16 bit.
YuvPixel12_16(int16_t y,int16_t u,int16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1710 static __inline void YuvPixel12_16(int16_t y,
1711                                    int16_t u,
1712                                    int16_t v,
1713                                    int* b,
1714                                    int* g,
1715                                    int* r,
1716                                    const struct YuvConstants* yuvconstants) {
1717   LOAD_YUV_CONSTANTS;
1718   uint32_t y32 = y << 4;
1719   u = clamp255(u >> 4);
1720   v = clamp255(v >> 4);
1721   CALC_RGB16;
1722   *b = b16;
1723   *g = g16;
1724   *r = r16;
1725 }
1726 
// C reference code that mimics the YUV 10 bit assembly.
// Reads 10 bit YUV and clamps down to 8 bit RGB: the 16 bit result of
// YuvPixel10_16 is shifted right by 6 and passed through Clamp().
static __inline void YuvPixel10(uint16_t y,
                                uint16_t u,
                                uint16_t v,
                                uint8_t* b,
                                uint8_t* g,
                                uint8_t* r,
                                const struct YuvConstants* yuvconstants) {
  int blue;
  int green;
  int red;
  YuvPixel10_16(y, u, v, &blue, &green, &red, yuvconstants);
  *b = Clamp(blue >> 6);
  *g = Clamp(green >> 6);
  *r = Clamp(red >> 6);
}
1744 
// C reference code that mimics the YUV 12 bit assembly.
// Reads 12 bit YUV and clamps down to 8 bit RGB: the 16 bit result of
// YuvPixel12_16 is shifted right by 6 and passed through Clamp().
static __inline void YuvPixel12(uint16_t y,
                                uint16_t u,
                                uint16_t v,
                                uint8_t* b,
                                uint8_t* g,
                                uint8_t* r,
                                const struct YuvConstants* yuvconstants) {
  int blue;
  int green;
  int red;
  YuvPixel12_16(y, u, v, &blue, &green, &red, yuvconstants);
  *b = Clamp(blue >> 6);
  *g = Clamp(green >> 6);
  *r = Clamp(red >> 6);
}
1762 
1763 // C reference code that mimics the YUV 16 bit assembly.
1764 // Reads 16 bit YUV and leaves result as 8 bit.
YuvPixel16_8(uint16_t y,uint16_t u,uint16_t v,uint8_t * b,uint8_t * g,uint8_t * r,const struct YuvConstants * yuvconstants)1765 static __inline void YuvPixel16_8(uint16_t y,
1766                                   uint16_t u,
1767                                   uint16_t v,
1768                                   uint8_t* b,
1769                                   uint8_t* g,
1770                                   uint8_t* r,
1771                                   const struct YuvConstants* yuvconstants) {
1772   LOAD_YUV_CONSTANTS;
1773   uint32_t y32 = y;
1774   u = clamp255(u >> 8);
1775   v = clamp255(v >> 8);
1776   CALC_RGB16;
1777   *b = Clamp((int32_t)(b16) >> 6);
1778   *g = Clamp((int32_t)(g16) >> 6);
1779   *r = Clamp((int32_t)(r16) >> 6);
1780 }
1781 
1782 // C reference code that mimics the YUV 16 bit assembly.
1783 // Reads 16 bit YUV and leaves result as 16 bit.
YuvPixel16_16(uint16_t y,uint16_t u,uint16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1784 static __inline void YuvPixel16_16(uint16_t y,
1785                                    uint16_t u,
1786                                    uint16_t v,
1787                                    int* b,
1788                                    int* g,
1789                                    int* r,
1790                                    const struct YuvConstants* yuvconstants) {
1791   LOAD_YUV_CONSTANTS;
1792   uint32_t y32 = y;
1793   u = clamp255(u >> 8);
1794   v = clamp255(v >> 8);
1795   CALC_RGB16;
1796   *b = b16;
1797   *g = g16;
1798   *r = r16;
1799 }
1800 
1801 // C reference code that mimics the YUV assembly.
1802 // Reads 8 bit YUV and leaves result as 8 bit.
YPixel(uint8_t y,uint8_t * b,uint8_t * g,uint8_t * r,const struct YuvConstants * yuvconstants)1803 static __inline void YPixel(uint8_t y,
1804                             uint8_t* b,
1805                             uint8_t* g,
1806                             uint8_t* r,
1807                             const struct YuvConstants* yuvconstants) {
1808 #if defined(__aarch64__) || defined(__arm__)
1809   int yg = yuvconstants->kRGBCoeffBias[0];
1810   int ygb = yuvconstants->kRGBCoeffBias[4];
1811 #else
1812   int ygb = yuvconstants->kYBiasToRgb[0];
1813   int yg = yuvconstants->kYToRgb[0];
1814 #endif
1815   uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
1816   *b = Clamp(((int32_t)(y1) + ygb) >> 6);
1817   *g = Clamp(((int32_t)(y1) + ygb) >> 6);
1818   *r = Clamp(((int32_t)(y1) + ygb) >> 6);
1819 }
1820 
// Converts a row of 8 bit YUV with one U and V sample per pixel to ARGB.
// Writes 4 bytes per pixel (B, G, R, then 255).
void I444ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel(*src_y++, *src_u++, *src_v++, &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    rgb_buf += 4;  // Next output pixel.
  }
}
1838 
// Also used for 420
// Converts a row of 8 bit YUV to ARGB where each U/V sample is shared by two
// horizontally adjacent pixels.  Writes 4 bytes per pixel (B, G, R, 255).
void I422ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], src_u[0], src_v[0], &rgb_buf[4], &rgb_buf[5],
             &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
1865 
// 10 bit YUV to ARGB
// Each U/V sample is shared by two horizontally adjacent pixels.  Writes
// 4 bytes per pixel (B, G, R, 255).
void I210ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel10(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel10(src_y[1], src_u[0], src_v[0], &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel10(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
1892 
// Converts a row of 10 bit YUV with one U and V sample per pixel to ARGB.
// Writes 4 bytes per pixel (B, G, R, then 255).
void I410ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10(*src_y++, *src_u++, *src_v++, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    rgb_buf += 4;  // Next output pixel.
  }
}
1910 
// Converts a row of 10 bit YUV plus a 10 bit alpha plane to ARGB.  Each U/V
// sample is shared by two pixels; alpha has one sample per pixel and is
// reduced with >> 2 and clamp255() before being stored as the 4th byte.
void I210AlphaToARGBRow_C(const uint16_t* src_y,
                          const uint16_t* src_u,
                          const uint16_t* src_v,
                          const uint16_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel10(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = clamp255(src_a[0] >> 2);
    YuvPixel10(src_y[1], src_u[0], src_v[0], &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = clamp255(src_a[1] >> 2);
    src_y += 2;
    ++src_u;
    ++src_v;
    src_a += 2;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel10(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = clamp255(src_a[0] >> 2);
  }
}
1938 
// Converts a row of 10 bit YUV plus a 10 bit alpha plane to ARGB, with one
// U, V and alpha sample per pixel.  Alpha is reduced with >> 2 and clamp255()
// before being stored as the 4th byte.
void I410AlphaToARGBRow_C(const uint16_t* src_y,
                          const uint16_t* src_u,
                          const uint16_t* src_v,
                          const uint16_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10(*src_y++, *src_u++, *src_v++, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = clamp255(src_a[0] >> 2);
    ++src_a;
    rgb_buf += 4;  // Next output pixel.
  }
}
1958 
// 12 bit YUV to ARGB
// Each U/V sample is shared by two horizontally adjacent pixels.  Writes
// 4 bytes per pixel (B, G, R, 255).
void I212ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel12(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel12(src_y[1], src_u[0], src_v[0], &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel12(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
1985 
// Packs B, G and R into one 32 bit AR30 word and stores it at rgb_buf.
// b, g and r are the unshifted values produced by the YuvPixel*_16 helpers
// (the 8 bit paths shift these right by 6); shifting right by 4 instead keeps
// two more bits, and Clamp10() limits each channel before packing.
// Layout: B in bits 0-9, G in bits 10-19, R in bits 20-29, bits 30-31 set.
// The word is written with memcpy because rgb_buf is a byte pointer that is
// not guaranteed to be 4 byte aligned; the bytes stored are the same as a
// native uint32_t store.
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
  uint32_t ar30;
  b = b >> 4;  // 6 fraction bits -> 2 fraction bits, i.e. 10 bit scale.
  g = g >> 4;
  r = r >> 4;
  b = Clamp10(b);
  g = Clamp10(g);
  r = Clamp10(r);
  ar30 = (uint32_t)b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000;
  memcpy(rgb_buf, &ar30, sizeof(ar30));
}
1997 
// 10 bit YUV to 10 bit AR30
// Each U/V sample is shared by two horizontally adjacent pixels.  Writes
// 4 bytes per pixel through StoreAR30().
void I210ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel10_16(src_y[0], src_u[0], src_v[0], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(&rgb_buf[0], blue, green, red);
    YuvPixel10_16(src_y[1], src_u[0], src_v[0], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(&rgb_buf[4], blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel10_16(src_y[0], src_u[0], src_v[0], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2024 
// 12 bit YUV to 10 bit AR30
// Each U/V sample is shared by two horizontally adjacent pixels.  Writes
// 4 bytes per pixel through StoreAR30().
void I212ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel12_16(src_y[0], src_u[0], src_v[0], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(&rgb_buf[0], blue, green, red);
    YuvPixel12_16(src_y[1], src_u[0], src_v[0], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(&rgb_buf[4], blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel12_16(src_y[0], src_u[0], src_v[0], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2051 
// Converts a row of 10 bit YUV with one U and V sample per pixel to AR30.
// Writes 4 bytes per pixel through StoreAR30().
void I410ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10_16(*src_y++, *src_u++, *src_v++, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    rgb_buf += 4;  // Next output pixel.
  }
}
2071 
// P210 has 10 bits in msb of 16 bit NV12 style layout.
// src_uv holds interleaved U,V pairs; each pair is shared by two horizontally
// adjacent pixels.  Writes 4 bytes per pixel (B, G, R, 255).
void P210ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;
    YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], &dst_argb[4], &dst_argb[5],
                 &dst_argb[6], yuvconstants);
    dst_argb[7] = 255;
    src_y += 2;
    src_uv += 2;    // One U,V pair.
    dst_argb += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;
  }
}
2096 
// Converts a row of 16 bit Y plus interleaved 16 bit U,V (one pair per pixel)
// to ARGB.  Writes 4 bytes per pixel (B, G, R, 255).
void P410ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel16_8(*src_y++, src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;
    src_uv += 2;    // One U,V pair.
    dst_argb += 4;  // Next output pixel.
  }
}
2112 
// Converts a row of 16 bit Y plus interleaved 16 bit U,V to AR30.  Each U,V
// pair is shared by two horizontally adjacent pixels.  Writes 4 bytes per
// pixel through StoreAR30().
void P210ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_ar30,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(&dst_ar30[0], blue, green, red);
    YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(&dst_ar30[4], blue, green, red);
    src_y += 2;
    src_uv += 2;    // One U,V pair.
    dst_ar30 += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
  }
}
2136 
// Converts a row of 16 bit Y plus interleaved 16 bit U,V (one pair per pixel)
// to AR30.  Writes 4 bytes per pixel through StoreAR30().
void P410ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_ar30,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel16_16(*src_y++, src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
    src_uv += 2;    // One U,V pair.
    dst_ar30 += 4;  // Next output pixel.
  }
}
2154 
// 8 bit YUV to 10 bit AR30
// YuvPixel8_16() produces the unclamped fixed point result for each pixel and
// StoreAR30() packs it.  Each U/V sample is shared by two horizontally
// adjacent pixels.
void I422ToAR30Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &blue, &green, &red,
                 yuvconstants);
    StoreAR30(&rgb_buf[0], blue, green, red);
    YuvPixel8_16(src_y[1], src_u[0], src_v[0], &blue, &green, &red,
                 yuvconstants);
    StoreAR30(&rgb_buf[4], blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &blue, &green, &red,
                 yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2182 
// Converts a row of 8 bit YUV plus an 8 bit alpha plane to ARGB, with one
// U, V and alpha sample per pixel.  Alpha is copied unchanged into the 4th
// byte of each output pixel.
void I444AlphaToARGBRow_C(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          const uint8_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel(*src_y++, *src_u++, *src_v++, &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = *src_a++;
    rgb_buf += 4;  // Next output pixel.
  }
}
2202 
// Converts a row of 8 bit YUV plus an 8 bit alpha plane to ARGB.  Each U/V
// sample is shared by two horizontally adjacent pixels; alpha has one sample
// per pixel and is copied unchanged into the 4th byte.
void I422AlphaToARGBRow_C(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          const uint8_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = src_a[0];
    YuvPixel(src_y[1], src_u[0], src_v[0], &rgb_buf[4], &rgb_buf[5],
             &rgb_buf[6], yuvconstants);
    rgb_buf[7] = src_a[1];
    src_y += 2;
    ++src_u;
    ++src_v;
    src_a += 2;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = src_a[0];
  }
}
2230 
// Converts a row of 8 bit YUV to 3 byte per pixel output (B, G, R, with no
// alpha byte).  Each U/V sample is shared by two horizontally adjacent
// pixels.
void I422ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int pairs;
  for (pairs = width / 2; pairs > 0; --pairs) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &rgb_buf[3], &rgb_buf[4],
             &rgb_buf[5], yuvconstants);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 6;  // Two output pixels.
  }
  // Odd width: one trailing pixel remains.
  if ((width & 1) != 0) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
  }
}
2253 
// Converts a row of 8 bit YUV to 16 bit per pixel ARGB4444.  Each U/V sample
// is shared by two horizontally adjacent pixels.
// Each channel keeps its top 4 bits; a pixel is packed as B in bits 0-3,
// G in bits 4-7, R in bits 8-11 and bits 12-15 set.
// Pixels are written with memcpy because dst_argb4444 is a byte pointer that
// is not guaranteed to be 2 or 4 byte aligned; the bytes stored are the same
// as a native uint32_t / uint16_t store.
void I422ToARGB4444Row_C(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb4444,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    uint32_t two_pixels;
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    b0 = b0 >> 4;
    g0 = g0 >> 4;
    r0 = r0 >> 4;
    b1 = b1 >> 4;
    g1 = g1 >> 4;
    r1 = r1 >> 4;
    // First pixel in the low 16 bits, second pixel in the high 16 bits.
    two_pixels = (uint32_t)b0 | ((uint32_t)g0 << 4) | ((uint32_t)r0 << 8) |
                 ((uint32_t)b1 << 16) | ((uint32_t)g1 << 20) |
                 ((uint32_t)r1 << 24) | 0xf000f000;
    memcpy(dst_argb4444, &two_pixels, sizeof(two_pixels));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_argb4444 += 4;  // Advance 2 pixels.
  }
  // Odd width: one trailing pixel remains.
  if (width & 1) {
    uint16_t one_pixel;
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    b0 = b0 >> 4;
    g0 = g0 >> 4;
    r0 = r0 >> 4;
    one_pixel = (uint16_t)(b0 | (g0 << 4) | (r0 << 8) | 0xf000);
    memcpy(dst_argb4444, &one_pixel, sizeof(one_pixel));
  }
}
2291 
// Converts a row of 8 bit YUV to 16 bit per pixel ARGB1555.  Each U/V sample
// is shared by two horizontally adjacent pixels.
// Each channel keeps its top 5 bits; a pixel is packed as B in bits 0-4,
// G in bits 5-9, R in bits 10-14 and bit 15 set.
// Pixels are written with memcpy because dst_argb1555 is a byte pointer that
// is not guaranteed to be 2 or 4 byte aligned; the bytes stored are the same
// as a native uint32_t / uint16_t store.
void I422ToARGB1555Row_C(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb1555,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    uint32_t two_pixels;
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    b0 = b0 >> 3;
    g0 = g0 >> 3;
    r0 = r0 >> 3;
    b1 = b1 >> 3;
    g1 = g1 >> 3;
    r1 = r1 >> 3;
    // First pixel in the low 16 bits, second pixel in the high 16 bits.
    two_pixels = (uint32_t)b0 | ((uint32_t)g0 << 5) | ((uint32_t)r0 << 10) |
                 ((uint32_t)b1 << 16) | ((uint32_t)g1 << 21) |
                 ((uint32_t)r1 << 26) | 0x80008000;
    memcpy(dst_argb1555, &two_pixels, sizeof(two_pixels));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_argb1555 += 4;  // Advance 2 pixels.
  }
  // Odd width: one trailing pixel remains.
  if (width & 1) {
    uint16_t one_pixel;
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    b0 = b0 >> 3;
    g0 = g0 >> 3;
    r0 = r0 >> 3;
    one_pixel = (uint16_t)(b0 | (g0 << 5) | (r0 << 10) | 0x8000);
    memcpy(dst_argb1555, &one_pixel, sizeof(one_pixel));
  }
}
2329 
// Convert a row of I422 to RGB565.
// Pixels are produced two at a time; each pair of Y samples shares one U and
// one V sample.  Every output pixel is a 16 bit value built from the values
// YuvPixel stores in b, g and r: the top 5 bits of b in bits 0-4, the top
// 6 bits of g in bits 5-10 and the top 5 bits of r in bits 11-15.
// A pair is assembled as one native-endian 32 bit word (first pixel in the
// low half).  The packing uses unsigned arithmetic because shifting the
// second red value left by 27 as a signed int would overflow into the sign
// bit.  Stores go through memcpy so dst_rgb565 does not have to be aligned.
void I422ToRGB565Row_C(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_rgb565,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    uint32_t pair;
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    pair = (uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
           ((uint32_t)(r0 >> 3) << 11) | ((uint32_t)(b1 >> 3) << 16) |
           ((uint32_t)(g1 >> 2) << 21) | ((uint32_t)(r1 >> 3) << 27);
    memcpy(dst_rgb565, &pair, sizeof(pair));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    uint16_t pixel;
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    pixel = (uint16_t)((uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
                       ((uint32_t)(r0 >> 3) << 11));
    memcpy(dst_rgb565, &pixel, sizeof(pixel));
  }
}
2367 
// Convert a row of NV12 to 4 bytes per pixel.
// src_uv is read as interleaved pairs: byte 0 is passed to YuvPixel as U and
// byte 1 as V, and each pair is shared by two Y samples.  For every pixel
// YuvPixel fills bytes 0..2 and byte 3 is set to 255.
void NV12ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_uv,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 4;
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], src_uv[0], src_uv[1], &second[0], &second[1],
             &second[2], yuvconstants);
    second[3] = 255;
    src_y += 2;
    src_uv += 2;
    rgb_buf += 8;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2391 
// Convert a row of NV21 to 4 bytes per pixel.
// src_vu is read as interleaved pairs: byte 1 is passed to YuvPixel as U and
// byte 0 as V, and each pair is shared by two Y samples.  For every pixel
// YuvPixel fills bytes 0..2 and byte 3 is set to 255.
void NV21ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_vu,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 4;
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], src_vu[1], src_vu[0], &second[0], &second[1],
             &second[2], yuvconstants);
    second[3] = 255;
    src_y += 2;
    src_vu += 2;
    rgb_buf += 8;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel.
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2415 
// Convert a row of NV12 to 3 bytes per pixel.
// src_uv is read as interleaved pairs (byte 0 passed as U, byte 1 as V),
// each shared by two Y samples.  YuvPixel fills all 3 bytes of every pixel.
void NV12ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_uv,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 3;
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    YuvPixel(src_y[1], src_uv[0], src_uv[1], &second[0], &second[1],
             &second[2], yuvconstants);
    src_y += 2;
    src_uv += 2;
    rgb_buf += 6;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
  }
}
2436 
// Convert a row of NV21 to 3 bytes per pixel.
// src_vu is read as interleaved pairs (byte 1 passed as U, byte 0 as V),
// each shared by two Y samples.  YuvPixel fills all 3 bytes of every pixel.
void NV21ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_vu,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 3;
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    YuvPixel(src_y[1], src_vu[1], src_vu[0], &second[0], &second[1],
             &second[2], yuvconstants);
    src_y += 2;
    src_vu += 2;
    rgb_buf += 6;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel.
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
  }
}
2457 
// Convert a row of NV12 to RGB565.
// src_uv is read as interleaved pairs (byte 0 passed to YuvPixel as U,
// byte 1 as V), each shared by two Y samples.  Every output pixel is a
// 16 bit value built from the values YuvPixel stores in b, g and r: the top
// 5 bits of b in bits 0-4, the top 6 bits of g in bits 5-10 and the top
// 5 bits of r in bits 11-15.
// A pair is assembled as one native-endian 32 bit word (first pixel in the
// low half).  The packing uses unsigned arithmetic because shifting the
// second red value left by 27 as a signed int would overflow into the sign
// bit.  Stores go through memcpy so dst_rgb565 does not have to be aligned.
void NV12ToRGB565Row_C(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* dst_rgb565,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    uint32_t pair;
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
    pair = (uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
           ((uint32_t)(r0 >> 3) << 11) | ((uint32_t)(b1 >> 3) << 16) |
           ((uint32_t)(g1 >> 2) << 21) | ((uint32_t)(r1 >> 3) << 27);
    memcpy(dst_rgb565, &pair, sizeof(pair));
    src_y += 2;
    src_uv += 2;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    uint16_t pixel;
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
    pixel = (uint16_t)((uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
                       ((uint32_t)(r0 >> 3) << 11));
    memcpy(dst_rgb565, &pixel, sizeof(pixel));
  }
}
2493 
// Convert a row of YUY2 to 4 bytes per pixel.
// Each 4 byte source group supplies two pixels: bytes 0 and 2 are passed to
// YuvPixel as the two Y values, byte 1 as U and byte 3 as V.  For every
// pixel YuvPixel fills bytes 0..2 and byte 3 is set to 255.
void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 4;
    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], &second[0], &second[1],
             &second[2], yuvconstants);
    second[3] = 255;
    src_yuy2 += 4;
    rgb_buf += 8;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel, still reading U and V from bytes 1 and 3.
    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2515 
// Convert a row of UYVY to 4 bytes per pixel.
// Each 4 byte source group supplies two pixels: bytes 1 and 3 are passed to
// YuvPixel as the two Y values, byte 0 as U and byte 2 as V.  For every
// pixel YuvPixel fills bytes 0..2 and byte 3 is set to 255.
void UYVYToARGBRow_C(const uint8_t* src_uyvy,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 4;
    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], &second[0], &second[1],
             &second[2], yuvconstants);
    second[3] = 255;
    src_uyvy += 4;
    rgb_buf += 8;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel, still reading U and V from bytes 0 and 2.
    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2537 
// Convert a row of I422 to 4 bytes per pixel with the constant byte first.
// Each pair of Y samples shares one U and one V sample.  For every pixel
// byte 0 is set to 255 and YuvPixel fills bytes 1..3.
void I422ToRGBARow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 4;
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[1], &rgb_buf[2],
             &rgb_buf[3], yuvconstants);
    rgb_buf[0] = 255;
    YuvPixel(src_y[1], src_u[0], src_v[0], &second[1], &second[2],
             &second[3], yuvconstants);
    second[0] = 255;
    src_y += 2;
    src_u += 1;
    src_v += 1;
    rgb_buf += 8;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel.
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[1], &rgb_buf[2],
             &rgb_buf[3], yuvconstants);
    rgb_buf[0] = 255;
  }
}
2563 
// Convert a row of Y only samples to 4 bytes per pixel.
// For every pixel YPixel fills bytes 0..2 from the Y sample and byte 3 is
// set to 255.  Pixels are handled two per iteration plus an odd tail.
void I400ToARGBRow_C(const uint8_t* src_y,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    uint8_t* second = rgb_buf + 4;
    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YPixel(src_y[1], &second[0], &second[1], &second[2], yuvconstants);
    second[3] = 255;
    src_y += 2;
    rgb_buf += 8;  // Step over the 2 pixels just written.
  }
  if (width & 1) {
    // Odd width: one last pixel.
    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2582 
// Reverse the order of the bytes in a row: dst[i] = src[width - 1 - i].
// Bytes are copied two per iteration, with an odd trailing byte copied last.
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  const uint8_t* in = src + (width - 1);  // Last source byte.
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    dst[2 * i] = in[0];
    dst[2 * i + 1] = in[-1];
    in -= 2;
  }
  if (width & 1) {
    dst[width - 1] = in[0];
  }
}
2595 
// Reverse the order of 2 byte pairs in a row.  The two bytes inside each
// pair keep their order; width counts pairs.
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  const uint8_t* in = src_uv + (width - 1) * 2;  // Last source pair.
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = in[0];
    dst_uv[2 * i + 1] = in[1];
    in -= 2;
  }
}
2606 
// Reverse a row of interleaved 2 byte pairs and split it into two planes:
// the first byte of each pair goes to dst_u, the second to dst_v.
// width counts pairs.  Two pairs are handled per iteration, with an odd
// trailing pair handled last.
void MirrorSplitUVRow_C(const uint8_t* src_uv,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  const uint8_t* in = src_uv + (width - 1) * 2;  // Last source pair.
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    const int o = 2 * i;
    dst_u[o] = in[0];
    dst_u[o + 1] = in[-2];
    dst_v[o] = in[1];
    dst_v[o + 1] = in[-1];
    in -= 4;
  }
  if (width & 1) {
    dst_u[width - 1] = in[0];
    dst_v[width - 1] = in[1];
  }
}
2625 
// Reverse the order of 4 byte pixels in a row.  Pixels are moved as whole
// 32 bit words, so the bytes inside each pixel keep their order.
// src and dst are accessed through uint32_t pointers.
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  const uint32_t* in = (const uint32_t*)(src) + (width - 1);  // Last pixel.
  uint32_t* out = (uint32_t*)(dst);
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    out[2 * i] = in[0];
    out[2 * i + 1] = in[-1];
    in -= 2;
  }
  if (width & 1) {
    out[width - 1] = in[0];
  }
}
2640 
// Reverse the order of 3 byte pixels in a row.  The bytes inside each pixel
// keep their order.  All 3 bytes of a pixel are read before any is written.
void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
  const uint8_t* in = src_rgb24 + (width * 3 - 3);  // Last source pixel.
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t* out = dst_rgb24 + 3 * i;
    const uint8_t c0 = in[0];
    const uint8_t c1 = in[1];
    const uint8_t c2 = in[2];
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
    in -= 3;
  }
}
2655 
// Split a row of interleaved 2 byte pairs into two planes: the first byte
// of each pair goes to dst_u, the second to dst_v.  width counts pairs.
// Two pairs are handled per iteration, with an odd trailing pair last.
void SplitUVRow_C(const uint8_t* src_uv,
                  uint8_t* dst_u,
                  uint8_t* dst_v,
                  int width) {
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    const int o = 2 * i;
    dst_u[o] = src_uv[0];
    dst_u[o + 1] = src_uv[2];
    dst_v[o] = src_uv[1];
    dst_v[o + 1] = src_uv[3];
    src_uv += 4;
  }
  if (width & 1) {
    dst_u[width - 1] = src_uv[0];
    dst_v[width - 1] = src_uv[1];
  }
}
2673 
// Interleave two planes into a row of 2 byte pairs: src_u supplies the
// first byte of each pair and src_v the second.  width counts pairs.
// Two pairs are written per iteration, with an odd trailing pair last.
void MergeUVRow_C(const uint8_t* src_u,
                  const uint8_t* src_v,
                  uint8_t* dst_uv,
                  int width) {
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    const int o = 2 * i;
    dst_uv[0] = src_u[o];
    dst_uv[1] = src_v[o];
    dst_uv[2] = src_u[o + 1];
    dst_uv[3] = src_v[o + 1];
    dst_uv += 4;
  }
  if (width & 1) {
    dst_uv[0] = src_u[width - 1];
    dst_uv[1] = src_v[width - 1];
  }
}
2691 
// Split a row of 3 byte pixels into three planes: byte 0 of each pixel goes
// to dst_r, byte 1 to dst_g and byte 2 to dst_b.
void SplitRGBRow_C(const uint8_t* src_rgb,
                   uint8_t* dst_r,
                   uint8_t* dst_g,
                   uint8_t* dst_b,
                   int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* px = src_rgb + 3 * i;
    dst_r[i] = px[0];
    dst_g[i] = px[1];
    dst_b[i] = px[2];
  }
}
2705 
// Interleave three planes into a row of 3 byte pixels: src_r supplies byte
// 0 of each pixel, src_g byte 1 and src_b byte 2.
void MergeRGBRow_C(const uint8_t* src_r,
                   const uint8_t* src_g,
                   const uint8_t* src_b,
                   uint8_t* dst_rgb,
                   int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t* px = dst_rgb + 3 * i;
    px[0] = src_r[i];
    px[1] = src_g[i];
    px[2] = src_b[i];
  }
}
2719 
// Split a row of 4 byte pixels into four planes: byte 0 of each pixel goes
// to dst_b, byte 1 to dst_g, byte 2 to dst_r and byte 3 to dst_a.
void SplitARGBRow_C(const uint8_t* src_argb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
                    uint8_t* dst_b,
                    uint8_t* dst_a,
                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* px = src_argb + 4 * i;
    dst_b[i] = px[0];
    dst_g[i] = px[1];
    dst_r[i] = px[2];
    dst_a[i] = px[3];
  }
}
2735 
// Interleave four planes into a row of 4 byte pixels: src_b supplies byte 0
// of each pixel, src_g byte 1, src_r byte 2 and src_a byte 3.
void MergeARGBRow_C(const uint8_t* src_r,
                    const uint8_t* src_g,
                    const uint8_t* src_b,
                    const uint8_t* src_a,
                    uint8_t* dst_argb,
                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t* px = dst_argb + 4 * i;
    px[0] = src_b[i];
    px[1] = src_g[i];
    px[2] = src_r[i];
    px[3] = src_a[i];
  }
}
2751 
// Merge three 16 bit planes into a row of 32 bit pixels with 10 bits per
// channel.  depth (10..16) is the number of bits used in each source
// sample; samples are shifted right by depth - 10 and clamped with
// clamp1023.  Blue occupies bits 0-9, green bits 10-19, red bits 20-29 and
// the top 2 bits are always set.
// dst_ar30 is written through a uint32_t pointer.
void MergeXR30Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint8_t* dst_ar30,
                    int depth,
                    int width) {
  assert(depth >= 10);
  assert(depth <= 16);
  const int down = depth - 10;
  uint32_t* out = (uint32_t*)dst_ar30;
  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t red = clamp1023(src_r[i] >> down);
    const uint32_t green = clamp1023(src_g[i] >> down);
    const uint32_t blue = clamp1023(src_b[i] >> down);
    out[i] = 0xc0000000 | (red << 20) | (green << 10) | blue;
  }
}
2770 
// Merge four 16 bit planes into a row of pixels made of four 16 bit values
// in the order b, g, r, a.  depth (1..16) is the number of bits used in
// each source sample; every sample is limited with ClampMax to
// (1 << depth) - 1 and then shifted left by 16 - depth.
void MergeAR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    const uint16_t* src_a,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  const int up = 16 - depth;
  const int limit = (1 << depth) - 1;
  int i;
  for (i = 0; i < width; ++i) {
    uint16_t* px = dst_ar64 + 4 * i;
    px[0] = ClampMax(src_b[i], limit) << up;
    px[1] = ClampMax(src_g[i], limit) << up;
    px[2] = ClampMax(src_r[i], limit) << up;
    px[3] = ClampMax(src_a[i], limit) << up;
  }
}
2791 
// Merge four 16 bit planes into a row of 4 byte pixels in the order
// b, g, r, a.  depth (8..16) is the number of bits used in each source
// sample; samples are shifted right by depth - 8 and clamped with clamp255.
void MergeARGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         const uint16_t* src_a,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  const int down = depth - 8;
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t* px = dst_argb + 4 * i;
    px[0] = clamp255(src_b[i] >> down);
    px[1] = clamp255(src_g[i] >> down);
    px[2] = clamp255(src_r[i] >> down);
    px[3] = clamp255(src_a[i] >> down);
  }
}
2811 
// Merge three 16 bit planes into a row of pixels made of four 16 bit values
// in the order b, g, r, then a constant 0xffff.  depth (1..16) is the
// number of bits used in each source sample; every sample is limited with
// ClampMax to (1 << depth) - 1 and then shifted left by 16 - depth.
void MergeXR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  const int up = 16 - depth;
  const int limit = (1 << depth) - 1;
  int i;
  for (i = 0; i < width; ++i) {
    uint16_t* px = dst_ar64 + 4 * i;
    px[0] = ClampMax(src_b[i], limit) << up;
    px[1] = ClampMax(src_g[i], limit) << up;
    px[2] = ClampMax(src_r[i], limit) << up;
    px[3] = 0xffff;
  }
}
2831 
// Merge three 16 bit planes into a row of 4 byte pixels in the order
// b, g, r, then a constant 0xff.  depth (8..16) is the number of bits used
// in each source sample; samples are shifted right by depth - 8 and clamped
// with clamp255.
void MergeXRGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  const int down = depth - 8;
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t* px = dst_argb + 4 * i;
    px[0] = clamp255(src_b[i] >> down);
    px[1] = clamp255(src_g[i] >> down);
    px[2] = clamp255(src_r[i] >> down);
    px[3] = 0xff;
  }
}
2850 
// Split a row of 4 byte pixels into three planes: byte 0 of each pixel goes
// to dst_b, byte 1 to dst_g and byte 2 to dst_r.  Byte 3 is ignored.
void SplitXRGBRow_C(const uint8_t* src_argb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
                    uint8_t* dst_b,
                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* px = src_argb + 4 * i;
    dst_b[i] = px[0];
    dst_g[i] = px[1];
    dst_r[i] = px[2];
  }
}
2864 
// Interleave three planes into a row of 4 byte pixels: src_b supplies byte
// 0 of each pixel, src_g byte 1, src_r byte 2, and byte 3 is set to 255.
void MergeXRGBRow_C(const uint8_t* src_r,
                    const uint8_t* src_g,
                    const uint8_t* src_b,
                    uint8_t* dst_argb,
                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t* px = dst_argb + 4 * i;
    px[0] = src_b[i];
    px[1] = src_g[i];
    px[2] = src_r[i];
    px[3] = 255;
  }
}
2879 
// Convert lsb formats to msb, depending on sample depth.
// Interleaves two 16 bit planes into pairs (src_u first, src_v second) and
// shifts every sample left by 16 - depth.  depth must be 8..16.
void MergeUVRow_16_C(const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint16_t* dst_uv,
                     int depth,
                     int width) {
  const int up = 16 - depth;
  int i;
  assert(depth >= 8);
  assert(depth <= 16);
  for (i = 0; i < width; ++i) {
    uint16_t* pair = dst_uv + 2 * i;
    pair[0] = (uint16_t)(src_u[i] << up);
    pair[1] = (uint16_t)(src_v[i] << up);
  }
}
2896 
// Convert msb formats to lsb, depending on sample depth.
// Splits a row of interleaved 16 bit pairs into two planes (first value to
// dst_u, second to dst_v) and shifts every sample right by 16 - depth.
// depth must be 8..16.
void SplitUVRow_16_C(const uint16_t* src_uv,
                     uint16_t* dst_u,
                     uint16_t* dst_v,
                     int depth,
                     int width) {
  const int down = 16 - depth;
  int i;
  assert(depth >= 8);
  assert(depth <= 16);
  for (i = 0; i < width; ++i) {
    const uint16_t* pair = src_uv + 2 * i;
    dst_u[i] = (uint16_t)(pair[0] >> down);
    dst_v[i] = (uint16_t)(pair[1] >> down);
  }
}
2913 
// Multiply every 16 bit sample by scale and store the low 16 bits of the
// product.
// The product is computed in unsigned 32 bit arithmetic: a signed int
// multiply would overflow (undefined behavior) for large sample * scale
// combinations, while the low 16 bits of the unsigned product are the same
// value and always well defined.
void MultiplyRow_16_C(const uint16_t* src_y,
                      uint16_t* dst_y,
                      int scale,
                      int width) {
  const uint32_t factor = (uint32_t)scale;
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)((uint32_t)src_y[x] * factor);
  }
}
2923 
// Scale every 16 bit sample by scale / 65536:
//   dst = (src * scale) >> 16, truncated to 16 bits.
// The product is computed in unsigned 32 bit arithmetic: a signed int
// multiply would overflow (undefined behavior) when src * scale exceeds
// INT_MAX, e.g. 65535 * 65535.  Bits 16..31 of the unsigned product are the
// same bits the signed form produces whenever that form is defined.
void DivideRow_16_C(const uint16_t* src_y,
                    uint16_t* dst_y,
                    int scale,
                    int width) {
  const uint32_t factor = (uint32_t)scale;
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * factor) >> 16);
  }
}
2933 
// Reduce 16 bit samples to 8 bits: dst = clamp255((src * scale) >> 16).
// scale must be in the range 256..32768 and is chosen from how many bits
// the source samples use:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
void Convert16To8Row_C(const uint16_t* src_y,
                       uint8_t* dst_y,
                       int scale,
                       int width) {
  int i;
  assert(scale >= 256);
  assert(scale <= 32768);

  for (i = 0; i < width; ++i) {
    const int scaled = (src_y[i] * scale) >> 16;
    dst_y[i] = clamp255(scaled);
  }
}
2951 
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 1024 = 10 bits
// Each 8 bit sample is expanded as (src * scale * 0x0101) >> 16; the
// multiply by 0x0101 replicates the byte, so a scale of 65536 yields
// src * 0x0101.
// The arithmetic is done in unsigned 32 bit: as a signed int the product
// overflows (undefined behavior) once src * scale * 0x0101 exceeds INT_MAX,
// which already happens for scale = 65536 with src >= 128.  Bits 16..31 of
// the unsigned product are the same bits the signed form produces whenever
// that form is defined.
void Convert8To16Row_C(const uint8_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width) {
  const uint32_t factor = (uint32_t)scale * 0x0101u;  // replicates the byte.
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * factor) >> 16);
  }
}
2964 
// Copy count bytes from src to dst with memcpy.
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
  const size_t num_bytes = (size_t)count;
  memcpy(dst, src, num_bytes);
}
2968 
// Copy count 16 bit values from src to dst with memcpy.
// The byte length is computed in size_t so that it cannot overflow int for
// large counts.
void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
  memcpy(dst, src, (size_t)count * sizeof(*src));
}
2972 
// Fill width bytes of dst with the value v8 using memset.
void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
  const size_t num_bytes = (size_t)width;
  memset(dst, v8, num_bytes);
}
2976 
// Fill a row with width copies of the 32 bit value v32.  Each copy is
// written with memcpy, in the host's byte order.
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
  uint8_t* out = dst_argb;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    memcpy(out, &v32, sizeof(v32));
    out += sizeof(v32);
  }
}
2983 
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
// For each 4 byte group, byte 1 (U) and byte 3 (V) are averaged, rounding
// up, with the same bytes of the row src_stride_yuy2 bytes further on.
// One U and one V are written per 2 pixels of width (rounded up).
void YUY2ToUVRow_C(const uint8_t* src_yuy2,
                   int src_stride_yuy2,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    const uint8_t* row0 = src_yuy2 + 2 * x;  // 4 source bytes per 2 pixels.
    const uint8_t* row1 = row0 + src_stride_yuy2;
    const int out = x / 2;
    dst_u[out] = (uint8_t)((row0[1] + row1[1] + 1) >> 1);
    dst_v[out] = (uint8_t)((row0[3] + row1[3] + 1) >> 1);
  }
}
3000 
// Copy row of YUY2 UV's (422) into U and V (422).
// For each 4 byte group, byte 1 is copied to dst_u and byte 3 to dst_v.
// One U and one V are written per 2 pixels of width (rounded up).
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    const uint8_t* group = src_yuy2 + 2 * x;  // 4 source bytes per 2 pixels.
    const int out = x / 2;
    dst_u[out] = group[1];
    dst_v[out] = group[3];
  }
}
3016 
// Copy row of YUY2 Y's (422) into Y (420/422).
// For each 4 byte group, bytes 0 and 2 are the two Y values.  An odd
// trailing pixel takes byte 0 of the next group.
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    dst_y[2 * i] = src_yuy2[0];
    dst_y[2 * i + 1] = src_yuy2[2];
    src_yuy2 += 4;
  }
  if (width & 1) {
    dst_y[width - 1] = src_yuy2[0];
  }
}
3030 
// Filter 2 rows of UYVY UV's (422) into U and V (420).
// For each 4 byte group, byte 0 (U) and byte 2 (V) are averaged, rounding
// up, with the same bytes of the row src_stride_uyvy bytes further on.
// One U and one V are written per 2 pixels of width (rounded up).
void UYVYToUVRow_C(const uint8_t* src_uyvy,
                   int src_stride_uyvy,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    const uint8_t* row0 = src_uyvy + 2 * x;  // 4 source bytes per 2 pixels.
    const uint8_t* row1 = row0 + src_stride_uyvy;
    const int out = x / 2;
    dst_u[out] = (uint8_t)((row0[0] + row1[0] + 1) >> 1);
    dst_v[out] = (uint8_t)((row0[2] + row1[2] + 1) >> 1);
  }
}
3047 
// Copy row of UYVY UV's (422) into U and V (422).
// For each 4 byte group, byte 0 is copied to dst_u and byte 2 to dst_v.
// One U and one V are written per 2 pixels of width (rounded up).
void UYVYToUV422Row_C(const uint8_t* src_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    const uint8_t* group = src_uyvy + 2 * x;  // 4 source bytes per 2 pixels.
    const int out = x / 2;
    dst_u[out] = group[0];
    dst_v[out] = group[2];
  }
}
3063 
// Copy row of UYVY Y's (422) into Y (420/422).
// For each 4 byte group, bytes 1 and 3 are the two Y values.  An odd
// trailing pixel takes byte 1 of the next group.
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    dst_y[2 * i] = src_uyvy[1];
    dst_y[2 * i + 1] = src_uyvy[3];
    src_uyvy += 4;
  }
  if (width & 1) {
    dst_y[width - 1] = src_uyvy[1];
  }
}
3077 
// Blend one channel: foreground f plus background b scaled by (256 - a),
// limited by clamp255.
#define BLEND(f, b, a) clamp255((((256 - (a)) * (b)) >> 8) + (f))

// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
// Bytes 0..2 of every pixel are blended using byte 3 of the src_argb pixel
// as alpha; byte 3 of the output is always 255.  Pixels are handled two per
// outer iteration, with an odd trailing pixel handled last.  All inputs of
// a pixel are read before any of its outputs are written.
void ARGBBlendRow_C(const uint8_t* src_argb,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  int pairs_left;
  for (pairs_left = width / 2; pairs_left > 0; --pairs_left) {
    int k;
    for (k = 0; k < 8; k += 4) {
      const uint32_t fore_b = src_argb[k + 0];
      const uint32_t fore_g = src_argb[k + 1];
      const uint32_t fore_r = src_argb[k + 2];
      const uint32_t alpha = src_argb[k + 3];
      const uint32_t back_b = src_argb1[k + 0];
      const uint32_t back_g = src_argb1[k + 1];
      const uint32_t back_r = src_argb1[k + 2];
      dst_argb[k + 0] = BLEND(fore_b, back_b, alpha);
      dst_argb[k + 1] = BLEND(fore_g, back_g, alpha);
      dst_argb[k + 2] = BLEND(fore_r, back_r, alpha);
      dst_argb[k + 3] = 255u;
    }
    src_argb += 8;
    src_argb1 += 8;
    dst_argb += 8;
  }

  if (width & 1) {
    const uint32_t fore_b = src_argb[0];
    const uint32_t fore_g = src_argb[1];
    const uint32_t fore_r = src_argb[2];
    const uint32_t alpha = src_argb[3];
    const uint32_t back_b = src_argb1[0];
    const uint32_t back_g = src_argb1[1];
    const uint32_t back_r = src_argb1[2];
    dst_argb[0] = BLEND(fore_b, back_b, alpha);
    dst_argb[1] = BLEND(fore_g, back_g, alpha);
    dst_argb[2] = BLEND(fore_r, back_r, alpha);
    dst_argb[3] = 255u;
  }
}
#undef BLEND
3132 
// Blend two planes with a per-byte alpha plane:
//   dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8
// Bytes are handled two per iteration, with an odd trailing byte last.
void BlendPlaneRow_C(const uint8_t* src0,
                     const uint8_t* src1,
                     const uint8_t* alpha,
                     uint8_t* dst,
                     int width) {
  const int pairs = width / 2;
  int i;
  for (i = 0; i < pairs; ++i) {
    const int o = 2 * i;
    dst[o] =
        ((alpha[o] * src0[o]) + ((255 - alpha[o]) * src1[o]) + 255) >> 8;
    dst[o + 1] = ((alpha[o + 1] * src0[o + 1]) +
                  ((255 - alpha[o + 1]) * src1[o + 1]) + 255) >>
                 8;
  }
  if (width & 1) {
    const int o = 2 * (pairs > 0 ? pairs : 0);
    dst[o] =
        ((alpha[o] * src0[o]) + ((255 - alpha[o]) * src1[o]) + 255) >> 8;
  }
}
3153 
#if defined(__aarch64__) || defined(__arm__)
// ARM builds use a rounded 8-bit product.
#define ATTENUATE(f, a) (((f) * (a) + 128) >> 8)
#else
// This code mimics the SSSE3 version for better testability: both operands
// are widened to 16 bits by byte replication before the multiply.
#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
#endif

// Multiply source RGB by alpha and store to destination.
// The alpha byte itself is copied through unchanged.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  const uint8_t* in = src_argb;
  uint8_t* out = dst_argb;
  int x;
  // Two pixels (8 bytes) per iteration.
  for (x = 0; x < width - 1; x += 2) {
    uint32_t blue = in[0];
    uint32_t green = in[1];
    uint32_t red = in[2];
    uint32_t alpha = in[3];
    out[0] = ATTENUATE(blue, alpha);
    out[1] = ATTENUATE(green, alpha);
    out[2] = ATTENUATE(red, alpha);
    out[3] = alpha;

    blue = in[4];
    green = in[5];
    red = in[6];
    alpha = in[7];
    out[4] = ATTENUATE(blue, alpha);
    out[5] = ATTENUATE(green, alpha);
    out[6] = ATTENUATE(red, alpha);
    out[7] = alpha;

    in += 8;
    out += 8;
  }

  // Trailing pixel for odd widths.
  if (width & 1) {
    const uint32_t blue = in[0];
    const uint32_t green = in[1];
    const uint32_t red = in[2];
    const uint32_t alpha = in[3];
    out[0] = ATTENUATE(blue, alpha);
    out[1] = ATTENUATE(green, alpha);
    out[2] = ATTENUATE(red, alpha);
    out[3] = alpha;
  }
}
#undef ATTENUATE
3197 
// Divide source RGB by alpha and store to destination.
//   b = (b * 255 + (a / 2)) / a;
//   g = (g * 255 + (a / 2)) / a;
//   r = (r * 255 + (a / 2)) / a;
// The reciprocal method is off by 1 on some values, e.g. 125.
// Each entry holds 1.0 in the upper 16 bits and 1 / a (8.8 fixed point) in
// the lower 16 bits. Entries 0, 1 and 255 are written out explicitly.
#define T(a) (0x01000000 + (0x10000 / (a)))
const uint32_t fixed_invtbl8[256] = {
    0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
    T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
    T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
    T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
    T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
    T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
    T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
    T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
    T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
    T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
    T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
    T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
    T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
    T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
    T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
    T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
    T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
    T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
    T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
    T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
    T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
    T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
    T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
    T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
    T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
    T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
    T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
    T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
    T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
    T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
    T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
    T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100};
#undef T
3244 
ARGBUnattenuateRow_C(const uint8_t * src_argb,uint8_t * dst_argb,int width)3245 void ARGBUnattenuateRow_C(const uint8_t* src_argb,
3246                           uint8_t* dst_argb,
3247                           int width) {
3248   int i;
3249   for (i = 0; i < width; ++i) {
3250     uint32_t b = src_argb[0];
3251     uint32_t g = src_argb[1];
3252     uint32_t r = src_argb[2];
3253     const uint32_t a = src_argb[3];
3254     const uint32_t ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
3255     b = (b * ia) >> 8;
3256     g = (g * ia) >> 8;
3257     r = (r * ia) >> 8;
3258     // Clamping should not be necessary but is free in assembly.
3259     dst_argb[0] = clamp255(b);
3260     dst_argb[1] = clamp255(g);
3261     dst_argb[2] = clamp255(r);
3262     dst_argb[3] = a;
3263     src_argb += 4;
3264     dst_argb += 4;
3265   }
3266 }
3267 
// Builds one row of a 4-channel cumulative sum table: each output entry is
// the running sum of this row's channel values up to and including the
// current pixel, plus the entry at the same position in previous_cumsum.
void ComputeCumulativeSumRow_C(const uint8_t* row,
                               int32_t* cumsum,
                               const int32_t* previous_cumsum,
                               int width) {
  int32_t running[4] = {0, 0, 0, 0};
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    // Accumulate all four channels of this pixel first...
    for (c = 0; c < 4; ++c) {
      running[c] += row[c];
    }
    // ...then emit the four sums.
    for (c = 0; c < 4; ++c) {
      cumsum[c] = running[c] + previous_cumsum[c];
    }
    row += 4;
    cumsum += 4;
    previous_cumsum += 4;
  }
}
3285 
// Converts cumulative sums to averages. For each of |count| output pixels
// and each of 4 channels, the four-corner difference
// bl[w] + tl[0] - bl[0] - tl[w] is multiplied by 1 / area and stored as a
// byte. |w| is an offset in int32 elements between the two corner columns.
void CumulativeSumToAverageRow_C(const int32_t* tl,
                                 const int32_t* bl,
                                 int w,
                                 int area,
                                 uint8_t* dst,
                                 int count) {
  const float inv_area = 1.0f / area;
  int i;
  for (i = 0; i < count; ++i) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst[c] = (uint8_t)((bl[w + c] + tl[c] - bl[c] - tl[w + c]) * inv_area);
    }
    dst += 4;
    tl += 4;
    bl += 4;
  }
}
3304 
3305 // Copy pixels from rotated source to destination row with a slope.
3306 LIBYUV_API
ARGBAffineRow_C(const uint8_t * src_argb,int src_argb_stride,uint8_t * dst_argb,const float * uv_dudv,int width)3307 void ARGBAffineRow_C(const uint8_t* src_argb,
3308                      int src_argb_stride,
3309                      uint8_t* dst_argb,
3310                      const float* uv_dudv,
3311                      int width) {
3312   int i;
3313   // Render a row of pixels from source into a buffer.
3314   float uv[2];
3315   uv[0] = uv_dudv[0];
3316   uv[1] = uv_dudv[1];
3317   for (i = 0; i < width; ++i) {
3318     int x = (int)(uv[0]);
3319     int y = (int)(uv[1]);
3320     *(uint32_t*)(dst_argb) =
3321         *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
3322     dst_argb += 4;
3323     uv[0] += uv_dudv[2];
3324     uv[1] += uv_dudv[3];
3325   }
3326 }
3327 
// Blend 2 rows into 1: each output byte is the rounded average of the source
// byte and the byte |src_uv_stride| bytes after it.
static void HalfRow_C(const uint8_t* src_uv,
                      ptrdiff_t src_uv_stride,
                      uint8_t* dst_uv,
                      int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    *dst_uv = (src_uv[0] + src_uv[src_uv_stride] + 1) >> 1;
    ++src_uv;
    ++dst_uv;
  }
}
3338 
// 16-bit variant of the 2-row blend: each output sample is the rounded
// average of the source sample and the sample |src_uv_stride| elements after
// it.
static void HalfRow_16_C(const uint16_t* src_uv,
                         ptrdiff_t src_uv_stride,
                         uint16_t* dst_uv,
                         int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    *dst_uv = (src_uv[0] + src_uv[src_uv_stride] + 1) >> 1;
    ++src_uv;
    ++dst_uv;
  }
}
3348 
// C version 2x2 -> 2x1.
// Blends the row at src_ptr with the row at src_ptr + src_stride.
// source_y_fraction is the weight (out of 256) given to the second row;
// 0 degenerates to a plain copy and 128 to HalfRow_C.
void InterpolateRow_C(uint8_t* dst_ptr,
                      const uint8_t* src_ptr,
                      ptrdiff_t src_stride,
                      int width,
                      int source_y_fraction) {
  const int frac1 = source_y_fraction;
  const int frac0 = 256 - frac1;
  const uint8_t* row0 = src_ptr;
  const uint8_t* row1 = src_ptr + src_stride;
  int x;
  switch (frac1) {
    case 0:
      // Second row has zero weight: copy the first row.
      memcpy(dst_ptr, src_ptr, width);
      return;
    case 128:
      // Equal weights: use the dedicated averaging helper.
      HalfRow_C(src_ptr, src_stride, dst_ptr, width);
      return;
    default:
      break;
  }
  // General case, two samples per iteration, rounded to nearest.
  for (x = 0; x < width - 1; x += 2) {
    dst_ptr[x] = (row0[x] * frac0 + row1[x] * frac1 + 128) >> 8;
    dst_ptr[x + 1] = (row0[x + 1] * frac0 + row1[x + 1] * frac1 + 128) >> 8;
  }
  if (width & 1) {
    dst_ptr[x] = (row0[x] * frac0 + row1[x] * frac1 + 128) >> 8;
  }
}
3381 
// C version 2x2 -> 2x1 for 16-bit samples.
// Blends the row at src_ptr with the row src_stride elements after it.
// source_y_fraction is the weight (out of 256) given to the second row;
// 0 degenerates to a plain copy and 128 to HalfRow_16_C.
void InterpolateRow_16_C(uint16_t* dst_ptr,
                         const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  int x;
  if (y1_fraction == 0) {
    // Compute the byte count in size_t so a large width cannot overflow int.
    memcpy(dst_ptr, src_ptr, (size_t)width * sizeof(*dst_ptr));
    return;
  }
  if (y1_fraction == 128) {
    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
    return;
  }
  // Add 128 (half of the 256 divisor) before shifting so the result is
  // rounded to nearest, matching the 8-bit InterpolateRow_C and the rounding
  // already done by the fraction-128 path above. Without it, results were
  // truncated for every other fraction.
  for (x = 0; x < width - 1; x += 2) {
    dst_ptr[0] = (uint16_t)(
        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8);
    dst_ptr[1] = (uint16_t)(
        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8);
    src_ptr += 2;
    src_ptr1 += 2;
    dst_ptr += 2;
  }
  if (width & 1) {
    dst_ptr[0] = (uint16_t)(
        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8);
  }
}
3410 
// Use first 4 shuffler values to reorder ARGB channels.
// Output byte k of every pixel is taken from input byte shuffler[k] of the
// same pixel.
void ARGBShuffleRow_C(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      const uint8_t* shuffler,
                      int width) {
  const int sel0 = shuffler[0];
  const int sel1 = shuffler[1];
  const int sel2 = shuffler[2];
  const int sel3 = shuffler[3];
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Fetch all four bytes before storing any, to support in-place
    // conversion.
    const uint8_t c0 = src_argb[sel0];
    const uint8_t c1 = src_argb[sel1];
    const uint8_t c2 = src_argb[sel2];
    const uint8_t c3 = src_argb[sel3];
    dst_argb[0] = c0;
    dst_argb[1] = c1;
    dst_argb[2] = c2;
    dst_argb[3] = c3;
    src_argb += 4;
    dst_argb += 4;
  }
}
3436 
// Interleaves planar Y, U and V into 4-byte groups ordered Y0 U Y1 V.
// One U and one V sample are consumed per two Y samples. For an odd width
// the final group's second Y byte is written as 0.
void I422ToYUY2Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width) {
  int x = 0;
  while (x < width - 1) {
    *dst_frame++ = src_y[0];
    *dst_frame++ = *src_u++;
    *dst_frame++ = src_y[1];
    *dst_frame++ = *src_v++;
    src_y += 2;
    x += 2;
  }
  if (width & 1) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = 0;
    dst_frame[3] = src_v[0];
  }
}
3460 
// Interleaves planar Y, U and V into 4-byte groups ordered U Y0 V Y1.
// One U and one V sample are consumed per two Y samples. For an odd width
// the final group's second Y byte is written as 0.
void I422ToUYVYRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width) {
  int x = 0;
  while (x < width - 1) {
    *dst_frame++ = *src_u++;
    *dst_frame++ = src_y[0];
    *dst_frame++ = *src_v++;
    *dst_frame++ = src_y[1];
    src_y += 2;
    x += 2;
  }
  if (width & 1) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = 0;
  }
}
3484 
// Applies a cubic polynomial to every channel of every pixel.
// poly holds 16 floats laid out as four groups of four (one value per
// channel): constant terms at [0..3], linear at [4..7], quadratic at
// [8..11] and cubic at [12..15]. Each result is converted to int32 and
// passed through Clamp() before being stored.
void ARGBPolynomialRow_C(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const float* poly,
                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    float result[4];
    int c;
    // Evaluate all four channels before storing any of them.
    for (c = 0; c < 4; ++c) {
      const float v = (float)(src_argb[c]);
      const float v2 = v * v;
      const float v3 = v2 * v;
      float acc = poly[c] + poly[c + 4] * v;
      acc += poly[c + 8] * v2;
      acc += poly[c + 12] * v3;
      result[c] = acc;
    }
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = Clamp((int32_t)(result[c]));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
3524 
// Samples are assumed to be unsigned in the low 9, 10 or 12 bits. The scale
// factor adjusts the source integer range to the desired half float range.

// The magic constant below is 2^-112. Multiplying by it is the same as
// subtracting 112 from the exponent, which is the difference in exponent
// bias between 32-bit and 16-bit floats. Once that is done, the low bits of
// the exponent and the high bits of the mantissa can simply be extracted
// from the 32-bit float.

// Alias-safe uint32_t type (works around the GCC 7 -Wstrict-aliasing punning
// warning). HalfFloatRow_C below no longer needs it; it is retained in case
// other code refers to it.
#if defined(__GNUC__)
typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
#else
typedef uint32_t uint32_alias_t;
#endif

// Converts |width| 16-bit integer samples to half-float bit patterns.
// Each sample is multiplied by scale * 2^-112 as a 32-bit float and the
// float's bit pattern shifted right by 13 is stored.
void HalfFloatRow_C(const uint16_t* src,
                    uint16_t* dst,
                    float scale,
                    int width) {
  int i;
  const float mult = 1.9259299444e-34f * scale;
  for (i = 0; i < width; ++i) {
    const float value = src[i] * mult;
    uint32_t bits;
    // Read the float's representation with memcpy rather than a pointer
    // cast, so the conversion does not depend on a compiler-specific
    // may_alias attribute.
    memcpy(&bits, &value, sizeof(bits));
    dst[i] = (uint16_t)(bits >> 13);
  }
}
3553 
// Converts |width| bytes to floats, multiplying each by |scale|.
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const float scaled = *src * scale;
    *dst = scaled;
    ++src;
    ++dst;
  }
}
3561 
// Remaps B, G and R of every pixel through a 256-entry row of |luma|.
// The row is selected per pixel: the weighted sum of the pixel's B, G and R
// (weights are bytes 0, 1 and 2 of |lumacoeff|) is masked with 0x7F00 and
// used as a byte offset into |luma|. Alpha is copied unchanged.
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width,
                             const uint8_t* luma,
                             uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;

  int x;
  for (x = 0; x < width - 1; x += 2) {
    int p;
    // Two pixels per iteration; p is the byte offset of each pixel.
    for (p = 0; p < 8; p += 4) {
      // Luminance in rows, color values in columns.
      const uint8_t* table =
          ((src_argb[p] * bc + src_argb[p + 1] * gc + src_argb[p + 2] * rc) &
           0x7F00u) +
          luma;
      dst_argb[p] = table[src_argb[p]];
      dst_argb[p + 1] = table[src_argb[p + 1]];
      dst_argb[p + 2] = table[src_argb[p + 2]];
      dst_argb[p + 3] = src_argb[p + 3];
    }
    src_argb += 8;
    dst_argb += 8;
  }
  if (width & 1) {
    // Luminance in rows, color values in columns.
    const uint8_t* table =
        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
        luma;
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];
  }
}
3603 
// Copies byte 3 of every 4-byte source pixel into byte 3 of the matching
// destination pixel; the other destination bytes are left untouched.
void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int x = 0;
  // Two pixels per iteration.
  while (x < width - 1) {
    dst[3] = src[3];
    dst[7] = src[7];
    src += 8;
    dst += 8;
    x += 2;
  }
  // Trailing pixel for odd widths.
  if (width & 1) {
    dst[3] = src[3];
  }
}
3616 
// Gathers byte 3 of every 4-byte source pixel into a packed byte row.
void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
  int x = 0;
  // Two pixels per iteration.
  while (x < width - 1) {
    dst_a[x] = src_argb[3];
    dst_a[x + 1] = src_argb[7];
    src_argb += 8;
    x += 2;
  }
  // Trailing pixel for odd widths.
  if (width & 1) {
    dst_a[x] = src_argb[3];
  }
}
3629 
// Scatters a packed byte row into byte 3 of every 4-byte destination pixel;
// the other destination bytes are left untouched.
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int x = 0;
  // Two pixels per iteration.
  while (x < width - 1) {
    dst[3] = src[x];
    dst[7] = src[x + 1];
    dst += 8;
    x += 2;
  }
  // Trailing pixel for odd widths.
  if (width & 1) {
    dst[3] = src[x];
  }
}
3642 
3643 // Maximum temporary width for wrappers to process at a time, in pixels.
3644 #define MAXTWIDTH 2048
3645 
#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \
    defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
// Processes the row in chunks of at most MAXTWIDTH pixels: each chunk is
// passed to I422ToARGBRow_SSSE3 to fill a temporary ARGB buffer, which is
// then passed to ARGBToRGB565Row_SSE2 to produce the output.
void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // U and V advance half as fast as Y.
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
3668 
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// I422ToARGBRow_SSSE3 to fill a temporary ARGB buffer, which is then passed
// to ARGBToARGB1555Row_SSE2 to produce the output.
void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // U and V advance half as fast as Y.
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
3690 
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// I422ToARGBRow_SSSE3 to fill a temporary ARGB buffer, which is then passed
// to ARGBToARGB4444Row_SSE2 to produce the output.
void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
    src_y += chunk;
    src_u += chunk / 2;  // U and V advance half as fast as Y.
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
3712 
#if defined(HAS_NV12TORGB565ROW_SSSE3)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// NV12ToARGBRow_SSSE3 to fill a temporary ARGB buffer, which is then passed
// to ARGBToRGB565Row_SSE2 to produce the output.
void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
                           const uint8_t* src_uv,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_SSSE3(src_y, src_uv, argb_tmp, yuvconstants, chunk);
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
    src_y += chunk;
    src_uv += chunk;
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
3732 
#if defined(HAS_NV12TORGB24ROW_SSSE3)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// NV12ToARGBRow_SSSE3 to fill a temporary ARGB buffer, which is then passed
// to ARGBToRGB24Row_SSSE3 to produce the output.
void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_SSSE3(src_y, src_uv, argb_tmp, yuvconstants, chunk);
    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
    src_y += chunk;
    src_uv += chunk;
    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
  }
}
#endif
3752 
#if defined(HAS_NV21TORGB24ROW_SSSE3)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// NV21ToARGBRow_SSSE3 to fill a temporary ARGB buffer, which is then passed
// to ARGBToRGB24Row_SSSE3 to produce the output.
void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV21ToARGBRow_SSSE3(src_y, src_vu, argb_tmp, yuvconstants, chunk);
    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
    src_y += chunk;
    src_vu += chunk;
    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
  }
}
#endif
3772 
#if defined(HAS_NV12TORGB24ROW_AVX2)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// NV12ToARGBRow_AVX2 to fill a temporary ARGB buffer, which is then passed
// to ARGBToRGB24Row_AVX2 when HAS_ARGBTORGB24ROW_AVX2 is defined, otherwise
// to ARGBToRGB24Row_SSSE3.
void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_AVX2(src_y, src_uv, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(argb_tmp, dst_rgb24, chunk);
#else
    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
#endif
    src_y += chunk;
    src_uv += chunk;
    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
  }
}
#endif
3796 
#if defined(HAS_NV21TORGB24ROW_AVX2)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// NV21ToARGBRow_AVX2 to fill a temporary ARGB buffer, which is then passed
// to ARGBToRGB24Row_AVX2 when HAS_ARGBTORGB24ROW_AVX2 is defined, otherwise
// to ARGBToRGB24Row_SSSE3.
void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV21ToARGBRow_AVX2(src_y, src_vu, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(argb_tmp, dst_rgb24, chunk);
#else
    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
#endif
    src_y += chunk;
    src_vu += chunk;
    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
  }
}
#endif
3820 
#if defined(HAS_I422TORGB565ROW_AVX2)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// I422ToARGBRow_AVX2 to fill a temporary ARGB buffer, which is then passed
// to ARGBToRGB565Row_AVX2 when HAS_ARGBTORGB565ROW_AVX2 is defined,
// otherwise to ARGBToRGB565Row_SSE2.
void I422ToRGB565Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
#else
    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;  // U and V advance half as fast as Y.
    src_v += chunk / 2;
    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
3845 
#if defined(HAS_I422TOARGB1555ROW_AVX2)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// I422ToARGBRow_AVX2 to fill a temporary ARGB buffer, which is then passed
// to ARGBToARGB1555Row_AVX2 when HAS_ARGBTOARGB1555ROW_AVX2 is defined,
// otherwise to ARGBToARGB1555Row_SSE2.
void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
    ARGBToARGB1555Row_AVX2(argb_tmp, dst_argb1555, chunk);
#else
    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;  // U and V advance half as fast as Y.
    src_v += chunk / 2;
    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
3871 
#if defined(HAS_I422TOARGB4444ROW_AVX2)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// I422ToARGBRow_AVX2 to fill a temporary ARGB buffer, which is then passed
// to ARGBToARGB4444Row_AVX2 when HAS_ARGBTOARGB4444ROW_AVX2 is defined,
// otherwise to ARGBToARGB4444Row_SSE2.
void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
    ARGBToARGB4444Row_AVX2(argb_tmp, dst_argb4444, chunk);
#else
    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;  // U and V advance half as fast as Y.
    src_v += chunk / 2;
    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel.
  }
}
#endif
3897 
#if defined(HAS_I422TORGB24ROW_AVX2)
// Two-step wrapper: each chunk of at most MAXTWIDTH pixels is passed to
// I422ToARGBRow_AVX2 to fill a temporary ARGB buffer, which is then passed
// to ARGBToRGB24Row_AVX2 when HAS_ARGBTORGB24ROW_AVX2 is defined, otherwise
// to ARGBToRGB24Row_SSSE3.
void I422ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Scratch buffer for one chunk of intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t argb_tmp[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(argb_tmp, dst_rgb24, chunk);
#else
    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
#endif
    src_y += chunk;
    src_u += chunk / 2;  // U and V advance half as fast as Y.
    src_v += chunk / 2;
    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
  }
}
#endif
3923 
#if defined(HAS_NV12TORGB565ROW_AVX2)
// Two-stage row conversion to RGB565: the Y and interleaved UV inputs are
// first expanded to ARGB in a scratch buffer, which is then packed into
// dst_rgb565. The row is handled in chunks of at most MAXTWIDTH pixels so the
// scratch buffer has a fixed size.
void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Scratch ARGB pixels for one chunk (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, chunk);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(row, dst_rgb565, chunk);
#else
    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
#endif
    // Y and UV both advance 1 byte per pixel; the output 2 bytes per pixel.
    dst_rgb565 += chunk * 2;
    src_uv += chunk;
    src_y += chunk;
  }
}
#endif
3947 
#ifdef HAS_RGB24TOYJROW_AVX2
// Convert a row of RGB24 pixels (3 bytes each) to YJ values (1 byte each).
// Each chunk of at most MAXTWIDTH pixels is expanded to ARGB in a scratch
// buffer and then reduced to YJ.
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  // Scratch ARGB pixels for one chunk (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RGB24ToARGBRow_SSSE3(src_rgb24, row, chunk);
    ARGBToYJRow_AVX2(row, dst_yj, chunk);
    dst_yj += chunk;
    src_rgb24 += chunk * 3;
  }
}
#endif  // HAS_RGB24TOYJROW_AVX2
3963 
#ifdef HAS_RAWTOYJROW_AVX2
// Convert a row of RAW pixels (3 bytes each) to YJ values (1 byte each).
// Each chunk of at most MAXTWIDTH pixels is expanded to ARGB in a scratch
// buffer and then reduced to YJ.
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  // Scratch ARGB pixels for one chunk (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RAWToARGBRow_SSSE3(src_raw, row, chunk);
    ARGBToYJRow_AVX2(row, dst_yj, chunk);
    dst_yj += chunk;
    src_raw += chunk * 3;
  }
}
#endif  // HAS_RAWTOYJROW_AVX2
3979 
#ifdef HAS_RGB24TOYJROW_SSSE3
// Convert a row of RGB24 pixels (3 bytes each) to YJ values (1 byte each).
// Each chunk of at most MAXTWIDTH pixels is expanded to ARGB in a scratch
// buffer and then reduced to YJ.
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  // Scratch ARGB pixels for one chunk (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RGB24ToARGBRow_SSSE3(src_rgb24, row, chunk);
    ARGBToYJRow_SSSE3(row, dst_yj, chunk);
    dst_yj += chunk;
    src_rgb24 += chunk * 3;
  }
}
#endif  // HAS_RGB24TOYJROW_SSSE3
3995 
#ifdef HAS_RAWTOYJROW_SSSE3
// Convert a row of RAW pixels (3 bytes each) to YJ values (1 byte each).
// Each chunk of at most MAXTWIDTH pixels is expanded to ARGB in a scratch
// buffer and then reduced to YJ.
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  // Scratch ARGB pixels for one chunk (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int chunk;
  for (; width > 0; width -= chunk) {
    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RAWToARGBRow_SSSE3(src_raw, row, chunk);
    ARGBToYJRow_SSSE3(row, dst_yj, chunk);
    dst_yj += chunk;
    src_raw += chunk * 3;
  }
}
#endif  // HAS_RAWTOYJROW_SSSE3
4011 
// Writes src[i] * scale to dst[i] for `width` samples and returns the sum of
// the squares of the unscaled source samples.
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
  float sum_sq = 0.f;
  int x = 0;
  while (x < width) {
    const float sample = src[x];
    sum_sq += sample * sample;
    dst[x] = sample * scale;
    ++x;
  }
  return sum_sq;
}
4022 
// Writes src[i] * scale to dst[i] for `width` samples and returns the largest
// unscaled source sample. The running maximum starts at 0, so the result is
// never negative.
float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
  float peak = 0.f;
  int x;
  for (x = 0; x < width; ++x) {
    const float sample = src[x];
    const float scaled = sample * scale;
    if (sample > peak) {
      peak = sample;
    }
    dst[x] = scaled;
  }
  return peak;
}
4034 
// Writes src[i] * scale to dst[i] for `width` samples.
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
  int x = 0;
  while (x < width) {
    dst[x] = src[x] * scale;
    ++x;
  }
}
4041 
// Filter 5 adjacent values of a row with 1, 4, 6, 4, 1 coefficients, add 128
// for rounding and shift right by 8. Reads width + 4 values from src and
// writes width values to dst.
void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t* tap = src + x;
    // Unsigned 32 bit arithmetic wraps, so grouping the symmetric taps does
    // not change the result.
    const uint32_t sum =
        tap[0] + tap[4] + (tap[1] + tap[3]) * 4 + tap[2] * 6 + 128;
    dst[x] = (uint16_t)(sum >> 8);
  }
}
4050 
// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// The weighted sum is stored as is; no rounding or shift is applied here.
void GaussCol_C(const uint16_t* src0,
                const uint16_t* src1,
                const uint16_t* src2,
                const uint16_t* src3,
                const uint16_t* src4,
                uint32_t* dst,
                int width) {
  int x = 0;
  while (x < width) {
    dst[x] = src0[x] + src1[x] * 4 + src2[x] * 6 + src3[x] * 4 + src4[x];
    ++x;
  }
}
4064 
// Filter 5 adjacent floats of a row with 1, 4, 6, 4, 1 coefficients and scale
// the sum by 1/256. Reads width + 4 values from src and writes width values
// to dst.
void GaussRow_F32_C(const float* src, float* dst, int width) {
  const float kScale = 1.0f / 256.0f;
  int x;
  for (x = 0; x < width; ++x) {
    const float* tap = src + x;
    // Summation order is kept left to right so rounding is unchanged.
    dst[x] =
        (tap[0] + tap[1] * 4 + tap[2] * 6 + tap[3] * 4 + tap[4]) * kScale;
  }
}
4073 
// Filter 5 rows of floats with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// The weighted sum is stored as is; no scaling is applied here.
void GaussCol_F32_C(const float* src0,
                    const float* src1,
                    const float* src2,
                    const float* src3,
                    const float* src4,
                    float* dst,
                    int width) {
  int x = 0;
  while (x < width) {
    // Summation order is kept left to right so rounding is unchanged.
    dst[x] = src0[x] + src1[x] * 4 + src2[x] * 6 + src3[x] * 4 + src4[x];
    ++x;
  }
}
4087 
// Convert biplanar NV21 to packed YUV24.
// Each output pixel is 3 bytes in V, U, Y order. One VU pair from src_vu is
// shared by 2 horizontally adjacent pixels; an odd trailing pixel uses the
// next VU pair.
void NV21ToYUV24Row_C(const uint8_t* src_y,
                      const uint8_t* src_vu,
                      uint8_t* dst_yuv24,
                      int width) {
  int x = 0;
  while (x < width - 1) {
    uint8_t* out = dst_yuv24 + x * 3;
    const uint8_t* vu = src_vu + x;
    out[0] = vu[0];         // V
    out[1] = vu[1];         // U
    out[2] = src_y[x];      // Y0
    out[3] = vu[0];         // V
    out[4] = vu[1];         // U
    out[5] = src_y[x + 1];  // Y1
    x += 2;                 // Advance 2 pixels.
  }
  if (width & 1) {
    uint8_t* out = dst_yuv24 + x * 3;
    out[0] = src_vu[x];      // V
    out[1] = src_vu[x + 1];  // U
    out[2] = src_y[x];       // Y0
  }
}
4111 
// Filter 2 rows of AYUV UV's (444) into UV (420).
// AYUV is VUYA in memory.  UV for NV12 is UV order in memory.
// Each output pair is the rounded average of a 2x2 block; for odd width the
// last pair averages only the 2 vertically adjacent pixels.
void AYUVToUVRow_C(const uint8_t* src_ayuv,
                   int src_stride_ayuv,
                   uint8_t* dst_uv,
                   int width) {
  const uint8_t* row0 = src_ayuv;
  const uint8_t* row1 = src_ayuv + src_stride_ayuv;
  int x = 0;
  while (x < width - 1) {
    const int u_sum = row0[1] + row0[5] + row1[1] + row1[5];
    dst_uv[0] = (uint8_t)((u_sum + 2) >> 2);
    const int v_sum = row0[0] + row0[4] + row1[0] + row1[4];
    dst_uv[1] = (uint8_t)((v_sum + 2) >> 2);
    row0 += 8;
    row1 += 8;
    dst_uv += 2;
    x += 2;
  }
  if (width & 1) {
    dst_uv[0] = (uint8_t)((row0[1] + row1[1] + 1) >> 1);
    dst_uv[1] = (uint8_t)((row0[0] + row1[0] + 1) >> 1);
  }
}
4135 
// Filter 2 rows of AYUV UV's (444) into VU (420).
// Each output pair is the rounded average of a 2x2 block, written V first
// then U; for odd width the last pair averages only the 2 vertically adjacent
// pixels.
void AYUVToVURow_C(const uint8_t* src_ayuv,
                   int src_stride_ayuv,
                   uint8_t* dst_vu,
                   int width) {
  const uint8_t* row0 = src_ayuv;
  const uint8_t* row1 = src_ayuv + src_stride_ayuv;
  int x = 0;
  while (x < width - 1) {
    const int v_sum = row0[0] + row0[4] + row1[0] + row1[4];
    dst_vu[0] = (uint8_t)((v_sum + 2) >> 2);
    const int u_sum = row0[1] + row0[5] + row1[1] + row1[5];
    dst_vu[1] = (uint8_t)((u_sum + 2) >> 2);
    row0 += 8;
    row1 += 8;
    dst_vu += 2;
    x += 2;
  }
  if (width & 1) {
    dst_vu[0] = (uint8_t)((row0[0] + row1[0] + 1) >> 1);
    dst_vu[1] = (uint8_t)((row0[1] + row1[1] + 1) >> 1);
  }
}
4158 
// Copy row of AYUV Y's into Y.
// Y is the third byte (offset 2) of each 4 byte pixel.
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
  int x = 0;
  while (x < width) {
    dst_y[x] = src_ayuv[x * 4 + 2];  // v,u,y,a
    ++x;
  }
}
4168 
// Convert UV plane of NV12 to VU of NV21.
// Swaps the 2 bytes of each of `width` pairs. Both bytes of a pair are read
// before either is written.
void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  int x = 0;
  while (x < width) {
    const uint8_t first = src_uv[2 * x];
    const uint8_t second = src_uv[2 * x + 1];
    dst_vu[2 * x] = second;
    dst_vu[2 * x + 1] = first;
    ++x;
  }
}
4181 
// Averages 2x2 blocks of the U plane and of the V plane (with rounding) and
// writes the results interleaved as U, V pairs. For odd width the last pair
// averages only the 2 vertically adjacent samples of each plane.
void HalfMergeUVRow_C(const uint8_t* src_u,
                      int src_stride_u,
                      const uint8_t* src_v,
                      int src_stride_v,
                      uint8_t* dst_uv,
                      int width) {
  const uint8_t* u0 = src_u;
  const uint8_t* u1 = src_u + src_stride_u;
  const uint8_t* v0 = src_v;
  const uint8_t* v1 = src_v + src_stride_v;
  int x = 0;
  while (x < width - 1) {
    const int u_sum = u0[0] + u0[1] + u1[0] + u1[1];
    dst_uv[0] = (uint8_t)((u_sum + 2) >> 2);
    const int v_sum = v0[0] + v0[1] + v1[0] + v1[1];
    dst_uv[1] = (uint8_t)((v_sum + 2) >> 2);
    u0 += 2;
    u1 += 2;
    v0 += 2;
    v1 += 2;
    dst_uv += 2;
    x += 2;
  }
  if (width & 1) {
    dst_uv[0] = (uint8_t)((u0[0] + u1[0] + 1) >> 1);
    dst_uv[1] = (uint8_t)((v0[0] + v1[0] + 1) >> 1);
  }
}
4205 
4206 #ifdef __cplusplus
4207 }  // extern "C"
4208 }  // namespace libyuv
4209 #endif
4210