/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPEG full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
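
// For reference, rows using the vectors above compute approximately the
// scalar fixed-point luma sketched below (illustration only, not part of the
// build; the SIMD rounding differs slightly):
#if 0
static int RGBToY_sketch(int r, int g, int b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;  // BT.601 limited range
}
static int RGBToYJ_sketch(int r, int g, int b) {
  return (77 * r + 150 * g + 29 * b + 128) >> 8;  // JPEG full range
}
#endif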

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
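
// Likewise, the U/V vectors above correspond approximately to the scalar
// chroma computations sketched here (illustration only, not part of the
// build):
#if 0
static int RGBToU_sketch(int r, int g, int b) {
  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static int RGBToV_sketch(int r, int g, int b) {
  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
#endif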

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
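
// In effect (scalar sketch, assumed equivalent): each 3-byte B,G,R pixel
// becomes a 4-byte B,G,R,A pixel with alpha forced to 0xff.
#if 0
static void RGB24ToARGBRow_sketch(const uint8_t* src_rgb24, uint8_t* dst_argb,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = src_rgb24[3 * x + 0];  // B
    dst_argb[4 * x + 1] = src_rgb24[3 * x + 1];  // G
    dst_argb[4 * x + 2] = src_rgb24[3 * x + 2];  // R
    dst_argb[4 * x + 3] = 0xff;                  // A
  }
}
#endif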

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u,  4u,  3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
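
// For example, given YUY2 bytes {Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...}, kShuffleYUY2Y
// duplicates each Y byte (indices 0,0,2,2,...) and kShuffleYUY2UV repeats
// each U,V byte pair (indices 1,3,1,3,...) so every output pixel gets a UV
// pair. The UYVY and NV21 tables do the same for their byte orders.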
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm0,%%xmm0                 \n"
      "punpckhwd   %%xmm1,%%xmm1                 \n"
      "por         %%xmm5,%%xmm0                 \n"
      "por         %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
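
// Scalar sketch of the row above (assumed equivalent): each gray sample is
// replicated into B, G and R, with alpha forced to 0xff by the 0xff000000
// mask built in xmm5. Illustration only, not part of the build.
#if 0
static void J400ToARGBRow_sketch(const uint8_t* src_y, uint8_t* dst_argb,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t y = src_y[x];
    ((uint32_t*)dst_argb)[x] = 0xff000000u | (y << 16) | (y << 8) | y;
  }
}
#endif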
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb24),              // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_argb),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Same code as RAWToARGBRow_SSSE3, but with a different shuffler and the
// alpha byte in the low bits (RGBA layout).
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0x000000ff
      "psrld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_rgba),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa      %3,%%xmm3                     \n"
      "movdqa      %4,%%xmm4                     \n"
      "movdqa      %5,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x4(%0),%%xmm1                \n"
      "movdqu      0x8(%0),%%xmm2                \n"
      "lea         0x18(%0),%0                   \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_raw),                  // %0
        "+r"(dst_rgb24),                // %1
        "+r"(width)                     // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

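// RGB565ToARGB expands 5- and 6-bit fields to 8 bits by replicating the top
// bits; the magic multipliers below implement that with pmulhuw. For a 5-bit
// value v placed at bits 11..15, multiplying by 0x0108 (264) gives
//   (v << 11) * 264 >> 16 == (v << 3) | (v >> 2)
// and for a 6-bit value at bits 5..10, multiplying by 0x2080 (8320) gives
//   (v << 5) * 8320 >> 16 == (v << 2) | (v >> 4).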
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x20802080,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psllw       $0xb,%%xmm3                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0xa,%%xmm4                   \n"
      "psrlw       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "psllw       $0xb,%%xmm2                   \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "pmulhuw     %%xmm6,%%xmm0                 \n"
      "por         %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x42004200,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psllw       $0xb,%%xmm3                   \n"
      "movdqa      %%xmm3,%%xmm4                 \n"
      "psrlw       $0x6,%%xmm4                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "psllw       $0x1,%%xmm1                   \n"
      "psllw       $0xb,%%xmm2                   \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "psraw       $0x8,%%xmm2                   \n"
      "pmulhuw     %%xmm6,%%xmm0                 \n"
      "pand        %%xmm7,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0xf0f0f0f,%%eax              \n"
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "pslld       $0x4,%%xmm5                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "psllw       $0x4,%%xmm1                   \n"
      "psrlw       $0x4,%%xmm3                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm1,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd mask to compact the 12 valid bytes in each 128-bit lane (after the
// vpshufb below) into 24 contiguous bytes.
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa     %4,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+r"(width)                    // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle tables for converting ARGB to RGB24.
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm5                     \n"
      "vmovdqa     %4,%%ymm6                     \n"
      "vmovdqa     %5,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpermt2b    %%ymm1,%%ymm5,%%ymm0          \n"
      "vpermt2b    %%ymm2,%%ymm6,%%ymm1          \n"
      "vpermt2b    %%ymm3,%%ymm7,%%ymm2          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                // %0
        "+r"(dst),                // %1
        "+r"(width)               // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa     %4,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                   // %0
        "+r"(dst),                   // %1
        "+r"(width)                  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

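// ARGBToRGB565 packs 8:8:8:8 ARGB down to 5:6:5. A scalar sketch of the
// packing below (assumed equivalent; illustration only, not part of the
// build):
#if 0
static void ARGBToRGB565Row_sketch(const uint8_t* src, uint8_t* dst,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t b = src[4 * x + 0];
    uint32_t g = src[4 * x + 1];
    uint32_t r = src[4 * x + 2];
    ((uint16_t*)dst)[x] =
        (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
  }
}
#endif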
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x5,%%xmm2                   \n"
      "psrad       $0x10,%%xmm0                  \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd        %3,%%xmm6                     \n"
      "punpcklbw   %%xmm6,%%xmm6                 \n"
      "movdqa      %%xmm6,%%xmm7                 \n"
      "punpcklwd   %%xmm6,%%xmm6                 \n"
      "punpckhwd   %%xmm7,%%xmm7                 \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "paddusb     %%xmm6,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x5,%%xmm2                   \n"
      "psrad       $0x10,%%xmm0                  \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6                    \n"
      "vpunpcklbw  %%xmm6,%%xmm6,%%xmm6          \n"
      "vpermq      $0xd8,%%ymm6,%%ymm6           \n"
      "vpunpcklwd  %%ymm6,%%ymm6,%%ymm6          \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
      "vpsrld      $0x1b,%%ymm3,%%ymm3           \n"
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrld      $0x1a,%%ymm4,%%ymm4           \n"
      "vpslld      $0x5,%%ymm4,%%ymm4            \n"
      "vpslld      $0xb,%%ymm3,%%ymm5            \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpaddusb    %%ymm6,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x5,%%ymm0,%%ymm2            \n"
      "vpsrld      $0x3,%%ymm0,%%ymm1            \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
      "vpand       %%ymm4,%%ymm2,%%ymm2          \n"
      "vpand       %%ymm3,%%ymm1,%%ymm1          \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

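// ARGBToARGB1555 packs to 1:5:5:5; per pixel (scalar sketch, assumed
// equivalent):
//   p = ((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3)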
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1b,%%xmm4                  \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "pslld       $0x5,%%xmm5                   \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "pslld       $0xa,%%xmm6                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "pslld       $0xf,%%xmm7                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "psrad       $0x10,%%xmm0                  \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x6,%%xmm2                   \n"
      "psrld       $0x9,%%xmm3                   \n"
      "pand        %%xmm7,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "pand        %%xmm6,%%xmm3                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm3,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

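// ARGBToARGB4444 keeps the top nibble of each channel; per pixel (scalar
// sketch, assumed equivalent):
//   p = ((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4)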
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0xc,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm3                 \n"
      "psrlw       $0x8,%%xmm3                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm3,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "psrlq       $0x4,%%xmm0                   \n"
      "psrlq       $0x8,%%xmm1                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply
by (1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just the 2 upper bits of alpha and the 8 bit green. The same
multiplier could be used for Green - (1024+4), putting the 10 bit green in the
lsb.  Alpha needs a simple multiplier to shift it into position.  It wants a
gap of 10 above the green.  Green is 10 bits, so there are 6 bits in the low
short.  4 more are needed, so a multiplier of 4 gets the 2 bits into the upper
16 bits, and then a shift of 4 is a multiply by 16, so (4*16) = 64.  Then
shift the result left 10 to position the A and G channels.
*/
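
// Worked example of the multiply trick above: an 8-bit value v shuffled into
// the high byte of a 16-bit lane holds v * 256. pmulhuw keeps the high 16
// bits of the 32-bit product, so with the multiplier 1028 (1024 + 4):
//   (v * 256 * 1028) >> 16 == (v * 1028) >> 8
// which maps 0..255 onto 0..1023 (e.g. 255 * 1028 >> 8 == 1023), i.e. an
// 8-bit channel becomes a 10-bit channel. The red multiplier 1028 * 16 also
// shifts the result 4 bits left toward its final position.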

// Shuffle tables to place the R and B bytes in the high byte of each 16-bit
// lane for the pmulhuw trick above.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ARGB pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_ABGRTOAR30ROW_AVX2
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ABGR pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#endif

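// Shuffle table for swapping the B and R channels (ARGB <-> ABGR byte order).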
static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

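// Shuffle tables for widening ARGB to 16-bit AB64 (R and B swapped), low and
// high 8 source bytes respectively.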
static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
                                           6, 6, 5, 5, 4, 4, 7, 7};
static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9,  9,  8,  8,  11, 11,
                                           14, 14, 13, 13, 12, 12, 15, 15};
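
// Note: widening an 8-bit channel to 16 bits by pairing each byte with itself
// (punpcklbw/punpckhbw of a register with itself, or the duplicated shuffle
// indices above) multiplies by 257, mapping 0..255 onto 0..65535 exactly
// (255 * 257 == 65535).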

void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
  asm volatile(

      "movdqa      %3,%%xmm2                     \n"
      "movdqa      %4,%%xmm3                     \n" LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm0                 \n"
      "pshufb      %%xmm3,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),             // %0
        "+r"(dst_ab64),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
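
// Narrowing back to 8 bits above just keeps the high byte of each channel
// (psrlw $8 followed by packuswb), i.e. v8 = v16 >> 8, the truncating
// inverse of the * 257 expansion. Scalar sketch (illustrative only, not
// part of the build):
//   static inline uint8_t Narrow16To8(uint16_t v) {
//     return (uint8_t)(v >> 8);
//   }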

void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      "movdqa      %3,%%xmm2                     \n" LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "pshufb      %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ab64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}

#ifdef HAS_ARGBTOAR64ROW_AVX2
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_ARGBTOAB64ROW_AVX2
void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2                  \n"
      "vbroadcastf128 %4,%%ymm3                  \n" LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %%ymm3,%%ymm0,%%ymm1          \n"
      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),             // %0
        "+r"(dst_ab64),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_AR64TOARGBROW_AVX2
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_AB64TOARGBROW_AVX2
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2                  \n" LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ab64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

// clang-format off

// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// round parameter is register containing value to add before shift.
#define RGBTOY(round)                            \
  "1:                                        \n" \
  "movdqu    (%0),%%xmm0                     \n" \
  "movdqu    0x10(%0),%%xmm1                 \n" \
  "movdqu    0x20(%0),%%xmm2                 \n" \
  "movdqu    0x30(%0),%%xmm3                 \n" \
  "psubb     %%xmm5,%%xmm0                   \n" \
  "psubb     %%xmm5,%%xmm1                   \n" \
  "psubb     %%xmm5,%%xmm2                   \n" \
  "psubb     %%xmm5,%%xmm3                   \n" \
  "movdqu    %%xmm4,%%xmm6                   \n" \
  "pmaddubsw %%xmm0,%%xmm6                   \n" \
  "movdqu    %%xmm4,%%xmm0                   \n" \
  "pmaddubsw %%xmm1,%%xmm0                   \n" \
  "movdqu    %%xmm4,%%xmm1                   \n" \
  "pmaddubsw %%xmm2,%%xmm1                   \n" \
  "movdqu    %%xmm4,%%xmm2                   \n" \
  "pmaddubsw %%xmm3,%%xmm2                   \n" \
  "lea       0x40(%0),%0                     \n" \
  "phaddw    %%xmm0,%%xmm6                   \n" \
  "phaddw    %%xmm2,%%xmm1                   \n" \
  "prefetcht0 1280(%0)                       \n" \
  "paddw     %%" #round ",%%xmm6             \n" \
  "paddw     %%" #round ",%%xmm1             \n" \
  "psrlw     $0x8,%%xmm6                     \n" \
  "psrlw     $0x8,%%xmm1                     \n" \
  "packuswb  %%xmm1,%%xmm6                   \n" \
  "movdqu    %%xmm6,(%1)                     \n" \
  "lea       0x10(%1),%1                     \n" \
  "sub       $0x10,%2                        \n" \
  "jg        1b                              \n"

#define RGBTOY_AVX2(round)                                       \
  "1:                                        \n"                 \
  "vmovdqu    (%0),%%ymm0                    \n"                 \
  "vmovdqu    0x20(%0),%%ymm1                \n"                 \
  "vmovdqu    0x40(%0),%%ymm2                \n"                 \
  "vmovdqu    0x60(%0),%%ymm3                \n"                 \
  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                 \
  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                 \
  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                 \
  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                 \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                 \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                 \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                 \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                 \
  "lea       0x80(%0),%0                     \n"                 \
  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
  "prefetcht0 1280(%0)                       \n"                 \
  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */ \
  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n"                 \
  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */ \
  "vmovdqu    %%ymm0,(%1)                    \n"                 \
  "lea       0x20(%1),%1                     \n"                 \
  "sub       $0x20,%2                        \n"                 \
  "jg        1b                              \n"                 \
  "vzeroupper                                \n"

// clang-format on

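// How RGBTOY rounds: pmaddubsw multiplies unsigned coefficient bytes by
// signed pixel bytes, so pixels are biased by -128 first (psubb with
// kSub128) to fit the signed range. The rounding constant then undoes the
// bias: for the BT.601 coefficients, which sum to 220 (66 + 129 + 25),
// kAddY16 = 0x7e80 = 128 * 220 + (16 << 8) + 0x80, i.e. bias removal, the
// +16 luma offset, and +0.5 before the >> 8. A scalar sketch of the result
// (illustrative only, not part of the build):
//   static inline uint8_t RGBToY_Sketch(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
//   }
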
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3
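
// For the JPeg full-range variant the coefficients sum to exactly 256
// (29 + 150 + 77), so kSub128 doubles as the rounding constant: read as
// 16-bit lanes it is 0x8080 = 128 * 256 + 0x80, which removes the -128 bias
// and adds 0.5, with no +16 offset. Scalar sketch (illustrative only, not
// part of the build):
//   static inline uint8_t RGBToYJ_Sketch(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)((77 * r + 150 * g + 29 * b + 0x80) >> 8);
//   }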

#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vbroadcastf128 %5,%%ymm7                  \n"
      "vmovdqu     %6,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vbroadcastf128 %5,%%ymm7                  \n"
      "vmovdqu     %6,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vmovdqu     %5,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vmovdqu     %5,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_rgba),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kRGBAToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),                     // %5
        "m"(kARGBToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_SSSE3
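
// The UV rows average each 2x2 block (pavgb with the next row, then a
// shufps/pavgb pair horizontally) before applying the coefficients, with
// pixels unsigned and coefficients signed in pmaddubsw. Per averaged pixel
// the path above computes, in scalar form (illustrative only, not part of
// the build; >> is an arithmetic shift):
//   static inline uint8_t RGBToU_Sketch(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
//   }
//   static inline uint8_t RGBToV_Sketch(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
//   }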

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kARGBToV),                     // %6
        "m"(kARGBToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kABGRToV),                     // %6
        "m"(kABGRToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kSub128),                      // %5
        "m"(kARGBToVJ),                    // %6
        "m"(kARGBToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToVJ),                    // %5
        "m"(kARGBToUJ),                    // %6
        "m"(kSub128)                       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "movdqa      %4,%%xmm3                     \n"
      "movdqa      %5,%%xmm4                     \n"
      "movdqa      %6,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm2                   \n"
      "packsswb    %%xmm2,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm3,%%xmm0                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm2                   \n"
      "packsswb    %%xmm2,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "lea         0x40(%0),%0                   \n"
      "movdqu      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+rm"(width)     // %3
      : "m"(kARGBToV),   // %4
        "m"(kARGBToU),   // %5
        "m"(kAddUV128)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOUV444ROW_SSSE3

void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kBGRAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_bgra),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_bgra)),  // %4
        "m"(kBGRAToV),                     // %5
        "m"(kBGRAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kABGRToV),                     // %5
        "m"(kABGRToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
                       int src_stride_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgba),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_rgba)),  // %4
        "m"(kRGBAToV),                     // %5
        "m"(kRGBAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444
#define READYUV444                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422                                                \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
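
// READYUV422 upsamples 4 UV samples to 8 by duplication: punpcklbw
// interleaves the U and V bytes, then punpcklwd repeats each UV pair so two
// horizontally adjacent Y samples share one chroma sample (nearest-neighbor
// upsampling). Scalar sketch of the layout (illustrative only, not part of
// the build):
//   for (int i = 0; i < 4; ++i) {
//     u8[2 * i] = u8[2 * i + 1] = u4[i];
//     v8[2 * i] = v8[2 * i + 1] = v4[i];
//   }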

// Read 4 UV from 422 10 bit, upsample to 8 UV
// TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV210                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
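
// READYUV210 adapts 10-bit sources to the existing pipeline: UV words are
// shifted right by 2 and packed, reducing chroma to 8 bits, while Y words
// are shifted left by 6 so 10-bit luma fills the 16-bit range used by the
// high-bit-depth YUV-to-RGB math. Scalar sketch (illustrative only, not
// part of the build):
//   uint8_t u8 = (uint8_t)(u10 >> 2);     // 10 -> 8 bit chroma
//   uint16_t y16 = (uint16_t)(y10 << 6);  // 10 -> 16 bit luma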
2168 
2169 #define READYUVA210                                               \
2170   "movq       (%[u_buf]),%%xmm3                               \n" \
2171   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
2172   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
2173   "punpcklwd  %%xmm1,%%xmm3                                   \n" \
2174   "psraw      $2,%%xmm3                                       \n" \
2175   "packuswb   %%xmm3,%%xmm3                                   \n" \
2176   "punpcklwd  %%xmm3,%%xmm3                                   \n" \
2177   "movdqu     (%[y_buf]),%%xmm4                               \n" \
2178   "psllw      $6,%%xmm4                                       \n" \
2179   "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
2180   "movdqu     (%[a_buf]),%%xmm5                               \n" \
2181   "psraw      $2,%%xmm5                                       \n" \
2182   "packuswb   %%xmm5,%%xmm5                                   \n" \
2183   "lea        0x10(%[a_buf]),%[a_buf]                         \n"

// Read 8 UV from 444 10 bit
#define READYUV410                                                \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 8 UV from 444 10 bit.  With 8 Alpha.
#define READYUVA410                                               \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $0x6,%%xmm4                                     \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
  "movdqu     (%[a_buf]),%%xmm5                               \n" \
  "psraw      $2,%%xmm5                                       \n" \
  "packuswb   %%xmm5,%%xmm5                                   \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                         \n"

// Read 4 UV from 422 12 bit, upsample to 8 UV
#define READYUV212                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $0x4,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $0x4,%%xmm4                                     \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
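
// 12 bit sources use a shift of 4 instead: psraw $4 narrows 12 bit UV to
// 8 bits, and psllw $4 promotes 12 bit Y to a full 16 bit word.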

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422                                               \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"

// Read 8 UV from 444.  With 8 Alpha.
#define READYUVA444                                               \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12                                                  \
  "movq       (%[uv_buf]),%%xmm3                              \n" \
  "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 VU from NV21, upsample to 8 UV
#define READNV21                                                  \
  "movq       (%[vu_buf]),%%xmm3                              \n" \
  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
  "pshufb     %[kShuffleNV21], %%xmm3                         \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
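
// kShuffleNV21 swaps each VU byte pair to UV order and duplicates it, so a
// single pshufb replaces the swap plus the punpcklwd upsample used by NV12.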

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2                                                  \
  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
  "movdqu     (%[yuy2_buf]),%%xmm3                            \n" \
  "pshufb     %[kShuffleYUY2UV], %%xmm3                       \n" \
  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY                                                  \
  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
  "movdqu     (%[uyvy_buf]),%%xmm3                            \n" \
  "pshufb     %[kShuffleUYVYUV], %%xmm3                       \n" \
  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"
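
// YUY2 and UYVY load the same 16 bytes twice and use two pshufb masks: one
// extracts the 8 Y bytes, the other gathers the 4 UV pairs and duplicates
// them to match the layout the 422 readers produce.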

// Read 4 UV from P210, upsample to 8 UV
#define READP210                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 8 UV from P410
#define READP410                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "movdqu     0x10(%[uv_buf]),%%xmm1                          \n" \
  "lea        0x20(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "psrlw      $0x8,%%xmm1                                     \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
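
// P210/P410 keep 10 bit samples in the most significant bits of each 16 bit
// word, so psrlw $8 yields the top 8 bits of U and V, and Y can feed
// pmulhuw directly without a shift.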

#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants)                              \
  "pcmpeqb    %%xmm13,%%xmm13                                 \n" \
  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
  "pxor       %%xmm12,%%xmm12                                 \n" \
  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
  "psllw      $7,%%xmm13                                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
  "pshufb     %%xmm12,%%xmm13                                 \n" \
  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
  "movdqa     128(%[yuvconstants]),%%xmm12                    \n"
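
// On x64 the conversion constants are loaded once outside the loop:
// xmm8-xmm12 cache the five vectors from yuvconstants and xmm13 is built as
// the 0x80 byte bias for psubb.  The 32 bit path below has only 8 xmm
// registers, so it reloads the constants from memory on every iteration.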

// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants)                                  \
  "psubb      %%xmm13,%%xmm3                                  \n" \
  "pmulhuw    %%xmm11,%%xmm4                                  \n" \
  "movdqa     %%xmm8,%%xmm0                                   \n" \
  "movdqa     %%xmm9,%%xmm1                                   \n" \
  "movdqa     %%xmm10,%%xmm2                                  \n" \
  "paddw      %%xmm12,%%xmm4                                  \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"

#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants)                                  \
  "pcmpeqb    %%xmm0,%%xmm0                                   \n" \
  "pxor       %%xmm1,%%xmm1                                   \n" \
  "psllw      $7,%%xmm0                                       \n" \
  "pshufb     %%xmm1,%%xmm0                                   \n" \
  "psubb      %%xmm0,%%xmm3                                   \n" \
  "pmulhuw    96(%[yuvconstants]),%%xmm4                      \n" \
  "movdqa     (%[yuvconstants]),%%xmm0                        \n" \
  "movdqa     32(%[yuvconstants]),%%xmm1                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm2                      \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "movdqa     128(%[yuvconstants]),%%xmm3                     \n" \
  "paddw      %%xmm3,%%xmm4                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"

#define YUVTORGB_REGS
#endif

#define YUVTORGB(yuvconstants)                                    \
  YUVTORGB16(yuvconstants)                                        \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"
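
// YUVTORGB16 computes 6 bit fixed point B, G, R as signed 16 bit words:
// pmaddubsw multiplies the biased UV bytes by per channel coefficient pairs,
// pmulhuw plus the bias vector scales Y, and paddsw/psubsw combine them.
// YUVTORGB then shifts right by 6 and saturates to bytes, leaving B in xmm0,
// G in xmm1 and R in xmm2.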

// Store 8 ARGB values.
#define STOREARGB                                                  \
  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
  "movdqa     %%xmm0,%%xmm1                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"
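
// The two punpcklbw plus punpcklwd/punpckhwd steps interleave B, G, R and
// the alpha in xmm5 into byte order B,G,R,A, which is ARGB read as a
// little endian 32 bit word.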

// Store 8 RGBA values.
#define STORERGBA                                                  \
  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
  "punpcklbw %%xmm2,%%xmm1                                     \n" \
  "punpcklbw %%xmm0,%%xmm5                                     \n" \
  "movdqa    %%xmm5,%%xmm0                                     \n" \
  "punpcklwd %%xmm1,%%xmm5                                     \n" \
  "punpckhwd %%xmm1,%%xmm0                                     \n" \
  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"

// Store 8 AR30 values.
#define STOREAR30                                                  \
  "psraw      $0x4,%%xmm0                                      \n" \
  "psraw      $0x4,%%xmm1                                      \n" \
  "psraw      $0x4,%%xmm2                                      \n" \
  "pminsw     %%xmm7,%%xmm0                                    \n" \
  "pminsw     %%xmm7,%%xmm1                                    \n" \
  "pminsw     %%xmm7,%%xmm2                                    \n" \
  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
  "psllw      $0x4,%%xmm2                                      \n" \
  "movdqa     %%xmm0,%%xmm3                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
  "movdqa     %%xmm1,%%xmm2                                    \n" \
  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
  "pslld      $0xa,%%xmm1                                      \n" \
  "pslld      $0xa,%%xmm2                                      \n" \
  "por        %%xmm1,%%xmm0                                    \n" \
  "por        %%xmm2,%%xmm3                                    \n" \
  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
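
// STOREAR30 drops the 6 bit fixed point down to 10 bits (psraw $4), clamps
// each channel to [0, 1023] with the xmm6/xmm7 constants the callers set up,
// then ORs B, G << 10, R << 20 and the 2 alpha bits together into each
// 32 bit AR30 pixel.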

void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
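      // v_buf now holds v_buf - u_buf, so the READ* macros can address both
      // planes from u_buf: 0x00(%[u_buf],%[v_buf],1) is the original v
      // pointer.  The same trick is used by every planar function below.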
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
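
// Usage sketch (illustrative only, not part of this file): callers such as
// the planar converters in libyuv invoke a row function once per scanline,
// along the lines of
//
//   for (int y = 0; y < height; ++y) {
//     I444ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb,
//                         &kYuvI601Constants,  // BT.601 coefficients
//                         width);
//     src_y += src_stride_y;
//     src_u += src_stride_u;
//     src_v += src_stride_v;
//     dst_argb += dst_stride_argb;
//   }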

#ifdef HAS_I444ALPHATOARGBROW_SSSE3
void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
  YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

  LABELALIGN
      "1:                                        \n"
  READYUVA444
  YUVTORGB(yuvconstants)
  STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_SSSE3

void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm2,%%xmm0                 \n"
      "punpckhwd   %%xmm2,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "movq        %%xmm0,(%[dst_rgb24])         \n"
      "movdqu      %%xmm1,0x8(%[dst_rgb24])      \n"
      "lea         0x18(%[dst_rgb24]),%[dst_rgb24] \n"
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // AR30 constants
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 10 bit YUV to ARGB
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 12 bit YUV to ARGB
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV212
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 10 bit YUV to AR30
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 12 bit YUV to AR30
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV212
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 10 bit YUV to ARGB
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV410
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_I210ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA210
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
    : [y_buf] "+r"(y_buf),      // %[y_buf]
      [u_buf] "+r"(u_buf),      // %[u_buf]
      [v_buf] "+r"(v_buf),      // %[v_buf]
      [a_buf] "+r"(a_buf),      // %[a_buf]
      [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
      [width] "+m"(width)       // %[width]
#else
      [width] "+rm"(width)      // %[width]
#endif
    : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif

#ifdef HAS_I410ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA410
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
    : [y_buf] "+r"(y_buf),  // %[y_buf]
      [u_buf] "+r"(u_buf),  // %[u_buf]
      [v_buf] "+r"(v_buf),  // %[v_buf]
      [a_buf] "+r"(a_buf),  // %[a_buf]
      [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
      [width] "+m"(width)  // %[width]
#else
      [width] "+rm"(width)  // %[width]
#endif
    : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
      "xmm5");
  // clang-format on
}
#endif

// 10 bit YUV to AR30
void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

#ifdef HAS_I422ALPHATOARGBROW_SSSE3
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3

void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READP210
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_argb]"+r"(dst_argb),        // %[dst_argb]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READP410
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_argb]"+r"(dst_argb),        // %[dst_argb]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READP210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READP410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#endif  // HAS_I422TOARGBROW_SSSE3

// Read 16 UV from 444
#define READYUV444_AVX2                                               \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2                                               \
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
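
// The vpermq $0xd8 (qword order 0,2,1,3) shuffles compensate for AVX2
// unpack instructions interleaving within each 128 bit lane, keeping the 16
// pixels in order across the two lanes.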

// Read 8 UV from 210, upsample to 16 UV
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
#define READYUV210_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"

// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
#define READYUVA210_AVX2                                           \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"

// Read 16 UV from 410
#define READYUV410_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"

// Read 8 UV from 212 12 bit, upsample to 16 UV
#define READYUV212_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $0x4,%%ymm3,%%ymm3                               \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $0x4,%%ymm4,%%ymm4                               \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"

// Read 16 UV from 410. With 16 Alpha.
#define READYUVA410_AVX2                                           \
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"

// Read 16 UV from 444.  With 16 Alpha.
#define READYUVA444_AVX2                                              \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"

// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
#define READYUVA422_AVX2                                              \
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2                                                 \
  "vmovdqu    (%[uv_buf]),%%xmm3                                  \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"

// Read 8 VU from NV21, upsample to 16 UV.
#define READNV21_AVX2                                                 \
  "vmovdqu    (%[vu_buf]),%%xmm3                                  \n" \
  "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpshufb     %[kShuffleNV21], %%ymm3, %%ymm3                    \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3317 
3318 // Read 4 UV from P210, upsample to 8 UV
3319 #define READP210_AVX2                                                 \
3320   "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
3321   "lea        0x20(%[uv_buf]),%[uv_buf]                           \n" \
3322   "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
3323   "vpackuswb  %%ymm3,%%ymm3,%%ymm3                                \n" \
3324   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
3325   "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
3326   "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3327 
3328 // Read 8 UV from P410
3329 #define READP410_AVX2                                                 \
3330   "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
3331   "vmovdqu    0x20(%[uv_buf]),%%ymm1                              \n" \
3332   "lea        0x40(%[uv_buf]),%[uv_buf]                           \n" \
3333   "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
3334   "vpsrlw     $0x8,%%ymm1,%%ymm1                                  \n" \
3335   "vpackuswb  %%ymm1,%%ymm3,%%ymm3                                \n" \
3336   "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3337   "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
3338   "lea        0x20(%[y_buf]),%[y_buf]                             \n"
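
// P210/P410 carry 10-bit samples in the most significant bits of each
// 16-bit word, so vpsrlw $8 keeps the top 8 bits of UV, and Y needs no
// shift before vpmulhuw.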

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2                                                 \
  "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
  "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
  "vmovdqu    (%[yuy2_buf]),%%ymm3                                \n" \
  "vpshufb    %[kShuffleYUY2UV], %%ymm3, %%ymm3                   \n" \
  "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2                                                 \
  "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
  "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
  "vmovdqu    (%[uyvy_buf]),%%ymm3                                \n" \
  "vpshufb    %[kShuffleUYVYUV], %%ymm3, %%ymm3                   \n" \
  "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
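
// One vpshufb per stream replaces the unpack/upsample steps of the planar
// readers: the Y tables extract each Y in the doubled byte form vpmulhuw
// expects, and the UV tables duplicate each UV pair to the 4:4:4 layout
// the matrix stage consumes.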

#if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants)                             \
  "vpcmpeqb    %%xmm13,%%xmm13,%%xmm13                            \n" \
  "vmovdqa     (%[yuvconstants]),%%ymm8                           \n" \
  "vpsllw      $7,%%xmm13,%%xmm13                                 \n" \
  "vmovdqa     32(%[yuvconstants]),%%ymm9                         \n" \
  "vpbroadcastb %%xmm13,%%ymm13                                   \n" \
  "vmovdqa     64(%[yuvconstants]),%%ymm10                        \n" \
  "vmovdqa     96(%[yuvconstants]),%%ymm11                        \n" \
  "vmovdqa     128(%[yuvconstants]),%%ymm12                       \n"

#define YUVTORGB16_AVX2(yuvconstants)                                 \
  "vpsubb      %%ymm13,%%ymm3,%%ymm3                              \n" \
  "vpmulhuw    %%ymm11,%%ymm4,%%ymm4                              \n" \
  "vpmaddubsw  %%ymm3,%%ymm8,%%ymm0                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm9,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm10,%%ymm2                              \n" \
  "vpaddw      %%ymm4,%%ymm12,%%ymm4                              \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"

#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
3379 
#else  // 32 bit: no spare ymm registers, so the setup is empty and the
       // matrix constants are reloaded from memory on every pass.

#define YUVTORGB_SETUP_AVX2(yuvconstants)

// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB16_AVX2(yuvconstants)                                 \
  "vpcmpeqb    %%xmm0,%%xmm0,%%xmm0                               \n" \
  "vpsllw      $7,%%xmm0,%%xmm0                                   \n" \
  "vpbroadcastb %%xmm0,%%ymm0                                     \n" \
  "vpsubb      %%ymm0,%%ymm3,%%ymm3                               \n" \
  "vpmulhuw    96(%[yuvconstants]),%%ymm4,%%ymm4                  \n" \
  "vmovdqa     (%[yuvconstants]),%%ymm0                           \n" \
  "vmovdqa     32(%[yuvconstants]),%%ymm1                         \n" \
  "vmovdqa     64(%[yuvconstants]),%%ymm2                         \n" \
  "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm1,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm2,%%ymm2                               \n" \
  "vmovdqa     128(%[yuvconstants]),%%ymm3                        \n" \
  "vpaddw      %%ymm4,%%ymm3,%%ymm4                               \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"

#define YUVTORGB_REGS_AVX2
#endif

#define YUVTORGB_AVX2(yuvconstants)                                   \
  YUVTORGB16_AVX2(yuvconstants)                                       \
  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
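
// Scalar form of the conversion above, per pixel (16-bit intermediates with
// 6 fractional bits). With (ub, ug, vg, vr) the UV coefficients and yg/ygb
// the Y gain and bias held in YuvConstants:
//   y16 = ((Y16 * yg) >> 16) + ygb                   // vpmulhuw + vpaddw
//   B = clamp8((y16 + ub * (U - 0x80)) >> 6)                    // vpaddsw
//   G = clamp8((y16 - ug * (U - 0x80) - vg * (V - 0x80)) >> 6)  // vpsubsw
//   R = clamp8((y16 + vr * (V - 0x80)) >> 6)                    // vpaddsw
// clamp8 is the unsigned saturation of vpackuswb after vpsraw $6.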

// Store 16 ARGB values.
#define STOREARGB_AVX2                                                \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
  "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
  "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
  "lea        0x40(%[dst_argb]), %[dst_argb]                      \n"

// Store 16 AR30 values.
#define STOREAR30_AVX2                                                \
  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
  "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
  "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
  "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
  "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
  "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
  "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
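
// AR30 layout produced above: B in bits 0..9, G in 10..19, R in 20..29 and
// the 2-bit alpha (0x3, from ymm5) in 30..31; channels are first clamped
// to [0, 1023] via vpminsw/vpmaxsw.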

#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I444TOARGBROW_AVX2

#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2
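
// Illustrative only (a sketch, not compiled in): driving this kernel one row
// at a time with the library's BT.601 limited-range matrix. The helper name
// ConvertI422RowExample is hypothetical; production callers go through
// I422ToARGB(), which also handles widths that are not a multiple of 16.
#if 0
static void ConvertI422RowExample(const uint8_t* y, const uint8_t* u,
                                  const uint8_t* v, uint8_t* argb, int width) {
  // This row function consumes 16 pixels per loop iteration, so call it
  // directly only when width is a multiple of 16.
  I422ToARGBRow_AVX2(y, u, v, argb, &kYuvI601Constants, width);
}
#endif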

#if defined(HAS_I422TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I422TOAR30ROW_AVX2

#if defined(HAS_I210TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I210TOARGBROW_AVX2

#if defined(HAS_I212TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV212_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I212TOARGBROW_AVX2

#if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I210TOAR30ROW_AVX2

#if defined(HAS_I212TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV212_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I212TOAR30ROW_AVX2

#if defined(HAS_I410TOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV410_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I410TOARGBROW_AVX2

#if defined(HAS_I210ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                    const uint16_t* u_buf,
                                    const uint16_t* v_buf,
                                    const uint16_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA210_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I210ALPHATOARGBROW_AVX2

#if defined(HAS_I410ALPHATOARGBROW_AVX2)
// 16 pixels
// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                    const uint16_t* u_buf,
                                    const uint16_t* v_buf,
                                    const uint16_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA410_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I410ALPHATOARGBROW_AVX2

#if defined(HAS_I410TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I410TOAR30ROW_AVX2

#if defined(HAS_I444ALPHATOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y and 16 A producing 16 ARGB.
void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
  YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

  LABELALIGN
      "1:                                        \n"
  READYUVA444_AVX2
  YUVTORGB_AVX2(yuvconstants)
  STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_AVX2

#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2

#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0,(%[dst_argb])           \n"
    "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
    "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
    "sub        $0x10,%[width]                 \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TORGBAROW_AVX2

#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READNV12_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV12TOARGBROW_AVX2

#if defined(HAS_NV21TOARGBROW_AVX2)
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* vu_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READNV21_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV21TOARGBROW_AVX2

#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUY2_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_YUY2TOARGBROW_AVX2

#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_UYVYTOARGBROW_AVX2

#if defined(HAS_P210TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READP210_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_P210TOARGBROW_AVX2

#if defined(HAS_P410TOARGBROW_AVX2)
// 16 pixels.
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READP410_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_P410TOARGBROW_AVX2

#if defined(HAS_P210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READP210_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_P210TOAR30ROW_AVX2

#if defined(HAS_P410TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READP410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_P410TOAR30ROW_AVX2

#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      "movdqa      96(%3),%%xmm2                 \n"  // yg = 18997 = 1.164
      "movdqa      128(%3),%%xmm3                \n"  // ygb = -1160
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm4                  \n"

      LABELALIGN
      "1:                                        \n"
      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
      "movq      (%0),%%xmm0                     \n"
      "lea       0x8(%0),%0                      \n"
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "pmulhuw   %%xmm2,%%xmm0                   \n"
      "paddsw    %%xmm3,%%xmm0                   \n"
      "psraw     $6, %%xmm0                      \n"
      "packuswb  %%xmm0,%%xmm0                   \n"

      // Step 2: Weave into ARGB
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "punpcklwd %%xmm0,%%xmm0                   \n"
      "punpckhwd %%xmm1,%%xmm1                   \n"
      "por       %%xmm4,%%xmm0                   \n"
      "por       %%xmm4,%%xmm1                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "lea       0x20(%1),%1                     \n"

      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(y_buf),       // %0
        "+r"(dst_argb),    // %1
        "+rm"(width)       // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_I400TOARGBROW_SSE2
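
// Scalar model of the loop above (a reference sketch, not compiled in;
// assumes the BT.601 constants noted in the comments: yg = 18997,
// ygb = -1160).
#if 0
static void I400ToARGBRow_Sketch(const uint8_t* y_buf,
                                 uint8_t* dst_argb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // punpcklbw %%xmm0,%%xmm0 replicates each byte: y16 = y * 0x0101.
    uint32_t y16 = y_buf[x] * 0x0101u;
    // pmulhuw keeps the high 16 bits of the unsigned product; paddsw then
    // adds the (negative) bias.
    int g = (int)((y16 * 18997u) >> 16) - 1160;
    // psraw $6 rescales; packuswb saturates to [0, 255].
    g >>= 6;
    if (g < 0) g = 0;
    if (g > 255) g = 255;
    dst_argb[x * 4 + 0] = (uint8_t)g;  // B
    dst_argb[x * 4 + 1] = (uint8_t)g;  // G
    dst_argb[x * 4 + 2] = (uint8_t)g;  // R
    dst_argb[x * 4 + 3] = 255u;        // A: the 0xff000000 OR'd in via xmm4
  }
}
#endif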

#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      "vmovdqa     96(%3),%%ymm2                 \n"  // yg = 18997 = 1.164
      "vmovdqa     128(%3),%%ymm3                \n"  // ygb = -1160
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 0xff000000
      "vpslld      $0x18,%%ymm4,%%ymm4           \n"

      LABELALIGN
      "1:                                        \n"
      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
      "vmovdqu    (%0),%%xmm0                    \n"
      "lea        0x10(%0),%0                    \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
      "vpaddsw    %%ymm3,%%ymm0,%%ymm0           \n"
      "vpsraw     $0x6,%%ymm0,%%ymm0             \n"
      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
      "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
      "vmovdqu    %%ymm0,(%1)                    \n"
      "vmovdqu    %%ymm1,0x20(%1)                \n"
      "lea        0x40(%1),%1                    \n"
      "sub        $0x10,%2                       \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(y_buf),       // %0
        "+r"(dst_argb),    // %1
        "+rm"(width)       // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_I400TOARGBROW_AVX2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "movdqa      %3,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_SSSE3
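
// The mirror rows read each 16- or 32-byte block from the tail of the source
// (-0x10(%0,%2,1), with %2 counting down) and let pshufb reverse the bytes;
// the AVX2 versions additionally need vpermq $0x4e to swap the two 128-bit
// lanes, since vpshufb only shuffles within a lane.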

#ifdef HAS_MIRRORROW_AVX2
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_AVX2
4395 
4396 #ifdef HAS_MIRRORUVROW_SSSE3
4397 // Shuffle table for reversing the UV.
4398 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
4399                                        6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
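
// Scalar sketch (illustrative only): UV pairs move as units, preserving the
// U/V byte order within each pair; width counts UV pairs here:
//   for (int x = 0; x < width; ++x) {
//     dst_uv[x * 2 + 0] = src_uv[(width - 1 - x) * 2 + 0];  // U
//     dst_uv[x * 2 + 1] = src_uv[(width - 1 - x) * 2 + 1];  // V
//   }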
4400 
4401 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4402   intptr_t temp_width = (intptr_t)(width);
4403   asm volatile(
4404 
4405       "movdqa      %3,%%xmm5                     \n"
4406 
4407       LABELALIGN
4408       "1:                                        \n"
4409       "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
4410       "pshufb      %%xmm5,%%xmm0                 \n"
4411       "movdqu      %%xmm0,(%1)                   \n"
4412       "lea         0x10(%1),%1                   \n"
4413       "sub         $0x8,%2                       \n"
4414       "jg          1b                            \n"
4415       : "+r"(src_uv),          // %0
4416         "+r"(dst_uv),          // %1
4417         "+r"(temp_width)       // %2
4418       : "m"(kShuffleMirrorUV)  // %3
4419       : "memory", "cc", "xmm0", "xmm5");
4420 }
4421 #endif  // HAS_MIRRORUVROW_SSSE3
4422 
4423 #ifdef HAS_MIRRORUVROW_AVX2
4424 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4425   intptr_t temp_width = (intptr_t)(width);
4426   asm volatile(
4427 
4428       "vbroadcastf128 %3,%%ymm5                  \n"
4429 
4430       LABELALIGN
4431       "1:                                        \n"
4432       "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
4433       "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
4434       "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
4435       "vmovdqu     %%ymm0,(%1)                   \n"
4436       "lea         0x20(%1),%1                   \n"
4437       "sub         $0x10,%2                      \n"
4438       "jg          1b                            \n"
4439       "vzeroupper                                \n"
4440       : "+r"(src_uv),          // %0
4441         "+r"(dst_uv),          // %1
4442         "+r"(temp_width)       // %2
4443       : "m"(kShuffleMirrorUV)  // %3
4444       : "memory", "cc", "xmm0", "xmm5");
4445 }
4446 #endif  // HAS_MIRRORUVROW_AVX2
4447 
4448 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
4449 // Shuffle table for reversing the bytes of UV channels.
4450 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
4451                                             15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
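
// Scalar sketch (illustrative only): mirrors the row and deinterleaves U and
// V into separate planes in one pass; width counts UV pairs here:
//   for (int x = 0; x < width; ++x) {
//     dst_u[x] = src[(width - 1 - x) * 2 + 0];
//     dst_v[x] = src[(width - 1 - x) * 2 + 1];
//   }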
4452 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
4453                             uint8_t* dst_u,
4454                             uint8_t* dst_v,
4455                             int width) {
4456   intptr_t temp_width = (intptr_t)(width);
4457   asm volatile(
4458       "movdqa      %4,%%xmm1                     \n"
4459       "lea         -0x10(%0,%3,2),%0             \n"
4460       "sub         %1,%2                         \n"
4461 
4462       LABELALIGN
4463       "1:                                        \n"
4464       "movdqu      (%0),%%xmm0                   \n"
4465       "lea         -0x10(%0),%0                  \n"
4466       "pshufb      %%xmm1,%%xmm0                 \n"
4467       "movlpd      %%xmm0,(%1)                   \n"
4468       "movhpd      %%xmm0,0x00(%1,%2,1)          \n"
4469       "lea         0x8(%1),%1                    \n"
4470       "sub         $8,%3                         \n"
4471       "jg          1b                            \n"
4472       : "+r"(src),                  // %0
4473         "+r"(dst_u),                // %1
4474         "+r"(dst_v),                // %2
4475         "+r"(temp_width)            // %3
4476       : "m"(kShuffleMirrorSplitUV)  // %4
4477       : "memory", "cc", "xmm0", "xmm1");
4478 }
4479 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
4480 
4481 #ifdef HAS_RGB24MIRRORROW_SSSE3
4482 
4483 // Shuffle the first 5 pixels to the last 5, mirrored. First output byte is zero.
4484 static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
4485                                          7u,   8u,  3u,  4u,  5u, 0u,  1u,  2u};
4486 
4487 // Shuffle the last 5 pixels to the first 5, mirrored. Last output byte is zero.
4488 static const uvec8 kShuffleMirrorRGB1 = {
4489     13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
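
// Scalar sketch (illustrative only): each 3-byte RGB24 pixel moves as a
// unit, so byte order within a pixel is preserved while pixel order reverses:
//   for (int x = 0; x < width; ++x) {
//     dst_rgb24[x * 3 + 0] = src_rgb24[(width - 1 - x) * 3 + 0];
//     dst_rgb24[x * 3 + 1] = src_rgb24[(width - 1 - x) * 3 + 1];
//     dst_rgb24[x * 3 + 2] = src_rgb24[(width - 1 - x) * 3 + 2];
//   }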
4490 
4491 // Shuffles 5 pixels (15 bytes) per pshufb; the loop mirrors 16 pixels per pass.
4492 void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
4493                           uint8_t* dst_rgb24,
4494                           int width) {
4495   intptr_t temp_width = (intptr_t)(width);
4496   src_rgb24 += width * 3 - 48;
4497   asm volatile(
4498       "movdqa      %3,%%xmm4                     \n"
4499       "movdqa      %4,%%xmm5                     \n"
4500 
4501       LABELALIGN
4502       "1:                                        \n"
4503       "movdqu      (%0),%%xmm0                   \n"  // first 5
4504       "movdqu      15(%0),%%xmm1                 \n"  // next 5
4505       "movdqu      30(%0),%%xmm2                 \n"  // next 5
4506       "movdqu      32(%0),%%xmm3                 \n"  // last 1 special
4507       "pshufb      %%xmm4,%%xmm0                 \n"
4508       "pshufb      %%xmm4,%%xmm1                 \n"
4509       "pshufb      %%xmm4,%%xmm2                 \n"
4510       "pshufb      %%xmm5,%%xmm3                 \n"
4511       "lea         -0x30(%0),%0                  \n"
4512       "movdqu      %%xmm0,32(%1)                 \n"  // last 5
4513       "movdqu      %%xmm1,17(%1)                 \n"  // next 5
4514       "movdqu      %%xmm2,2(%1)                  \n"  // next 5
4515       "movlpd      %%xmm3,0(%1)                  \n"  // first 1
4516       "lea         0x30(%1),%1                   \n"
4517       "sub         $0x10,%2                      \n"
4518       "jg          1b                            \n"
4519       : "+r"(src_rgb24),          // %0
4520         "+r"(dst_rgb24),          // %1
4521         "+r"(temp_width)          // %2
4522       : "m"(kShuffleMirrorRGB0),  // %3
4523         "m"(kShuffleMirrorRGB1)   // %4
4524       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4525 }
4526 #endif  // HAS_RGB24MIRRORROW_SSSE3
4527 
4528 #ifdef HAS_ARGBMIRRORROW_SSE2
4529 
4530 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
4531   intptr_t temp_width = (intptr_t)(width);
4532   asm volatile(
4533 
4534       "lea         -0x10(%0,%2,4),%0             \n"
4535 
4536       LABELALIGN
4537       "1:                                        \n"
4538       "movdqu      (%0),%%xmm0                   \n"
4539       "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
4540       "lea         -0x10(%0),%0                  \n"
4541       "movdqu      %%xmm0,(%1)                   \n"
4542       "lea         0x10(%1),%1                   \n"
4543       "sub         $0x4,%2                       \n"
4544       "jg          1b                            \n"
4545       : "+r"(src),        // %0
4546         "+r"(dst),        // %1
4547         "+r"(temp_width)  // %2
4548       :
4549       : "memory", "cc", "xmm0");
4550 }
4551 #endif  // HAS_ARGBMIRRORROW_SSE2
4552 
4553 #ifdef HAS_ARGBMIRRORROW_AVX2
4554 // Shuffle table for reversing the bytes.
4555 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
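
// Scalar sketch (illustrative only): whole 4-byte ARGB pixels are reversed,
// which is why a single 32-bit-lane vpermd suffices:
//   const uint32_t* src32 = (const uint32_t*)src;
//   uint32_t* dst32 = (uint32_t*)dst;
//   for (int x = 0; x < width; ++x) {
//     dst32[x] = src32[width - 1 - x];
//   }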
4556 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4557   intptr_t temp_width = (intptr_t)(width);
4558   asm volatile(
4559 
4560       "vmovdqu     %3,%%ymm5                     \n"
4561 
4562       LABELALIGN
4563       "1:                                        \n"
4564       "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
4565       "vmovdqu     %%ymm0,(%1)                   \n"
4566       "lea         0x20(%1),%1                   \n"
4567       "sub         $0x8,%2                       \n"
4568       "jg          1b                            \n"
4569       "vzeroupper                                \n"
4570       : "+r"(src),                    // %0
4571         "+r"(dst),                    // %1
4572         "+r"(temp_width)              // %2
4573       : "m"(kARGBShuffleMirror_AVX2)  // %3
4574       : "memory", "cc", "xmm0", "xmm5");
4575 }
4576 #endif  // HAS_ARGBMIRRORROW_AVX2
4577 
4578 #ifdef HAS_SPLITUVROW_AVX2
4579 void SplitUVRow_AVX2(const uint8_t* src_uv,
4580                      uint8_t* dst_u,
4581                      uint8_t* dst_v,
4582                      int width) {
4583   asm volatile(
4584       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4585       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4586       "sub         %1,%2                         \n"
4587 
4588       LABELALIGN
4589       "1:                                        \n"
4590       "vmovdqu     (%0),%%ymm0                   \n"
4591       "vmovdqu     0x20(%0),%%ymm1               \n"
4592       "lea         0x40(%0),%0                   \n"
4593       "vpsrlw      $0x8,%%ymm0,%%ymm2            \n"
4594       "vpsrlw      $0x8,%%ymm1,%%ymm3            \n"
4595       "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
4596       "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
4597       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4598       "vpackuswb   %%ymm3,%%ymm2,%%ymm2          \n"
4599       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4600       "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
4601       "vmovdqu     %%ymm0,(%1)                   \n"
4602       "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"
4603       "lea         0x20(%1),%1                   \n"
4604       "sub         $0x20,%3                      \n"
4605       "jg          1b                            \n"
4606       "vzeroupper                                \n"
4607       : "+r"(src_uv),  // %0
4608         "+r"(dst_u),   // %1
4609         "+r"(dst_v),   // %2
4610         "+r"(width)    // %3
4611       :
4612       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4613 }
4614 #endif  // HAS_SPLITUVROW_AVX2
4615 
4616 #ifdef HAS_SPLITUVROW_SSE2
4617 void SplitUVRow_SSE2(const uint8_t* src_uv,
4618                      uint8_t* dst_u,
4619                      uint8_t* dst_v,
4620                      int width) {
4621   asm volatile(
4622       "pcmpeqb     %%xmm5,%%xmm5                 \n"
4623       "psrlw       $0x8,%%xmm5                   \n"
4624       "sub         %1,%2                         \n"
4625 
4626       LABELALIGN
4627       "1:                                        \n"
4628       "movdqu      (%0),%%xmm0                   \n"
4629       "movdqu      0x10(%0),%%xmm1               \n"
4630       "lea         0x20(%0),%0                   \n"
4631       "movdqa      %%xmm0,%%xmm2                 \n"
4632       "movdqa      %%xmm1,%%xmm3                 \n"
4633       "pand        %%xmm5,%%xmm0                 \n"
4634       "pand        %%xmm5,%%xmm1                 \n"
4635       "packuswb    %%xmm1,%%xmm0                 \n"
4636       "psrlw       $0x8,%%xmm2                   \n"
4637       "psrlw       $0x8,%%xmm3                   \n"
4638       "packuswb    %%xmm3,%%xmm2                 \n"
4639       "movdqu      %%xmm0,(%1)                   \n"
4640       "movdqu      %%xmm2,0x00(%1,%2,1)          \n"
4641       "lea         0x10(%1),%1                   \n"
4642       "sub         $0x10,%3                      \n"
4643       "jg          1b                            \n"
4644       : "+r"(src_uv),  // %0
4645         "+r"(dst_u),   // %1
4646         "+r"(dst_v),   // %2
4647         "+r"(width)    // %3
4648       :
4649       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4650 }
4651 #endif  // HAS_SPLITUVROW_SSE2
4652 
4653 #ifdef HAS_MERGEUVROW_AVX2
4654 void MergeUVRow_AVX2(const uint8_t* src_u,
4655                      const uint8_t* src_v,
4656                      uint8_t* dst_uv,
4657                      int width) {
4658   asm volatile(
4659 
4660       "sub         %0,%1                         \n"
4661 
4662       LABELALIGN
4663       "1:                                        \n"
4664       "vmovdqu     (%0),%%ymm0                   \n"
4665       "vmovdqu     0x00(%0,%1,1),%%ymm1          \n"
4666       "lea         0x20(%0),%0                   \n"
4667       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm2          \n"
4668       "vpunpckhbw  %%ymm1,%%ymm0,%%ymm0          \n"
4669       "vextractf128 $0x0,%%ymm2,(%2)             \n"
4670       "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
4671       "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
4672       "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
4673       "lea         0x40(%2),%2                   \n"
4674       "sub         $0x20,%3                      \n"
4675       "jg          1b                            \n"
4676       "vzeroupper                                \n"
4677       : "+r"(src_u),   // %0
4678         "+r"(src_v),   // %1
4679         "+r"(dst_uv),  // %2
4680         "+r"(width)    // %3
4681       :
4682       : "memory", "cc", "xmm0", "xmm1", "xmm2");
4683 }
4684 #endif  // HAS_MERGEUVROW_AVX2
4685 
4686 #ifdef HAS_MERGEUVROW_SSE2
4687 void MergeUVRow_SSE2(const uint8_t* src_u,
4688                      const uint8_t* src_v,
4689                      uint8_t* dst_uv,
4690                      int width) {
4691   asm volatile(
4692 
4693       "sub         %0,%1                         \n"
4694 
4695       LABELALIGN
4696       "1:                                        \n"
4697       "movdqu      (%0),%%xmm0                   \n"
4698       "movdqu      0x00(%0,%1,1),%%xmm1          \n"
4699       "lea         0x10(%0),%0                   \n"
4700       "movdqa      %%xmm0,%%xmm2                 \n"
4701       "punpcklbw   %%xmm1,%%xmm0                 \n"
4702       "punpckhbw   %%xmm1,%%xmm2                 \n"
4703       "movdqu      %%xmm0,(%2)                   \n"
4704       "movdqu      %%xmm2,0x10(%2)               \n"
4705       "lea         0x20(%2),%2                   \n"
4706       "sub         $0x10,%3                      \n"
4707       "jg          1b                            \n"
4708       : "+r"(src_u),   // %0
4709         "+r"(src_v),   // %1
4710         "+r"(dst_uv),  // %2
4711         "+r"(width)    // %3
4712       :
4713       : "memory", "cc", "xmm0", "xmm1", "xmm2");
4714 }
4715 #endif  // HAS_MERGEUVROW_SSE2
4716 
4717 #ifdef HAS_MERGEUVROW_16_AVX2
4718 void MergeUVRow_16_AVX2(const uint16_t* src_u,
4719                         const uint16_t* src_v,
4720                         uint16_t* dst_uv,
4721                         int depth,
4722                         int width) {
4723   depth = 16 - depth;
4724   // clang-format off
4725   asm volatile (
4726       "vmovd       %4,%%xmm3                     \n"
4727       "sub         %0,%1                         \n"
4728 
4729     // 16 pixels per loop.
4730     LABELALIGN
4731       "1:                                        \n"
4732       "vmovdqu     (%0),%%ymm0                   \n"
4733       "vmovdqu     (%0,%1,1),%%ymm1              \n"
4734       "add         $0x20,%0                      \n"
4735 
4736       "vpsllw      %%xmm3,%%ymm0,%%ymm0          \n"
4737       "vpsllw      %%xmm3,%%ymm1,%%ymm1          \n"
4738       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm2          \n"  // mutates
4739       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm0          \n"
4740       "vextractf128 $0x0,%%ymm2,(%2)             \n"
4741       "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
4742       "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
4743       "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
4744       "add         $0x40,%2                      \n"
4745       "sub         $0x10,%3                      \n"
4746       "jg          1b                            \n"
4747       "vzeroupper                                \n"
4748   : "+r"(src_u),   // %0
4749     "+r"(src_v),   // %1
4750     "+r"(dst_uv),  // %2
4751     "+r"(width)    // %3
4752   : "r"(depth)     // %4
4753   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
4754   // clang-format on
4755 }
4756 #endif  // HAS_MERGEUVROW_16_AVX2
4757 
4758 #ifdef HAS_SPLITUVROW_16_AVX2
4759 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8,  9,  12, 13,
4760                                  2, 3, 6, 7, 10, 11, 14, 15};
4761 void SplitUVRow_16_AVX2(const uint16_t* src_uv,
4762                         uint16_t* dst_u,
4763                         uint16_t* dst_v,
4764                         int depth,
4765                         int width) {
4766   depth = 16 - depth;
4767   // clang-format off
4768   asm volatile (
4769       "vmovd       %4,%%xmm3                     \n"
4770       "vbroadcastf128 %5,%%ymm4                  \n"
4771       "sub         %1,%2                         \n"
4772 
4773     // 16 pixels per loop.
4774     LABELALIGN
4775       "1:                                        \n"
4776       "vmovdqu     (%0),%%ymm0                   \n"
4777       "vmovdqu     0x20(%0),%%ymm1               \n"
4778       "add         $0x40,%0                      \n"
4779 
4780       "vpsrlw      %%xmm3,%%ymm0,%%ymm0          \n"
4781       "vpsrlw      %%xmm3,%%ymm1,%%ymm1          \n"
4782       "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
4783       "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
4784       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4785       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
4786       "vextractf128 $0x0,%%ymm0,(%1)             \n"
4787       "vextractf128 $0x0,%%ymm1,0x10(%1)         \n"
4788       "vextractf128 $0x1,%%ymm0,(%1,%2)          \n"
4789       "vextractf128 $0x1,%%ymm1,0x10(%1,%2)      \n"
4790       "add         $0x20,%1                      \n"
4791       "sub         $0x10,%3                      \n"
4792       "jg          1b                            \n"
4793       "vzeroupper                                \n"
4794   : "+r"(src_uv),   // %0
4795     "+r"(dst_u),    // %1
4796     "+r"(dst_v),    // %2
4797     "+r"(width)     // %3
4798   : "r"(depth),     // %4
4799     "m"(kSplitUVShuffle16) // %5
4800   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4801   // clang-format on
4802 }
4803 #endif  // HAS_SPLITUVROW_16_AVX2
4804 
4805 // Use scale to convert lsb formats to msb, depending on how many bits there are:
4806 // 128 = 9 bits
4807 // 64 = 10 bits
4808 // 16 = 12 bits
4809 // 1 = 16 bits
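//
// Scalar sketch (illustrative only; cf. MultiplyRow_16_C in row_common.cc).
// vpmullw keeps the low 16 bits of each product:
//   for (int x = 0; x < width; ++x) {
//     dst_y[x] = (uint16_t)(src_y[x] * scale);
//   }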
4810 #ifdef HAS_MULTIPLYROW_16_AVX2
4811 void MultiplyRow_16_AVX2(const uint16_t* src_y,
4812                          uint16_t* dst_y,
4813                          int scale,
4814                          int width) {
4815   // clang-format off
4816   asm volatile (
4817       "vmovd       %3,%%xmm3                     \n"
4818       "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
4819       "vbroadcastss %%xmm3,%%ymm3                \n"
4820       "sub         %0,%1                         \n"
4821 
4822     // 32 pixels per loop.
4823     LABELALIGN
4824       "1:                                        \n"
4825       "vmovdqu     (%0),%%ymm0                   \n"
4826       "vmovdqu     0x20(%0),%%ymm1               \n"
4827       "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
4828       "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
4829       "vmovdqu     %%ymm0,(%0,%1)                \n"
4830       "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
4831       "add         $0x40,%0                      \n"
4832       "sub         $0x20,%2                      \n"
4833       "jg          1b                            \n"
4834       "vzeroupper                                \n"
4835   : "+r"(src_y),   // %0
4836     "+r"(dst_y),   // %1
4837     "+r"(width)    // %2
4838   : "r"(scale)     // %3
4839   : "memory", "cc", "xmm0", "xmm1", "xmm3");
4840   // clang-format on
4841 }
4842 #endif  // HAS_MULTIPLYROW_16_AVX2
4843 
4844 // Use scale to convert msb formats to lsb, depending on how many bits there are:
4845 // 512 = 9 bits
4846 // 1024 = 10 bits
4847 // 4096 = 12 bits
4848 // 65536 = 16 bits
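//
// Scalar sketch (illustrative only). vpmulhuw keeps the high 16 bits of the
// unsigned product, i.e. it multiplies by scale/65536:
//   for (int x = 0; x < width; ++x) {
//     dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * scale) >> 16);
//   }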
4849 #ifdef HAS_DIVIDEROW_16_AVX2
4850 void DivideRow_16_AVX2(const uint16_t* src_y,
4851                        uint16_t* dst_y,
4852                        int scale,
4853                        int width) {
4854   // clang-format off
4855   asm volatile (
4856       "vmovd       %3,%%xmm3                     \n"
4857       "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
4858       "vbroadcastss %%xmm3,%%ymm3                \n"
4859       "sub         %0,%1                         \n"
4860 
4861     // 32 pixels per loop.
4862     LABELALIGN
4863       "1:                                        \n"
4864       "vmovdqu     (%0),%%ymm0                   \n"
4865       "vmovdqu     0x20(%0),%%ymm1               \n"
4866       "vpmulhuw    %%ymm3,%%ymm0,%%ymm0          \n"
4867       "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
4868       "vmovdqu     %%ymm0,(%0,%1)                \n"
4869       "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
4870       "add         $0x40,%0                      \n"
4871       "sub         $0x20,%2                      \n"
4872       "jg          1b                            \n"
4873       "vzeroupper                                \n"
4874   : "+r"(src_y),   // %0
4875     "+r"(dst_y),   // %1
4876     "+r"(width),    // %2
4877     "+r"(scale)     // %3
4878   :
4879   : "memory", "cc", "xmm0", "xmm1", "xmm3");
4880   // clang-format on
4881 }
4882 #endif  // HAS_DIVIDEROW_16_AVX2
4883 
4884 // Use scale to convert lsb formats to msb, depending on how many bits there are:
4885 // 32768 = 9 bits
4886 // 16384 = 10 bits
4887 // 4096 = 12 bits
4888 // 256 = 16 bits
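//
// Scalar sketch (illustrative only; cf. Convert16To8Row_C in row_common.cc).
// pmulhuw applies scale/65536 and packuswb saturates the result to 8 bits:
//   for (int x = 0; x < width; ++x) {
//     uint32_t v = ((uint32_t)src_y[x] * scale) >> 16;
//     dst_y[x] = (uint8_t)(v > 255 ? 255 : v);
//   }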
4889 void Convert16To8Row_SSSE3(const uint16_t* src_y,
4890                            uint8_t* dst_y,
4891                            int scale,
4892                            int width) {
4893   // clang-format off
4894   asm volatile (
4895       "movd        %3,%%xmm2                     \n"
4896       "punpcklwd   %%xmm2,%%xmm2                 \n"
4897       "pshufd      $0x0,%%xmm2,%%xmm2            \n"
4898 
4899     // 16 pixels per loop.
4900     LABELALIGN
4901       "1:                                        \n"
4902       "movdqu      (%0),%%xmm0                   \n"
4903       "movdqu      0x10(%0),%%xmm1               \n"
4904       "add         $0x20,%0                      \n"
4905       "pmulhuw     %%xmm2,%%xmm0                 \n"
4906       "pmulhuw     %%xmm2,%%xmm1                 \n"
4907       "packuswb    %%xmm1,%%xmm0                 \n"
4908       "movdqu      %%xmm0,(%1)                   \n"
4909       "add         $0x10,%1                      \n"
4910       "sub         $0x10,%2                      \n"
4911       "jg          1b                            \n"
4912   : "+r"(src_y),   // %0
4913     "+r"(dst_y),   // %1
4914     "+r"(width)    // %2
4915   : "r"(scale)     // %3
4916   : "memory", "cc", "xmm0", "xmm1", "xmm2");
4917   // clang-format on
4918 }
4919 
4920 #ifdef HAS_CONVERT16TO8ROW_AVX2
4921 void Convert16To8Row_AVX2(const uint16_t* src_y,
4922                           uint8_t* dst_y,
4923                           int scale,
4924                           int width) {
4925   // clang-format off
4926   asm volatile (
4927       "vmovd       %3,%%xmm2                     \n"
4928       "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
4929       "vbroadcastss %%xmm2,%%ymm2                \n"
4930 
4931     // 32 pixels per loop.
4932     LABELALIGN
4933       "1:                                        \n"
4934       "vmovdqu     (%0),%%ymm0                   \n"
4935       "vmovdqu     0x20(%0),%%ymm1               \n"
4936       "add         $0x40,%0                      \n"
4937       "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
4938       "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
4939       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
4940       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4941       "vmovdqu     %%ymm0,(%1)                   \n"
4942       "add         $0x20,%1                      \n"
4943       "sub         $0x20,%2                      \n"
4944       "jg          1b                            \n"
4945       "vzeroupper                                \n"
4946   : "+r"(src_y),   // %0
4947     "+r"(dst_y),   // %1
4948     "+r"(width)    // %2
4949   : "r"(scale)     // %3
4950   : "memory", "cc", "xmm0", "xmm1", "xmm2");
4951   // clang-format on
4952 }
4953 #endif  // HAS_CONVERT16TO8ROW_AVX2
4954 
4955 // Use scale to convert to lsb formats, depending on how many bits there are:
4956 // 512 = 9 bits
4957 // 1024 = 10 bits
4958 // 4096 = 12 bits
4959 // TODO(fbarchard): reduce to SSE2
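//
// Scalar sketch (illustrative only; cf. Convert8To16Row_C in row_common.cc).
// punpcklbw with itself replicates each byte (v * 0x0101), and pmulhuw then
// applies scale/65536, e.g. 255 -> 1023 for scale 1024 (10 bit):
//   for (int x = 0; x < width; ++x) {
//     dst_y[x] = (uint16_t)((src_y[x] * 0x0101 * scale) >> 16);
//   }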
4960 void Convert8To16Row_SSE2(const uint8_t* src_y,
4961                           uint16_t* dst_y,
4962                           int scale,
4963                           int width) {
4964   // clang-format off
4965   asm volatile (
4966       "movd        %3,%%xmm2                     \n"
4967       "punpcklwd   %%xmm2,%%xmm2                 \n"
4968       "pshufd      $0x0,%%xmm2,%%xmm2            \n"
4969 
4970     // 16 pixels per loop.
4971     LABELALIGN
4972       "1:                                        \n"
4973       "movdqu      (%0),%%xmm0                   \n"
4974       "movdqa      %%xmm0,%%xmm1                 \n"
4975       "punpcklbw   %%xmm0,%%xmm0                 \n"
4976       "punpckhbw   %%xmm1,%%xmm1                 \n"
4977       "add         $0x10,%0                      \n"
4978       "pmulhuw     %%xmm2,%%xmm0                 \n"
4979       "pmulhuw     %%xmm2,%%xmm1                 \n"
4980       "movdqu      %%xmm0,(%1)                   \n"
4981       "movdqu      %%xmm1,0x10(%1)               \n"
4982       "add         $0x20,%1                      \n"
4983       "sub         $0x10,%2                      \n"
4984       "jg          1b                            \n"
4985   : "+r"(src_y),   // %0
4986     "+r"(dst_y),   // %1
4987     "+r"(width)    // %2
4988   : "r"(scale)     // %3
4989   : "memory", "cc", "xmm0", "xmm1", "xmm2");
4990   // clang-format on
4991 }
4992 
4993 #ifdef HAS_CONVERT8TO16ROW_AVX2
4994 void Convert8To16Row_AVX2(const uint8_t* src_y,
4995                           uint16_t* dst_y,
4996                           int scale,
4997                           int width) {
4998   // clang-format off
4999   asm volatile (
5000       "vmovd       %3,%%xmm2                     \n"
5001       "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
5002       "vbroadcastss %%xmm2,%%ymm2                \n"
5003 
5004     // 32 pixels per loop.
5005     LABELALIGN
5006       "1:                                        \n"
5007       "vmovdqu     (%0),%%ymm0                   \n"
5008       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5009       "add         $0x20,%0                      \n"
5010       "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
5011       "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
5012       "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
5013       "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
5014       "vmovdqu     %%ymm0,(%1)                   \n"
5015       "vmovdqu     %%ymm1,0x20(%1)               \n"
5016       "add         $0x40,%1                      \n"
5017       "sub         $0x20,%2                      \n"
5018       "jg          1b                            \n"
5019       "vzeroupper                                \n"
5020   : "+r"(src_y),   // %0
5021     "+r"(dst_y),   // %1
5022     "+r"(width)    // %2
5023   : "r"(scale)     // %3
5024   : "memory", "cc", "xmm0", "xmm1", "xmm2");
5025   // clang-format on
5026 }
5027 #endif  // HAS_CONVERT8TO16ROW_AVX2
5028 
5029 #ifdef HAS_SPLITRGBROW_SSSE3
5030 
5031 // Shuffle tables for converting packed RGB to planar R, G and B.
5032 static const uvec8 kShuffleMaskRGBToR0 = {0u,   3u,   6u,   9u,   12u,  15u,
5033                                           128u, 128u, 128u, 128u, 128u, 128u,
5034                                           128u, 128u, 128u, 128u};
5035 static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
5036                                           2u,   5u,   8u,   11u,  14u,  128u,
5037                                           128u, 128u, 128u, 128u};
5038 static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
5039                                           128u, 128u, 128u, 128u, 128u, 1u,
5040                                           4u,   7u,   10u,  13u};
5041 
5042 static const uvec8 kShuffleMaskRGBToG0 = {1u,   4u,   7u,   10u,  13u,  128u,
5043                                           128u, 128u, 128u, 128u, 128u, 128u,
5044                                           128u, 128u, 128u, 128u};
5045 static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
5046                                           3u,   6u,   9u,   12u,  15u,  128u,
5047                                           128u, 128u, 128u, 128u};
5048 static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
5049                                           128u, 128u, 128u, 128u, 128u, 2u,
5050                                           5u,   8u,   11u,  14u};
5051 
5052 static const uvec8 kShuffleMaskRGBToB0 = {2u,   5u,   8u,   11u,  14u,  128u,
5053                                           128u, 128u, 128u, 128u, 128u, 128u,
5054                                           128u, 128u, 128u, 128u};
5055 static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
5056                                           4u,   7u,   10u,  13u,  128u, 128u,
5057                                           128u, 128u, 128u, 128u};
5058 static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
5059                                           128u, 128u, 128u, 128u, 0u,   3u,
5060                                           6u,   9u,   12u,  15u};
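
// Scalar sketch of the split (illustrative only; cf. SplitRGBRow_C in
// row_common.cc):
//   for (int x = 0; x < width; ++x) {
//     dst_r[x] = src_rgb[x * 3 + 0];
//     dst_g[x] = src_rgb[x * 3 + 1];
//     dst_b[x] = src_rgb[x * 3 + 2];
//   }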
5061 
5062 void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
5063                        uint8_t* dst_r,
5064                        uint8_t* dst_g,
5065                        uint8_t* dst_b,
5066                        int width) {
5067   asm volatile(
5068 
5069       LABELALIGN
5070       "1:                                        \n"
5071       "movdqu      (%0),%%xmm0                   \n"
5072       "movdqu      0x10(%0),%%xmm1               \n"
5073       "movdqu      0x20(%0),%%xmm2               \n"
5074       "pshufb      %5, %%xmm0                    \n"
5075       "pshufb      %6, %%xmm1                    \n"
5076       "pshufb      %7, %%xmm2                    \n"
5077       "por         %%xmm1,%%xmm0                 \n"
5078       "por         %%xmm2,%%xmm0                 \n"
5079       "movdqu      %%xmm0,(%1)                   \n"
5080       "lea         0x10(%1),%1                   \n"
5081 
5082       "movdqu      (%0),%%xmm0                   \n"
5083       "movdqu      0x10(%0),%%xmm1               \n"
5084       "movdqu      0x20(%0),%%xmm2               \n"
5085       "pshufb      %8, %%xmm0                    \n"
5086       "pshufb      %9, %%xmm1                    \n"
5087       "pshufb      %10, %%xmm2                   \n"
5088       "por         %%xmm1,%%xmm0                 \n"
5089       "por         %%xmm2,%%xmm0                 \n"
5090       "movdqu      %%xmm0,(%2)                   \n"
5091       "lea         0x10(%2),%2                   \n"
5092 
5093       "movdqu      (%0),%%xmm0                   \n"
5094       "movdqu      0x10(%0),%%xmm1               \n"
5095       "movdqu      0x20(%0),%%xmm2               \n"
5096       "pshufb      %11, %%xmm0                   \n"
5097       "pshufb      %12, %%xmm1                   \n"
5098       "pshufb      %13, %%xmm2                   \n"
5099       "por         %%xmm1,%%xmm0                 \n"
5100       "por         %%xmm2,%%xmm0                 \n"
5101       "movdqu      %%xmm0,(%3)                   \n"
5102       "lea         0x10(%3),%3                   \n"
5103       "lea         0x30(%0),%0                   \n"
5104       "sub         $0x10,%4                      \n"
5105       "jg          1b                            \n"
5106       : "+r"(src_rgb),             // %0
5107         "+r"(dst_r),               // %1
5108         "+r"(dst_g),               // %2
5109         "+r"(dst_b),               // %3
5110         "+r"(width)                // %4
5111       : "m"(kShuffleMaskRGBToR0),  // %5
5112         "m"(kShuffleMaskRGBToR1),  // %6
5113         "m"(kShuffleMaskRGBToR2),  // %7
5114         "m"(kShuffleMaskRGBToG0),  // %8
5115         "m"(kShuffleMaskRGBToG1),  // %9
5116         "m"(kShuffleMaskRGBToG2),  // %10
5117         "m"(kShuffleMaskRGBToB0),  // %11
5118         "m"(kShuffleMaskRGBToB1),  // %12
5119         "m"(kShuffleMaskRGBToB2)   // %13
5120       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5121 }
5122 #endif  // HAS_SPLITRGBROW_SSSE3
5123 
5124 #ifdef HAS_MERGERGBROW_SSSE3
5125 
5126 // Shuffle tables for converting planar R, G and B to packed RGB.
5127 static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
5128                                           2u, 128u, 128u, 3u, 128u, 128u,
5129                                           4u, 128u, 128u, 5u};
5130 static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
5131                                           128u, 2u, 128u, 128u, 3u, 128u,
5132                                           128u, 4u, 128u, 128u};
5133 static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
5134                                           128u, 128u, 2u, 128u, 128u, 3u,
5135                                           128u, 128u, 4u, 128u};
5136 
5137 static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
5138                                           7u, 128u, 128u, 8u, 128u, 128u,
5139                                           9u, 128u, 128u, 10u};
5140 static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
5141                                           128u, 7u, 128u, 128u, 8u, 128u,
5142                                           128u, 9u, 128u, 128u};
5143 static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u,  128u, 128u, 7u,
5144                                           128u, 128u, 8u,  128u, 128u, 9u,
5145                                           128u, 128u, 10u, 128u};
5146 
5147 static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
5148                                           12u, 128u, 128u, 13u, 128u, 128u,
5149                                           14u, 128u, 128u, 15u};
5150 static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
5151                                           128u, 13u, 128u, 128u, 14u, 128u,
5152                                           128u, 15u, 128u, 128u};
5153 static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
5154                                           128u, 128u, 13u, 128u, 128u, 14u,
5155                                           128u, 128u, 15u, 128u};
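
// Scalar sketch of the merge (illustrative only; cf. MergeRGBRow_C in
// row_common.cc):
//   for (int x = 0; x < width; ++x) {
//     dst_rgb[x * 3 + 0] = src_r[x];
//     dst_rgb[x * 3 + 1] = src_g[x];
//     dst_rgb[x * 3 + 2] = src_b[x];
//   }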
5156 
5157 void MergeRGBRow_SSSE3(const uint8_t* src_r,
5158                        const uint8_t* src_g,
5159                        const uint8_t* src_b,
5160                        uint8_t* dst_rgb,
5161                        int width) {
5162   asm volatile(
5163 
5164       LABELALIGN
5165       "1:                                        \n"
5166       "movdqu      (%0),%%xmm0                   \n"
5167       "movdqu      (%1),%%xmm1                   \n"
5168       "movdqu      (%2),%%xmm2                   \n"
5169       "pshufb      %5, %%xmm0                    \n"
5170       "pshufb      %6, %%xmm1                    \n"
5171       "pshufb      %7, %%xmm2                    \n"
5172       "por         %%xmm1,%%xmm0                 \n"
5173       "por         %%xmm2,%%xmm0                 \n"
5174       "movdqu      %%xmm0,(%3)                   \n"
5175 
5176       "movdqu      (%0),%%xmm0                   \n"
5177       "movdqu      (%1),%%xmm1                   \n"
5178       "movdqu      (%2),%%xmm2                   \n"
5179       "pshufb      %8, %%xmm0                    \n"
5180       "pshufb      %9, %%xmm1                    \n"
5181       "pshufb      %10, %%xmm2                   \n"
5182       "por         %%xmm1,%%xmm0                 \n"
5183       "por         %%xmm2,%%xmm0                 \n"
5184       "movdqu      %%xmm0,16(%3)                 \n"
5185 
5186       "movdqu      (%0),%%xmm0                   \n"
5187       "movdqu      (%1),%%xmm1                   \n"
5188       "movdqu      (%2),%%xmm2                   \n"
5189       "pshufb      %11, %%xmm0                   \n"
5190       "pshufb      %12, %%xmm1                   \n"
5191       "pshufb      %13, %%xmm2                   \n"
5192       "por         %%xmm1,%%xmm0                 \n"
5193       "por         %%xmm2,%%xmm0                 \n"
5194       "movdqu      %%xmm0,32(%3)                 \n"
5195 
5196       "lea         0x10(%0),%0                   \n"
5197       "lea         0x10(%1),%1                   \n"
5198       "lea         0x10(%2),%2                   \n"
5199       "lea         0x30(%3),%3                   \n"
5200       "sub         $0x10,%4                      \n"
5201       "jg          1b                            \n"
5202       : "+r"(src_r),               // %0
5203         "+r"(src_g),               // %1
5204         "+r"(src_b),               // %2
5205         "+r"(dst_rgb),             // %3
5206         "+r"(width)                // %4
5207       : "m"(kShuffleMaskRToRGB0),  // %5
5208         "m"(kShuffleMaskGToRGB0),  // %6
5209         "m"(kShuffleMaskBToRGB0),  // %7
5210         "m"(kShuffleMaskRToRGB1),  // %8
5211         "m"(kShuffleMaskGToRGB1),  // %9
5212         "m"(kShuffleMaskBToRGB1),  // %10
5213         "m"(kShuffleMaskRToRGB2),  // %11
5214         "m"(kShuffleMaskGToRGB2),  // %12
5215         "m"(kShuffleMaskBToRGB2)   // %13
5216       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5217 }
5218 #endif  // HAS_MERGERGBROW_SSSE3
5219 
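// The MergeARGB/MergeXRGB kernels below interleave planar channels into
// packed ARGB, which libyuv stores as B,G,R,A bytes in memory. Scalar sketch
// (illustrative only):
//   for (int x = 0; x < width; ++x) {
//     dst_argb[x * 4 + 0] = src_b[x];
//     dst_argb[x * 4 + 1] = src_g[x];
//     dst_argb[x * 4 + 2] = src_r[x];
//     dst_argb[x * 4 + 3] = src_a[x];  // the XRGB variants write 255 here
//   }
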
5220 #ifdef HAS_MERGEARGBROW_SSE2
5221 void MergeARGBRow_SSE2(const uint8_t* src_r,
5222                        const uint8_t* src_g,
5223                        const uint8_t* src_b,
5224                        const uint8_t* src_a,
5225                        uint8_t* dst_argb,
5226                        int width) {
5227   asm volatile(
5228 
5229       "sub         %0,%1                         \n"
5230       "sub         %0,%2                         \n"
5231       "sub         %0,%3                         \n"
5232 
5233       LABELALIGN
5234       "1:                                        \n"
5235 
5236       "movq        (%0,%2),%%xmm0                \n"  // B
5237       "movq        (%0),%%xmm1                   \n"  // R
5238       "movq        (%0,%1),%%xmm2                \n"  // G
5239       "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
5240       "movq        (%0,%3),%%xmm1                \n"  // A
5241       "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
5242       "movdqa      %%xmm0,%%xmm1                 \n"  // BR
5243       "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
5244       "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
5245       "movdqu      %%xmm0,(%4)                   \n"
5246       "movdqu      %%xmm1,16(%4)                 \n"
5247 
5248       "lea         8(%0),%0                      \n"
5249       "lea         32(%4),%4                     \n"
5250       "sub         $0x8,%5                       \n"
5251       "jg          1b                            \n"
5252       : "+r"(src_r),     // %0
5253         "+r"(src_g),     // %1
5254         "+r"(src_b),     // %2
5255         "+r"(src_a),     // %3
5256         "+r"(dst_argb),  // %4
5257         "+r"(width)      // %5
5258       :
5259       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5260 }
5261 #endif  // HAS_MERGEARGBROW_SSE2
5262 
5263 #ifdef HAS_MERGEXRGBROW_SSE2
5264 void MergeXRGBRow_SSE2(const uint8_t* src_r,
5265                        const uint8_t* src_g,
5266                        const uint8_t* src_b,
5267                        uint8_t* dst_argb,
5268                        int width) {
5269   asm volatile(
5270 
5271       LABELALIGN
5272       "1:                                        \n"
5273 
5274       "movq        (%2),%%xmm0                   \n"  // B
5275       "movq        (%0),%%xmm1                   \n"  // R
5276       "movq        (%1),%%xmm2                   \n"  // G
5277       "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
5278       "pcmpeqd     %%xmm1,%%xmm1                 \n"  // A(255)
5279       "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
5280       "movdqa      %%xmm0,%%xmm1                 \n"  // BR
5281       "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
5282       "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
5283       "movdqu      %%xmm0,(%3)                   \n"
5284       "movdqu      %%xmm1,16(%3)                 \n"
5285 
5286       "lea         8(%0),%0                      \n"
5287       "lea         8(%1),%1                      \n"
5288       "lea         8(%2),%2                      \n"
5289       "lea         32(%3),%3                     \n"
5290       "sub         $0x8,%4                       \n"
5291       "jg          1b                            \n"
5292       : "+r"(src_r),     // %0
5293         "+r"(src_g),     // %1
5294         "+r"(src_b),     // %2
5295         "+r"(dst_argb),  // %3
5296         "+r"(width)      // %4
5297       :
5298       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5299 }
5300 #endif  // HAS_MERGEXRGBROW_SSE2
5301 
5302 #ifdef HAS_MERGEARGBROW_AVX2
5303 void MergeARGBRow_AVX2(const uint8_t* src_r,
5304                        const uint8_t* src_g,
5305                        const uint8_t* src_b,
5306                        const uint8_t* src_a,
5307                        uint8_t* dst_argb,
5308                        int width) {
5309   asm volatile(
5310 
5311       "sub         %0,%1                         \n"
5312       "sub         %0,%2                         \n"
5313       "sub         %0,%3                         \n"
5314 
5315       LABELALIGN
5316       "1:                                        \n"
5317 
5318       "vmovdqu     (%0,%2),%%xmm0                \n"  // B
5319       "vmovdqu     (%0,%1),%%xmm1                \n"  // R
5320       "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // G
5321       "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1      \n"  // A
5322       "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
5323       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
5324       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5325       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5326       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
5327       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
5328       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5329       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5330       "vmovdqu     %%ymm0,(%4)                   \n"  // First 8
5331       "vmovdqu     %%ymm1,32(%4)                 \n"  // Next 8
5332 
5333       "lea         16(%0),%0                     \n"
5334       "lea         64(%4),%4                     \n"
5335       "sub         $0x10,%5                      \n"
5336       "jg          1b                            \n"
5337       "vzeroupper                                \n"
5338       : "+r"(src_r),     // %0
5339         "+r"(src_g),     // %1
5340         "+r"(src_b),     // %2
5341         "+r"(src_a),     // %3
5342         "+r"(dst_argb),  // %4
5343         "+r"(width)      // %5
5344       :
5345       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5346 }
5347 #endif  // HAS_MERGEARGBROW_AVX2
5348 
5349 #ifdef HAS_MERGEXRGBROW_AVX2
5350 void MergeXRGBRow_AVX2(const uint8_t* src_r,
5351                        const uint8_t* src_g,
5352                        const uint8_t* src_b,
5353                        uint8_t* dst_argb,
5354                        int width) {
5355   asm volatile(
5356 
5357       LABELALIGN
5358       "1:                                        \n"
5359 
5360       "vmovdqu     (%2),%%xmm0                   \n"  // B
5361       "vpcmpeqd    %%ymm1,%%ymm1,%%ymm1          \n"  // A(255)
5362       "vinserti128 $0,(%1),%%ymm1,%%ymm1         \n"  // R
5363       "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // G
5364       "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
5365       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
5366       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5367       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5368       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
5369       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
5370       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5371       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5372       "vmovdqu     %%ymm0,(%3)                   \n"  // First 8
5373       "vmovdqu     %%ymm1,32(%3)                 \n"  // Next 8
5374 
5375       "lea         16(%0),%0                     \n"
5376       "lea         16(%1),%1                     \n"
5377       "lea         16(%2),%2                     \n"
5378       "lea         64(%3),%3                     \n"
5379       "sub         $0x10,%4                      \n"
5380       "jg          1b                            \n"
5381       "vzeroupper                                \n"
5382       : "+r"(src_r),     // %0
5383         "+r"(src_g),     // %1
5384         "+r"(src_b),     // %2
5385         "+r"(dst_argb),  // %3
5386         "+rm"(width)     // %4
5387       :
5388       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5389 }
5390 #endif  // HAS_MERGEXRGBROW_AVX2
5391 
5392 #ifdef HAS_SPLITARGBROW_SSE2
5393 void SplitARGBRow_SSE2(const uint8_t* src_argb,
5394                        uint8_t* dst_r,
5395                        uint8_t* dst_g,
5396                        uint8_t* dst_b,
5397                        uint8_t* dst_a,
5398                        int width) {
5399   asm volatile(
5400 
5401       "sub         %1,%2                         \n"
5402       "sub         %1,%3                         \n"
5403       "sub         %1,%4                         \n"
5404 
5405       LABELALIGN
5406       "1:                                        \n"
5407 
5408       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5409       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5410       "movdqa      %%xmm0,%%xmm2                 \n"
5411       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
5412       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
5413       "movdqa      %%xmm0,%%xmm1                 \n"
5414       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
5415       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
5416       "movdqa      %%xmm0,%%xmm2                 \n"
5417       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
5418       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
5419       "movdqa      %%xmm0,%%xmm1                 \n"
5420       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5421       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5422       "movdqa      %%xmm0,%%xmm2                 \n"
5423       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5424       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5425       "movlps      %%xmm0,(%1,%3)                \n"  // B
5426       "movhps      %%xmm0,(%1,%2)                \n"  // G
5427       "movlps      %%xmm2,(%1)                   \n"  // R
5428       "movhps      %%xmm2,(%1,%4)                \n"  // A
5429 
5430       "lea         32(%0),%0                     \n"
5431       "lea         8(%1),%1                      \n"
5432       "sub         $0x8,%5                       \n"
5433       "jg          1b                            \n"
5434       : "+r"(src_argb),  // %0
5435         "+r"(dst_r),     // %1
5436         "+r"(dst_g),     // %2
5437         "+r"(dst_b),     // %3
5438         "+r"(dst_a),     // %4
5439         "+rm"(width)     // %5
5440       :
5441       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5442 }
5443 #endif  // HAS_SPLITARGBROW_SSE2
5444 
5445 #ifdef HAS_SPLITXRGBROW_SSE2
5446 void SplitXRGBRow_SSE2(const uint8_t* src_argb,
5447                        uint8_t* dst_r,
5448                        uint8_t* dst_g,
5449                        uint8_t* dst_b,
5450                        int width) {
5451   asm volatile(
5452 
5453       LABELALIGN
5454       "1:                                        \n"
5455 
5456       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5457       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5458       "movdqa      %%xmm0,%%xmm2                 \n"
5459       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
5460       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
5461       "movdqa      %%xmm0,%%xmm1                 \n"
5462       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
5463       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
5464       "movdqa      %%xmm0,%%xmm2                 \n"
5465       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
5466       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
5467       "movdqa      %%xmm0,%%xmm1                 \n"
5468       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5469       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5470       "movdqa      %%xmm0,%%xmm2                 \n"
5471       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5472       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5473       "movlps      %%xmm0,(%3)                   \n"  // B
5474       "movhps      %%xmm0,(%2)                   \n"  // G
5475       "movlps      %%xmm2,(%1)                   \n"  // R
5476 
5477       "lea         32(%0),%0                     \n"
5478       "lea         8(%1),%1                      \n"
5479       "lea         8(%2),%2                      \n"
5480       "lea         8(%3),%3                      \n"
5481       "sub         $0x8,%4                       \n"
5482       "jg          1b                            \n"
5483       : "+r"(src_argb),  // %0
5484         "+r"(dst_r),     // %1
5485         "+r"(dst_g),     // %2
5486         "+r"(dst_b),     // %3
5487         "+rm"(width)     // %4
5488       :
5489       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5490 }
5491 #endif  // HAS_SPLITXRGBROW_SSE2
5492 
5493 static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8,  12, 1, 5, 9,  13,
5494                                             2, 6, 10, 14, 3, 7, 11, 15};
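
// kShuffleMaskARGBSplit gathers the byte-0/1/2/3 channels of 4 pixels into
// 32-bit lanes. Scalar sketch of the split itself (illustrative only):
//   for (int x = 0; x < width; ++x) {
//     dst_b[x] = src_argb[x * 4 + 0];
//     dst_g[x] = src_argb[x * 4 + 1];
//     dst_r[x] = src_argb[x * 4 + 2];
//     dst_a[x] = src_argb[x * 4 + 3];  // the XRGB variants discard alpha
//   }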
5495 #ifdef HAS_SPLITARGBROW_SSSE3
5496 void SplitARGBRow_SSSE3(const uint8_t* src_argb,
5497                         uint8_t* dst_r,
5498                         uint8_t* dst_g,
5499                         uint8_t* dst_b,
5500                         uint8_t* dst_a,
5501                         int width) {
5502   asm volatile(
5503 
5504       "movdqa      %6,%%xmm3                     \n"
5505       "sub         %1,%2                         \n"
5506       "sub         %1,%3                         \n"
5507       "sub         %1,%4                         \n"
5508 
5509       LABELALIGN
5510       "1:                                        \n"
5511 
5512       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5513       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5514       "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5515       "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5516       "movdqa      %%xmm0,%%xmm2                 \n"
5517       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5518       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5519       "movlps      %%xmm0,(%1,%3)                \n"  // B
5520       "movhps      %%xmm0,(%1,%2)                \n"  // G
5521       "movlps      %%xmm2,(%1)                   \n"  // R
5522       "movhps      %%xmm2,(%1,%4)                \n"  // A
5523 
5524       "lea         32(%0),%0                     \n"
5525       "lea         8(%1),%1                      \n"
5526       "subl        $0x8,%5                       \n"
5527       "jg          1b                            \n"
5528       : "+r"(src_argb),  // %0
5529         "+r"(dst_r),     // %1
5530         "+r"(dst_g),     // %2
5531         "+r"(dst_b),     // %3
5532         "+r"(dst_a),     // %4
5533 #if defined(__i386__)
5534         "+m"(width)  // %5
5535 #else
5536         "+rm"(width)          // %5
5537 #endif
5538       : "m"(kShuffleMaskARGBSplit)  // %6
5539       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
5540 }
5541 #endif  // HAS_SPLITARGBROW_SSSE3
5542 
5543 #ifdef HAS_SPLITXRGBROW_SSSE3
5544 void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
5545                         uint8_t* dst_r,
5546                         uint8_t* dst_g,
5547                         uint8_t* dst_b,
5548                         int width) {
5549   asm volatile(
5550 
5551       "movdqa      %5,%%xmm3                     \n"
5552 
5553       LABELALIGN
5554       "1:                                        \n"
5555 
5556       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5557       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5558       "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5559       "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5560       "movdqa      %%xmm0,%%xmm2                 \n"
5561       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5562       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5563       "movlps      %%xmm0,(%3)                   \n"  // B
5564       "movhps      %%xmm0,(%2)                   \n"  // G
5565       "movlps      %%xmm2,(%1)                   \n"  // R
5566 
5567       "lea         32(%0),%0                     \n"
5568       "lea         8(%1),%1                      \n"
5569       "lea         8(%2),%2                      \n"
5570       "lea         8(%3),%3                      \n"
5571       "sub         $0x8,%4                       \n"
5572       "jg          1b                            \n"
5573       : "+r"(src_argb),             // %0
5574         "+r"(dst_r),                // %1
5575         "+r"(dst_g),                // %2
5576         "+r"(dst_b),                // %3
5577         "+r"(width)                 // %4
5578       : "m"(kShuffleMaskARGBSplit)  // %5
5579       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
5580 }
5581 #endif

#ifdef HAS_SPLITARGBROW_AVX2
static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
void SplitARGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(

      "sub         %1,%2                         \n"
      "sub         %1,%3                         \n"
      "sub         %1,%4                         \n"
      "vmovdqa     %7,%%ymm3                     \n"
      "vbroadcastf128 %6,%%ymm4                  \n"

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
      "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
      "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
      "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
      "vmovdqu     %%xmm0,(%1,%3)                \n"  // B
      "vextracti128 $1,%%ymm0,(%1)               \n"  // R
      "vmovdqu     %%xmm2,(%1,%2)                \n"  // G
      "vextracti128 $1,%%ymm2,(%1,%4)            \n"  // A
      "lea         64(%0),%0                     \n"
      "lea         16(%1),%1                     \n"
      "subl        $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(dst_a),     // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)          // %5
#endif
      : "m"(kShuffleMaskARGBSplit),   // %6
        "m"(kShuffleMaskARGBPermute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SPLITXRGBROW_AVX2
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      "vmovdqa     %6,%%ymm3                     \n"
      "vbroadcastf128 %5,%%ymm4                  \n"

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
      "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
      "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
      "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
      "vmovdqu     %%xmm0,(%3)                   \n"  // B
      "vextracti128 $1,%%ymm0,(%1)               \n"  // R
      "vmovdqu     %%xmm2,(%2)                   \n"  // G

      "lea         64(%0),%0                     \n"
      "lea         16(%1),%1                     \n"
      "lea         16(%2),%2                     \n"
      "lea         16(%3),%3                     \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),               // %0
        "+r"(dst_r),                  // %1
        "+r"(dst_g),                  // %2
        "+r"(dst_b),                  // %3
        "+r"(width)                   // %4
      : "m"(kShuffleMaskARGBSplit),   // %5
        "m"(kShuffleMaskARGBPermute)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_MERGEXR30ROW_AVX2
void MergeXR30Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = depth - 10;
  asm volatile(

      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrlw      $6,%%ymm6,%%ymm6              \n"  // 1023 clamp value
      "vmovd       %5,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     (%0,%1),%%ymm1                \n"
      "vmovdqu     (%0,%2),%%ymm2                \n"
      "vpsrlw      %%xmm4,%%ymm0,%%ymm0          \n"
      "vpsrlw      %%xmm4,%%ymm1,%%ymm1          \n"
      "vpsrlw      %%xmm4,%%ymm2,%%ymm2          \n"
      "vpminuw     %%ymm0,%%ymm6,%%ymm0          \n"
      "vpminuw     %%ymm1,%%ymm6,%%ymm1          \n"
      "vpminuw     %%ymm2,%%ymm6,%%ymm2          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
      "vpsllw      $0x4,%%ymm0,%%ymm0            \n"  // Shift R to target bit
      "vpunpckhwd  %%ymm0,%%ymm2,%%ymm3          \n"  // RB
      "vpunpcklwd  %%ymm0,%%ymm2,%%ymm0          \n"
      "vpunpckhwd  %%ymm5,%%ymm1,%%ymm2          \n"  // AG
      "vpunpcklwd  %%ymm5,%%ymm1,%%ymm1          \n"
      "vpslld      $0xa,%%ymm1,%%ymm1            \n"  // Shift AG to target bit
      "vpslld      $0xa,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // Combine
      "vpor        %%ymm2,%%ymm3,%%ymm3          \n"
      "vmovdqu     %%ymm0,(%3)                   \n"
      "vmovdqu     %%ymm3,0x20(%3)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
#if defined(__i386__)
      : "m"(shift)  // %5
#else
      : "rm"(shift)           // %5
#endif
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif
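
// A minimal scalar sketch of the AR30 packing above, assuming the
// little-endian AR30 word layout used here (B in bits 0-9, G in 10-19,
// R in 20-29, 2-bit alpha forced to 3 in 30-31). Hypothetical name,
// illustrative only and kept out of the build.
#if 0
static void MergeXR30Row_Sketch(const uint16_t* src_r,
                                const uint16_t* src_g,
                                const uint16_t* src_b,
                                uint8_t* dst_ar30,
                                int depth,
                                int width) {
  int x;
  int shift = depth - 10;  // Reduce 10..16 bit input to 10 bits.
  uint32_t* dst = (uint32_t*)dst_ar30;
  for (x = 0; x < width; ++x) {
    uint32_t r = src_r[x] >> shift;
    uint32_t g = src_g[x] >> shift;
    uint32_t b = src_b[x] >> shift;
    if (r > 1023) r = 1023;  // Same clamp the vpminuw against 1023 does.
    if (g > 1023) g = 1023;
    if (b > 1023) b = 1023;
    dst[x] = b | (g << 10) | (r << 20) | 0xc0000000u;  // Alpha = 3 (opaque).
  }
}
#endif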

#ifdef HAS_MERGEAR64ROW_AVX2
static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
void MergeAR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;
  asm volatile(

      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"
      "vmovdqa     %8,%%ymm5                     \n"
      "vmovd       %6,%%xmm6                     \n"
      "vbroadcastss %7,%%ymm7                    \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vmovdqu     (%0,%3),%%ymm3                \n"  // A
      "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"
      "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
      "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
      "vpminuw     %%ymm3,%%ymm7,%%ymm3          \n"
      "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"
      "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpsllw      %%xmm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"
      "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm5,%%ymm3          \n"
      "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
      "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
      "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
      "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
      "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
      "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
      "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
      "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
      "vmovdqu     %%ymm3,(%4)                   \n"
      "vmovdqu     %%ymm2,0x20(%4)               \n"
      "vmovdqu     %%ymm4,0x40(%4)               \n"
      "vmovdqu     %%ymm1,0x60(%4)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x80(%4),%4                   \n"
      "subl        $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)          // %5
#endif
      : "m"(shift),            // %6
        "m"(mask),             // %7
        "m"(MergeAR64Permute)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
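
// A minimal scalar sketch of the AR64 merge above: clamp each channel to
// its bit depth (the vpminuw against (1 << depth) - 1), then scale to
// 16 bits by shifting into the most significant bits. AR64 memory order
// is B,G,R,A as 16-bit little-endian values. Hypothetical name,
// illustrative only.
#if 0
static void MergeAR64Row_Sketch(const uint16_t* src_r,
                                const uint16_t* src_g,
                                const uint16_t* src_b,
                                const uint16_t* src_a,
                                uint16_t* dst_ar64,
                                int depth,
                                int width) {
  int x;
  int shift = 16 - depth;
  uint16_t max = (uint16_t)((1 << depth) - 1);
  for (x = 0; x < width; ++x) {
    uint16_t r = src_r[x] < max ? src_r[x] : max;
    uint16_t g = src_g[x] < max ? src_g[x] : max;
    uint16_t b = src_b[x] < max ? src_b[x] : max;
    uint16_t a = src_a[x] < max ? src_a[x] : max;
    dst_ar64[4 * x + 0] = (uint16_t)(b << shift);
    dst_ar64[4 * x + 1] = (uint16_t)(g << shift);
    dst_ar64[4 * x + 2] = (uint16_t)(r << shift);
    dst_ar64[4 * x + 3] = (uint16_t)(a << shift);
  }
}
#endif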

#ifdef HAS_MERGEXR64ROW_AVX2
void MergeXR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;
  asm volatile(

      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "vmovdqa     %7,%%ymm5                     \n"
      "vmovd       %5,%%xmm6                     \n"
      "vbroadcastss %6,%%ymm7                    \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"
      "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
      "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
      "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"
      "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"
      "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"  // A (0xffff)
      "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
      "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
      "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
      "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
      "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
      "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
      "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
      "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
      "vmovdqu     %%ymm3,(%3)                   \n"
      "vmovdqu     %%ymm2,0x20(%3)               \n"
      "vmovdqu     %%ymm4,0x40(%3)               \n"
      "vmovdqu     %%ymm1,0x60(%3)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x80(%3),%3                   \n"
      "subl        $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),           // %0
        "+r"(src_g),           // %1
        "+r"(src_b),           // %2
        "+r"(dst_ar64),        // %3
        "+r"(width)            // %4
      : "m"(shift),            // %5
        "m"(mask),             // %6
        "m"(MergeAR64Permute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_MERGEARGB16TO8ROW_AVX2
static const uvec8 MergeARGB16To8Shuffle = {0, 8,  1, 9,  2, 10, 3, 11,
                                            4, 12, 5, 13, 6, 14, 7, 15};
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;
  asm volatile(

      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"
      "vbroadcastf128 %7,%%ymm5                  \n"
      "vmovd       %6,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vmovdqu     (%0,%3),%%ymm3                \n"  // A
      "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"
      "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpsrlw      %%xmm6,%%ymm3,%%ymm3          \n"
      "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
      "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
      "vmovdqu     %%ymm2,(%4)                   \n"
      "vmovdqu     %%ymm0,0x20(%4)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%4),%4                   \n"
      "subl        $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)          // %5
#endif
      : "m"(shift),                 // %6
        "m"(MergeARGB16To8Shuffle)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
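
// A minimal scalar sketch of the 16-to-8 bit merge above: shift each
// channel down by (depth - 8), saturate to 255 (what vpackuswb does),
// and interleave as B,G,R,A bytes. Hypothetical name, illustrative only.
#if 0
static void MergeARGB16To8Row_Sketch(const uint16_t* src_r,
                                     const uint16_t* src_g,
                                     const uint16_t* src_b,
                                     const uint16_t* src_a,
                                     uint8_t* dst_argb,
                                     int depth,
                                     int width) {
  int x;
  int shift = depth - 8;
  for (x = 0; x < width; ++x) {
    uint16_t b = src_b[x] >> shift;
    uint16_t g = src_g[x] >> shift;
    uint16_t r = src_r[x] >> shift;
    uint16_t a = src_a[x] >> shift;
    dst_argb[4 * x + 0] = (uint8_t)(b > 255 ? 255 : b);
    dst_argb[4 * x + 1] = (uint8_t)(g > 255 ? 255 : g);
    dst_argb[4 * x + 2] = (uint8_t)(r > 255 ? 255 : r);
    dst_argb[4 * x + 3] = (uint8_t)(a > 255 ? 255 : a);
  }
}
#endif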

#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;
  asm volatile(

      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "vbroadcastf128 %6,%%ymm5                  \n"
      "vmovd       %5,%%xmm6                     \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
      "vpsrlw      $8,%%ymm3,%%ymm3              \n"  // A (0xff)

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"
      "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
      "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
      "vmovdqu     %%ymm2,(%3)                   \n"
      "vmovdqu     %%ymm0,0x20(%3)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%3),%3                   \n"
      "subl        $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),                // %0
        "+r"(src_g),                // %1
        "+r"(src_b),                // %2
        "+r"(dst_argb),             // %3
        "+r"(width)                 // %4
      : "m"(shift),                 // %5
        "m"(MergeARGB16To8Shuffle)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "test        $0xf,%0                       \n"
      "jne         2f                            \n"
      "test        $0xf,%1                       \n"
      "jne         2f                            \n"

      LABELALIGN
      "1:                                        \n"
      "movdqa      (%0),%%xmm0                   \n"
      "movdqa      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm0,(%1)                   \n"
      "movdqa      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "jmp         9f                            \n"

      LABELALIGN
      "2:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          2b                            \n"

      LABELALIGN
      "9:                                        \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x40,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_AVX

#ifdef HAS_COPYROW_ERMS
// Copies width bytes; no width alignment required (multiple of 1).
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep         movsb                         \n"
      : "+S"(src),       // %0
        "+D"(dst),       // %1
        "+c"(width_tmp)  // %2
      :
      : "memory", "cc");
}
#endif  // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqu      (%1),%%xmm4                   \n"
      "movdqu      0x10(%1),%%xmm5               \n"
      "pand        %%xmm0,%%xmm2                 \n"
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm1                   \n"
      "vmovdqu     0x20(%0),%%ymm2               \n"
      "lea         0x40(%0),%0                   \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0), %%xmm0                  \n"
      "movdqu      0x10(%0), %%xmm1              \n"
      "lea         0x20(%0), %0                  \n"
      "psrld       $0x18, %%xmm0                 \n"
      "psrld       $0x18, %%xmm1                 \n"
      "packssdw    %%xmm1, %%xmm0                \n"
      "packuswb    %%xmm0, %%xmm0                \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1), %1                   \n"
      "sub         $0x8, %2                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm4                     \n"
      "vbroadcastf128 %4,%%ymm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0), %%ymm0                  \n"
      "vmovdqu     0x20(%0), %%ymm1              \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // vpsrld $0x18, %%ymm0
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x40(%0), %%ymm2              \n"
      "vmovdqu     0x60(%0), %%ymm3              \n"
      "lea         0x80(%0), %0                  \n"
      "vpackssdw   %%ymm1, %%ymm0, %%ymm0        \n"  // mutates
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // mutates
      "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates.
      "vpermd      %%ymm0,%%ymm4,%%ymm0          \n"  // unmutate.
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20, %2                     \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),               // %0
        "+r"(dst_a),                  // %1
        "+rm"(width)                  // %2
      : "m"(kPermdARGBToY_AVX),       // %3
        "m"(kShuffleAlphaShort_AVX2)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm2                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "punpckhwd   %%xmm2,%%xmm3                 \n"  // xmm3 junk masked below
      "punpcklwd   %%xmm2,%%xmm2                 \n"
      "movdqu      (%1),%%xmm4                   \n"
      "movdqu      0x10(%1),%%xmm5               \n"
      "pand        %%xmm0,%%xmm2                 \n"
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd   (%0),%%ymm1                   \n"
      "vpmovzxbd   0x8(%0),%%ymm2                \n"
      "lea         0x10(%0),%0                   \n"
      "vpslld      $0x18,%%ymm1,%%ymm1           \n"
      "vpslld      $0x18,%%ymm2,%%ymm2           \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile(

      "rep         stosl                         \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}

void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep         stosb                         \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v8)          // %2
      : "memory", "cc");
}

void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep         stosl                         \n"
      : "+D"(dst_argb),  // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}
#endif  // HAS_SETROW_X86
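
// The multiply in SetRow_X86 is the usual byte-splat trick: for example
// 0x5Au * 0x01010101u == 0x5A5A5A5Au, so each 32-bit "rep stosl" store
// writes the byte four times (note SetRow_X86 only covers width & ~3
// bytes). A scalar equivalent, illustrative only:
#if 0
#include <string.h>
static void SetRow_Sketch(uint8_t* dst, uint8_t v8, int width) {
  memset(dst, v8, width);
}
#endif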

#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "pavgb       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "pavgb       %%xmm3,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_SSE2
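
// Byte layouts behind the masks and shifts above: YUY2 stores pixel pairs
// as Y0,U,Y1,V and UYVY as U,Y0,V,Y1, so a 0x00ff word mask (xmm5) keeps
// the even bytes and a psrlw $8 keeps the odd bytes. Minimal scalar
// sketches with hypothetical names, illustrative only:
#if 0
static void YUY2ToYRow_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];  // Even bytes are luma in YUY2.
  }
}
static void UYVYToUV422Row_Sketch(const uint8_t* src_uyvy, uint8_t* dst_u,
                                  uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; x += 2) {  // One U,V pair per two pixels.
    dst_u[x / 2] = src_uyvy[2 * x + 0];
    dst_v[x / 2] = src_uyvy[2 * x + 2];
  }
}
#endif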

#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
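
// In scalar terms, the loops below compute, per channel,
//   dst = src_argb + (src_argb1 * (256 - alpha) >> 8)
// with saturating adds, and force the result alpha to 255: kShuffleAlpha
// broadcasts the (pxor-inverted) alpha byte into each 16-bit lane, the
// paddw of 1 turns (255 - a) into (256 - a), and pmullw/psrlw perform the
// fixed-point multiply on the even and odd bytes separately.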

// Blend 4 pixels at a time.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $0xf,%%xmm7                   \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "pslld       $0x18,%%xmm4                  \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm3                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      "pxor        %%xmm4,%%xmm3                 \n"
      "movdqu      (%1),%%xmm2                   \n"
      "pshufb      %4,%%xmm3                     \n"
      "pand        %%xmm6,%%xmm2                 \n"
      "paddw       %%xmm7,%%xmm3                 \n"
      "pmullw      %%xmm3,%%xmm2                 \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"
      "jl          99f                           \n"

      // 1 pixel loop.
      "91:                                       \n"
      "movd        (%0),%%xmm3                   \n"
      "lea         0x4(%0),%0                    \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      "pxor        %%xmm4,%%xmm3                 \n"
      "movd        (%1),%%xmm2                   \n"
      "pshufb      %4,%%xmm3                     \n"
      "pand        %%xmm6,%%xmm2                 \n"
      "paddw       %%xmm7,%%xmm3                 \n"
      "pmullw      %%xmm3,%%xmm2                 \n"
      "movd        (%1),%%xmm1                   \n"
      "lea         0x4(%1),%1                    \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
6787       "sub         $0x1,%3                       \n"
6788       "jge         91b                           \n"
6789       "99:                                       \n"
6790       : "+r"(src_argb),     // %0
6791         "+r"(src_argb1),    // %1
6792         "+r"(dst_argb),     // %2
6793         "+r"(width)         // %3
6794       : "m"(kShuffleAlpha)  // %4
6795       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6796         "xmm7");
6797 }
6798 #endif  // HAS_ARGBBLENDROW_SSSE3
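
// A scalar sketch of the blend ARGBBlendRow_SSSE3 computes, assuming
// src_argb carries premultiplied (attenuated) alpha: each color channel
// becomes src + ((dst * (256 - src_alpha)) >> 8) with unsigned saturation,
// and the output alpha is forced to 255. The _Sketch helper is illustrative
// only, not a libyuv API.
static inline void ARGBBlendPixel_Sketch(const uint8_t* src,
                                         const uint8_t* dst_in,
                                         uint8_t* dst_out) {
  uint32_t ia = 256 - src[3];  // pxor + paddw 1: inverse alpha, biased to 256.
  for (int c = 0; c < 3; ++c) {
    uint32_t v = src[c] + ((dst_in[c] * ia) >> 8);
    dst_out[c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates.
  }
  dst_out[3] = 255;  // por with the 0xff000000 mask makes pixels opaque.
}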
6799 
6800 #ifdef HAS_BLENDPLANEROW_SSSE3
6801 // Blend 8 pixels at a time.
6802 // unsigned version of math
6803 // =((A2*C2)+(B2*(255-C2))+255)/256
6804 // signed version of math
6805 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
6806 void BlendPlaneRow_SSSE3(const uint8_t* src0,
6807                          const uint8_t* src1,
6808                          const uint8_t* alpha,
6809                          uint8_t* dst,
6810                          int width) {
6811   asm volatile(
6812       "pcmpeqb     %%xmm5,%%xmm5                 \n"
6813       "psllw       $0x8,%%xmm5                   \n"
6814       "mov         $0x80808080,%%eax             \n"
6815       "movd        %%eax,%%xmm6                  \n"
6816       "pshufd      $0x0,%%xmm6,%%xmm6            \n"
6817       "mov         $0x807f807f,%%eax             \n"
6818       "movd        %%eax,%%xmm7                  \n"
6819       "pshufd      $0x0,%%xmm7,%%xmm7            \n"
6820       "sub         %2,%0                         \n"
6821       "sub         %2,%1                         \n"
6822       "sub         %2,%3                         \n"
6823 
6824       // 8 pixel loop.
6825       LABELALIGN
6826       "1:                                        \n"
6827       "movq        (%2),%%xmm0                   \n"
6828       "punpcklbw   %%xmm0,%%xmm0                 \n"
6829       "pxor        %%xmm5,%%xmm0                 \n"
6830       "movq        (%0,%2,1),%%xmm1              \n"
6831       "movq        (%1,%2,1),%%xmm2              \n"
6832       "punpcklbw   %%xmm2,%%xmm1                 \n"
6833       "psubb       %%xmm6,%%xmm1                 \n"
6834       "pmaddubsw   %%xmm1,%%xmm0                 \n"
6835       "paddw       %%xmm7,%%xmm0                 \n"
6836       "psrlw       $0x8,%%xmm0                   \n"
6837       "packuswb    %%xmm0,%%xmm0                 \n"
6838       "movq        %%xmm0,(%3,%2,1)              \n"
6839       "lea         0x8(%2),%2                    \n"
6840       "sub         $0x8,%4                       \n"
6841       "jg          1b                            \n"
6842       : "+r"(src0),   // %0
6843         "+r"(src1),   // %1
6844         "+r"(alpha),  // %2
6845         "+r"(dst),    // %3
6846         "+rm"(width)  // %4
6847         ::"memory",
6848         "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
6849 }
6850 #endif  // HAS_BLENDPLANEROW_SSSE3
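
// A scalar sketch of the unsigned form in the comment above,
// ((A2 * C2) + (B2 * (255 - C2)) + 255) / 256, which is algebraically equal
// to the signed pmaddubsw form the loop uses. The _Sketch name is
// illustrative only.
static inline uint8_t BlendPlanePixel_Sketch(uint8_t a, uint8_t b,
                                             uint8_t alpha) {
  return (uint8_t)(((uint32_t)a * alpha + (uint32_t)b * (255 - alpha) + 255) >>
                   8);
}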
6851 
6852 #ifdef HAS_BLENDPLANEROW_AVX2
6853 // Blend 32 pixels at a time.
6854 // unsigned version of math
6855 // =((A2*C2)+(B2*(255-C2))+255)/256
6856 // signed version of math
6857 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
6858 void BlendPlaneRow_AVX2(const uint8_t* src0,
6859                         const uint8_t* src1,
6860                         const uint8_t* alpha,
6861                         uint8_t* dst,
6862                         int width) {
6863   asm volatile(
6864       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
6865       "vpsllw      $0x8,%%ymm5,%%ymm5            \n"
6866       "mov         $0x80808080,%%eax             \n"
6867       "vmovd       %%eax,%%xmm6                  \n"
6868       "vbroadcastss %%xmm6,%%ymm6                \n"
6869       "mov         $0x807f807f,%%eax             \n"
6870       "vmovd       %%eax,%%xmm7                  \n"
6871       "vbroadcastss %%xmm7,%%ymm7                \n"
6872       "sub         %2,%0                         \n"
6873       "sub         %2,%1                         \n"
6874       "sub         %2,%3                         \n"
6875 
6876       // 32 pixel loop.
6877       LABELALIGN
6878       "1:                                        \n"
6879       "vmovdqu     (%2),%%ymm0                   \n"
6880       "vpunpckhbw  %%ymm0,%%ymm0,%%ymm3          \n"
6881       "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
6882       "vpxor       %%ymm5,%%ymm3,%%ymm3          \n"
6883       "vpxor       %%ymm5,%%ymm0,%%ymm0          \n"
6884       "vmovdqu     (%0,%2,1),%%ymm1              \n"
6885       "vmovdqu     (%1,%2,1),%%ymm2              \n"
6886       "vpunpckhbw  %%ymm2,%%ymm1,%%ymm4          \n"
6887       "vpunpcklbw  %%ymm2,%%ymm1,%%ymm1          \n"
6888       "vpsubb      %%ymm6,%%ymm4,%%ymm4          \n"
6889       "vpsubb      %%ymm6,%%ymm1,%%ymm1          \n"
6890       "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
6891       "vpmaddubsw  %%ymm1,%%ymm0,%%ymm0          \n"
6892       "vpaddw      %%ymm7,%%ymm3,%%ymm3          \n"
6893       "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
6894       "vpsrlw      $0x8,%%ymm3,%%ymm3            \n"
6895       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
6896       "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"
6897       "vmovdqu     %%ymm0,(%3,%2,1)              \n"
6898       "lea         0x20(%2),%2                   \n"
6899       "sub         $0x20,%4                      \n"
6900       "jg          1b                            \n"
6901       "vzeroupper                                \n"
6902       : "+r"(src0),   // %0
6903         "+r"(src1),   // %1
6904         "+r"(alpha),  // %2
6905         "+r"(dst),    // %3
6906         "+rm"(width)  // %4
6907         ::"memory",
6908         "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6909         "xmm7");
6910 }
6911 #endif  // HAS_BLENDPLANEROW_AVX2
6912 
6913 #ifdef HAS_ARGBATTENUATEROW_SSSE3
6914 // Shuffle table duplicating alpha.
6915 static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
6916                                      7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
6917 static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
6918                                      15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
6919 // Attenuate 4 pixels at a time.
6920 void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
6921                             uint8_t* dst_argb,
6922                             int width) {
6923   asm volatile(
6924       "pcmpeqb     %%xmm3,%%xmm3                 \n"
6925       "pslld       $0x18,%%xmm3                  \n"
6926       "movdqa      %3,%%xmm4                     \n"
6927       "movdqa      %4,%%xmm5                     \n"
6928 
6929       // 4 pixel loop.
6930       LABELALIGN
6931       "1:                                        \n"
6932       "movdqu      (%0),%%xmm0                   \n"
6933       "pshufb      %%xmm4,%%xmm0                 \n"
6934       "movdqu      (%0),%%xmm1                   \n"
6935       "punpcklbw   %%xmm1,%%xmm1                 \n"
6936       "pmulhuw     %%xmm1,%%xmm0                 \n"
6937       "movdqu      (%0),%%xmm1                   \n"
6938       "pshufb      %%xmm5,%%xmm1                 \n"
6939       "movdqu      (%0),%%xmm2                   \n"
6940       "punpckhbw   %%xmm2,%%xmm2                 \n"
6941       "pmulhuw     %%xmm2,%%xmm1                 \n"
6942       "movdqu      (%0),%%xmm2                   \n"
6943       "lea         0x10(%0),%0                   \n"
6944       "pand        %%xmm3,%%xmm2                 \n"
6945       "psrlw       $0x8,%%xmm0                   \n"
6946       "psrlw       $0x8,%%xmm1                   \n"
6947       "packuswb    %%xmm1,%%xmm0                 \n"
6948       "por         %%xmm2,%%xmm0                 \n"
6949       "movdqu      %%xmm0,(%1)                   \n"
6950       "lea         0x10(%1),%1                   \n"
6951       "sub         $0x4,%2                       \n"
6952       "jg          1b                            \n"
6953       : "+r"(src_argb),       // %0
6954         "+r"(dst_argb),       // %1
6955         "+r"(width)           // %2
6956       : "m"(kShuffleAlpha0),  // %3
6957         "m"(kShuffleAlpha1)   // %4
6958       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6959 }
6960 #endif  // HAS_ARGBATTENUATEROW_SSSE3
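
// A scalar sketch of the attenuation above: punpcklbw of a register with
// itself widens each byte v to v * 257, kShuffleAlpha0/1 build matching
// alpha words, and pmulhuw plus the final psrlw keep bits 24..31 of the
// product, approximating (v * a) / 255. Illustrative helper, not a libyuv
// API.
static inline uint8_t AttenuateChannel_Sketch(uint8_t v, uint8_t a) {
  return (uint8_t)(((uint32_t)(v * 257) * (uint32_t)(a * 257)) >> 24);
}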
6961 
6962 #ifdef HAS_ARGBATTENUATEROW_AVX2
6963 // Shuffle table duplicating alpha.
6964 static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
6965                                          128u, 128u, 14u,  15u, 14u, 15u,
6966                                          14u,  15u,  128u, 128u};
6967 // Attenuate 8 pixels at a time.
6968 void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
6969                            uint8_t* dst_argb,
6970                            int width) {
6971   asm volatile(
6972       "vbroadcastf128 %3,%%ymm4                  \n"
6973       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
6974       "vpslld      $0x18,%%ymm5,%%ymm5           \n"
6975       "sub         %0,%1                         \n"
6976 
6977       // 8 pixel loop.
6978       LABELALIGN
6979       "1:                                        \n"
6980       "vmovdqu     (%0),%%ymm6                   \n"
6981       "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
6982       "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
6983       "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"
6984       "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
6985       "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
6986       "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
6987       "vpand       %%ymm5,%%ymm6,%%ymm6          \n"
6988       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
6989       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
6990       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
6991       "vpor        %%ymm6,%%ymm0,%%ymm0          \n"
6992       "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
6993       "lea         0x20(%0),%0                   \n"
6994       "sub         $0x8,%2                       \n"
6995       "jg          1b                            \n"
6996       "vzeroupper                                \n"
6997       : "+r"(src_argb),          // %0
6998         "+r"(dst_argb),          // %1
6999         "+r"(width)              // %2
7000       : "m"(kShuffleAlpha_AVX2)  // %3
7001       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7002 }
7003 #endif  // HAS_ARGBATTENUATEROW_AVX2
7004 
7005 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
7006 // Unattenuate 4 pixels at a time.
7007 void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
7008                              uint8_t* dst_argb,
7009                              int width) {
7010   uintptr_t alpha;
7011   asm volatile(
7012       // 4 pixel loop.
7013       LABELALIGN
7014       "1:                                        \n"
7015       "movdqu      (%0),%%xmm0                   \n"
7016       "movzb       0x03(%0),%3                   \n"
7017       "punpcklbw   %%xmm0,%%xmm0                 \n"
7018       "movd        0x00(%4,%3,4),%%xmm2          \n"
7019       "movzb       0x07(%0),%3                   \n"
7020       "movd        0x00(%4,%3,4),%%xmm3          \n"
7021       "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
7022       "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
7023       "movlhps     %%xmm3,%%xmm2                 \n"
7024       "pmulhuw     %%xmm2,%%xmm0                 \n"
7025       "movdqu      (%0),%%xmm1                   \n"
7026       "movzb       0x0b(%0),%3                   \n"
7027       "punpckhbw   %%xmm1,%%xmm1                 \n"
7028       "movd        0x00(%4,%3,4),%%xmm2          \n"
7029       "movzb       0x0f(%0),%3                   \n"
7030       "movd        0x00(%4,%3,4),%%xmm3          \n"
7031       "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
7032       "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
7033       "movlhps     %%xmm3,%%xmm2                 \n"
7034       "pmulhuw     %%xmm2,%%xmm1                 \n"
7035       "lea         0x10(%0),%0                   \n"
7036       "packuswb    %%xmm1,%%xmm0                 \n"
7037       "movdqu      %%xmm0,(%1)                   \n"
7038       "lea         0x10(%1),%1                   \n"
7039       "sub         $0x4,%2                       \n"
7040       "jg          1b                            \n"
7041       : "+r"(src_argb),     // %0
7042         "+r"(dst_argb),     // %1
7043         "+r"(width),        // %2
7044         "=&r"(alpha)        // %3
7045       : "r"(fixed_invtbl8)  // %4
7046       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
7047 }
7048 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
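
// Conceptually the loop above divides each color channel by alpha, using
// the fixed_invtbl8 reciprocal table (defined elsewhere in libyuv) in place
// of a hardware divide. A rounded-division sketch of that idea; the
// zero-alpha pass-through is an assumption for illustration, not the exact
// table behavior.
static inline uint8_t UnattenuateChannel_Sketch(uint8_t v, uint8_t a) {
  uint32_t u;
  if (a == 0) {
    return v;  // assumed: nothing sensible to divide by.
  }
  u = ((uint32_t)v * 255 + a / 2) / a;  // undo v = c * a / 255.
  return (uint8_t)(u > 255 ? 255 : u);  // packuswb saturates likewise.
}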
7049 
7050 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
7051 // Shuffle table duplicating alpha.
7052 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
7053     0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
7054 // Unattenuate 8 pixels at a time.
7055 void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
7056                              uint8_t* dst_argb,
7057                              int width) {
7058   uintptr_t alpha;
7059   asm volatile(
7060       "sub         %0,%1                         \n"
7061       "vbroadcastf128 %5,%%ymm5                  \n"
7062 
7063       // 8 pixel loop.
7064       LABELALIGN
7065       "1:                                        \n"
7066       // Gather the 8 reciprocal table entries with scalar loads instead
7067       // of VPGATHER.
7067       "movzb       0x03(%0),%3                   \n"
7068       "vmovd       0x00(%4,%3,4),%%xmm0          \n"
7069       "movzb       0x07(%0),%3                   \n"
7070       "vmovd       0x00(%4,%3,4),%%xmm1          \n"
7071       "movzb       0x0b(%0),%3                   \n"
7072       "vpunpckldq  %%xmm1,%%xmm0,%%xmm6          \n"
7073       "vmovd       0x00(%4,%3,4),%%xmm2          \n"
7074       "movzb       0x0f(%0),%3                   \n"
7075       "vmovd       0x00(%4,%3,4),%%xmm3          \n"
7076       "movzb       0x13(%0),%3                   \n"
7077       "vpunpckldq  %%xmm3,%%xmm2,%%xmm7          \n"
7078       "vmovd       0x00(%4,%3,4),%%xmm0          \n"
7079       "movzb       0x17(%0),%3                   \n"
7080       "vmovd       0x00(%4,%3,4),%%xmm1          \n"
7081       "movzb       0x1b(%0),%3                   \n"
7082       "vpunpckldq  %%xmm1,%%xmm0,%%xmm0          \n"
7083       "vmovd       0x00(%4,%3,4),%%xmm2          \n"
7084       "movzb       0x1f(%0),%3                   \n"
7085       "vmovd       0x00(%4,%3,4),%%xmm3          \n"
7086       "vpunpckldq  %%xmm3,%%xmm2,%%xmm2          \n"
7087       "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
7088       "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
7089       "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
7090       // End of the scalar gather.
7091 
7092       "vmovdqu     (%0),%%ymm6                   \n"
7093       "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
7094       "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
7095       "vpunpcklwd  %%ymm3,%%ymm3,%%ymm2          \n"
7096       "vpunpckhwd  %%ymm3,%%ymm3,%%ymm3          \n"
7097       "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
7098       "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
7099       "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
7100       "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
7101       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
7102       "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
7103       "lea         0x20(%0),%0                   \n"
7104       "sub         $0x8,%2                       \n"
7105       "jg          1b                            \n"
7106       "vzeroupper                                \n"
7107       : "+r"(src_argb),                 // %0
7108         "+r"(dst_argb),                 // %1
7109         "+r"(width),                    // %2
7110         "=&r"(alpha)                    // %3
7111       : "r"(fixed_invtbl8),             // %4
7112         "m"(kUnattenShuffleAlpha_AVX2)  // %5
7113       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7114         "xmm7");
7115 }
7116 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
7117 
7118 #ifdef HAS_ARGBGRAYROW_SSSE3
7119 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
7120 void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
7121   asm volatile(
7122       "movdqa      %3,%%xmm4                     \n"
7123       "movdqa      %4,%%xmm5                     \n"
7124 
7125       // 8 pixel loop.
7126       LABELALIGN
7127       "1:                                        \n"
7128       "movdqu      (%0),%%xmm0                   \n"
7129       "movdqu      0x10(%0),%%xmm1               \n"
7130       "psubb       %%xmm5,%%xmm0                 \n"
7131       "psubb       %%xmm5,%%xmm1                 \n"
7132       "movdqu      %%xmm4,%%xmm6                 \n"
7133       "pmaddubsw   %%xmm0,%%xmm6                 \n"
7134       "movdqu      %%xmm4,%%xmm0                 \n"
7135       "pmaddubsw   %%xmm1,%%xmm0                 \n"
7136       "phaddw      %%xmm0,%%xmm6                 \n"
7137       "paddw       %%xmm5,%%xmm6                 \n"
7138       "psrlw       $0x8,%%xmm6                   \n"
7139       "packuswb    %%xmm6,%%xmm6                 \n"
7140       "movdqu      (%0),%%xmm2                   \n"
7141       "movdqu      0x10(%0),%%xmm3               \n"
7142       "lea         0x20(%0),%0                   \n"
7143       "psrld       $0x18,%%xmm2                  \n"
7144       "psrld       $0x18,%%xmm3                  \n"
7145       "packuswb    %%xmm3,%%xmm2                 \n"
7146       "packuswb    %%xmm2,%%xmm2                 \n"
7147       "movdqa      %%xmm6,%%xmm3                 \n"
7148       "punpcklbw   %%xmm6,%%xmm6                 \n"
7149       "punpcklbw   %%xmm2,%%xmm3                 \n"
7150       "movdqa      %%xmm6,%%xmm1                 \n"
7151       "punpcklwd   %%xmm3,%%xmm6                 \n"
7152       "punpckhwd   %%xmm3,%%xmm1                 \n"
7153       "movdqu      %%xmm6,(%1)                   \n"
7154       "movdqu      %%xmm1,0x10(%1)               \n"
7155       "lea         0x20(%1),%1                   \n"
7156       "sub         $0x8,%2                       \n"
7157       "jg          1b                            \n"
7158       : "+r"(src_argb),  // %0
7159         "+r"(dst_argb),  // %1
7160         "+r"(width)      // %2
7161       : "m"(kARGBToYJ),  // %3
7162         "m"(kSub128)     // %4
7163       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7164 }
7165 #endif  // HAS_ARGBGRAYROW_SSSE3
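
// The gray value above is full-range (JPEG) BT.601 luma built from the
// kARGBToYJ weights: y = (29 * b + 150 * g + 77 * r + 128) >> 8, stored as
// (y, y, y, a). A scalar sketch with an illustrative helper name:
static inline void GrayPixel_Sketch(const uint8_t* src, uint8_t* dst) {
  uint8_t y =
      (uint8_t)((29 * src[0] + 150 * src[1] + 77 * src[2] + 128) >> 8);
  dst[0] = y;       // B
  dst[1] = y;       // G
  dst[2] = y;       // R
  dst[3] = src[3];  // alpha is preserved.
}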
7166 
7167 #ifdef HAS_ARGBSEPIAROW_SSSE3
7168 //    b = (r * 35 + g * 68 + b * 17) >> 7
7169 //    g = (r * 45 + g * 88 + b * 22) >> 7
7170 //    r = (r * 50 + g * 98 + b * 24) >> 7
7171 // Constant for ARGB color to sepia tone
7172 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
7173                                    17, 68, 35, 0, 17, 68, 35, 0};
7174 
7175 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
7176                                    22, 88, 45, 0, 22, 88, 45, 0};
7177 
7178 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
7179                                    24, 98, 50, 0, 24, 98, 50, 0};
7180 
7181 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
7182 void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
7183   asm volatile(
7184       "movdqa      %2,%%xmm2                     \n"
7185       "movdqa      %3,%%xmm3                     \n"
7186       "movdqa      %4,%%xmm4                     \n"
7187 
7188       // 8 pixel loop.
7189       LABELALIGN
7190       "1:                                        \n"
7191       "movdqu      (%0),%%xmm0                   \n"
7192       "movdqu      0x10(%0),%%xmm6               \n"
7193       "pmaddubsw   %%xmm2,%%xmm0                 \n"
7194       "pmaddubsw   %%xmm2,%%xmm6                 \n"
7195       "phaddw      %%xmm6,%%xmm0                 \n"
7196       "psrlw       $0x7,%%xmm0                   \n"
7197       "packuswb    %%xmm0,%%xmm0                 \n"
7198       "movdqu      (%0),%%xmm5                   \n"
7199       "movdqu      0x10(%0),%%xmm1               \n"
7200       "pmaddubsw   %%xmm3,%%xmm5                 \n"
7201       "pmaddubsw   %%xmm3,%%xmm1                 \n"
7202       "phaddw      %%xmm1,%%xmm5                 \n"
7203       "psrlw       $0x7,%%xmm5                   \n"
7204       "packuswb    %%xmm5,%%xmm5                 \n"
7205       "punpcklbw   %%xmm5,%%xmm0                 \n"
7206       "movdqu      (%0),%%xmm5                   \n"
7207       "movdqu      0x10(%0),%%xmm1               \n"
7208       "pmaddubsw   %%xmm4,%%xmm5                 \n"
7209       "pmaddubsw   %%xmm4,%%xmm1                 \n"
7210       "phaddw      %%xmm1,%%xmm5                 \n"
7211       "psrlw       $0x7,%%xmm5                   \n"
7212       "packuswb    %%xmm5,%%xmm5                 \n"
7213       "movdqu      (%0),%%xmm6                   \n"
7214       "movdqu      0x10(%0),%%xmm1               \n"
7215       "psrld       $0x18,%%xmm6                  \n"
7216       "psrld       $0x18,%%xmm1                  \n"
7217       "packuswb    %%xmm1,%%xmm6                 \n"
7218       "packuswb    %%xmm6,%%xmm6                 \n"
7219       "punpcklbw   %%xmm6,%%xmm5                 \n"
7220       "movdqa      %%xmm0,%%xmm1                 \n"
7221       "punpcklwd   %%xmm5,%%xmm0                 \n"
7222       "punpckhwd   %%xmm5,%%xmm1                 \n"
7223       "movdqu      %%xmm0,(%0)                   \n"
7224       "movdqu      %%xmm1,0x10(%0)               \n"
7225       "lea         0x20(%0),%0                   \n"
7226       "sub         $0x8,%1                       \n"
7227       "jg          1b                            \n"
7228       : "+r"(dst_argb),      // %0
7229         "+r"(width)          // %1
7230       : "m"(kARGBToSepiaB),  // %2
7231         "m"(kARGBToSepiaG),  // %3
7232         "m"(kARGBToSepiaR)   // %4
7233       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7234 }
7235 #endif  // HAS_ARGBSEPIAROW_SSSE3
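
// A scalar sketch of the sepia formulas in the comment above; bytes are in
// B, G, R, A memory order and packuswb supplies the clamp to 255 in the
// SIMD version. Illustrative helper name only.
static inline void SepiaPixel_Sketch(uint8_t* p) {
  int b = p[0], g = p[1], r = p[2];
  int nb = (r * 35 + g * 68 + b * 17) >> 7;
  int ng = (r * 45 + g * 88 + b * 22) >> 7;
  int nr = (r * 50 + g * 98 + b * 24) >> 7;
  p[0] = (uint8_t)(nb > 255 ? 255 : nb);
  p[1] = (uint8_t)(ng > 255 ? 255 : ng);
  p[2] = (uint8_t)(nr > 255 ? 255 : nr);  // p[3] (alpha) is left unchanged.
}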
7236 
7237 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
7238 // Transform 8 ARGB pixels (32 bytes) with color matrix.
7239 // Same as Sepia except matrix is provided.
7240 void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
7241                               uint8_t* dst_argb,
7242                               const int8_t* matrix_argb,
7243                               int width) {
7244   asm volatile(
7245       "movdqu      (%3),%%xmm5                   \n"
7246       "pshufd      $0x00,%%xmm5,%%xmm2           \n"
7247       "pshufd      $0x55,%%xmm5,%%xmm3           \n"
7248       "pshufd      $0xaa,%%xmm5,%%xmm4           \n"
7249       "pshufd      $0xff,%%xmm5,%%xmm5           \n"
7250 
7251       // 8 pixel loop.
7252       LABELALIGN
7253       "1:                                        \n"
7254       "movdqu      (%0),%%xmm0                   \n"
7255       "movdqu      0x10(%0),%%xmm7               \n"
7256       "pmaddubsw   %%xmm2,%%xmm0                 \n"
7257       "pmaddubsw   %%xmm2,%%xmm7                 \n"
7258       "movdqu      (%0),%%xmm6                   \n"
7259       "movdqu      0x10(%0),%%xmm1               \n"
7260       "pmaddubsw   %%xmm3,%%xmm6                 \n"
7261       "pmaddubsw   %%xmm3,%%xmm1                 \n"
7262       "phaddsw     %%xmm7,%%xmm0                 \n"
7263       "phaddsw     %%xmm1,%%xmm6                 \n"
7264       "psraw       $0x6,%%xmm0                   \n"
7265       "psraw       $0x6,%%xmm6                   \n"
7266       "packuswb    %%xmm0,%%xmm0                 \n"
7267       "packuswb    %%xmm6,%%xmm6                 \n"
7268       "punpcklbw   %%xmm6,%%xmm0                 \n"
7269       "movdqu      (%0),%%xmm1                   \n"
7270       "movdqu      0x10(%0),%%xmm7               \n"
7271       "pmaddubsw   %%xmm4,%%xmm1                 \n"
7272       "pmaddubsw   %%xmm4,%%xmm7                 \n"
7273       "phaddsw     %%xmm7,%%xmm1                 \n"
7274       "movdqu      (%0),%%xmm6                   \n"
7275       "movdqu      0x10(%0),%%xmm7               \n"
7276       "pmaddubsw   %%xmm5,%%xmm6                 \n"
7277       "pmaddubsw   %%xmm5,%%xmm7                 \n"
7278       "phaddsw     %%xmm7,%%xmm6                 \n"
7279       "psraw       $0x6,%%xmm1                   \n"
7280       "psraw       $0x6,%%xmm6                   \n"
7281       "packuswb    %%xmm1,%%xmm1                 \n"
7282       "packuswb    %%xmm6,%%xmm6                 \n"
7283       "punpcklbw   %%xmm6,%%xmm1                 \n"
7284       "movdqa      %%xmm0,%%xmm6                 \n"
7285       "punpcklwd   %%xmm1,%%xmm0                 \n"
7286       "punpckhwd   %%xmm1,%%xmm6                 \n"
7287       "movdqu      %%xmm0,(%1)                   \n"
7288       "movdqu      %%xmm6,0x10(%1)               \n"
7289       "lea         0x20(%0),%0                   \n"
7290       "lea         0x20(%1),%1                   \n"
7291       "sub         $0x8,%2                       \n"
7292       "jg          1b                            \n"
7293       : "+r"(src_argb),   // %0
7294         "+r"(dst_argb),   // %1
7295         "+r"(width)       // %2
7296       : "r"(matrix_argb)  // %3
7297       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7298         "xmm7");
7299 }
7300 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
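
// A scalar sketch of the transform above: each output channel is the dot
// product of the input pixel with one 4-byte row of matrix_argb, whose
// signed coefficients carry 6 fraction bits (hence psraw $0x6).
// Illustrative helper name only.
static inline void ColorMatrixPixel_Sketch(const uint8_t* src, uint8_t* dst,
                                           const int8_t* m) {
  for (int c = 0; c < 4; ++c) {
    int v = (src[0] * m[c * 4 + 0] + src[1] * m[c * 4 + 1] +
             src[2] * m[c * 4 + 2] + src[3] * m[c * 4 + 3]) >>
            6;
    dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}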
7301 
7302 #ifdef HAS_ARGBQUANTIZEROW_SSE2
7303 // Quantize 4 ARGB pixels (16 bytes).
7304 void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
7305                           int scale,
7306                           int interval_size,
7307                           int interval_offset,
7308                           int width) {
7309   asm volatile(
7310       "movd        %2,%%xmm2                     \n"
7311       "movd        %3,%%xmm3                     \n"
7312       "movd        %4,%%xmm4                     \n"
7313       "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
7314       "pshufd      $0x44,%%xmm2,%%xmm2           \n"
7315       "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
7316       "pshufd      $0x44,%%xmm3,%%xmm3           \n"
7317       "pshuflw     $0x40,%%xmm4,%%xmm4           \n"
7318       "pshufd      $0x44,%%xmm4,%%xmm4           \n"
7319       "pxor        %%xmm5,%%xmm5                 \n"
7320       "pcmpeqb     %%xmm6,%%xmm6                 \n"
7321       "pslld       $0x18,%%xmm6                  \n"
7322 
7323       // 4 pixel loop.
7324       LABELALIGN
7325       "1:                                        \n"
7326       "movdqu      (%0),%%xmm0                   \n"
7327       "punpcklbw   %%xmm5,%%xmm0                 \n"
7328       "pmulhuw     %%xmm2,%%xmm0                 \n"
7329       "movdqu      (%0),%%xmm1                   \n"
7330       "punpckhbw   %%xmm5,%%xmm1                 \n"
7331       "pmulhuw     %%xmm2,%%xmm1                 \n"
7332       "pmullw      %%xmm3,%%xmm0                 \n"
7333       "movdqu      (%0),%%xmm7                   \n"
7334       "pmullw      %%xmm3,%%xmm1                 \n"
7335       "pand        %%xmm6,%%xmm7                 \n"
7336       "paddw       %%xmm4,%%xmm0                 \n"
7337       "paddw       %%xmm4,%%xmm1                 \n"
7338       "packuswb    %%xmm1,%%xmm0                 \n"
7339       "por         %%xmm7,%%xmm0                 \n"
7340       "movdqu      %%xmm0,(%0)                   \n"
7341       "lea         0x10(%0),%0                   \n"
7342       "sub         $0x4,%1                       \n"
7343       "jg          1b                            \n"
7344       : "+r"(dst_argb),       // %0
7345         "+r"(width)           // %1
7346       : "r"(scale),           // %2
7347         "r"(interval_size),   // %3
7348         "r"(interval_offset)  // %4
7349       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7350         "xmm7");
7351 }
7352 #endif  // HAS_ARGBQUANTIZEROW_SSE2
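
// A scalar sketch of the quantization above: scale is expected to hold
// roughly 65536 / interval_size, so each color channel becomes
// ((v * scale) >> 16) * interval_size + interval_offset, while the alpha
// byte is kept via the 0xff000000 mask. Illustrative helper name only.
static inline uint8_t QuantizeChannel_Sketch(uint8_t v, int scale,
                                             int interval_size,
                                             int interval_offset) {
  return (uint8_t)((((uint32_t)v * (uint32_t)scale) >> 16) * interval_size +
                   interval_offset);
}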
7353 
7354 #ifdef HAS_ARGBSHADEROW_SSE2
7355 // Shade 4 pixels at a time by specified value.
7356 void ARGBShadeRow_SSE2(const uint8_t* src_argb,
7357                        uint8_t* dst_argb,
7358                        int width,
7359                        uint32_t value) {
7360   asm volatile(
7361       "movd        %3,%%xmm2                     \n"
7362       "punpcklbw   %%xmm2,%%xmm2                 \n"
7363       "punpcklqdq  %%xmm2,%%xmm2                 \n"
7364 
7365       // 4 pixel loop.
7366       LABELALIGN
7367       "1:                                        \n"
7368       "movdqu      (%0),%%xmm0                   \n"
7369       "lea         0x10(%0),%0                   \n"
7370       "movdqa      %%xmm0,%%xmm1                 \n"
7371       "punpcklbw   %%xmm0,%%xmm0                 \n"
7372       "punpckhbw   %%xmm1,%%xmm1                 \n"
7373       "pmulhuw     %%xmm2,%%xmm0                 \n"
7374       "pmulhuw     %%xmm2,%%xmm1                 \n"
7375       "psrlw       $0x8,%%xmm0                   \n"
7376       "psrlw       $0x8,%%xmm1                   \n"
7377       "packuswb    %%xmm1,%%xmm0                 \n"
7378       "movdqu      %%xmm0,(%1)                   \n"
7379       "lea         0x10(%1),%1                   \n"
7380       "sub         $0x4,%2                       \n"
7381       "jg          1b                            \n"
7382       : "+r"(src_argb),  // %0
7383         "+r"(dst_argb),  // %1
7384         "+r"(width)      // %2
7385       : "r"(value)       // %3
7386       : "memory", "cc", "xmm0", "xmm1", "xmm2");
7387 }
7388 #endif  // HAS_ARGBSHADEROW_SSE2
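
// The shade row uses the same widen-by-257 pmulhuw trick as the attenuate
// rows: both the pixel byte and the matching byte of 'value' widen to
// v * 257, so each channel is roughly (v * shade) / 255; e.g. value
// 0xffffffff is an identity and 0xff808080 halves B, G and R. A sketch with
// an illustrative name:
static inline uint8_t ShadeChannel_Sketch(uint8_t v, uint8_t s) {
  return (uint8_t)(((uint32_t)(v * 257) * (uint32_t)(s * 257)) >> 24);
}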
7389 
7390 #ifdef HAS_ARGBMULTIPLYROW_SSE2
7391 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
7392 void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
7393                           const uint8_t* src_argb1,
7394                           uint8_t* dst_argb,
7395                           int width) {
7396   asm volatile(
7397 
7398       "pxor        %%xmm5,%%xmm5                 \n"
7399 
7400       // 4 pixel loop.
7401       LABELALIGN
7402       "1:                                        \n"
7403       "movdqu      (%0),%%xmm0                   \n"
7404       "lea         0x10(%0),%0                   \n"
7405       "movdqu      (%1),%%xmm2                   \n"
7406       "lea         0x10(%1),%1                   \n"
7407       "movdqu      %%xmm0,%%xmm1                 \n"
7408       "movdqu      %%xmm2,%%xmm3                 \n"
7409       "punpcklbw   %%xmm0,%%xmm0                 \n"
7410       "punpckhbw   %%xmm1,%%xmm1                 \n"
7411       "punpcklbw   %%xmm5,%%xmm2                 \n"
7412       "punpckhbw   %%xmm5,%%xmm3                 \n"
7413       "pmulhuw     %%xmm2,%%xmm0                 \n"
7414       "pmulhuw     %%xmm3,%%xmm1                 \n"
7415       "packuswb    %%xmm1,%%xmm0                 \n"
7416       "movdqu      %%xmm0,(%2)                   \n"
7417       "lea         0x10(%2),%2                   \n"
7418       "sub         $0x4,%3                       \n"
7419       "jg          1b                            \n"
7420       : "+r"(src_argb),   // %0
7421         "+r"(src_argb1),  // %1
7422         "+r"(dst_argb),   // %2
7423         "+r"(width)       // %3
7424       :
7425       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7426 }
7427 #endif  // HAS_ARGBMULTIPLYROW_SSE2
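
// A scalar sketch of the per-channel multiply above: one operand is widened
// to p * 257 (punpcklbw with itself), the other is zero-extended, and
// pmulhuw keeps the high 16 bits, approximating (p * q) / 255 to within one.
// Illustrative helper name only.
static inline uint8_t MultiplyChannel_Sketch(uint8_t p, uint8_t q) {
  return (uint8_t)(((uint32_t)(p * 257) * q) >> 16);
}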
7428 
7429 #ifdef HAS_ARGBMULTIPLYROW_AVX2
7430 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
7431 void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
7432                           const uint8_t* src_argb1,
7433                           uint8_t* dst_argb,
7434                           int width) {
7435   asm volatile(
7436 
7437       "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
7438 
7439       // 8 pixel loop.
7440       LABELALIGN
7441       "1:                                        \n"
7442       "vmovdqu     (%0),%%ymm1                   \n"
7443       "lea         0x20(%0),%0                   \n"
7444       "vmovdqu     (%1),%%ymm3                   \n"
7445       "lea         0x20(%1),%1                   \n"
7446       "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
7447       "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
7448       "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
7449       "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
7450       "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
7451       "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
7452       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
7453       "vmovdqu     %%ymm0,(%2)                   \n"
7454       "lea         0x20(%2),%2                   \n"
7455       "sub         $0x8,%3                       \n"
7456       "jg          1b                            \n"
7457       "vzeroupper                                \n"
7458       : "+r"(src_argb),   // %0
7459         "+r"(src_argb1),  // %1
7460         "+r"(dst_argb),   // %2
7461         "+r"(width)       // %3
7462       :
7463       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7464 }
7465 #endif  // HAS_ARGBMULTIPLYROW_AVX2
7466 
7467 #ifdef HAS_ARGBADDROW_SSE2
7468 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
7469 void ARGBAddRow_SSE2(const uint8_t* src_argb,
7470                      const uint8_t* src_argb1,
7471                      uint8_t* dst_argb,
7472                      int width) {
7473   asm volatile(
7474       // 4 pixel loop.
7475       LABELALIGN
7476       "1:                                        \n"
7477       "movdqu      (%0),%%xmm0                   \n"
7478       "lea         0x10(%0),%0                   \n"
7479       "movdqu      (%1),%%xmm1                   \n"
7480       "lea         0x10(%1),%1                   \n"
7481       "paddusb     %%xmm1,%%xmm0                 \n"
7482       "movdqu      %%xmm0,(%2)                   \n"
7483       "lea         0x10(%2),%2                   \n"
7484       "sub         $0x4,%3                       \n"
7485       "jg          1b                            \n"
7486       : "+r"(src_argb),   // %0
7487         "+r"(src_argb1),  // %1
7488         "+r"(dst_argb),   // %2
7489         "+r"(width)       // %3
7490       :
7491       : "memory", "cc", "xmm0", "xmm1");
7492 }
7493 #endif  // HAS_ARGBADDROW_SSE2
7494 
7495 #ifdef HAS_ARGBADDROW_AVX2
7496 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
7497 void ARGBAddRow_AVX2(const uint8_t* src_argb,
7498                      const uint8_t* src_argb1,
7499                      uint8_t* dst_argb,
7500                      int width) {
7501   asm volatile(
7502       // 8 pixel loop.
7503       LABELALIGN
7504       "1:                                        \n"
7505       "vmovdqu     (%0),%%ymm0                   \n"
7506       "lea         0x20(%0),%0                   \n"
7507       "vpaddusb    (%1),%%ymm0,%%ymm0            \n"
7508       "lea         0x20(%1),%1                   \n"
7509       "vmovdqu     %%ymm0,(%2)                   \n"
7510       "lea         0x20(%2),%2                   \n"
7511       "sub         $0x8,%3                       \n"
7512       "jg          1b                            \n"
7513       "vzeroupper                                \n"
7514       : "+r"(src_argb),   // %0
7515         "+r"(src_argb1),  // %1
7516         "+r"(dst_argb),   // %2
7517         "+r"(width)       // %3
7518       :
7519       : "memory", "cc", "xmm0");
7520 }
7521 #endif  // HAS_ARGBADDROW_AVX2
7522 
7523 #ifdef HAS_ARGBSUBTRACTROW_SSE2
7524 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
7525 void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
7526                           const uint8_t* src_argb1,
7527                           uint8_t* dst_argb,
7528                           int width) {
7529   asm volatile(
7530       // 4 pixel loop.
7531       LABELALIGN
7532       "1:                                        \n"
7533       "movdqu      (%0),%%xmm0                   \n"
7534       "lea         0x10(%0),%0                   \n"
7535       "movdqu      (%1),%%xmm1                   \n"
7536       "lea         0x10(%1),%1                   \n"
7537       "psubusb     %%xmm1,%%xmm0                 \n"
7538       "movdqu      %%xmm0,(%2)                   \n"
7539       "lea         0x10(%2),%2                   \n"
7540       "sub         $0x4,%3                       \n"
7541       "jg          1b                            \n"
7542       : "+r"(src_argb),   // %0
7543         "+r"(src_argb1),  // %1
7544         "+r"(dst_argb),   // %2
7545         "+r"(width)       // %3
7546       :
7547       : "memory", "cc", "xmm0", "xmm1");
7548 }
7549 #endif  // HAS_ARGBSUBTRACTROW_SSE2
7550 
7551 #ifdef HAS_ARGBSUBTRACTROW_AVX2
7552 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
7553 void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
7554                           const uint8_t* src_argb1,
7555                           uint8_t* dst_argb,
7556                           int width) {
7557   asm volatile(
7558       // 8 pixel loop.
7559       LABELALIGN
7560       "1:                                        \n"
7561       "vmovdqu     (%0),%%ymm0                   \n"
7562       "lea         0x20(%0),%0                   \n"
7563       "vpsubusb    (%1),%%ymm0,%%ymm0            \n"
7564       "lea         0x20(%1),%1                   \n"
7565       "vmovdqu     %%ymm0,(%2)                   \n"
7566       "lea         0x20(%2),%2                   \n"
7567       "sub         $0x8,%3                       \n"
7568       "jg          1b                            \n"
7569       "vzeroupper                                \n"
7570       : "+r"(src_argb),   // %0
7571         "+r"(src_argb1),  // %1
7572         "+r"(dst_argb),   // %2
7573         "+r"(width)       // %3
7574       :
7575       : "memory", "cc", "xmm0");
7576 }
7577 #endif  // HAS_ARGBSUBTRACTROW_AVX2
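
// The add/subtract rows above lean on the saturating byte arithmetic of
// paddusb/psubusb (and their VEX forms): per channel the result clamps
// instead of wrapping, e.g. 250 + 20 -> 255 and 20 - 250 -> 0. Scalar
// sketches with illustrative names:
static inline uint8_t AddUsb_Sketch(uint8_t a, uint8_t b) {
  int v = a + b;
  return (uint8_t)(v > 255 ? 255 : v);
}
static inline uint8_t SubUsb_Sketch(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? a - b : 0);
}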
7578 
7579 #ifdef HAS_SOBELXROW_SSE2
7580 // SobelX as a matrix is
7581 // -1  0  1
7582 // -2  0  2
7583 // -1  0  1
7584 void SobelXRow_SSE2(const uint8_t* src_y0,
7585                     const uint8_t* src_y1,
7586                     const uint8_t* src_y2,
7587                     uint8_t* dst_sobelx,
7588                     int width) {
7589   asm volatile(
7590       "sub         %0,%1                         \n"
7591       "sub         %0,%2                         \n"
7592       "sub         %0,%3                         \n"
7593       "pxor        %%xmm5,%%xmm5                 \n"
7594 
7595       // 8 pixel loop.
7596       LABELALIGN
7597       "1:                                        \n"
7598       "movq        (%0),%%xmm0                   \n"
7599       "movq        0x2(%0),%%xmm1                \n"
7600       "punpcklbw   %%xmm5,%%xmm0                 \n"
7601       "punpcklbw   %%xmm5,%%xmm1                 \n"
7602       "psubw       %%xmm1,%%xmm0                 \n"
7603       "movq        0x00(%0,%1,1),%%xmm1          \n"
7604       "movq        0x02(%0,%1,1),%%xmm2          \n"
7605       "punpcklbw   %%xmm5,%%xmm1                 \n"
7606       "punpcklbw   %%xmm5,%%xmm2                 \n"
7607       "psubw       %%xmm2,%%xmm1                 \n"
7608       "movq        0x00(%0,%2,1),%%xmm2          \n"
7609       "movq        0x02(%0,%2,1),%%xmm3          \n"
7610       "punpcklbw   %%xmm5,%%xmm2                 \n"
7611       "punpcklbw   %%xmm5,%%xmm3                 \n"
7612       "psubw       %%xmm3,%%xmm2                 \n"
7613       "paddw       %%xmm2,%%xmm0                 \n"
7614       "paddw       %%xmm1,%%xmm0                 \n"
7615       "paddw       %%xmm1,%%xmm0                 \n"
7616       "pxor        %%xmm1,%%xmm1                 \n"
7617       "psubw       %%xmm0,%%xmm1                 \n"
7618       "pmaxsw      %%xmm1,%%xmm0                 \n"
7619       "packuswb    %%xmm0,%%xmm0                 \n"
7620       "movq        %%xmm0,0x00(%0,%3,1)          \n"
7621       "lea         0x8(%0),%0                    \n"
7622       "sub         $0x8,%4                       \n"
7623       "jg          1b                            \n"
7624       : "+r"(src_y0),      // %0
7625         "+r"(src_y1),      // %1
7626         "+r"(src_y2),      // %2
7627         "+r"(dst_sobelx),  // %3
7628         "+r"(width)        // %4
7629       :
7630       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7631 }
7632 #endif  // HAS_SOBELXROW_SSE2
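
// A scalar sketch of one SobelX output above: apply the matrix from the
// comment to the 3x3 neighborhood spanning rows src_y0/src_y1/src_y2, then
// take the absolute value (pxor/psubw/pmaxsw) and saturate to 255
// (packuswb). SobelYRow_SSE2 below is the transposed filter. Illustrative
// helper name only.
static inline uint8_t SobelX_Sketch(const uint8_t* y0, const uint8_t* y1,
                                    const uint8_t* y2, int x) {
  int sobel = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) +
              (y2[x] - y2[x + 2]);
  if (sobel < 0) {
    sobel = -sobel;
  }
  return (uint8_t)(sobel > 255 ? 255 : sobel);
}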
7633 
7634 #ifdef HAS_SOBELYROW_SSE2
7635 // SobelY as a matrix is
7636 // -1 -2 -1
7637 //  0  0  0
7638 //  1  2  1
7639 void SobelYRow_SSE2(const uint8_t* src_y0,
7640                     const uint8_t* src_y1,
7641                     uint8_t* dst_sobely,
7642                     int width) {
7643   asm volatile(
7644       "sub         %0,%1                         \n"
7645       "sub         %0,%2                         \n"
7646       "pxor        %%xmm5,%%xmm5                 \n"
7647 
7648       // 8 pixel loop.
7649       LABELALIGN
7650       "1:                                        \n"
7651       "movq        (%0),%%xmm0                   \n"
7652       "movq        0x00(%0,%1,1),%%xmm1          \n"
7653       "punpcklbw   %%xmm5,%%xmm0                 \n"
7654       "punpcklbw   %%xmm5,%%xmm1                 \n"
7655       "psubw       %%xmm1,%%xmm0                 \n"
7656       "movq        0x1(%0),%%xmm1                \n"
7657       "movq        0x01(%0,%1,1),%%xmm2          \n"
7658       "punpcklbw   %%xmm5,%%xmm1                 \n"
7659       "punpcklbw   %%xmm5,%%xmm2                 \n"
7660       "psubw       %%xmm2,%%xmm1                 \n"
7661       "movq        0x2(%0),%%xmm2                \n"
7662       "movq        0x02(%0,%1,1),%%xmm3          \n"
7663       "punpcklbw   %%xmm5,%%xmm2                 \n"
7664       "punpcklbw   %%xmm5,%%xmm3                 \n"
7665       "psubw       %%xmm3,%%xmm2                 \n"
7666       "paddw       %%xmm2,%%xmm0                 \n"
7667       "paddw       %%xmm1,%%xmm0                 \n"
7668       "paddw       %%xmm1,%%xmm0                 \n"
7669       "pxor        %%xmm1,%%xmm1                 \n"
7670       "psubw       %%xmm0,%%xmm1                 \n"
7671       "pmaxsw      %%xmm1,%%xmm0                 \n"
7672       "packuswb    %%xmm0,%%xmm0                 \n"
7673       "movq        %%xmm0,0x00(%0,%2,1)          \n"
7674       "lea         0x8(%0),%0                    \n"
7675       "sub         $0x8,%3                       \n"
7676       "jg          1b                            \n"
7677       : "+r"(src_y0),      // %0
7678         "+r"(src_y1),      // %1
7679         "+r"(dst_sobely),  // %2
7680         "+r"(width)        // %3
7681       :
7682       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7683 }
7684 #endif  // HAS_SOBELYROW_SSE2
7685 
7686 #ifdef HAS_SOBELROW_SSE2
7687 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
7688 // A = 255
7689 // R = Sobel
7690 // G = Sobel
7691 // B = Sobel
7692 void SobelRow_SSE2(const uint8_t* src_sobelx,
7693                    const uint8_t* src_sobely,
7694                    uint8_t* dst_argb,
7695                    int width) {
7696   asm volatile(
7697       "sub         %0,%1                         \n"
7698       "pcmpeqb     %%xmm5,%%xmm5                 \n"
7699       "pslld       $0x18,%%xmm5                  \n"
7700 
7701       // 8 pixel loop.
7702       LABELALIGN
7703       "1:                                        \n"
7704       "movdqu      (%0),%%xmm0                   \n"
7705       "movdqu      0x00(%0,%1,1),%%xmm1          \n"
7706       "lea         0x10(%0),%0                   \n"
7707       "paddusb     %%xmm1,%%xmm0                 \n"
7708       "movdqa      %%xmm0,%%xmm2                 \n"
7709       "punpcklbw   %%xmm0,%%xmm2                 \n"
7710       "punpckhbw   %%xmm0,%%xmm0                 \n"
7711       "movdqa      %%xmm2,%%xmm1                 \n"
7712       "punpcklwd   %%xmm2,%%xmm1                 \n"
7713       "punpckhwd   %%xmm2,%%xmm2                 \n"
7714       "por         %%xmm5,%%xmm1                 \n"
7715       "por         %%xmm5,%%xmm2                 \n"
7716       "movdqa      %%xmm0,%%xmm3                 \n"
7717       "punpcklwd   %%xmm0,%%xmm3                 \n"
7718       "punpckhwd   %%xmm0,%%xmm0                 \n"
7719       "por         %%xmm5,%%xmm3                 \n"
7720       "por         %%xmm5,%%xmm0                 \n"
7721       "movdqu      %%xmm1,(%2)                   \n"
7722       "movdqu      %%xmm2,0x10(%2)               \n"
7723       "movdqu      %%xmm3,0x20(%2)               \n"
7724       "movdqu      %%xmm0,0x30(%2)               \n"
7725       "lea         0x40(%2),%2                   \n"
7726       "sub         $0x10,%3                      \n"
7727       "jg          1b                            \n"
7728       : "+r"(src_sobelx),  // %0
7729         "+r"(src_sobely),  // %1
7730         "+r"(dst_argb),    // %2
7731         "+r"(width)        // %3
7732       :
7733       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7734 }
7735 #endif  // HAS_SOBELROW_SSE2
7736 
7737 #ifdef HAS_SOBELTOPLANEROW_SSE2
7738 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
7739 void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
7740                           const uint8_t* src_sobely,
7741                           uint8_t* dst_y,
7742                           int width) {
7743   asm volatile(
7744       "sub         %0,%1                         \n"
7745       "pcmpeqb     %%xmm5,%%xmm5                 \n"
7746       "pslld       $0x18,%%xmm5                  \n"
7747 
7748       // 8 pixel loop.
7749       LABELALIGN
7750       "1:                                        \n"
7751       "movdqu      (%0),%%xmm0                   \n"
7752       "movdqu      0x00(%0,%1,1),%%xmm1          \n"
7753       "lea         0x10(%0),%0                   \n"
7754       "paddusb     %%xmm1,%%xmm0                 \n"
7755       "movdqu      %%xmm0,(%2)                   \n"
7756       "lea         0x10(%2),%2                   \n"
7757       "sub         $0x10,%3                      \n"
7758       "jg          1b                            \n"
7759       : "+r"(src_sobelx),  // %0
7760         "+r"(src_sobely),  // %1
7761         "+r"(dst_y),       // %2
7762         "+r"(width)        // %3
7763       :
7764       : "memory", "cc", "xmm0", "xmm1");
7765 }
7766 #endif  // HAS_SOBELTOPLANEROW_SSE2
7767 
7768 #ifdef HAS_SOBELXYROW_SSE2
7769 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
7770 // A = 255
7771 // R = Sobel X
7772 // G = Sobel
7773 // B = Sobel Y
7774 void SobelXYRow_SSE2(const uint8_t* src_sobelx,
7775                      const uint8_t* src_sobely,
7776                      uint8_t* dst_argb,
7777                      int width) {
7778   asm volatile(
7779       "sub         %0,%1                         \n"
7780       "pcmpeqb     %%xmm5,%%xmm5                 \n"
7781 
7782       // 8 pixel loop.
7783       LABELALIGN
7784       "1:                                        \n"
7785       "movdqu      (%0),%%xmm0                   \n"
7786       "movdqu      0x00(%0,%1,1),%%xmm1          \n"
7787       "lea         0x10(%0),%0                   \n"
7788       "movdqa      %%xmm0,%%xmm2                 \n"
7789       "paddusb     %%xmm1,%%xmm2                 \n"
7790       "movdqa      %%xmm0,%%xmm3                 \n"
7791       "punpcklbw   %%xmm5,%%xmm3                 \n"
7792       "punpckhbw   %%xmm5,%%xmm0                 \n"
7793       "movdqa      %%xmm1,%%xmm4                 \n"
7794       "punpcklbw   %%xmm2,%%xmm4                 \n"
7795       "punpckhbw   %%xmm2,%%xmm1                 \n"
7796       "movdqa      %%xmm4,%%xmm6                 \n"
7797       "punpcklwd   %%xmm3,%%xmm6                 \n"
7798       "punpckhwd   %%xmm3,%%xmm4                 \n"
7799       "movdqa      %%xmm1,%%xmm7                 \n"
7800       "punpcklwd   %%xmm0,%%xmm7                 \n"
7801       "punpckhwd   %%xmm0,%%xmm1                 \n"
7802       "movdqu      %%xmm6,(%2)                   \n"
7803       "movdqu      %%xmm4,0x10(%2)               \n"
7804       "movdqu      %%xmm7,0x20(%2)               \n"
7805       "movdqu      %%xmm1,0x30(%2)               \n"
7806       "lea         0x40(%2),%2                   \n"
7807       "sub         $0x10,%3                      \n"
7808       "jg          1b                            \n"
7809       : "+r"(src_sobelx),  // %0
7810         "+r"(src_sobely),  // %1
7811         "+r"(dst_argb),    // %2
7812         "+r"(width)        // %3
7813       :
7814       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7815         "xmm7");
7816 }
7817 #endif  // HAS_SOBELXYROW_SSE2
7818 
7819 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
7820 // Creates a table of cumulative sums where each value is a sum of all values
7821 // above and to the left of the value, inclusive of the value.
7822 void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
7823                                   int32_t* cumsum,
7824                                   const int32_t* previous_cumsum,
7825                                   int width) {
7826   asm volatile(
7827       "pxor        %%xmm0,%%xmm0                 \n"
7828       "pxor        %%xmm1,%%xmm1                 \n"
7829       "sub         $0x4,%3                       \n"
7830       "jl          49f                           \n"
7831       "test        $0xf,%1                       \n"
7832       "jne         49f                           \n"
7833 
7834       // 4 pixel loop.
7835       LABELALIGN
7836       "40:                                       \n"
7837       "movdqu      (%0),%%xmm2                   \n"
7838       "lea         0x10(%0),%0                   \n"
7839       "movdqa      %%xmm2,%%xmm4                 \n"
7840       "punpcklbw   %%xmm1,%%xmm2                 \n"
7841       "movdqa      %%xmm2,%%xmm3                 \n"
7842       "punpcklwd   %%xmm1,%%xmm2                 \n"
7843       "punpckhwd   %%xmm1,%%xmm3                 \n"
7844       "punpckhbw   %%xmm1,%%xmm4                 \n"
7845       "movdqa      %%xmm4,%%xmm5                 \n"
7846       "punpcklwd   %%xmm1,%%xmm4                 \n"
7847       "punpckhwd   %%xmm1,%%xmm5                 \n"
7848       "paddd       %%xmm2,%%xmm0                 \n"
7849       "movdqu      (%2),%%xmm2                   \n"
7850       "paddd       %%xmm0,%%xmm2                 \n"
7851       "paddd       %%xmm3,%%xmm0                 \n"
7852       "movdqu      0x10(%2),%%xmm3               \n"
7853       "paddd       %%xmm0,%%xmm3                 \n"
7854       "paddd       %%xmm4,%%xmm0                 \n"
7855       "movdqu      0x20(%2),%%xmm4               \n"
7856       "paddd       %%xmm0,%%xmm4                 \n"
7857       "paddd       %%xmm5,%%xmm0                 \n"
7858       "movdqu      0x30(%2),%%xmm5               \n"
7859       "lea         0x40(%2),%2                   \n"
7860       "paddd       %%xmm0,%%xmm5                 \n"
7861       "movdqu      %%xmm2,(%1)                   \n"
7862       "movdqu      %%xmm3,0x10(%1)               \n"
7863       "movdqu      %%xmm4,0x20(%1)               \n"
7864       "movdqu      %%xmm5,0x30(%1)               \n"
7865       "lea         0x40(%1),%1                   \n"
7866       "sub         $0x4,%3                       \n"
7867       "jge         40b                           \n"
7868 
7869       "49:                                       \n"
7870       "add         $0x3,%3                       \n"
7871       "jl          19f                           \n"
7872 
7873       // 1 pixel loop.
7874       LABELALIGN
7875       "10:                                       \n"
7876       "movd        (%0),%%xmm2                   \n"
7877       "lea         0x4(%0),%0                    \n"
7878       "punpcklbw   %%xmm1,%%xmm2                 \n"
7879       "punpcklwd   %%xmm1,%%xmm2                 \n"
7880       "paddd       %%xmm2,%%xmm0                 \n"
7881       "movdqu      (%2),%%xmm2                   \n"
7882       "lea         0x10(%2),%2                   \n"
7883       "paddd       %%xmm0,%%xmm2                 \n"
7884       "movdqu      %%xmm2,(%1)                   \n"
7885       "lea         0x10(%1),%1                   \n"
7886       "sub         $0x1,%3                       \n"
7887       "jge         10b                           \n"
7888 
7889       "19:                                       \n"
7890       : "+r"(row),              // %0
7891         "+r"(cumsum),           // %1
7892         "+r"(previous_cumsum),  // %2
7893         "+r"(width)             // %3
7894       :
7895       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
7896 }
7897 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
7898 
7899 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
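// Computes box averages from two rows of the cumulative-sum image:
// sum = topleft - topright - botleft + botright, scaled by a reciprocal of
// area.  A scalar sketch per 4-channel pixel, as read from the addressing
// below (the width operand is scaled by 4 bytes, i.e. it is an int32 element
// count supplied by the caller):
//   for (int c = 0; c < 4; ++c) {
//     int32_t sum = topleft[c] - topleft[width + c] - botleft[c] +
//                   botleft[width + c];
//     dst[c] = (uint8_t)(sum * (1.0f / area));  // rcpss approximation
//   }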
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  asm volatile(
      "movd        %5,%%xmm5                     \n"
      "cvtdq2ps    %%xmm5,%%xmm5                 \n"
      "rcpss       %%xmm5,%%xmm4                 \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "cmpl        $0x80,%5                      \n"
      "ja          40f                           \n"

      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrld       $0x10,%%xmm6                  \n"
      "cvtdq2ps    %%xmm6,%%xmm6                 \n"
      "addps       %%xmm6,%%xmm5                 \n"
      "mulps       %%xmm4,%%xmm5                 \n"
      "cvtps2dq    %%xmm5,%%xmm5                 \n"
      "packssdw    %%xmm5,%%xmm5                 \n"

      // 4 pixel small loop.
      LABELALIGN
      "4:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm0                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         4b                            \n"
      "jmp         49f                           \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "cvtdq2ps    %%xmm1,%%xmm1                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm1                 \n"
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "cvtps2dq    %%xmm1,%%xmm1                 \n"
      "cvtps2dq    %%xmm2,%%xmm2                 \n"
      "cvtps2dq    %%xmm3,%%xmm3                 \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "lea         0x10(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "lea         0x10(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(topleft),           // %0
        "+r"(botleft),           // %1
        "+r"(dst),               // %2
        "+rm"(count)             // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
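// A scalar sketch of the affine walk (matching the C reference
// ARGBAffineRow_C: src_dudv holds the start point {u, v} followed by the
// per-pixel step {du, dv}):
//   float u = src_dudv[0], v = src_dudv[1];
//   for (int i = 0; i < width; ++i) {
//     int x = (int)u;
//     int y = (int)v;
//     *(uint32_t*)(dst_argb + i * 4) =
//         *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
//     u += src_dudv[2];
//     v += src_dudv[3];
//   }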
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                        int src_argb_stride,
                        uint8_t* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile(
      "movq        (%3),%%xmm2                   \n"
      "movq        0x08(%3),%%xmm7               \n"
      "shl         $0x10,%1                      \n"
      "add         $0x4,%1                       \n"
      "movd        %1,%%xmm5                     \n"
      "sub         $0x4,%4                       \n"
      "jl          49f                           \n"

      "pshufd      $0x44,%%xmm7,%%xmm7           \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm0                 \n"
      "movlhps     %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm7,%%xmm4                 \n"
      "addps       %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm4                 \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"  // x,y float->int first 2
      "cvttps2dq   %%xmm3,%%xmm1                 \n"  // x,y float->int next 2
      "packssdw    %%xmm1,%%xmm0                 \n"  // x, y as 8 shorts
      "pmaddwd     %%xmm5,%%xmm0                 \n"  // off = x*4 + y*stride
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        0x00(%0,%1,1),%%xmm1          \n"
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm1                 \n"
      "addps       %%xmm4,%%xmm2                 \n"
      "movq        %%xmm1,(%2)                   \n"
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm0                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "movq        %%xmm0,0x08(%2)               \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%4                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%4                       \n"
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "pmaddwd     %%xmm5,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm2                 \n"
      "movd        %%xmm0,%k1                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x04(%2),%2                   \n"
      "sub         $0x1,%4                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(src_argb),              // %0
        "+r"(src_argb_stride_temp),  // %1
        "+r"(dst_argb),              // %2
        "+r"(src_dudv),              // %3
        "+rm"(width),                // %4
        "=&r"(temp)                  // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBAFFINEROW_SSE2

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
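// A scalar sketch of the general blend case below (source_y_fraction is
// 0..255, where 0 selects the first row, as in InterpolateRow_C):
//   int f = source_y_fraction;
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] =
//         (src_ptr[x] * (256 - f) + src_ptr[x + src_stride] * f + 128) >> 8;
//   }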
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "sub         %1,%0                         \n"
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      "movd        %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "movd        %3,%%xmm5                     \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"
      "punpcklwd   %%xmm5,%%xmm5                 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm2          \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "psubb       %%xmm4,%%xmm0                 \n"
      "psubb       %%xmm4,%%xmm1                 \n"
      "movdqa      %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm3                 \n"
      "pmaddubsw   %%xmm0,%%xmm2                 \n"
      "pmaddubsw   %%xmm1,%%xmm3                 \n"
      "paddw       %%xmm4,%%xmm2                 \n"
      "paddw       %%xmm4,%%xmm3                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "psrlw       $0x8,%%xmm3                   \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm1          \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          100b                          \n"

      "99:                                       \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+rm"(dst_width),            // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_INTERPOLATEROW_SSSE3

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile(
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "sub         %1,%0                         \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      "vmovd       %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "vmovd       %3,%%xmm5                     \n"
      "vpunpcklbw  %%xmm0,%%xmm5,%%xmm5          \n"
      "vpunpcklwd  %%xmm5,%%xmm5,%%xmm5          \n"
      "vbroadcastss %%xmm5,%%ymm5                \n"
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm4                  \n"
      "vbroadcastss %%xmm4,%%ymm4                \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vmovdqu     0x00(%1,%4,1),%%ymm2          \n"
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsubb      %%ymm4,%%ymm1,%%ymm1          \n"
      "vpsubb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm1,%%ymm5,%%ymm1          \n"
      "vpmaddubsw  %%ymm0,%%ymm5,%%ymm0          \n"
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vpavgb      0x00(%1,%4,1),%%ymm0,%%ymm0   \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "rep         movsb                         \n"
      "jmp         999f                          \n"

      "99:                                       \n"
      "vzeroupper                                \n"
      "999:                                      \n"
      : "+D"(dst_ptr),               // %0
        "+S"(src_ptr),               // %1
        "+cm"(dst_width),            // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
#endif  // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
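// A scalar sketch (pshufb picks each output byte by index within every
// 16-byte group; for 4-byte pixel swizzles the first four shuffler entries
// describe the channel order, as in ARGBShuffleRow_C):
//   for (int x = 0; x < width; ++x) {
//     dst_argb[x * 4 + 0] = src_argb[x * 4 + shuffler[0]];
//     dst_argb[x * 4 + 1] = src_argb[x * 4 + shuffler[1]];
//     dst_argb[x * 4 + 2] = src_argb[x * 4 + shuffler[2]];
//     dst_argb[x * 4 + 3] = src_argb[x * 4 + shuffler[3]];
//   }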
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
  asm volatile(

      "movdqu      (%3),%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(

      "vbroadcastf128 (%3),%%ymm5                \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_I422TOYUY2ROW_SSE2
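// A scalar sketch of the interleave (YUY2 stores Y0,U,Y1,V per pixel pair):
//   for (int x = 0; x < width; x += 2) {
//     dst_yuy2[0] = src_y[0];
//     dst_yuy2[1] = src_u[0];
//     dst_yuy2[2] = src_y[1];
//     dst_yuy2[3] = src_v[0];
//     src_y += 2;
//     src_u += 1;
//     src_v += 1;
//     dst_yuy2 += 4;
//   }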
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%1),%%xmm2                   \n"
      "movq        0x00(%1,%2,1),%%xmm1          \n"
      "add         $0x8,%1                       \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqu      (%0),%%xmm0                   \n"
      "add         $0x10,%0                      \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,(%3)                   \n"
      "movdqu      %%xmm1,0x10(%3)               \n"
      "lea         0x20(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
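// Same interleave as I422ToYUY2Row above, but with the UYVY byte order
// (U,Y0,V,Y1 per pixel pair); only the destination byte positions differ
// from the sketch there.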
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%1),%%xmm2                   \n"
      "movq        0x00(%1,%2,1),%%xmm1          \n"
      "add         $0x8,%1                       \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "add         $0x10,%0                      \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,(%3)                   \n"
      "movdqu      %%xmm2,0x10(%3)               \n"
      "lea         0x20(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_I422TOYUY2ROW_AVX2
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbw   (%1),%%ymm1                   \n"
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
      "add         $0x10,%1                      \n"
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "add         $0x20,%0                      \n"
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_AVX2

#ifdef HAS_I422TOUYVYROW_AVX2
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbw   (%1),%%ymm1                   \n"
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
      "add         $0x10,%1                      \n"
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "add         $0x20,%0                      \n"
      "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
      "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_AVX2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
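// Evaluates a cubic per channel: dst = clamp(C0 + C1*v + C2*v^2 + C3*v^3),
// where poly holds C0..C3 as four groups of four per-channel floats.  A
// scalar sketch for byte i of a pixel (c = i & 3 selects the channel):
//   float v = (float)src_argb[i];
//   float r = poly[c] + poly[4 + c] * v + poly[8 + c] * v * v +
//             poly[12 + c] * v * v * v;
//   dst_argb[i] = (uint8_t)r;  // cvttps2dq + packuswb clamp to 0..255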
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(

      "pxor        %%xmm3,%%xmm3                 \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm3,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm4                 \n"
      "punpcklwd   %%xmm3,%%xmm0                 \n"
      "punpckhwd   %%xmm3,%%xmm4                 \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "cvtdq2ps    %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "mulps       0x10(%3),%%xmm0               \n"
      "mulps       0x10(%3),%%xmm4               \n"
      "addps       (%3),%%xmm0                   \n"
      "addps       (%3),%%xmm4                   \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "mulps       %%xmm1,%%xmm2                 \n"
      "mulps       %%xmm5,%%xmm6                 \n"
      "mulps       %%xmm2,%%xmm1                 \n"
      "mulps       %%xmm6,%%xmm5                 \n"
      "mulps       0x20(%3),%%xmm2               \n"
      "mulps       0x20(%3),%%xmm6               \n"
      "mulps       0x30(%3),%%xmm1               \n"
      "mulps       0x30(%3),%%xmm5               \n"
      "addps       %%xmm2,%%xmm0                 \n"
      "addps       %%xmm6,%%xmm4                 \n"
      "addps       %%xmm1,%%xmm0                 \n"
      "addps       %%xmm5,%%xmm4                 \n"
      "cvttps2dq   %%xmm0,%%xmm0                 \n"
      "cvttps2dq   %%xmm4,%%xmm4                 \n"
      "packuswb    %%xmm4,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x2,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4                \n"
      "vbroadcastf128 0x10(%3),%%ymm5            \n"
      "vbroadcastf128 0x20(%3),%%ymm6            \n"
      "vbroadcastf128 0x30(%3),%%ymm7            \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
      "lea         0x8(%0),%0                    \n"
      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
                                                      // X
      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
      "vmovq       %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x2,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_HALFFLOATROW_SSE2
static float kScaleBias = 1.9259299444e-34f;
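// kScaleBias is 2^-112: multiplying by it lowers the binary32 exponent so
// that, after the 13-bit right shift below, the bit pattern lines up with
// binary16.  A scalar sketch of one element (valid while the scaled value
// stays in normal half-float range; inputs are non-negative, so the sign
// bit is zero and memcpy stands in for the SIMD reinterpretation):
//   float f = (float)src[i] * (scale * 1.9259299444e-34f);
//   uint32_t bits;
//   memcpy(&bits, &f, 4);
//   dst[i] = (uint16_t)(bits >> 13);  // truncating binary32 -> binary16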
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd        %3,%%xmm4                     \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
      "add         $0x10,%0                      \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/3
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
      "punpckhwd   %%xmm5,%%xmm3                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "psrld       $0xd,%%xmm2                   \n"
      "psrld       $0xd,%%xmm3                   \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_SSE2

#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4                   \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // 16 shorts
      "add         $0x20,%0                      \n"
      "vpunpckhwd  %%ymm5,%%ymm2,%%ymm3          \n"  // mutates
      "vpunpcklwd  %%ymm5,%%ymm2,%%ymm2          \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
      "vpsrld      $0xd,%%ymm3,%%ymm3            \n"
      "vpsrld      $0xd,%%ymm2,%%ymm2            \n"
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // unmutates
      "vmovdqu     %%ymm2,-0x20(%0,%1,1)         \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)            // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4                   \n"
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)            // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
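// A scalar sketch (each channel has its own 256-entry table, interleaved
// with a stride of 4, and the transform is done in place):
//   for (int x = 0; x < width; ++x) {
//     uint8_t* p = dst_argb + x * 4;
//     p[0] = table_argb[p[0] * 4 + 0];
//     p[1] = table_argb[p[1] * 4 + 1];
//     p[2] = table_argb[p[2] * 4 + 2];
//     p[3] = table_argb[p[3] * 4 + 3];
//   }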
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      "movzb       -0x1(%0),%1                   \n"
      "movzb       0x03(%3,%1,4),%1              \n"
      "mov         %b1,-0x1(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
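// A scalar sketch as read from the assembly (lumacoeff packs per-channel
// byte weights; the weighted sum's high byte selects one of 256 consecutive
// 256-entry tables in luma, and alpha is copied through unchanged):
//   uint32_t bc = lumacoeff & 0xff, gc = (lumacoeff >> 8) & 0xff,
//            rc = (lumacoeff >> 16) & 0xff;
//   for (int x = 0; x < width; ++x) {
//     const uint8_t* p = src_argb + x * 4;
//     const uint8_t* t = luma + ((p[0] * bc + p[1] * gc + p[2] * rc) & 0xff00);
//     dst_argb[x * 4 + 0] = t[p[0]];
//     dst_argb[x * 4 + 1] = t[p[1]];
//     dst_argb[x * 4 + 2] = t[p[2]];
//     dst_argb[x * 4 + 3] = p[3];
//   }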
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd        %6,%%xmm3                     \n"
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0x8,%%xmm4                   \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%2),%%xmm0                   \n"
      "pmaddubsw   %%xmm3,%%xmm0                 \n"
      "phaddw      %%xmm0,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "punpcklwd   %%xmm5,%%xmm0                 \n"
      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       (%2),%0                       \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,(%3)                      \n"
      "movzb       0x1(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x1(%3)                   \n"
      "movzb       0x2(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x2(%3)                   \n"
      "movzb       0x3(%2),%0                    \n"
      "mov         %b0,0x3(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       0x4(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x4(%3)                   \n"
      "movzb       0x5(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x5(%3)                   \n"
      "movzb       0x6(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x6(%3)                   \n"
      "movzb       0x7(%2),%0                    \n"
      "mov         %b0,0x7(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       0x8(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x8(%3)                   \n"
      "movzb       0x9(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x9(%3)                   \n"
      "movzb       0xa(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xa(%3)                   \n"
      "movzb       0xb(%2),%0                    \n"
      "mov         %b0,0xb(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"

      "movzb       0xc(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xc(%3)                   \n"
      "movzb       0xd(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xd(%3)                   \n"
      "movzb       0xe(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xe(%3)                   \n"
      "movzb       0xf(%2),%0                    \n"
      "mov         %b0,0xf(%3)                   \n"
      "lea         0x10(%2),%2                   \n"
      "lea         0x10(%3),%3                   \n"
      "sub         $0x4,%4                       \n"
      "jg          1b                            \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#ifdef HAS_NV21TOYUV24ROW_AVX2

// Constants for NV21ToYUV24Row_AVX2.
static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};

static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};

static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};

static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};

static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};

static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};

static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};

static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};

static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};

// NV21ToYUV24Row_AVX2
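// A scalar sketch of the intended output (believed to match the C reference
// NV21ToYUV24Row_C: one V,U,Y triplet per output pixel, with NV21 chroma
// shared by each pair of pixels):
//   for (int x = 0; x < width; ++x) {
//     dst_yuv24[x * 3 + 0] = src_vu[(x & ~1) + 0];  // V
//     dst_yuv24[x * 3 + 1] = src_vu[(x & ~1) + 1];  // U
//     dst_yuv24[x * 3 + 2] = src_y[x];              // Y
//   }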
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  uint8_t* src_y_ptr;
  uint64_t src_offset = 0;
  uint64_t width64;

  width64 = width;
  src_y_ptr = (uint8_t*)src_y;

  asm volatile(
      "vmovdqu     %5, %%ymm0                    \n"  // init blend value
      "vmovdqu     %6, %%ymm1                    \n"  // init blend value
      "vmovdqu     %7, %%ymm2                    \n"  // init blend value
      //      "sub         $0x20, %3                     \n"  // sub 32 from
      //      width for final loop

      LABELALIGN
      "1:                                        \n"      // label 1
      "vmovdqu     (%0,%4), %%ymm3               \n"      // src_y
      "vmovdqu     1(%1,%4), %%ymm4              \n"      // src_uv+1
      "vmovdqu     (%1,%4), %%ymm5               \n"      // src_uv
      "vpshufb     %8, %%ymm3, %%ymm13           \n"      // y, kSHUF0 for shuf
      "vpshufb     %9, %%ymm4, %%ymm14           \n"      // uv+1, kSHUF1 for
                                                          // shuf
      "vpshufb     %10, %%ymm5, %%ymm15          \n"      // uv, kSHUF2 for
                                                          // shuf
      "vpshufb     %11, %%ymm3, %%ymm3           \n"      // y kSHUF3 for shuf
      "vpshufb     %12, %%ymm4, %%ymm4           \n"      // uv+1 kSHUF4 for
                                                          // shuf
      "vpblendvb   %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n"  // blend 0
      "vpblendvb   %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n"  // blend 0
      "vpblendvb   %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n"  // blend 2
      "vpblendvb   %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n"  // blend 1
      "vpshufb     %13, %%ymm5, %%ymm15          \n"      // uv, kSHUF5 for
                                                          // shuf
      "vpor        %%ymm4, %%ymm3, %%ymm5        \n"      // get results
      "vmovdqu     %%ymm12, 0x20(%2)             \n"      // store dst_yuv+0x20
      "vpor        %%ymm15, %%ymm5, %%ymm3       \n"      // get results
      "add         $0x20, %4                     \n"      // advance src offset
      "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n"      // combine low lanes
      "vperm2i128  $0x31, %%ymm13, %%ymm3, %%ymm5 \n"     // combine high lanes
      "vmovdqu     %%ymm4, (%2)                  \n"      // store dst_yuv
      "vmovdqu     %%ymm5, 0x40(%2)              \n"      // store dst_yuv+0x40
      "add         $0x60,%2                      \n"      // advance dst pointer
      //      "cmp         %3, %4                        \n"  // (width64 -
      //      32 bytes) and src_offset
      "sub         $0x20,%3                      \n"  // 32 pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"  // avoid SSE-AVX2
                                                      // transitions

      : "+r"(src_y_ptr),  //%0
        "+r"(src_vu),     //%1
        "+r"(dst_yuv24),  //%2
        "+r"(width64),    //%3
        "+r"(src_offset)  //%4
      : "m"(kBLEND0),     //%5
        "m"(kBLEND1),     //%6
        "m"(kBLEND2),     //%7
        "m"(kSHUF0),      //%8
        "m"(kSHUF1),      //%9
        "m"(kSHUF2),      //%10
        "m"(kSHUF3),      //%11
        "m"(kSHUF4),      //%12
        "m"(kSHUF5)       //%13
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm12", "xmm13", "xmm14", "xmm15");
}
#endif  // HAS_NV21TOYUV24ROW_AVX2

#ifdef HAS_SWAPUVROW_SSSE3

// Shuffle table for swapping each pair of adjacent UV bytes to VU.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};

// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "movdqu      %3,%%xmm5                     \n"  // load swap table

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"  // 16 UV pairs per loop
      "jg          1b                            \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_SSSE3

#ifdef HAS_SWAPUVROW_AVX2
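// AVX2 version of SwapUVRow: kShuffleUVToVU is broadcast into both 128-bit
// lanes, so vpshufb swaps the byte pairs within each lane independently;
// 32 UV pairs per iteration.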
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"  // swap table, both lanes

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x20,%2                      \n"  // 32 UV pairs per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_AVX2

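// Merge separate U and V planes into interleaved UV, halving width and
// height with a rounded 2x2 box filter; reads two rows of each source
// plane per call.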
void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0x01 bytes for
      "psrlw       $0xf,%%xmm4                   \n"  // pmaddubsw pair sums
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // zero for pavgw rounding

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // load 16 U values
      "movdqu      (%1),%%xmm1                   \n"  // load 16 V values
      "movdqu      0(%0,%4,1),%%xmm2             \n"  // 16 from next row
      "movdqu      0(%1,%5,1),%%xmm3             \n"
      "lea         0x10(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // half size
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "lea         0x10(%1),%1                   \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // sum each 2x2 block
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"  // shift then pavgw with 0
      "psrlw       $0x1,%%xmm1                   \n"  // yields (sum + 2) >> 2
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // interleave U and V
      "movdqu      %%xmm0,(%2)                   \n"  // store 8 UV pixels
      "lea         0x10(%2),%2                   \n"
      "sub         $0x10,%3                      \n"  // 16 src pixels per loop
      "jg          1b                            \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

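// AVX2 version of HalfMergeUVRow: same rounded 2x2 box filter, 32 source
// pixels per iteration.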
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 0x01 bytes for
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"  // vpmaddubsw pair sums
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero for vpavgw

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 U values
      "vmovdqu     (%1),%%ymm1                   \n"  // load 32 V values
      "vmovdqu     0(%0,%4,1),%%ymm2             \n"  // 32 from next row
      "vmovdqu     0(%1,%5,1),%%ymm3             \n"
      "lea         0x20(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // half size
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "lea         0x20(%1),%1                   \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // sum each 2x2 block
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // shift then vpavgw
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"  // yields (sum + 2) >> 2
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"  // interleave U and V
      "vmovdqu     %%ymm0,(%2)                   \n"  // store 16 UV pixels
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%3                      \n"  // 32 src pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

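// Copy a row of floats, clamping negative values to zero via maxss against
// 0.0f. The loop handles one float per iteration while subtracting 4 from
// the counter, which suggests width is passed in bytes.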
void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
  asm volatile(
      "pxor        %%xmm1,%%xmm1                 \n"  // 0.0f for maxss

      LABELALIGN
      "1:                                        \n"
      "movd        (%0),%%xmm0                   \n"  // load float
      "maxss       %%xmm1, %%xmm0                \n"  // clamp to zero
      "add         $0x4,%0                       \n"
      "movd        %%xmm0, (%1)                  \n"  // store float
      "add         $0x4,%1                       \n"
      "sub         $0x4,%2                       \n"  // 1 float per loop
      "jg          1b                            \n"
      : "+r"(src_x),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif