1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 // This module is for Visual C 32/64 bit and clangcl 32 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15     (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16 
17 #if defined(_M_X64)
18 #include <emmintrin.h>
19 #include <tmmintrin.h>  // For _mm_maddubs_epi16
20 #endif
21 
22 #ifdef __cplusplus
23 namespace libyuv {
24 extern "C" {
25 #endif
26 
27 // 64 bit
28 #if defined(_M_X64)
29 
30 // Read 4 UV from 422, upsample to 8 UV.
31 #define READYUV422                                        \
32   xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
33   xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
34   xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
35   xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
36   u_buf += 4;                                             \
37   xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
38   xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
39   y_buf += 8;
40 
41 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
42 #define READYUVA422                                       \
43   xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
44   xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
45   xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
46   xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
47   u_buf += 4;                                             \
48   xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
49   xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
50   y_buf += 8;                                             \
51   xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                \
52   a_buf += 8;
53 
54 // Convert 8 pixels: 8 UV and 8 Y.
55 #define YUVTORGB(yuvconstants)                                     \
56   xmm1 = _mm_loadu_si128(&xmm0);                                   \
57   xmm2 = _mm_loadu_si128(&xmm0);                                   \
58   xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
59   xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
60   xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
61   xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
62   xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
63   xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
64   xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
65   xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
66   xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
67   xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
68   xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
69   xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
70   xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
71   xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
72   xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
73   xmm2 = _mm_packus_epi16(xmm2, xmm2);
74 
75 // Store 8 ARGB values.
76 #define STOREARGB                                    \
77   xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
78   xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
79   xmm1 = _mm_loadu_si128(&xmm0);                     \
80   xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
81   xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
82   _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
83   _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
84   dst_argb += 32;
85 
86 #if defined(HAS_I422TOARGBROW_SSSE3)
I422ToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)87 void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
88                          const uint8_t* u_buf,
89                          const uint8_t* v_buf,
90                          uint8_t* dst_argb,
91                          const struct YuvConstants* yuvconstants,
92                          int width) {
93   __m128i xmm0, xmm1, xmm2, xmm4;
94   const __m128i xmm5 = _mm_set1_epi8(-1);
95   const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
96   while (width > 0) {
97     READYUV422
98     YUVTORGB(yuvconstants)
99     STOREARGB
100     width -= 8;
101   }
102 }
103 #endif
104 
105 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
I422AlphaToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,const uint8_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)106 void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
107                               const uint8_t* u_buf,
108                               const uint8_t* v_buf,
109                               const uint8_t* a_buf,
110                               uint8_t* dst_argb,
111                               const struct YuvConstants* yuvconstants,
112                               int width) {
113   __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
114   const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
115   while (width > 0) {
116     READYUVA422
117     YUVTORGB(yuvconstants)
118     STOREARGB
119     width -= 8;
120   }
121 }
122 #endif
123 
124 // 32 bit
125 #else  // defined(_M_X64)
126 #ifdef HAS_ARGBTOYROW_SSSE3
127 
128 // Constants for ARGB.
129 static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
130                               13, 65, 33, 0, 13, 65, 33, 0};
131 
132 // JPeg full range.
133 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
134                                15, 75, 38, 0, 15, 75, 38, 0};
135 
136 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
137                               112, -74, -38, 0, 112, -74, -38, 0};
138 
139 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
140                                127, -84, -43, 0, 127, -84, -43, 0};
141 
142 static const vec8 kARGBToV = {
143     -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
144 };
145 
146 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
147                                -20, -107, 127, 0, -20, -107, 127, 0};
148 
149 // vpshufb for vphaddw + vpackuswb packed to shorts.
150 static const lvec8 kShufARGBToUV_AVX = {
151     0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
152     0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
153 
154 // Constants for BGRA.
155 static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
156                               0, 33, 65, 13, 0, 33, 65, 13};
157 
158 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
159                               0, -38, -74, 112, 0, -38, -74, 112};
160 
161 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
162                               0, 112, -94, -18, 0, 112, -94, -18};
163 
164 // Constants for ABGR.
165 static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
166                               33, 65, 13, 0, 33, 65, 13, 0};
167 
168 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
169                               -38, -74, 112, 0, -38, -74, 112, 0};
170 
171 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
172                               112, -94, -18, 0, 112, -94, -18, 0};
173 
174 // Constants for RGBA.
175 static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
176                               0, 13, 65, 33, 0, 13, 65, 33};
177 
178 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
179                               0, 112, -74, -38, 0, 112, -74, -38};
180 
181 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
182                               0, -18, -94, 112, 0, -18, -94, 112};
183 
184 static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
185                               16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
186 
187 // 7 bit fixed point 0.5.
188 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
189 
190 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
191                                 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
192 
193 static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
194                                   0x8080u, 0x8080u, 0x8080u, 0x8080u};
195 
196 // Shuffle table for converting RGB24 to ARGB.
197 static const uvec8 kShuffleMaskRGB24ToARGB = {
198     0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
199 
200 // Shuffle table for converting RAW to ARGB.
201 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
202                                             8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
203 
204 // Shuffle table for converting RAW to RGB24.  First 8.
205 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
206     2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
207     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
208 
209 // Shuffle table for converting RAW to RGB24.  Middle 8.
210 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
211     2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
212     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
213 
214 // Shuffle table for converting RAW to RGB24.  Last 8.
215 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
216     8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
217     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
218 
219 // Shuffle table for converting ARGB to RGB24.
220 static const uvec8 kShuffleMaskARGBToRGB24 = {
221     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
222 
223 // Shuffle table for converting ARGB to RAW.
224 static const uvec8 kShuffleMaskARGBToRAW = {
225     2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
226 
227 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
228 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
229     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
230 
231 // YUY2 shuf 16 Y to 32 Y.
232 static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
233                                     10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
234                                     6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
235 
236 // YUY2 shuf 8 UV to 16 UV.
237 static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
238                                      11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
239                                      5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
240 
241 // UYVY shuf 16 Y to 32 Y.
242 static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
243                                     11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
244                                     7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
245 
246 // UYVY shuf 8 UV to 16 UV.
247 static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
248                                      10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
249                                      4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
250 
251 // NV21 shuf 8 VU to 16 UV.
252 static const lvec8 kShuffleNV21 = {
253     1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
254     1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
255 };
256 
257 // Duplicates gray value 3 times and fills in alpha opaque.
258 __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
259                                           uint8_t* dst_argb,
260                                           int width) {
261   __asm {
262     mov        eax, [esp + 4]  // src_y
263     mov        edx, [esp + 8]  // dst_argb
264     mov        ecx, [esp + 12]  // width
265     pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
266     pslld      xmm5, 24
267 
268   convertloop:
269     movq       xmm0, qword ptr [eax]
270     lea        eax,  [eax + 8]
271     punpcklbw  xmm0, xmm0
272     movdqa     xmm1, xmm0
273     punpcklwd  xmm0, xmm0
274     punpckhwd  xmm1, xmm1
275     por        xmm0, xmm5
276     por        xmm1, xmm5
277     movdqu     [edx], xmm0
278     movdqu     [edx + 16], xmm1
279     lea        edx, [edx + 32]
280     sub        ecx, 8
281     jg         convertloop
282     ret
283   }
284 }
285 
286 #ifdef HAS_J400TOARGBROW_AVX2
287 // Duplicates gray value 3 times and fills in alpha opaque.
288 __declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
289                                           uint8_t* dst_argb,
290                                           int width) {
291   __asm {
292     mov         eax, [esp + 4]  // src_y
293     mov         edx, [esp + 8]  // dst_argb
294     mov         ecx, [esp + 12]  // width
295     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
296     vpslld      ymm5, ymm5, 24
297 
298   convertloop:
299     vmovdqu     xmm0, [eax]
300     lea         eax,  [eax + 16]
301     vpermq      ymm0, ymm0, 0xd8
302     vpunpcklbw  ymm0, ymm0, ymm0
303     vpermq      ymm0, ymm0, 0xd8
304     vpunpckhwd  ymm1, ymm0, ymm0
305     vpunpcklwd  ymm0, ymm0, ymm0
306     vpor        ymm0, ymm0, ymm5
307     vpor        ymm1, ymm1, ymm5
308     vmovdqu     [edx], ymm0
309     vmovdqu     [edx + 32], ymm1
310     lea         edx, [edx + 64]
311     sub         ecx, 16
312     jg          convertloop
313     vzeroupper
314     ret
315   }
316 }
317 #endif  // HAS_J400TOARGBROW_AVX2
318 
319 __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
320                                             uint8_t* dst_argb,
321                                             int width) {
322   __asm {
323     mov       eax, [esp + 4]  // src_rgb24
324     mov       edx, [esp + 8]  // dst_argb
325     mov       ecx, [esp + 12]  // width
326     pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
327     pslld     xmm5, 24
328     movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
329 
330  convertloop:
331     movdqu    xmm0, [eax]
332     movdqu    xmm1, [eax + 16]
333     movdqu    xmm3, [eax + 32]
334     lea       eax, [eax + 48]
335     movdqa    xmm2, xmm3
336     palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
337     pshufb    xmm2, xmm4
338     por       xmm2, xmm5
339     palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
340     pshufb    xmm0, xmm4
341     movdqu    [edx + 32], xmm2
342     por       xmm0, xmm5
343     pshufb    xmm1, xmm4
344     movdqu    [edx], xmm0
345     por       xmm1, xmm5
346     palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
347     pshufb    xmm3, xmm4
348     movdqu    [edx + 16], xmm1
349     por       xmm3, xmm5
350     movdqu    [edx + 48], xmm3
351     lea       edx, [edx + 64]
352     sub       ecx, 16
353     jg        convertloop
354     ret
355   }
356 }
357 
358 __declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
359                                           uint8_t* dst_argb,
360                                           int width) {
361   __asm {
362     mov       eax, [esp + 4]  // src_raw
363     mov       edx, [esp + 8]  // dst_argb
364     mov       ecx, [esp + 12]  // width
365     pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
366     pslld     xmm5, 24
367     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
368 
369  convertloop:
370     movdqu    xmm0, [eax]
371     movdqu    xmm1, [eax + 16]
372     movdqu    xmm3, [eax + 32]
373     lea       eax, [eax + 48]
374     movdqa    xmm2, xmm3
375     palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
376     pshufb    xmm2, xmm4
377     por       xmm2, xmm5
378     palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
379     pshufb    xmm0, xmm4
380     movdqu    [edx + 32], xmm2
381     por       xmm0, xmm5
382     pshufb    xmm1, xmm4
383     movdqu    [edx], xmm0
384     por       xmm1, xmm5
385     palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
386     pshufb    xmm3, xmm4
387     movdqu    [edx + 16], xmm1
388     por       xmm3, xmm5
389     movdqu    [edx + 48], xmm3
390     lea       edx, [edx + 64]
391     sub       ecx, 16
392     jg        convertloop
393     ret
394   }
395 }
396 
397 __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
398                                            uint8_t* dst_rgb24,
399                                            int width) {
400   __asm {
401     mov       eax, [esp + 4]  // src_raw
402     mov       edx, [esp + 8]  // dst_rgb24
403     mov       ecx, [esp + 12]  // width
404     movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
405     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
406     movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
407 
408  convertloop:
409     movdqu    xmm0, [eax]
410     movdqu    xmm1, [eax + 4]
411     movdqu    xmm2, [eax + 8]
412     lea       eax, [eax + 24]
413     pshufb    xmm0, xmm3
414     pshufb    xmm1, xmm4
415     pshufb    xmm2, xmm5
416     movq      qword ptr [edx], xmm0
417     movq      qword ptr [edx + 8], xmm1
418     movq      qword ptr [edx + 16], xmm2
419     lea       edx, [edx + 24]
420     sub       ecx, 8
421     jg        convertloop
422     ret
423   }
424 }
425 
426 // pmul method to replicate bits.
427 // Math to replicate bits:
428 // (v << 8) | (v << 3)
429 // v * 256 + v * 8
430 // v * (256 + 8)
431 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
432 // 20 instructions.
433 __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
434                                             uint8_t* dst_argb,
435                                             int width) {
436   __asm {
437     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
438     movd      xmm5, eax
439     pshufd    xmm5, xmm5, 0
440     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
441     movd      xmm6, eax
442     pshufd    xmm6, xmm6, 0
443     pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
444     psllw     xmm3, 11
445     pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
446     psllw     xmm4, 10
447     psrlw     xmm4, 5
448     pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
449     psllw     xmm7, 8
450 
451     mov       eax, [esp + 4]  // src_rgb565
452     mov       edx, [esp + 8]  // dst_argb
453     mov       ecx, [esp + 12]  // width
454     sub       edx, eax
455     sub       edx, eax
456 
457  convertloop:
458     movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
459     movdqa    xmm1, xmm0
460     movdqa    xmm2, xmm0
461     pand      xmm1, xmm3  // R in upper 5 bits
462     psllw     xmm2, 11  // B in upper 5 bits
463     pmulhuw   xmm1, xmm5  // * (256 + 8)
464     pmulhuw   xmm2, xmm5  // * (256 + 8)
465     psllw     xmm1, 8
466     por       xmm1, xmm2  // RB
467     pand      xmm0, xmm4  // G in middle 6 bits
468     pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
469     por       xmm0, xmm7  // AG
470     movdqa    xmm2, xmm1
471     punpcklbw xmm1, xmm0
472     punpckhbw xmm2, xmm0
473     movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
474     movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
475     lea       eax, [eax + 16]
476     sub       ecx, 8
477     jg        convertloop
478     ret
479   }
480 }
481 
482 #ifdef HAS_RGB565TOARGBROW_AVX2
483 // pmul method to replicate bits.
484 // Math to replicate bits:
485 // (v << 8) | (v << 3)
486 // v * 256 + v * 8
487 // v * (256 + 8)
488 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
489 __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
490                                             uint8_t* dst_argb,
491                                             int width) {
492   __asm {
493     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
494     vmovd      xmm5, eax
495     vbroadcastss ymm5, xmm5
496     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
497     vmovd      xmm6, eax
498     vbroadcastss ymm6, xmm6
499     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
500     vpsllw     ymm3, ymm3, 11
501     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
502     vpsllw     ymm4, ymm4, 10
503     vpsrlw     ymm4, ymm4, 5
504     vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
505     vpsllw     ymm7, ymm7, 8
506 
507     mov        eax, [esp + 4]  // src_rgb565
508     mov        edx, [esp + 8]  // dst_argb
509     mov        ecx, [esp + 12]  // width
510     sub        edx, eax
511     sub        edx, eax
512 
513  convertloop:
514     vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
515     vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
516     vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
517     vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
518     vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
519     vpsllw     ymm1, ymm1, 8
520     vpor       ymm1, ymm1, ymm2  // RB
521     vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
522     vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
523     vpor       ymm0, ymm0, ymm7  // AG
524     vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
525     vpermq     ymm1, ymm1, 0xd8
526     vpunpckhbw ymm2, ymm1, ymm0
527     vpunpcklbw ymm1, ymm1, ymm0
528     vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB
529     vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB
530     lea       eax, [eax + 32]
531     sub       ecx, 16
532     jg        convertloop
533     vzeroupper
534     ret
535   }
536 }
537 #endif  // HAS_RGB565TOARGBROW_AVX2
538 
539 #ifdef HAS_ARGB1555TOARGBROW_AVX2
540 __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
541                                               uint8_t* dst_argb,
542                                               int width) {
543   __asm {
544     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
545     vmovd      xmm5, eax
546     vbroadcastss ymm5, xmm5
547     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
548     vmovd      xmm6, eax
549     vbroadcastss ymm6, xmm6
550     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
551     vpsllw     ymm3, ymm3, 11
552     vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
553     vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
554     vpsllw     ymm7, ymm7, 8
555 
556     mov        eax,  [esp + 4]  // src_argb1555
557     mov        edx,  [esp + 8]  // dst_argb
558     mov        ecx,  [esp + 12]  // width
559     sub        edx,  eax
560     sub        edx,  eax
561 
562  convertloop:
563     vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
564     vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
565     vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
566     vpand      ymm1, ymm1, ymm3
567     vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
568     vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
569     vpsllw     ymm1, ymm1, 8
570     vpor       ymm1, ymm1, ymm2  // RB
571     vpsraw     ymm2, ymm0, 8  // A
572     vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
573     vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
574     vpand      ymm2, ymm2, ymm7
575     vpor       ymm0, ymm0, ymm2  // AG
576     vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
577     vpermq     ymm1, ymm1, 0xd8
578     vpunpckhbw ymm2, ymm1, ymm0
579     vpunpcklbw ymm1, ymm1, ymm0
580     vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
581     vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
582     lea       eax, [eax + 32]
583     sub       ecx, 16
584     jg        convertloop
585     vzeroupper
586     ret
587   }
588 }
589 #endif  // HAS_ARGB1555TOARGBROW_AVX2
590 
591 #ifdef HAS_ARGB4444TOARGBROW_AVX2
592 __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
593                                               uint8_t* dst_argb,
594                                               int width) {
595   __asm {
596     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
597     vmovd     xmm4, eax
598     vbroadcastss ymm4, xmm4
599     vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
600     mov       eax,  [esp + 4]  // src_argb4444
601     mov       edx,  [esp + 8]  // dst_argb
602     mov       ecx,  [esp + 12]  // width
603     sub       edx,  eax
604     sub       edx,  eax
605 
606  convertloop:
607     vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
608     vpand      ymm2, ymm0, ymm5  // mask high nibbles
609     vpand      ymm0, ymm0, ymm4  // mask low nibbles
610     vpsrlw     ymm3, ymm2, 4
611     vpsllw     ymm1, ymm0, 4
612     vpor       ymm2, ymm2, ymm3
613     vpor       ymm0, ymm0, ymm1
614     vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
615     vpermq     ymm2, ymm2, 0xd8
616     vpunpckhbw ymm1, ymm0, ymm2
617     vpunpcklbw ymm0, ymm0, ymm2
618     vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
619     vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
620     lea       eax, [eax + 32]
621     sub       ecx, 16
622     jg        convertloop
623     vzeroupper
624     ret
625   }
626 }
627 #endif  // HAS_ARGB4444TOARGBROW_AVX2
628 
629 // 24 instructions
630 __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
631                                               uint8_t* dst_argb,
632                                               int width) {
633   __asm {
634     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
635     movd      xmm5, eax
636     pshufd    xmm5, xmm5, 0
637     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
638     movd      xmm6, eax
639     pshufd    xmm6, xmm6, 0
640     pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
641     psllw     xmm3, 11
642     movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
643     psrlw     xmm4, 6
644     pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
645     psllw     xmm7, 8
646 
647     mov       eax, [esp + 4]  // src_argb1555
648     mov       edx, [esp + 8]  // dst_argb
649     mov       ecx, [esp + 12]  // width
650     sub       edx, eax
651     sub       edx, eax
652 
653  convertloop:
654     movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
655     movdqa    xmm1, xmm0
656     movdqa    xmm2, xmm0
657     psllw     xmm1, 1  // R in upper 5 bits
658     psllw     xmm2, 11  // B in upper 5 bits
659     pand      xmm1, xmm3
660     pmulhuw   xmm2, xmm5  // * (256 + 8)
661     pmulhuw   xmm1, xmm5  // * (256 + 8)
662     psllw     xmm1, 8
663     por       xmm1, xmm2  // RB
664     movdqa    xmm2, xmm0
665     pand      xmm0, xmm4  // G in middle 5 bits
666     psraw     xmm2, 8  // A
667     pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
668     pand      xmm2, xmm7
669     por       xmm0, xmm2  // AG
670     movdqa    xmm2, xmm1
671     punpcklbw xmm1, xmm0
672     punpckhbw xmm2, xmm0
673     movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
674     movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
675     lea       eax, [eax + 16]
676     sub       ecx, 8
677     jg        convertloop
678     ret
679   }
680 }
681 
682 // 18 instructions.
683 __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
684                                               uint8_t* dst_argb,
685                                               int width) {
686   __asm {
687     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
688     movd      xmm4, eax
689     pshufd    xmm4, xmm4, 0
690     movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
691     pslld     xmm5, 4
692     mov       eax, [esp + 4]  // src_argb4444
693     mov       edx, [esp + 8]  // dst_argb
694     mov       ecx, [esp + 12]  // width
695     sub       edx, eax
696     sub       edx, eax
697 
698  convertloop:
699     movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
700     movdqa    xmm2, xmm0
701     pand      xmm0, xmm4  // mask low nibbles
702     pand      xmm2, xmm5  // mask high nibbles
703     movdqa    xmm1, xmm0
704     movdqa    xmm3, xmm2
705     psllw     xmm1, 4
706     psrlw     xmm3, 4
707     por       xmm0, xmm1
708     por       xmm2, xmm3
709     movdqa    xmm1, xmm0
710     punpcklbw xmm0, xmm2
711     punpckhbw xmm1, xmm2
712     movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
713     movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
714     lea       eax, [eax + 16]
715     sub       ecx, 8
716     jg        convertloop
717     ret
718   }
719 }
720 
721 __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
722                                             uint8_t* dst_rgb,
723                                             int width) {
724   __asm {
725     mov       eax, [esp + 4]  // src_argb
726     mov       edx, [esp + 8]  // dst_rgb
727     mov       ecx, [esp + 12]  // width
728     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
729 
730  convertloop:
731     movdqu    xmm0, [eax]  // fetch 16 pixels of argb
732     movdqu    xmm1, [eax + 16]
733     movdqu    xmm2, [eax + 32]
734     movdqu    xmm3, [eax + 48]
735     lea       eax, [eax + 64]
736     pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
737     pshufb    xmm1, xmm6
738     pshufb    xmm2, xmm6
739     pshufb    xmm3, xmm6
740     movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
741     psrldq    xmm1, 4  // 8 bytes from 1
742     pslldq    xmm4, 12  // 4 bytes from 1 for 0
743     movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
744     por       xmm0, xmm4  // 4 bytes from 1 for 0
745     pslldq    xmm5, 8  // 8 bytes from 2 for 1
746     movdqu    [edx], xmm0  // store 0
747     por       xmm1, xmm5  // 8 bytes from 2 for 1
748     psrldq    xmm2, 8  // 4 bytes from 2
749     pslldq    xmm3, 4  // 12 bytes from 3 for 2
750     por       xmm2, xmm3  // 12 bytes from 3 for 2
751     movdqu    [edx + 16], xmm1  // store 1
752     movdqu    [edx + 32], xmm2  // store 2
753     lea       edx, [edx + 48]
754     sub       ecx, 16
755     jg        convertloop
756     ret
757   }
758 }
759 
760 __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
761                                           uint8_t* dst_rgb,
762                                           int width) {
763   __asm {
764     mov       eax, [esp + 4]  // src_argb
765     mov       edx, [esp + 8]  // dst_rgb
766     mov       ecx, [esp + 12]  // width
767     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
768 
769  convertloop:
770     movdqu    xmm0, [eax]  // fetch 16 pixels of argb
771     movdqu    xmm1, [eax + 16]
772     movdqu    xmm2, [eax + 32]
773     movdqu    xmm3, [eax + 48]
774     lea       eax, [eax + 64]
775     pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
776     pshufb    xmm1, xmm6
777     pshufb    xmm2, xmm6
778     pshufb    xmm3, xmm6
779     movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
780     psrldq    xmm1, 4  // 8 bytes from 1
781     pslldq    xmm4, 12  // 4 bytes from 1 for 0
782     movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
783     por       xmm0, xmm4  // 4 bytes from 1 for 0
784     pslldq    xmm5, 8  // 8 bytes from 2 for 1
785     movdqu    [edx], xmm0  // store 0
786     por       xmm1, xmm5  // 8 bytes from 2 for 1
787     psrldq    xmm2, 8  // 4 bytes from 2
788     pslldq    xmm3, 4  // 12 bytes from 3 for 2
789     por       xmm2, xmm3  // 12 bytes from 3 for 2
790     movdqu    [edx + 16], xmm1  // store 1
791     movdqu    [edx + 32], xmm2  // store 2
792     lea       edx, [edx + 48]
793     sub       ecx, 16
794     jg        convertloop
795     ret
796   }
797 }
798 
799 __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
800                                             uint8_t* dst_rgb,
801                                             int width) {
802   __asm {
803     mov       eax, [esp + 4]  // src_argb
804     mov       edx, [esp + 8]  // dst_rgb
805     mov       ecx, [esp + 12]  // width
806     pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
807     psrld     xmm3, 27
808     pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
809     psrld     xmm4, 26
810     pslld     xmm4, 5
811     pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
812     pslld     xmm5, 11
813 
814  convertloop:
815     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
816     movdqa    xmm1, xmm0  // B
817     movdqa    xmm2, xmm0  // G
818     pslld     xmm0, 8  // R
819     psrld     xmm1, 3  // B
820     psrld     xmm2, 5  // G
821     psrad     xmm0, 16  // R
822     pand      xmm1, xmm3  // B
823     pand      xmm2, xmm4  // G
824     pand      xmm0, xmm5  // R
825     por       xmm1, xmm2  // BG
826     por       xmm0, xmm1  // BGR
827     packssdw  xmm0, xmm0
828     lea       eax, [eax + 16]
829     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
830     lea       edx, [edx + 8]
831     sub       ecx, 4
832     jg        convertloop
833     ret
834   }
835 }
836 
837 __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
838                                                   uint8_t* dst_rgb,
839                                                   const uint32_t dither4,
840                                                   int width) {
841   __asm {
842 
843     mov       eax, [esp + 4]  // src_argb
844     mov       edx, [esp + 8]  // dst_rgb
845     movd      xmm6, [esp + 12]  // dither4
846     mov       ecx, [esp + 16]  // width
847     punpcklbw xmm6, xmm6  // make dither 16 bytes
848     movdqa    xmm7, xmm6
849     punpcklwd xmm6, xmm6
850     punpckhwd xmm7, xmm7
851     pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
852     psrld     xmm3, 27
853     pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
854     psrld     xmm4, 26
855     pslld     xmm4, 5
856     pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
857     pslld     xmm5, 11
858 
859  convertloop:
860     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
861     paddusb   xmm0, xmm6  // add dither
862     movdqa    xmm1, xmm0  // B
863     movdqa    xmm2, xmm0  // G
864     pslld     xmm0, 8  // R
865     psrld     xmm1, 3  // B
866     psrld     xmm2, 5  // G
867     psrad     xmm0, 16  // R
868     pand      xmm1, xmm3  // B
869     pand      xmm2, xmm4  // G
870     pand      xmm0, xmm5  // R
871     por       xmm1, xmm2  // BG
872     por       xmm0, xmm1  // BGR
873     packssdw  xmm0, xmm0
874     lea       eax, [eax + 16]
875     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
876     lea       edx, [edx + 8]
877     sub       ecx, 4
878     jg        convertloop
879     ret
880   }
881 }
882 
883 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
884 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
885                                                   uint8_t* dst_rgb,
886                                                   const uint32_t dither4,
887                                                   int width) {
888   __asm {
889     mov        eax, [esp + 4]  // src_argb
890     mov        edx, [esp + 8]  // dst_rgb
891     vbroadcastss xmm6, [esp + 12]  // dither4
892     mov        ecx, [esp + 16]  // width
893     vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
894     vpermq     ymm6, ymm6, 0xd8
895     vpunpcklwd ymm6, ymm6, ymm6
896     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
897     vpsrld     ymm3, ymm3, 27
898     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
899     vpsrld     ymm4, ymm4, 26
900     vpslld     ymm4, ymm4, 5
901     vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
902 
903  convertloop:
904     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
905     vpaddusb   ymm0, ymm0, ymm6  // add dither
906     vpsrld     ymm2, ymm0, 5  // G
907     vpsrld     ymm1, ymm0, 3  // B
908     vpsrld     ymm0, ymm0, 8  // R
909     vpand      ymm2, ymm2, ymm4  // G
910     vpand      ymm1, ymm1, ymm3  // B
911     vpand      ymm0, ymm0, ymm5  // R
912     vpor       ymm1, ymm1, ymm2  // BG
913     vpor       ymm0, ymm0, ymm1  // BGR
914     vpackusdw  ymm0, ymm0, ymm0
915     vpermq     ymm0, ymm0, 0xd8
916     lea        eax, [eax + 32]
917     vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
918     lea        edx, [edx + 16]
919     sub        ecx, 8
920     jg         convertloop
921     vzeroupper
922     ret
923   }
924 }
925 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
926 
927 // TODO(fbarchard): Improve sign extension/packing.
928 __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
929                                               uint8_t* dst_rgb,
930                                               int width) {
931   __asm {
932     mov       eax, [esp + 4]  // src_argb
933     mov       edx, [esp + 8]  // dst_rgb
934     mov       ecx, [esp + 12]  // width
935     pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
936     psrld     xmm4, 27
937     movdqa    xmm5, xmm4  // generate mask 0x000003e0
938     pslld     xmm5, 5
939     movdqa    xmm6, xmm4  // generate mask 0x00007c00
940     pslld     xmm6, 10
941     pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
942     pslld     xmm7, 15
943 
944  convertloop:
945     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
946     movdqa    xmm1, xmm0  // B
947     movdqa    xmm2, xmm0  // G
948     movdqa    xmm3, xmm0  // R
949     psrad     xmm0, 16  // A
950     psrld     xmm1, 3  // B
951     psrld     xmm2, 6  // G
952     psrld     xmm3, 9  // R
953     pand      xmm0, xmm7  // A
954     pand      xmm1, xmm4  // B
955     pand      xmm2, xmm5  // G
956     pand      xmm3, xmm6  // R
957     por       xmm0, xmm1  // BA
958     por       xmm2, xmm3  // GR
959     por       xmm0, xmm2  // BGRA
960     packssdw  xmm0, xmm0
961     lea       eax, [eax + 16]
962     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
963     lea       edx, [edx + 8]
964     sub       ecx, 4
965     jg        convertloop
966     ret
967   }
968 }
969 
970 __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
971                                               uint8_t* dst_rgb,
972                                               int width) {
973   __asm {
974     mov       eax, [esp + 4]  // src_argb
975     mov       edx, [esp + 8]  // dst_rgb
976     mov       ecx, [esp + 12]  // width
977     pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
978     psllw     xmm4, 12
979     movdqa    xmm3, xmm4  // generate mask 0x00f000f0
980     psrlw     xmm3, 8
981 
982  convertloop:
983     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
984     movdqa    xmm1, xmm0
985     pand      xmm0, xmm3  // low nibble
986     pand      xmm1, xmm4  // high nibble
987     psrld     xmm0, 4
988     psrld     xmm1, 8
989     por       xmm0, xmm1
990     packuswb  xmm0, xmm0
991     lea       eax, [eax + 16]
992     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
993     lea       edx, [edx + 8]
994     sub       ecx, 4
995     jg        convertloop
996     ret
997   }
998 }
999 
1000 #ifdef HAS_ARGBTORGB565ROW_AVX2
1001 __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
1002                                             uint8_t* dst_rgb,
1003                                             int width) {
1004   __asm {
1005     mov        eax, [esp + 4]  // src_argb
1006     mov        edx, [esp + 8]  // dst_rgb
1007     mov        ecx, [esp + 12]  // width
1008     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
1009     vpsrld     ymm3, ymm3, 27
1010     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
1011     vpsrld     ymm4, ymm4, 26
1012     vpslld     ymm4, ymm4, 5
1013     vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
1014 
1015  convertloop:
1016     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
1017     vpsrld     ymm2, ymm0, 5  // G
1018     vpsrld     ymm1, ymm0, 3  // B
1019     vpsrld     ymm0, ymm0, 8  // R
1020     vpand      ymm2, ymm2, ymm4  // G
1021     vpand      ymm1, ymm1, ymm3  // B
1022     vpand      ymm0, ymm0, ymm5  // R
1023     vpor       ymm1, ymm1, ymm2  // BG
1024     vpor       ymm0, ymm0, ymm1  // BGR
1025     vpackusdw  ymm0, ymm0, ymm0
1026     vpermq     ymm0, ymm0, 0xd8
1027     lea        eax, [eax + 32]
1028     vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
1029     lea        edx, [edx + 16]
1030     sub        ecx, 8
1031     jg         convertloop
1032     vzeroupper
1033     ret
1034   }
1035 }
1036 #endif  // HAS_ARGBTORGB565ROW_AVX2
1037 
1038 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
1039 __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
1040                                               uint8_t* dst_rgb,
1041                                               int width) {
1042   __asm {
1043     mov        eax, [esp + 4]  // src_argb
1044     mov        edx, [esp + 8]  // dst_rgb
1045     mov        ecx, [esp + 12]  // width
1046     vpcmpeqb   ymm4, ymm4, ymm4
1047     vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
1048     vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
1049     vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
1050     vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
1051     vpslld     ymm7, ymm7, 15
1052 
1053  convertloop:
1054     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
1055     vpsrld     ymm3, ymm0, 9  // R
1056     vpsrld     ymm2, ymm0, 6  // G
1057     vpsrld     ymm1, ymm0, 3  // B
1058     vpsrad     ymm0, ymm0, 16  // A
1059     vpand      ymm3, ymm3, ymm6  // R
1060     vpand      ymm2, ymm2, ymm5  // G
1061     vpand      ymm1, ymm1, ymm4  // B
1062     vpand      ymm0, ymm0, ymm7  // A
1063     vpor       ymm0, ymm0, ymm1  // BA
1064     vpor       ymm2, ymm2, ymm3  // GR
1065     vpor       ymm0, ymm0, ymm2  // BGRA
1066     vpackssdw  ymm0, ymm0, ymm0
1067     vpermq     ymm0, ymm0, 0xd8
1068     lea        eax, [eax + 32]
1069     vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
1070     lea        edx, [edx + 16]
1071     sub        ecx, 8
1072     jg         convertloop
1073     vzeroupper
1074     ret
1075   }
1076 }
1077 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
1078 
1079 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
1080 __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
1081                                               uint8_t* dst_rgb,
1082                                               int width) {
1083   __asm {
1084     mov        eax, [esp + 4]  // src_argb
1085     mov        edx, [esp + 8]  // dst_rgb
1086     mov        ecx, [esp + 12]  // width
1087     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
1088     vpsllw     ymm4, ymm4, 12
1089     vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0
1090 
1091  convertloop:
1092     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
1093     vpand      ymm1, ymm0, ymm4  // high nibble
1094     vpand      ymm0, ymm0, ymm3  // low nibble
1095     vpsrld     ymm1, ymm1, 8
1096     vpsrld     ymm0, ymm0, 4
1097     vpor       ymm0, ymm0, ymm1
1098     vpackuswb  ymm0, ymm0, ymm0
1099     vpermq     ymm0, ymm0, 0xd8
1100     lea        eax, [eax + 32]
1101     vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
1102     lea        edx, [edx + 16]
1103     sub        ecx, 8
1104     jg         convertloop
1105     vzeroupper
1106     ret
1107   }
1108 }
1109 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
1110 
1111 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1112 __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
1113                                         uint8_t* dst_y,
1114                                         int width) {
1115   __asm {
1116     mov        eax, [esp + 4] /* src_argb */
1117     mov        edx, [esp + 8] /* dst_y */
1118     mov        ecx, [esp + 12] /* width */
1119     movdqa     xmm4, xmmword ptr kARGBToY
1120     movdqa     xmm5, xmmword ptr kAddY16
1121 
1122  convertloop:
1123     movdqu     xmm0, [eax]
1124     movdqu     xmm1, [eax + 16]
1125     movdqu     xmm2, [eax + 32]
1126     movdqu     xmm3, [eax + 48]
1127     pmaddubsw  xmm0, xmm4
1128     pmaddubsw  xmm1, xmm4
1129     pmaddubsw  xmm2, xmm4
1130     pmaddubsw  xmm3, xmm4
1131     lea        eax, [eax + 64]
1132     phaddw     xmm0, xmm1
1133     phaddw     xmm2, xmm3
1134     psrlw      xmm0, 7
1135     psrlw      xmm2, 7
1136     packuswb   xmm0, xmm2
1137     paddb      xmm0, xmm5
1138     movdqu     [edx], xmm0
1139     lea        edx, [edx + 16]
1140     sub        ecx, 16
1141     jg         convertloop
1142     ret
1143   }
1144 }
1145 
1146 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1147 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1148 __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
1149                                          uint8_t* dst_y,
1150                                          int width) {
1151   __asm {
1152     mov        eax, [esp + 4] /* src_argb */
1153     mov        edx, [esp + 8] /* dst_y */
1154     mov        ecx, [esp + 12] /* width */
1155     movdqa     xmm4, xmmword ptr kARGBToYJ
1156     movdqa     xmm5, xmmword ptr kAddYJ64
1157 
1158  convertloop:
1159     movdqu     xmm0, [eax]
1160     movdqu     xmm1, [eax + 16]
1161     movdqu     xmm2, [eax + 32]
1162     movdqu     xmm3, [eax + 48]
1163     pmaddubsw  xmm0, xmm4
1164     pmaddubsw  xmm1, xmm4
1165     pmaddubsw  xmm2, xmm4
1166     pmaddubsw  xmm3, xmm4
1167     lea        eax, [eax + 64]
1168     phaddw     xmm0, xmm1
1169     phaddw     xmm2, xmm3
1170     paddw      xmm0, xmm5  // Add .5 for rounding.
1171     paddw      xmm2, xmm5
1172     psrlw      xmm0, 7
1173     psrlw      xmm2, 7
1174     packuswb   xmm0, xmm2
1175     movdqu     [edx], xmm0
1176     lea        edx, [edx + 16]
1177     sub        ecx, 16
1178     jg         convertloop
1179     ret
1180   }
1181 }
1182 
1183 #ifdef HAS_ARGBTOYROW_AVX2
1184 // vpermd for vphaddw + vpackuswb vpermd.
1185 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1186 
1187 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1188 __declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
1189                                        uint8_t* dst_y,
1190                                        int width) {
1191   __asm {
1192     mov        eax, [esp + 4] /* src_argb */
1193     mov        edx, [esp + 8] /* dst_y */
1194     mov        ecx, [esp + 12] /* width */
1195     vbroadcastf128 ymm4, xmmword ptr kARGBToY
1196     vbroadcastf128 ymm5, xmmword ptr kAddY16
1197     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1198 
1199  convertloop:
1200     vmovdqu    ymm0, [eax]
1201     vmovdqu    ymm1, [eax + 32]
1202     vmovdqu    ymm2, [eax + 64]
1203     vmovdqu    ymm3, [eax + 96]
1204     vpmaddubsw ymm0, ymm0, ymm4
1205     vpmaddubsw ymm1, ymm1, ymm4
1206     vpmaddubsw ymm2, ymm2, ymm4
1207     vpmaddubsw ymm3, ymm3, ymm4
1208     lea        eax, [eax + 128]
1209     vphaddw    ymm0, ymm0, ymm1  // mutates.
1210     vphaddw    ymm2, ymm2, ymm3
1211     vpsrlw     ymm0, ymm0, 7
1212     vpsrlw     ymm2, ymm2, 7
1213     vpackuswb  ymm0, ymm0, ymm2  // mutates.
1214     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1215     vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
1216     vmovdqu    [edx], ymm0
1217     lea        edx, [edx + 32]
1218     sub        ecx, 32
1219     jg         convertloop
1220     vzeroupper
1221     ret
1222   }
1223 }
1224 #endif  //  HAS_ARGBTOYROW_AVX2
1225 
1226 #ifdef HAS_ARGBTOYJROW_AVX2
1227 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1228 __declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
1229                                         uint8_t* dst_y,
1230                                         int width) {
1231   __asm {
1232     mov        eax, [esp + 4] /* src_argb */
1233     mov        edx, [esp + 8] /* dst_y */
1234     mov        ecx, [esp + 12] /* width */
1235     vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1236     vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1237     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1238 
1239  convertloop:
1240     vmovdqu    ymm0, [eax]
1241     vmovdqu    ymm1, [eax + 32]
1242     vmovdqu    ymm2, [eax + 64]
1243     vmovdqu    ymm3, [eax + 96]
1244     vpmaddubsw ymm0, ymm0, ymm4
1245     vpmaddubsw ymm1, ymm1, ymm4
1246     vpmaddubsw ymm2, ymm2, ymm4
1247     vpmaddubsw ymm3, ymm3, ymm4
1248     lea        eax, [eax + 128]
1249     vphaddw    ymm0, ymm0, ymm1  // mutates.
1250     vphaddw    ymm2, ymm2, ymm3
1251     vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
1252     vpaddw     ymm2, ymm2, ymm5
1253     vpsrlw     ymm0, ymm0, 7
1254     vpsrlw     ymm2, ymm2, 7
1255     vpackuswb  ymm0, ymm0, ymm2  // mutates.
1256     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1257     vmovdqu    [edx], ymm0
1258     lea        edx, [edx + 32]
1259     sub        ecx, 32
1260     jg         convertloop
1261 
1262     vzeroupper
1263     ret
1264   }
1265 }
1266 #endif  //  HAS_ARGBTOYJROW_AVX2
1267 
1268 __declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
1269                                         uint8_t* dst_y,
1270                                         int width) {
1271   __asm {
1272     mov        eax, [esp + 4] /* src_argb */
1273     mov        edx, [esp + 8] /* dst_y */
1274     mov        ecx, [esp + 12] /* width */
1275     movdqa     xmm4, xmmword ptr kBGRAToY
1276     movdqa     xmm5, xmmword ptr kAddY16
1277 
1278  convertloop:
1279     movdqu     xmm0, [eax]
1280     movdqu     xmm1, [eax + 16]
1281     movdqu     xmm2, [eax + 32]
1282     movdqu     xmm3, [eax + 48]
1283     pmaddubsw  xmm0, xmm4
1284     pmaddubsw  xmm1, xmm4
1285     pmaddubsw  xmm2, xmm4
1286     pmaddubsw  xmm3, xmm4
1287     lea        eax, [eax + 64]
1288     phaddw     xmm0, xmm1
1289     phaddw     xmm2, xmm3
1290     psrlw      xmm0, 7
1291     psrlw      xmm2, 7
1292     packuswb   xmm0, xmm2
1293     paddb      xmm0, xmm5
1294     movdqu     [edx], xmm0
1295     lea        edx, [edx + 16]
1296     sub        ecx, 16
1297     jg         convertloop
1298     ret
1299   }
1300 }
1301 
1302 __declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
1303                                         uint8_t* dst_y,
1304                                         int width) {
1305   __asm {
1306     mov        eax, [esp + 4] /* src_argb */
1307     mov        edx, [esp + 8] /* dst_y */
1308     mov        ecx, [esp + 12] /* width */
1309     movdqa     xmm4, xmmword ptr kABGRToY
1310     movdqa     xmm5, xmmword ptr kAddY16
1311 
1312  convertloop:
1313     movdqu     xmm0, [eax]
1314     movdqu     xmm1, [eax + 16]
1315     movdqu     xmm2, [eax + 32]
1316     movdqu     xmm3, [eax + 48]
1317     pmaddubsw  xmm0, xmm4
1318     pmaddubsw  xmm1, xmm4
1319     pmaddubsw  xmm2, xmm4
1320     pmaddubsw  xmm3, xmm4
1321     lea        eax, [eax + 64]
1322     phaddw     xmm0, xmm1
1323     phaddw     xmm2, xmm3
1324     psrlw      xmm0, 7
1325     psrlw      xmm2, 7
1326     packuswb   xmm0, xmm2
1327     paddb      xmm0, xmm5
1328     movdqu     [edx], xmm0
1329     lea        edx, [edx + 16]
1330     sub        ecx, 16
1331     jg         convertloop
1332     ret
1333   }
1334 }
1335 
1336 __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
1337                                         uint8_t* dst_y,
1338                                         int width) {
1339   __asm {
1340     mov        eax, [esp + 4] /* src_argb */
1341     mov        edx, [esp + 8] /* dst_y */
1342     mov        ecx, [esp + 12] /* width */
1343     movdqa     xmm4, xmmword ptr kRGBAToY
1344     movdqa     xmm5, xmmword ptr kAddY16
1345 
1346  convertloop:
1347     movdqu     xmm0, [eax]
1348     movdqu     xmm1, [eax + 16]
1349     movdqu     xmm2, [eax + 32]
1350     movdqu     xmm3, [eax + 48]
1351     pmaddubsw  xmm0, xmm4
1352     pmaddubsw  xmm1, xmm4
1353     pmaddubsw  xmm2, xmm4
1354     pmaddubsw  xmm3, xmm4
1355     lea        eax, [eax + 64]
1356     phaddw     xmm0, xmm1
1357     phaddw     xmm2, xmm3
1358     psrlw      xmm0, 7
1359     psrlw      xmm2, 7
1360     packuswb   xmm0, xmm2
1361     paddb      xmm0, xmm5
1362     movdqu     [edx], xmm0
1363     lea        edx, [edx + 16]
1364     sub        ecx, 16
1365     jg         convertloop
1366     ret
1367   }
1368 }
1369 
1370 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
1371                                          int src_stride_argb,
1372                                          uint8_t* dst_u,
1373                                          uint8_t* dst_v,
1374                                          int width) {
1375   __asm {
1376     push       esi
1377     push       edi
1378     mov        eax, [esp + 8 + 4]  // src_argb
1379     mov        esi, [esp + 8 + 8]  // src_stride_argb
1380     mov        edx, [esp + 8 + 12]  // dst_u
1381     mov        edi, [esp + 8 + 16]  // dst_v
1382     mov        ecx, [esp + 8 + 20]  // width
1383     movdqa     xmm5, xmmword ptr kAddUV128
1384     movdqa     xmm6, xmmword ptr kARGBToV
1385     movdqa     xmm7, xmmword ptr kARGBToU
1386     sub        edi, edx  // stride from u to v
1387 
1388  convertloop:
1389          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1390     movdqu     xmm0, [eax]
1391     movdqu     xmm4, [eax + esi]
1392     pavgb      xmm0, xmm4
1393     movdqu     xmm1, [eax + 16]
1394     movdqu     xmm4, [eax + esi + 16]
1395     pavgb      xmm1, xmm4
1396     movdqu     xmm2, [eax + 32]
1397     movdqu     xmm4, [eax + esi + 32]
1398     pavgb      xmm2, xmm4
1399     movdqu     xmm3, [eax + 48]
1400     movdqu     xmm4, [eax + esi + 48]
1401     pavgb      xmm3, xmm4
1402 
1403     lea        eax,  [eax + 64]
1404     movdqa     xmm4, xmm0
1405     shufps     xmm0, xmm1, 0x88
1406     shufps     xmm4, xmm1, 0xdd
1407     pavgb      xmm0, xmm4
1408     movdqa     xmm4, xmm2
1409     shufps     xmm2, xmm3, 0x88
1410     shufps     xmm4, xmm3, 0xdd
1411     pavgb      xmm2, xmm4
1412 
1413         // step 2 - convert to U and V
1414         // from here down is very similar to Y code except
1415         // instead of 16 different pixels, its 8 pixels of U and 8 of V
1416     movdqa     xmm1, xmm0
1417     movdqa     xmm3, xmm2
1418     pmaddubsw  xmm0, xmm7  // U
1419     pmaddubsw  xmm2, xmm7
1420     pmaddubsw  xmm1, xmm6  // V
1421     pmaddubsw  xmm3, xmm6
1422     phaddw     xmm0, xmm2
1423     phaddw     xmm1, xmm3
1424     psraw      xmm0, 8
1425     psraw      xmm1, 8
1426     packsswb   xmm0, xmm1
1427     paddb      xmm0, xmm5  // -> unsigned
1428 
1429         // step 3 - store 8 U and 8 V values
1430     movlps     qword ptr [edx], xmm0  // U
1431     movhps     qword ptr [edx + edi], xmm0  // V
1432     lea        edx, [edx + 8]
1433     sub        ecx, 16
1434     jg         convertloop
1435 
1436     pop        edi
1437     pop        esi
1438     ret
1439   }
1440 }
1441 
1442 __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
1443                                           int src_stride_argb,
1444                                           uint8_t* dst_u,
1445                                           uint8_t* dst_v,
1446                                           int width) {
1447   __asm {
1448     push       esi
1449     push       edi
1450     mov        eax, [esp + 8 + 4]  // src_argb
1451     mov        esi, [esp + 8 + 8]  // src_stride_argb
1452     mov        edx, [esp + 8 + 12]  // dst_u
1453     mov        edi, [esp + 8 + 16]  // dst_v
1454     mov        ecx, [esp + 8 + 20]  // width
1455     movdqa     xmm5, xmmword ptr kAddUVJ128
1456     movdqa     xmm6, xmmword ptr kARGBToVJ
1457     movdqa     xmm7, xmmword ptr kARGBToUJ
1458     sub        edi, edx  // stride from u to v
1459 
1460  convertloop:
1461          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1462     movdqu     xmm0, [eax]
1463     movdqu     xmm4, [eax + esi]
1464     pavgb      xmm0, xmm4
1465     movdqu     xmm1, [eax + 16]
1466     movdqu     xmm4, [eax + esi + 16]
1467     pavgb      xmm1, xmm4
1468     movdqu     xmm2, [eax + 32]
1469     movdqu     xmm4, [eax + esi + 32]
1470     pavgb      xmm2, xmm4
1471     movdqu     xmm3, [eax + 48]
1472     movdqu     xmm4, [eax + esi + 48]
1473     pavgb      xmm3, xmm4
1474 
1475     lea        eax,  [eax + 64]
1476     movdqa     xmm4, xmm0
1477     shufps     xmm0, xmm1, 0x88
1478     shufps     xmm4, xmm1, 0xdd
1479     pavgb      xmm0, xmm4
1480     movdqa     xmm4, xmm2
1481     shufps     xmm2, xmm3, 0x88
1482     shufps     xmm4, xmm3, 0xdd
1483     pavgb      xmm2, xmm4
1484 
1485         // step 2 - convert to U and V
1486         // from here down is very similar to Y code except
1487         // instead of 16 different pixels, its 8 pixels of U and 8 of V
1488     movdqa     xmm1, xmm0
1489     movdqa     xmm3, xmm2
1490     pmaddubsw  xmm0, xmm7  // U
1491     pmaddubsw  xmm2, xmm7
1492     pmaddubsw  xmm1, xmm6  // V
1493     pmaddubsw  xmm3, xmm6
1494     phaddw     xmm0, xmm2
1495     phaddw     xmm1, xmm3
1496     paddw      xmm0, xmm5  // +.5 rounding -> unsigned
1497     paddw      xmm1, xmm5
1498     psraw      xmm0, 8
1499     psraw      xmm1, 8
1500     packsswb   xmm0, xmm1
1501 
1502         // step 3 - store 8 U and 8 V values
1503     movlps     qword ptr [edx], xmm0  // U
1504     movhps     qword ptr [edx + edi], xmm0  // V
1505     lea        edx, [edx + 8]
1506     sub        ecx, 16
1507     jg         convertloop
1508 
1509     pop        edi
1510     pop        esi
1511     ret
1512   }
1513 }
1514 
1515 #ifdef HAS_ARGBTOUVROW_AVX2
1516 __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
1517                                         int src_stride_argb,
1518                                         uint8_t* dst_u,
1519                                         uint8_t* dst_v,
1520                                         int width) {
1521   __asm {
1522     push       esi
1523     push       edi
1524     mov        eax, [esp + 8 + 4]  // src_argb
1525     mov        esi, [esp + 8 + 8]  // src_stride_argb
1526     mov        edx, [esp + 8 + 12]  // dst_u
1527     mov        edi, [esp + 8 + 16]  // dst_v
1528     mov        ecx, [esp + 8 + 20]  // width
1529     vbroadcastf128 ymm5, xmmword ptr kAddUV128
1530     vbroadcastf128 ymm6, xmmword ptr kARGBToV
1531     vbroadcastf128 ymm7, xmmword ptr kARGBToU
1532     sub        edi, edx   // stride from u to v
1533 
1534  convertloop:
1535         /* step 1 - subsample 32x2 argb pixels to 16x1 */
1536     vmovdqu    ymm0, [eax]
1537     vmovdqu    ymm1, [eax + 32]
1538     vmovdqu    ymm2, [eax + 64]
1539     vmovdqu    ymm3, [eax + 96]
1540     vpavgb     ymm0, ymm0, [eax + esi]
1541     vpavgb     ymm1, ymm1, [eax + esi + 32]
1542     vpavgb     ymm2, ymm2, [eax + esi + 64]
1543     vpavgb     ymm3, ymm3, [eax + esi + 96]
1544     lea        eax,  [eax + 128]
1545     vshufps    ymm4, ymm0, ymm1, 0x88
1546     vshufps    ymm0, ymm0, ymm1, 0xdd
1547     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1548     vshufps    ymm4, ymm2, ymm3, 0x88
1549     vshufps    ymm2, ymm2, ymm3, 0xdd
1550     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1551 
1552         // step 2 - convert to U and V
1553         // from here down is very similar to Y code except
1554         // instead of 32 different pixels, its 16 pixels of U and 16 of V
1555     vpmaddubsw ymm1, ymm0, ymm7  // U
1556     vpmaddubsw ymm3, ymm2, ymm7
1557     vpmaddubsw ymm0, ymm0, ymm6  // V
1558     vpmaddubsw ymm2, ymm2, ymm6
1559     vphaddw    ymm1, ymm1, ymm3  // mutates
1560     vphaddw    ymm0, ymm0, ymm2
1561     vpsraw     ymm1, ymm1, 8
1562     vpsraw     ymm0, ymm0, 8
1563     vpacksswb  ymm0, ymm1, ymm0  // mutates
1564     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1565     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1566     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1567 
1568         // step 3 - store 16 U and 16 V values
1569     vextractf128 [edx], ymm0, 0  // U
1570     vextractf128 [edx + edi], ymm0, 1  // V
1571     lea        edx, [edx + 16]
1572     sub        ecx, 32
1573     jg         convertloop
1574 
1575     pop        edi
1576     pop        esi
1577     vzeroupper
1578     ret
1579   }
1580 }
1581 #endif  // HAS_ARGBTOUVROW_AVX2
1582 
1583 #ifdef HAS_ARGBTOUVJROW_AVX2
1584 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
1585                                          int src_stride_argb,
1586                                          uint8_t* dst_u,
1587                                          uint8_t* dst_v,
1588                                          int width) {
1589   __asm {
1590     push       esi
1591     push       edi
1592     mov        eax, [esp + 8 + 4]  // src_argb
1593     mov        esi, [esp + 8 + 8]  // src_stride_argb
1594     mov        edx, [esp + 8 + 12]  // dst_u
1595     mov        edi, [esp + 8 + 16]  // dst_v
1596     mov        ecx, [esp + 8 + 20]  // width
1597     vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
1598     vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
1599     vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
1600     sub        edi, edx   // stride from u to v
1601 
1602  convertloop:
1603         /* step 1 - subsample 32x2 argb pixels to 16x1 */
1604     vmovdqu    ymm0, [eax]
1605     vmovdqu    ymm1, [eax + 32]
1606     vmovdqu    ymm2, [eax + 64]
1607     vmovdqu    ymm3, [eax + 96]
1608     vpavgb     ymm0, ymm0, [eax + esi]
1609     vpavgb     ymm1, ymm1, [eax + esi + 32]
1610     vpavgb     ymm2, ymm2, [eax + esi + 64]
1611     vpavgb     ymm3, ymm3, [eax + esi + 96]
1612     lea        eax,  [eax + 128]
1613     vshufps    ymm4, ymm0, ymm1, 0x88
1614     vshufps    ymm0, ymm0, ymm1, 0xdd
1615     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1616     vshufps    ymm4, ymm2, ymm3, 0x88
1617     vshufps    ymm2, ymm2, ymm3, 0xdd
1618     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1619 
1620         // step 2 - convert to U and V
1621         // from here down is very similar to Y code except
1622         // instead of 32 different pixels, its 16 pixels of U and 16 of V
1623     vpmaddubsw ymm1, ymm0, ymm7  // U
1624     vpmaddubsw ymm3, ymm2, ymm7
1625     vpmaddubsw ymm0, ymm0, ymm6  // V
1626     vpmaddubsw ymm2, ymm2, ymm6
1627     vphaddw    ymm1, ymm1, ymm3  // mutates
1628     vphaddw    ymm0, ymm0, ymm2
1629     vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
1630     vpaddw     ymm0, ymm0, ymm5
1631     vpsraw     ymm1, ymm1, 8
1632     vpsraw     ymm0, ymm0, 8
1633     vpacksswb  ymm0, ymm1, ymm0  // mutates
1634     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1635     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1636 
1637         // step 3 - store 16 U and 16 V values
1638     vextractf128 [edx], ymm0, 0  // U
1639     vextractf128 [edx + edi], ymm0, 1  // V
1640     lea        edx, [edx + 16]
1641     sub        ecx, 32
1642     jg         convertloop
1643 
1644     pop        edi
1645     pop        esi
1646     vzeroupper
1647     ret
1648   }
1649 }
1650 #endif  // HAS_ARGBTOUVJROW_AVX2
1651 
1652 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
1653                                             uint8_t* dst_u,
1654                                             uint8_t* dst_v,
1655                                             int width) {
1656   __asm {
1657     push       edi
1658     mov        eax, [esp + 4 + 4]  // src_argb
1659     mov        edx, [esp + 4 + 8]  // dst_u
1660     mov        edi, [esp + 4 + 12]  // dst_v
1661     mov        ecx, [esp + 4 + 16]  // width
1662     movdqa     xmm5, xmmword ptr kAddUV128
1663     movdqa     xmm6, xmmword ptr kARGBToV
1664     movdqa     xmm7, xmmword ptr kARGBToU
1665     sub        edi, edx    // stride from u to v
1666 
1667  convertloop:
1668         /* convert to U and V */
1669     movdqu     xmm0, [eax]  // U
1670     movdqu     xmm1, [eax + 16]
1671     movdqu     xmm2, [eax + 32]
1672     movdqu     xmm3, [eax + 48]
1673     pmaddubsw  xmm0, xmm7
1674     pmaddubsw  xmm1, xmm7
1675     pmaddubsw  xmm2, xmm7
1676     pmaddubsw  xmm3, xmm7
1677     phaddw     xmm0, xmm1
1678     phaddw     xmm2, xmm3
1679     psraw      xmm0, 8
1680     psraw      xmm2, 8
1681     packsswb   xmm0, xmm2
1682     paddb      xmm0, xmm5
1683     movdqu     [edx], xmm0
1684 
1685     movdqu     xmm0, [eax]  // V
1686     movdqu     xmm1, [eax + 16]
1687     movdqu     xmm2, [eax + 32]
1688     movdqu     xmm3, [eax + 48]
1689     pmaddubsw  xmm0, xmm6
1690     pmaddubsw  xmm1, xmm6
1691     pmaddubsw  xmm2, xmm6
1692     pmaddubsw  xmm3, xmm6
1693     phaddw     xmm0, xmm1
1694     phaddw     xmm2, xmm3
1695     psraw      xmm0, 8
1696     psraw      xmm2, 8
1697     packsswb   xmm0, xmm2
1698     paddb      xmm0, xmm5
1699     lea        eax,  [eax + 64]
1700     movdqu     [edx + edi], xmm0
1701     lea        edx,  [edx + 16]
1702     sub        ecx,  16
1703     jg         convertloop
1704 
1705     pop        edi
1706     ret
1707   }
1708 }
1709 
1710 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
1711                                          int src_stride_argb,
1712                                          uint8_t* dst_u,
1713                                          uint8_t* dst_v,
1714                                          int width) {
1715   __asm {
1716     push       esi
1717     push       edi
1718     mov        eax, [esp + 8 + 4]  // src_argb
1719     mov        esi, [esp + 8 + 8]  // src_stride_argb
1720     mov        edx, [esp + 8 + 12]  // dst_u
1721     mov        edi, [esp + 8 + 16]  // dst_v
1722     mov        ecx, [esp + 8 + 20]  // width
1723     movdqa     xmm5, xmmword ptr kAddUV128
1724     movdqa     xmm6, xmmword ptr kBGRAToV
1725     movdqa     xmm7, xmmword ptr kBGRAToU
1726     sub        edi, edx  // stride from u to v
1727 
1728  convertloop:
1729          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1730     movdqu     xmm0, [eax]
1731     movdqu     xmm4, [eax + esi]
1732     pavgb      xmm0, xmm4
1733     movdqu     xmm1, [eax + 16]
1734     movdqu     xmm4, [eax + esi + 16]
1735     pavgb      xmm1, xmm4
1736     movdqu     xmm2, [eax + 32]
1737     movdqu     xmm4, [eax + esi + 32]
1738     pavgb      xmm2, xmm4
1739     movdqu     xmm3, [eax + 48]
1740     movdqu     xmm4, [eax + esi + 48]
1741     pavgb      xmm3, xmm4
1742 
1743     lea        eax,  [eax + 64]
1744     movdqa     xmm4, xmm0
1745     shufps     xmm0, xmm1, 0x88
1746     shufps     xmm4, xmm1, 0xdd
1747     pavgb      xmm0, xmm4
1748     movdqa     xmm4, xmm2
1749     shufps     xmm2, xmm3, 0x88
1750     shufps     xmm4, xmm3, 0xdd
1751     pavgb      xmm2, xmm4
1752 
1753         // step 2 - convert to U and V
1754         // from here down is very similar to Y code except
1755         // instead of 16 different pixels, its 8 pixels of U and 8 of V
1756     movdqa     xmm1, xmm0
1757     movdqa     xmm3, xmm2
1758     pmaddubsw  xmm0, xmm7  // U
1759     pmaddubsw  xmm2, xmm7
1760     pmaddubsw  xmm1, xmm6  // V
1761     pmaddubsw  xmm3, xmm6
1762     phaddw     xmm0, xmm2
1763     phaddw     xmm1, xmm3
1764     psraw      xmm0, 8
1765     psraw      xmm1, 8
1766     packsswb   xmm0, xmm1
1767     paddb      xmm0, xmm5  // -> unsigned
1768 
1769         // step 3 - store 8 U and 8 V values
1770     movlps     qword ptr [edx], xmm0  // U
1771     movhps     qword ptr [edx + edi], xmm0  // V
1772     lea        edx, [edx + 8]
1773     sub        ecx, 16
1774     jg         convertloop
1775 
1776     pop        edi
1777     pop        esi
1778     ret
1779   }
1780 }
1781 
1782 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
1783                                          int src_stride_argb,
1784                                          uint8_t* dst_u,
1785                                          uint8_t* dst_v,
1786                                          int width) {
1787   __asm {
1788     push       esi
1789     push       edi
1790     mov        eax, [esp + 8 + 4]  // src_argb
1791     mov        esi, [esp + 8 + 8]  // src_stride_argb
1792     mov        edx, [esp + 8 + 12]  // dst_u
1793     mov        edi, [esp + 8 + 16]  // dst_v
1794     mov        ecx, [esp + 8 + 20]  // width
1795     movdqa     xmm5, xmmword ptr kAddUV128
1796     movdqa     xmm6, xmmword ptr kABGRToV
1797     movdqa     xmm7, xmmword ptr kABGRToU
1798     sub        edi, edx  // stride from u to v
1799 
1800  convertloop:
1801          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1802     movdqu     xmm0, [eax]
1803     movdqu     xmm4, [eax + esi]
1804     pavgb      xmm0, xmm4
1805     movdqu     xmm1, [eax + 16]
1806     movdqu     xmm4, [eax + esi + 16]
1807     pavgb      xmm1, xmm4
1808     movdqu     xmm2, [eax + 32]
1809     movdqu     xmm4, [eax + esi + 32]
1810     pavgb      xmm2, xmm4
1811     movdqu     xmm3, [eax + 48]
1812     movdqu     xmm4, [eax + esi + 48]
1813     pavgb      xmm3, xmm4
1814 
1815     lea        eax,  [eax + 64]
1816     movdqa     xmm4, xmm0
1817     shufps     xmm0, xmm1, 0x88
1818     shufps     xmm4, xmm1, 0xdd
1819     pavgb      xmm0, xmm4
1820     movdqa     xmm4, xmm2
1821     shufps     xmm2, xmm3, 0x88
1822     shufps     xmm4, xmm3, 0xdd
1823     pavgb      xmm2, xmm4
1824 
1825         // step 2 - convert to U and V
1826         // from here down is very similar to Y code except
1827         // instead of 16 different pixels, its 8 pixels of U and 8 of V
1828     movdqa     xmm1, xmm0
1829     movdqa     xmm3, xmm2
1830     pmaddubsw  xmm0, xmm7  // U
1831     pmaddubsw  xmm2, xmm7
1832     pmaddubsw  xmm1, xmm6  // V
1833     pmaddubsw  xmm3, xmm6
1834     phaddw     xmm0, xmm2
1835     phaddw     xmm1, xmm3
1836     psraw      xmm0, 8
1837     psraw      xmm1, 8
1838     packsswb   xmm0, xmm1
1839     paddb      xmm0, xmm5  // -> unsigned
1840 
1841         // step 3 - store 8 U and 8 V values
1842     movlps     qword ptr [edx], xmm0  // U
1843     movhps     qword ptr [edx + edi], xmm0  // V
1844     lea        edx, [edx + 8]
1845     sub        ecx, 16
1846     jg         convertloop
1847 
1848     pop        edi
1849     pop        esi
1850     ret
1851   }
1852 }
1853 
1854 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
1855                                          int src_stride_argb,
1856                                          uint8_t* dst_u,
1857                                          uint8_t* dst_v,
1858                                          int width) {
1859   __asm {
1860     push       esi
1861     push       edi
1862     mov        eax, [esp + 8 + 4]  // src_argb
1863     mov        esi, [esp + 8 + 8]  // src_stride_argb
1864     mov        edx, [esp + 8 + 12]  // dst_u
1865     mov        edi, [esp + 8 + 16]  // dst_v
1866     mov        ecx, [esp + 8 + 20]  // width
1867     movdqa     xmm5, xmmword ptr kAddUV128
1868     movdqa     xmm6, xmmword ptr kRGBAToV
1869     movdqa     xmm7, xmmword ptr kRGBAToU
1870     sub        edi, edx  // stride from u to v
1871 
1872  convertloop:
1873          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874     movdqu     xmm0, [eax]
1875     movdqu     xmm4, [eax + esi]
1876     pavgb      xmm0, xmm4
1877     movdqu     xmm1, [eax + 16]
1878     movdqu     xmm4, [eax + esi + 16]
1879     pavgb      xmm1, xmm4
1880     movdqu     xmm2, [eax + 32]
1881     movdqu     xmm4, [eax + esi + 32]
1882     pavgb      xmm2, xmm4
1883     movdqu     xmm3, [eax + 48]
1884     movdqu     xmm4, [eax + esi + 48]
1885     pavgb      xmm3, xmm4
1886 
1887     lea        eax,  [eax + 64]
1888     movdqa     xmm4, xmm0
1889     shufps     xmm0, xmm1, 0x88
1890     shufps     xmm4, xmm1, 0xdd
1891     pavgb      xmm0, xmm4
1892     movdqa     xmm4, xmm2
1893     shufps     xmm2, xmm3, 0x88
1894     shufps     xmm4, xmm3, 0xdd
1895     pavgb      xmm2, xmm4
1896 
1897         // step 2 - convert to U and V
1898         // from here down is very similar to Y code except
1899         // instead of 16 different pixels, its 8 pixels of U and 8 of V
1900     movdqa     xmm1, xmm0
1901     movdqa     xmm3, xmm2
1902     pmaddubsw  xmm0, xmm7  // U
1903     pmaddubsw  xmm2, xmm7
1904     pmaddubsw  xmm1, xmm6  // V
1905     pmaddubsw  xmm3, xmm6
1906     phaddw     xmm0, xmm2
1907     phaddw     xmm1, xmm3
1908     psraw      xmm0, 8
1909     psraw      xmm1, 8
1910     packsswb   xmm0, xmm1
1911     paddb      xmm0, xmm5  // -> unsigned
1912 
1913         // step 3 - store 8 U and 8 V values
1914     movlps     qword ptr [edx], xmm0  // U
1915     movhps     qword ptr [edx + edi], xmm0  // V
1916     lea        edx, [edx + 8]
1917     sub        ecx, 16
1918     jg         convertloop
1919 
1920     pop        edi
1921     pop        esi
1922     ret
1923   }
1924 }
1925 #endif  // HAS_ARGBTOYROW_SSSE3
1926 
1927 // Read 16 UV from 444
1928 #define READYUV444_AVX2 \
1929   __asm {                                                \
1930     __asm vmovdqu    xmm0, [esi] /* U */                      \
1931     __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
1932     __asm lea        esi,  [esi + 16]                                          \
1933     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1934     __asm vpermq     ymm1, ymm1, 0xd8                                          \
1935     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1936     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1937     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1938     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1939     __asm lea        eax, [eax + 16]}
1940 
1941 // Read 8 UV from 422, upsample to 16 UV.
1942 #define READYUV422_AVX2 \
1943   __asm {                                                \
1944     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
1945     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
1946     __asm lea        esi,  [esi + 8]                                           \
1947     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1948     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1949     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1950     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1951     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1952     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1953     __asm lea        eax, [eax + 16]}
1954 
1955 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
1956 #define READYUVA422_AVX2 \
1957   __asm {                                               \
1958     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
1959     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
1960     __asm lea        esi,  [esi + 8]                                           \
1961     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1962     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1963     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1964     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1965     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1966     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1967     __asm lea        eax, [eax + 16]                                           \
1968     __asm vmovdqu    xmm5, [ebp] /* A */                      \
1969     __asm vpermq     ymm5, ymm5, 0xd8                                          \
1970     __asm lea        ebp, [ebp + 16]}
1971 
1972 // Read 8 UV from NV12, upsample to 16 UV.
1973 #define READNV12_AVX2 \
1974   __asm {                                                  \
1975     __asm vmovdqu    xmm0, [esi] /* UV */                     \
1976     __asm lea        esi,  [esi + 16]                                          \
1977     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1978     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1979     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1980     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1981     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1982     __asm lea        eax, [eax + 16]}
1983 
1984 // Read 8 UV from NV21, upsample to 16 UV.
1985 #define READNV21_AVX2 \
1986   __asm {                                                  \
1987     __asm vmovdqu    xmm0, [esi] /* UV */                     \
1988     __asm lea        esi,  [esi + 16]                                          \
1989     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1990     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
1991     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1992     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1993     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1994     __asm lea        eax, [eax + 16]}
1995 
1996 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
1997 #define READYUY2_AVX2 \
1998   __asm {                                                  \
1999     __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
2000     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
2001     __asm vmovdqu    ymm0, [eax] /* UV */                             \
2002     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
2003     __asm lea        eax, [eax + 32]}
2004 
2005 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2006 #define READUYVY_AVX2 \
2007   __asm {                                                  \
2008     __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
2009     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
2010     __asm vmovdqu    ymm0, [eax] /* UV */                             \
2011     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
2012     __asm lea        eax, [eax + 32]}
2013 
2014 // Convert 16 pixels: 16 UV and 16 Y.
2015 #define YUVTORGB_AVX2(YuvConstants) \
2016   __asm {                                    \
2017     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
2018     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
2019     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
2020     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
2021     __asm vpsubw     ymm2, ymm3, ymm2                                          \
2022     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
2023     __asm vpsubw     ymm1, ymm3, ymm1                                          \
2024     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
2025     __asm vpsubw     ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */                       \
2026     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
2027     __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
2028     __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
2029     __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
2030     __asm vpsraw     ymm0, ymm0, 6                                             \
2031     __asm vpsraw     ymm1, ymm1, 6                                             \
2032     __asm vpsraw     ymm2, ymm2, 6                                             \
2033     __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
2034     __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
2035     __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
2036   }
2037 
2038 // Store 16 ARGB values.
2039 #define STOREARGB_AVX2 \
2040   __asm {                                                 \
2041     __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
2042     __asm vpermq     ymm0, ymm0, 0xd8                                          \
2043     __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
2044     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2045     __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
2046     __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
2047     __asm vmovdqu    0[edx], ymm1                                              \
2048     __asm vmovdqu    32[edx], ymm0                                             \
2049     __asm lea        edx,  [edx + 64]}
2050 
2051 // Store 16 RGBA values.
2052 #define STORERGBA_AVX2 \
2053   __asm {                                                 \
2054     __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
2055     __asm vpermq     ymm1, ymm1, 0xd8                                          \
2056     __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
2057     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2058     __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
2059     __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
2060     __asm vmovdqu    [edx], ymm0                                               \
2061     __asm vmovdqu    [edx + 32], ymm1                                          \
2062     __asm lea        edx,  [edx + 64]}
2063 
2064 #ifdef HAS_I422TOARGBROW_AVX2
2065 // 16 pixels
2066 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2067 __declspec(naked) void I422ToARGBRow_AVX2(
2068     const uint8_t* y_buf,
2069     const uint8_t* u_buf,
2070     const uint8_t* v_buf,
2071     uint8_t* dst_argb,
2072     const struct YuvConstants* yuvconstants,
2073     int width) {
2074   __asm {
2075     push       esi
2076     push       edi
2077     push       ebx
2078     mov        eax, [esp + 12 + 4]  // Y
2079     mov        esi, [esp + 12 + 8]  // U
2080     mov        edi, [esp + 12 + 12]  // V
2081     mov        edx, [esp + 12 + 16]  // argb
2082     mov        ebx, [esp + 12 + 20]  // yuvconstants
2083     mov        ecx, [esp + 12 + 24]  // width
2084     sub        edi, esi
2085     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2086 
2087  convertloop:
2088     READYUV422_AVX2
2089     YUVTORGB_AVX2(ebx)
2090     STOREARGB_AVX2
2091 
2092     sub        ecx, 16
2093     jg         convertloop
2094 
2095     pop        ebx
2096     pop        edi
2097     pop        esi
2098     vzeroupper
2099     ret
2100   }
2101 }
2102 #endif  // HAS_I422TOARGBROW_AVX2
2103 
2104 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2105 // 16 pixels
2106 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2107 __declspec(naked) void I422AlphaToARGBRow_AVX2(
2108     const uint8_t* y_buf,
2109     const uint8_t* u_buf,
2110     const uint8_t* v_buf,
2111     const uint8_t* a_buf,
2112     uint8_t* dst_argb,
2113     const struct YuvConstants* yuvconstants,
2114     int width) {
2115   __asm {
2116     push       esi
2117     push       edi
2118     push       ebx
2119     push       ebp
2120     mov        eax, [esp + 16 + 4]  // Y
2121     mov        esi, [esp + 16 + 8]  // U
2122     mov        edi, [esp + 16 + 12]  // V
2123     mov        ebp, [esp + 16 + 16]  // A
2124     mov        edx, [esp + 16 + 20]  // argb
2125     mov        ebx, [esp + 16 + 24]  // yuvconstants
2126     mov        ecx, [esp + 16 + 28]  // width
2127     sub        edi, esi
2128 
2129  convertloop:
2130     READYUVA422_AVX2
2131     YUVTORGB_AVX2(ebx)
2132     STOREARGB_AVX2
2133 
2134     sub        ecx, 16
2135     jg         convertloop
2136 
2137     pop        ebp
2138     pop        ebx
2139     pop        edi
2140     pop        esi
2141     vzeroupper
2142     ret
2143   }
2144 }
2145 #endif  // HAS_I422ALPHATOARGBROW_AVX2
2146 
2147 #ifdef HAS_I444TOARGBROW_AVX2
2148 // 16 pixels
2149 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2150 __declspec(naked) void I444ToARGBRow_AVX2(
2151     const uint8_t* y_buf,
2152     const uint8_t* u_buf,
2153     const uint8_t* v_buf,
2154     uint8_t* dst_argb,
2155     const struct YuvConstants* yuvconstants,
2156     int width) {
2157   __asm {
2158     push       esi
2159     push       edi
2160     push       ebx
2161     mov        eax, [esp + 12 + 4]  // Y
2162     mov        esi, [esp + 12 + 8]  // U
2163     mov        edi, [esp + 12 + 12]  // V
2164     mov        edx, [esp + 12 + 16]  // argb
2165     mov        ebx, [esp + 12 + 20]  // yuvconstants
2166     mov        ecx, [esp + 12 + 24]  // width
2167     sub        edi, esi
2168     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2169  convertloop:
2170     READYUV444_AVX2
2171     YUVTORGB_AVX2(ebx)
2172     STOREARGB_AVX2
2173 
2174     sub        ecx, 16
2175     jg         convertloop
2176 
2177     pop        ebx
2178     pop        edi
2179     pop        esi
2180     vzeroupper
2181     ret
2182   }
2183 }
2184 #endif  // HAS_I444TOARGBROW_AVX2
2185 
2186 #ifdef HAS_NV12TOARGBROW_AVX2
2187 // 16 pixels.
2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2189 __declspec(naked) void NV12ToARGBRow_AVX2(
2190     const uint8_t* y_buf,
2191     const uint8_t* uv_buf,
2192     uint8_t* dst_argb,
2193     const struct YuvConstants* yuvconstants,
2194     int width) {
2195   __asm {
2196     push       esi
2197     push       ebx
2198     mov        eax, [esp + 8 + 4]  // Y
2199     mov        esi, [esp + 8 + 8]  // UV
2200     mov        edx, [esp + 8 + 12]  // argb
2201     mov        ebx, [esp + 8 + 16]  // yuvconstants
2202     mov        ecx, [esp + 8 + 20]  // width
2203     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2204 
2205  convertloop:
2206     READNV12_AVX2
2207     YUVTORGB_AVX2(ebx)
2208     STOREARGB_AVX2
2209 
2210     sub        ecx, 16
2211     jg         convertloop
2212 
2213     pop        ebx
2214     pop        esi
2215     vzeroupper
2216     ret
2217   }
2218 }
2219 #endif  // HAS_NV12TOARGBROW_AVX2
2220 
2221 #ifdef HAS_NV21TOARGBROW_AVX2
2222 // 16 pixels.
2223 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2224 __declspec(naked) void NV21ToARGBRow_AVX2(
2225     const uint8_t* y_buf,
2226     const uint8_t* vu_buf,
2227     uint8_t* dst_argb,
2228     const struct YuvConstants* yuvconstants,
2229     int width) {
2230   __asm {
2231     push       esi
2232     push       ebx
2233     mov        eax, [esp + 8 + 4]  // Y
2234     mov        esi, [esp + 8 + 8]  // VU
2235     mov        edx, [esp + 8 + 12]  // argb
2236     mov        ebx, [esp + 8 + 16]  // yuvconstants
2237     mov        ecx, [esp + 8 + 20]  // width
2238     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2239 
2240  convertloop:
2241     READNV21_AVX2
2242     YUVTORGB_AVX2(ebx)
2243     STOREARGB_AVX2
2244 
2245     sub        ecx, 16
2246     jg         convertloop
2247 
2248     pop        ebx
2249     pop        esi
2250     vzeroupper
2251     ret
2252   }
2253 }
2254 #endif  // HAS_NV21TOARGBROW_AVX2
2255 
2256 #ifdef HAS_YUY2TOARGBROW_AVX2
2257 // 16 pixels.
2258 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2259 __declspec(naked) void YUY2ToARGBRow_AVX2(
2260     const uint8_t* src_yuy2,
2261     uint8_t* dst_argb,
2262     const struct YuvConstants* yuvconstants,
2263     int width) {
2264   __asm {
2265     push       ebx
2266     mov        eax, [esp + 4 + 4]  // yuy2
2267     mov        edx, [esp + 4 + 8]  // argb
2268     mov        ebx, [esp + 4 + 12]  // yuvconstants
2269     mov        ecx, [esp + 4 + 16]  // width
2270     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2271 
2272  convertloop:
2273     READYUY2_AVX2
2274     YUVTORGB_AVX2(ebx)
2275     STOREARGB_AVX2
2276 
2277     sub        ecx, 16
2278     jg         convertloop
2279 
2280     pop        ebx
2281     vzeroupper
2282     ret
2283   }
2284 }
2285 #endif  // HAS_YUY2TOARGBROW_AVX2
2286 
2287 #ifdef HAS_UYVYTOARGBROW_AVX2
2288 // 16 pixels.
2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2290 __declspec(naked) void UYVYToARGBRow_AVX2(
2291     const uint8_t* src_uyvy,
2292     uint8_t* dst_argb,
2293     const struct YuvConstants* yuvconstants,
2294     int width) {
2295   __asm {
2296     push       ebx
2297     mov        eax, [esp + 4 + 4]  // uyvy
2298     mov        edx, [esp + 4 + 8]  // argb
2299     mov        ebx, [esp + 4 + 12]  // yuvconstants
2300     mov        ecx, [esp + 4 + 16]  // width
2301     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2302 
2303  convertloop:
2304     READUYVY_AVX2
2305     YUVTORGB_AVX2(ebx)
2306     STOREARGB_AVX2
2307 
2308     sub        ecx, 16
2309     jg         convertloop
2310 
2311     pop        ebx
2312     vzeroupper
2313     ret
2314   }
2315 }
2316 #endif  // HAS_UYVYTOARGBROW_AVX2
2317 
2318 #ifdef HAS_I422TORGBAROW_AVX2
2319 // 16 pixels
2320 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2321 __declspec(naked) void I422ToRGBARow_AVX2(
2322     const uint8_t* y_buf,
2323     const uint8_t* u_buf,
2324     const uint8_t* v_buf,
2325     uint8_t* dst_argb,
2326     const struct YuvConstants* yuvconstants,
2327     int width) {
2328   __asm {
2329     push       esi
2330     push       edi
2331     push       ebx
2332     mov        eax, [esp + 12 + 4]  // Y
2333     mov        esi, [esp + 12 + 8]  // U
2334     mov        edi, [esp + 12 + 12]  // V
2335     mov        edx, [esp + 12 + 16]  // abgr
2336     mov        ebx, [esp + 12 + 20]  // yuvconstants
2337     mov        ecx, [esp + 12 + 24]  // width
2338     sub        edi, esi
2339     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2340 
2341  convertloop:
2342     READYUV422_AVX2
2343     YUVTORGB_AVX2(ebx)
2344     STORERGBA_AVX2
2345 
2346     sub        ecx, 16
2347     jg         convertloop
2348 
2349     pop        ebx
2350     pop        edi
2351     pop        esi
2352     vzeroupper
2353     ret
2354   }
2355 }
2356 #endif  // HAS_I422TORGBAROW_AVX2
2357 
2358 #if defined(HAS_I422TOARGBROW_SSSE3)
2359 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2360 // Allows a conversion with half size scaling.
2361 
2362 // Read 8 UV from 444.
2363 #define READYUV444 \
2364   __asm {                                                     \
2365     __asm movq       xmm0, qword ptr [esi] /* U */                             \
2366     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
2367     __asm lea        esi,  [esi + 8]                                           \
2368     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2369     __asm movq       xmm4, qword ptr [eax]                                     \
2370     __asm punpcklbw  xmm4, xmm4                                                \
2371     __asm lea        eax, [eax + 8]}
2372 
2373 // Read 4 UV from 422, upsample to 8 UV.
2374 #define READYUV422 \
2375   __asm {                                                     \
2376     __asm movd       xmm0, [esi] /* U */                              \
2377     __asm movd       xmm1, [esi + edi] /* V */                              \
2378     __asm lea        esi,  [esi + 4]                                           \
2379     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2380     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2381     __asm movq       xmm4, qword ptr [eax]                                     \
2382     __asm punpcklbw  xmm4, xmm4                                                \
2383     __asm lea        eax, [eax + 8]}
2384 
2385 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
2386 #define READYUVA422 \
2387   __asm {                                                    \
2388     __asm movd       xmm0, [esi] /* U */                              \
2389     __asm movd       xmm1, [esi + edi] /* V */                              \
2390     __asm lea        esi,  [esi + 4]                                           \
2391     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2392     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2393     __asm movq       xmm4, qword ptr [eax] /* Y */                           \
2394     __asm punpcklbw  xmm4, xmm4                                                \
2395     __asm lea        eax, [eax + 8]                                            \
2396     __asm movq       xmm5, qword ptr [ebp] /* A */                           \
2397     __asm lea        ebp, [ebp + 8]}
2398 
2399 // Read 4 UV from NV12, upsample to 8 UV.
2400 #define READNV12 \
2401   __asm {                                                       \
2402     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2403     __asm lea        esi,  [esi + 8]                                           \
2404     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2405     __asm movq       xmm4, qword ptr [eax]                                     \
2406     __asm punpcklbw  xmm4, xmm4                                                \
2407     __asm lea        eax, [eax + 8]}
2408 
2409 // Read 4 VU from NV21, upsample to 8 UV.
2410 #define READNV21 \
2411   __asm {                                                       \
2412     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2413     __asm lea        esi,  [esi + 8]                                           \
2414     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
2415     __asm movq       xmm4, qword ptr [eax]                                     \
2416     __asm punpcklbw  xmm4, xmm4                                                \
2417     __asm lea        eax, [eax + 8]}
2418 
2419 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2420 #define READYUY2 \
2421   __asm {                                                       \
2422     __asm movdqu     xmm4, [eax] /* YUY2 */                           \
2423     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
2424     __asm movdqu     xmm0, [eax] /* UV */                             \
2425     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
2426     __asm lea        eax, [eax + 16]}
2427 
2428 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2429 #define READUYVY \
2430   __asm {                                                       \
2431     __asm movdqu     xmm4, [eax] /* UYVY */                           \
2432     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
2433     __asm movdqu     xmm0, [eax] /* UV */                             \
2434     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
2435     __asm lea        eax, [eax + 16]}
2436 
2437 // Convert 8 pixels: 8 UV and 8 Y.
2438 #define YUVTORGB(YuvConstants) \
2439   __asm {                                         \
2440     __asm movdqa     xmm1, xmm0                                                \
2441     __asm movdqa     xmm2, xmm0                                                \
2442     __asm movdqa     xmm3, xmm0                                                \
2443     __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
2444     __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
2445     __asm psubw      xmm0, xmm1                                                \
2446     __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
2447     __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
2448     __asm psubw      xmm1, xmm2                                                \
2449     __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
2450     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
2451     __asm psubw      xmm2, xmm3                                                \
2452     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
2453     __asm paddsw     xmm0, xmm4 /* B += Y */                         \
2454     __asm paddsw     xmm1, xmm4 /* G += Y */                         \
2455     __asm paddsw     xmm2, xmm4 /* R += Y */                         \
2456     __asm psraw      xmm0, 6                                                   \
2457     __asm psraw      xmm1, 6                                                   \
2458     __asm psraw      xmm2, 6                                                   \
2459     __asm packuswb   xmm0, xmm0 /* B */                              \
2460     __asm packuswb   xmm1, xmm1 /* G */                              \
2461     __asm packuswb   xmm2, xmm2 /* R */             \
2462   }
2463 
2464 // Store 8 ARGB values.
2465 #define STOREARGB \
2466   __asm {                                                      \
2467     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2468     __asm punpcklbw  xmm2, xmm5 /* RA */                             \
2469     __asm movdqa     xmm1, xmm0                                                \
2470     __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
2471     __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
2472     __asm movdqu     0[edx], xmm0                                              \
2473     __asm movdqu     16[edx], xmm1                                             \
2474     __asm lea        edx,  [edx + 32]}
2475 
2476 // Store 8 BGRA values.
2477 #define STOREBGRA \
2478   __asm {                                                      \
2479     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
2480     __asm punpcklbw  xmm1, xmm0 /* GB */                             \
2481     __asm punpcklbw  xmm5, xmm2 /* AR */                             \
2482     __asm movdqa     xmm0, xmm5                                                \
2483     __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
2484     __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
2485     __asm movdqu     0[edx], xmm5                                              \
2486     __asm movdqu     16[edx], xmm0                                             \
2487     __asm lea        edx,  [edx + 32]}
2488 
2489 // Store 8 RGBA values.
2490 #define STORERGBA \
2491   __asm {                                                      \
2492     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
2493     __asm punpcklbw  xmm1, xmm2 /* GR */                             \
2494     __asm punpcklbw  xmm5, xmm0 /* AB */                             \
2495     __asm movdqa     xmm0, xmm5                                                \
2496     __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
2497     __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
2498     __asm movdqu     0[edx], xmm5                                              \
2499     __asm movdqu     16[edx], xmm0                                             \
2500     __asm lea        edx,  [edx + 32]}
2501 
2502 // Store 8 RGB24 values.
2503 #define STORERGB24 \
2504   __asm {/* Weave into RRGB */                                                      \
2505     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2506     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
2507     __asm movdqa     xmm1, xmm0                                                \
2508     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
2509     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */                                                        \
2510     __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2511     __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
2512     __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2513     __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
2514     __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
2515     __asm lea        edx,  [edx + 24]}
2516 
2517 // Store 8 RGB565 values.
2518 #define STORERGB565 \
2519   __asm {/* Weave into RRGB */                                                      \
2520     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2521     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
2522     __asm movdqa     xmm1, xmm0                                                \
2523     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
2524     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */                                                       \
2525     __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
2526     __asm movdqa     xmm2, xmm0 /* G */                                     \
2527     __asm pslld      xmm0, 8 /* R */                                     \
2528     __asm psrld      xmm3, 3 /* B */                                     \
2529     __asm psrld      xmm2, 5 /* G */                                     \
2530     __asm psrad      xmm0, 16 /* R */                                     \
2531     __asm pand       xmm3, xmm5 /* B */                                     \
2532     __asm pand       xmm2, xmm6 /* G */                                     \
2533     __asm pand       xmm0, xmm7 /* R */                                     \
2534     __asm por        xmm3, xmm2 /* BG */                                    \
2535     __asm por        xmm0, xmm3 /* BGR */                                   \
2536     __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
2537     __asm movdqa     xmm2, xmm1 /* G */                                     \
2538     __asm pslld      xmm1, 8 /* R */                                     \
2539     __asm psrld      xmm3, 3 /* B */                                     \
2540     __asm psrld      xmm2, 5 /* G */                                     \
2541     __asm psrad      xmm1, 16 /* R */                                     \
2542     __asm pand       xmm3, xmm5 /* B */                                     \
2543     __asm pand       xmm2, xmm6 /* G */                                     \
2544     __asm pand       xmm1, xmm7 /* R */                                     \
2545     __asm por        xmm3, xmm2 /* BG */                                    \
2546     __asm por        xmm1, xmm3 /* BGR */                                   \
2547     __asm packssdw   xmm0, xmm1                                                \
2548     __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
2549     __asm lea        edx, [edx + 16]}
2550 
2551 // 8 pixels.
2552 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2553 __declspec(naked) void I444ToARGBRow_SSSE3(
2554     const uint8_t* y_buf,
2555     const uint8_t* u_buf,
2556     const uint8_t* v_buf,
2557     uint8_t* dst_argb,
2558     const struct YuvConstants* yuvconstants,
2559     int width) {
2560   __asm {
2561     push       esi
2562     push       edi
2563     push       ebx
2564     mov        eax, [esp + 12 + 4]  // Y
2565     mov        esi, [esp + 12 + 8]  // U
2566     mov        edi, [esp + 12 + 12]  // V
2567     mov        edx, [esp + 12 + 16]  // argb
2568     mov        ebx, [esp + 12 + 20]  // yuvconstants
2569     mov        ecx, [esp + 12 + 24]  // width
2570     sub        edi, esi
2571     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2572 
2573  convertloop:
2574     READYUV444
2575     YUVTORGB(ebx)
2576     STOREARGB
2577 
2578     sub        ecx, 8
2579     jg         convertloop
2580 
2581     pop        ebx
2582     pop        edi
2583     pop        esi
2584     ret
2585   }
2586 }
2587 
2588 // 8 pixels.
2589 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2590 __declspec(naked) void I422ToRGB24Row_SSSE3(
2591     const uint8_t* y_buf,
2592     const uint8_t* u_buf,
2593     const uint8_t* v_buf,
2594     uint8_t* dst_rgb24,
2595     const struct YuvConstants* yuvconstants,
2596     int width) {
2597   __asm {
2598     push       esi
2599     push       edi
2600     push       ebx
2601     mov        eax, [esp + 12 + 4]  // Y
2602     mov        esi, [esp + 12 + 8]  // U
2603     mov        edi, [esp + 12 + 12]  // V
2604     mov        edx, [esp + 12 + 16]  // argb
2605     mov        ebx, [esp + 12 + 20]  // yuvconstants
2606     mov        ecx, [esp + 12 + 24]  // width
2607     sub        edi, esi
2608     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2609     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2610 
2611  convertloop:
2612     READYUV422
2613     YUVTORGB(ebx)
2614     STORERGB24
2615 
2616     sub        ecx, 8
2617     jg         convertloop
2618 
2619     pop        ebx
2620     pop        edi
2621     pop        esi
2622     ret
2623   }
2624 }
2625 
2626 // 8 pixels
2627 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2628 __declspec(naked) void I422ToRGB565Row_SSSE3(
2629     const uint8_t* y_buf,
2630     const uint8_t* u_buf,
2631     const uint8_t* v_buf,
2632     uint8_t* rgb565_buf,
2633     const struct YuvConstants* yuvconstants,
2634     int width) {
2635   __asm {
2636     push       esi
2637     push       edi
2638     push       ebx
2639     mov        eax, [esp + 12 + 4]  // Y
2640     mov        esi, [esp + 12 + 8]  // U
2641     mov        edi, [esp + 12 + 12]  // V
2642     mov        edx, [esp + 12 + 16]  // argb
2643     mov        ebx, [esp + 12 + 20]  // yuvconstants
2644     mov        ecx, [esp + 12 + 24]  // width
2645     sub        edi, esi
2646     pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
2647     psrld      xmm5, 27
2648     pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
2649     psrld      xmm6, 26
2650     pslld      xmm6, 5
2651     pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
2652     pslld      xmm7, 11
2653 
2654  convertloop:
2655     READYUV422
2656     YUVTORGB(ebx)
2657     STORERGB565
2658 
2659     sub        ecx, 8
2660     jg         convertloop
2661 
2662     pop        ebx
2663     pop        edi
2664     pop        esi
2665     ret
2666   }
2667 }
2668 
2669 // 8 pixels.
2670 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2671 __declspec(naked) void I422ToARGBRow_SSSE3(
2672     const uint8_t* y_buf,
2673     const uint8_t* u_buf,
2674     const uint8_t* v_buf,
2675     uint8_t* dst_argb,
2676     const struct YuvConstants* yuvconstants,
2677     int width) {
2678   __asm {
2679     push       esi
2680     push       edi
2681     push       ebx
2682     mov        eax, [esp + 12 + 4]  // Y
2683     mov        esi, [esp + 12 + 8]  // U
2684     mov        edi, [esp + 12 + 12]  // V
2685     mov        edx, [esp + 12 + 16]  // argb
2686     mov        ebx, [esp + 12 + 20]  // yuvconstants
2687     mov        ecx, [esp + 12 + 24]  // width
2688     sub        edi, esi
2689     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2690 
2691  convertloop:
2692     READYUV422
2693     YUVTORGB(ebx)
2694     STOREARGB
2695 
2696     sub        ecx, 8
2697     jg         convertloop
2698 
2699     pop        ebx
2700     pop        edi
2701     pop        esi
2702     ret
2703   }
2704 }
2705 
2706 // 8 pixels.
2707 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2708 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
2709     const uint8_t* y_buf,
2710     const uint8_t* u_buf,
2711     const uint8_t* v_buf,
2712     const uint8_t* a_buf,
2713     uint8_t* dst_argb,
2714     const struct YuvConstants* yuvconstants,
2715     int width) {
2716   __asm {
2717     push       esi
2718     push       edi
2719     push       ebx
2720     push       ebp
2721     mov        eax, [esp + 16 + 4]  // Y
2722     mov        esi, [esp + 16 + 8]  // U
2723     mov        edi, [esp + 16 + 12]  // V
2724     mov        ebp, [esp + 16 + 16]  // A
2725     mov        edx, [esp + 16 + 20]  // argb
2726     mov        ebx, [esp + 16 + 24]  // yuvconstants
2727     mov        ecx, [esp + 16 + 28]  // width
2728     sub        edi, esi
2729 
2730  convertloop:
2731     READYUVA422
2732     YUVTORGB(ebx)
2733     STOREARGB
2734 
2735     sub        ecx, 8
2736     jg         convertloop
2737 
2738     pop        ebp
2739     pop        ebx
2740     pop        edi
2741     pop        esi
2742     ret
2743   }
2744 }
2745 
2746 // 8 pixels.
2747 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2748 __declspec(naked) void NV12ToARGBRow_SSSE3(
2749     const uint8_t* y_buf,
2750     const uint8_t* uv_buf,
2751     uint8_t* dst_argb,
2752     const struct YuvConstants* yuvconstants,
2753     int width) {
2754   __asm {
2755     push       esi
2756     push       ebx
2757     mov        eax, [esp + 8 + 4]  // Y
2758     mov        esi, [esp + 8 + 8]  // UV
2759     mov        edx, [esp + 8 + 12]  // argb
2760     mov        ebx, [esp + 8 + 16]  // yuvconstants
2761     mov        ecx, [esp + 8 + 20]  // width
2762     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2763 
2764  convertloop:
2765     READNV12
2766     YUVTORGB(ebx)
2767     STOREARGB
2768 
2769     sub        ecx, 8
2770     jg         convertloop
2771 
2772     pop        ebx
2773     pop        esi
2774     ret
2775   }
2776 }
2777 
2778 // 8 pixels.
2779 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2780 __declspec(naked) void NV21ToARGBRow_SSSE3(
2781     const uint8_t* y_buf,
2782     const uint8_t* vu_buf,
2783     uint8_t* dst_argb,
2784     const struct YuvConstants* yuvconstants,
2785     int width) {
2786   __asm {
2787     push       esi
2788     push       ebx
2789     mov        eax, [esp + 8 + 4]  // Y
2790     mov        esi, [esp + 8 + 8]  // VU
2791     mov        edx, [esp + 8 + 12]  // argb
2792     mov        ebx, [esp + 8 + 16]  // yuvconstants
2793     mov        ecx, [esp + 8 + 20]  // width
2794     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2795 
2796  convertloop:
2797     READNV21
2798     YUVTORGB(ebx)
2799     STOREARGB
2800 
2801     sub        ecx, 8
2802     jg         convertloop
2803 
2804     pop        ebx
2805     pop        esi
2806     ret
2807   }
2808 }
2809 
2810 // 8 pixels.
2811 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2812 __declspec(naked) void YUY2ToARGBRow_SSSE3(
2813     const uint8_t* src_yuy2,
2814     uint8_t* dst_argb,
2815     const struct YuvConstants* yuvconstants,
2816     int width) {
2817   __asm {
2818     push       ebx
2819     mov        eax, [esp + 4 + 4]  // yuy2
2820     mov        edx, [esp + 4 + 8]  // argb
2821     mov        ebx, [esp + 4 + 12]  // yuvconstants
2822     mov        ecx, [esp + 4 + 16]  // width
2823     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2824 
2825  convertloop:
2826     READYUY2
2827     YUVTORGB(ebx)
2828     STOREARGB
2829 
2830     sub        ecx, 8
2831     jg         convertloop
2832 
2833     pop        ebx
2834     ret
2835   }
2836 }
2837 
2838 // 8 pixels.
2839 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2840 __declspec(naked) void UYVYToARGBRow_SSSE3(
2841     const uint8_t* src_uyvy,
2842     uint8_t* dst_argb,
2843     const struct YuvConstants* yuvconstants,
2844     int width) {
2845   __asm {
2846     push       ebx
2847     mov        eax, [esp + 4 + 4]  // uyvy
2848     mov        edx, [esp + 4 + 8]  // argb
2849     mov        ebx, [esp + 4 + 12]  // yuvconstants
2850     mov        ecx, [esp + 4 + 16]  // width
2851     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2852 
2853  convertloop:
2854     READUYVY
2855     YUVTORGB(ebx)
2856     STOREARGB
2857 
2858     sub        ecx, 8
2859     jg         convertloop
2860 
2861     pop        ebx
2862     ret
2863   }
2864 }
2865 
2866 __declspec(naked) void I422ToRGBARow_SSSE3(
2867     const uint8_t* y_buf,
2868     const uint8_t* u_buf,
2869     const uint8_t* v_buf,
2870     uint8_t* dst_rgba,
2871     const struct YuvConstants* yuvconstants,
2872     int width) {
2873   __asm {
2874     push       esi
2875     push       edi
2876     push       ebx
2877     mov        eax, [esp + 12 + 4]  // Y
2878     mov        esi, [esp + 12 + 8]  // U
2879     mov        edi, [esp + 12 + 12]  // V
2880     mov        edx, [esp + 12 + 16]  // argb
2881     mov        ebx, [esp + 12 + 20]  // yuvconstants
2882     mov        ecx, [esp + 12 + 24]  // width
2883     sub        edi, esi
2884 
2885  convertloop:
2886     READYUV422
2887     YUVTORGB(ebx)
2888     STORERGBA
2889 
2890     sub        ecx, 8
2891     jg         convertloop
2892 
2893     pop        ebx
2894     pop        edi
2895     pop        esi
2896     ret
2897   }
2898 }
2899 #endif  // HAS_I422TOARGBROW_SSSE3
2900 
2901 // I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
2902 #ifdef HAS_I400TOARGBROW_SSE2
2903 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2904 __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
2905                                           uint8_t* rgb_buf,
2906                                           const struct YuvConstants*,
2907                                           int width) {
2908   __asm {
2909     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
2910     movd       xmm2, eax
2911     pshufd     xmm2, xmm2,0
2912     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
2913     movd       xmm3, eax
2914     pshufd     xmm3, xmm3, 0
2915     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
2916     pslld      xmm4, 24
2917 
2918     mov        eax, [esp + 4]  // Y
2919     mov        edx, [esp + 8]  // rgb
2920     mov        ecx, [esp + 12]  // width
2921 
2922  convertloop:
2923         // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2924     movq       xmm0, qword ptr [eax]
2925     lea        eax, [eax + 8]
2926     punpcklbw  xmm0, xmm0  // Y.Y
2927     pmulhuw    xmm0, xmm2
2928     psubusw    xmm0, xmm3
2929     psrlw      xmm0, 6
2930     packuswb   xmm0, xmm0        // G
2931 
2932         // Step 2: Weave into ARGB
2933     punpcklbw  xmm0, xmm0  // GG
2934     movdqa     xmm1, xmm0
2935     punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
2936     punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
2937     por        xmm0, xmm4
2938     por        xmm1, xmm4
2939     movdqu     [edx], xmm0
2940     movdqu     [edx + 16], xmm1
2941     lea        edx,  [edx + 32]
2942     sub        ecx, 8
2943     jg         convertloop
2944     ret
2945   }
2946 }
2947 #endif  // HAS_I400TOARGBROW_SSE2
2948 
2949 #ifdef HAS_I400TOARGBROW_AVX2
2950 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2951 // note: vpunpcklbw mutates and vpackuswb unmutates.
2952 __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
2953                                           uint8_t* rgb_buf,
2954                                           const struct YuvConstants*,
2955                                           int width) {
2956   __asm {
2957     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
2958     vmovd      xmm2, eax
2959     vbroadcastss ymm2, xmm2
2960     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
2961     vmovd      xmm3, eax
2962     vbroadcastss ymm3, xmm3
2963     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
2964     vpslld     ymm4, ymm4, 24
2965 
2966     mov        eax, [esp + 4]  // Y
2967     mov        edx, [esp + 8]  // rgb
2968     mov        ecx, [esp + 12]  // width
2969 
2970  convertloop:
2971         // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
2972     vmovdqu    xmm0, [eax]
2973     lea        eax, [eax + 16]
2974     vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
2975     vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
2976     vpmulhuw   ymm0, ymm0, ymm2
2977     vpsubusw   ymm0, ymm0, ymm3
2978     vpsrlw     ymm0, ymm0, 6
2979     vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
2980 
2981         // TODO(fbarchard): Weave alpha with unpack.
2982         // Step 2: Weave into ARGB
2983     vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
2984     vpermq     ymm1, ymm1, 0xd8
2985     vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
2986     vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
2987     vpor       ymm0, ymm0, ymm4
2988     vpor       ymm1, ymm1, ymm4
2989     vmovdqu    [edx], ymm0
2990     vmovdqu    [edx + 32], ymm1
2991     lea        edx,  [edx + 64]
2992     sub        ecx, 16
2993     jg         convertloop
2994     vzeroupper
2995     ret
2996   }
2997 }
2998 #endif  // HAS_I400TOARGBROW_AVX2
2999 
3000 #ifdef HAS_MIRRORROW_SSSE3
3001 // Shuffle table for reversing the bytes.
3002 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3003                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
3004 
3005 // TODO(fbarchard): Replace lea with -16 offset.
3006 __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
3007                                        uint8_t* dst,
3008                                        int width) {
3009   __asm {
3010     mov       eax, [esp + 4]  // src
3011     mov       edx, [esp + 8]  // dst
3012     mov       ecx, [esp + 12]  // width
3013     movdqa    xmm5, xmmword ptr kShuffleMirror
3014 
3015  convertloop:
3016     movdqu    xmm0, [eax - 16 + ecx]
3017     pshufb    xmm0, xmm5
3018     movdqu    [edx], xmm0
3019     lea       edx, [edx + 16]
3020     sub       ecx, 16
3021     jg        convertloop
3022     ret
3023   }
3024 }
3025 #endif  // HAS_MIRRORROW_SSSE3
3026 
3027 #ifdef HAS_MIRRORROW_AVX2
3028 __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
3029                                       uint8_t* dst,
3030                                       int width) {
3031   __asm {
3032     mov       eax, [esp + 4]  // src
3033     mov       edx, [esp + 8]  // dst
3034     mov       ecx, [esp + 12]  // width
3035     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3036 
3037  convertloop:
3038     vmovdqu   ymm0, [eax - 32 + ecx]
3039     vpshufb   ymm0, ymm0, ymm5
3040     vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
3041     vmovdqu   [edx], ymm0
3042     lea       edx, [edx + 32]
3043     sub       ecx, 32
3044     jg        convertloop
3045     vzeroupper
3046     ret
3047   }
3048 }
3049 #endif  // HAS_MIRRORROW_AVX2
3050 
3051 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
3052 // Shuffle table for reversing the bytes of UV channels.
3053 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3054                                        15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3055 
3056 __declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
3057                                               uint8_t* dst_u,
3058                                               uint8_t* dst_v,
3059                                               int width) {
3060   __asm {
3061     push      edi
3062     mov       eax, [esp + 4 + 4]  // src
3063     mov       edx, [esp + 4 + 8]  // dst_u
3064     mov       edi, [esp + 4 + 12]  // dst_v
3065     mov       ecx, [esp + 4 + 16]  // width
3066     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
3067     lea       eax, [eax + ecx * 2 - 16]
3068     sub       edi, edx
3069 
3070  convertloop:
3071     movdqu    xmm0, [eax]
3072     lea       eax, [eax - 16]
3073     pshufb    xmm0, xmm1
3074     movlpd    qword ptr [edx], xmm0
3075     movhpd    qword ptr [edx + edi], xmm0
3076     lea       edx, [edx + 8]
3077     sub       ecx, 8
3078     jg        convertloop
3079 
3080     pop       edi
3081     ret
3082   }
3083 }
3084 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
3085 
3086 #ifdef HAS_ARGBMIRRORROW_SSE2
3087 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
3088                                           uint8_t* dst,
3089                                           int width) {
3090   __asm {
3091     mov       eax, [esp + 4]  // src
3092     mov       edx, [esp + 8]  // dst
3093     mov       ecx, [esp + 12]  // width
3094     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3095 
3096  convertloop:
3097     movdqu    xmm0, [eax]
3098     lea       eax, [eax - 16]
3099     pshufd    xmm0, xmm0, 0x1b
3100     movdqu    [edx], xmm0
3101     lea       edx, [edx + 16]
3102     sub       ecx, 4
3103     jg        convertloop
3104     ret
3105   }
3106 }
3107 #endif  // HAS_ARGBMIRRORROW_SSE2
3108 
3109 #ifdef HAS_ARGBMIRRORROW_AVX2
3110 // Shuffle table for reversing the bytes.
3111 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3112 
3113 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
3114                                           uint8_t* dst,
3115                                           int width) {
3116   __asm {
3117     mov       eax, [esp + 4]  // src
3118     mov       edx, [esp + 8]  // dst
3119     mov       ecx, [esp + 12]  // width
3120     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3121 
3122  convertloop:
3123     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3124     vmovdqu   [edx], ymm0
3125     lea       edx, [edx + 32]
3126     sub       ecx, 8
3127     jg        convertloop
3128     vzeroupper
3129     ret
3130   }
3131 }
3132 #endif  // HAS_ARGBMIRRORROW_AVX2
3133 
3134 #ifdef HAS_SPLITUVROW_SSE2
3135 __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
3136                                        uint8_t* dst_u,
3137                                        uint8_t* dst_v,
3138                                        int width) {
3139   __asm {
3140     push       edi
3141     mov        eax, [esp + 4 + 4]  // src_uv
3142     mov        edx, [esp + 4 + 8]  // dst_u
3143     mov        edi, [esp + 4 + 12]  // dst_v
3144     mov        ecx, [esp + 4 + 16]  // width
3145     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3146     psrlw      xmm5, 8
3147     sub        edi, edx
3148 
3149   convertloop:
3150     movdqu     xmm0, [eax]
3151     movdqu     xmm1, [eax + 16]
3152     lea        eax,  [eax + 32]
3153     movdqa     xmm2, xmm0
3154     movdqa     xmm3, xmm1
3155     pand       xmm0, xmm5  // even bytes
3156     pand       xmm1, xmm5
3157     packuswb   xmm0, xmm1
3158     psrlw      xmm2, 8  // odd bytes
3159     psrlw      xmm3, 8
3160     packuswb   xmm2, xmm3
3161     movdqu     [edx], xmm0
3162     movdqu     [edx + edi], xmm2
3163     lea        edx, [edx + 16]
3164     sub        ecx, 16
3165     jg         convertloop
3166 
3167     pop        edi
3168     ret
3169   }
3170 }
3171 
3172 #endif  // HAS_SPLITUVROW_SSE2
3173 
3174 #ifdef HAS_SPLITUVROW_AVX2
3175 __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
3176                                        uint8_t* dst_u,
3177                                        uint8_t* dst_v,
3178                                        int width) {
3179   __asm {
3180     push       edi
3181     mov        eax, [esp + 4 + 4]  // src_uv
3182     mov        edx, [esp + 4 + 8]  // dst_u
3183     mov        edi, [esp + 4 + 12]  // dst_v
3184     mov        ecx, [esp + 4 + 16]  // width
3185     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3186     vpsrlw     ymm5, ymm5, 8
3187     sub        edi, edx
3188 
3189   convertloop:
3190     vmovdqu    ymm0, [eax]
3191     vmovdqu    ymm1, [eax + 32]
3192     lea        eax,  [eax + 64]
3193     vpsrlw     ymm2, ymm0, 8  // odd bytes
3194     vpsrlw     ymm3, ymm1, 8
3195     vpand      ymm0, ymm0, ymm5  // even bytes
3196     vpand      ymm1, ymm1, ymm5
3197     vpackuswb  ymm0, ymm0, ymm1
3198     vpackuswb  ymm2, ymm2, ymm3
3199     vpermq     ymm0, ymm0, 0xd8
3200     vpermq     ymm2, ymm2, 0xd8
3201     vmovdqu    [edx], ymm0
3202     vmovdqu    [edx + edi], ymm2
3203     lea        edx, [edx + 32]
3204     sub        ecx, 32
3205     jg         convertloop
3206 
3207     pop        edi
3208     vzeroupper
3209     ret
3210   }
3211 }
3212 #endif  // HAS_SPLITUVROW_AVX2
3213 
3214 #ifdef HAS_MERGEUVROW_SSE2
3215 __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
3216                                        const uint8_t* src_v,
3217                                        uint8_t* dst_uv,
3218                                        int width) {
3219   __asm {
3220     push       edi
3221     mov        eax, [esp + 4 + 4]  // src_u
3222     mov        edx, [esp + 4 + 8]  // src_v
3223     mov        edi, [esp + 4 + 12]  // dst_uv
3224     mov        ecx, [esp + 4 + 16]  // width
3225     sub        edx, eax
3226 
3227   convertloop:
3228     movdqu     xmm0, [eax]  // read 16 U's
3229     movdqu     xmm1, [eax + edx]  // and 16 V's
3230     lea        eax,  [eax + 16]
3231     movdqa     xmm2, xmm0
3232     punpcklbw  xmm0, xmm1  // first 8 UV pairs
3233     punpckhbw  xmm2, xmm1  // next 8 UV pairs
3234     movdqu     [edi], xmm0
3235     movdqu     [edi + 16], xmm2
3236     lea        edi, [edi + 32]
3237     sub        ecx, 16
3238     jg         convertloop
3239 
3240     pop        edi
3241     ret
3242   }
3243 }
3244 #endif  //  HAS_MERGEUVROW_SSE2
3245 
3246 #ifdef HAS_MERGEUVROW_AVX2
3247 __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
3248                                        const uint8_t* src_v,
3249                                        uint8_t* dst_uv,
3250                                        int width) {
3251   __asm {
3252     push       edi
3253     mov        eax, [esp + 4 + 4]  // src_u
3254     mov        edx, [esp + 4 + 8]  // src_v
3255     mov        edi, [esp + 4 + 12]  // dst_uv
3256     mov        ecx, [esp + 4 + 16]  // width
3257     sub        edx, eax
3258 
3259   convertloop:
3260     vmovdqu    ymm0, [eax]  // read 32 U's
3261     vmovdqu    ymm1, [eax + edx]  // and 32 V's
3262     lea        eax,  [eax + 32]
3263     vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
3264     vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
3265     vextractf128 [edi], ymm2, 0  // bytes 0..15
3266     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
3267     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
3268     vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
3269     lea        edi, [edi + 64]
3270     sub        ecx, 32
3271     jg         convertloop
3272 
3273     pop        edi
3274     vzeroupper
3275     ret
3276   }
3277 }
3278 #endif  //  HAS_MERGEUVROW_AVX2
3279 
3280 #ifdef HAS_COPYROW_SSE2
3281 // CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
3282 __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
3283                                     uint8_t* dst,
3284                                     int width) {
3285   __asm {
3286     mov        eax, [esp + 4]  // src
3287     mov        edx, [esp + 8]  // dst
3288     mov        ecx, [esp + 12]  // width
3289     test       eax, 15
3290     jne        convertloopu
3291     test       edx, 15
3292     jne        convertloopu
3293 
3294   convertloopa:
3295     movdqa     xmm0, [eax]
3296     movdqa     xmm1, [eax + 16]
3297     lea        eax, [eax + 32]
3298     movdqa     [edx], xmm0
3299     movdqa     [edx + 16], xmm1
3300     lea        edx, [edx + 32]
3301     sub        ecx, 32
3302     jg         convertloopa
3303     ret
3304 
3305   convertloopu:
3306     movdqu     xmm0, [eax]
3307     movdqu     xmm1, [eax + 16]
3308     lea        eax, [eax + 32]
3309     movdqu     [edx], xmm0
3310     movdqu     [edx + 16], xmm1
3311     lea        edx, [edx + 32]
3312     sub        ecx, 32
3313     jg         convertloopu
3314     ret
3315   }
3316 }
3317 #endif  // HAS_COPYROW_SSE2
3318 
3319 #ifdef HAS_COPYROW_AVX
3320 // CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
3321 __declspec(naked) void CopyRow_AVX(const uint8_t* src,
3322                                    uint8_t* dst,
3323                                    int width) {
3324   __asm {
3325     mov        eax, [esp + 4]  // src
3326     mov        edx, [esp + 8]  // dst
3327     mov        ecx, [esp + 12]  // width
3328 
3329   convertloop:
3330     vmovdqu    ymm0, [eax]
3331     vmovdqu    ymm1, [eax + 32]
3332     lea        eax, [eax + 64]
3333     vmovdqu    [edx], ymm0
3334     vmovdqu    [edx + 32], ymm1
3335     lea        edx, [edx + 64]
3336     sub        ecx, 64
3337     jg         convertloop
3338 
3339     vzeroupper
3340     ret
3341   }
3342 }
3343 #endif  // HAS_COPYROW_AVX
3344 
3345 // Multiple of 1.
3346 __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
3347                                     uint8_t* dst,
3348                                     int width) {
3349   __asm {
3350     mov        eax, esi
3351     mov        edx, edi
3352     mov        esi, [esp + 4]  // src
3353     mov        edi, [esp + 8]  // dst
3354     mov        ecx, [esp + 12]  // width
3355     rep movsb
3356     mov        edi, edx
3357     mov        esi, eax
3358     ret
3359   }
3360 }
3361 
3362 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3363 // width in pixels
3364 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
3365                                              uint8_t* dst,
3366                                              int width) {
3367   __asm {
3368     mov        eax, [esp + 4]  // src
3369     mov        edx, [esp + 8]  // dst
3370     mov        ecx, [esp + 12]  // width
3371     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3372     pslld      xmm0, 24
3373     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3374     psrld      xmm1, 8
3375 
3376   convertloop:
3377     movdqu     xmm2, [eax]
3378     movdqu     xmm3, [eax + 16]
3379     lea        eax, [eax + 32]
3380     movdqu     xmm4, [edx]
3381     movdqu     xmm5, [edx + 16]
3382     pand       xmm2, xmm0
3383     pand       xmm3, xmm0
3384     pand       xmm4, xmm1
3385     pand       xmm5, xmm1
3386     por        xmm2, xmm4
3387     por        xmm3, xmm5
3388     movdqu     [edx], xmm2
3389     movdqu     [edx + 16], xmm3
3390     lea        edx, [edx + 32]
3391     sub        ecx, 8
3392     jg         convertloop
3393 
3394     ret
3395   }
3396 }
3397 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
3398 
3399 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3400 // width in pixels
3401 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
3402                                              uint8_t* dst,
3403                                              int width) {
3404   __asm {
3405     mov        eax, [esp + 4]  // src
3406     mov        edx, [esp + 8]  // dst
3407     mov        ecx, [esp + 12]  // width
3408     vpcmpeqb   ymm0, ymm0, ymm0
3409     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3410 
3411   convertloop:
3412     vmovdqu    ymm1, [eax]
3413     vmovdqu    ymm2, [eax + 32]
3414     lea        eax, [eax + 64]
3415     vpblendvb  ymm1, ymm1, [edx], ymm0
3416     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3417     vmovdqu    [edx], ymm1
3418     vmovdqu    [edx + 32], ymm2
3419     lea        edx, [edx + 64]
3420     sub        ecx, 16
3421     jg         convertloop
3422 
3423     vzeroupper
3424     ret
3425   }
3426 }
3427 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
3428 
3429 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3430 // width in pixels
3431 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
3432                                                 uint8_t* dst_a,
3433                                                 int width) {
3434   __asm {
3435     mov        eax, [esp + 4]  // src_argb
3436     mov        edx, [esp + 8]  // dst_a
3437     mov        ecx, [esp + 12]  // width
3438 
3439   extractloop:
3440     movdqu     xmm0, [eax]
3441     movdqu     xmm1, [eax + 16]
3442     lea        eax, [eax + 32]
3443     psrld      xmm0, 24
3444     psrld      xmm1, 24
3445     packssdw   xmm0, xmm1
3446     packuswb   xmm0, xmm0
3447     movq       qword ptr [edx], xmm0
3448     lea        edx, [edx + 8]
3449     sub        ecx, 8
3450     jg         extractloop
3451 
3452     ret
3453   }
3454 }
3455 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
3456 
3457 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3458 // width in pixels
3459 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
3460                                                 uint8_t* dst_a,
3461                                                 int width) {
3462   __asm {
3463     mov        eax, [esp + 4]  // src_argb
3464     mov        edx, [esp + 8]  // dst_a
3465     mov        ecx, [esp + 12]  // width
3466     vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
3467 
3468   extractloop:
3469     vmovdqu    ymm0, [eax]
3470     vmovdqu    ymm1, [eax + 32]
3471     vpsrld     ymm0, ymm0, 24
3472     vpsrld     ymm1, ymm1, 24
3473     vmovdqu    ymm2, [eax + 64]
3474     vmovdqu    ymm3, [eax + 96]
3475     lea        eax, [eax + 128]
3476     vpackssdw  ymm0, ymm0, ymm1  // mutates
3477     vpsrld     ymm2, ymm2, 24
3478     vpsrld     ymm3, ymm3, 24
3479     vpackssdw  ymm2, ymm2, ymm3  // mutates
3480     vpackuswb  ymm0, ymm0, ymm2  // mutates
3481     vpermd     ymm0, ymm4, ymm0  // unmutate
3482     vmovdqu    [edx], ymm0
3483     lea        edx, [edx + 32]
3484     sub        ecx, 32
3485     jg         extractloop
3486 
3487     vzeroupper
3488     ret
3489   }
3490 }
3491 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
3492 
3493 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3494 // width in pixels
3495 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
3496                                                 uint8_t* dst,
3497                                                 int width) {
3498   __asm {
3499     mov        eax, [esp + 4]  // src
3500     mov        edx, [esp + 8]  // dst
3501     mov        ecx, [esp + 12]  // width
3502     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3503     pslld      xmm0, 24
3504     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3505     psrld      xmm1, 8
3506 
3507   convertloop:
3508     movq       xmm2, qword ptr [eax]  // 8 Y's
3509     lea        eax, [eax + 8]
3510     punpcklbw  xmm2, xmm2
3511     punpckhwd  xmm3, xmm2
3512     punpcklwd  xmm2, xmm2
3513     movdqu     xmm4, [edx]
3514     movdqu     xmm5, [edx + 16]
3515     pand       xmm2, xmm0
3516     pand       xmm3, xmm0
3517     pand       xmm4, xmm1
3518     pand       xmm5, xmm1
3519     por        xmm2, xmm4
3520     por        xmm3, xmm5
3521     movdqu     [edx], xmm2
3522     movdqu     [edx + 16], xmm3
3523     lea        edx, [edx + 32]
3524     sub        ecx, 8
3525     jg         convertloop
3526 
3527     ret
3528   }
3529 }
3530 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3531 
3532 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3533 // width in pixels
3534 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
3535                                                 uint8_t* dst,
3536                                                 int width) {
3537   __asm {
3538     mov        eax, [esp + 4]  // src
3539     mov        edx, [esp + 8]  // dst
3540     mov        ecx, [esp + 12]  // width
3541     vpcmpeqb   ymm0, ymm0, ymm0
3542     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3543 
3544   convertloop:
3545     vpmovzxbd  ymm1, qword ptr [eax]
3546     vpmovzxbd  ymm2, qword ptr [eax + 8]
3547     lea        eax, [eax + 16]
3548     vpslld     ymm1, ymm1, 24
3549     vpslld     ymm2, ymm2, 24
3550     vpblendvb  ymm1, ymm1, [edx], ymm0
3551     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3552     vmovdqu    [edx], ymm1
3553     vmovdqu    [edx + 32], ymm2
3554     lea        edx, [edx + 64]
3555     sub        ecx, 16
3556     jg         convertloop
3557 
3558     vzeroupper
3559     ret
3560   }
3561 }
3562 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3563 
3564 #ifdef HAS_SETROW_X86
3565 // Write 'width' bytes using an 8 bit value repeated.
3566 // width should be multiple of 4.
3567 __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
3568   __asm {
3569     movzx      eax, byte ptr [esp + 8]  // v8
3570     mov        edx, 0x01010101  // Duplicate byte to all bytes.
3571     mul        edx  // overwrites edx with upper part of result.
3572     mov        edx, edi
3573     mov        edi, [esp + 4]  // dst
3574     mov        ecx, [esp + 12]  // width
3575     shr        ecx, 2
3576     rep stosd
3577     mov        edi, edx
3578     ret
3579   }
3580 }
3581 
3582 // Write 'width' bytes using an 8 bit value repeated.
3583 __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
3584   __asm {
3585     mov        edx, edi
3586     mov        edi, [esp + 4]  // dst
3587     mov        eax, [esp + 8]  // v8
3588     mov        ecx, [esp + 12]  // width
3589     rep stosb
3590     mov        edi, edx
3591     ret
3592   }
3593 }
3594 
3595 // Write 'width' 32 bit values.
3596 __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
3597                                       uint32_t v32,
3598                                       int width) {
3599   __asm {
3600     mov        edx, edi
3601     mov        edi, [esp + 4]  // dst
3602     mov        eax, [esp + 8]  // v32
3603     mov        ecx, [esp + 12]  // width
3604     rep stosd
3605     mov        edi, edx
3606     ret
3607   }
3608 }
3609 #endif  // HAS_SETROW_X86
3610 
3611 #ifdef HAS_YUY2TOYROW_AVX2
3612 __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
3613                                        uint8_t* dst_y,
3614                                        int width) {
3615   __asm {
3616     mov        eax, [esp + 4]  // src_yuy2
3617     mov        edx, [esp + 8]  // dst_y
3618     mov        ecx, [esp + 12]  // width
3619     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3620     vpsrlw     ymm5, ymm5, 8
3621 
3622   convertloop:
3623     vmovdqu    ymm0, [eax]
3624     vmovdqu    ymm1, [eax + 32]
3625     lea        eax,  [eax + 64]
3626     vpand      ymm0, ymm0, ymm5  // even bytes are Y
3627     vpand      ymm1, ymm1, ymm5
3628     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3629     vpermq     ymm0, ymm0, 0xd8
3630     vmovdqu    [edx], ymm0
3631     lea        edx, [edx + 32]
3632     sub        ecx, 32
3633     jg         convertloop
3634     vzeroupper
3635     ret
3636   }
3637 }
3638 
3639 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
3640                                         int stride_yuy2,
3641                                         uint8_t* dst_u,
3642                                         uint8_t* dst_v,
3643                                         int width) {
3644   __asm {
3645     push       esi
3646     push       edi
3647     mov        eax, [esp + 8 + 4]  // src_yuy2
3648     mov        esi, [esp + 8 + 8]  // stride_yuy2
3649     mov        edx, [esp + 8 + 12]  // dst_u
3650     mov        edi, [esp + 8 + 16]  // dst_v
3651     mov        ecx, [esp + 8 + 20]  // width
3652     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3653     vpsrlw     ymm5, ymm5, 8
3654     sub        edi, edx
3655 
3656   convertloop:
3657     vmovdqu    ymm0, [eax]
3658     vmovdqu    ymm1, [eax + 32]
3659     vpavgb     ymm0, ymm0, [eax + esi]
3660     vpavgb     ymm1, ymm1, [eax + esi + 32]
3661     lea        eax,  [eax + 64]
3662     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3663     vpsrlw     ymm1, ymm1, 8
3664     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3665     vpermq     ymm0, ymm0, 0xd8
3666     vpand      ymm1, ymm0, ymm5  // U
3667     vpsrlw     ymm0, ymm0, 8  // V
3668     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3669     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3670     vpermq     ymm1, ymm1, 0xd8
3671     vpermq     ymm0, ymm0, 0xd8
3672     vextractf128 [edx], ymm1, 0  // U
3673     vextractf128 [edx + edi], ymm0, 0  // V
3674     lea        edx, [edx + 16]
3675     sub        ecx, 32
3676     jg         convertloop
3677 
3678     pop        edi
3679     pop        esi
3680     vzeroupper
3681     ret
3682   }
3683 }
3684 
3685 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
3686                                            uint8_t* dst_u,
3687                                            uint8_t* dst_v,
3688                                            int width) {
3689   __asm {
3690     push       edi
3691     mov        eax, [esp + 4 + 4]  // src_yuy2
3692     mov        edx, [esp + 4 + 8]  // dst_u
3693     mov        edi, [esp + 4 + 12]  // dst_v
3694     mov        ecx, [esp + 4 + 16]  // width
3695     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3696     vpsrlw     ymm5, ymm5, 8
3697     sub        edi, edx
3698 
3699   convertloop:
3700     vmovdqu    ymm0, [eax]
3701     vmovdqu    ymm1, [eax + 32]
3702     lea        eax,  [eax + 64]
3703     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3704     vpsrlw     ymm1, ymm1, 8
3705     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3706     vpermq     ymm0, ymm0, 0xd8
3707     vpand      ymm1, ymm0, ymm5  // U
3708     vpsrlw     ymm0, ymm0, 8  // V
3709     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3710     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3711     vpermq     ymm1, ymm1, 0xd8
3712     vpermq     ymm0, ymm0, 0xd8
3713     vextractf128 [edx], ymm1, 0  // U
3714     vextractf128 [edx + edi], ymm0, 0  // V
3715     lea        edx, [edx + 16]
3716     sub        ecx, 32
3717     jg         convertloop
3718 
3719     pop        edi
3720     vzeroupper
3721     ret
3722   }
3723 }
3724 
3725 __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
3726                                        uint8_t* dst_y,
3727                                        int width) {
3728   __asm {
3729     mov        eax, [esp + 4]  // src_uyvy
3730     mov        edx, [esp + 8]  // dst_y
3731     mov        ecx, [esp + 12]  // width
3732 
3733   convertloop:
3734     vmovdqu    ymm0, [eax]
3735     vmovdqu    ymm1, [eax + 32]
3736     lea        eax,  [eax + 64]
3737     vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
3738     vpsrlw     ymm1, ymm1, 8
3739     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3740     vpermq     ymm0, ymm0, 0xd8
3741     vmovdqu    [edx], ymm0
3742     lea        edx, [edx + 32]
3743     sub        ecx, 32
3744     jg         convertloop
3745     vzeroupper
3746     ret
3747   }
3748 }
3749 
3750 __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
3751                                         int stride_uyvy,
3752                                         uint8_t* dst_u,
3753                                         uint8_t* dst_v,
3754                                         int width) {
3755   __asm {
3756     push       esi
3757     push       edi
3758     mov        eax, [esp + 8 + 4]  // src_yuy2
3759     mov        esi, [esp + 8 + 8]  // stride_yuy2
3760     mov        edx, [esp + 8 + 12]  // dst_u
3761     mov        edi, [esp + 8 + 16]  // dst_v
3762     mov        ecx, [esp + 8 + 20]  // width
3763     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3764     vpsrlw     ymm5, ymm5, 8
3765     sub        edi, edx
3766 
3767   convertloop:
3768     vmovdqu    ymm0, [eax]
3769     vmovdqu    ymm1, [eax + 32]
3770     vpavgb     ymm0, ymm0, [eax + esi]
3771     vpavgb     ymm1, ymm1, [eax + esi + 32]
3772     lea        eax,  [eax + 64]
3773     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
3774     vpand      ymm1, ymm1, ymm5
3775     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3776     vpermq     ymm0, ymm0, 0xd8
3777     vpand      ymm1, ymm0, ymm5  // U
3778     vpsrlw     ymm0, ymm0, 8  // V
3779     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3780     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3781     vpermq     ymm1, ymm1, 0xd8
3782     vpermq     ymm0, ymm0, 0xd8
3783     vextractf128 [edx], ymm1, 0  // U
3784     vextractf128 [edx + edi], ymm0, 0  // V
3785     lea        edx, [edx + 16]
3786     sub        ecx, 32
3787     jg         convertloop
3788 
3789     pop        edi
3790     pop        esi
3791     vzeroupper
3792     ret
3793   }
3794 }
3795 
3796 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
3797                                            uint8_t* dst_u,
3798                                            uint8_t* dst_v,
3799                                            int width) {
3800   __asm {
3801     push       edi
3802     mov        eax, [esp + 4 + 4]  // src_yuy2
3803     mov        edx, [esp + 4 + 8]  // dst_u
3804     mov        edi, [esp + 4 + 12]  // dst_v
3805     mov        ecx, [esp + 4 + 16]  // width
3806     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3807     vpsrlw     ymm5, ymm5, 8
3808     sub        edi, edx
3809 
3810   convertloop:
3811     vmovdqu    ymm0, [eax]
3812     vmovdqu    ymm1, [eax + 32]
3813     lea        eax,  [eax + 64]
3814     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
3815     vpand      ymm1, ymm1, ymm5
3816     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3817     vpermq     ymm0, ymm0, 0xd8
3818     vpand      ymm1, ymm0, ymm5  // U
3819     vpsrlw     ymm0, ymm0, 8  // V
3820     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3821     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3822     vpermq     ymm1, ymm1, 0xd8
3823     vpermq     ymm0, ymm0, 0xd8
3824     vextractf128 [edx], ymm1, 0  // U
3825     vextractf128 [edx + edi], ymm0, 0  // V
3826     lea        edx, [edx + 16]
3827     sub        ecx, 32
3828     jg         convertloop
3829 
3830     pop        edi
3831     vzeroupper
3832     ret
3833   }
3834 }
3835 #endif  // HAS_YUY2TOYROW_AVX2
3836 
3837 #ifdef HAS_YUY2TOYROW_SSE2
3838 __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
3839                                        uint8_t* dst_y,
3840                                        int width) {
3841   __asm {
3842     mov        eax, [esp + 4]  // src_yuy2
3843     mov        edx, [esp + 8]  // dst_y
3844     mov        ecx, [esp + 12]  // width
3845     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3846     psrlw      xmm5, 8
3847 
3848   convertloop:
3849     movdqu     xmm0, [eax]
3850     movdqu     xmm1, [eax + 16]
3851     lea        eax,  [eax + 32]
3852     pand       xmm0, xmm5  // even bytes are Y
3853     pand       xmm1, xmm5
3854     packuswb   xmm0, xmm1
3855     movdqu     [edx], xmm0
3856     lea        edx, [edx + 16]
3857     sub        ecx, 16
3858     jg         convertloop
3859     ret
3860   }
3861 }
3862 
3863 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
3864                                         int stride_yuy2,
3865                                         uint8_t* dst_u,
3866                                         uint8_t* dst_v,
3867                                         int width) {
3868   __asm {
3869     push       esi
3870     push       edi
3871     mov        eax, [esp + 8 + 4]  // src_yuy2
3872     mov        esi, [esp + 8 + 8]  // stride_yuy2
3873     mov        edx, [esp + 8 + 12]  // dst_u
3874     mov        edi, [esp + 8 + 16]  // dst_v
3875     mov        ecx, [esp + 8 + 20]  // width
3876     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3877     psrlw      xmm5, 8
3878     sub        edi, edx
3879 
3880   convertloop:
3881     movdqu     xmm0, [eax]
3882     movdqu     xmm1, [eax + 16]
3883     movdqu     xmm2, [eax + esi]
3884     movdqu     xmm3, [eax + esi + 16]
3885     lea        eax,  [eax + 32]
3886     pavgb      xmm0, xmm2
3887     pavgb      xmm1, xmm3
3888     psrlw      xmm0, 8  // YUYV -> UVUV
3889     psrlw      xmm1, 8
3890     packuswb   xmm0, xmm1
3891     movdqa     xmm1, xmm0
3892     pand       xmm0, xmm5  // U
3893     packuswb   xmm0, xmm0
3894     psrlw      xmm1, 8  // V
3895     packuswb   xmm1, xmm1
3896     movq       qword ptr [edx], xmm0
3897     movq       qword ptr [edx + edi], xmm1
3898     lea        edx, [edx + 8]
3899     sub        ecx, 16
3900     jg         convertloop
3901 
3902     pop        edi
3903     pop        esi
3904     ret
3905   }
3906 }
3907 
3908 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
3909                                            uint8_t* dst_u,
3910                                            uint8_t* dst_v,
3911                                            int width) {
3912   __asm {
3913     push       edi
3914     mov        eax, [esp + 4 + 4]  // src_yuy2
3915     mov        edx, [esp + 4 + 8]  // dst_u
3916     mov        edi, [esp + 4 + 12]  // dst_v
3917     mov        ecx, [esp + 4 + 16]  // width
3918     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3919     psrlw      xmm5, 8
3920     sub        edi, edx
3921 
3922   convertloop:
3923     movdqu     xmm0, [eax]
3924     movdqu     xmm1, [eax + 16]
3925     lea        eax,  [eax + 32]
3926     psrlw      xmm0, 8  // YUYV -> UVUV
3927     psrlw      xmm1, 8
3928     packuswb   xmm0, xmm1
3929     movdqa     xmm1, xmm0
3930     pand       xmm0, xmm5  // U
3931     packuswb   xmm0, xmm0
3932     psrlw      xmm1, 8  // V
3933     packuswb   xmm1, xmm1
3934     movq       qword ptr [edx], xmm0
3935     movq       qword ptr [edx + edi], xmm1
3936     lea        edx, [edx + 8]
3937     sub        ecx, 16
3938     jg         convertloop
3939 
3940     pop        edi
3941     ret
3942   }
3943 }
3944 
3945 __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
3946                                        uint8_t* dst_y,
3947                                        int width) {
3948   __asm {
3949     mov        eax, [esp + 4]  // src_uyvy
3950     mov        edx, [esp + 8]  // dst_y
3951     mov        ecx, [esp + 12]  // width
3952 
3953   convertloop:
3954     movdqu     xmm0, [eax]
3955     movdqu     xmm1, [eax + 16]
3956     lea        eax,  [eax + 32]
3957     psrlw      xmm0, 8  // odd bytes are Y
3958     psrlw      xmm1, 8
3959     packuswb   xmm0, xmm1
3960     movdqu     [edx], xmm0
3961     lea        edx, [edx + 16]
3962     sub        ecx, 16
3963     jg         convertloop
3964     ret
3965   }
3966 }
3967 
3968 __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
3969                                         int stride_uyvy,
3970                                         uint8_t* dst_u,
3971                                         uint8_t* dst_v,
3972                                         int width) {
3973   __asm {
3974     push       esi
3975     push       edi
3976     mov        eax, [esp + 8 + 4]  // src_yuy2
3977     mov        esi, [esp + 8 + 8]  // stride_yuy2
3978     mov        edx, [esp + 8 + 12]  // dst_u
3979     mov        edi, [esp + 8 + 16]  // dst_v
3980     mov        ecx, [esp + 8 + 20]  // width
3981     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3982     psrlw      xmm5, 8
3983     sub        edi, edx
3984 
3985   convertloop:
3986     movdqu     xmm0, [eax]
3987     movdqu     xmm1, [eax + 16]
3988     movdqu     xmm2, [eax + esi]
3989     movdqu     xmm3, [eax + esi + 16]
3990     lea        eax,  [eax + 32]
3991     pavgb      xmm0, xmm2
3992     pavgb      xmm1, xmm3
3993     pand       xmm0, xmm5  // UYVY -> UVUV
3994     pand       xmm1, xmm5
3995     packuswb   xmm0, xmm1
3996     movdqa     xmm1, xmm0
3997     pand       xmm0, xmm5  // U
3998     packuswb   xmm0, xmm0
3999     psrlw      xmm1, 8  // V
4000     packuswb   xmm1, xmm1
4001     movq       qword ptr [edx], xmm0
4002     movq       qword ptr [edx + edi], xmm1
4003     lea        edx, [edx + 8]
4004     sub        ecx, 16
4005     jg         convertloop
4006 
4007     pop        edi
4008     pop        esi
4009     ret
4010   }
4011 }
4012 
4013 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4014                                            uint8_t* dst_u,
4015                                            uint8_t* dst_v,
4016                                            int width) {
4017   __asm {
4018     push       edi
4019     mov        eax, [esp + 4 + 4]  // src_yuy2
4020     mov        edx, [esp + 4 + 8]  // dst_u
4021     mov        edi, [esp + 4 + 12]  // dst_v
4022     mov        ecx, [esp + 4 + 16]  // width
4023     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4024     psrlw      xmm5, 8
4025     sub        edi, edx
4026 
4027   convertloop:
4028     movdqu     xmm0, [eax]
4029     movdqu     xmm1, [eax + 16]
4030     lea        eax,  [eax + 32]
4031     pand       xmm0, xmm5  // UYVY -> UVUV
4032     pand       xmm1, xmm5
4033     packuswb   xmm0, xmm1
4034     movdqa     xmm1, xmm0
4035     pand       xmm0, xmm5  // U
4036     packuswb   xmm0, xmm0
4037     psrlw      xmm1, 8  // V
4038     packuswb   xmm1, xmm1
4039     movq       qword ptr [edx], xmm0
4040     movq       qword ptr [edx + edi], xmm1
4041     lea        edx, [edx + 8]
4042     sub        ecx, 16
4043     jg         convertloop
4044 
4045     pop        edi
4046     ret
4047   }
4048 }
4049 #endif  // HAS_YUY2TOYROW_SSE2
4050 
4051 #ifdef HAS_BLENDPLANEROW_SSSE3
4052 // Blend 8 pixels at a time.
4053 // unsigned version of math
4054 // =((A2*C2)+(B2*(255-C2))+255)/256
4055 // signed version of math
4056 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4057 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
4058                                            const uint8_t* src1,
4059                                            const uint8_t* alpha,
4060                                            uint8_t* dst,
4061                                            int width) {
4062   __asm {
4063     push       esi
4064     push       edi
4065     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
4066     psllw      xmm5, 8
4067     mov        eax, 0x80808080  // 128 for biasing image to signed.
4068     movd       xmm6, eax
4069     pshufd     xmm6, xmm6, 0x00
4070 
4071     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
4072     movd       xmm7, eax
4073     pshufd     xmm7, xmm7, 0x00
4074     mov        eax, [esp + 8 + 4]  // src0
4075     mov        edx, [esp + 8 + 8]  // src1
4076     mov        esi, [esp + 8 + 12]  // alpha
4077     mov        edi, [esp + 8 + 16]  // dst
4078     mov        ecx, [esp + 8 + 20]  // width
4079     sub        eax, esi
4080     sub        edx, esi
4081     sub        edi, esi
4082 
4083         // 8 pixel loop.
4084   convertloop8:
4085     movq       xmm0, qword ptr [esi]  // alpha
4086     punpcklbw  xmm0, xmm0
4087     pxor       xmm0, xmm5  // a, 255-a
4088     movq       xmm1, qword ptr [eax + esi]  // src0
4089     movq       xmm2, qword ptr [edx + esi]  // src1
4090     punpcklbw  xmm1, xmm2
4091     psubb      xmm1, xmm6  // bias src0/1 - 128
4092     pmaddubsw  xmm0, xmm1
4093     paddw      xmm0, xmm7  // unbias result - 32768 and round.
4094     psrlw      xmm0, 8
4095     packuswb   xmm0, xmm0
4096     movq       qword ptr [edi + esi], xmm0
4097     lea        esi, [esi + 8]
4098     sub        ecx, 8
4099     jg         convertloop8
4100 
4101     pop        edi
4102     pop        esi
4103     ret
4104   }
4105 }
4106 #endif  // HAS_BLENDPLANEROW_SSSE3
4107 
4108 #ifdef HAS_BLENDPLANEROW_AVX2
4109 // Blend 32 pixels at a time.
4110 // unsigned version of math
4111 // =((A2*C2)+(B2*(255-C2))+255)/256
4112 // signed version of math
4113 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4114 __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
4115                                           const uint8_t* src1,
4116                                           const uint8_t* alpha,
4117                                           uint8_t* dst,
4118                                           int width) {
4119   __asm {
4120     push        esi
4121     push        edi
4122     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
4123     vpsllw      ymm5, ymm5, 8
4124     mov         eax, 0x80808080  // 128 for biasing image to signed.
4125     vmovd       xmm6, eax
4126     vbroadcastss ymm6, xmm6
4127     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
4128     vmovd       xmm7, eax
4129     vbroadcastss ymm7, xmm7
4130     mov         eax, [esp + 8 + 4]  // src0
4131     mov         edx, [esp + 8 + 8]  // src1
4132     mov         esi, [esp + 8 + 12]  // alpha
4133     mov         edi, [esp + 8 + 16]  // dst
4134     mov         ecx, [esp + 8 + 20]  // width
4135     sub         eax, esi
4136     sub         edx, esi
4137     sub         edi, esi
4138 
4139         // 32 pixel loop.
4140   convertloop32:
4141     vmovdqu     ymm0, [esi]  // alpha
4142     vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
4143     vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
4144     vpxor       ymm3, ymm3, ymm5  // a, 255-a
4145     vpxor       ymm0, ymm0, ymm5  // a, 255-a
4146     vmovdqu     ymm1, [eax + esi]  // src0
4147     vmovdqu     ymm2, [edx + esi]  // src1
4148     vpunpckhbw  ymm4, ymm1, ymm2
4149     vpunpcklbw  ymm1, ymm1, ymm2
4150     vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
4151     vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
4152     vpmaddubsw  ymm3, ymm3, ymm4
4153     vpmaddubsw  ymm0, ymm0, ymm1
4154     vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
4155     vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
4156     vpsrlw      ymm3, ymm3, 8
4157     vpsrlw      ymm0, ymm0, 8
4158     vpackuswb   ymm0, ymm0, ymm3
4159     vmovdqu     [edi + esi], ymm0
4160     lea         esi, [esi + 32]
4161     sub         ecx, 32
4162     jg          convertloop32
4163 
4164     pop         edi
4165     pop         esi
4166     vzeroupper
4167     ret
4168   }
4169 }
4170 #endif  // HAS_BLENDPLANEROW_AVX2
4171 
4172 #ifdef HAS_ARGBBLENDROW_SSSE3
4173 // Shuffle table for isolating alpha.
4174 static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
4175                                     11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4176 
4177 // Blend 8 pixels at a time.
4178 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
4179                                           const uint8_t* src_argb1,
4180                                           uint8_t* dst_argb,
4181                                           int width) {
4182   __asm {
4183     push       esi
4184     mov        eax, [esp + 4 + 4]  // src_argb0
4185     mov        esi, [esp + 4 + 8]  // src_argb1
4186     mov        edx, [esp + 4 + 12]  // dst_argb
4187     mov        ecx, [esp + 4 + 16]  // width
4188     pcmpeqb    xmm7, xmm7  // generate constant 0x0001
4189     psrlw      xmm7, 15
4190     pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
4191     psrlw      xmm6, 8
4192     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
4193     psllw      xmm5, 8
4194     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
4195     pslld      xmm4, 24
4196     sub        ecx, 4
4197     jl         convertloop4b  // less than 4 pixels?
4198 
4199         // 4 pixel loop.
4200   convertloop4:
4201     movdqu     xmm3, [eax]  // src argb
4202     lea        eax, [eax + 16]
4203     movdqa     xmm0, xmm3  // src argb
4204     pxor       xmm3, xmm4  // ~alpha
4205     movdqu     xmm2, [esi]  // _r_b
4206     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
4207     pand       xmm2, xmm6  // _r_b
4208     paddw      xmm3, xmm7  // 256 - alpha
4209     pmullw     xmm2, xmm3  // _r_b * alpha
4210     movdqu     xmm1, [esi]  // _a_g
4211     lea        esi, [esi + 16]
4212     psrlw      xmm1, 8  // _a_g
4213     por        xmm0, xmm4  // set alpha to 255
4214     pmullw     xmm1, xmm3  // _a_g * alpha
4215     psrlw      xmm2, 8  // _r_b convert to 8 bits again
4216     paddusb    xmm0, xmm2  // + src argb
4217     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
4218     paddusb    xmm0, xmm1  // + src argb
4219     movdqu     [edx], xmm0
4220     lea        edx, [edx + 16]
4221     sub        ecx, 4
4222     jge        convertloop4
4223 
4224   convertloop4b:
4225     add        ecx, 4 - 1
4226     jl         convertloop1b
4227 
4228             // 1 pixel loop.
4229   convertloop1:
4230     movd       xmm3, [eax]  // src argb
4231     lea        eax, [eax + 4]
4232     movdqa     xmm0, xmm3  // src argb
4233     pxor       xmm3, xmm4  // ~alpha
4234     movd       xmm2, [esi]  // _r_b
4235     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
4236     pand       xmm2, xmm6  // _r_b
4237     paddw      xmm3, xmm7  // 256 - alpha
4238     pmullw     xmm2, xmm3  // _r_b * alpha
4239     movd       xmm1, [esi]  // _a_g
4240     lea        esi, [esi + 4]
4241     psrlw      xmm1, 8  // _a_g
4242     por        xmm0, xmm4  // set alpha to 255
4243     pmullw     xmm1, xmm3  // _a_g * alpha
4244     psrlw      xmm2, 8  // _r_b convert to 8 bits again
4245     paddusb    xmm0, xmm2  // + src argb
4246     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
4247     paddusb    xmm0, xmm1  // + src argb
4248     movd       [edx], xmm0
4249     lea        edx, [edx + 4]
4250     sub        ecx, 1
4251     jge        convertloop1
4252 
4253   convertloop1b:
4254     pop        esi
4255     ret
4256   }
4257 }
4258 #endif  // HAS_ARGBBLENDROW_SSSE3
4259 
4260 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4261 // Shuffle table duplicating alpha.
4262 static const uvec8 kShuffleAlpha0 = {
4263     3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4264 };
4265 static const uvec8 kShuffleAlpha1 = {
4266     11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4267     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4268 };
4269 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
4270                                               uint8_t* dst_argb,
4271                                               int width) {
4272   __asm {
4273     mov        eax, [esp + 4]  // src_argb0
4274     mov        edx, [esp + 8]  // dst_argb
4275     mov        ecx, [esp + 12]  // width
4276     pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
4277     pslld      xmm3, 24
4278     movdqa     xmm4, xmmword ptr kShuffleAlpha0
4279     movdqa     xmm5, xmmword ptr kShuffleAlpha1
4280 
4281  convertloop:
4282     movdqu     xmm0, [eax]  // read 4 pixels
4283     pshufb     xmm0, xmm4  // isolate first 2 alphas
4284     movdqu     xmm1, [eax]  // read 4 pixels
4285     punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
4286     pmulhuw    xmm0, xmm1  // rgb * a
4287     movdqu     xmm1, [eax]  // read 4 pixels
4288     pshufb     xmm1, xmm5  // isolate next 2 alphas
4289     movdqu     xmm2, [eax]  // read 4 pixels
4290     punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
4291     pmulhuw    xmm1, xmm2  // rgb * a
4292     movdqu     xmm2, [eax]  // mask original alpha
4293     lea        eax, [eax + 16]
4294     pand       xmm2, xmm3
4295     psrlw      xmm0, 8
4296     psrlw      xmm1, 8
4297     packuswb   xmm0, xmm1
4298     por        xmm0, xmm2  // copy original alpha
4299     movdqu     [edx], xmm0
4300     lea        edx, [edx + 16]
4301     sub        ecx, 4
4302     jg         convertloop
4303 
4304     ret
4305   }
4306 }
4307 #endif  // HAS_ARGBATTENUATEROW_SSSE3
4308 
4309 #ifdef HAS_ARGBATTENUATEROW_AVX2
4310 // Shuffle table duplicating alpha.
4311 static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
4312                                          128u, 128u, 14u,  15u, 14u, 15u,
4313                                          14u,  15u,  128u, 128u};
4314 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
4315                                              uint8_t* dst_argb,
4316                                              int width) {
4317   __asm {
4318     mov        eax, [esp + 4]  // src_argb0
4319     mov        edx, [esp + 8]  // dst_argb
4320     mov        ecx, [esp + 12]  // width
4321     sub        edx, eax
4322     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4323     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
4324     vpslld     ymm5, ymm5, 24
4325 
4326  convertloop:
4327     vmovdqu    ymm6, [eax]  // read 8 pixels.
4328     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4329     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4330     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
4331     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
4332     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
4333     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
4334     vpand      ymm6, ymm6, ymm5  // isolate alpha
4335     vpsrlw     ymm0, ymm0, 8
4336     vpsrlw     ymm1, ymm1, 8
4337     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4338     vpor       ymm0, ymm0, ymm6  // copy original alpha
4339     vmovdqu    [eax + edx], ymm0
4340     lea        eax, [eax + 32]
4341     sub        ecx, 8
4342     jg         convertloop
4343 
4344     vzeroupper
4345     ret
4346   }
4347 }
4348 #endif  // HAS_ARGBATTENUATEROW_AVX2
4349 
4350 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4351 // Unattenuate 4 pixels at a time.
4352 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
4353                                                uint8_t* dst_argb,
4354                                                int width) {
4355   __asm {
4356     push       ebx
4357     push       esi
4358     push       edi
4359     mov        eax, [esp + 12 + 4]  // src_argb
4360     mov        edx, [esp + 12 + 8]  // dst_argb
4361     mov        ecx, [esp + 12 + 12]  // width
4362     lea        ebx, fixed_invtbl8
4363 
4364  convertloop:
4365     movdqu     xmm0, [eax]  // read 4 pixels
4366     movzx      esi, byte ptr [eax + 3]  // first alpha
4367     movzx      edi, byte ptr [eax + 7]  // second alpha
4368     punpcklbw  xmm0, xmm0  // first 2
4369     movd       xmm2, dword ptr [ebx + esi * 4]
4370     movd       xmm3, dword ptr [ebx + edi * 4]
4371     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
4372     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
4373     movlhps    xmm2, xmm3
4374     pmulhuw    xmm0, xmm2  // rgb * a
4375 
4376     movdqu     xmm1, [eax]  // read 4 pixels
4377     movzx      esi, byte ptr [eax + 11]  // third alpha
4378     movzx      edi, byte ptr [eax + 15]  // forth alpha
4379     punpckhbw  xmm1, xmm1  // next 2
4380     movd       xmm2, dword ptr [ebx + esi * 4]
4381     movd       xmm3, dword ptr [ebx + edi * 4]
4382     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
4383     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
4384     movlhps    xmm2, xmm3
4385     pmulhuw    xmm1, xmm2  // rgb * a
4386     lea        eax, [eax + 16]
4387     packuswb   xmm0, xmm1
4388     movdqu     [edx], xmm0
4389     lea        edx, [edx + 16]
4390     sub        ecx, 4
4391     jg         convertloop
4392 
4393     pop        edi
4394     pop        esi
4395     pop        ebx
4396     ret
4397   }
4398 }
4399 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
4400 
4401 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4402 // Shuffle table duplicating alpha.
4403 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4404     0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4405 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4406 // USE_GATHER is not on by default, due to being a slow instruction.
4407 #ifdef USE_GATHER
4408 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4409                                                uint8_t* dst_argb,
4410                                                int width) {
4411   __asm {
4412     mov        eax, [esp + 4]  // src_argb0
4413     mov        edx, [esp + 8]  // dst_argb
4414     mov        ecx, [esp + 12]  // width
4415     sub        edx, eax
4416     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4417 
4418  convertloop:
4419     vmovdqu    ymm6, [eax]  // read 8 pixels.
4420     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
4421     vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
4422     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4423     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4424     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
4425     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4426     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4427     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
4428     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
4429     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4430     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4431     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4432     vmovdqu    [eax + edx], ymm0
4433     lea        eax, [eax + 32]
4434     sub        ecx, 8
4435     jg         convertloop
4436 
4437     vzeroupper
4438     ret
4439   }
4440 }
4441 #else   // USE_GATHER
4442 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4443                                                uint8_t* dst_argb,
4444                                                int width) {
4445   __asm {
4446 
4447     push       ebx
4448     push       esi
4449     push       edi
4450     mov        eax, [esp + 12 + 4]  // src_argb
4451     mov        edx, [esp + 12 + 8]  // dst_argb
4452     mov        ecx, [esp + 12 + 12]  // width
4453     sub        edx, eax
4454     lea        ebx, fixed_invtbl8
4455     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4456 
4457  convertloop:
4458         // replace VPGATHER
4459     movzx      esi, byte ptr [eax + 3]  // alpha0
4460     movzx      edi, byte ptr [eax + 7]  // alpha1
4461     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
4462     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
4463     movzx      esi, byte ptr [eax + 11]  // alpha2
4464     movzx      edi, byte ptr [eax + 15]  // alpha3
4465     vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
4466     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
4467     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
4468     movzx      esi, byte ptr [eax + 19]  // alpha4
4469     movzx      edi, byte ptr [eax + 23]  // alpha5
4470     vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
4471     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
4472     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
4473     movzx      esi, byte ptr [eax + 27]  // alpha6
4474     movzx      edi, byte ptr [eax + 31]  // alpha7
4475     vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
4476     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
4477     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
4478     vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
4479     vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
4480     vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
4481     vinserti128 ymm3, ymm3, xmm0, 1                // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4482     // end of VPGATHER
4483 
4484     vmovdqu    ymm6, [eax]  // read 8 pixels.
4485     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4486     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4487     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4488     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4489     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
4490     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
4491     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4492     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4493     vpackuswb  ymm0, ymm0, ymm1             // unmutated.
4494     vmovdqu    [eax + edx], ymm0
4495     lea        eax, [eax + 32]
4496     sub        ecx, 8
4497     jg         convertloop
4498 
4499     pop        edi
4500     pop        esi
4501     pop        ebx
4502     vzeroupper
4503     ret
4504   }
4505 }
4506 #endif  // USE_GATHER
4507 #endif  // HAS_ARGBATTENUATEROW_AVX2
4508 
4509 #ifdef HAS_ARGBGRAYROW_SSSE3
4510 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4511 __declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
4512                                          uint8_t* dst_argb,
4513                                          int width) {
4514   __asm {
4515     mov        eax, [esp + 4] /* src_argb */
4516     mov        edx, [esp + 8] /* dst_argb */
4517     mov        ecx, [esp + 12] /* width */
4518     movdqa     xmm4, xmmword ptr kARGBToYJ
4519     movdqa     xmm5, xmmword ptr kAddYJ64
4520 
4521  convertloop:
4522     movdqu     xmm0, [eax]  // G
4523     movdqu     xmm1, [eax + 16]
4524     pmaddubsw  xmm0, xmm4
4525     pmaddubsw  xmm1, xmm4
4526     phaddw     xmm0, xmm1
4527     paddw      xmm0, xmm5  // Add .5 for rounding.
4528     psrlw      xmm0, 7
4529     packuswb   xmm0, xmm0  // 8 G bytes
4530     movdqu     xmm2, [eax]  // A
4531     movdqu     xmm3, [eax + 16]
4532     lea        eax, [eax + 32]
4533     psrld      xmm2, 24
4534     psrld      xmm3, 24
4535     packuswb   xmm2, xmm3
4536     packuswb   xmm2, xmm2  // 8 A bytes
4537     movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
4538     punpcklbw  xmm0, xmm0  // 8 GG words
4539     punpcklbw  xmm3, xmm2  // 8 GA words
4540     movdqa     xmm1, xmm0
4541     punpcklwd  xmm0, xmm3  // GGGA first 4
4542     punpckhwd  xmm1, xmm3  // GGGA next 4
4543     movdqu     [edx], xmm0
4544     movdqu     [edx + 16], xmm1
4545     lea        edx, [edx + 32]
4546     sub        ecx, 8
4547     jg         convertloop
4548     ret
4549   }
4550 }
4551 #endif  // HAS_ARGBGRAYROW_SSSE3
4552 
4553 #ifdef HAS_ARGBSEPIAROW_SSSE3
4554 //    b = (r * 35 + g * 68 + b * 17) >> 7
4555 //    g = (r * 45 + g * 88 + b * 22) >> 7
4556 //    r = (r * 50 + g * 98 + b * 24) >> 7
4557 // Constant for ARGB color to sepia tone.
4558 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
4559                                    17, 68, 35, 0, 17, 68, 35, 0};
4560 
4561 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
4562                                    22, 88, 45, 0, 22, 88, 45, 0};
4563 
4564 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
4565                                    24, 98, 50, 0, 24, 98, 50, 0};
4566 
4567 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4568 __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
4569   __asm {
4570     mov        eax, [esp + 4] /* dst_argb */
4571     mov        ecx, [esp + 8] /* width */
4572     movdqa     xmm2, xmmword ptr kARGBToSepiaB
4573     movdqa     xmm3, xmmword ptr kARGBToSepiaG
4574     movdqa     xmm4, xmmword ptr kARGBToSepiaR
4575 
4576  convertloop:
4577     movdqu     xmm0, [eax]  // B
4578     movdqu     xmm6, [eax + 16]
4579     pmaddubsw  xmm0, xmm2
4580     pmaddubsw  xmm6, xmm2
4581     phaddw     xmm0, xmm6
4582     psrlw      xmm0, 7
4583     packuswb   xmm0, xmm0  // 8 B values
4584     movdqu     xmm5, [eax]  // G
4585     movdqu     xmm1, [eax + 16]
4586     pmaddubsw  xmm5, xmm3
4587     pmaddubsw  xmm1, xmm3
4588     phaddw     xmm5, xmm1
4589     psrlw      xmm5, 7
4590     packuswb   xmm5, xmm5  // 8 G values
4591     punpcklbw  xmm0, xmm5  // 8 BG values
4592     movdqu     xmm5, [eax]  // R
4593     movdqu     xmm1, [eax + 16]
4594     pmaddubsw  xmm5, xmm4
4595     pmaddubsw  xmm1, xmm4
4596     phaddw     xmm5, xmm1
4597     psrlw      xmm5, 7
4598     packuswb   xmm5, xmm5  // 8 R values
4599     movdqu     xmm6, [eax]  // A
4600     movdqu     xmm1, [eax + 16]
4601     psrld      xmm6, 24
4602     psrld      xmm1, 24
4603     packuswb   xmm6, xmm1
4604     packuswb   xmm6, xmm6  // 8 A values
4605     punpcklbw  xmm5, xmm6  // 8 RA values
4606     movdqa     xmm1, xmm0  // Weave BG, RA together
4607     punpcklwd  xmm0, xmm5  // BGRA first 4
4608     punpckhwd  xmm1, xmm5  // BGRA next 4
4609     movdqu     [eax], xmm0
4610     movdqu     [eax + 16], xmm1
4611     lea        eax, [eax + 32]
4612     sub        ecx, 8
4613     jg         convertloop
4614     ret
4615   }
4616 }
4617 #endif  // HAS_ARGBSEPIAROW_SSSE3
4618 
4619 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4620 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
4621 // Same as Sepia except matrix is provided.
4622 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4623 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
4624 __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
4625                                                 uint8_t* dst_argb,
4626                                                 const int8_t* matrix_argb,
4627                                                 int width) {
4628   __asm {
4629     mov        eax, [esp + 4] /* src_argb */
4630     mov        edx, [esp + 8] /* dst_argb */
4631     mov        ecx, [esp + 12] /* matrix_argb */
4632     movdqu     xmm5, [ecx]
4633     pshufd     xmm2, xmm5, 0x00
4634     pshufd     xmm3, xmm5, 0x55
4635     pshufd     xmm4, xmm5, 0xaa
4636     pshufd     xmm5, xmm5, 0xff
4637     mov        ecx, [esp + 16] /* width */
4638 
4639  convertloop:
4640     movdqu     xmm0, [eax]  // B
4641     movdqu     xmm7, [eax + 16]
4642     pmaddubsw  xmm0, xmm2
4643     pmaddubsw  xmm7, xmm2
4644     movdqu     xmm6, [eax]  // G
4645     movdqu     xmm1, [eax + 16]
4646     pmaddubsw  xmm6, xmm3
4647     pmaddubsw  xmm1, xmm3
4648     phaddsw    xmm0, xmm7  // B
4649     phaddsw    xmm6, xmm1  // G
4650     psraw      xmm0, 6  // B
4651     psraw      xmm6, 6  // G
4652     packuswb   xmm0, xmm0  // 8 B values
4653     packuswb   xmm6, xmm6  // 8 G values
4654     punpcklbw  xmm0, xmm6  // 8 BG values
4655     movdqu     xmm1, [eax]  // R
4656     movdqu     xmm7, [eax + 16]
4657     pmaddubsw  xmm1, xmm4
4658     pmaddubsw  xmm7, xmm4
4659     phaddsw    xmm1, xmm7  // R
4660     movdqu     xmm6, [eax]  // A
4661     movdqu     xmm7, [eax + 16]
4662     pmaddubsw  xmm6, xmm5
4663     pmaddubsw  xmm7, xmm5
4664     phaddsw    xmm6, xmm7  // A
4665     psraw      xmm1, 6  // R
4666     psraw      xmm6, 6  // A
4667     packuswb   xmm1, xmm1  // 8 R values
4668     packuswb   xmm6, xmm6  // 8 A values
4669     punpcklbw  xmm1, xmm6  // 8 RA values
4670     movdqa     xmm6, xmm0  // Weave BG, RA together
4671     punpcklwd  xmm0, xmm1  // BGRA first 4
4672     punpckhwd  xmm6, xmm1  // BGRA next 4
4673     movdqu     [edx], xmm0
4674     movdqu     [edx + 16], xmm6
4675     lea        eax, [eax + 32]
4676     lea        edx, [edx + 32]
4677     sub        ecx, 8
4678     jg         convertloop
4679     ret
4680   }
4681 }
4682 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4683 
4684 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4685 // Quantize 4 ARGB pixels (16 bytes).
4686 __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
4687                                             int scale,
4688                                             int interval_size,
4689                                             int interval_offset,
4690                                             int width) {
4691   __asm {
4692     mov        eax, [esp + 4] /* dst_argb */
4693     movd       xmm2, [esp + 8] /* scale */
4694     movd       xmm3, [esp + 12] /* interval_size */
4695     movd       xmm4, [esp + 16] /* interval_offset */
4696     mov        ecx, [esp + 20] /* width */
4697     pshuflw    xmm2, xmm2, 040h
4698     pshufd     xmm2, xmm2, 044h
4699     pshuflw    xmm3, xmm3, 040h
4700     pshufd     xmm3, xmm3, 044h
4701     pshuflw    xmm4, xmm4, 040h
4702     pshufd     xmm4, xmm4, 044h
4703     pxor       xmm5, xmm5  // constant 0
4704     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
4705     pslld      xmm6, 24
4706 
4707  convertloop:
4708     movdqu     xmm0, [eax]  // read 4 pixels
4709     punpcklbw  xmm0, xmm5  // first 2 pixels
4710     pmulhuw    xmm0, xmm2  // pixel * scale >> 16
4711     movdqu     xmm1, [eax]  // read 4 pixels
4712     punpckhbw  xmm1, xmm5  // next 2 pixels
4713     pmulhuw    xmm1, xmm2
4714     pmullw     xmm0, xmm3  // * interval_size
4715     movdqu     xmm7, [eax]  // read 4 pixels
4716     pmullw     xmm1, xmm3
4717     pand       xmm7, xmm6  // mask alpha
4718     paddw      xmm0, xmm4  // + interval_size / 2
4719     paddw      xmm1, xmm4
4720     packuswb   xmm0, xmm1
4721     por        xmm0, xmm7
4722     movdqu     [eax], xmm0
4723     lea        eax, [eax + 16]
4724     sub        ecx, 4
4725     jg         convertloop
4726     ret
4727   }
4728 }
4729 #endif  // HAS_ARGBQUANTIZEROW_SSE2
4730 
4731 #ifdef HAS_ARGBSHADEROW_SSE2
4732 // Shade 4 pixels at a time by specified value.
4733 __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
4734                                          uint8_t* dst_argb,
4735                                          int width,
4736                                          uint32_t value) {
4737   __asm {
4738     mov        eax, [esp + 4]  // src_argb
4739     mov        edx, [esp + 8]  // dst_argb
4740     mov        ecx, [esp + 12]  // width
4741     movd       xmm2, [esp + 16]  // value
4742     punpcklbw  xmm2, xmm2
4743     punpcklqdq xmm2, xmm2
4744 
4745  convertloop:
4746     movdqu     xmm0, [eax]  // read 4 pixels
4747     lea        eax, [eax + 16]
4748     movdqa     xmm1, xmm0
4749     punpcklbw  xmm0, xmm0  // first 2
4750     punpckhbw  xmm1, xmm1  // next 2
4751     pmulhuw    xmm0, xmm2  // argb * value
4752     pmulhuw    xmm1, xmm2  // argb * value
4753     psrlw      xmm0, 8
4754     psrlw      xmm1, 8
4755     packuswb   xmm0, xmm1
4756     movdqu     [edx], xmm0
4757     lea        edx, [edx + 16]
4758     sub        ecx, 4
4759     jg         convertloop
4760 
4761     ret
4762   }
4763 }
4764 #endif  // HAS_ARGBSHADEROW_SSE2
4765 
4766 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4767 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4768 __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
4769                                             const uint8_t* src_argb1,
4770                                             uint8_t* dst_argb,
4771                                             int width) {
4772   __asm {
4773     push       esi
4774     mov        eax, [esp + 4 + 4]  // src_argb0
4775     mov        esi, [esp + 4 + 8]  // src_argb1
4776     mov        edx, [esp + 4 + 12]  // dst_argb
4777     mov        ecx, [esp + 4 + 16]  // width
4778     pxor       xmm5, xmm5  // constant 0
4779 
4780  convertloop:
4781     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
4782     movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
4783     movdqu     xmm1, xmm0
4784     movdqu     xmm3, xmm2
4785     punpcklbw  xmm0, xmm0  // first 2
4786     punpckhbw  xmm1, xmm1  // next 2
4787     punpcklbw  xmm2, xmm5  // first 2
4788     punpckhbw  xmm3, xmm5  // next 2
4789     pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
4790     pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
4791     lea        eax, [eax + 16]
4792     lea        esi, [esi + 16]
4793     packuswb   xmm0, xmm1
4794     movdqu     [edx], xmm0
4795     lea        edx, [edx + 16]
4796     sub        ecx, 4
4797     jg         convertloop
4798 
4799     pop        esi
4800     ret
4801   }
4802 }
4803 #endif  // HAS_ARGBMULTIPLYROW_SSE2
4804 
4805 #ifdef HAS_ARGBADDROW_SSE2
4806 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4807 // TODO(fbarchard): Port this to posix, neon and other math functions.
4808 __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
4809                                        const uint8_t* src_argb1,
4810                                        uint8_t* dst_argb,
4811                                        int width) {
4812   __asm {
4813     push       esi
4814     mov        eax, [esp + 4 + 4]  // src_argb0
4815     mov        esi, [esp + 4 + 8]  // src_argb1
4816     mov        edx, [esp + 4 + 12]  // dst_argb
4817     mov        ecx, [esp + 4 + 16]  // width
4818 
4819     sub        ecx, 4
4820     jl         convertloop49
4821 
4822  convertloop4:
4823     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
4824     lea        eax, [eax + 16]
4825     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
4826     lea        esi, [esi + 16]
4827     paddusb    xmm0, xmm1  // src_argb0 + src_argb1
4828     movdqu     [edx], xmm0
4829     lea        edx, [edx + 16]
4830     sub        ecx, 4
4831     jge        convertloop4
4832 
4833  convertloop49:
4834     add        ecx, 4 - 1
4835     jl         convertloop19
4836 
4837  convertloop1:
4838     movd       xmm0, [eax]  // read 1 pixels from src_argb0
4839     lea        eax, [eax + 4]
4840     movd       xmm1, [esi]  // read 1 pixels from src_argb1
4841     lea        esi, [esi + 4]
4842     paddusb    xmm0, xmm1  // src_argb0 + src_argb1
4843     movd       [edx], xmm0
4844     lea        edx, [edx + 4]
4845     sub        ecx, 1
4846     jge        convertloop1
4847 
4848  convertloop19:
4849     pop        esi
4850     ret
4851   }
4852 }
4853 #endif  // HAS_ARGBADDROW_SSE2
4854 
4855 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4856 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4857 __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
4858                                             const uint8_t* src_argb1,
4859                                             uint8_t* dst_argb,
4860                                             int width) {
4861   __asm {
4862     push       esi
4863     mov        eax, [esp + 4 + 4]  // src_argb0
4864     mov        esi, [esp + 4 + 8]  // src_argb1
4865     mov        edx, [esp + 4 + 12]  // dst_argb
4866     mov        ecx, [esp + 4 + 16]  // width
4867 
4868  convertloop:
4869     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
4870     lea        eax, [eax + 16]
4871     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
4872     lea        esi, [esi + 16]
4873     psubusb    xmm0, xmm1  // src_argb0 - src_argb1
4874     movdqu     [edx], xmm0
4875     lea        edx, [edx + 16]
4876     sub        ecx, 4
4877     jg         convertloop
4878 
4879     pop        esi
4880     ret
4881   }
4882 }
4883 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4884 
4885 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4886 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4887 __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
4888                                             const uint8_t* src_argb1,
4889                                             uint8_t* dst_argb,
4890                                             int width) {
4891   __asm {
4892     push       esi
4893     mov        eax, [esp + 4 + 4]  // src_argb0
4894     mov        esi, [esp + 4 + 8]  // src_argb1
4895     mov        edx, [esp + 4 + 12]  // dst_argb
4896     mov        ecx, [esp + 4 + 16]  // width
4897     vpxor      ymm5, ymm5, ymm5  // constant 0
4898 
4899  convertloop:
4900     vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
4901     lea        eax, [eax + 32]
4902     vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
4903     lea        esi, [esi + 32]
4904     vpunpcklbw ymm0, ymm1, ymm1  // low 4
4905     vpunpckhbw ymm1, ymm1, ymm1  // high 4
4906     vpunpcklbw ymm2, ymm3, ymm5  // low 4
4907     vpunpckhbw ymm3, ymm3, ymm5  // high 4
4908     vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
4909     vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
4910     vpackuswb  ymm0, ymm0, ymm1
4911     vmovdqu    [edx], ymm0
4912     lea        edx, [edx + 32]
4913     sub        ecx, 8
4914     jg         convertloop
4915 
4916     pop        esi
4917     vzeroupper
4918     ret
4919   }
4920 }
4921 #endif  // HAS_ARGBMULTIPLYROW_AVX2
4922 
4923 #ifdef HAS_ARGBADDROW_AVX2
4924 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4925 __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
4926                                        const uint8_t* src_argb1,
4927                                        uint8_t* dst_argb,
4928                                        int width) {
4929   __asm {
4930     push       esi
4931     mov        eax, [esp + 4 + 4]  // src_argb0
4932     mov        esi, [esp + 4 + 8]  // src_argb1
4933     mov        edx, [esp + 4 + 12]  // dst_argb
4934     mov        ecx, [esp + 4 + 16]  // width
4935 
4936  convertloop:
4937     vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
4938     lea        eax, [eax + 32]
4939     vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
4940     lea        esi, [esi + 32]
4941     vmovdqu    [edx], ymm0
4942     lea        edx, [edx + 32]
4943     sub        ecx, 8
4944     jg         convertloop
4945 
4946     pop        esi
4947     vzeroupper
4948     ret
4949   }
4950 }
4951 #endif  // HAS_ARGBADDROW_AVX2
4952 
4953 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4954 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4955 __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
4956                                             const uint8_t* src_argb1,
4957                                             uint8_t* dst_argb,
4958                                             int width) {
4959   __asm {
4960     push       esi
4961     mov        eax, [esp + 4 + 4]  // src_argb0
4962     mov        esi, [esp + 4 + 8]  // src_argb1
4963     mov        edx, [esp + 4 + 12]  // dst_argb
4964     mov        ecx, [esp + 4 + 16]  // width
4965 
4966  convertloop:
4967     vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
4968     lea        eax, [eax + 32]
4969     vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
4970     lea        esi, [esi + 32]
4971     vmovdqu    [edx], ymm0
4972     lea        edx, [edx + 32]
4973     sub        ecx, 8
4974     jg         convertloop
4975 
4976     pop        esi
4977     vzeroupper
4978     ret
4979   }
4980 }
4981 #endif  // HAS_ARGBSUBTRACTROW_AVX2
4982 
4983 #ifdef HAS_SOBELXROW_SSE2
4984 // SobelX as a matrix is
4985 // -1  0  1
4986 // -2  0  2
4987 // -1  0  1
4988 __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
4989                                       const uint8_t* src_y1,
4990                                       const uint8_t* src_y2,
4991                                       uint8_t* dst_sobelx,
4992                                       int width) {
4993   __asm {
4994     push       esi
4995     push       edi
4996     mov        eax, [esp + 8 + 4]  // src_y0
4997     mov        esi, [esp + 8 + 8]  // src_y1
4998     mov        edi, [esp + 8 + 12]  // src_y2
4999     mov        edx, [esp + 8 + 16]  // dst_sobelx
5000     mov        ecx, [esp + 8 + 20]  // width
5001     sub        esi, eax
5002     sub        edi, eax
5003     sub        edx, eax
5004     pxor       xmm5, xmm5  // constant 0
5005 
5006  convertloop:
5007     movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
5008     movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
5009     punpcklbw  xmm0, xmm5
5010     punpcklbw  xmm1, xmm5
5011     psubw      xmm0, xmm1
5012     movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
5013     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5014     punpcklbw  xmm1, xmm5
5015     punpcklbw  xmm2, xmm5
5016     psubw      xmm1, xmm2
5017     movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
5018     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
5019     punpcklbw  xmm2, xmm5
5020     punpcklbw  xmm3, xmm5
5021     psubw      xmm2, xmm3
5022     paddw      xmm0, xmm2
5023     paddw      xmm0, xmm1
5024     paddw      xmm0, xmm1
5025     pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5026     psubw      xmm1, xmm0
5027     pmaxsw     xmm0, xmm1
5028     packuswb   xmm0, xmm0
5029     movq       qword ptr [eax + edx], xmm0
5030     lea        eax, [eax + 8]
5031     sub        ecx, 8
5032     jg         convertloop
5033 
5034     pop        edi
5035     pop        esi
5036     ret
5037   }
5038 }
5039 #endif  // HAS_SOBELXROW_SSE2
5040 
5041 #ifdef HAS_SOBELYROW_SSE2
5042 // SobelY as a matrix is
5043 // -1 -2 -1
5044 //  0  0  0
5045 //  1  2  1
5046 __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
5047                                       const uint8_t* src_y1,
5048                                       uint8_t* dst_sobely,
5049                                       int width) {
5050   __asm {
5051     push       esi
5052     mov        eax, [esp + 4 + 4]  // src_y0
5053     mov        esi, [esp + 4 + 8]  // src_y1
5054     mov        edx, [esp + 4 + 12]  // dst_sobely
5055     mov        ecx, [esp + 4 + 16]  // width
5056     sub        esi, eax
5057     sub        edx, eax
5058     pxor       xmm5, xmm5  // constant 0
5059 
5060  convertloop:
5061     movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
5062     movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
5063     punpcklbw  xmm0, xmm5
5064     punpcklbw  xmm1, xmm5
5065     psubw      xmm0, xmm1
5066     movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
5067     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
5068     punpcklbw  xmm1, xmm5
5069     punpcklbw  xmm2, xmm5
5070     psubw      xmm1, xmm2
5071     movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
5072     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5073     punpcklbw  xmm2, xmm5
5074     punpcklbw  xmm3, xmm5
5075     psubw      xmm2, xmm3
5076     paddw      xmm0, xmm2
5077     paddw      xmm0, xmm1
5078     paddw      xmm0, xmm1
5079     pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5080     psubw      xmm1, xmm0
5081     pmaxsw     xmm0, xmm1
5082     packuswb   xmm0, xmm0
5083     movq       qword ptr [eax + edx], xmm0
5084     lea        eax, [eax + 8]
5085     sub        ecx, 8
5086     jg         convertloop
5087 
5088     pop        esi
5089     ret
5090   }
5091 }
5092 #endif  // HAS_SOBELYROW_SSE2
5093 
5094 #ifdef HAS_SOBELROW_SSE2
5095 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5096 // A = 255
5097 // R = Sobel
5098 // G = Sobel
5099 // B = Sobel
5100 __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
5101                                      const uint8_t* src_sobely,
5102                                      uint8_t* dst_argb,
5103                                      int width) {
5104   __asm {
5105     push       esi
5106     mov        eax, [esp + 4 + 4]  // src_sobelx
5107     mov        esi, [esp + 4 + 8]  // src_sobely
5108     mov        edx, [esp + 4 + 12]  // dst_argb
5109     mov        ecx, [esp + 4 + 16]  // width
5110     sub        esi, eax
5111     pcmpeqb    xmm5, xmm5  // alpha 255
5112     pslld      xmm5, 24  // 0xff000000
5113 
5114  convertloop:
5115     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
5116     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
5117     lea        eax, [eax + 16]
5118     paddusb    xmm0, xmm1  // sobel = sobelx + sobely
5119     movdqa     xmm2, xmm0  // GG
5120     punpcklbw  xmm2, xmm0  // First 8
5121     punpckhbw  xmm0, xmm0  // Next 8
5122     movdqa     xmm1, xmm2  // GGGG
5123     punpcklwd  xmm1, xmm2  // First 4
5124     punpckhwd  xmm2, xmm2  // Next 4
5125     por        xmm1, xmm5  // GGGA
5126     por        xmm2, xmm5
5127     movdqa     xmm3, xmm0  // GGGG
5128     punpcklwd  xmm3, xmm0  // Next 4
5129     punpckhwd  xmm0, xmm0  // Last 4
5130     por        xmm3, xmm5  // GGGA
5131     por        xmm0, xmm5
5132     movdqu     [edx], xmm1
5133     movdqu     [edx + 16], xmm2
5134     movdqu     [edx + 32], xmm3
5135     movdqu     [edx + 48], xmm0
5136     lea        edx, [edx + 64]
5137     sub        ecx, 16
5138     jg         convertloop
5139 
5140     pop        esi
5141     ret
5142   }
5143 }
5144 #endif  // HAS_SOBELROW_SSE2
5145 
5146 #ifdef HAS_SOBELTOPLANEROW_SSE2
5147 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5148 __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
5149                                             const uint8_t* src_sobely,
5150                                             uint8_t* dst_y,
5151                                             int width) {
5152   __asm {
5153     push       esi
5154     mov        eax, [esp + 4 + 4]  // src_sobelx
5155     mov        esi, [esp + 4 + 8]  // src_sobely
5156     mov        edx, [esp + 4 + 12]  // dst_argb
5157     mov        ecx, [esp + 4 + 16]  // width
5158     sub        esi, eax
5159 
5160  convertloop:
5161     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
5162     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
5163     lea        eax, [eax + 16]
5164     paddusb    xmm0, xmm1  // sobel = sobelx + sobely
5165     movdqu     [edx], xmm0
5166     lea        edx, [edx + 16]
5167     sub        ecx, 16
5168     jg         convertloop
5169 
5170     pop        esi
5171     ret
5172   }
5173 }
5174 #endif  // HAS_SOBELTOPLANEROW_SSE2
5175 
5176 #ifdef HAS_SOBELXYROW_SSE2
5177 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5178 // A = 255
5179 // R = Sobel X
5180 // G = Sobel
5181 // B = Sobel Y
5182 __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
5183                                        const uint8_t* src_sobely,
5184                                        uint8_t* dst_argb,
5185                                        int width) {
5186   __asm {
5187     push       esi
5188     mov        eax, [esp + 4 + 4]  // src_sobelx
5189     mov        esi, [esp + 4 + 8]  // src_sobely
5190     mov        edx, [esp + 4 + 12]  // dst_argb
5191     mov        ecx, [esp + 4 + 16]  // width
5192     sub        esi, eax
5193     pcmpeqb    xmm5, xmm5  // alpha 255
5194 
5195  convertloop:
5196     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
5197     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
5198     lea        eax, [eax + 16]
5199     movdqa     xmm2, xmm0
5200     paddusb    xmm2, xmm1  // sobel = sobelx + sobely
5201     movdqa     xmm3, xmm0  // XA
5202     punpcklbw  xmm3, xmm5
5203     punpckhbw  xmm0, xmm5
5204     movdqa     xmm4, xmm1  // YS
5205     punpcklbw  xmm4, xmm2
5206     punpckhbw  xmm1, xmm2
5207     movdqa     xmm6, xmm4  // YSXA
5208     punpcklwd  xmm6, xmm3  // First 4
5209     punpckhwd  xmm4, xmm3  // Next 4
5210     movdqa     xmm7, xmm1  // YSXA
5211     punpcklwd  xmm7, xmm0  // Next 4
5212     punpckhwd  xmm1, xmm0  // Last 4
5213     movdqu     [edx], xmm6
5214     movdqu     [edx + 16], xmm4
5215     movdqu     [edx + 32], xmm7
5216     movdqu     [edx + 48], xmm1
5217     lea        edx, [edx + 64]
5218     sub        ecx, 16
5219     jg         convertloop
5220 
5221     pop        esi
5222     ret
5223   }
5224 }
5225 #endif  // HAS_SOBELXYROW_SSE2
5226 
5227 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5228 // Consider float CumulativeSum.
5229 // Consider calling CumulativeSum one row at time as needed.
5230 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5231 // Convert cumulative sum for an area to an average for 1 pixel.
5232 // topleft is pointer to top left of CumulativeSum buffer for area.
5233 // botleft is pointer to bottom left of CumulativeSum buffer.
5234 // width is offset from left to right of area in CumulativeSum buffer measured
5235 //   in number of ints.
5236 // area is the number of pixels in the area being averaged.
5237 // dst points to pixel to store result to.
5238 // count is number of averaged pixels to produce.
5239 // Does 4 pixels at a time.
5240 // This function requires alignment on accumulation buffer pointers.
5241 void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
5242                                     const int32_t* botleft,
5243                                     int width,
5244                                     int area,
5245                                     uint8_t* dst,
5246                                     int count) {
5247   __asm {
5248     mov        eax, topleft  // eax topleft
5249     mov        esi, botleft  // esi botleft
5250     mov        edx, width
5251     movd       xmm5, area
5252     mov        edi, dst
5253     mov        ecx, count
5254     cvtdq2ps   xmm5, xmm5
5255     rcpss      xmm4, xmm5  // 1.0f / area
5256     pshufd     xmm4, xmm4, 0
5257     sub        ecx, 4
5258     jl         l4b
5259 
5260     cmp        area, 128  // 128 pixels will not overflow 15 bits.
5261     ja         l4
5262 
5263     pshufd     xmm5, xmm5, 0  // area
5264     pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
5265     psrld      xmm6, 16
5266     cvtdq2ps   xmm6, xmm6
5267     addps      xmm5, xmm6  // (65536.0 + area - 1)
5268     mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
5269     cvtps2dq   xmm5, xmm5  // 0.16 fixed point
5270     packssdw   xmm5, xmm5  // 16 bit shorts
5271 
5272         // 4 pixel loop small blocks.
5273   s4:
5274         // top left
5275     movdqu     xmm0, [eax]
5276     movdqu     xmm1, [eax + 16]
5277     movdqu     xmm2, [eax + 32]
5278     movdqu     xmm3, [eax + 48]
5279 
5280     // - top right
5281     psubd      xmm0, [eax + edx * 4]
5282     psubd      xmm1, [eax + edx * 4 + 16]
5283     psubd      xmm2, [eax + edx * 4 + 32]
5284     psubd      xmm3, [eax + edx * 4 + 48]
5285     lea        eax, [eax + 64]
5286 
5287     // - bottom left
5288     psubd      xmm0, [esi]
5289     psubd      xmm1, [esi + 16]
5290     psubd      xmm2, [esi + 32]
5291     psubd      xmm3, [esi + 48]
5292 
5293     // + bottom right
5294     paddd      xmm0, [esi + edx * 4]
5295     paddd      xmm1, [esi + edx * 4 + 16]
5296     paddd      xmm2, [esi + edx * 4 + 32]
5297     paddd      xmm3, [esi + edx * 4 + 48]
5298     lea        esi, [esi + 64]
5299 
5300     packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
5301     packssdw   xmm2, xmm3
5302 
5303     pmulhuw    xmm0, xmm5
5304     pmulhuw    xmm2, xmm5
5305 
5306     packuswb   xmm0, xmm2
5307     movdqu     [edi], xmm0
5308     lea        edi, [edi + 16]
5309     sub        ecx, 4
5310     jge        s4
5311 
5312     jmp        l4b
5313 
5314             // 4 pixel loop
5315   l4:
5316         // top left
5317     movdqu     xmm0, [eax]
5318     movdqu     xmm1, [eax + 16]
5319     movdqu     xmm2, [eax + 32]
5320     movdqu     xmm3, [eax + 48]
5321 
5322     // - top right
5323     psubd      xmm0, [eax + edx * 4]
5324     psubd      xmm1, [eax + edx * 4 + 16]
5325     psubd      xmm2, [eax + edx * 4 + 32]
5326     psubd      xmm3, [eax + edx * 4 + 48]
5327     lea        eax, [eax + 64]
5328 
5329     // - bottom left
5330     psubd      xmm0, [esi]
5331     psubd      xmm1, [esi + 16]
5332     psubd      xmm2, [esi + 32]
5333     psubd      xmm3, [esi + 48]
5334 
5335     // + bottom right
5336     paddd      xmm0, [esi + edx * 4]
5337     paddd      xmm1, [esi + edx * 4 + 16]
5338     paddd      xmm2, [esi + edx * 4 + 32]
5339     paddd      xmm3, [esi + edx * 4 + 48]
5340     lea        esi, [esi + 64]
5341 
5342     cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
5343     cvtdq2ps   xmm1, xmm1
5344     mulps      xmm0, xmm4
5345     mulps      xmm1, xmm4
5346     cvtdq2ps   xmm2, xmm2
5347     cvtdq2ps   xmm3, xmm3
5348     mulps      xmm2, xmm4
5349     mulps      xmm3, xmm4
5350     cvtps2dq   xmm0, xmm0
5351     cvtps2dq   xmm1, xmm1
5352     cvtps2dq   xmm2, xmm2
5353     cvtps2dq   xmm3, xmm3
5354     packssdw   xmm0, xmm1
5355     packssdw   xmm2, xmm3
5356     packuswb   xmm0, xmm2
5357     movdqu     [edi], xmm0
5358     lea        edi, [edi + 16]
5359     sub        ecx, 4
5360     jge        l4
5361 
5362   l4b:
5363     add        ecx, 4 - 1
5364     jl         l1b
5365 
5366             // 1 pixel loop
5367   l1:
5368     movdqu     xmm0, [eax]
5369     psubd      xmm0, [eax + edx * 4]
5370     lea        eax, [eax + 16]
5371     psubd      xmm0, [esi]
5372     paddd      xmm0, [esi + edx * 4]
5373     lea        esi, [esi + 16]
5374     cvtdq2ps   xmm0, xmm0
5375     mulps      xmm0, xmm4
5376     cvtps2dq   xmm0, xmm0
5377     packssdw   xmm0, xmm0
5378     packuswb   xmm0, xmm0
5379     movd       dword ptr [edi], xmm0
5380     lea        edi, [edi + 4]
5381     sub        ecx, 1
5382     jge        l1
5383   l1b:
5384   }
5385 }
5386 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5387 
5388 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5389 // Creates a table of cumulative sums where each value is a sum of all values
5390 // above and to the left of the value.
5391 void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
5392                                   int32_t* cumsum,
5393                                   const int32_t* previous_cumsum,
5394                                   int width) {
5395   __asm {
5396     mov        eax, row
5397     mov        edx, cumsum
5398     mov        esi, previous_cumsum
5399     mov        ecx, width
5400     pxor       xmm0, xmm0
5401     pxor       xmm1, xmm1
5402 
5403     sub        ecx, 4
5404     jl         l4b
5405     test       edx, 15
5406     jne        l4b
5407 
5408         // 4 pixel loop
5409   l4:
5410     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
5411     lea        eax, [eax + 16]
5412     movdqa     xmm4, xmm2
5413 
5414     punpcklbw  xmm2, xmm1
5415     movdqa     xmm3, xmm2
5416     punpcklwd  xmm2, xmm1
5417     punpckhwd  xmm3, xmm1
5418 
5419     punpckhbw  xmm4, xmm1
5420     movdqa     xmm5, xmm4
5421     punpcklwd  xmm4, xmm1
5422     punpckhwd  xmm5, xmm1
5423 
5424     paddd      xmm0, xmm2
5425     movdqu     xmm2, [esi]  // previous row above.
5426     paddd      xmm2, xmm0
5427 
5428     paddd      xmm0, xmm3
5429     movdqu     xmm3, [esi + 16]
5430     paddd      xmm3, xmm0
5431 
5432     paddd      xmm0, xmm4
5433     movdqu     xmm4, [esi + 32]
5434     paddd      xmm4, xmm0
5435 
5436     paddd      xmm0, xmm5
5437     movdqu     xmm5, [esi + 48]
5438     lea        esi, [esi + 64]
5439     paddd      xmm5, xmm0
5440 
5441     movdqu     [edx], xmm2
5442     movdqu     [edx + 16], xmm3
5443     movdqu     [edx + 32], xmm4
5444     movdqu     [edx + 48], xmm5
5445 
5446     lea        edx, [edx + 64]
5447     sub        ecx, 4
5448     jge        l4
5449 
5450   l4b:
5451     add        ecx, 4 - 1
5452     jl         l1b
5453 
5454             // 1 pixel loop
5455   l1:
5456     movd       xmm2, dword ptr [eax]  // 1 argb pixel
5457     lea        eax, [eax + 4]
5458     punpcklbw  xmm2, xmm1
5459     punpcklwd  xmm2, xmm1
5460     paddd      xmm0, xmm2
5461     movdqu     xmm2, [esi]
5462     lea        esi, [esi + 16]
5463     paddd      xmm2, xmm0
5464     movdqu     [edx], xmm2
5465     lea        edx, [edx + 16]
5466     sub        ecx, 1
5467     jge        l1
5468 
5469  l1b:
5470   }
5471 }
5472 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
5473 
5474 #ifdef HAS_ARGBAFFINEROW_SSE2
5475 // Copy ARGB pixels from source image with slope to a row of destination.
5476 __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
5477                                                      int src_argb_stride,
5478                                                      uint8_t* dst_argb,
5479                                                      const float* uv_dudv,
5480                                                      int width) {
5481   __asm {
5482     push       esi
5483     push       edi
5484     mov        eax, [esp + 12]  // src_argb
5485     mov        esi, [esp + 16]  // stride
5486     mov        edx, [esp + 20]  // dst_argb
5487     mov        ecx, [esp + 24]  // pointer to uv_dudv
5488     movq       xmm2, qword ptr [ecx]  // uv
5489     movq       xmm7, qword ptr [ecx + 8]  // dudv
5490     mov        ecx, [esp + 28]  // width
5491     shl        esi, 16  // 4, stride
5492     add        esi, 4
5493     movd       xmm5, esi
5494     sub        ecx, 4
5495     jl         l4b
5496 
5497         // setup for 4 pixel loop
5498     pshufd     xmm7, xmm7, 0x44  // dup dudv
5499     pshufd     xmm5, xmm5, 0  // dup 4, stride
5500     movdqa     xmm0, xmm2  // x0, y0, x1, y1
5501     addps      xmm0, xmm7
5502     movlhps    xmm2, xmm0
5503     movdqa     xmm4, xmm7
5504     addps      xmm4, xmm4  // dudv *= 2
5505     movdqa     xmm3, xmm2  // x2, y2, x3, y3
5506     addps      xmm3, xmm4
5507     addps      xmm4, xmm4  // dudv *= 4
5508 
5509         // 4 pixel loop
5510   l4:
5511     cvttps2dq  xmm0, xmm2  // x, y float to int first 2
5512     cvttps2dq  xmm1, xmm3  // x, y float to int next 2
5513     packssdw   xmm0, xmm1  // x, y as 8 shorts
5514     pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
5515     movd       esi, xmm0
5516     pshufd     xmm0, xmm0, 0x39  // shift right
5517     movd       edi, xmm0
5518     pshufd     xmm0, xmm0, 0x39  // shift right
5519     movd       xmm1, [eax + esi]  // read pixel 0
5520     movd       xmm6, [eax + edi]  // read pixel 1
5521     punpckldq  xmm1, xmm6  // combine pixel 0 and 1
5522     addps      xmm2, xmm4  // x, y += dx, dy first 2
5523     movq       qword ptr [edx], xmm1
5524     movd       esi, xmm0
5525     pshufd     xmm0, xmm0, 0x39  // shift right
5526     movd       edi, xmm0
5527     movd       xmm6, [eax + esi]  // read pixel 2
5528     movd       xmm0, [eax + edi]  // read pixel 3
5529     punpckldq  xmm6, xmm0  // combine pixel 2 and 3
5530     addps      xmm3, xmm4  // x, y += dx, dy next 2
5531     movq       qword ptr 8[edx], xmm6
5532     lea        edx, [edx + 16]
5533     sub        ecx, 4
5534     jge        l4
5535 
5536   l4b:
5537     add        ecx, 4 - 1
5538     jl         l1b
5539 
5540             // 1 pixel loop
5541   l1:
5542     cvttps2dq  xmm0, xmm2  // x, y float to int
5543     packssdw   xmm0, xmm0  // x, y as shorts
5544     pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
5545     addps      xmm2, xmm7  // x, y += dx, dy
5546     movd       esi, xmm0
5547     movd       xmm0, [eax + esi]  // copy a pixel
5548     movd       [edx], xmm0
5549     lea        edx, [edx + 4]
5550     sub        ecx, 1
5551     jge        l1
5552   l1b:
5553     pop        edi
5554     pop        esi
5555     ret
5556   }
5557 }
5558 #endif  // HAS_ARGBAFFINEROW_SSE2
5559 
5560 #ifdef HAS_INTERPOLATEROW_AVX2
5561 // Bilinear filter 32x2 -> 32x1
5562 __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
5563                                            const uint8_t* src_ptr,
5564                                            ptrdiff_t src_stride,
5565                                            int dst_width,
5566                                            int source_y_fraction) {
5567   __asm {
5568     push       esi
5569     push       edi
5570     mov        edi, [esp + 8 + 4]  // dst_ptr
5571     mov        esi, [esp + 8 + 8]  // src_ptr
5572     mov        edx, [esp + 8 + 12]  // src_stride
5573     mov        ecx, [esp + 8 + 16]  // dst_width
5574     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5575     // Dispatch to specialized filters if applicable.
5576     cmp        eax, 0
5577     je         xloop100  // 0 / 256.  Blend 100 / 0.
5578     sub        edi, esi
5579     cmp        eax, 128
5580     je         xloop50  // 128 /256 is 0.50.  Blend 50 / 50.
5581 
5582     vmovd      xmm0, eax  // high fraction 0..255
5583     neg        eax
5584     add        eax, 256
5585     vmovd      xmm5, eax  // low fraction 256..1
5586     vpunpcklbw xmm5, xmm5, xmm0
5587     vpunpcklwd xmm5, xmm5, xmm5
5588     vbroadcastss ymm5, xmm5
5589 
5590     mov        eax, 0x80808080  // 128b for bias and rounding.
5591     vmovd      xmm4, eax
5592     vbroadcastss ymm4, xmm4
5593 
5594   xloop:
5595     vmovdqu    ymm0, [esi]
5596     vmovdqu    ymm2, [esi + edx]
5597     vpunpckhbw ymm1, ymm0, ymm2  // mutates
5598     vpunpcklbw ymm0, ymm0, ymm2
5599     vpsubb     ymm1, ymm1, ymm4  // bias to signed image
5600     vpsubb     ymm0, ymm0, ymm4
5601     vpmaddubsw ymm1, ymm5, ymm1
5602     vpmaddubsw ymm0, ymm5, ymm0
5603     vpaddw     ymm1, ymm1, ymm4  // unbias and round
5604     vpaddw     ymm0, ymm0, ymm4
5605     vpsrlw     ymm1, ymm1, 8
5606     vpsrlw     ymm0, ymm0, 8
5607     vpackuswb  ymm0, ymm0, ymm1            // unmutates
5608     vmovdqu    [esi + edi], ymm0
5609     lea        esi, [esi + 32]
5610     sub        ecx, 32
5611     jg         xloop
5612     jmp        xloop99
5613 
5614         // Blend 50 / 50.
5615  xloop50:
5616    vmovdqu    ymm0, [esi]
5617    vpavgb     ymm0, ymm0, [esi + edx]
5618    vmovdqu    [esi + edi], ymm0
5619    lea        esi, [esi + 32]
5620    sub        ecx, 32
5621    jg         xloop50
5622    jmp        xloop99
5623 
5624         // Blend 100 / 0 - Copy row unchanged.
5625  xloop100:
5626    rep movsb
5627 
5628   xloop99:
5629     pop        edi
5630     pop        esi
5631     vzeroupper
5632     ret
5633   }
5634 }
5635 #endif  // HAS_INTERPOLATEROW_AVX2
5636 
5637 // Bilinear filter 16x2 -> 16x1
5638 // TODO(fbarchard): Consider allowing 256 using memcpy.
5639 __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
5640                                             const uint8_t* src_ptr,
5641                                             ptrdiff_t src_stride,
5642                                             int dst_width,
5643                                             int source_y_fraction) {
5644   __asm {
5645     push       esi
5646     push       edi
5647 
5648     mov        edi, [esp + 8 + 4]  // dst_ptr
5649     mov        esi, [esp + 8 + 8]  // src_ptr
5650     mov        edx, [esp + 8 + 12]  // src_stride
5651     mov        ecx, [esp + 8 + 16]  // dst_width
5652     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5653     sub        edi, esi
5654         // Dispatch to specialized filters if applicable.
5655     cmp        eax, 0
5656     je         xloop100  // 0 /256.  Blend 100 / 0.
5657     cmp        eax, 128
5658     je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
5659 
5660     movd       xmm0, eax  // high fraction 0..255
5661     neg        eax
5662     add        eax, 256
5663     movd       xmm5, eax  // low fraction 255..1
5664     punpcklbw  xmm5, xmm0
5665     punpcklwd  xmm5, xmm5
5666     pshufd     xmm5, xmm5, 0
5667     mov        eax, 0x80808080  // 128 for biasing image to signed.
5668     movd       xmm4, eax
5669     pshufd     xmm4, xmm4, 0x00
5670 
5671   xloop:
5672     movdqu     xmm0, [esi]
5673     movdqu     xmm2, [esi + edx]
5674     movdqu     xmm1, xmm0
5675     punpcklbw  xmm0, xmm2
5676     punpckhbw  xmm1, xmm2
5677     psubb      xmm0, xmm4            // bias image by -128
5678     psubb      xmm1, xmm4
5679     movdqa     xmm2, xmm5
5680     movdqa     xmm3, xmm5
5681     pmaddubsw  xmm2, xmm0
5682     pmaddubsw  xmm3, xmm1
5683     paddw      xmm2, xmm4
5684     paddw      xmm3, xmm4
5685     psrlw      xmm2, 8
5686     psrlw      xmm3, 8
5687     packuswb   xmm2, xmm3
5688     movdqu     [esi + edi], xmm2
5689     lea        esi, [esi + 16]
5690     sub        ecx, 16
5691     jg         xloop
5692     jmp        xloop99
5693 
5694         // Blend 50 / 50.
5695   xloop50:
5696     movdqu     xmm0, [esi]
5697     movdqu     xmm1, [esi + edx]
5698     pavgb      xmm0, xmm1
5699     movdqu     [esi + edi], xmm0
5700     lea        esi, [esi + 16]
5701     sub        ecx, 16
5702     jg         xloop50
5703     jmp        xloop99
5704 
5705         // Blend 100 / 0 - Copy row unchanged.
5706   xloop100:
5707     movdqu     xmm0, [esi]
5708     movdqu     [esi + edi], xmm0
5709     lea        esi, [esi + 16]
5710     sub        ecx, 16
5711     jg         xloop100
5712 
5713   xloop99:
5714     pop        edi
5715     pop        esi
5716     ret
5717   }
5718 }
5719 
5720 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5721 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
5722                                             uint8_t* dst_argb,
5723                                             const uint8_t* shuffler,
5724                                             int width) {
5725   __asm {
5726     mov        eax, [esp + 4]  // src_argb
5727     mov        edx, [esp + 8]  // dst_argb
5728     mov        ecx, [esp + 12]  // shuffler
5729     movdqu     xmm5, [ecx]
5730     mov        ecx, [esp + 16]  // width
5731 
5732   wloop:
5733     movdqu     xmm0, [eax]
5734     movdqu     xmm1, [eax + 16]
5735     lea        eax, [eax + 32]
5736     pshufb     xmm0, xmm5
5737     pshufb     xmm1, xmm5
5738     movdqu     [edx], xmm0
5739     movdqu     [edx + 16], xmm1
5740     lea        edx, [edx + 32]
5741     sub        ecx, 8
5742     jg         wloop
5743     ret
5744   }
5745 }
5746 
5747 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5748 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
5749                                            uint8_t* dst_argb,
5750                                            const uint8_t* shuffler,
5751                                            int width) {
5752   __asm {
5753     mov        eax, [esp + 4]  // src_argb
5754     mov        edx, [esp + 8]  // dst_argb
5755     mov        ecx, [esp + 12]  // shuffler
5756     vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
5757     mov        ecx, [esp + 16]  // width
5758 
5759   wloop:
5760     vmovdqu    ymm0, [eax]
5761     vmovdqu    ymm1, [eax + 32]
5762     lea        eax, [eax + 64]
5763     vpshufb    ymm0, ymm0, ymm5
5764     vpshufb    ymm1, ymm1, ymm5
5765     vmovdqu    [edx], ymm0
5766     vmovdqu    [edx + 32], ymm1
5767     lea        edx, [edx + 64]
5768     sub        ecx, 16
5769     jg         wloop
5770 
5771     vzeroupper
5772     ret
5773   }
5774 }
5775 #endif  // HAS_ARGBSHUFFLEROW_AVX2
5776 
5777 // YUY2 - Macro-pixel = 2 image pixels
5778 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5779 
5780 // UYVY - Macro-pixel = 2 image pixels
5781 // U0Y0V0Y1
5782 
5783 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
5784                                           const uint8_t* src_u,
5785                                           const uint8_t* src_v,
5786                                           uint8_t* dst_frame,
5787                                           int width) {
5788   __asm {
5789     push       esi
5790     push       edi
5791     mov        eax, [esp + 8 + 4]  // src_y
5792     mov        esi, [esp + 8 + 8]  // src_u
5793     mov        edx, [esp + 8 + 12]  // src_v
5794     mov        edi, [esp + 8 + 16]  // dst_frame
5795     mov        ecx, [esp + 8 + 20]  // width
5796     sub        edx, esi
5797 
5798   convertloop:
5799     movq       xmm2, qword ptr [esi]  // U
5800     movq       xmm3, qword ptr [esi + edx]  // V
5801     lea        esi, [esi + 8]
5802     punpcklbw  xmm2, xmm3  // UV
5803     movdqu     xmm0, [eax]  // Y
5804     lea        eax, [eax + 16]
5805     movdqa     xmm1, xmm0
5806     punpcklbw  xmm0, xmm2  // YUYV
5807     punpckhbw  xmm1, xmm2
5808     movdqu     [edi], xmm0
5809     movdqu     [edi + 16], xmm1
5810     lea        edi, [edi + 32]
5811     sub        ecx, 16
5812     jg         convertloop
5813 
5814     pop        edi
5815     pop        esi
5816     ret
5817   }
5818 }
5819 
5820 __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
5821                                           const uint8_t* src_u,
5822                                           const uint8_t* src_v,
5823                                           uint8_t* dst_frame,
5824                                           int width) {
5825   __asm {
5826     push       esi
5827     push       edi
5828     mov        eax, [esp + 8 + 4]  // src_y
5829     mov        esi, [esp + 8 + 8]  // src_u
5830     mov        edx, [esp + 8 + 12]  // src_v
5831     mov        edi, [esp + 8 + 16]  // dst_frame
5832     mov        ecx, [esp + 8 + 20]  // width
5833     sub        edx, esi
5834 
5835   convertloop:
5836     movq       xmm2, qword ptr [esi]  // U
5837     movq       xmm3, qword ptr [esi + edx]  // V
5838     lea        esi, [esi + 8]
5839     punpcklbw  xmm2, xmm3  // UV
5840     movdqu     xmm0, [eax]  // Y
5841     movdqa     xmm1, xmm2
5842     lea        eax, [eax + 16]
5843     punpcklbw  xmm1, xmm0  // UYVY
5844     punpckhbw  xmm2, xmm0
5845     movdqu     [edi], xmm1
5846     movdqu     [edi + 16], xmm2
5847     lea        edi, [edi + 32]
5848     sub        ecx, 16
5849     jg         convertloop
5850 
5851     pop        edi
5852     pop        esi
5853     ret
5854   }
5855 }
5856 
5857 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
5858 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
5859                                               uint8_t* dst_argb,
5860                                               const float* poly,
5861                                               int width) {
5862   __asm {
5863     push       esi
5864     mov        eax, [esp + 4 + 4] /* src_argb */
5865     mov        edx, [esp + 4 + 8] /* dst_argb */
5866     mov        esi, [esp + 4 + 12] /* poly */
5867     mov        ecx, [esp + 4 + 16] /* width */
5868     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
5869 
5870         // 2 pixel loop.
5871  convertloop:
5872         //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
5873         //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
5874     movq       xmm0, qword ptr [eax]  // BGRABGRA
5875     lea        eax, [eax + 8]
5876     punpcklbw  xmm0, xmm3
5877     movdqa     xmm4, xmm0
5878     punpcklwd  xmm0, xmm3  // pixel 0
5879     punpckhwd  xmm4, xmm3  // pixel 1
5880     cvtdq2ps   xmm0, xmm0  // 4 floats
5881     cvtdq2ps   xmm4, xmm4
5882     movdqa     xmm1, xmm0  // X
5883     movdqa     xmm5, xmm4
5884     mulps      xmm0, [esi + 16]  // C1 * X
5885     mulps      xmm4, [esi + 16]
5886     addps      xmm0, [esi]  // result = C0 + C1 * X
5887     addps      xmm4, [esi]
5888     movdqa     xmm2, xmm1
5889     movdqa     xmm6, xmm5
5890     mulps      xmm2, xmm1  // X * X
5891     mulps      xmm6, xmm5
5892     mulps      xmm1, xmm2  // X * X * X
5893     mulps      xmm5, xmm6
5894     mulps      xmm2, [esi + 32]  // C2 * X * X
5895     mulps      xmm6, [esi + 32]
5896     mulps      xmm1, [esi + 48]  // C3 * X * X * X
5897     mulps      xmm5, [esi + 48]
5898     addps      xmm0, xmm2  // result += C2 * X * X
5899     addps      xmm4, xmm6
5900     addps      xmm0, xmm1  // result += C3 * X * X * X
5901     addps      xmm4, xmm5
5902     cvttps2dq  xmm0, xmm0
5903     cvttps2dq  xmm4, xmm4
5904     packuswb   xmm0, xmm4
5905     packuswb   xmm0, xmm0
5906     movq       qword ptr [edx], xmm0
5907     lea        edx, [edx + 8]
5908     sub        ecx, 2
5909     jg         convertloop
5910     pop        esi
5911     ret
5912   }
5913 }
5914 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5915 
5916 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
5917 __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
5918                                               uint8_t* dst_argb,
5919                                               const float* poly,
5920                                               int width) {
5921   __asm {
5922     mov        eax, [esp + 4] /* src_argb */
5923     mov        edx, [esp + 8] /* dst_argb */
5924     mov        ecx, [esp + 12] /* poly */
5925     vbroadcastf128 ymm4, [ecx]  // C0
5926     vbroadcastf128 ymm5, [ecx + 16]  // C1
5927     vbroadcastf128 ymm6, [ecx + 32]  // C2
5928     vbroadcastf128 ymm7, [ecx + 48]  // C3
5929     mov        ecx, [esp + 16] /* width */
5930 
5931     // 2 pixel loop.
5932  convertloop:
5933     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
5934     lea         eax, [eax + 8]
5935     vcvtdq2ps   ymm0, ymm0  // X 8 floats
5936     vmulps      ymm2, ymm0, ymm0  // X * X
5937     vmulps      ymm3, ymm0, ymm7  // C3 * X
5938     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
5939     vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
5940     vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
5941     vcvttps2dq  ymm0, ymm0
5942     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
5943     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
5944     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
5945     vmovq       qword ptr [edx], xmm0
5946     lea         edx, [edx + 8]
5947     sub         ecx, 2
5948     jg          convertloop
5949     vzeroupper
5950     ret
5951   }
5952 }
5953 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5954 
5955 #ifdef HAS_HALFFLOATROW_SSE2
5956 static float kExpBias = 1.9259299444e-34f;
5957 __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
5958                                          uint16_t* dst,
5959                                          float scale,
5960                                          int width) {
5961   __asm {
5962     mov        eax, [esp + 4] /* src */
5963     mov        edx, [esp + 8] /* dst */
5964     movd       xmm4, dword ptr [esp + 12] /* scale */
5965     mov        ecx, [esp + 16] /* width */
5966     mulss      xmm4, kExpBias
5967     pshufd     xmm4, xmm4, 0
5968     pxor       xmm5, xmm5
5969     sub        edx, eax
5970 
5971         // 8 pixel loop.
5972  convertloop:
5973     movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
5974     add         eax, 16
5975     movdqa      xmm3, xmm2
5976     punpcklwd   xmm2, xmm5
5977     cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
5978     punpckhwd   xmm3, xmm5
5979     cvtdq2ps    xmm3, xmm3
5980     mulps       xmm2, xmm4
5981     mulps       xmm3, xmm4
5982     psrld       xmm2, 13
5983     psrld       xmm3, 13
5984     packssdw    xmm2, xmm3
5985     movdqu      [eax + edx - 16], xmm2
5986     sub         ecx, 8
5987     jg          convertloop
5988     ret
5989   }
5990 }
5991 #endif  // HAS_HALFFLOATROW_SSE2
5992 
5993 #ifdef HAS_HALFFLOATROW_AVX2
5994 __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
5995                                          uint16_t* dst,
5996                                          float scale,
5997                                          int width) {
5998   __asm {
5999     mov        eax, [esp + 4] /* src */
6000     mov        edx, [esp + 8] /* dst */
6001     movd       xmm4, dword ptr [esp + 12] /* scale */
6002     mov        ecx, [esp + 16] /* width */
6003 
6004     vmulss     xmm4, xmm4, kExpBias
6005     vbroadcastss ymm4, xmm4
6006     vpxor      ymm5, ymm5, ymm5
6007     sub        edx, eax
6008 
6009         // 16 pixel loop.
6010  convertloop:
6011     vmovdqu     ymm2, [eax]  // 16 shorts
6012     add         eax, 32
6013     vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
6014     vpunpcklwd  ymm2, ymm2, ymm5
6015     vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
6016     vcvtdq2ps   ymm2, ymm2
6017     vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
6018     vmulps      ymm2, ymm2, ymm4
6019     vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
6020     vpsrld      ymm2, ymm2, 13
6021     vpackssdw   ymm2, ymm2, ymm3
6022     vmovdqu     [eax + edx - 32], ymm2
6023     sub         ecx, 16
6024     jg          convertloop
6025     vzeroupper
6026     ret
6027   }
6028 }
6029 #endif  // HAS_HALFFLOATROW_AVX2
6030 
6031 #ifdef HAS_HALFFLOATROW_F16C
6032 __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
6033                                          uint16_t* dst,
6034                                          float scale,
6035                                          int width) {
6036   __asm {
6037     mov        eax, [esp + 4] /* src */
6038     mov        edx, [esp + 8] /* dst */
6039     vbroadcastss ymm4, [esp + 12] /* scale */
6040     mov        ecx, [esp + 16] /* width */
6041     sub        edx, eax
6042 
6043         // 16 pixel loop.
6044  convertloop:
6045     vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
6046     vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
6047     add         eax, 32
6048     vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
6049     vcvtdq2ps   ymm3, ymm3
6050     vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
6051     vmulps      ymm3, ymm3, ymm4
6052     vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
6053     vcvtps2ph   xmm3, ymm3, 3
6054     vmovdqu     [eax + edx + 32], xmm2
6055     vmovdqu     [eax + edx + 32 + 16], xmm3
6056     sub         ecx, 16
6057     jg          convertloop
6058     vzeroupper
6059     ret
6060   }
6061 }
6062 #endif  // HAS_HALFFLOATROW_F16C
6063 
6064 #ifdef HAS_ARGBCOLORTABLEROW_X86
6065 // Tranform ARGB pixels with color table.
6066 __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
6067                                              const uint8_t* table_argb,
6068                                              int width) {
6069   __asm {
6070     push       esi
6071     mov        eax, [esp + 4 + 4] /* dst_argb */
6072     mov        esi, [esp + 4 + 8] /* table_argb */
6073     mov        ecx, [esp + 4 + 12] /* width */
6074 
6075     // 1 pixel loop.
6076   convertloop:
6077     movzx      edx, byte ptr [eax]
6078     lea        eax, [eax + 4]
6079     movzx      edx, byte ptr [esi + edx * 4]
6080     mov        byte ptr [eax - 4], dl
6081     movzx      edx, byte ptr [eax - 4 + 1]
6082     movzx      edx, byte ptr [esi + edx * 4 + 1]
6083     mov        byte ptr [eax - 4 + 1], dl
6084     movzx      edx, byte ptr [eax - 4 + 2]
6085     movzx      edx, byte ptr [esi + edx * 4 + 2]
6086     mov        byte ptr [eax - 4 + 2], dl
6087     movzx      edx, byte ptr [eax - 4 + 3]
6088     movzx      edx, byte ptr [esi + edx * 4 + 3]
6089     mov        byte ptr [eax - 4 + 3], dl
6090     dec        ecx
6091     jg         convertloop
6092     pop        esi
6093     ret
6094   }
6095 }
6096 #endif  // HAS_ARGBCOLORTABLEROW_X86
6097 
6098 #ifdef HAS_RGBCOLORTABLEROW_X86
6099 // Tranform RGB pixels with color table.
6100 __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
6101                                             const uint8_t* table_argb,
6102                                             int width) {
6103   __asm {
6104     push       esi
6105     mov        eax, [esp + 4 + 4] /* dst_argb */
6106     mov        esi, [esp + 4 + 8] /* table_argb */
6107     mov        ecx, [esp + 4 + 12] /* width */
6108 
6109     // 1 pixel loop.
6110   convertloop:
6111     movzx      edx, byte ptr [eax]
6112     lea        eax, [eax + 4]
6113     movzx      edx, byte ptr [esi + edx * 4]
6114     mov        byte ptr [eax - 4], dl
6115     movzx      edx, byte ptr [eax - 4 + 1]
6116     movzx      edx, byte ptr [esi + edx * 4 + 1]
6117     mov        byte ptr [eax - 4 + 1], dl
6118     movzx      edx, byte ptr [eax - 4 + 2]
6119     movzx      edx, byte ptr [esi + edx * 4 + 2]
6120     mov        byte ptr [eax - 4 + 2], dl
6121     dec        ecx
6122     jg         convertloop
6123 
6124     pop        esi
6125     ret
6126   }
6127 }
6128 #endif  // HAS_RGBCOLORTABLEROW_X86
6129 
6130 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6131 // Tranform RGB pixels with luma table.
6132 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
6133                                                    uint8_t* dst_argb,
6134                                                    int width,
6135                                                    const uint8_t* luma,
6136                                                    uint32_t lumacoeff) {
6137   __asm {
6138     push       esi
6139     push       edi
6140     mov        eax, [esp + 8 + 4] /* src_argb */
6141     mov        edi, [esp + 8 + 8] /* dst_argb */
6142     mov        ecx, [esp + 8 + 12] /* width */
6143     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
6144     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
6145     pshufd     xmm2, xmm2, 0
6146     pshufd     xmm3, xmm3, 0
6147     pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
6148     psllw      xmm4, 8
6149     pxor       xmm5, xmm5
6150 
6151         // 4 pixel loop.
6152   convertloop:
6153     movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
6154     pmaddubsw  xmm0, xmm3
6155     phaddw     xmm0, xmm0
6156     pand       xmm0, xmm4  // mask out low bits
6157     punpcklwd  xmm0, xmm5
6158     paddd      xmm0, xmm2  // add table base
6159     movd       esi, xmm0
6160     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6161 
6162     movzx      edx, byte ptr [eax]
6163     movzx      edx, byte ptr [esi + edx]
6164     mov        byte ptr [edi], dl
6165     movzx      edx, byte ptr [eax + 1]
6166     movzx      edx, byte ptr [esi + edx]
6167     mov        byte ptr [edi + 1], dl
6168     movzx      edx, byte ptr [eax + 2]
6169     movzx      edx, byte ptr [esi + edx]
6170     mov        byte ptr [edi + 2], dl
6171     movzx      edx, byte ptr [eax + 3]  // copy alpha.
6172     mov        byte ptr [edi + 3], dl
6173 
6174     movd       esi, xmm0
6175     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6176 
6177     movzx      edx, byte ptr [eax + 4]
6178     movzx      edx, byte ptr [esi + edx]
6179     mov        byte ptr [edi + 4], dl
6180     movzx      edx, byte ptr [eax + 5]
6181     movzx      edx, byte ptr [esi + edx]
6182     mov        byte ptr [edi + 5], dl
6183     movzx      edx, byte ptr [eax + 6]
6184     movzx      edx, byte ptr [esi + edx]
6185     mov        byte ptr [edi + 6], dl
6186     movzx      edx, byte ptr [eax + 7]  // copy alpha.
6187     mov        byte ptr [edi + 7], dl
6188 
6189     movd       esi, xmm0
6190     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6191 
6192     movzx      edx, byte ptr [eax + 8]
6193     movzx      edx, byte ptr [esi + edx]
6194     mov        byte ptr [edi + 8], dl
6195     movzx      edx, byte ptr [eax + 9]
6196     movzx      edx, byte ptr [esi + edx]
6197     mov        byte ptr [edi + 9], dl
6198     movzx      edx, byte ptr [eax + 10]
6199     movzx      edx, byte ptr [esi + edx]
6200     mov        byte ptr [edi + 10], dl
6201     movzx      edx, byte ptr [eax + 11]  // copy alpha.
6202     mov        byte ptr [edi + 11], dl
6203 
6204     movd       esi, xmm0
6205 
6206     movzx      edx, byte ptr [eax + 12]
6207     movzx      edx, byte ptr [esi + edx]
6208     mov        byte ptr [edi + 12], dl
6209     movzx      edx, byte ptr [eax + 13]
6210     movzx      edx, byte ptr [esi + edx]
6211     mov        byte ptr [edi + 13], dl
6212     movzx      edx, byte ptr [eax + 14]
6213     movzx      edx, byte ptr [esi + edx]
6214     mov        byte ptr [edi + 14], dl
6215     movzx      edx, byte ptr [eax + 15]  // copy alpha.
6216     mov        byte ptr [edi + 15], dl
6217 
6218     lea        eax, [eax + 16]
6219     lea        edi, [edi + 16]
6220     sub        ecx, 4
6221     jg         convertloop
6222 
6223     pop        edi
6224     pop        esi
6225     ret
6226   }
6227 }
6228 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6229 
6230 #endif  // defined(_M_X64)
6231 
6232 #ifdef __cplusplus
6233 }  // extern "C"
6234 }  // namespace libyuv
6235 #endif
6236 
6237 #endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6238