/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422                                        \
  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
  u_buf += 4;                                             \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
  y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422                                       \
  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
  u_buf += 4;                                             \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
  y_buf += 8;                                             \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants)                                     \
  xmm1 = _mm_loadu_si128(&xmm0);                                   \
  xmm2 = _mm_loadu_si128(&xmm0);                                   \
  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
  xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
  xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
  xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
  xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
  xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
  xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
  xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
  xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);

// Store 8 ARGB values.
#define STOREARGB                                    \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
  xmm1 = _mm_loadu_si128(&xmm0);                     \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  dst_argb += 32;
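
// A scalar sketch of the per-channel math that READYUV422 + YUVTORGB
// implement for one pixel (illustrative only; cu, cv, bias and yg stand in
// for entries of the kUVToB/G/R, kUVBias* and kYToRgb tables and are not
// literal YuvConstants fields).
static uint8_t YuvChannelSketch(uint8_t y, uint8_t u, uint8_t v,
                                int cu, int cv, int bias, int yg) {
  int uv = bias - (u * cu + v * cv);     // _mm_maddubs_epi16, then _mm_sub_epi16
  int ylum = ((y * 0x0101) * yg) >> 16;  // _mm_mulhi_epu16 on Y duplicated to 16 bits
  int c = (uv + ylum) >> 6;              // _mm_adds_epi16, then 6 bit fixed point shift
  return (uint8_t)(c < 0 ? 0 : c > 255 ? 255 : c);  // _mm_packus_epi16 saturation
}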

#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                         const uint8_t* u_buf,
                         const uint8_t* v_buf,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              const uint8_t* a_buf,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};

// Constants for BGRA.
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR.
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
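
// Illustrative scalar model of how pshufb consumes these shuffle tables
// (not part of libyuv): out[i] = in[mask[i] & 15], or 0 when mask[i] has
// its high bit set, as with the 128u entries in the masks below.
static void ShufbModel(const uint8_t in[16], const uint8_t mask[16],
                       uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    out[i] = (uint8_t)((mask[i] & 0x80) ? 0 : in[mask[i] & 15]);
  }
}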

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov        eax, [esp + 4]  // src_y
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
    pslld      xmm5, 24

  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
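
// Scalar model of the expansion above for one pixel (illustrative only):
// the gray byte is replicated into B, G and R, and alpha is forced opaque.
static uint32_t J400PixelToARGB(uint8_t y) {
  return 0xff000000u | ((uint32_t)y * 0x00010101u);
}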

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov         eax, [esp + 4]  // src_y
    mov         edx, [esp + 8]  // dst_argb
    mov         ecx, [esp + 12]  // width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld      ymm5, ymm5, 24

  convertloop:
    vmovdqu     xmm0, [eax]
    lea         eax,  [eax + 16]
    vpermq      ymm0, ymm0, 0xd8
    vpunpcklbw  ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhwd  ymm1, ymm0, ymm0
    vpunpcklwd  ymm0, ymm0, ymm0
    vpor        ymm0, ymm0, ymm5
    vpor        ymm1, ymm1, ymm5
    vmovdqu     [edx], ymm0
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_rgb24
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                                           uint8_t* dst_rgb24,
                                           int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_rgb24
    mov       ecx, [esp + 12]  // width
    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 4]
    movdqu    xmm2, [eax + 8]
    lea       eax, [eax + 24]
    pshufb    xmm0, xmm3
    pshufb    xmm1, xmm4
    pshufb    xmm2, xmm5
    movq      qword ptr [edx], xmm0
    movq      qword ptr [edx + 8], xmm1
    movq      qword ptr [edx + 16], xmm2
    lea       edx, [edx + 24]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_rgb565
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    pand      xmm0, xmm4  // G in middle 6 bits
    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
    por       xmm0, xmm7  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
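
// Scalar model of the pmulhuw replication above (illustrative only): with
// the 5 bit channel already in the top bits of a 16 bit lane, multiplying
// by 0x0108 (256 + 8) and keeping the high word yields (v << 3) | (v >> 2),
// i.e. the 5 bits repeated into 8. The green multiplier 0x2080 folds in
// green's shift of 5 and repeats its 6 bits via (256 + 4) the same way.
static uint8_t Replicate5To8(uint16_t v5) {
  uint16_t hi = (uint16_t)(v5 << 11);      // channel in the top 5 bits
  return (uint8_t)((hi * 0x0108u) >> 16);  // pmulhuw keeps the high 16 bits
}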

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw     ymm4, ymm4, 10
    vpsrlw     ymm4, ymm4, 5
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax, [esp + 4]  // src_rgb565
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    sub        edx, eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2  // RB
    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor       ymm0, ymm0, ymm7  // AG
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax,  [esp + 4]  // src_argb1555
    mov        edx,  [esp + 8]  // dst_argb
    mov        ecx,  [esp + 12]  // width
    sub        edx,  eax
    sub        edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
    vpand      ymm1, ymm1, ymm3
    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2  // RB
    vpsraw     ymm2, ymm0, 8  // A
    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand      ymm2, ymm2, ymm7
    vpor       ymm0, ymm0, ymm2  // AG
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd     xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov       eax,  [esp + 4]  // src_argb4444
    mov       edx,  [esp + 8]  // dst_argb
    mov       ecx,  [esp + 12]  // width
    sub       edx,  eax
    sub       edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
    vpand      ymm2, ymm0, ymm5  // mask high nibbles
    vpand      ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw     ymm3, ymm2, 4
    vpsllw     ymm1, ymm0, 4
    vpor       ymm2, ymm2, ymm3
    vpor       ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_argb1555
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // G in middle 5 bits
    psraw     xmm2, 8  // A
    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]  // src_argb4444
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // mask low nibbles
    pand      xmm2, xmm5  // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
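
// Scalar model of the nibble expansion above (illustrative only): a 4 bit
// channel n widens to 8 bits as (n << 4) | n, i.e. n * 17, which both the
// low-nibble and high-nibble paths compute.
static uint8_t Expand4To8(uint8_t nibble) {
  return (uint8_t)((nibble << 4) | nibble);
}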

__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

 convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
                                          uint8_t* dst_rgb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW

 convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
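
// Scalar model of the packing above for one pixel (illustrative only):
// keep the top 5, 6 and 5 bits of B, G and R.
static uint16_t ARGBPixelToRGB565(uint32_t argb) {
  uint32_t b = argb & 0xff;
  uint32_t g = (argb >> 8) & 0xff;
  uint32_t r = (argb >> 16) & 0xff;
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}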

__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {

    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    movd      xmm6, [esp + 12]  // dither4
    mov       ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa    xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    paddusb   xmm0, xmm6  // add dither
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
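
// Scalar model of the dither step above (illustrative only): the four
// dither4 bytes are each replicated across all four channels of the pixel
// in the matching column, then added with unsigned saturation (paddusb).
static uint8_t AddDither(uint8_t channel, uint32_t dither4, int x) {
  int sum = channel + (int)((dither4 >> ((x & 3) * 8)) & 0xff);
  return (uint8_t)(sum > 255 ? 255 : sum);
}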

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov        ecx, [esp + 16]  // width
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq     ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb   ymm0, ymm0, ymm6  // add dither
    vpsrld     ymm2, ymm0, 5  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrld     ymm0, ymm0, 8  // R
    vpand      ymm2, ymm2, ymm4  // G
    vpand      ymm1, ymm1, ymm3  // B
    vpand      ymm0, ymm0, ymm5  // R
    vpor       ymm1, ymm1, ymm2  // BG
    vpor       ymm0, ymm0, ymm1  // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4  // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4  // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
    pslld     xmm7, 15

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    movdqa    xmm3, xmm0  // R
    psrad     xmm0, 16  // A
    psrld     xmm1, 3  // B
    psrld     xmm2, 6  // G
    psrld     xmm3, 9  // R
    pand      xmm0, xmm7  // A
    pand      xmm1, xmm4  // B
    pand      xmm2, xmm5  // G
    pand      xmm3, xmm6  // R
    por       xmm0, xmm1  // BA
    por       xmm2, xmm3  // GR
    por       xmm0, xmm2  // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
    psrlw     xmm3, 8

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3  // low nibble
    pand      xmm1, xmm4  // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrld     ymm0, ymm0, 8  // R
    vpand      ymm2, ymm2, ymm4  // G
    vpand      ymm1, ymm1, ymm3  // B
    vpand      ymm0, ymm0, ymm5  // R
    vpor       ymm1, ymm1, ymm2  // BG
    vpor       ymm0, ymm0, ymm1  // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm4, ymm4, ymm4
    vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
    vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
    vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld     ymm7, ymm7, 15

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld     ymm3, ymm0, 9  // R
    vpsrld     ymm2, ymm0, 6  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrad     ymm0, ymm0, 16  // A
    vpand      ymm3, ymm3, ymm6  // R
    vpand      ymm2, ymm2, ymm5  // G
    vpand      ymm1, ymm1, ymm4  // B
    vpand      ymm0, ymm0, ymm7  // A
    vpor       ymm0, ymm0, ymm1  // BA
    vpor       ymm2, ymm2, ymm3  // GR
    vpor       ymm0, ymm0, ymm2  // BGRA
    vpackssdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw     ymm4, ymm4, 12
    vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpand      ymm1, ymm0, ymm4  // high nibble
    vpand      ymm0, ymm0, ymm3  // low nibble
    vpsrld     ymm1, ymm1, 8
    vpsrld     ymm0, ymm0, 4
    vpor       ymm0, ymm0, ymm1
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
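
// Scalar model of the row above for one pixel (illustrative only):
// pmaddubsw/phaddw evaluate the 7 bit fixed point dot product, psrlw
// scales it back and paddb applies the +16 luma offset.
static uint8_t ARGBPixelToY(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}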

// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_y,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5  // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
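
// Scalar model of the JPeg variant above (illustrative only): different
// coefficients, no +16 offset, and +64 (0.5 in 7 bit fixed point) for
// rounding before the shift.
static uint8_t ARGBPixelToYJ(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
}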

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToY
    vbroadcastf128 ymm5, xmmword ptr kAddY16
    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYJROW_AVX2

__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kBGRAToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kABGRToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kRGBAToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, xmmword ptr kAddUV128
    movdqa     xmm6, xmmword ptr kARGBToV
    movdqa     xmm7, xmmword ptr kARGBToU
    sub        edi, edx  // stride from u to v

 convertloop:
         /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

        // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0  // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
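
// Scalar model of the two steps above for one 2x2 block (illustrative
// only): average with pavgb rounding, then apply the signed U/V
// coefficients, shift right 8 (psraw; arithmetic shift assumed for >> on
// signed int) and re-center at 128. p0/p1 point at the two rows of the
// block.
static void ARGBBlockToUV(const uint8_t* p0, const uint8_t* p1,
                          uint8_t* u, uint8_t* v) {
  int b = (((p0[0] + p1[0] + 1) >> 1) + ((p0[4] + p1[4] + 1) >> 1) + 1) >> 1;
  int g = (((p0[1] + p1[1] + 1) >> 1) + ((p0[5] + p1[5] + 1) >> 1) + 1) >> 1;
  int r = (((p0[2] + p1[2] + 1) >> 1) + ((p0[6] + p1[6] + 1) >> 1) + 1) >> 1;
  *u = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}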

__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
                                          int src_stride_argb,
                                          uint8_t* dst_u,
                                          uint8_t* dst_v,
                                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, xmmword ptr kAddUVJ128
    movdqa     xmm6, xmmword ptr kARGBToVJ
    movdqa     xmm7, xmmword ptr kARGBToUJ
    sub        edi, edx  // stride from u to v

 convertloop:
         /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
1487         // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1488     movdqa     xmm1, xmm0
1489     movdqa     xmm3, xmm2
1490     pmaddubsw  xmm0, xmm7  // U
1491     pmaddubsw  xmm2, xmm7
1492     pmaddubsw  xmm1, xmm6  // V
1493     pmaddubsw  xmm3, xmm6
1494     phaddw     xmm0, xmm2
1495     phaddw     xmm1, xmm3
1496     paddw      xmm0, xmm5  // +.5 rounding -> unsigned
1497     paddw      xmm1, xmm5
1498     psraw      xmm0, 8
1499     psraw      xmm1, 8
1500     packsswb   xmm0, xmm1
1501 
1502         // step 3 - store 8 U and 8 V values
1503     movlps     qword ptr [edx], xmm0  // U
1504     movhps     qword ptr [edx + edi], xmm0  // V
1505     lea        edx, [edx + 8]
1506     sub        ecx, 16
1507     jg         convertloop
1508 
1509     pop        edi
1510     pop        esi
1511     ret
1512   }
1513 }
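
// Note: the J (JPEG, full range) variant above differs from ARGBToUVRow_SSSE3
// in its coefficients (kARGBToUJ/kARGBToVJ) and in its rounding: kAddUVJ128
// holds 0x8080 words, so the single paddw folds the +128 bias (high byte) and
// a +0.5 rounding term (low byte) into the 8.8 fixed-point sum before psraw 8,
// which is why no paddb follows packsswb here.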
1514 
1515 #ifdef HAS_ARGBTOUVROW_AVX2
1516 __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
1517                                         int src_stride_argb,
1518                                         uint8_t* dst_u,
1519                                         uint8_t* dst_v,
1520                                         int width) {
1521   __asm {
1522     push       esi
1523     push       edi
1524     mov        eax, [esp + 8 + 4]  // src_argb
1525     mov        esi, [esp + 8 + 8]  // src_stride_argb
1526     mov        edx, [esp + 8 + 12]  // dst_u
1527     mov        edi, [esp + 8 + 16]  // dst_v
1528     mov        ecx, [esp + 8 + 20]  // width
1529     vbroadcastf128 ymm5, xmmword ptr kAddUV128
1530     vbroadcastf128 ymm6, xmmword ptr kARGBToV
1531     vbroadcastf128 ymm7, xmmword ptr kARGBToU
1532     sub        edi, edx   // stride from u to v
1533 
1534  convertloop:
1535         /* step 1 - subsample 32x2 argb pixels to 16x1 */
1536     vmovdqu    ymm0, [eax]
1537     vmovdqu    ymm1, [eax + 32]
1538     vmovdqu    ymm2, [eax + 64]
1539     vmovdqu    ymm3, [eax + 96]
1540     vpavgb     ymm0, ymm0, [eax + esi]
1541     vpavgb     ymm1, ymm1, [eax + esi + 32]
1542     vpavgb     ymm2, ymm2, [eax + esi + 64]
1543     vpavgb     ymm3, ymm3, [eax + esi + 96]
1544     lea        eax,  [eax + 128]
1545     vshufps    ymm4, ymm0, ymm1, 0x88
1546     vshufps    ymm0, ymm0, ymm1, 0xdd
1547     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1548     vshufps    ymm4, ymm2, ymm3, 0x88
1549     vshufps    ymm2, ymm2, ymm3, 0xdd
1550     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1551 
1552         // step 2 - convert to U and V
1553         // from here down is very similar to Y code except
1554         // instead of 32 different pixels, it's 16 pixels of U and 16 of V
1555     vpmaddubsw ymm1, ymm0, ymm7  // U
1556     vpmaddubsw ymm3, ymm2, ymm7
1557     vpmaddubsw ymm0, ymm0, ymm6  // V
1558     vpmaddubsw ymm2, ymm2, ymm6
1559     vphaddw    ymm1, ymm1, ymm3  // mutates
1560     vphaddw    ymm0, ymm0, ymm2
1561     vpsraw     ymm1, ymm1, 8
1562     vpsraw     ymm0, ymm0, 8
1563     vpacksswb  ymm0, ymm1, ymm0  // mutates
1564     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1565     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1566     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1567 
1568         // step 3 - store 16 U and 16 V values
1569     vextractf128 [edx], ymm0, 0  // U
1570     vextractf128 [edx + edi], ymm0, 1  // V
1571     lea        edx, [edx + 16]
1572     sub        ecx, 32
1573     jg         convertloop
1574 
1575     pop        edi
1576     pop        esi
1577     vzeroupper
1578     ret
1579   }
1580 }
1581 #endif  // HAS_ARGBTOUVROW_AVX2
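
// Note on the "mutates" comments in the AVX2 functions: 256-bit vshufps,
// vphaddw and vpacksswb operate on each 128-bit lane independently, so their
// results come out lane-interleaved. The vpermq 0xd8 and kShufARGBToUV_AVX
// shuffles put the bytes back in linear order before the 16 U and 16 V
// values are extracted.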
1582 
1583 #ifdef HAS_ARGBTOUVJROW_AVX2
1584 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
1585                                          int src_stride_argb,
1586                                          uint8_t* dst_u,
1587                                          uint8_t* dst_v,
1588                                          int width) {
1589   __asm {
1590     push       esi
1591     push       edi
1592     mov        eax, [esp + 8 + 4]  // src_argb
1593     mov        esi, [esp + 8 + 8]  // src_stride_argb
1594     mov        edx, [esp + 8 + 12]  // dst_u
1595     mov        edi, [esp + 8 + 16]  // dst_v
1596     mov        ecx, [esp + 8 + 20]  // width
1597     vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
1598     vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
1599     vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
1600     sub        edi, edx   // stride from u to v
1601 
1602  convertloop:
1603         /* step 1 - subsample 32x2 argb pixels to 16x1 */
1604     vmovdqu    ymm0, [eax]
1605     vmovdqu    ymm1, [eax + 32]
1606     vmovdqu    ymm2, [eax + 64]
1607     vmovdqu    ymm3, [eax + 96]
1608     vpavgb     ymm0, ymm0, [eax + esi]
1609     vpavgb     ymm1, ymm1, [eax + esi + 32]
1610     vpavgb     ymm2, ymm2, [eax + esi + 64]
1611     vpavgb     ymm3, ymm3, [eax + esi + 96]
1612     lea        eax,  [eax + 128]
1613     vshufps    ymm4, ymm0, ymm1, 0x88
1614     vshufps    ymm0, ymm0, ymm1, 0xdd
1615     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1616     vshufps    ymm4, ymm2, ymm3, 0x88
1617     vshufps    ymm2, ymm2, ymm3, 0xdd
1618     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1619 
1620         // step 2 - convert to U and V
1621         // from here down is very similar to Y code except
1622         // instead of 32 different pixels, it's 16 pixels of U and 16 of V
1623     vpmaddubsw ymm1, ymm0, ymm7  // U
1624     vpmaddubsw ymm3, ymm2, ymm7
1625     vpmaddubsw ymm0, ymm0, ymm6  // V
1626     vpmaddubsw ymm2, ymm2, ymm6
1627     vphaddw    ymm1, ymm1, ymm3  // mutates
1628     vphaddw    ymm0, ymm0, ymm2
1629     vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
1630     vpaddw     ymm0, ymm0, ymm5
1631     vpsraw     ymm1, ymm1, 8
1632     vpsraw     ymm0, ymm0, 8
1633     vpacksswb  ymm0, ymm1, ymm0  // mutates
1634     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1635     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1636 
1637         // step 3 - store 16 U and 16 V values
1638     vextractf128 [edx], ymm0, 0  // U
1639     vextractf128 [edx + edi], ymm0, 1  // V
1640     lea        edx, [edx + 16]
1641     sub        ecx, 32
1642     jg         convertloop
1643 
1644     pop        edi
1645     pop        esi
1646     vzeroupper
1647     ret
1648   }
1649 }
1650 #endif  // HAS_ARGBTOUVJROW_AVX2
1651 
1652 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
1653                                             uint8_t* dst_u,
1654                                             uint8_t* dst_v,
1655                                             int width) {
1656   __asm {
1657     push       edi
1658     mov        eax, [esp + 4 + 4]  // src_argb
1659     mov        edx, [esp + 4 + 8]  // dst_u
1660     mov        edi, [esp + 4 + 12]  // dst_v
1661     mov        ecx, [esp + 4 + 16]  // width
1662     movdqa     xmm5, xmmword ptr kAddUV128
1663     movdqa     xmm6, xmmword ptr kARGBToV
1664     movdqa     xmm7, xmmword ptr kARGBToU
1665     sub        edi, edx    // stride from u to v
1666 
1667  convertloop:
1668         /* convert to U and V */
1669     movdqu     xmm0, [eax]  // U
1670     movdqu     xmm1, [eax + 16]
1671     movdqu     xmm2, [eax + 32]
1672     movdqu     xmm3, [eax + 48]
1673     pmaddubsw  xmm0, xmm7
1674     pmaddubsw  xmm1, xmm7
1675     pmaddubsw  xmm2, xmm7
1676     pmaddubsw  xmm3, xmm7
1677     phaddw     xmm0, xmm1
1678     phaddw     xmm2, xmm3
1679     psraw      xmm0, 8
1680     psraw      xmm2, 8
1681     packsswb   xmm0, xmm2
1682     paddb      xmm0, xmm5
1683     movdqu     [edx], xmm0
1684 
1685     movdqu     xmm0, [eax]  // V
1686     movdqu     xmm1, [eax + 16]
1687     movdqu     xmm2, [eax + 32]
1688     movdqu     xmm3, [eax + 48]
1689     pmaddubsw  xmm0, xmm6
1690     pmaddubsw  xmm1, xmm6
1691     pmaddubsw  xmm2, xmm6
1692     pmaddubsw  xmm3, xmm6
1693     phaddw     xmm0, xmm1
1694     phaddw     xmm2, xmm3
1695     psraw      xmm0, 8
1696     psraw      xmm2, 8
1697     packsswb   xmm0, xmm2
1698     paddb      xmm0, xmm5
1699     lea        eax,  [eax + 64]
1700     movdqu     [edx + edi], xmm0
1701     lea        edx,  [edx + 16]
1702     sub        ecx,  16
1703     jg         convertloop
1704 
1705     pop        edi
1706     ret
1707   }
1708 }
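
// ARGBToUV444Row_SSSE3 above applies the same fixed-point U/V math as the
// subsampling variants, but to every pixel: there is no pavgb averaging, so
// 16 ARGB pixels in produce 16 U and 16 V values out per loop iteration.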
1709 
1710 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
1711                                          int src_stride_argb,
1712                                          uint8_t* dst_u,
1713                                          uint8_t* dst_v,
1714                                          int width) {
1715   __asm {
1716     push       esi
1717     push       edi
1718     mov        eax, [esp + 8 + 4]  // src_argb
1719     mov        esi, [esp + 8 + 8]  // src_stride_argb
1720     mov        edx, [esp + 8 + 12]  // dst_u
1721     mov        edi, [esp + 8 + 16]  // dst_v
1722     mov        ecx, [esp + 8 + 20]  // width
1723     movdqa     xmm5, xmmword ptr kAddUV128
1724     movdqa     xmm6, xmmword ptr kBGRAToV
1725     movdqa     xmm7, xmmword ptr kBGRAToU
1726     sub        edi, edx  // stride from u to v
1727 
1728  convertloop:
1729          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1730     movdqu     xmm0, [eax]
1731     movdqu     xmm4, [eax + esi]
1732     pavgb      xmm0, xmm4
1733     movdqu     xmm1, [eax + 16]
1734     movdqu     xmm4, [eax + esi + 16]
1735     pavgb      xmm1, xmm4
1736     movdqu     xmm2, [eax + 32]
1737     movdqu     xmm4, [eax + esi + 32]
1738     pavgb      xmm2, xmm4
1739     movdqu     xmm3, [eax + 48]
1740     movdqu     xmm4, [eax + esi + 48]
1741     pavgb      xmm3, xmm4
1742 
1743     lea        eax,  [eax + 64]
1744     movdqa     xmm4, xmm0
1745     shufps     xmm0, xmm1, 0x88
1746     shufps     xmm4, xmm1, 0xdd
1747     pavgb      xmm0, xmm4
1748     movdqa     xmm4, xmm2
1749     shufps     xmm2, xmm3, 0x88
1750     shufps     xmm4, xmm3, 0xdd
1751     pavgb      xmm2, xmm4
1752 
1753         // step 2 - convert to U and V
1754         // from here down is very similar to Y code except
1755         // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1756     movdqa     xmm1, xmm0
1757     movdqa     xmm3, xmm2
1758     pmaddubsw  xmm0, xmm7  // U
1759     pmaddubsw  xmm2, xmm7
1760     pmaddubsw  xmm1, xmm6  // V
1761     pmaddubsw  xmm3, xmm6
1762     phaddw     xmm0, xmm2
1763     phaddw     xmm1, xmm3
1764     psraw      xmm0, 8
1765     psraw      xmm1, 8
1766     packsswb   xmm0, xmm1
1767     paddb      xmm0, xmm5  // -> unsigned
1768 
1769         // step 3 - store 8 U and 8 V values
1770     movlps     qword ptr [edx], xmm0  // U
1771     movhps     qword ptr [edx + edi], xmm0  // V
1772     lea        edx, [edx + 8]
1773     sub        ecx, 16
1774     jg         convertloop
1775 
1776     pop        edi
1777     pop        esi
1778     ret
1779   }
1780 }
1781 
1782 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
1783                                          int src_stride_argb,
1784                                          uint8_t* dst_u,
1785                                          uint8_t* dst_v,
1786                                          int width) {
1787   __asm {
1788     push       esi
1789     push       edi
1790     mov        eax, [esp + 8 + 4]  // src_argb
1791     mov        esi, [esp + 8 + 8]  // src_stride_argb
1792     mov        edx, [esp + 8 + 12]  // dst_u
1793     mov        edi, [esp + 8 + 16]  // dst_v
1794     mov        ecx, [esp + 8 + 20]  // width
1795     movdqa     xmm5, xmmword ptr kAddUV128
1796     movdqa     xmm6, xmmword ptr kABGRToV
1797     movdqa     xmm7, xmmword ptr kABGRToU
1798     sub        edi, edx  // stride from u to v
1799 
1800  convertloop:
1801          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1802     movdqu     xmm0, [eax]
1803     movdqu     xmm4, [eax + esi]
1804     pavgb      xmm0, xmm4
1805     movdqu     xmm1, [eax + 16]
1806     movdqu     xmm4, [eax + esi + 16]
1807     pavgb      xmm1, xmm4
1808     movdqu     xmm2, [eax + 32]
1809     movdqu     xmm4, [eax + esi + 32]
1810     pavgb      xmm2, xmm4
1811     movdqu     xmm3, [eax + 48]
1812     movdqu     xmm4, [eax + esi + 48]
1813     pavgb      xmm3, xmm4
1814 
1815     lea        eax,  [eax + 64]
1816     movdqa     xmm4, xmm0
1817     shufps     xmm0, xmm1, 0x88
1818     shufps     xmm4, xmm1, 0xdd
1819     pavgb      xmm0, xmm4
1820     movdqa     xmm4, xmm2
1821     shufps     xmm2, xmm3, 0x88
1822     shufps     xmm4, xmm3, 0xdd
1823     pavgb      xmm2, xmm4
1824 
1825         // step 2 - convert to U and V
1826         // from here down is very similar to Y code except
1827         // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1828     movdqa     xmm1, xmm0
1829     movdqa     xmm3, xmm2
1830     pmaddubsw  xmm0, xmm7  // U
1831     pmaddubsw  xmm2, xmm7
1832     pmaddubsw  xmm1, xmm6  // V
1833     pmaddubsw  xmm3, xmm6
1834     phaddw     xmm0, xmm2
1835     phaddw     xmm1, xmm3
1836     psraw      xmm0, 8
1837     psraw      xmm1, 8
1838     packsswb   xmm0, xmm1
1839     paddb      xmm0, xmm5  // -> unsigned
1840 
1841         // step 3 - store 8 U and 8 V values
1842     movlps     qword ptr [edx], xmm0  // U
1843     movhps     qword ptr [edx + edi], xmm0  // V
1844     lea        edx, [edx + 8]
1845     sub        ecx, 16
1846     jg         convertloop
1847 
1848     pop        edi
1849     pop        esi
1850     ret
1851   }
1852 }
1853 
1854 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
1855                                          int src_stride_argb,
1856                                          uint8_t* dst_u,
1857                                          uint8_t* dst_v,
1858                                          int width) {
1859   __asm {
1860     push       esi
1861     push       edi
1862     mov        eax, [esp + 8 + 4]  // src_argb
1863     mov        esi, [esp + 8 + 8]  // src_stride_argb
1864     mov        edx, [esp + 8 + 12]  // dst_u
1865     mov        edi, [esp + 8 + 16]  // dst_v
1866     mov        ecx, [esp + 8 + 20]  // width
1867     movdqa     xmm5, xmmword ptr kAddUV128
1868     movdqa     xmm6, xmmword ptr kRGBAToV
1869     movdqa     xmm7, xmmword ptr kRGBAToU
1870     sub        edi, edx  // stride from u to v
1871 
1872  convertloop:
1873          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874     movdqu     xmm0, [eax]
1875     movdqu     xmm4, [eax + esi]
1876     pavgb      xmm0, xmm4
1877     movdqu     xmm1, [eax + 16]
1878     movdqu     xmm4, [eax + esi + 16]
1879     pavgb      xmm1, xmm4
1880     movdqu     xmm2, [eax + 32]
1881     movdqu     xmm4, [eax + esi + 32]
1882     pavgb      xmm2, xmm4
1883     movdqu     xmm3, [eax + 48]
1884     movdqu     xmm4, [eax + esi + 48]
1885     pavgb      xmm3, xmm4
1886 
1887     lea        eax,  [eax + 64]
1888     movdqa     xmm4, xmm0
1889     shufps     xmm0, xmm1, 0x88
1890     shufps     xmm4, xmm1, 0xdd
1891     pavgb      xmm0, xmm4
1892     movdqa     xmm4, xmm2
1893     shufps     xmm2, xmm3, 0x88
1894     shufps     xmm4, xmm3, 0xdd
1895     pavgb      xmm2, xmm4
1896 
1897         // step 2 - convert to U and V
1898         // from here down is very similar to Y code except
1899         // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1900     movdqa     xmm1, xmm0
1901     movdqa     xmm3, xmm2
1902     pmaddubsw  xmm0, xmm7  // U
1903     pmaddubsw  xmm2, xmm7
1904     pmaddubsw  xmm1, xmm6  // V
1905     pmaddubsw  xmm3, xmm6
1906     phaddw     xmm0, xmm2
1907     phaddw     xmm1, xmm3
1908     psraw      xmm0, 8
1909     psraw      xmm1, 8
1910     packsswb   xmm0, xmm1
1911     paddb      xmm0, xmm5  // -> unsigned
1912 
1913         // step 3 - store 8 U and 8 V values
1914     movlps     qword ptr [edx], xmm0  // U
1915     movhps     qword ptr [edx + edi], xmm0  // V
1916     lea        edx, [edx + 8]
1917     sub        ecx, 16
1918     jg         convertloop
1919 
1920     pop        edi
1921     pop        esi
1922     ret
1923   }
1924 }
1925 #endif  // HAS_ARGBTOYROW_SSSE3
1926 
1927 // Read 16 UV from 444
1928 #define READYUV444_AVX2 \
1929   __asm {                                                \
1930     __asm vmovdqu    xmm0, [esi] /* U */                      \
1931     __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
1932     __asm lea        esi,  [esi + 16]                                          \
1933     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1934     __asm vpermq     ymm1, ymm1, 0xd8                                          \
1935     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1936     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1937     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1938     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1939     __asm lea        eax, [eax + 16]}
1940 
1941 // Read 8 UV from 422, upsample to 16 UV.
1942 #define READYUV422_AVX2 \
1943   __asm {                                                \
1944     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
1945     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
1946     __asm lea        esi,  [esi + 8]                                           \
1947     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1948     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1949     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1950     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1951     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1952     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1953     __asm lea        eax, [eax + 16]}
1954 
1955 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
1956 #define READYUVA422_AVX2 \
1957   __asm {                                               \
1958     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
1959     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
1960     __asm lea        esi,  [esi + 8]                                           \
1961     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1962     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1963     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1964     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1965     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1966     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1967     __asm lea        eax, [eax + 16]                                           \
1968     __asm vmovdqu    xmm5, [ebp] /* A */                      \
1969     __asm vpermq     ymm5, ymm5, 0xd8                                          \
1970     __asm lea        ebp, [ebp + 16]}
1971 
1972 // Read 8 UV from NV12, upsample to 16 UV.
1973 #define READNV12_AVX2 \
1974   __asm {                                                  \
1975     __asm vmovdqu    xmm0, [esi] /* UV */                     \
1976     __asm lea        esi,  [esi + 16]                                          \
1977     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1978     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1979     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1980     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1981     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1982     __asm lea        eax, [eax + 16]}
1983 
1984 // Read 8 UV from NV21, upsample to 16 UV.
1985 #define READNV21_AVX2 \
1986   __asm {                                                  \
1987     __asm vmovdqu    xmm0, [esi] /* UV */                     \
1988     __asm lea        esi,  [esi + 16]                                          \
1989     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1990     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
1991     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1992     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1993     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1994     __asm lea        eax, [eax + 16]}
1995 
1996 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
1997 #define READYUY2_AVX2 \
1998   __asm {                                                  \
1999     __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
2000     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
2001     __asm vmovdqu    ymm0, [eax] /* UV */                             \
2002     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
2003     __asm lea        eax, [eax + 32]}
2004 
2005 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2006 #define READUYVY_AVX2 \
2007   __asm {                                                  \
2008     __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
2009     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
2010     __asm vmovdqu    ymm0, [eax] /* UV */                             \
2011     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
2012     __asm lea        eax, [eax + 32]}
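
// Register contract: each READ*_AVX2 macro above leaves 16 Y words in ymm4
// (each Y byte duplicated into both halves of its word, i.e. y * 0x101, for
// the vpmulhuw step) and 16 interleaved UV byte pairs in ymm0; READYUVA422
// additionally leaves 16 alpha bytes in ymm5. YUVTORGB_AVX2 below consumes
// ymm0/ymm4 in that form.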
2013 
2014 // Convert 16 pixels: 16 UV and 16 Y.
2015 #define YUVTORGB_AVX2(YuvConstants) \
2016   __asm {                                    \
2017     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
2018     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
2019     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
2020     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
2021     __asm vpsubw     ymm2, ymm3, ymm2                                          \
2022     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
2023     __asm vpsubw     ymm1, ymm3, ymm1                                          \
2024     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
2025     __asm vpsubw     ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */                       \
2026     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
2027     __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
2028     __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
2029     __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
2030     __asm vpsraw     ymm0, ymm0, 6                                             \
2031     __asm vpsraw     ymm1, ymm1, 6                                             \
2032     __asm vpsraw     ymm2, ymm2, 6                                             \
2033     __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
2034     __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
2035     __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
2036   }
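
// In fixed point, YUVTORGB_AVX2 computes, for each channel C in {B, G, R}:
//   C = clamp8((bias_C - dot(UV, kUVToC) + ((Y * 0x101 * YG) >> 16)) >> 6)
// where each bias_C constant folds the -128 chroma offset and the Y range
// offset into a single subtract (see the YuvConstants tables in
// row_common.cc); the final vpackuswb clamps to [0, 255].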
2037 
2038 // Store 16 ARGB values.
2039 #define STOREARGB_AVX2 \
2040   __asm {                                                 \
2041     __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
2042     __asm vpermq     ymm0, ymm0, 0xd8                                          \
2043     __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
2044     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2045     __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
2046     __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
2047     __asm vmovdqu    0[edx], ymm1                                              \
2048     __asm vmovdqu    32[edx], ymm0                                             \
2049     __asm lea        edx,  [edx + 64]}
2050 
2051 // Store 16 RGBA values.
2052 #define STORERGBA_AVX2 \
2053   __asm {                                                 \
2054     __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
2055     __asm vpermq     ymm1, ymm1, 0xd8                                          \
2056     __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
2057     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2058     __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
2059     __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
2060     __asm vmovdqu    [edx], ymm0                                               \
2061     __asm vmovdqu    [edx + 32], ymm1                                          \
2062     __asm lea        edx,  [edx + 64]}
2063 
2064 #ifdef HAS_I422TOARGBROW_AVX2
2065 // 16 pixels
2066 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2067 __declspec(naked) void I422ToARGBRow_AVX2(
2068     const uint8_t* y_buf,
2069     const uint8_t* u_buf,
2070     const uint8_t* v_buf,
2071     uint8_t* dst_argb,
2072     const struct YuvConstants* yuvconstants,
2073     int width) {
2074   __asm {
2075     push       esi
2076     push       edi
2077     push       ebx
2078     mov        eax, [esp + 12 + 4]  // Y
2079     mov        esi, [esp + 12 + 8]  // U
2080     mov        edi, [esp + 12 + 12]  // V
2081     mov        edx, [esp + 12 + 16]  // argb
2082     mov        ebx, [esp + 12 + 20]  // yuvconstants
2083     mov        ecx, [esp + 12 + 24]  // width
2084     sub        edi, esi
2085     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2086 
2087  convertloop:
2088     READYUV422_AVX2
2089     YUVTORGB_AVX2(ebx)
2090     STOREARGB_AVX2
2091 
2092     sub        ecx, 16
2093     jg         convertloop
2094 
2095     pop        ebx
2096     pop        edi
2097     pop        esi
2098     vzeroupper
2099     ret
2100   }
2101 }
2102 #endif  // HAS_I422TOARGBROW_AVX2
2103 
2104 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2105 // 16 pixels
2106 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2107 __declspec(naked) void I422AlphaToARGBRow_AVX2(
2108     const uint8_t* y_buf,
2109     const uint8_t* u_buf,
2110     const uint8_t* v_buf,
2111     const uint8_t* a_buf,
2112     uint8_t* dst_argb,
2113     const struct YuvConstants* yuvconstants,
2114     int width) {
2115   __asm {
2116     push       esi
2117     push       edi
2118     push       ebx
2119     push       ebp
2120     mov        eax, [esp + 16 + 4]  // Y
2121     mov        esi, [esp + 16 + 8]  // U
2122     mov        edi, [esp + 16 + 12]  // V
2123     mov        ebp, [esp + 16 + 16]  // A
2124     mov        edx, [esp + 16 + 20]  // argb
2125     mov        ebx, [esp + 16 + 24]  // yuvconstants
2126     mov        ecx, [esp + 16 + 28]  // width
2127     sub        edi, esi
2128 
2129  convertloop:
2130     READYUVA422_AVX2
2131     YUVTORGB_AVX2(ebx)
2132     STOREARGB_AVX2
2133 
2134     sub        ecx, 16
2135     jg         convertloop
2136 
2137     pop        ebp
2138     pop        ebx
2139     pop        edi
2140     pop        esi
2141     vzeroupper
2142     ret
2143   }
2144 }
2145 #endif  // HAS_I422ALPHATOARGBROW_AVX2
2146 
2147 #ifdef HAS_I444TOARGBROW_AVX2
2148 // 16 pixels
2149 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2150 __declspec(naked) void I444ToARGBRow_AVX2(
2151     const uint8_t* y_buf,
2152     const uint8_t* u_buf,
2153     const uint8_t* v_buf,
2154     uint8_t* dst_argb,
2155     const struct YuvConstants* yuvconstants,
2156     int width) {
2157   __asm {
2158     push       esi
2159     push       edi
2160     push       ebx
2161     mov        eax, [esp + 12 + 4]  // Y
2162     mov        esi, [esp + 12 + 8]  // U
2163     mov        edi, [esp + 12 + 12]  // V
2164     mov        edx, [esp + 12 + 16]  // argb
2165     mov        ebx, [esp + 12 + 20]  // yuvconstants
2166     mov        ecx, [esp + 12 + 24]  // width
2167     sub        edi, esi
2168     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2169  convertloop:
2170     READYUV444_AVX2
2171     YUVTORGB_AVX2(ebx)
2172     STOREARGB_AVX2
2173 
2174     sub        ecx, 16
2175     jg         convertloop
2176 
2177     pop        ebx
2178     pop        edi
2179     pop        esi
2180     vzeroupper
2181     ret
2182   }
2183 }
2184 #endif  // HAS_I444TOARGBROW_AVX2
2185 
2186 #ifdef HAS_NV12TOARGBROW_AVX2
2187 // 16 pixels.
2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2189 __declspec(naked) void NV12ToARGBRow_AVX2(
2190     const uint8_t* y_buf,
2191     const uint8_t* uv_buf,
2192     uint8_t* dst_argb,
2193     const struct YuvConstants* yuvconstants,
2194     int width) {
2195   __asm {
2196     push       esi
2197     push       ebx
2198     mov        eax, [esp + 8 + 4]  // Y
2199     mov        esi, [esp + 8 + 8]  // UV
2200     mov        edx, [esp + 8 + 12]  // argb
2201     mov        ebx, [esp + 8 + 16]  // yuvconstants
2202     mov        ecx, [esp + 8 + 20]  // width
2203     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2204 
2205  convertloop:
2206     READNV12_AVX2
2207     YUVTORGB_AVX2(ebx)
2208     STOREARGB_AVX2
2209 
2210     sub        ecx, 16
2211     jg         convertloop
2212 
2213     pop        ebx
2214     pop        esi
2215     vzeroupper
2216     ret
2217   }
2218 }
2219 #endif  // HAS_NV12TOARGBROW_AVX2
2220 
2221 #ifdef HAS_NV21TOARGBROW_AVX2
2222 // 16 pixels.
2223 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2224 __declspec(naked) void NV21ToARGBRow_AVX2(
2225     const uint8_t* y_buf,
2226     const uint8_t* vu_buf,
2227     uint8_t* dst_argb,
2228     const struct YuvConstants* yuvconstants,
2229     int width) {
2230   __asm {
2231     push       esi
2232     push       ebx
2233     mov        eax, [esp + 8 + 4]  // Y
2234     mov        esi, [esp + 8 + 8]  // VU
2235     mov        edx, [esp + 8 + 12]  // argb
2236     mov        ebx, [esp + 8 + 16]  // yuvconstants
2237     mov        ecx, [esp + 8 + 20]  // width
2238     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2239 
2240  convertloop:
2241     READNV21_AVX2
2242     YUVTORGB_AVX2(ebx)
2243     STOREARGB_AVX2
2244 
2245     sub        ecx, 16
2246     jg         convertloop
2247 
2248     pop        ebx
2249     pop        esi
2250     vzeroupper
2251     ret
2252   }
2253 }
2254 #endif  // HAS_NV21TOARGBROW_AVX2
2255 
2256 #ifdef HAS_YUY2TOARGBROW_AVX2
2257 // 16 pixels.
2258 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2259 __declspec(naked) void YUY2ToARGBRow_AVX2(
2260     const uint8_t* src_yuy2,
2261     uint8_t* dst_argb,
2262     const struct YuvConstants* yuvconstants,
2263     int width) {
2264   __asm {
2265     push       ebx
2266     mov        eax, [esp + 4 + 4]  // yuy2
2267     mov        edx, [esp + 4 + 8]  // argb
2268     mov        ebx, [esp + 4 + 12]  // yuvconstants
2269     mov        ecx, [esp + 4 + 16]  // width
2270     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2271 
2272  convertloop:
2273     READYUY2_AVX2
2274     YUVTORGB_AVX2(ebx)
2275     STOREARGB_AVX2
2276 
2277     sub        ecx, 16
2278     jg         convertloop
2279 
2280     pop        ebx
2281     vzeroupper
2282     ret
2283   }
2284 }
2285 #endif  // HAS_YUY2TOARGBROW_AVX2
2286 
2287 #ifdef HAS_UYVYTOARGBROW_AVX2
2288 // 16 pixels.
2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2290 __declspec(naked) void UYVYToARGBRow_AVX2(
2291     const uint8_t* src_uyvy,
2292     uint8_t* dst_argb,
2293     const struct YuvConstants* yuvconstants,
2294     int width) {
2295   __asm {
2296     push       ebx
2297     mov        eax, [esp + 4 + 4]  // uyvy
2298     mov        edx, [esp + 4 + 8]  // argb
2299     mov        ebx, [esp + 4 + 12]  // yuvconstants
2300     mov        ecx, [esp + 4 + 16]  // width
2301     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2302 
2303  convertloop:
2304     READUYVY_AVX2
2305     YUVTORGB_AVX2(ebx)
2306     STOREARGB_AVX2
2307 
2308     sub        ecx, 16
2309     jg         convertloop
2310 
2311     pop        ebx
2312     vzeroupper
2313     ret
2314   }
2315 }
2316 #endif  // HAS_UYVYTOARGBROW_AVX2
2317 
2318 #ifdef HAS_I422TORGBAROW_AVX2
2319 // 16 pixels
2320 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2321 __declspec(naked) void I422ToRGBARow_AVX2(
2322     const uint8_t* y_buf,
2323     const uint8_t* u_buf,
2324     const uint8_t* v_buf,
2325     uint8_t* dst_argb,
2326     const struct YuvConstants* yuvconstants,
2327     int width) {
2328   __asm {
2329     push       esi
2330     push       edi
2331     push       ebx
2332     mov        eax, [esp + 12 + 4]  // Y
2333     mov        esi, [esp + 12 + 8]  // U
2334     mov        edi, [esp + 12 + 12]  // V
2335     mov        edx, [esp + 12 + 16]  // rgba
2336     mov        ebx, [esp + 12 + 20]  // yuvconstants
2337     mov        ecx, [esp + 12 + 24]  // width
2338     sub        edi, esi
2339     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2340 
2341  convertloop:
2342     READYUV422_AVX2
2343     YUVTORGB_AVX2(ebx)
2344     STORERGBA_AVX2
2345 
2346     sub        ecx, 16
2347     jg         convertloop
2348 
2349     pop        ebx
2350     pop        edi
2351     pop        esi
2352     vzeroupper
2353     ret
2354   }
2355 }
2356 #endif  // HAS_I422TORGBAROW_AVX2
2357 
2358 #if defined(HAS_I422TOARGBROW_SSSE3)
2359 // TODO(fbarchard): A read variant that does half size on Y and treats 420
2360 // as 444, allowing a conversion with half size scaling.
2361 
2362 // Read 8 UV from 444.
2363 #define READYUV444 \
2364   __asm {                                                     \
2365     __asm movq       xmm0, qword ptr [esi] /* U */                             \
2366     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
2367     __asm lea        esi,  [esi + 8]                                           \
2368     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2369     __asm movq       xmm4, qword ptr [eax]                                     \
2370     __asm punpcklbw  xmm4, xmm4                                                \
2371     __asm lea        eax, [eax + 8]}
2372 
2373 // Read 4 UV from 422, upsample to 8 UV.
2374 #define READYUV422 \
2375   __asm {                                                     \
2376     __asm movd       xmm0, [esi] /* U */                              \
2377     __asm movd       xmm1, [esi + edi] /* V */                              \
2378     __asm lea        esi,  [esi + 4]                                           \
2379     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2380     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2381     __asm movq       xmm4, qword ptr [eax]                                     \
2382     __asm punpcklbw  xmm4, xmm4                                                \
2383     __asm lea        eax, [eax + 8]}
2384 
2385 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
2386 #define READYUVA422 \
2387   __asm {                                                    \
2388     __asm movd       xmm0, [esi] /* U */                              \
2389     __asm movd       xmm1, [esi + edi] /* V */                              \
2390     __asm lea        esi,  [esi + 4]                                           \
2391     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2392     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2393     __asm movq       xmm4, qword ptr [eax] /* Y */                           \
2394     __asm punpcklbw  xmm4, xmm4                                                \
2395     __asm lea        eax, [eax + 8]                                            \
2396     __asm movq       xmm5, qword ptr [ebp] /* A */                           \
2397     __asm lea        ebp, [ebp + 8]}
2398 
2399 // Read 4 UV from NV12, upsample to 8 UV.
2400 #define READNV12 \
2401   __asm {                                                       \
2402     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2403     __asm lea        esi,  [esi + 8]                                           \
2404     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2405     __asm movq       xmm4, qword ptr [eax]                                     \
2406     __asm punpcklbw  xmm4, xmm4                                                \
2407     __asm lea        eax, [eax + 8]}
2408 
2409 // Read 4 VU from NV21, upsample to 8 UV.
2410 #define READNV21 \
2411   __asm {                                                       \
2412     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2413     __asm lea        esi,  [esi + 8]                                           \
2414     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
2415     __asm movq       xmm4, qword ptr [eax]                                     \
2416     __asm punpcklbw  xmm4, xmm4                                                \
2417     __asm lea        eax, [eax + 8]}
2418 
2419 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2420 #define READYUY2 \
2421   __asm {                                                       \
2422     __asm movdqu     xmm4, [eax] /* YUY2 */                           \
2423     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
2424     __asm movdqu     xmm0, [eax] /* UV */                             \
2425     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
2426     __asm lea        eax, [eax + 16]}
2427 
2428 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2429 #define READUYVY \
2430   __asm {                                                       \
2431     __asm movdqu     xmm4, [eax] /* UYVY */                           \
2432     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
2433     __asm movdqu     xmm0, [eax] /* UV */                             \
2434     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
2435     __asm lea        eax, [eax + 16]}
2436 
2437 // Convert 8 pixels: 8 UV and 8 Y.
2438 #define YUVTORGB(YuvConstants) \
2439   __asm {                                         \
2440     __asm movdqa     xmm1, xmm0                                                \
2441     __asm movdqa     xmm2, xmm0                                                \
2442     __asm movdqa     xmm3, xmm0                                                \
2443     __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
2444     __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
2445     __asm psubw      xmm0, xmm1                                                \
2446     __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
2447     __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
2448     __asm psubw      xmm1, xmm2                                                \
2449     __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
2450     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
2451     __asm psubw      xmm2, xmm3                                                \
2452     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
2453     __asm paddsw     xmm0, xmm4 /* B += Y */                         \
2454     __asm paddsw     xmm1, xmm4 /* G += Y */                         \
2455     __asm paddsw     xmm2, xmm4 /* R += Y */                         \
2456     __asm psraw      xmm0, 6                                                   \
2457     __asm psraw      xmm1, 6                                                   \
2458     __asm psraw      xmm2, 6                                                   \
2459     __asm packuswb   xmm0, xmm0 /* B */                              \
2460     __asm packuswb   xmm1, xmm1 /* G */                              \
2461     __asm packuswb   xmm2, xmm2 /* R */             \
2462   }
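
// YUVTORGB above is the 8-pixel SSSE3 form of the same fixed-point math as
// YUVTORGB_AVX2: bias_C - dot(UV, kUVToC) plus the Y term, shifted right 6
// and clamped to [0, 255] by packuswb.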
2463 
2464 // Store 8 ARGB values.
2465 #define STOREARGB \
2466   __asm {                                                      \
2467     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2468     __asm punpcklbw  xmm2, xmm5 /* RA */                             \
2469     __asm movdqa     xmm1, xmm0                                                \
2470     __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
2471     __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
2472     __asm movdqu     0[edx], xmm0                                              \
2473     __asm movdqu     16[edx], xmm1                                             \
2474     __asm lea        edx,  [edx + 32]}
2475 
2476 // Store 8 BGRA values.
2477 #define STOREBGRA \
2478   __asm {                                                      \
2479     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
2480     __asm punpcklbw  xmm1, xmm0 /* GB */                             \
2481     __asm punpcklbw  xmm5, xmm2 /* AR */                             \
2482     __asm movdqa     xmm0, xmm5                                                \
2483     __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
2484     __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
2485     __asm movdqu     0[edx], xmm5                                              \
2486     __asm movdqu     16[edx], xmm0                                             \
2487     __asm lea        edx,  [edx + 32]}
2488 
2489 // Store 8 RGBA values.
2490 #define STORERGBA \
2491   __asm {                                                      \
2492     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
2493     __asm punpcklbw  xmm1, xmm2 /* GR */                             \
2494     __asm punpcklbw  xmm5, xmm0 /* AB */                             \
2495     __asm movdqa     xmm0, xmm5                                                \
2496     __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
2497     __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
2498     __asm movdqu     0[edx], xmm5                                              \
2499     __asm movdqu     16[edx], xmm0                                             \
2500     __asm lea        edx,  [edx + 32]}
2501 
2502 // Store 8 RGB24 values.
2503 #define STORERGB24 \
2504   __asm {/* Weave into RRGB */                                                      \
2505     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2506     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
2507     __asm movdqa     xmm1, xmm0                                                \
2508     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
2509     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */                                                        \
2510     __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2511     __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
2512     __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2513     __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
2514     __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
2515     __asm lea        edx,  [edx + 24]}
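
// Scalar equivalent of STORERGB24 for one pixel, as a sketch: RGB24 is three
// bytes per pixel in B,G,R order, so the shuffles above just drop alpha:
//   dst_rgb24[0] = b;  dst_rgb24[1] = g;  dst_rgb24[2] = r;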
2516 
2517 // Store 8 RGB565 values.
2518 #define STORERGB565 \
2519   __asm {/* Weave into RRGB */                                                      \
2520     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2521     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
2522     __asm movdqa     xmm1, xmm0                                                \
2523     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
2524     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */                                                       \
2525     __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
2526     __asm movdqa     xmm2, xmm0 /* G */                                     \
2527     __asm pslld      xmm0, 8 /* R */                                     \
2528     __asm psrld      xmm3, 3 /* B */                                     \
2529     __asm psrld      xmm2, 5 /* G */                                     \
2530     __asm psrad      xmm0, 16 /* R */                                     \
2531     __asm pand       xmm3, xmm5 /* B */                                     \
2532     __asm pand       xmm2, xmm6 /* G */                                     \
2533     __asm pand       xmm0, xmm7 /* R */                                     \
2534     __asm por        xmm3, xmm2 /* BG */                                    \
2535     __asm por        xmm0, xmm3 /* BGR */                                   \
2536     __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
2537     __asm movdqa     xmm2, xmm1 /* G */                                     \
2538     __asm pslld      xmm1, 8 /* R */                                     \
2539     __asm psrld      xmm3, 3 /* B */                                     \
2540     __asm psrld      xmm2, 5 /* G */                                     \
2541     __asm psrad      xmm1, 16 /* R */                                     \
2542     __asm pand       xmm3, xmm5 /* B */                                     \
2543     __asm pand       xmm2, xmm6 /* G */                                     \
2544     __asm pand       xmm1, xmm7 /* R */                                     \
2545     __asm por        xmm3, xmm2 /* BG */                                    \
2546     __asm por        xmm1, xmm3 /* BGR */                                   \
2547     __asm packssdw   xmm0, xmm1                                                \
2548     __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
2549     __asm lea        edx, [edx + 16]}
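
// Scalar equivalent of the RGB565 packing above, per pixel:
//   rgb565 = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11))
// The 0x1f / 0x7e0 / 0xf800 masks set up by the callers select exactly those
// bit fields.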
2550 
2551 // 8 pixels.
2552 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2553 __declspec(naked) void I444ToARGBRow_SSSE3(
2554     const uint8_t* y_buf,
2555     const uint8_t* u_buf,
2556     const uint8_t* v_buf,
2557     uint8_t* dst_argb,
2558     const struct YuvConstants* yuvconstants,
2559     int width) {
2560   __asm {
2561     push       esi
2562     push       edi
2563     push       ebx
2564     mov        eax, [esp + 12 + 4]  // Y
2565     mov        esi, [esp + 12 + 8]  // U
2566     mov        edi, [esp + 12 + 12]  // V
2567     mov        edx, [esp + 12 + 16]  // argb
2568     mov        ebx, [esp + 12 + 20]  // yuvconstants
2569     mov        ecx, [esp + 12 + 24]  // width
2570     sub        edi, esi
2571     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2572 
2573  convertloop:
2574     READYUV444
2575     YUVTORGB(ebx)
2576     STOREARGB
2577 
2578     sub        ecx, 8
2579     jg         convertloop
2580 
2581     pop        ebx
2582     pop        edi
2583     pop        esi
2584     ret
2585   }
2586 }
2587 
2588 // 8 pixels.
2589 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2590 __declspec(naked) void I422ToRGB24Row_SSSE3(
2591     const uint8_t* y_buf,
2592     const uint8_t* u_buf,
2593     const uint8_t* v_buf,
2594     uint8_t* dst_rgb24,
2595     const struct YuvConstants* yuvconstants,
2596     int width) {
2597   __asm {
2598     push       esi
2599     push       edi
2600     push       ebx
2601     mov        eax, [esp + 12 + 4]  // Y
2602     mov        esi, [esp + 12 + 8]  // U
2603     mov        edi, [esp + 12 + 12]  // V
2604     mov        edx, [esp + 12 + 16]  // rgb24
2605     mov        ebx, [esp + 12 + 20]  // yuvconstants
2606     mov        ecx, [esp + 12 + 24]  // width
2607     sub        edi, esi
2608     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2609     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2610 
2611  convertloop:
2612     READYUV422
2613     YUVTORGB(ebx)
2614     STORERGB24
2615 
2616     sub        ecx, 8
2617     jg         convertloop
2618 
2619     pop        ebx
2620     pop        edi
2621     pop        esi
2622     ret
2623   }
2624 }
2625 
2626 // 8 pixels
2627 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2628 __declspec(naked) void I422ToRGB565Row_SSSE3(
2629     const uint8_t* y_buf,
2630     const uint8_t* u_buf,
2631     const uint8_t* v_buf,
2632     uint8_t* rgb565_buf,
2633     const struct YuvConstants* yuvconstants,
2634     int width) {
2635   __asm {
2636     push       esi
2637     push       edi
2638     push       ebx
2639     mov        eax, [esp + 12 + 4]  // Y
2640     mov        esi, [esp + 12 + 8]  // U
2641     mov        edi, [esp + 12 + 12]  // V
2642     mov        edx, [esp + 12 + 16]  // rgb565
2643     mov        ebx, [esp + 12 + 20]  // yuvconstants
2644     mov        ecx, [esp + 12 + 24]  // width
2645     sub        edi, esi
2646     pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
2647     psrld      xmm5, 27
2648     pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
2649     psrld      xmm6, 26
2650     pslld      xmm6, 5
2651     pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
2652     pslld      xmm7, 11
2653 
2654  convertloop:
2655     READYUV422
2656     YUVTORGB(ebx)
2657     STORERGB565
2658 
2659     sub        ecx, 8
2660     jg         convertloop
2661 
2662     pop        ebx
2663     pop        edi
2664     pop        esi
2665     ret
2666   }
2667 }
2668 
2669 // 8 pixels.
2670 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2671 __declspec(naked) void I422ToARGBRow_SSSE3(
2672     const uint8_t* y_buf,
2673     const uint8_t* u_buf,
2674     const uint8_t* v_buf,
2675     uint8_t* dst_argb,
2676     const struct YuvConstants* yuvconstants,
2677     int width) {
2678   __asm {
2679     push       esi
2680     push       edi
2681     push       ebx
2682     mov        eax, [esp + 12 + 4]  // Y
2683     mov        esi, [esp + 12 + 8]  // U
2684     mov        edi, [esp + 12 + 12]  // V
2685     mov        edx, [esp + 12 + 16]  // argb
2686     mov        ebx, [esp + 12 + 20]  // yuvconstants
2687     mov        ecx, [esp + 12 + 24]  // width
2688     sub        edi, esi
2689     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2690 
2691  convertloop:
2692     READYUV422
2693     YUVTORGB(ebx)
2694     STOREARGB
2695 
2696     sub        ecx, 8
2697     jg         convertloop
2698 
2699     pop        ebx
2700     pop        edi
2701     pop        esi
2702     ret
2703   }
2704 }
2705 
2706 // 8 pixels.
2707 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2708 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
2709     const uint8_t* y_buf,
2710     const uint8_t* u_buf,
2711     const uint8_t* v_buf,
2712     const uint8_t* a_buf,
2713     uint8_t* dst_argb,
2714     const struct YuvConstants* yuvconstants,
2715     int width) {
2716   __asm {
2717     push       esi
2718     push       edi
2719     push       ebx
2720     push       ebp
2721     mov        eax, [esp + 16 + 4]  // Y
2722     mov        esi, [esp + 16 + 8]  // U
2723     mov        edi, [esp + 16 + 12]  // V
2724     mov        ebp, [esp + 16 + 16]  // A
2725     mov        edx, [esp + 16 + 20]  // argb
2726     mov        ebx, [esp + 16 + 24]  // yuvconstants
2727     mov        ecx, [esp + 16 + 28]  // width
2728     sub        edi, esi
2729 
2730  convertloop:
2731     READYUVA422
2732     YUVTORGB(ebx)
2733     STOREARGB
2734 
2735     sub        ecx, 8
2736     jg         convertloop
2737 
2738     pop        ebp
2739     pop        ebx
2740     pop        edi
2741     pop        esi
2742     ret
2743   }
2744 }
2745 
2746 // 8 pixels.
2747 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2748 __declspec(naked) void NV12ToARGBRow_SSSE3(
2749     const uint8_t* y_buf,
2750     const uint8_t* uv_buf,
2751     uint8_t* dst_argb,
2752     const struct YuvConstants* yuvconstants,
2753     int width) {
2754   __asm {
2755     push       esi
2756     push       ebx
2757     mov        eax, [esp + 8 + 4]  // Y
2758     mov        esi, [esp + 8 + 8]  // UV
2759     mov        edx, [esp + 8 + 12]  // argb
2760     mov        ebx, [esp + 8 + 16]  // yuvconstants
2761     mov        ecx, [esp + 8 + 20]  // width
2762     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2763 
2764  convertloop:
2765     READNV12
2766     YUVTORGB(ebx)
2767     STOREARGB
2768 
2769     sub        ecx, 8
2770     jg         convertloop
2771 
2772     pop        ebx
2773     pop        esi
2774     ret
2775   }
2776 }
2777 
2778 // 8 pixels.
2779 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2780 __declspec(naked) void NV21ToARGBRow_SSSE3(
2781     const uint8_t* y_buf,
2782     const uint8_t* vu_buf,
2783     uint8_t* dst_argb,
2784     const struct YuvConstants* yuvconstants,
2785     int width) {
2786   __asm {
2787     push       esi
2788     push       ebx
2789     mov        eax, [esp + 8 + 4]  // Y
2790     mov        esi, [esp + 8 + 8]  // VU
2791     mov        edx, [esp + 8 + 12]  // argb
2792     mov        ebx, [esp + 8 + 16]  // yuvconstants
2793     mov        ecx, [esp + 8 + 20]  // width
2794     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2795 
2796  convertloop:
2797     READNV21
2798     YUVTORGB(ebx)
2799     STOREARGB
2800 
2801     sub        ecx, 8
2802     jg         convertloop
2803 
2804     pop        ebx
2805     pop        esi
2806     ret
2807   }
2808 }
2809 
2810 // 8 pixels.
2811 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2812 __declspec(naked) void YUY2ToARGBRow_SSSE3(
2813     const uint8_t* src_yuy2,
2814     uint8_t* dst_argb,
2815     const struct YuvConstants* yuvconstants,
2816     int width) {
2817   __asm {
2818     push       ebx
2819     mov        eax, [esp + 4 + 4]  // yuy2
2820     mov        edx, [esp + 4 + 8]  // argb
2821     mov        ebx, [esp + 4 + 12]  // yuvconstants
2822     mov        ecx, [esp + 4 + 16]  // width
2823     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2824 
2825  convertloop:
2826     READYUY2
2827     YUVTORGB(ebx)
2828     STOREARGB
2829 
2830     sub        ecx, 8
2831     jg         convertloop
2832 
2833     pop        ebx
2834     ret
2835   }
2836 }
2837 
2838 // 8 pixels.
2839 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2840 __declspec(naked) void UYVYToARGBRow_SSSE3(
2841     const uint8_t* src_uyvy,
2842     uint8_t* dst_argb,
2843     const struct YuvConstants* yuvconstants,
2844     int width) {
2845   __asm {
2846     push       ebx
2847     mov        eax, [esp + 4 + 4]  // uyvy
2848     mov        edx, [esp + 4 + 8]  // argb
2849     mov        ebx, [esp + 4 + 12]  // yuvconstants
2850     mov        ecx, [esp + 4 + 16]  // width
2851     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2852 
2853  convertloop:
2854     READUYVY
2855     YUVTORGB(ebx)
2856     STOREARGB
2857 
2858     sub        ecx, 8
2859     jg         convertloop
2860 
2861     pop        ebx
2862     ret
2863   }
2864 }
2865 
2866 __declspec(naked) void I422ToRGBARow_SSSE3(
2867     const uint8_t* y_buf,
2868     const uint8_t* u_buf,
2869     const uint8_t* v_buf,
2870     uint8_t* dst_rgba,
2871     const struct YuvConstants* yuvconstants,
2872     int width) {
2873   __asm {
2874     push       esi
2875     push       edi
2876     push       ebx
2877     mov        eax, [esp + 12 + 4]  // Y
2878     mov        esi, [esp + 12 + 8]  // U
2879     mov        edi, [esp + 12 + 12]  // V
2880     mov        edx, [esp + 12 + 16]  // argb
2881     mov        ebx, [esp + 12 + 20]  // yuvconstants
2882     mov        ecx, [esp + 12 + 24]  // width
2883     sub        edi, esi
2884 
2885  convertloop:
2886     READYUV422
2887     YUVTORGB(ebx)
2888     STORERGBA
2889 
2890     sub        ecx, 8
2891     jg         convertloop
2892 
2893     pop        ebx
2894     pop        edi
2895     pop        esi
2896     ret
2897   }
2898 }
2899 #endif  // HAS_I422TOARGBROW_SSSE3
2900 
2901 #ifdef HAS_I400TOARGBROW_SSE2
2902 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2903 __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
2904                                           uint8_t* rgb_buf,
2905                                           int width) {
2906   __asm {
    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
2908     movd       xmm2, eax
    pshufd     xmm2, xmm2, 0
    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16 - 64 / 2)
2911     movd       xmm3, eax
2912     pshufd     xmm3, xmm3, 0
2913     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
2914     pslld      xmm4, 24
2915 
2916     mov        eax, [esp + 4]  // Y
2917     mov        edx, [esp + 8]  // rgb
2918     mov        ecx, [esp + 12]  // width
2919 
2920  convertloop:
2921         // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2922     movq       xmm0, qword ptr [eax]
2923     lea        eax, [eax + 8]
2924     punpcklbw  xmm0, xmm0  // Y.Y
2925     pmulhuw    xmm0, xmm2
2926     psubusw    xmm0, xmm3
2927     psrlw      xmm0, 6
2928     packuswb   xmm0, xmm0        // G
2929 
2930         // Step 2: Weave into ARGB
2931     punpcklbw  xmm0, xmm0  // GG
2932     movdqa     xmm1, xmm0
2933     punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
2934     punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
2935     por        xmm0, xmm4
2936     por        xmm1, xmm4
2937     movdqu     [edx], xmm0
2938     movdqu     [edx + 16], xmm1
2939     lea        edx,  [edx + 32]
2940     sub        ecx, 8
2941     jg         convertloop
2942     ret
2943   }
2944 }
2945 #endif  // HAS_I400TOARGBROW_SSE2
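
// A minimal scalar sketch of the I400 conversion above, kept only as
// documentation (the function name is hypothetical and unused by libyuv).
// It mirrors the fixed point math: the Y byte is widened by duplication,
// scaled by 18997, debiased by 1160 and divided by 64, giving
// G = (y - 16) * 1.164 with rounding.
static void I400ToARGBRow_Reference(const uint8_t* y_buf,
                                    uint8_t* rgb_buf,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t y16 = y_buf[x] * 0x0101u;  // duplicate byte, like punpcklbw
    uint32_t g16 = (y16 * 18997u) >> 16;  // pmulhuw by the 1.164 gain
    uint32_t g = g16 < 1160u ? 0 : (g16 - 1160u) >> 6;  // psubusw + psrlw
    uint8_t gray = (uint8_t)(g > 255u ? 255u : g);  // packuswb saturates
    rgb_buf[x * 4 + 0] = gray;  // B
    rgb_buf[x * 4 + 1] = gray;  // G
    rgb_buf[x * 4 + 2] = gray;  // R
    rgb_buf[x * 4 + 3] = 255u;  // A
  }
}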
2946 
2947 #ifdef HAS_I400TOARGBROW_AVX2
2948 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2949 // note: vpunpcklbw mutates and vpackuswb unmutates.
2950 __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
2951                                           uint8_t* rgb_buf,
2952                                           int width) {
2953   __asm {
    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
2955     vmovd      xmm2, eax
2956     vbroadcastss ymm2, xmm2
    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16 - 64 / 2)
2958     vmovd      xmm3, eax
2959     vbroadcastss ymm3, xmm3
2960     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
2961     vpslld     ymm4, ymm4, 24
2962 
2963     mov        eax, [esp + 4]  // Y
2964     mov        edx, [esp + 8]  // rgb
2965     mov        ecx, [esp + 12]  // width
2966 
2967  convertloop:
        // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2969     vmovdqu    xmm0, [eax]
2970     lea        eax, [eax + 16]
2971     vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
2972     vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
2973     vpmulhuw   ymm0, ymm0, ymm2
2974     vpsubusw   ymm0, ymm0, ymm3
2975     vpsrlw     ymm0, ymm0, 6
2976     vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
2977 
2978         // TODO(fbarchard): Weave alpha with unpack.
2979         // Step 2: Weave into ARGB
2980     vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
2981     vpermq     ymm1, ymm1, 0xd8
2982     vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
2983     vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
2984     vpor       ymm0, ymm0, ymm4
2985     vpor       ymm1, ymm1, ymm4
2986     vmovdqu    [edx], ymm0
2987     vmovdqu    [edx + 32], ymm1
2988     lea        edx,  [edx + 64]
2989     sub        ecx, 16
2990     jg         convertloop
2991     vzeroupper
2992     ret
2993   }
2994 }
2995 #endif  // HAS_I400TOARGBROW_AVX2
2996 
2997 #ifdef HAS_MIRRORROW_SSSE3
2998 // Shuffle table for reversing the bytes.
2999 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3000                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
3001 
3002 // TODO(fbarchard): Replace lea with -16 offset.
3003 __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
3004                                        uint8_t* dst,
3005                                        int width) {
3006   __asm {
3007     mov       eax, [esp + 4]  // src
3008     mov       edx, [esp + 8]  // dst
3009     mov       ecx, [esp + 12]  // width
3010     movdqa    xmm5, xmmword ptr kShuffleMirror
3011 
3012  convertloop:
3013     movdqu    xmm0, [eax - 16 + ecx]
3014     pshufb    xmm0, xmm5
3015     movdqu    [edx], xmm0
3016     lea       edx, [edx + 16]
3017     sub       ecx, 16
3018     jg        convertloop
3019     ret
3020   }
3021 }
3022 #endif  // HAS_MIRRORROW_SSSE3
3023 
3024 #ifdef HAS_MIRRORROW_AVX2
3025 __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
3026                                       uint8_t* dst,
3027                                       int width) {
3028   __asm {
3029     mov       eax, [esp + 4]  // src
3030     mov       edx, [esp + 8]  // dst
3031     mov       ecx, [esp + 12]  // width
3032     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3033 
3034  convertloop:
3035     vmovdqu   ymm0, [eax - 32 + ecx]
3036     vpshufb   ymm0, ymm0, ymm5
    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
3038     vmovdqu   [edx], ymm0
3039     lea       edx, [edx + 32]
3040     sub       ecx, 32
3041     jg        convertloop
3042     vzeroupper
3043     ret
3044   }
3045 }
3046 #endif  // HAS_MIRRORROW_AVX2
3047 
3048 #ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV pairs while splitting U and V apart.
3050 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3051                                        15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3052 
3053 __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
3054                                          uint8_t* dst_u,
3055                                          uint8_t* dst_v,
3056                                          int width) {
3057   __asm {
3058     push      edi
3059     mov       eax, [esp + 4 + 4]  // src
3060     mov       edx, [esp + 4 + 8]  // dst_u
3061     mov       edi, [esp + 4 + 12]  // dst_v
3062     mov       ecx, [esp + 4 + 16]  // width
3063     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
3064     lea       eax, [eax + ecx * 2 - 16]
3065     sub       edi, edx
3066 
3067  convertloop:
3068     movdqu    xmm0, [eax]
3069     lea       eax, [eax - 16]
3070     pshufb    xmm0, xmm1
3071     movlpd    qword ptr [edx], xmm0
3072     movhpd    qword ptr [edx + edi], xmm0
3073     lea       edx, [edx + 8]
3074     sub       ecx, 8
3075     jg        convertloop
3076 
3077     pop       edi
3078     ret
3079   }
3080 }
3081 #endif  // HAS_MIRRORUVROW_SSSE3
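
// Scalar sketch of MirrorUVRow_SSSE3 for reference (hypothetical name, not
// part of libyuv's API): the interleaved UV row is walked back to front
// while U and V land in separate planes, which is what the shuffle table
// above does 8 pairs at a time.
static void MirrorUVRow_Reference(const uint8_t* src,
                                  uint8_t* dst_u,
                                  uint8_t* dst_v,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src[(width - 1 - x) * 2 + 0];
    dst_v[x] = src[(width - 1 - x) * 2 + 1];
  }
}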
3082 
3083 #ifdef HAS_ARGBMIRRORROW_SSE2
3084 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
3085                                           uint8_t* dst,
3086                                           int width) {
3087   __asm {
3088     mov       eax, [esp + 4]  // src
3089     mov       edx, [esp + 8]  // dst
3090     mov       ecx, [esp + 12]  // width
3091     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3092 
3093  convertloop:
3094     movdqu    xmm0, [eax]
3095     lea       eax, [eax - 16]
3096     pshufd    xmm0, xmm0, 0x1b
3097     movdqu    [edx], xmm0
3098     lea       edx, [edx + 16]
3099     sub       ecx, 4
3100     jg        convertloop
3101     ret
3102   }
3103 }
3104 #endif  // HAS_ARGBMIRRORROW_SSE2
3105 
3106 #ifdef HAS_ARGBMIRRORROW_AVX2
3107 // Shuffle table for reversing the bytes.
3108 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3109 
3110 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
3111                                           uint8_t* dst,
3112                                           int width) {
3113   __asm {
3114     mov       eax, [esp + 4]  // src
3115     mov       edx, [esp + 8]  // dst
3116     mov       ecx, [esp + 12]  // width
3117     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3118 
3119  convertloop:
3120     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3121     vmovdqu   [edx], ymm0
3122     lea       edx, [edx + 32]
3123     sub       ecx, 8
3124     jg        convertloop
3125     vzeroupper
3126     ret
3127   }
3128 }
3129 #endif  // HAS_ARGBMIRRORROW_AVX2
3130 
3131 #ifdef HAS_SPLITUVROW_SSE2
3132 __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
3133                                        uint8_t* dst_u,
3134                                        uint8_t* dst_v,
3135                                        int width) {
3136   __asm {
3137     push       edi
3138     mov        eax, [esp + 4 + 4]  // src_uv
3139     mov        edx, [esp + 4 + 8]  // dst_u
3140     mov        edi, [esp + 4 + 12]  // dst_v
3141     mov        ecx, [esp + 4 + 16]  // width
3142     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3143     psrlw      xmm5, 8
3144     sub        edi, edx
3145 
3146   convertloop:
3147     movdqu     xmm0, [eax]
3148     movdqu     xmm1, [eax + 16]
3149     lea        eax,  [eax + 32]
3150     movdqa     xmm2, xmm0
3151     movdqa     xmm3, xmm1
3152     pand       xmm0, xmm5  // even bytes
3153     pand       xmm1, xmm5
3154     packuswb   xmm0, xmm1
3155     psrlw      xmm2, 8  // odd bytes
3156     psrlw      xmm3, 8
3157     packuswb   xmm2, xmm3
3158     movdqu     [edx], xmm0
3159     movdqu     [edx + edi], xmm2
3160     lea        edx, [edx + 16]
3161     sub        ecx, 16
3162     jg         convertloop
3163 
3164     pop        edi
3165     ret
3166   }
3167 }
3168 
3169 #endif  // HAS_SPLITUVROW_SSE2
3170 
3171 #ifdef HAS_SPLITUVROW_AVX2
3172 __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
3173                                        uint8_t* dst_u,
3174                                        uint8_t* dst_v,
3175                                        int width) {
3176   __asm {
3177     push       edi
3178     mov        eax, [esp + 4 + 4]  // src_uv
3179     mov        edx, [esp + 4 + 8]  // dst_u
3180     mov        edi, [esp + 4 + 12]  // dst_v
3181     mov        ecx, [esp + 4 + 16]  // width
3182     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3183     vpsrlw     ymm5, ymm5, 8
3184     sub        edi, edx
3185 
3186   convertloop:
3187     vmovdqu    ymm0, [eax]
3188     vmovdqu    ymm1, [eax + 32]
3189     lea        eax,  [eax + 64]
3190     vpsrlw     ymm2, ymm0, 8  // odd bytes
3191     vpsrlw     ymm3, ymm1, 8
3192     vpand      ymm0, ymm0, ymm5  // even bytes
3193     vpand      ymm1, ymm1, ymm5
3194     vpackuswb  ymm0, ymm0, ymm1
3195     vpackuswb  ymm2, ymm2, ymm3
3196     vpermq     ymm0, ymm0, 0xd8
3197     vpermq     ymm2, ymm2, 0xd8
3198     vmovdqu    [edx], ymm0
3199     vmovdqu    [edx + edi], ymm2
3200     lea        edx, [edx + 32]
3201     sub        ecx, 32
3202     jg         convertloop
3203 
3204     pop        edi
3205     vzeroupper
3206     ret
3207   }
3208 }
3209 #endif  // HAS_SPLITUVROW_AVX2
3210 
3211 #ifdef HAS_MERGEUVROW_SSE2
3212 __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
3213                                        const uint8_t* src_v,
3214                                        uint8_t* dst_uv,
3215                                        int width) {
3216   __asm {
3217     push       edi
3218     mov        eax, [esp + 4 + 4]  // src_u
3219     mov        edx, [esp + 4 + 8]  // src_v
3220     mov        edi, [esp + 4 + 12]  // dst_uv
3221     mov        ecx, [esp + 4 + 16]  // width
3222     sub        edx, eax
3223 
3224   convertloop:
3225     movdqu     xmm0, [eax]  // read 16 U's
3226     movdqu     xmm1, [eax + edx]  // and 16 V's
3227     lea        eax,  [eax + 16]
3228     movdqa     xmm2, xmm0
3229     punpcklbw  xmm0, xmm1  // first 8 UV pairs
3230     punpckhbw  xmm2, xmm1  // next 8 UV pairs
3231     movdqu     [edi], xmm0
3232     movdqu     [edi + 16], xmm2
3233     lea        edi, [edi + 32]
3234     sub        ecx, 16
3235     jg         convertloop
3236 
3237     pop        edi
3238     ret
3239   }
3240 }
3241 #endif  //  HAS_MERGEUVROW_SSE2
3242 
3243 #ifdef HAS_MERGEUVROW_AVX2
3244 __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
3245                                        const uint8_t* src_v,
3246                                        uint8_t* dst_uv,
3247                                        int width) {
3248   __asm {
3249     push       edi
3250     mov        eax, [esp + 4 + 4]  // src_u
3251     mov        edx, [esp + 4 + 8]  // src_v
3252     mov        edi, [esp + 4 + 12]  // dst_uv
3253     mov        ecx, [esp + 4 + 16]  // width
3254     sub        edx, eax
3255 
3256   convertloop:
3257     vmovdqu    ymm0, [eax]  // read 32 U's
3258     vmovdqu    ymm1, [eax + edx]  // and 32 V's
3259     lea        eax,  [eax + 32]
3260     vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
3261     vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
3262     vextractf128 [edi], ymm2, 0  // bytes 0..15
3263     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
3264     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
3266     lea        edi, [edi + 64]
3267     sub        ecx, 32
3268     jg         convertloop
3269 
3270     pop        edi
3271     vzeroupper
3272     ret
3273   }
3274 }
3275 #endif  //  HAS_MERGEUVROW_AVX2
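
// Scalar sketches of the split/merge pair above (illustrative only; the
// names are hypothetical).  SplitUV deinterleaves UVUVUV... into separate
// U and V planes and MergeUV is its exact inverse, so a split followed by
// a merge reproduces the original row.
static void SplitUVRow_Reference(const uint8_t* src_uv,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];  // even bytes, like the pand mask
    dst_v[x] = src_uv[x * 2 + 1];  // odd bytes, like the psrlw path
  }
}

static void MergeUVRow_Reference(const uint8_t* src_u,
                                 const uint8_t* src_v,
                                 uint8_t* dst_uv,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = src_u[x];  // punpcklbw/punpckhbw interleave
    dst_uv[x * 2 + 1] = src_v[x];
  }
}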
3276 
3277 #ifdef HAS_COPYROW_SSE2
// CopyRow copies 'width' bytes using 16 byte loads/stores, 32 bytes at a time.
3279 __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
3280                                     uint8_t* dst,
3281                                     int width) {
3282   __asm {
3283     mov        eax, [esp + 4]  // src
3284     mov        edx, [esp + 8]  // dst
3285     mov        ecx, [esp + 12]  // width
3286     test       eax, 15
3287     jne        convertloopu
3288     test       edx, 15
3289     jne        convertloopu
3290 
3291   convertloopa:
3292     movdqa     xmm0, [eax]
3293     movdqa     xmm1, [eax + 16]
3294     lea        eax, [eax + 32]
3295     movdqa     [edx], xmm0
3296     movdqa     [edx + 16], xmm1
3297     lea        edx, [edx + 32]
3298     sub        ecx, 32
3299     jg         convertloopa
3300     ret
3301 
3302   convertloopu:
3303     movdqu     xmm0, [eax]
3304     movdqu     xmm1, [eax + 16]
3305     lea        eax, [eax + 32]
3306     movdqu     [edx], xmm0
3307     movdqu     [edx + 16], xmm1
3308     lea        edx, [edx + 32]
3309     sub        ecx, 32
3310     jg         convertloopu
3311     ret
3312   }
3313 }
3314 #endif  // HAS_COPYROW_SSE2
3315 
3316 #ifdef HAS_COPYROW_AVX
// CopyRow copies 'width' bytes using 32 byte loads/stores, 64 bytes at a time.
3318 __declspec(naked) void CopyRow_AVX(const uint8_t* src,
3319                                    uint8_t* dst,
3320                                    int width) {
3321   __asm {
3322     mov        eax, [esp + 4]  // src
3323     mov        edx, [esp + 8]  // dst
3324     mov        ecx, [esp + 12]  // width
3325 
3326   convertloop:
3327     vmovdqu    ymm0, [eax]
3328     vmovdqu    ymm1, [eax + 32]
3329     lea        eax, [eax + 64]
3330     vmovdqu    [edx], ymm0
3331     vmovdqu    [edx + 32], ymm1
3332     lea        edx, [edx + 64]
3333     sub        ecx, 64
3334     jg         convertloop
3335 
3336     vzeroupper
3337     ret
3338   }
3339 }
3340 #endif  // HAS_COPYROW_AVX
3341 
3342 // Multiple of 1.
3343 __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
3344                                     uint8_t* dst,
3345                                     int width) {
3346   __asm {
3347     mov        eax, esi
3348     mov        edx, edi
3349     mov        esi, [esp + 4]  // src
3350     mov        edi, [esp + 8]  // dst
3351     mov        ecx, [esp + 12]  // width
3352     rep movsb
3353     mov        edi, edx
3354     mov        esi, eax
3355     ret
3356   }
3357 }
3358 
3359 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3360 // width in pixels
3361 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
3362                                              uint8_t* dst,
3363                                              int width) {
3364   __asm {
3365     mov        eax, [esp + 4]  // src
3366     mov        edx, [esp + 8]  // dst
3367     mov        ecx, [esp + 12]  // width
3368     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3369     pslld      xmm0, 24
3370     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3371     psrld      xmm1, 8
3372 
3373   convertloop:
3374     movdqu     xmm2, [eax]
3375     movdqu     xmm3, [eax + 16]
3376     lea        eax, [eax + 32]
3377     movdqu     xmm4, [edx]
3378     movdqu     xmm5, [edx + 16]
3379     pand       xmm2, xmm0
3380     pand       xmm3, xmm0
3381     pand       xmm4, xmm1
3382     pand       xmm5, xmm1
3383     por        xmm2, xmm4
3384     por        xmm3, xmm5
3385     movdqu     [edx], xmm2
3386     movdqu     [edx + 16], xmm3
3387     lea        edx, [edx + 32]
3388     sub        ecx, 8
3389     jg         convertloop
3390 
3391     ret
3392   }
3393 }
3394 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
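
// Scalar sketch of the alpha copy above (illustrative only): the source
// alpha byte replaces the destination alpha byte while B, G and R pass
// through, which is what the 0xff000000 / 0x00ffffff mask pair implements.
static void ARGBCopyAlphaRow_Reference(const uint8_t* src,
                                       uint8_t* dst,
                                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // alpha only; BGR left untouched
  }
}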
3395 
3396 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3397 // width in pixels
3398 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
3399                                              uint8_t* dst,
3400                                              int width) {
3401   __asm {
3402     mov        eax, [esp + 4]  // src
3403     mov        edx, [esp + 8]  // dst
3404     mov        ecx, [esp + 12]  // width
3405     vpcmpeqb   ymm0, ymm0, ymm0
3406     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3407 
3408   convertloop:
3409     vmovdqu    ymm1, [eax]
3410     vmovdqu    ymm2, [eax + 32]
3411     lea        eax, [eax + 64]
3412     vpblendvb  ymm1, ymm1, [edx], ymm0
3413     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3414     vmovdqu    [edx], ymm1
3415     vmovdqu    [edx + 32], ymm2
3416     lea        edx, [edx + 64]
3417     sub        ecx, 16
3418     jg         convertloop
3419 
3420     vzeroupper
3421     ret
3422   }
3423 }
3424 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
3425 
3426 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3427 // width in pixels
3428 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
3429                                                 uint8_t* dst_a,
3430                                                 int width) {
3431   __asm {
3432     mov        eax, [esp + 4]  // src_argb
3433     mov        edx, [esp + 8]  // dst_a
3434     mov        ecx, [esp + 12]  // width
3435 
3436   extractloop:
3437     movdqu     xmm0, [eax]
3438     movdqu     xmm1, [eax + 16]
3439     lea        eax, [eax + 32]
3440     psrld      xmm0, 24
3441     psrld      xmm1, 24
3442     packssdw   xmm0, xmm1
3443     packuswb   xmm0, xmm0
3444     movq       qword ptr [edx], xmm0
3445     lea        edx, [edx + 8]
3446     sub        ecx, 8
3447     jg         extractloop
3448 
3449     ret
3450   }
3451 }
3452 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
3453 
3454 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3455 // width in pixels
3456 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
3457                                                 uint8_t* dst_a,
3458                                                 int width) {
3459   __asm {
3460     mov        eax, [esp + 4]  // src_argb
3461     mov        edx, [esp + 8]  // dst_a
3462     mov        ecx, [esp + 12]  // width
3463     vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
3464 
3465   extractloop:
3466     vmovdqu    ymm0, [eax]
3467     vmovdqu    ymm1, [eax + 32]
3468     vpsrld     ymm0, ymm0, 24
3469     vpsrld     ymm1, ymm1, 24
3470     vmovdqu    ymm2, [eax + 64]
3471     vmovdqu    ymm3, [eax + 96]
3472     lea        eax, [eax + 128]
3473     vpackssdw  ymm0, ymm0, ymm1  // mutates
3474     vpsrld     ymm2, ymm2, 24
3475     vpsrld     ymm3, ymm3, 24
3476     vpackssdw  ymm2, ymm2, ymm3  // mutates
3477     vpackuswb  ymm0, ymm0, ymm2  // mutates
3478     vpermd     ymm0, ymm4, ymm0  // unmutate
3479     vmovdqu    [edx], ymm0
3480     lea        edx, [edx + 32]
3481     sub        ecx, 32
3482     jg         extractloop
3483 
3484     vzeroupper
3485     ret
3486   }
3487 }
3488 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
3489 
3490 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3491 // width in pixels
3492 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
3493                                                 uint8_t* dst,
3494                                                 int width) {
3495   __asm {
3496     mov        eax, [esp + 4]  // src
3497     mov        edx, [esp + 8]  // dst
3498     mov        ecx, [esp + 12]  // width
3499     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3500     pslld      xmm0, 24
3501     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3502     psrld      xmm1, 8
3503 
3504   convertloop:
3505     movq       xmm2, qword ptr [eax]  // 8 Y's
3506     lea        eax, [eax + 8]
    punpcklbw  xmm2, xmm2  // 8 YY words
    // odd words of xmm3 get the high 4 Y's; its even words are uninitialized
    // garbage that the 0xff000000 mask in xmm0 strips below.
    punpckhwd  xmm3, xmm2
    punpcklwd  xmm2, xmm2  // low 4 YY words duplicated
3510     movdqu     xmm4, [edx]
3511     movdqu     xmm5, [edx + 16]
3512     pand       xmm2, xmm0
3513     pand       xmm3, xmm0
3514     pand       xmm4, xmm1
3515     pand       xmm5, xmm1
3516     por        xmm2, xmm4
3517     por        xmm3, xmm5
3518     movdqu     [edx], xmm2
3519     movdqu     [edx + 16], xmm3
3520     lea        edx, [edx + 32]
3521     sub        ecx, 8
3522     jg         convertloop
3523 
3524     ret
3525   }
3526 }
3527 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3528 
3529 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3530 // width in pixels
3531 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
3532                                                 uint8_t* dst,
3533                                                 int width) {
3534   __asm {
3535     mov        eax, [esp + 4]  // src
3536     mov        edx, [esp + 8]  // dst
3537     mov        ecx, [esp + 12]  // width
3538     vpcmpeqb   ymm0, ymm0, ymm0
3539     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3540 
3541   convertloop:
3542     vpmovzxbd  ymm1, qword ptr [eax]
3543     vpmovzxbd  ymm2, qword ptr [eax + 8]
3544     lea        eax, [eax + 16]
3545     vpslld     ymm1, ymm1, 24
3546     vpslld     ymm2, ymm2, 24
3547     vpblendvb  ymm1, ymm1, [edx], ymm0
3548     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3549     vmovdqu    [edx], ymm1
3550     vmovdqu    [edx + 32], ymm2
3551     lea        edx, [edx + 64]
3552     sub        ecx, 16
3553     jg         convertloop
3554 
3555     vzeroupper
3556     ret
3557   }
3558 }
3559 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3560 
3561 #ifdef HAS_SETROW_X86
3562 // Write 'width' bytes using an 8 bit value repeated.
3563 // width should be multiple of 4.
3564 __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
3565   __asm {
3566     movzx      eax, byte ptr [esp + 8]  // v8
3567     mov        edx, 0x01010101  // Duplicate byte to all bytes.
3568     mul        edx  // overwrites edx with upper part of result.
3569     mov        edx, edi
3570     mov        edi, [esp + 4]  // dst
3571     mov        ecx, [esp + 12]  // width
3572     shr        ecx, 2
3573     rep stosd
3574     mov        edi, edx
3575     ret
3576   }
3577 }
3578 
3579 // Write 'width' bytes using an 8 bit value repeated.
3580 __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
3581   __asm {
3582     mov        edx, edi
3583     mov        edi, [esp + 4]  // dst
3584     mov        eax, [esp + 8]  // v8
3585     mov        ecx, [esp + 12]  // width
3586     rep stosb
3587     mov        edi, edx
3588     ret
3589   }
3590 }
3591 
3592 // Write 'width' 32 bit values.
3593 __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
3594                                       uint32_t v32,
3595                                       int width) {
3596   __asm {
3597     mov        edx, edi
3598     mov        edi, [esp + 4]  // dst
3599     mov        eax, [esp + 8]  // v32
3600     mov        ecx, [esp + 12]  // width
3601     rep stosd
3602     mov        edi, edx
3603     ret
3604   }
3605 }
3606 #endif  // HAS_SETROW_X86
3607 
3608 #ifdef HAS_YUY2TOYROW_AVX2
3609 __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
3610                                        uint8_t* dst_y,
3611                                        int width) {
3612   __asm {
3613     mov        eax, [esp + 4]  // src_yuy2
3614     mov        edx, [esp + 8]  // dst_y
3615     mov        ecx, [esp + 12]  // width
3616     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3617     vpsrlw     ymm5, ymm5, 8
3618 
3619   convertloop:
3620     vmovdqu    ymm0, [eax]
3621     vmovdqu    ymm1, [eax + 32]
3622     lea        eax,  [eax + 64]
3623     vpand      ymm0, ymm0, ymm5  // even bytes are Y
3624     vpand      ymm1, ymm1, ymm5
3625     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3626     vpermq     ymm0, ymm0, 0xd8
3627     vmovdqu    [edx], ymm0
3628     lea        edx, [edx + 32]
3629     sub        ecx, 32
3630     jg         convertloop
3631     vzeroupper
3632     ret
3633   }
3634 }
3635 
3636 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
3637                                         int stride_yuy2,
3638                                         uint8_t* dst_u,
3639                                         uint8_t* dst_v,
3640                                         int width) {
3641   __asm {
3642     push       esi
3643     push       edi
3644     mov        eax, [esp + 8 + 4]  // src_yuy2
3645     mov        esi, [esp + 8 + 8]  // stride_yuy2
3646     mov        edx, [esp + 8 + 12]  // dst_u
3647     mov        edi, [esp + 8 + 16]  // dst_v
3648     mov        ecx, [esp + 8 + 20]  // width
3649     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3650     vpsrlw     ymm5, ymm5, 8
3651     sub        edi, edx
3652 
3653   convertloop:
3654     vmovdqu    ymm0, [eax]
3655     vmovdqu    ymm1, [eax + 32]
3656     vpavgb     ymm0, ymm0, [eax + esi]
3657     vpavgb     ymm1, ymm1, [eax + esi + 32]
3658     lea        eax,  [eax + 64]
3659     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3660     vpsrlw     ymm1, ymm1, 8
3661     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3662     vpermq     ymm0, ymm0, 0xd8
3663     vpand      ymm1, ymm0, ymm5  // U
3664     vpsrlw     ymm0, ymm0, 8  // V
3665     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3666     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3667     vpermq     ymm1, ymm1, 0xd8
3668     vpermq     ymm0, ymm0, 0xd8
3669     vextractf128 [edx], ymm1, 0  // U
3670     vextractf128 [edx + edi], ymm0, 0  // V
3671     lea        edx, [edx + 16]
3672     sub        ecx, 32
3673     jg         convertloop
3674 
3675     pop        edi
3676     pop        esi
3677     vzeroupper
3678     ret
3679   }
3680 }
3681 
3682 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
3683                                            uint8_t* dst_u,
3684                                            uint8_t* dst_v,
3685                                            int width) {
3686   __asm {
3687     push       edi
3688     mov        eax, [esp + 4 + 4]  // src_yuy2
3689     mov        edx, [esp + 4 + 8]  // dst_u
3690     mov        edi, [esp + 4 + 12]  // dst_v
3691     mov        ecx, [esp + 4 + 16]  // width
3692     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3693     vpsrlw     ymm5, ymm5, 8
3694     sub        edi, edx
3695 
3696   convertloop:
3697     vmovdqu    ymm0, [eax]
3698     vmovdqu    ymm1, [eax + 32]
3699     lea        eax,  [eax + 64]
3700     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3701     vpsrlw     ymm1, ymm1, 8
3702     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3703     vpermq     ymm0, ymm0, 0xd8
3704     vpand      ymm1, ymm0, ymm5  // U
3705     vpsrlw     ymm0, ymm0, 8  // V
3706     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3707     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3708     vpermq     ymm1, ymm1, 0xd8
3709     vpermq     ymm0, ymm0, 0xd8
3710     vextractf128 [edx], ymm1, 0  // U
3711     vextractf128 [edx + edi], ymm0, 0  // V
3712     lea        edx, [edx + 16]
3713     sub        ecx, 32
3714     jg         convertloop
3715 
3716     pop        edi
3717     vzeroupper
3718     ret
3719   }
3720 }
3721 
3722 __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
3723                                        uint8_t* dst_y,
3724                                        int width) {
3725   __asm {
3726     mov        eax, [esp + 4]  // src_uyvy
3727     mov        edx, [esp + 8]  // dst_y
3728     mov        ecx, [esp + 12]  // width
3729 
3730   convertloop:
3731     vmovdqu    ymm0, [eax]
3732     vmovdqu    ymm1, [eax + 32]
3733     lea        eax,  [eax + 64]
3734     vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
3735     vpsrlw     ymm1, ymm1, 8
3736     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3737     vpermq     ymm0, ymm0, 0xd8
3738     vmovdqu    [edx], ymm0
3739     lea        edx, [edx + 32]
3740     sub        ecx, 32
3741     jg         convertloop
3742     vzeroupper
3743     ret
3744   }
3745 }
3746 
3747 __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
3748                                         int stride_uyvy,
3749                                         uint8_t* dst_u,
3750                                         uint8_t* dst_v,
3751                                         int width) {
3752   __asm {
3753     push       esi
3754     push       edi
    mov        eax, [esp + 8 + 4]  // src_uyvy
    mov        esi, [esp + 8 + 8]  // stride_uyvy
3757     mov        edx, [esp + 8 + 12]  // dst_u
3758     mov        edi, [esp + 8 + 16]  // dst_v
3759     mov        ecx, [esp + 8 + 20]  // width
3760     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3761     vpsrlw     ymm5, ymm5, 8
3762     sub        edi, edx
3763 
3764   convertloop:
3765     vmovdqu    ymm0, [eax]
3766     vmovdqu    ymm1, [eax + 32]
3767     vpavgb     ymm0, ymm0, [eax + esi]
3768     vpavgb     ymm1, ymm1, [eax + esi + 32]
3769     lea        eax,  [eax + 64]
3770     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
3771     vpand      ymm1, ymm1, ymm5
3772     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3773     vpermq     ymm0, ymm0, 0xd8
3774     vpand      ymm1, ymm0, ymm5  // U
3775     vpsrlw     ymm0, ymm0, 8  // V
3776     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3777     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3778     vpermq     ymm1, ymm1, 0xd8
3779     vpermq     ymm0, ymm0, 0xd8
3780     vextractf128 [edx], ymm1, 0  // U
3781     vextractf128 [edx + edi], ymm0, 0  // V
3782     lea        edx, [edx + 16]
3783     sub        ecx, 32
3784     jg         convertloop
3785 
3786     pop        edi
3787     pop        esi
3788     vzeroupper
3789     ret
3790   }
3791 }
3792 
3793 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
3794                                            uint8_t* dst_u,
3795                                            uint8_t* dst_v,
3796                                            int width) {
3797   __asm {
3798     push       edi
    mov        eax, [esp + 4 + 4]  // src_uyvy
3800     mov        edx, [esp + 4 + 8]  // dst_u
3801     mov        edi, [esp + 4 + 12]  // dst_v
3802     mov        ecx, [esp + 4 + 16]  // width
3803     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3804     vpsrlw     ymm5, ymm5, 8
3805     sub        edi, edx
3806 
3807   convertloop:
3808     vmovdqu    ymm0, [eax]
3809     vmovdqu    ymm1, [eax + 32]
3810     lea        eax,  [eax + 64]
3811     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
3812     vpand      ymm1, ymm1, ymm5
3813     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3814     vpermq     ymm0, ymm0, 0xd8
3815     vpand      ymm1, ymm0, ymm5  // U
3816     vpsrlw     ymm0, ymm0, 8  // V
3817     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3818     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3819     vpermq     ymm1, ymm1, 0xd8
3820     vpermq     ymm0, ymm0, 0xd8
3821     vextractf128 [edx], ymm1, 0  // U
3822     vextractf128 [edx + edi], ymm0, 0  // V
3823     lea        edx, [edx + 16]
3824     sub        ecx, 32
3825     jg         convertloop
3826 
3827     pop        edi
3828     vzeroupper
3829     ret
3830   }
3831 }
3832 #endif  // HAS_YUY2TOYROW_AVX2
3833 
3834 #ifdef HAS_YUY2TOYROW_SSE2
3835 __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
3836                                        uint8_t* dst_y,
3837                                        int width) {
3838   __asm {
3839     mov        eax, [esp + 4]  // src_yuy2
3840     mov        edx, [esp + 8]  // dst_y
3841     mov        ecx, [esp + 12]  // width
3842     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3843     psrlw      xmm5, 8
3844 
3845   convertloop:
3846     movdqu     xmm0, [eax]
3847     movdqu     xmm1, [eax + 16]
3848     lea        eax,  [eax + 32]
3849     pand       xmm0, xmm5  // even bytes are Y
3850     pand       xmm1, xmm5
3851     packuswb   xmm0, xmm1
3852     movdqu     [edx], xmm0
3853     lea        edx, [edx + 16]
3854     sub        ecx, 16
3855     jg         convertloop
3856     ret
3857   }
3858 }
3859 
3860 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
3861                                         int stride_yuy2,
3862                                         uint8_t* dst_u,
3863                                         uint8_t* dst_v,
3864                                         int width) {
3865   __asm {
3866     push       esi
3867     push       edi
3868     mov        eax, [esp + 8 + 4]  // src_yuy2
3869     mov        esi, [esp + 8 + 8]  // stride_yuy2
3870     mov        edx, [esp + 8 + 12]  // dst_u
3871     mov        edi, [esp + 8 + 16]  // dst_v
3872     mov        ecx, [esp + 8 + 20]  // width
3873     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3874     psrlw      xmm5, 8
3875     sub        edi, edx
3876 
3877   convertloop:
3878     movdqu     xmm0, [eax]
3879     movdqu     xmm1, [eax + 16]
3880     movdqu     xmm2, [eax + esi]
3881     movdqu     xmm3, [eax + esi + 16]
3882     lea        eax,  [eax + 32]
3883     pavgb      xmm0, xmm2
3884     pavgb      xmm1, xmm3
3885     psrlw      xmm0, 8  // YUYV -> UVUV
3886     psrlw      xmm1, 8
3887     packuswb   xmm0, xmm1
3888     movdqa     xmm1, xmm0
3889     pand       xmm0, xmm5  // U
3890     packuswb   xmm0, xmm0
3891     psrlw      xmm1, 8  // V
3892     packuswb   xmm1, xmm1
3893     movq       qword ptr [edx], xmm0
3894     movq       qword ptr [edx + edi], xmm1
3895     lea        edx, [edx + 8]
3896     sub        ecx, 16
3897     jg         convertloop
3898 
3899     pop        edi
3900     pop        esi
3901     ret
3902   }
3903 }
3904 
3905 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
3906                                            uint8_t* dst_u,
3907                                            uint8_t* dst_v,
3908                                            int width) {
3909   __asm {
3910     push       edi
3911     mov        eax, [esp + 4 + 4]  // src_yuy2
3912     mov        edx, [esp + 4 + 8]  // dst_u
3913     mov        edi, [esp + 4 + 12]  // dst_v
3914     mov        ecx, [esp + 4 + 16]  // width
3915     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3916     psrlw      xmm5, 8
3917     sub        edi, edx
3918 
3919   convertloop:
3920     movdqu     xmm0, [eax]
3921     movdqu     xmm1, [eax + 16]
3922     lea        eax,  [eax + 32]
3923     psrlw      xmm0, 8  // YUYV -> UVUV
3924     psrlw      xmm1, 8
3925     packuswb   xmm0, xmm1
3926     movdqa     xmm1, xmm0
3927     pand       xmm0, xmm5  // U
3928     packuswb   xmm0, xmm0
3929     psrlw      xmm1, 8  // V
3930     packuswb   xmm1, xmm1
3931     movq       qword ptr [edx], xmm0
3932     movq       qword ptr [edx + edi], xmm1
3933     lea        edx, [edx + 8]
3934     sub        ecx, 16
3935     jg         convertloop
3936 
3937     pop        edi
3938     ret
3939   }
3940 }
3941 
3942 __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
3943                                        uint8_t* dst_y,
3944                                        int width) {
3945   __asm {
3946     mov        eax, [esp + 4]  // src_uyvy
3947     mov        edx, [esp + 8]  // dst_y
3948     mov        ecx, [esp + 12]  // width
3949 
3950   convertloop:
3951     movdqu     xmm0, [eax]
3952     movdqu     xmm1, [eax + 16]
3953     lea        eax,  [eax + 32]
3954     psrlw      xmm0, 8  // odd bytes are Y
3955     psrlw      xmm1, 8
3956     packuswb   xmm0, xmm1
3957     movdqu     [edx], xmm0
3958     lea        edx, [edx + 16]
3959     sub        ecx, 16
3960     jg         convertloop
3961     ret
3962   }
3963 }
3964 
3965 __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
3966                                         int stride_uyvy,
3967                                         uint8_t* dst_u,
3968                                         uint8_t* dst_v,
3969                                         int width) {
3970   __asm {
3971     push       esi
3972     push       edi
    mov        eax, [esp + 8 + 4]  // src_uyvy
    mov        esi, [esp + 8 + 8]  // stride_uyvy
3975     mov        edx, [esp + 8 + 12]  // dst_u
3976     mov        edi, [esp + 8 + 16]  // dst_v
3977     mov        ecx, [esp + 8 + 20]  // width
3978     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3979     psrlw      xmm5, 8
3980     sub        edi, edx
3981 
3982   convertloop:
3983     movdqu     xmm0, [eax]
3984     movdqu     xmm1, [eax + 16]
3985     movdqu     xmm2, [eax + esi]
3986     movdqu     xmm3, [eax + esi + 16]
3987     lea        eax,  [eax + 32]
3988     pavgb      xmm0, xmm2
3989     pavgb      xmm1, xmm3
3990     pand       xmm0, xmm5  // UYVY -> UVUV
3991     pand       xmm1, xmm5
3992     packuswb   xmm0, xmm1
3993     movdqa     xmm1, xmm0
3994     pand       xmm0, xmm5  // U
3995     packuswb   xmm0, xmm0
3996     psrlw      xmm1, 8  // V
3997     packuswb   xmm1, xmm1
3998     movq       qword ptr [edx], xmm0
3999     movq       qword ptr [edx + edi], xmm1
4000     lea        edx, [edx + 8]
4001     sub        ecx, 16
4002     jg         convertloop
4003 
4004     pop        edi
4005     pop        esi
4006     ret
4007   }
4008 }
4009 
4010 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4011                                            uint8_t* dst_u,
4012                                            uint8_t* dst_v,
4013                                            int width) {
4014   __asm {
4015     push       edi
    mov        eax, [esp + 4 + 4]  // src_uyvy
4017     mov        edx, [esp + 4 + 8]  // dst_u
4018     mov        edi, [esp + 4 + 12]  // dst_v
4019     mov        ecx, [esp + 4 + 16]  // width
4020     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4021     psrlw      xmm5, 8
4022     sub        edi, edx
4023 
4024   convertloop:
4025     movdqu     xmm0, [eax]
4026     movdqu     xmm1, [eax + 16]
4027     lea        eax,  [eax + 32]
4028     pand       xmm0, xmm5  // UYVY -> UVUV
4029     pand       xmm1, xmm5
4030     packuswb   xmm0, xmm1
4031     movdqa     xmm1, xmm0
4032     pand       xmm0, xmm5  // U
4033     packuswb   xmm0, xmm0
4034     psrlw      xmm1, 8  // V
4035     packuswb   xmm1, xmm1
4036     movq       qword ptr [edx], xmm0
4037     movq       qword ptr [edx + edi], xmm1
4038     lea        edx, [edx + 8]
4039     sub        ecx, 16
4040     jg         convertloop
4041 
4042     pop        edi
4043     ret
4044   }
4045 }
4046 #endif  // HAS_YUY2TOYROW_SSE2
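
// Scalar sketch documenting the packed 4:2:2 layouts handled above
// (illustrative only).  YUY2 stores bytes as Y0 U Y1 V and UYVY stores
// U Y0 V Y1; the ToUVRow variants also average two source rows, which
// pavgb does with round-to-nearest: (a + b + 1) >> 1.
static void YUY2ToUVRow_Reference(const uint8_t* src_yuy2,
                                  int stride_yuy2,
                                  uint8_t* dst_u,
                                  uint8_t* dst_v,
                                  int width) {
  int x;
  for (x = 0; x < width / 2; ++x) {  // one U and one V per 2 pixels
    const uint8_t* p0 = src_yuy2 + x * 4;  // Y0 U Y1 V
    const uint8_t* p1 = p0 + stride_yuy2;  // same 2 pixels, next row
    dst_u[x] = (uint8_t)((p0[1] + p1[1] + 1) >> 1);  // pavgb rounding
    dst_v[x] = (uint8_t)((p0[3] + p1[3] + 1) >> 1);
  }
}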
4047 
4048 #ifdef HAS_BLENDPLANEROW_SSSE3
4049 // Blend 8 pixels at a time.
4050 // unsigned version of math
4051 // =((A2*C2)+(B2*(255-C2))+255)/256
4052 // signed version of math
4053 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4054 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
4055                                            const uint8_t* src1,
4056                                            const uint8_t* alpha,
4057                                            uint8_t* dst,
4058                                            int width) {
4059   __asm {
4060     push       esi
4061     push       edi
4062     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
4063     psllw      xmm5, 8
4064     mov        eax, 0x80808080  // 128 for biasing image to signed.
4065     movd       xmm6, eax
4066     pshufd     xmm6, xmm6, 0x00
4067 
4068     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
4069     movd       xmm7, eax
4070     pshufd     xmm7, xmm7, 0x00
4071     mov        eax, [esp + 8 + 4]  // src0
4072     mov        edx, [esp + 8 + 8]  // src1
4073     mov        esi, [esp + 8 + 12]  // alpha
4074     mov        edi, [esp + 8 + 16]  // dst
4075     mov        ecx, [esp + 8 + 20]  // width
4076     sub        eax, esi
4077     sub        edx, esi
4078     sub        edi, esi
4079 
4080         // 8 pixel loop.
4081   convertloop8:
4082     movq       xmm0, qword ptr [esi]  // alpha
4083     punpcklbw  xmm0, xmm0
4084     pxor       xmm0, xmm5  // a, 255-a
4085     movq       xmm1, qword ptr [eax + esi]  // src0
4086     movq       xmm2, qword ptr [edx + esi]  // src1
4087     punpcklbw  xmm1, xmm2
4088     psubb      xmm1, xmm6  // bias src0/1 - 128
4089     pmaddubsw  xmm0, xmm1
4090     paddw      xmm0, xmm7  // unbias result - 32768 and round.
4091     psrlw      xmm0, 8
4092     packuswb   xmm0, xmm0
4093     movq       qword ptr [edi + esi], xmm0
4094     lea        esi, [esi + 8]
4095     sub        ecx, 8
4096     jg         convertloop8
4097 
4098     pop        edi
4099     pop        esi
4100     ret
4101   }
4102 }
4103 #endif  // HAS_BLENDPLANEROW_SSSE3
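
// Scalar sketch of the blend math above (illustrative only).  The signed
// form is what pmaddubsw needs, since its second operand is signed: the
// sources are biased by -128 before the multiply and the 0x807f constant
// adds the 32768 back plus 127 for rounding.  Because 32768 + 127 minus the
// bias term 128 * 255 = 32640 equals 255, both forms yield the same byte.
static uint8_t BlendPlanePixel_Reference(uint8_t src0,
                                         uint8_t src1,
                                         uint8_t alpha) {
  return (uint8_t)((src0 * alpha + src1 * (255 - alpha) + 255) >> 8);
}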
4104 
4105 #ifdef HAS_BLENDPLANEROW_AVX2
4106 // Blend 32 pixels at a time.
4107 // unsigned version of math
4108 // =((A2*C2)+(B2*(255-C2))+255)/256
4109 // signed version of math
4110 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4111 __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
4112                                           const uint8_t* src1,
4113                                           const uint8_t* alpha,
4114                                           uint8_t* dst,
4115                                           int width) {
4116   __asm {
4117     push        esi
4118     push        edi
4119     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
4120     vpsllw      ymm5, ymm5, 8
4121     mov         eax, 0x80808080  // 128 for biasing image to signed.
4122     vmovd       xmm6, eax
4123     vbroadcastss ymm6, xmm6
4124     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
4125     vmovd       xmm7, eax
4126     vbroadcastss ymm7, xmm7
4127     mov         eax, [esp + 8 + 4]  // src0
4128     mov         edx, [esp + 8 + 8]  // src1
4129     mov         esi, [esp + 8 + 12]  // alpha
4130     mov         edi, [esp + 8 + 16]  // dst
4131     mov         ecx, [esp + 8 + 20]  // width
4132     sub         eax, esi
4133     sub         edx, esi
4134     sub         edi, esi
4135 
4136         // 32 pixel loop.
4137   convertloop32:
4138     vmovdqu     ymm0, [esi]  // alpha
4139     vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
4140     vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
4141     vpxor       ymm3, ymm3, ymm5  // a, 255-a
4142     vpxor       ymm0, ymm0, ymm5  // a, 255-a
4143     vmovdqu     ymm1, [eax + esi]  // src0
4144     vmovdqu     ymm2, [edx + esi]  // src1
4145     vpunpckhbw  ymm4, ymm1, ymm2
4146     vpunpcklbw  ymm1, ymm1, ymm2
4147     vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
4148     vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
4149     vpmaddubsw  ymm3, ymm3, ymm4
4150     vpmaddubsw  ymm0, ymm0, ymm1
4151     vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
4152     vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
4153     vpsrlw      ymm3, ymm3, 8
4154     vpsrlw      ymm0, ymm0, 8
4155     vpackuswb   ymm0, ymm0, ymm3
4156     vmovdqu     [edi + esi], ymm0
4157     lea         esi, [esi + 32]
4158     sub         ecx, 32
4159     jg          convertloop32
4160 
4161     pop         edi
4162     pop         esi
4163     vzeroupper
4164     ret
4165   }
4166 }
4167 #endif  // HAS_BLENDPLANEROW_AVX2
4168 
4169 #ifdef HAS_ARGBBLENDROW_SSSE3
4170 // Shuffle table for isolating alpha.
4171 static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
4172                                     11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4173 
4174 // Blend 8 pixels at a time.
4175 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
4176                                           const uint8_t* src_argb1,
4177                                           uint8_t* dst_argb,
4178                                           int width) {
4179   __asm {
4180     push       esi
4181     mov        eax, [esp + 4 + 4]  // src_argb0
4182     mov        esi, [esp + 4 + 8]  // src_argb1
4183     mov        edx, [esp + 4 + 12]  // dst_argb
4184     mov        ecx, [esp + 4 + 16]  // width
4185     pcmpeqb    xmm7, xmm7  // generate constant 0x0001
4186     psrlw      xmm7, 15
4187     pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
4188     psrlw      xmm6, 8
4189     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
4190     psllw      xmm5, 8
4191     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
4192     pslld      xmm4, 24
4193     sub        ecx, 4
4194     jl         convertloop4b  // less than 4 pixels?
4195 
4196         // 4 pixel loop.
4197   convertloop4:
4198     movdqu     xmm3, [eax]  // src argb
4199     lea        eax, [eax + 16]
4200     movdqa     xmm0, xmm3  // src argb
4201     pxor       xmm3, xmm4  // ~alpha
4202     movdqu     xmm2, [esi]  // _r_b
4203     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
4204     pand       xmm2, xmm6  // _r_b
4205     paddw      xmm3, xmm7  // 256 - alpha
4206     pmullw     xmm2, xmm3  // _r_b * alpha
4207     movdqu     xmm1, [esi]  // _a_g
4208     lea        esi, [esi + 16]
4209     psrlw      xmm1, 8  // _a_g
4210     por        xmm0, xmm4  // set alpha to 255
4211     pmullw     xmm1, xmm3  // _a_g * alpha
4212     psrlw      xmm2, 8  // _r_b convert to 8 bits again
4213     paddusb    xmm0, xmm2  // + src argb
4214     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
4215     paddusb    xmm0, xmm1  // + src argb
4216     movdqu     [edx], xmm0
4217     lea        edx, [edx + 16]
4218     sub        ecx, 4
4219     jge        convertloop4
4220 
4221   convertloop4b:
4222     add        ecx, 4 - 1
4223     jl         convertloop1b
4224 
4225         // 1 pixel loop.
4226   convertloop1:
4227     movd       xmm3, [eax]  // src argb
4228     lea        eax, [eax + 4]
4229     movdqa     xmm0, xmm3  // src argb
4230     pxor       xmm3, xmm4  // ~alpha
4231     movd       xmm2, [esi]  // _r_b
4232     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
4233     pand       xmm2, xmm6  // _r_b
4234     paddw      xmm3, xmm7  // 256 - alpha
4235     pmullw     xmm2, xmm3  // _r_b * alpha
4236     movd       xmm1, [esi]  // _a_g
4237     lea        esi, [esi + 4]
4238     psrlw      xmm1, 8  // _a_g
4239     por        xmm0, xmm4  // set alpha to 255
4240     pmullw     xmm1, xmm3  // _a_g * alpha
4241     psrlw      xmm2, 8  // _r_b convert to 8 bits again
4242     paddusb    xmm0, xmm2  // + src argb
4243     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
4244     paddusb    xmm0, xmm1  // + src argb
4245     movd       [edx], xmm0
4246     lea        edx, [edx + 4]
4247     sub        ecx, 1
4248     jge        convertloop1
4249 
4250   convertloop1b:
4251     pop        esi
4252     ret
4253   }
4254 }
4255 #endif  // HAS_ARGBBLENDROW_SSSE3
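
// Scalar sketch of the "over" blend computed above (illustrative only;
// per-pixel, hypothetical name).  Each color channel becomes
// src + dst * (256 - src_alpha) / 256 with a saturating add, and the result
// alpha is forced to 255, mirroring the paddusb / por steps in the SIMD path.
static void ARGBBlendPixel_Reference(const uint8_t* src_argb,
                                     const uint8_t* dst_in_argb,
                                     uint8_t* dst_argb) {
  uint32_t ia = 256u - src_argb[3];  // ~a + 1, as the pxor/paddw computes
  int c;
  for (c = 0; c < 3; ++c) {
    uint32_t v = src_argb[c] + ((dst_in_argb[c] * ia) >> 8);
    dst_argb[c] = (uint8_t)(v > 255u ? 255u : v);  // paddusb saturates
  }
  dst_argb[3] = 255u;  // alpha forced opaque
}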
4256 
4257 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4258 // Shuffle table duplicating alpha.
4259 static const uvec8 kShuffleAlpha0 = {
4260     3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4261 };
4262 static const uvec8 kShuffleAlpha1 = {
4263     11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4264     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4265 };
4266 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
4267                                               uint8_t* dst_argb,
4268                                               int width) {
4269   __asm {
    mov        eax, [esp + 4]  // src_argb
4271     mov        edx, [esp + 8]  // dst_argb
4272     mov        ecx, [esp + 12]  // width
4273     pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
4274     pslld      xmm3, 24
4275     movdqa     xmm4, xmmword ptr kShuffleAlpha0
4276     movdqa     xmm5, xmmword ptr kShuffleAlpha1
4277 
4278  convertloop:
4279     movdqu     xmm0, [eax]  // read 4 pixels
4280     pshufb     xmm0, xmm4  // isolate first 2 alphas
4281     movdqu     xmm1, [eax]  // read 4 pixels
4282     punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
4283     pmulhuw    xmm0, xmm1  // rgb * a
4284     movdqu     xmm1, [eax]  // read 4 pixels
4285     pshufb     xmm1, xmm5  // isolate next 2 alphas
4286     movdqu     xmm2, [eax]  // read 4 pixels
4287     punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
4288     pmulhuw    xmm1, xmm2  // rgb * a
4289     movdqu     xmm2, [eax]  // mask original alpha
4290     lea        eax, [eax + 16]
4291     pand       xmm2, xmm3
4292     psrlw      xmm0, 8
4293     psrlw      xmm1, 8
4294     packuswb   xmm0, xmm1
4295     por        xmm0, xmm2  // copy original alpha
4296     movdqu     [edx], xmm0
4297     lea        edx, [edx + 16]
4298     sub        ecx, 4
4299     jg         convertloop
4300 
4301     ret
4302   }
4303 }
4304 #endif  // HAS_ARGBATTENUATEROW_SSSE3
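
// Scalar sketch of the attenuation math above (illustrative only).  Each
// color byte is widened to 16 bits by duplication (c * 0x101, which is what
// punpcklbw of a register with itself achieves) and multiplied by the
// likewise widened alpha, so full alpha maps 255 to 255 exactly, where a
// plain (c * a) >> 8 would give 254.
static uint8_t AttenuatePixel_Reference(uint8_t c, uint8_t a) {
  return (uint8_t)(((uint32_t)(c * 0x0101u) * (a * 0x0101u)) >> 24);
}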
4305 
4306 #ifdef HAS_ARGBATTENUATEROW_AVX2
4307 // Shuffle table duplicating alpha.
4308 static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
4309                                          128u, 128u, 14u,  15u, 14u, 15u,
4310                                          14u,  15u,  128u, 128u};
4311 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
4312                                              uint8_t* dst_argb,
4313                                              int width) {
4314   __asm {
    mov        eax, [esp + 4]  // src_argb
4316     mov        edx, [esp + 8]  // dst_argb
4317     mov        ecx, [esp + 12]  // width
4318     sub        edx, eax
4319     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4320     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
4321     vpslld     ymm5, ymm5, 24
4322 
4323  convertloop:
4324     vmovdqu    ymm6, [eax]  // read 8 pixels.
4325     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4326     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4327     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
4328     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
4329     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
4330     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
4331     vpand      ymm6, ymm6, ymm5  // isolate alpha
4332     vpsrlw     ymm0, ymm0, 8
4333     vpsrlw     ymm1, ymm1, 8
4334     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4335     vpor       ymm0, ymm0, ymm6  // copy original alpha
4336     vmovdqu    [eax + edx], ymm0
4337     lea        eax, [eax + 32]
4338     sub        ecx, 8
4339     jg         convertloop
4340 
4341     vzeroupper
4342     ret
4343   }
4344 }
4345 #endif  // HAS_ARGBATTENUATEROW_AVX2
4346 
4347 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4348 // Unattenuate 4 pixels at a time.
4349 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
4350                                                uint8_t* dst_argb,
4351                                                int width) {
4352   __asm {
4353     push       ebx
4354     push       esi
4355     push       edi
4356     mov        eax, [esp + 12 + 4]  // src_argb
4357     mov        edx, [esp + 12 + 8]  // dst_argb
4358     mov        ecx, [esp + 12 + 12]  // width
4359     lea        ebx, fixed_invtbl8
4360 
4361  convertloop:
4362     movdqu     xmm0, [eax]  // read 4 pixels
4363     movzx      esi, byte ptr [eax + 3]  // first alpha
4364     movzx      edi, byte ptr [eax + 7]  // second alpha
4365     punpcklbw  xmm0, xmm0  // first 2
4366     movd       xmm2, dword ptr [ebx + esi * 4]
4367     movd       xmm3, dword ptr [ebx + edi * 4]
4368     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
4369     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
4370     movlhps    xmm2, xmm3
4371     pmulhuw    xmm0, xmm2  // rgb * a
4372 
4373     movdqu     xmm1, [eax]  // read 4 pixels
4374     movzx      esi, byte ptr [eax + 11]  // third alpha
    movzx      edi, byte ptr [eax + 15]  // fourth alpha
4376     punpckhbw  xmm1, xmm1  // next 2
4377     movd       xmm2, dword ptr [ebx + esi * 4]
4378     movd       xmm3, dword ptr [ebx + edi * 4]
4379     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
4380     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
4381     movlhps    xmm2, xmm3
4382     pmulhuw    xmm1, xmm2  // rgb * a
4383     lea        eax, [eax + 16]
4384     packuswb   xmm0, xmm1
4385     movdqu     [edx], xmm0
4386     lea        edx, [edx + 16]
4387     sub        ecx, 4
4388     jg         convertloop
4389 
4390     pop        edi
4391     pop        esi
4392     pop        ebx
4393     ret
4394   }
4395 }
4396 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
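
// Scalar sketch of the unattenuate math above (illustrative only).  The
// fixed_invtbl8 lookup holds an 8.8 fixed point reciprocal of alpha, so the
// per-pixel divide becomes a multiply; results saturate at 255 the way
// packuswb does.  (The real table truncates its entries to 16 bits, so
// behavior at very small alphas differs slightly from this sketch.)
static uint8_t UnattenuatePixel_Reference(uint8_t c, uint8_t a) {
  uint32_t ia = a ? 65536u / a : 256u;  // 8.8 reciprocal; 1.0 when a == 0
  uint32_t v = ((uint32_t)(c * 0x0101u) * ia) >> 16;  // pmulhuw, widened byte
  return (uint8_t)(v > 255u ? 255u : v);
}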
4397 
4398 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4399 // Shuffle table duplicating alpha.
4400 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4401     0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4402 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4403 // USE_GATHER is not on by default, due to being a slow instruction.
4404 #ifdef USE_GATHER
4405 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4406                                                uint8_t* dst_argb,
4407                                                int width) {
4408   __asm {
    mov        eax, [esp + 4]  // src_argb
4410     mov        edx, [esp + 8]  // dst_argb
4411     mov        ecx, [esp + 12]  // width
4412     sub        edx, eax
4413     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4414 
4415  convertloop:
4416     vmovdqu    ymm6, [eax]  // read 8 pixels.
4417     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
4418     vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
4419     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4420     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4421     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
4422     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4423     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4424     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
4425     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
4426     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4427     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4428     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4429     vmovdqu    [eax + edx], ymm0
4430     lea        eax, [eax + 32]
4431     sub        ecx, 8
4432     jg         convertloop
4433 
4434     vzeroupper
4435     ret
4436   }
4437 }
4438 #else   // USE_GATHER
4439 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4440                                                uint8_t* dst_argb,
4441                                                int width) {
  __asm {
    push       ebx
4445     push       esi
4446     push       edi
4447     mov        eax, [esp + 12 + 4]  // src_argb
4448     mov        edx, [esp + 12 + 8]  // dst_argb
4449     mov        ecx, [esp + 12 + 12]  // width
4450     sub        edx, eax
4451     lea        ebx, fixed_invtbl8
4452     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4453 
4454  convertloop:
4455         // replace VPGATHER
4456     movzx      esi, byte ptr [eax + 3]  // alpha0
4457     movzx      edi, byte ptr [eax + 7]  // alpha1
4458     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
4459     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
4460     movzx      esi, byte ptr [eax + 11]  // alpha2
4461     movzx      edi, byte ptr [eax + 15]  // alpha3
4462     vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
4463     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
4464     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
4465     movzx      esi, byte ptr [eax + 19]  // alpha4
4466     movzx      edi, byte ptr [eax + 23]  // alpha5
4467     vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
4468     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
4469     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
4470     movzx      esi, byte ptr [eax + 27]  // alpha6
4471     movzx      edi, byte ptr [eax + 31]  // alpha7
4472     vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
4473     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
4474     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
4475     vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
4476     vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
4477     vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
4478     vinserti128 ymm3, ymm3, xmm0, 1                // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4479     // end of VPGATHER
4480 
4481     vmovdqu    ymm6, [eax]  // read 8 pixels.
4482     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4483     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4484     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4485     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4486     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
4487     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
4488     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4489     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4490     vpackuswb  ymm0, ymm0, ymm1             // unmutated.
4491     vmovdqu    [eax + edx], ymm0
4492     lea        eax, [eax + 32]
4493     sub        ecx, 8
4494     jg         convertloop
4495 
4496     pop        edi
4497     pop        esi
4498     pop        ebx
4499     vzeroupper
4500     ret
4501   }
4502 }
4503 #endif  // USE_GATHER
4504 #endif  // HAS_ARGBATTENUATEROW_AVX2
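
// The following is an illustrative scalar sketch of what the unattenuate
// rows above compute, added for reference only and compiled out.  It uses a
// plain divide where the SIMD paths multiply by the 8.16 fixed-point
// reciprocals in fixed_invtbl8, so results may differ by a small rounding
// error.  The name is hypothetical, not part of libyuv's API.
#if 0
static void ARGBUnattenuateRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    uint32_t j;
    for (j = 0; j < 3; ++j) {  // undo c = c * a / 255 per color channel
      uint32_t c = a ? (src_argb[j] * 255u / a) : src_argb[j];
      dst_argb[j] = (uint8_t)(c > 255u ? 255u : c);
    }
    dst_argb[3] = (uint8_t)a;  // alpha is carried through
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif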

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_argb */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // Add .5 for rounding.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // 8 G bytes
    movdqu     xmm2, [eax]  // A
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm2, 24
    psrld      xmm3, 24
    packuswb   xmm2, xmm3
    packuswb   xmm2, xmm2  // 8 A bytes
    movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
    punpcklbw  xmm0, xmm0  // 8 GG words
    punpcklbw  xmm3, xmm2  // 8 GA words
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm3  // GGGA first 4
    punpckhwd  xmm1, xmm3  // GGGA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBGRAYROW_SSSE3
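
// Illustrative scalar sketch of the gray conversion above (reference only,
// compiled out).  It assumes the kARGBToYJ weights are 15/75/38 (summing to
// 128) with kAddYJ64 = 64 for rounding; those constants live elsewhere in
// this file, so treat the exact values here as an assumption.
#if 0
static void ARGBGrayRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // Weighted luma with round-to-nearest via the +64 before >> 7.
    uint8_t y = (uint8_t)(
        (src_argb[0] * 15 + src_argb[1] * 75 + src_argb[2] * 38 + 64) >> 7);
    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;  // B = G = R = gray
    dst_argb[3] = src_argb[3];                    // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif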

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4] /* dst_argb */
    mov        ecx, [esp + 8] /* width */
    movdqa     xmm2, xmmword ptr kARGBToSepiaB
    movdqa     xmm3, xmmword ptr kARGBToSepiaG
    movdqa     xmm4, xmmword ptr kARGBToSepiaR

 convertloop:
    movdqu     xmm0, [eax]  // B
    movdqu     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // 8 B values
    movdqu     xmm5, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5  // 8 G values
    punpcklbw  xmm0, xmm5  // 8 BG values
    movdqu     xmm5, [eax]  // R
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5  // 8 R values
    movdqu     xmm6, [eax]  // A
    movdqu     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6  // 8 A values
    punpcklbw  xmm5, xmm6  // 8 RA values
    movdqa     xmm1, xmm0  // Weave BG, RA together
    punpcklwd  xmm0, xmm5  // BGRA first 4
    punpckhwd  xmm1, xmm5  // BGRA next 4
    movdqu     [eax], xmm0
    movdqu     [eax + 16], xmm1
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
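
// Illustrative scalar sketch of the in-place sepia transform above
// (reference only, compiled out): the same fixed-point weights as the
// kARGBToSepia* tables, truncated by >> 7 and saturated to 255.
#if 0
static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);  // alpha is left unchanged
    dst_argb += 4;
  }
}
#endif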

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                                                uint8_t* dst_argb,
                                                const int8_t* matrix_argb,
                                                int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_argb */
    mov        ecx, [esp + 12] /* matrix_argb */
    movdqu     xmm5, [ecx]
    pshufd     xmm2, xmm5, 0x00
    pshufd     xmm3, xmm5, 0x55
    pshufd     xmm4, xmm5, 0xaa
    pshufd     xmm5, xmm5, 0xff
    mov        ecx, [esp + 16] /* width */

 convertloop:
    movdqu     xmm0, [eax]  // B
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm7, xmm2
    movdqu     xmm6, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm6, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm7  // B
    phaddsw    xmm6, xmm1  // G
    psraw      xmm0, 6  // B
    psraw      xmm6, 6  // G
    packuswb   xmm0, xmm0  // 8 B values
    packuswb   xmm6, xmm6  // 8 G values
    punpcklbw  xmm0, xmm6  // 8 BG values
    movdqu     xmm1, [eax]  // R
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm7, xmm4
    phaddsw    xmm1, xmm7  // R
    movdqu     xmm6, [eax]  // A
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm6, xmm5
    pmaddubsw  xmm7, xmm5
    phaddsw    xmm6, xmm7  // A
    psraw      xmm1, 6  // R
    psraw      xmm6, 6  // A
    packuswb   xmm1, xmm1  // 8 R values
    packuswb   xmm6, xmm6  // 8 A values
    punpcklbw  xmm1, xmm6  // 8 RA values
    movdqa     xmm6, xmm0  // Weave BG, RA together
    punpcklwd  xmm0, xmm1  // BGRA first 4
    punpckhwd  xmm6, xmm1  // BGRA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm6
    lea        eax, [eax + 32]
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
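
// Illustrative scalar sketch of the color matrix transform above (reference
// only, compiled out): each output channel is a signed dot product of
// (B, G, R, A) with one row of the 4x4 matrix, >> 6, clamped to 0..255.
// This idealized version ignores the intermediate 16-bit saturation that
// pmaddubsw/phaddsw apply in the SIMD path.
#if 0
static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* matrix_argb, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      int v = (src_argb[0] * matrix_argb[j * 4 + 0] +
               src_argb[1] * matrix_argb[j * 4 + 1] +
               src_argb[2] * matrix_argb[j * 4 + 2] +
               src_argb[3] * matrix_argb[j * 4 + 3]) >> 6;
      dst_argb[j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif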

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                                            int scale,
                                            int interval_size,
                                            int interval_offset,
                                            int width) {
  __asm {
    mov        eax, [esp + 4] /* dst_argb */
    movd       xmm2, [esp + 8] /* scale */
    movd       xmm3, [esp + 12] /* interval_size */
    movd       xmm4, [esp + 16] /* interval_offset */
    mov        ecx, [esp + 20] /* width */
    pshuflw    xmm2, xmm2, 040h
    pshufd     xmm2, xmm2, 044h
    pshuflw    xmm3, xmm3, 040h
    pshufd     xmm3, xmm3, 044h
    pshuflw    xmm4, xmm4, 040h
    pshufd     xmm4, xmm4, 044h
    pxor       xmm5, xmm5  // constant 0
    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
    pslld      xmm6, 24

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    punpcklbw  xmm0, xmm5  // first 2 pixels
    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
    movdqu     xmm1, [eax]  // read 4 pixels
    punpckhbw  xmm1, xmm5  // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3  // * interval_size
    movdqu     xmm7, [eax]  // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6  // mask alpha
    paddw      xmm0, xmm4  // + interval_offset
    paddw      xmm1, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm7
    movdqu     [eax], xmm0
    lea        eax, [eax + 16]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
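
// Illustrative scalar sketch of the quantization above (reference only,
// compiled out): each color channel is snapped to an interval as
// ((v * scale) >> 16) * interval_size + interval_offset, and alpha is
// preserved via the 0xff000000 mask.
#if 0
static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 3; ++j) {  // color channels only; dst_argb[3] is kept
      dst_argb[j] = (uint8_t)(((dst_argb[j] * scale) >> 16) * interval_size +
                              interval_offset);
    }
    dst_argb += 4;
  }
}
#endif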

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         uint32_t value) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    movd       xmm2, [esp + 16]  // value
    punpcklbw  xmm2, xmm2
    punpcklqdq xmm2, xmm2

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0  // first 2
    punpckhbw  xmm1, xmm1  // next 2
    pmulhuw    xmm0, xmm2  // argb * value
    pmulhuw    xmm1, xmm2  // argb * value
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBSHADEROW_SSE2
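
// Illustrative scalar sketch of the shade above (reference only, compiled
// out).  Unpacking a byte with itself forms v * 257, an 8.8 fixed-point
// value, so pmulhuw then >> 8 computes (p * 257 * v * 257) >> 24, which is
// within a step of p * v / 255 per channel.
#if 0
static void ARGBShadeRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width, uint32_t value) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      uint32_t v = (value >> (j * 8)) & 0xffu;  // per-channel multiplier
      dst_argb[j] = (uint8_t)((((src_argb[j] * 257u) * (v * 257u)) >> 16) >> 8);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif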

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb0
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
    movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm0, xmm0  // first 2
    punpckhbw  xmm1, xmm1  // next 2
    punpcklbw  xmm2, xmm5  // first 2
    punpckhbw  xmm3, xmm5  // next 2
    pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
    pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
    lea        eax, [eax + 16]
    lea        esi, [esi + 16]
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
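
// Worked math for the multiply above, as a compiled-out scalar sketch:
// unpacking a byte with itself gives p * 257 (8.8 fixed point), the other
// operand is zero extended, so pmulhuw computes (p * 257 * q) >> 16, a
// close approximation of p * q / 255 for every byte including alpha.
#if 0
static void ARGBMultiplyRow_Sketch(const uint8_t* src_argb0,
                                   const uint8_t* src_argb1, uint8_t* dst_argb,
                                   int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    dst_argb[i] = (uint8_t)((src_argb0[i] * 257u * src_argb1[i]) >> 16);
  }
}
#endif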

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb0
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    sub        ecx, 4
    jl         convertloop49

 convertloop4:
    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        convertloop4

 convertloop49:
    add        ecx, 4 - 1
    jl         convertloop19

 convertloop1:
    movd       xmm0, [eax]  // read 1 pixel from src_argb0
    lea        eax, [eax + 4]
    movd       xmm1, [esi]  // read 1 pixel from src_argb1
    lea        esi, [esi + 4]
    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        convertloop1

 convertloop19:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb0
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    psubusb    xmm0, xmm1  // src_argb0 - src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb0
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    vpxor      ymm5, ymm5, ymm5  // constant 0

 convertloop:
    vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1  // low 4
    vpunpckhbw ymm1, ymm1, ymm1  // high 4
    vpunpcklbw ymm2, ymm3, ymm5  // low 4
    vpunpckhbw ymm3, ymm3, ymm5  // high 4
    vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
    vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
    vpackuswb  ymm0, ymm0, ymm1
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb0
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb0
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      const uint8_t* src_y2,
                                      uint8_t* dst_sobelx,
                                      int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_y0
    mov        esi, [esp + 8 + 8]  // src_y1
    mov        edi, [esp + 8 + 12]  // src_y2
    mov        edx, [esp + 8 + 16]  // dst_sobelx
    mov        ecx, [esp + 8 + 20]  // width
    sub        esi, eax
    sub        edi, eax
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXROW_SSE2
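
// Illustrative scalar sketch of the SobelX filter above (reference only,
// compiled out): taps at x and x + 2 across three rows, middle row doubled,
// absolute value, saturated to a byte.  SobelY below is the transpose,
// sampling x, x + 1, x + 2 from two rows.
#if 0
static void SobelXRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             const uint8_t* src_y2, uint8_t* dst_sobelx,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = (src_y0[i] - src_y0[i + 2]) + 2 * (src_y1[i] - src_y1[i + 2]) +
            (src_y2[i] - src_y2[i + 2]);
    if (s < 0) s = -s;  // abs, as pmaxsw(x, -x) in the asm
    dst_sobelx[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
#endif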

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      uint8_t* dst_sobely,
                                      int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_y0
    mov        esi, [esp + 4 + 8]  // src_y1
    mov        edx, [esp + 4 + 12]  // dst_sobely
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
                                     const uint8_t* src_sobely,
                                     uint8_t* dst_argb,
                                     int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_sobelx
    mov        esi, [esp + 4 + 8]  // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    pcmpeqb    xmm5, xmm5  // alpha 255
    pslld      xmm5, 24  // 0xff000000

 convertloop:
    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
    movdqa     xmm2, xmm0  // GG
    punpcklbw  xmm2, xmm0  // First 8
    punpckhbw  xmm0, xmm0  // Next 8
    movdqa     xmm1, xmm2  // GGGG
    punpcklwd  xmm1, xmm2  // First 4
    punpckhwd  xmm2, xmm2  // Next 4
    por        xmm1, xmm5  // GGGA
    por        xmm2, xmm5
    movdqa     xmm3, xmm0  // GGGG
    punpcklwd  xmm3, xmm0  // Next 4
    punpckhwd  xmm0, xmm0  // Last 4
    por        xmm3, xmm5  // GGGA
    por        xmm0, xmm5
    movdqu     [edx], xmm1
    movdqu     [edx + 16], xmm2
    movdqu     [edx + 32], xmm3
    movdqu     [edx + 48], xmm0
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELROW_SSE2
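
// Illustrative scalar sketch of the packing above (reference only, compiled
// out): the saturated sum of the two gradients is replicated into B, G and
// R, with alpha forced to 255.
#if 0
static void SobelRow_Sketch(const uint8_t* src_sobelx,
                            const uint8_t* src_sobely, uint8_t* dst_argb,
                            int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8_t g = (uint8_t)(s > 255 ? 255 : s);  // saturated add, as paddusb
    dst_argb[0] = g;
    dst_argb[1] = g;
    dst_argb[2] = g;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
#endif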

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                                            const uint8_t* src_sobely,
                                            uint8_t* dst_y,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_sobelx
    mov        esi, [esp + 4 + 8]  // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_y
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax

 convertloop:
    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                                       const uint8_t* src_sobely,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_sobelx
    mov        esi, [esp + 4 + 8]  // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    pcmpeqb    xmm5, xmm5  // alpha 255

 convertloop:
    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    paddusb    xmm2, xmm1  // sobel = sobelx + sobely
    movdqa     xmm3, xmm0  // XA
    punpcklbw  xmm3, xmm5
    punpckhbw  xmm0, xmm5
    movdqa     xmm4, xmm1  // YS
    punpcklbw  xmm4, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     xmm6, xmm4  // YSXA
    punpcklwd  xmm6, xmm3  // First 4
    punpckhwd  xmm4, xmm3  // Next 4
    movdqa     xmm7, xmm1  // YSXA
    punpcklwd  xmm7, xmm0  // Next 4
    punpckhwd  xmm1, xmm0  // Last 4
    movdqu     [edx], xmm6
    movdqu     [edx + 16], xmm4
    movdqu     [edx + 32], xmm7
    movdqu     [edx + 48], xmm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXYROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
//   in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  __asm {
    mov        eax, topleft  // eax topleft
    mov        esi, botleft  // esi botleft
    mov        edx, width
    movd       xmm5, area
    mov        edi, dst
    mov        ecx, count
    cvtdq2ps   xmm5, xmm5
    rcpss      xmm4, xmm5  // 1.0f / area
    pshufd     xmm4, xmm4, 0
    sub        ecx, 4
    jl         l4b

    cmp        area, 128  // 128 pixels will not overflow 15 bits.
    ja         l4

    pshufd     xmm5, xmm5, 0  // area
    pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
    psrld      xmm6, 16
    cvtdq2ps   xmm6, xmm6
    addps      xmm5, xmm6  // (65536.0 + area - 1)
    mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
    cvtps2dq   xmm5, xmm5  // 0.16 fixed point
    packssdw   xmm5, xmm5  // 16 bit shorts

        // 4 pixel loop small blocks.
  s4:
        // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
    packssdw   xmm2, xmm3

    pmulhuw    xmm0, xmm5
    pmulhuw    xmm2, xmm5

    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        s4

    jmp        l4b

        // 4 pixel loop
  l4:
        // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
    cvtdq2ps   xmm1, xmm1
    mulps      xmm0, xmm4
    mulps      xmm1, xmm4
    cvtdq2ps   xmm2, xmm2
    cvtdq2ps   xmm3, xmm3
    mulps      xmm2, xmm4
    mulps      xmm3, xmm4
    cvtps2dq   xmm0, xmm0
    cvtps2dq   xmm1, xmm1
    cvtps2dq   xmm2, xmm2
    cvtps2dq   xmm3, xmm3
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

        // 1 pixel loop
  l1:
    movdqu     xmm0, [eax]
    psubd      xmm0, [eax + edx * 4]
    lea        eax, [eax + 16]
    psubd      xmm0, [esi]
    paddd      xmm0, [esi + edx * 4]
    lea        esi, [esi + 16]
    cvtdq2ps   xmm0, xmm0
    mulps      xmm0, xmm4
    cvtps2dq   xmm0, xmm0
    packssdw   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 1
    jge        l1
  l1b:
  }
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
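
// Illustrative scalar sketch of the integral-image average above (reference
// only, compiled out): the sum over a rectangle is
// topleft - topright - botleft + botright, divided by the pixel count.
// The asm divides by multiplying with 1 / area (rcpss), or with a rounded
// 0.16 fixed-point reciprocal for areas of 128 pixels or fewer.
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
                                             const int32_t* botleft, int width,
                                             int area, uint8_t* dst,
                                             int count) {
  int i, j;
  for (i = 0; i < count; ++i) {
    for (j = 0; j < 4; ++j) {  // 4 ints (one ARGB pixel) at a time
      int32_t sum = topleft[j] - topleft[width + j] - botleft[j] +
                    botleft[width + j];
      dst[j] = (uint8_t)(sum / area);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}
#endif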

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  __asm {
    mov        eax, row
    mov        edx, cumsum
    mov        esi, previous_cumsum
    mov        ecx, width
    pxor       xmm0, xmm0
    pxor       xmm1, xmm1

    sub        ecx, 4
    jl         l4b
    test       edx, 15
    jne        l4b

        // 4 pixel loop
  l4:
    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea        eax, [eax + 16]
    movdqa     xmm4, xmm2

    punpcklbw  xmm2, xmm1
    movdqa     xmm3, xmm2
    punpcklwd  xmm2, xmm1
    punpckhwd  xmm3, xmm1

    punpckhbw  xmm4, xmm1
    movdqa     xmm5, xmm4
    punpcklwd  xmm4, xmm1
    punpckhwd  xmm5, xmm1

    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
    movdqu     xmm3, [esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
    movdqu     xmm4, [esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
    movdqu     xmm5, [esi + 48]
    lea        esi, [esi + 64]
    paddd      xmm5, xmm0

    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    movdqu     [edx + 32], xmm4
    movdqu     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

        // 1 pixel loop
  l1:
    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea        eax, [eax + 4]
    punpcklbw  xmm2, xmm1
    punpcklwd  xmm2, xmm1
    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]
    lea        esi, [esi + 16]
    paddd      xmm2, xmm0
    movdqu     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 1
    jge        l1

 l1b:
  }
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
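
// Illustrative scalar sketch of the cumulative sum above (reference only,
// compiled out): each output int is the total of everything above and to
// the left, i.e. a running sum along the row added to the previous
// scanline's cumulative row.
#if 0
static void ComputeCumulativeSumRow_Sketch(const uint8_t* row, int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t rowsum[4] = {0, 0, 0, 0};  // running B, G, R, A totals
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      rowsum[j] += row[i * 4 + j];
      cumsum[i * 4 + j] = rowsum[j] + previous_cumsum[i * 4 + j];
    }
  }
}
#endif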

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                                                     int src_argb_stride,
                                                     uint8_t* dst_argb,
                                                     const float* uv_dudv,
                                                     int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 12]  // src_argb
    mov        esi, [esp + 16]  // stride
    mov        edx, [esp + 20]  // dst_argb
    mov        ecx, [esp + 24]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]  // uv
    movq       xmm7, qword ptr [ecx + 8]  // dudv
    mov        ecx, [esp + 28]  // width
    shl        esi, 16  // 4, stride
    add        esi, 4
    movd       xmm5, esi
    sub        ecx, 4
    jl         l4b

        // setup for 4 pixel loop
    pshufd     xmm7, xmm7, 0x44  // dup dudv
    pshufd     xmm5, xmm5, 0  // dup 4, stride
    movdqa     xmm0, xmm2  // x0, y0, x1, y1
    addps      xmm0, xmm7
    movlhps    xmm2, xmm0
    movdqa     xmm4, xmm7
    addps      xmm4, xmm4  // dudv *= 2
    movdqa     xmm3, xmm2  // x2, y2, x3, y3
    addps      xmm3, xmm4
    addps      xmm4, xmm4  // dudv *= 4

        // 4 pixel loop
  l4:
    cvttps2dq  xmm0, xmm2  // x, y float to int first 2
    cvttps2dq  xmm1, xmm3  // x, y float to int next 2
    packssdw   xmm0, xmm1  // x, y as 8 shorts
    pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       xmm1, [eax + esi]  // read pixel 0
    movd       xmm6, [eax + edi]  // read pixel 1
    punpckldq  xmm1, xmm6  // combine pixel 0 and 1
    addps      xmm2, xmm4  // x, y += dx, dy first 2
    movq       qword ptr [edx], xmm1
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    movd       xmm6, [eax + esi]  // read pixel 2
    movd       xmm0, [eax + edi]  // read pixel 3
    punpckldq  xmm6, xmm0  // combine pixel 2 and 3
    addps      xmm3, xmm4  // x, y += dx, dy next 2
    movq       qword ptr 8[edx], xmm6
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

        // 1 pixel loop
  l1:
    cvttps2dq  xmm0, xmm2  // x, y float to int
    packssdw   xmm0, xmm0  // x, y as shorts
    pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
    addps      xmm2, xmm7  // x, y += dx, dy
    movd       esi, xmm0
    movd       xmm0, [eax + esi]  // copy a pixel
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        l1
  l1b:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBAFFINEROW_SSE2
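
// Illustrative scalar sketch of the affine row copy above (reference only,
// compiled out): uv_dudv holds the starting source position (u, v) and the
// per-pixel step (du, dv); each destination pixel is fetched from byte
// offset u * 4 + v * stride, truncating toward zero as cvttps2dq does.
#if 0
static void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_argb_stride,
                                 uint8_t* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int off = (int)u * 4 + (int)v * src_argb_stride;
    // Copy one ARGB pixel, as the asm's movd load/store pair.
    *(uint32_t*)(dst_argb + i * 4) = *(const uint32_t*)(src_argb + off);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
#endif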

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
                                           const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           int dst_width,
                                           int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]  // dst_ptr
    mov        esi, [esp + 8 + 8]  // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    sub        edi, esi
    cmp        eax, 128
    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.

    vmovd      xmm0, eax  // high fraction 0..255
    neg        eax
    add        eax, 256
    vmovd      xmm5, eax  // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5

    mov        eax, 0x80808080  // 128b for bias and rounding.
    vmovd      xmm4, eax
    vbroadcastss ymm4, xmm4

  xloop:
    vmovdqu    ymm0, [esi]
    vmovdqu    ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2  // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
    vpsubb     ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw     ymm1, ymm1, ymm4  // unbias and round
    vpaddw     ymm0, ymm0, ymm4
    vpsrlw     ymm1, ymm1, 8
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm1  // unmutates
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         xloop
    jmp        xloop99

        // Blend 50 / 50.
 xloop50:
   vmovdqu    ymm0, [esi]
   vpavgb     ymm0, ymm0, [esi + edx]
   vmovdqu    [esi + edi], ymm0
   lea        esi, [esi + 32]
   sub        ecx, 32
   jg         xloop50
   jmp        xloop99

        // Blend 100 / 0 - Copy row unchanged.
 xloop100:
   rep movsb

  xloop99:
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_AVX2

// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                                            const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            int dst_width,
                                            int source_y_fraction) {
  __asm {
    push       esi
    push       edi

    mov        edi, [esp + 8 + 4]  // dst_ptr
    mov        esi, [esp + 8 + 8]  // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
        // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 128
    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.

    movd       xmm0, eax  // high fraction 0..255
    neg        eax
    add        eax, 256
    movd       xmm5, eax  // low fraction 255..1
    punpcklbw  xmm5, xmm0
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0x00

  xloop:
    movdqu     xmm0, [esi]
    movdqu     xmm2, [esi + edx]
    movdqu     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    psubb      xmm0, xmm4  // bias image by -128
    psubb      xmm1, xmm4
    movdqa     xmm2, xmm5
    movdqa     xmm3, xmm5
    pmaddubsw  xmm2, xmm0
    pmaddubsw  xmm3, xmm1
    paddw      xmm2, xmm4
    paddw      xmm3, xmm4
    psrlw      xmm2, 8
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [esi + edi], xmm2
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop
    jmp        xloop99

        // Blend 50 / 50.
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop50
    jmp        xloop99

        // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu     xmm0, [esi]
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
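
// Illustrative scalar sketch of the bilinear blends above (reference only,
// compiled out), including the signed-bias trick: pmaddubsw needs one
// signed operand, so both rows are biased by -128, blended as
// (256 - f) * (a - 128) + f * (b - 128), and the 0x8080 word (the 128 * 256
// bias plus 128 for rounding) is added back before the >> 8.
#if 0
static void InterpolateRow_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  int f = source_y_fraction;  // 0..255; 0 and 128 take the fast paths above
  int i;
  for (i = 0; i < dst_width; ++i) {
    int a = src_ptr[i] - 128;
    int b = src_ptr[i + src_stride] - 128;
    dst_ptr[i] = (uint8_t)(((256 - f) * a + f * b + 0x8080) >> 8);
  }
}
#endif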

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_argb,
                                            const uint8_t* shuffler,
                                            int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    movdqu     xmm5, [ecx]
    mov        ecx, [esp + 16]  // width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop
    ret
  }
}

#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           const uint8_t* shuffler,
                                           int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
    mov        ecx, [esp + 16]  // width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5....

__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_y
    mov        esi, [esp + 8 + 8]  // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi]  // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2  // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_y
    mov        esi, [esp + 8 + 8]  // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi]  // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0  // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
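
// Illustrative scalar sketch of the 4:2:2 packing above (reference only,
// compiled out): each macro-pixel carries two Y samples and one shared U/V
// pair, ordered Y0 U Y1 V for YUY2; UYVY is the same with dst_frame[0..3]
// reordered to U Y0 V Y1.  Width is assumed even, as in the asm.
#if 0
static void I422ToYUY2Row_Sketch(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}
#endif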

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4] /* src_argb */
    mov        edx, [esp + 4 + 8] /* dst_argb */
    mov        esi, [esp + 4 + 12] /* poly */
    mov        ecx, [esp + 4 + 16] /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

        // 2 pixel loop.
 convertloop:
        //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
        //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 2
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_argb */
    mov        ecx, [esp + 12] /* poly */
    vbroadcastf128 ymm4, [ecx]  // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16] /* width */

    // 2 pixel loop.
 convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0  // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    sub         ecx, 2
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
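
// Illustrative scalar sketch of the polynomial above (reference only,
// compiled out): poly holds four coefficients per channel (C0..C3 as
// 4 floats each, 16 floats total) and each channel becomes
// C0 + C1*x + C2*x^2 + C3*x^3, truncated and saturated back to a byte.
#if 0
static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb, const float* poly,
                                     int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      float x = (float)src_argb[j];
      float r = poly[j] + poly[j + 4] * x + poly[j + 8] * x * x +
                poly[j + 12] * x * x * x;
      int v = (int)r;  // truncate, as cvttps2dq
      dst_argb[j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif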

#ifdef HAS_HALFFLOATROW_SSE2
static float kExpBias = 1.9259299444e-34f;
__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src */
    mov        edx, [esp + 8] /* dst */
    movd       xmm4, dword ptr [esp + 12] /* scale */
    mov        ecx, [esp + 16] /* width */
    mulss      xmm4, kExpBias
    pshufd     xmm4, xmm4, 0
    pxor       xmm5, xmm5
    sub        edx, eax

        // 8 pixel loop.
 convertloop:
    movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
    add         eax, 16
    movdqa      xmm3, xmm2
    punpcklwd   xmm2, xmm5
    cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
    punpckhwd   xmm3, xmm5
    cvtdq2ps    xmm3, xmm3
    mulps       xmm2, xmm4
    mulps       xmm3, xmm4
    psrld       xmm2, 13
    psrld       xmm3, 13
    packssdw    xmm2, xmm3
    movdqu      [eax + edx - 16], xmm2
    sub         ecx, 8
    jg          convertloop
    ret
  }
}
#endif  // HAS_HALFFLOATROW_SSE2
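
// Why kExpBias works, as a compiled-out scalar sketch: 1.9259299444e-34f is
// 2^-112, so the multiply drops the float's biased exponent by 112, after
// which bits >> 13 lines the result up with a half float's layout (1 sign,
// 5 exponent, 10 mantissa bits), truncating the extra mantissa.  Assumes
// non-negative inputs in half-float range, which uint16 values times a
// normalizing scale satisfy.
#if 0
static uint16_t ToHalf_Sketch(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // fold in the 2^-112 bias
  uint32_t bits = *(uint32_t*)&f;                    // type pun, as the asm
  return (uint16_t)(bits >> 13);
}
#endif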

#ifdef HAS_HALFFLOATROW_AVX2
__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src */
    mov        edx, [esp + 8] /* dst */
    movd       xmm4, dword ptr [esp + 12] /* scale */
    mov        ecx, [esp + 16] /* width */

    vmulss     xmm4, xmm4, kExpBias
    vbroadcastss ymm4, xmm4
    vpxor      ymm5, ymm5, ymm5
    sub        edx, eax

    // 16 pixel loop.
 convertloop:
    vmovdqu     ymm2, [eax]  // 16 shorts
    add         eax, 32
    vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
    vpunpcklwd  ymm2, ymm2, ymm5
    vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
    vcvtdq2ps   ymm2, ymm2
    vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
    vmulps      ymm2, ymm2, ymm4
    vpsrld      ymm3, ymm3, 13  // convert 8 floats to 8 half floats (truncate)
    vpsrld      ymm2, ymm2, 13
    vpackssdw   ymm2, ymm2, ymm3
    vmovdqu     [eax + edx - 32], ymm2
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src */
    mov        edx, [esp + 8] /* dst */
    vbroadcastss ymm4, [esp + 12] /* scale */
    mov        ecx, [esp + 16] /* width */
    sub        edx, eax

    // 16 pixel loop.
 convertloop:
    vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
    vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
    add         eax, 32
    vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
    vcvtdq2ps   ymm3, ymm3
    vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
    vmulps      ymm3, ymm3, ymm4
    vcvtps2ph   xmm2, ymm2, 3  // convert 8 floats to 8 half floats (truncate)
    vcvtps2ph   xmm3, ymm3, 3
    vmovdqu     [eax + edx - 32], xmm2  // eax already advanced 32 bytes
    vmovdqu     [eax + edx + 16 - 32], xmm3
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_HALFFLOATROW_F16C
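
// Intrinsics sketch of the F16C path (hypothetical, not one of the original
// kernels; assumes <immintrin.h> is available and width is a multiple of 8):
// _mm256_cvtps_ph with rounding immediate 3 (round toward zero) mirrors the
// vcvtps2ph xmm, ymm, 3 instruction used above.
static void HalfFloatRow_Sketch_F16C(const uint16_t* src,
                                     uint16_t* dst,
                                     float scale,
                                     int width) {
  __m256 vscale = _mm256_set1_ps(scale);
  int i;
  for (i = 0; i < width; i += 8) {
    __m256i wide = _mm256_cvtepu16_epi32(
        _mm_loadu_si128((const __m128i*)(src + i)));  // 8 shorts -> 8 ints
    __m256 f = _mm256_mul_ps(_mm256_cvtepi32_ps(wide), vscale);
    _mm_storeu_si128((__m128i*)(dst + i), _mm256_cvtps_ph(f, 3));
  }
}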

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
                                             const uint8_t* table_argb,
                                             int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4] /* dst_argb */
    mov        esi, [esp + 4 + 8] /* table_argb */
    mov        ecx, [esp + 4 + 12] /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
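
// Scalar sketch of the table lookup above (hypothetical, not one of the
// original kernels): each channel indexes its own interleaved column of the
// 256-entry ARGB table, alpha included; the transform is done in place.
static void ARGBColorTableRow_Sketch_C(uint8_t* dst_argb,
                                       const uint8_t* table_argb,
                                       int width) {
  int i;
  for (i = 0; i < width * 4; i += 4) {
    dst_argb[i + 0] = table_argb[dst_argb[i + 0] * 4 + 0];  // B
    dst_argb[i + 1] = table_argb[dst_argb[i + 1] * 4 + 1];  // G
    dst_argb[i + 2] = table_argb[dst_argb[i + 2] * 4 + 2];  // R
    dst_argb[i + 3] = table_argb[dst_argb[i + 3] * 4 + 3];  // A
  }
}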

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
                                            const uint8_t* table_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4] /* dst_argb */
    mov        esi, [esp + 4 + 8] /* table_argb */
    mov        ecx, [esp + 4 + 12] /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with luma table.
__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                                   uint8_t* dst_argb,
                                                   int width,
                                                   const uint8_t* luma,
                                                   uint32_t lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4] /* src_argb */
    mov        edi, [esp + 8 + 8] /* dst_argb */
    mov        ecx, [esp + 8 + 12] /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
  convertloop:
    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
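
// Scalar sketch of the luma-table transform above (hypothetical, not one of
// the original kernels; assumes little-endian lumacoeff byte order): the
// pmaddubsw/phaddw pair computes a weighted luma per pixel from the four
// lumacoeff bytes, pand with the 0xff00 mask rounds it down to a 256-byte
// page of the luma table, and the B, G and R values then index that page,
// while alpha is copied through unchanged.
static void ARGBLumaColorTableRow_Sketch_C(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           int width,
                                           const uint8_t* luma,
                                           uint32_t lumacoeff) {
  const uint8_t* w = (const uint8_t*)&lumacoeff;  // B, G, R, A weights
  int i;
  for (i = 0; i < width; ++i) {
    uint32_t l = src_argb[0] * w[0] + src_argb[1] * w[1] +
                 src_argb[2] * w[2] + src_argb[3] * w[3];
    const uint8_t* page = luma + (l & 0xff00);  // select 256-byte table page
    dst_argb[0] = page[src_argb[0]];
    dst_argb[1] = page[src_argb[1]];
    dst_argb[2] = page[src_argb[2]];
    dst_argb[3] = src_argb[3];  // copy alpha unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}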

#endif  // defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))