1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 // This module is for Visual C 32/64 bit and clangcl 32 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15     (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16 
17 #if defined(_M_X64)
18 #include <emmintrin.h>
19 #include <tmmintrin.h>  // For _mm_maddubs_epi16
20 #endif
21 
22 #ifdef __cplusplus
23 namespace libyuv {
24 extern "C" {
25 #endif
26 
27 // 64 bit
28 #if defined(_M_X64)
29 
30 // Read 4 UV from 422, upsample to 8 UV.
31 #define READYUV422                                      \
32   xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
33   xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
34   xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
35   xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
36   u_buf += 4;                                           \
37   xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
38   xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
39   y_buf += 8;
40 
41 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
42 #define READYUVA422                                     \
43   xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
44   xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
45   xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
46   xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
47   u_buf += 4;                                           \
48   xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
49   xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
50   y_buf += 8;                                           \
51   xmm5 = _mm_loadl_epi64((__m128i*)a_buf);              \
52   a_buf += 8;
53 
54 // Convert 8 pixels: 8 UV and 8 Y.
55 #define YUVTORGB(yuvconstants)                                     \
56   xmm1 = _mm_loadu_si128(&xmm0);                                   \
57   xmm2 = _mm_loadu_si128(&xmm0);                                   \
58   xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
59   xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
60   xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
61   xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
62   xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
63   xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
64   xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
65   xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
66   xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
67   xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
68   xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
69   xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
70   xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
71   xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
72   xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
73   xmm2 = _mm_packus_epi16(xmm2, xmm2);
74 
75 // Store 8 ARGB values.
76 #define STOREARGB                                    \
77   xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
78   xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
79   xmm1 = _mm_loadu_si128(&xmm0);                     \
80   xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
81   xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
82   _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
83   _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
84   dst_argb += 32;
85 
86 #if defined(HAS_I422TOARGBROW_SSSE3)
I422ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)87 void I422ToARGBRow_SSSE3(const uint8* y_buf,
88                          const uint8* u_buf,
89                          const uint8* v_buf,
90                          uint8* dst_argb,
91                          const struct YuvConstants* yuvconstants,
92                          int width) {
93   __m128i xmm0, xmm1, xmm2, xmm4;
94   const __m128i xmm5 = _mm_set1_epi8(-1);
95   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
96   while (width > 0) {
97     READYUV422
98     YUVTORGB(yuvconstants)
99     STOREARGB
100     width -= 8;
101   }
102 }
103 #endif
104 
105 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
I422AlphaToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,const uint8 * a_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)106 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
107                               const uint8* u_buf,
108                               const uint8* v_buf,
109                               const uint8* a_buf,
110                               uint8* dst_argb,
111                               const struct YuvConstants* yuvconstants,
112                               int width) {
113   __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
114   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
115   while (width > 0) {
116     READYUVA422
117     YUVTORGB(yuvconstants)
118     STOREARGB
119     width -= 8;
120   }
121 }
122 #endif
123 
124 // 32 bit
125 #else  // defined(_M_X64)
126 #ifdef HAS_ARGBTOYROW_SSSE3
127 
128 // Constants for ARGB.
129 static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
130                               13, 65, 33, 0, 13, 65, 33, 0};
131 
132 // JPeg full range.
133 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
134                                15, 75, 38, 0, 15, 75, 38, 0};
135 
136 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
137                               112, -74, -38, 0, 112, -74, -38, 0};
138 
139 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
140                                127, -84, -43, 0, 127, -84, -43, 0};
141 
142 static const vec8 kARGBToV = {
143     -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
144 };
145 
146 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
147                                -20, -107, 127, 0, -20, -107, 127, 0};
148 
149 // vpshufb for vphaddw + vpackuswb packed to shorts.
150 static const lvec8 kShufARGBToUV_AVX = {
151     0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
152     0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
153 
154 // Constants for BGRA.
155 static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
156                               0, 33, 65, 13, 0, 33, 65, 13};
157 
158 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
159                               0, -38, -74, 112, 0, -38, -74, 112};
160 
161 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
162                               0, 112, -94, -18, 0, 112, -94, -18};
163 
164 // Constants for ABGR.
165 static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
166                               33, 65, 13, 0, 33, 65, 13, 0};
167 
168 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
169                               -38, -74, 112, 0, -38, -74, 112, 0};
170 
171 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
172                               112, -94, -18, 0, 112, -94, -18, 0};
173 
174 // Constants for RGBA.
175 static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
176                               0, 13, 65, 33, 0, 13, 65, 33};
177 
178 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
179                               0, 112, -74, -38, 0, 112, -74, -38};
180 
181 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
182                               0, -18, -94, 112, 0, -18, -94, 112};
183 
184 static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
185                               16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
186 
187 // 7 bit fixed point 0.5.
188 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
189 
190 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
191                                 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
192 
193 static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
194                                   0x8080u, 0x8080u, 0x8080u, 0x8080u};
195 
196 // Shuffle table for converting RGB24 to ARGB.
197 static const uvec8 kShuffleMaskRGB24ToARGB = {
198     0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
199 
200 // Shuffle table for converting RAW to ARGB.
201 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
202                                             8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
203 
204 // Shuffle table for converting RAW to RGB24.  First 8.
205 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
206     2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
207     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
208 
209 // Shuffle table for converting RAW to RGB24.  Middle 8.
210 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
211     2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
212     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
213 
214 // Shuffle table for converting RAW to RGB24.  Last 8.
215 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
216     8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
217     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
218 
219 // Shuffle table for converting ARGB to RGB24.
220 static const uvec8 kShuffleMaskARGBToRGB24 = {
221     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
222 
223 // Shuffle table for converting ARGB to RAW.
224 static const uvec8 kShuffleMaskARGBToRAW = {
225     2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
226 
227 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
228 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
229     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
230 
231 // YUY2 shuf 16 Y to 32 Y.
232 static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
233                                     10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
234                                     6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
235 
236 // YUY2 shuf 8 UV to 16 UV.
237 static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
238                                      11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
239                                      5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
240 
241 // UYVY shuf 16 Y to 32 Y.
242 static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
243                                     11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
244                                     7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
245 
246 // UYVY shuf 8 UV to 16 UV.
247 static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
248                                      10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
249                                      4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
250 
251 // NV21 shuf 8 VU to 16 UV.
252 static const lvec8 kShuffleNV21 = {
253     1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
254     1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
255 };
256 
257 // Duplicates gray value 3 times and fills in alpha opaque.
258 __declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
259                                           uint8* dst_argb,
260                                           int width) {
261   __asm {
262     mov        eax, [esp + 4]  // src_y
263     mov        edx, [esp + 8]  // dst_argb
264     mov        ecx, [esp + 12]  // width
265     pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
266     pslld      xmm5, 24
267 
268   convertloop:
269     movq       xmm0, qword ptr [eax]
270     lea        eax,  [eax + 8]
271     punpcklbw  xmm0, xmm0
272     movdqa     xmm1, xmm0
273     punpcklwd  xmm0, xmm0
274     punpckhwd  xmm1, xmm1
275     por        xmm0, xmm5
276     por        xmm1, xmm5
277     movdqu     [edx], xmm0
278     movdqu     [edx + 16], xmm1
279     lea        edx, [edx + 32]
280     sub        ecx, 8
281     jg         convertloop
282     ret
283   }
284 }
285 
286 #ifdef HAS_J400TOARGBROW_AVX2
287 // Duplicates gray value 3 times and fills in alpha opaque.
288 __declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
289                                           uint8* dst_argb,
290                                           int width) {
291   __asm {
292     mov         eax, [esp + 4]  // src_y
293     mov         edx, [esp + 8]  // dst_argb
294     mov         ecx, [esp + 12]  // width
295     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
296     vpslld      ymm5, ymm5, 24
297 
298   convertloop:
299     vmovdqu     xmm0, [eax]
300     lea         eax,  [eax + 16]
301     vpermq      ymm0, ymm0, 0xd8
302     vpunpcklbw  ymm0, ymm0, ymm0
303     vpermq      ymm0, ymm0, 0xd8
304     vpunpckhwd  ymm1, ymm0, ymm0
305     vpunpcklwd  ymm0, ymm0, ymm0
306     vpor        ymm0, ymm0, ymm5
307     vpor        ymm1, ymm1, ymm5
308     vmovdqu     [edx], ymm0
309     vmovdqu     [edx + 32], ymm1
310     lea         edx, [edx + 64]
311     sub         ecx, 16
312     jg          convertloop
313     vzeroupper
314     ret
315   }
316 }
317 #endif  // HAS_J400TOARGBROW_AVX2
318 
319 __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
320                                             uint8* dst_argb,
321                                             int width) {
322   __asm {
323     mov       eax, [esp + 4]  // src_rgb24
324     mov       edx, [esp + 8]  // dst_argb
325     mov       ecx, [esp + 12]  // width
326     pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
327     pslld     xmm5, 24
328     movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
329 
330  convertloop:
331     movdqu    xmm0, [eax]
332     movdqu    xmm1, [eax + 16]
333     movdqu    xmm3, [eax + 32]
334     lea       eax, [eax + 48]
335     movdqa    xmm2, xmm3
336     palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
337     pshufb    xmm2, xmm4
338     por       xmm2, xmm5
339     palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
340     pshufb    xmm0, xmm4
341     movdqu    [edx + 32], xmm2
342     por       xmm0, xmm5
343     pshufb    xmm1, xmm4
344     movdqu    [edx], xmm0
345     por       xmm1, xmm5
346     palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
347     pshufb    xmm3, xmm4
348     movdqu    [edx + 16], xmm1
349     por       xmm3, xmm5
350     movdqu    [edx + 48], xmm3
351     lea       edx, [edx + 64]
352     sub       ecx, 16
353     jg        convertloop
354     ret
355   }
356 }
357 
358 __declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
359                                           uint8* dst_argb,
360                                           int width) {
361   __asm {
362     mov       eax, [esp + 4]  // src_raw
363     mov       edx, [esp + 8]  // dst_argb
364     mov       ecx, [esp + 12]  // width
365     pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
366     pslld     xmm5, 24
367     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
368 
369  convertloop:
370     movdqu    xmm0, [eax]
371     movdqu    xmm1, [eax + 16]
372     movdqu    xmm3, [eax + 32]
373     lea       eax, [eax + 48]
374     movdqa    xmm2, xmm3
375     palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
376     pshufb    xmm2, xmm4
377     por       xmm2, xmm5
378     palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
379     pshufb    xmm0, xmm4
380     movdqu    [edx + 32], xmm2
381     por       xmm0, xmm5
382     pshufb    xmm1, xmm4
383     movdqu    [edx], xmm0
384     por       xmm1, xmm5
385     palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
386     pshufb    xmm3, xmm4
387     movdqu    [edx + 16], xmm1
388     por       xmm3, xmm5
389     movdqu    [edx + 48], xmm3
390     lea       edx, [edx + 64]
391     sub       ecx, 16
392     jg        convertloop
393     ret
394   }
395 }
396 
397 __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
398                                            uint8* dst_rgb24,
399                                            int width) {
400   __asm {
401     mov       eax, [esp + 4]  // src_raw
402     mov       edx, [esp + 8]  // dst_rgb24
403     mov       ecx, [esp + 12]  // width
404     movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
405     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
406     movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
407 
408  convertloop:
409     movdqu    xmm0, [eax]
410     movdqu    xmm1, [eax + 4]
411     movdqu    xmm2, [eax + 8]
412     lea       eax, [eax + 24]
413     pshufb    xmm0, xmm3
414     pshufb    xmm1, xmm4
415     pshufb    xmm2, xmm5
416     movq      qword ptr [edx], xmm0
417     movq      qword ptr [edx + 8], xmm1
418     movq      qword ptr [edx + 16], xmm2
419     lea       edx, [edx + 24]
420     sub       ecx, 8
421     jg        convertloop
422     ret
423   }
424 }
425 
426 // pmul method to replicate bits.
427 // Math to replicate bits:
428 // (v << 8) | (v << 3)
429 // v * 256 + v * 8
430 // v * (256 + 8)
431 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
432 // 20 instructions.
433 __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
434                                             uint8* dst_argb,
435                                             int width) {
436   __asm {
437     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
438     movd      xmm5, eax
439     pshufd    xmm5, xmm5, 0
440     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
441     movd      xmm6, eax
442     pshufd    xmm6, xmm6, 0
443     pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
444     psllw     xmm3, 11
445     pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
446     psllw     xmm4, 10
447     psrlw     xmm4, 5
448     pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
449     psllw     xmm7, 8
450 
451     mov       eax, [esp + 4]  // src_rgb565
452     mov       edx, [esp + 8]  // dst_argb
453     mov       ecx, [esp + 12]  // width
454     sub       edx, eax
455     sub       edx, eax
456 
457  convertloop:
458     movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
459     movdqa    xmm1, xmm0
460     movdqa    xmm2, xmm0
461     pand      xmm1, xmm3  // R in upper 5 bits
462     psllw     xmm2, 11  // B in upper 5 bits
463     pmulhuw   xmm1, xmm5  // * (256 + 8)
464     pmulhuw   xmm2, xmm5  // * (256 + 8)
465     psllw     xmm1, 8
466     por       xmm1, xmm2  // RB
467     pand      xmm0, xmm4  // G in middle 6 bits
468     pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
469     por       xmm0, xmm7  // AG
470     movdqa    xmm2, xmm1
471     punpcklbw xmm1, xmm0
472     punpckhbw xmm2, xmm0
473     movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
474     movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
475     lea       eax, [eax + 16]
476     sub       ecx, 8
477     jg        convertloop
478     ret
479   }
480 }
481 
482 #ifdef HAS_RGB565TOARGBROW_AVX2
483 // pmul method to replicate bits.
484 // Math to replicate bits:
485 // (v << 8) | (v << 3)
486 // v * 256 + v * 8
487 // v * (256 + 8)
488 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
489 __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
490                                             uint8* dst_argb,
491                                             int width) {
492   __asm {
493     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
494     vmovd      xmm5, eax
495     vbroadcastss ymm5, xmm5
496     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
497     vmovd      xmm6, eax
498     vbroadcastss ymm6, xmm6
499     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
500     vpsllw     ymm3, ymm3, 11
501     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
502     vpsllw     ymm4, ymm4, 10
503     vpsrlw     ymm4, ymm4, 5
504     vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
505     vpsllw     ymm7, ymm7, 8
506 
507     mov        eax, [esp + 4]  // src_rgb565
508     mov        edx, [esp + 8]  // dst_argb
509     mov        ecx, [esp + 12]  // width
510     sub        edx, eax
511     sub        edx, eax
512 
513  convertloop:
514     vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
515     vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
516     vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
517     vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
518     vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
519     vpsllw     ymm1, ymm1, 8
520     vpor       ymm1, ymm1, ymm2  // RB
521     vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
522     vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
523     vpor       ymm0, ymm0, ymm7  // AG
524     vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
525     vpermq     ymm1, ymm1, 0xd8
526     vpunpckhbw ymm2, ymm1, ymm0
527     vpunpcklbw ymm1, ymm1, ymm0
528     vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB
529     vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB
530     lea       eax, [eax + 32]
531     sub       ecx, 16
532     jg        convertloop
533     vzeroupper
534     ret
535   }
536 }
537 #endif  // HAS_RGB565TOARGBROW_AVX2
538 
539 #ifdef HAS_ARGB1555TOARGBROW_AVX2
540 __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
541                                               uint8* dst_argb,
542                                               int width) {
543   __asm {
544     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
545     vmovd      xmm5, eax
546     vbroadcastss ymm5, xmm5
547     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
548     vmovd      xmm6, eax
549     vbroadcastss ymm6, xmm6
550     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
551     vpsllw     ymm3, ymm3, 11
552     vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
553     vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
554     vpsllw     ymm7, ymm7, 8
555 
556     mov        eax,  [esp + 4]  // src_argb1555
557     mov        edx,  [esp + 8]  // dst_argb
558     mov        ecx,  [esp + 12]  // width
559     sub        edx,  eax
560     sub        edx,  eax
561 
562  convertloop:
563     vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
564     vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
565     vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
566     vpand      ymm1, ymm1, ymm3
567     vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
568     vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
569     vpsllw     ymm1, ymm1, 8
570     vpor       ymm1, ymm1, ymm2  // RB
571     vpsraw     ymm2, ymm0, 8  // A
572     vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
573     vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
574     vpand      ymm2, ymm2, ymm7
575     vpor       ymm0, ymm0, ymm2  // AG
576     vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
577     vpermq     ymm1, ymm1, 0xd8
578     vpunpckhbw ymm2, ymm1, ymm0
579     vpunpcklbw ymm1, ymm1, ymm0
580     vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
581     vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
582     lea       eax, [eax + 32]
583     sub       ecx, 16
584     jg        convertloop
585     vzeroupper
586     ret
587   }
588 }
589 #endif  // HAS_ARGB1555TOARGBROW_AVX2
590 
591 #ifdef HAS_ARGB4444TOARGBROW_AVX2
592 __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
593                                               uint8* dst_argb,
594                                               int width) {
595   __asm {
596     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
597     vmovd     xmm4, eax
598     vbroadcastss ymm4, xmm4
599     vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
600     mov       eax,  [esp + 4]  // src_argb4444
601     mov       edx,  [esp + 8]  // dst_argb
602     mov       ecx,  [esp + 12]  // width
603     sub       edx,  eax
604     sub       edx,  eax
605 
606  convertloop:
607     vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
608     vpand      ymm2, ymm0, ymm5  // mask high nibbles
609     vpand      ymm0, ymm0, ymm4  // mask low nibbles
610     vpsrlw     ymm3, ymm2, 4
611     vpsllw     ymm1, ymm0, 4
612     vpor       ymm2, ymm2, ymm3
613     vpor       ymm0, ymm0, ymm1
614     vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
615     vpermq     ymm2, ymm2, 0xd8
616     vpunpckhbw ymm1, ymm0, ymm2
617     vpunpcklbw ymm0, ymm0, ymm2
618     vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
619     vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
620     lea       eax, [eax + 32]
621     sub       ecx, 16
622     jg        convertloop
623     vzeroupper
624     ret
625   }
626 }
627 #endif  // HAS_ARGB4444TOARGBROW_AVX2
628 
629 // 24 instructions
630 __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
631                                               uint8* dst_argb,
632                                               int width) {
633   __asm {
634     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
635     movd      xmm5, eax
636     pshufd    xmm5, xmm5, 0
637     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
638     movd      xmm6, eax
639     pshufd    xmm6, xmm6, 0
640     pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
641     psllw     xmm3, 11
642     movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
643     psrlw     xmm4, 6
644     pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
645     psllw     xmm7, 8
646 
647     mov       eax, [esp + 4]  // src_argb1555
648     mov       edx, [esp + 8]  // dst_argb
649     mov       ecx, [esp + 12]  // width
650     sub       edx, eax
651     sub       edx, eax
652 
653  convertloop:
654     movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
655     movdqa    xmm1, xmm0
656     movdqa    xmm2, xmm0
657     psllw     xmm1, 1  // R in upper 5 bits
658     psllw     xmm2, 11  // B in upper 5 bits
659     pand      xmm1, xmm3
660     pmulhuw   xmm2, xmm5  // * (256 + 8)
661     pmulhuw   xmm1, xmm5  // * (256 + 8)
662     psllw     xmm1, 8
663     por       xmm1, xmm2  // RB
664     movdqa    xmm2, xmm0
665     pand      xmm0, xmm4  // G in middle 5 bits
666     psraw     xmm2, 8  // A
667     pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
668     pand      xmm2, xmm7
669     por       xmm0, xmm2  // AG
670     movdqa    xmm2, xmm1
671     punpcklbw xmm1, xmm0
672     punpckhbw xmm2, xmm0
673     movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
674     movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
675     lea       eax, [eax + 16]
676     sub       ecx, 8
677     jg        convertloop
678     ret
679   }
680 }
681 
682 // 18 instructions.
683 __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
684                                               uint8* dst_argb,
685                                               int width) {
686   __asm {
687     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
688     movd      xmm4, eax
689     pshufd    xmm4, xmm4, 0
690     movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
691     pslld     xmm5, 4
692     mov       eax, [esp + 4]  // src_argb4444
693     mov       edx, [esp + 8]  // dst_argb
694     mov       ecx, [esp + 12]  // width
695     sub       edx, eax
696     sub       edx, eax
697 
698  convertloop:
699     movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
700     movdqa    xmm2, xmm0
701     pand      xmm0, xmm4  // mask low nibbles
702     pand      xmm2, xmm5  // mask high nibbles
703     movdqa    xmm1, xmm0
704     movdqa    xmm3, xmm2
705     psllw     xmm1, 4
706     psrlw     xmm3, 4
707     por       xmm0, xmm1
708     por       xmm2, xmm3
709     movdqa    xmm1, xmm0
710     punpcklbw xmm0, xmm2
711     punpckhbw xmm1, xmm2
712     movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
713     movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
714     lea       eax, [eax + 16]
715     sub       ecx, 8
716     jg        convertloop
717     ret
718   }
719 }
720 
721 __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
722                                             uint8* dst_rgb,
723                                             int width) {
724   __asm {
725     mov       eax, [esp + 4]  // src_argb
726     mov       edx, [esp + 8]  // dst_rgb
727     mov       ecx, [esp + 12]  // width
728     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
729 
730  convertloop:
731     movdqu    xmm0, [eax]  // fetch 16 pixels of argb
732     movdqu    xmm1, [eax + 16]
733     movdqu    xmm2, [eax + 32]
734     movdqu    xmm3, [eax + 48]
735     lea       eax, [eax + 64]
736     pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
737     pshufb    xmm1, xmm6
738     pshufb    xmm2, xmm6
739     pshufb    xmm3, xmm6
740     movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
741     psrldq    xmm1, 4  // 8 bytes from 1
742     pslldq    xmm4, 12  // 4 bytes from 1 for 0
743     movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
744     por       xmm0, xmm4  // 4 bytes from 1 for 0
745     pslldq    xmm5, 8  // 8 bytes from 2 for 1
746     movdqu    [edx], xmm0  // store 0
747     por       xmm1, xmm5  // 8 bytes from 2 for 1
748     psrldq    xmm2, 8  // 4 bytes from 2
749     pslldq    xmm3, 4  // 12 bytes from 3 for 2
750     por       xmm2, xmm3  // 12 bytes from 3 for 2
751     movdqu    [edx + 16], xmm1  // store 1
752     movdqu    [edx + 32], xmm2  // store 2
753     lea       edx, [edx + 48]
754     sub       ecx, 16
755     jg        convertloop
756     ret
757   }
758 }
759 
760 __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
761                                           uint8* dst_rgb,
762                                           int width) {
763   __asm {
764     mov       eax, [esp + 4]  // src_argb
765     mov       edx, [esp + 8]  // dst_rgb
766     mov       ecx, [esp + 12]  // width
767     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
768 
769  convertloop:
770     movdqu    xmm0, [eax]  // fetch 16 pixels of argb
771     movdqu    xmm1, [eax + 16]
772     movdqu    xmm2, [eax + 32]
773     movdqu    xmm3, [eax + 48]
774     lea       eax, [eax + 64]
775     pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
776     pshufb    xmm1, xmm6
777     pshufb    xmm2, xmm6
778     pshufb    xmm3, xmm6
779     movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
780     psrldq    xmm1, 4  // 8 bytes from 1
781     pslldq    xmm4, 12  // 4 bytes from 1 for 0
782     movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
783     por       xmm0, xmm4  // 4 bytes from 1 for 0
784     pslldq    xmm5, 8  // 8 bytes from 2 for 1
785     movdqu    [edx], xmm0  // store 0
786     por       xmm1, xmm5  // 8 bytes from 2 for 1
787     psrldq    xmm2, 8  // 4 bytes from 2
788     pslldq    xmm3, 4  // 12 bytes from 3 for 2
789     por       xmm2, xmm3  // 12 bytes from 3 for 2
790     movdqu    [edx + 16], xmm1  // store 1
791     movdqu    [edx + 32], xmm2  // store 2
792     lea       edx, [edx + 48]
793     sub       ecx, 16
794     jg        convertloop
795     ret
796   }
797 }
798 
799 __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
800                                             uint8* dst_rgb,
801                                             int width) {
802   __asm {
803     mov       eax, [esp + 4]  // src_argb
804     mov       edx, [esp + 8]  // dst_rgb
805     mov       ecx, [esp + 12]  // width
806     pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
807     psrld     xmm3, 27
808     pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
809     psrld     xmm4, 26
810     pslld     xmm4, 5
811     pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
812     pslld     xmm5, 11
813 
814  convertloop:
815     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
816     movdqa    xmm1, xmm0  // B
817     movdqa    xmm2, xmm0  // G
818     pslld     xmm0, 8  // R
819     psrld     xmm1, 3  // B
820     psrld     xmm2, 5  // G
821     psrad     xmm0, 16  // R
822     pand      xmm1, xmm3  // B
823     pand      xmm2, xmm4  // G
824     pand      xmm0, xmm5  // R
825     por       xmm1, xmm2  // BG
826     por       xmm0, xmm1  // BGR
827     packssdw  xmm0, xmm0
828     lea       eax, [eax + 16]
829     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
830     lea       edx, [edx + 8]
831     sub       ecx, 4
832     jg        convertloop
833     ret
834   }
835 }
836 
837 __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
838                                                   uint8* dst_rgb,
839                                                   const uint32 dither4,
840                                                   int width) {
841   __asm {
842 
843     mov       eax, [esp + 4]  // src_argb
844     mov       edx, [esp + 8]  // dst_rgb
845     movd      xmm6, [esp + 12]  // dither4
846     mov       ecx, [esp + 16]  // width
847     punpcklbw xmm6, xmm6  // make dither 16 bytes
848     movdqa    xmm7, xmm6
849     punpcklwd xmm6, xmm6
850     punpckhwd xmm7, xmm7
851     pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
852     psrld     xmm3, 27
853     pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
854     psrld     xmm4, 26
855     pslld     xmm4, 5
856     pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
857     pslld     xmm5, 11
858 
859  convertloop:
860     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
861     paddusb   xmm0, xmm6  // add dither
862     movdqa    xmm1, xmm0  // B
863     movdqa    xmm2, xmm0  // G
864     pslld     xmm0, 8  // R
865     psrld     xmm1, 3  // B
866     psrld     xmm2, 5  // G
867     psrad     xmm0, 16  // R
868     pand      xmm1, xmm3  // B
869     pand      xmm2, xmm4  // G
870     pand      xmm0, xmm5  // R
871     por       xmm1, xmm2  // BG
872     por       xmm0, xmm1  // BGR
873     packssdw  xmm0, xmm0
874     lea       eax, [eax + 16]
875     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
876     lea       edx, [edx + 8]
877     sub       ecx, 4
878     jg        convertloop
879     ret
880   }
881 }
882 
883 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
884 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
885                                                   uint8* dst_rgb,
886                                                   const uint32 dither4,
887                                                   int width) {
888   __asm {
889     mov        eax, [esp + 4]  // src_argb
890     mov        edx, [esp + 8]  // dst_rgb
891     vbroadcastss xmm6, [esp + 12]  // dither4
892     mov        ecx, [esp + 16]  // width
893     vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
894     vpermq     ymm6, ymm6, 0xd8
895     vpunpcklwd ymm6, ymm6, ymm6
896     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
897     vpsrld     ymm3, ymm3, 27
898     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
899     vpsrld     ymm4, ymm4, 26
900     vpslld     ymm4, ymm4, 5
901     vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
902 
903  convertloop:
904     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
905     vpaddusb   ymm0, ymm0, ymm6  // add dither
906     vpsrld     ymm2, ymm0, 5  // G
907     vpsrld     ymm1, ymm0, 3  // B
908     vpsrld     ymm0, ymm0, 8  // R
909     vpand      ymm2, ymm2, ymm4  // G
910     vpand      ymm1, ymm1, ymm3  // B
911     vpand      ymm0, ymm0, ymm5  // R
912     vpor       ymm1, ymm1, ymm2  // BG
913     vpor       ymm0, ymm0, ymm1  // BGR
914     vpackusdw  ymm0, ymm0, ymm0
915     vpermq     ymm0, ymm0, 0xd8
916     lea        eax, [eax + 32]
917     vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
918     lea        edx, [edx + 16]
919     sub        ecx, 8
920     jg         convertloop
921     vzeroupper
922     ret
923   }
924 }
925 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
926 
927 // TODO(fbarchard): Improve sign extension/packing.
928 __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
929                                               uint8* dst_rgb,
930                                               int width) {
931   __asm {
932     mov       eax, [esp + 4]  // src_argb
933     mov       edx, [esp + 8]  // dst_rgb
934     mov       ecx, [esp + 12]  // width
935     pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
936     psrld     xmm4, 27
937     movdqa    xmm5, xmm4  // generate mask 0x000003e0
938     pslld     xmm5, 5
939     movdqa    xmm6, xmm4  // generate mask 0x00007c00
940     pslld     xmm6, 10
941     pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
942     pslld     xmm7, 15
943 
944  convertloop:
945     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
946     movdqa    xmm1, xmm0  // B
947     movdqa    xmm2, xmm0  // G
948     movdqa    xmm3, xmm0  // R
949     psrad     xmm0, 16  // A
950     psrld     xmm1, 3  // B
951     psrld     xmm2, 6  // G
952     psrld     xmm3, 9  // R
953     pand      xmm0, xmm7  // A
954     pand      xmm1, xmm4  // B
955     pand      xmm2, xmm5  // G
956     pand      xmm3, xmm6  // R
957     por       xmm0, xmm1  // BA
958     por       xmm2, xmm3  // GR
959     por       xmm0, xmm2  // BGRA
960     packssdw  xmm0, xmm0
961     lea       eax, [eax + 16]
962     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
963     lea       edx, [edx + 8]
964     sub       ecx, 4
965     jg        convertloop
966     ret
967   }
968 }
969 
970 __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
971                                               uint8* dst_rgb,
972                                               int width) {
973   __asm {
974     mov       eax, [esp + 4]  // src_argb
975     mov       edx, [esp + 8]  // dst_rgb
976     mov       ecx, [esp + 12]  // width
977     pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
978     psllw     xmm4, 12
979     movdqa    xmm3, xmm4  // generate mask 0x00f000f0
980     psrlw     xmm3, 8
981 
982  convertloop:
983     movdqu    xmm0, [eax]  // fetch 4 pixels of argb
984     movdqa    xmm1, xmm0
985     pand      xmm0, xmm3  // low nibble
986     pand      xmm1, xmm4  // high nibble
987     psrld     xmm0, 4
988     psrld     xmm1, 8
989     por       xmm0, xmm1
990     packuswb  xmm0, xmm0
991     lea       eax, [eax + 16]
992     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
993     lea       edx, [edx + 8]
994     sub       ecx, 4
995     jg        convertloop
996     ret
997   }
998 }
999 
1000 #ifdef HAS_ARGBTORGB565ROW_AVX2
1001 __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
1002                                             uint8* dst_rgb,
1003                                             int width) {
1004   __asm {
1005     mov        eax, [esp + 4]  // src_argb
1006     mov        edx, [esp + 8]  // dst_rgb
1007     mov        ecx, [esp + 12]  // width
1008     vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
1009     vpsrld     ymm3, ymm3, 27
1010     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
1011     vpsrld     ymm4, ymm4, 26
1012     vpslld     ymm4, ymm4, 5
1013     vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
1014 
1015  convertloop:
1016     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
1017     vpsrld     ymm2, ymm0, 5  // G
1018     vpsrld     ymm1, ymm0, 3  // B
1019     vpsrld     ymm0, ymm0, 8  // R
1020     vpand      ymm2, ymm2, ymm4  // G
1021     vpand      ymm1, ymm1, ymm3  // B
1022     vpand      ymm0, ymm0, ymm5  // R
1023     vpor       ymm1, ymm1, ymm2  // BG
1024     vpor       ymm0, ymm0, ymm1  // BGR
1025     vpackusdw  ymm0, ymm0, ymm0
1026     vpermq     ymm0, ymm0, 0xd8
1027     lea        eax, [eax + 32]
1028     vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
1029     lea        edx, [edx + 16]
1030     sub        ecx, 8
1031     jg         convertloop
1032     vzeroupper
1033     ret
1034   }
1035 }
1036 #endif  // HAS_ARGBTORGB565ROW_AVX2
1037 
1038 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
1039 __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
1040                                               uint8* dst_rgb,
1041                                               int width) {
1042   __asm {
1043     mov        eax, [esp + 4]  // src_argb
1044     mov        edx, [esp + 8]  // dst_rgb
1045     mov        ecx, [esp + 12]  // width
1046     vpcmpeqb   ymm4, ymm4, ymm4
1047     vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
1048     vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
1049     vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
1050     vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
1051     vpslld     ymm7, ymm7, 15
1052 
1053  convertloop:
1054     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
1055     vpsrld     ymm3, ymm0, 9  // R
1056     vpsrld     ymm2, ymm0, 6  // G
1057     vpsrld     ymm1, ymm0, 3  // B
1058     vpsrad     ymm0, ymm0, 16  // A
1059     vpand      ymm3, ymm3, ymm6  // R
1060     vpand      ymm2, ymm2, ymm5  // G
1061     vpand      ymm1, ymm1, ymm4  // B
1062     vpand      ymm0, ymm0, ymm7  // A
1063     vpor       ymm0, ymm0, ymm1  // BA
1064     vpor       ymm2, ymm2, ymm3  // GR
1065     vpor       ymm0, ymm0, ymm2  // BGRA
1066     vpackssdw  ymm0, ymm0, ymm0
1067     vpermq     ymm0, ymm0, 0xd8
1068     lea        eax, [eax + 32]
1069     vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
1070     lea        edx, [edx + 16]
1071     sub        ecx, 8
1072     jg         convertloop
1073     vzeroupper
1074     ret
1075   }
1076 }
1077 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
1078 
1079 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
1080 __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
1081                                               uint8* dst_rgb,
1082                                               int width) {
1083   __asm {
1084     mov        eax, [esp + 4]  // src_argb
1085     mov        edx, [esp + 8]  // dst_rgb
1086     mov        ecx, [esp + 12]  // width
1087     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
1088     vpsllw     ymm4, ymm4, 12
1089     vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0
1090 
1091  convertloop:
1092     vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
1093     vpand      ymm1, ymm0, ymm4  // high nibble
1094     vpand      ymm0, ymm0, ymm3  // low nibble
1095     vpsrld     ymm1, ymm1, 8
1096     vpsrld     ymm0, ymm0, 4
1097     vpor       ymm0, ymm0, ymm1
1098     vpackuswb  ymm0, ymm0, ymm0
1099     vpermq     ymm0, ymm0, 0xd8
1100     lea        eax, [eax + 32]
1101     vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
1102     lea        edx, [edx + 16]
1103     sub        ecx, 8
1104     jg         convertloop
1105     vzeroupper
1106     ret
1107   }
1108 }
1109 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
1110 
1111 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1112 __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
1113                                         uint8* dst_y,
1114                                         int width) {
1115   __asm {
1116     mov        eax, [esp + 4] /* src_argb */
1117     mov        edx, [esp + 8] /* dst_y */
1118     mov        ecx, [esp + 12] /* width */
1119     movdqa     xmm4, xmmword ptr kARGBToY
1120     movdqa     xmm5, xmmword ptr kAddY16
1121 
1122  convertloop:
1123     movdqu     xmm0, [eax]
1124     movdqu     xmm1, [eax + 16]
1125     movdqu     xmm2, [eax + 32]
1126     movdqu     xmm3, [eax + 48]
1127     pmaddubsw  xmm0, xmm4
1128     pmaddubsw  xmm1, xmm4
1129     pmaddubsw  xmm2, xmm4
1130     pmaddubsw  xmm3, xmm4
1131     lea        eax, [eax + 64]
1132     phaddw     xmm0, xmm1
1133     phaddw     xmm2, xmm3
1134     psrlw      xmm0, 7
1135     psrlw      xmm2, 7
1136     packuswb   xmm0, xmm2
1137     paddb      xmm0, xmm5
1138     movdqu     [edx], xmm0
1139     lea        edx, [edx + 16]
1140     sub        ecx, 16
1141     jg         convertloop
1142     ret
1143   }
1144 }
1145 
1146 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1147 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1148 __declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
1149                                          uint8* dst_y,
1150                                          int width) {
1151   __asm {
1152     mov        eax, [esp + 4] /* src_argb */
1153     mov        edx, [esp + 8] /* dst_y */
1154     mov        ecx, [esp + 12] /* width */
1155     movdqa     xmm4, xmmword ptr kARGBToYJ
1156     movdqa     xmm5, xmmword ptr kAddYJ64
1157 
1158  convertloop:
1159     movdqu     xmm0, [eax]
1160     movdqu     xmm1, [eax + 16]
1161     movdqu     xmm2, [eax + 32]
1162     movdqu     xmm3, [eax + 48]
1163     pmaddubsw  xmm0, xmm4
1164     pmaddubsw  xmm1, xmm4
1165     pmaddubsw  xmm2, xmm4
1166     pmaddubsw  xmm3, xmm4
1167     lea        eax, [eax + 64]
1168     phaddw     xmm0, xmm1
1169     phaddw     xmm2, xmm3
1170     paddw      xmm0, xmm5  // Add .5 for rounding.
1171     paddw      xmm2, xmm5
1172     psrlw      xmm0, 7
1173     psrlw      xmm2, 7
1174     packuswb   xmm0, xmm2
1175     movdqu     [edx], xmm0
1176     lea        edx, [edx + 16]
1177     sub        ecx, 16
1178     jg         convertloop
1179     ret
1180   }
1181 }
1182 
1183 #ifdef HAS_ARGBTOYROW_AVX2
1184 // vpermd for vphaddw + vpackuswb vpermd.
1185 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1186 
1187 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1188 __declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
1189                                        uint8* dst_y,
1190                                        int width) {
1191   __asm {
1192     mov        eax, [esp + 4] /* src_argb */
1193     mov        edx, [esp + 8] /* dst_y */
1194     mov        ecx, [esp + 12] /* width */
1195     vbroadcastf128 ymm4, xmmword ptr kARGBToY
1196     vbroadcastf128 ymm5, xmmword ptr kAddY16
1197     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1198 
1199  convertloop:
1200     vmovdqu    ymm0, [eax]
1201     vmovdqu    ymm1, [eax + 32]
1202     vmovdqu    ymm2, [eax + 64]
1203     vmovdqu    ymm3, [eax + 96]
1204     vpmaddubsw ymm0, ymm0, ymm4
1205     vpmaddubsw ymm1, ymm1, ymm4
1206     vpmaddubsw ymm2, ymm2, ymm4
1207     vpmaddubsw ymm3, ymm3, ymm4
1208     lea        eax, [eax + 128]
1209     vphaddw    ymm0, ymm0, ymm1  // mutates.
1210     vphaddw    ymm2, ymm2, ymm3
1211     vpsrlw     ymm0, ymm0, 7
1212     vpsrlw     ymm2, ymm2, 7
1213     vpackuswb  ymm0, ymm0, ymm2  // mutates.
1214     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1215     vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
1216     vmovdqu    [edx], ymm0
1217     lea        edx, [edx + 32]
1218     sub        ecx, 32
1219     jg         convertloop
1220     vzeroupper
1221     ret
1222   }
1223 }
1224 #endif  //  HAS_ARGBTOYROW_AVX2
1225 
1226 #ifdef HAS_ARGBTOYJROW_AVX2
1227 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1228 __declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
1229                                         uint8* dst_y,
1230                                         int width) {
1231   __asm {
1232     mov        eax, [esp + 4] /* src_argb */
1233     mov        edx, [esp + 8] /* dst_y */
1234     mov        ecx, [esp + 12] /* width */
1235     vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1236     vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1237     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1238 
1239  convertloop:
1240     vmovdqu    ymm0, [eax]
1241     vmovdqu    ymm1, [eax + 32]
1242     vmovdqu    ymm2, [eax + 64]
1243     vmovdqu    ymm3, [eax + 96]
1244     vpmaddubsw ymm0, ymm0, ymm4
1245     vpmaddubsw ymm1, ymm1, ymm4
1246     vpmaddubsw ymm2, ymm2, ymm4
1247     vpmaddubsw ymm3, ymm3, ymm4
1248     lea        eax, [eax + 128]
1249     vphaddw    ymm0, ymm0, ymm1  // mutates.
1250     vphaddw    ymm2, ymm2, ymm3
1251     vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
1252     vpaddw     ymm2, ymm2, ymm5
1253     vpsrlw     ymm0, ymm0, 7
1254     vpsrlw     ymm2, ymm2, 7
1255     vpackuswb  ymm0, ymm0, ymm2  // mutates.
1256     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1257     vmovdqu    [edx], ymm0
1258     lea        edx, [edx + 32]
1259     sub        ecx, 32
1260     jg         convertloop
1261 
1262     vzeroupper
1263     ret
1264   }
1265 }
1266 #endif  //  HAS_ARGBTOYJROW_AVX2
1267 
1268 __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
1269                                         uint8* dst_y,
1270                                         int width) {
1271   __asm {
1272     mov        eax, [esp + 4] /* src_argb */
1273     mov        edx, [esp + 8] /* dst_y */
1274     mov        ecx, [esp + 12] /* width */
1275     movdqa     xmm4, xmmword ptr kBGRAToY
1276     movdqa     xmm5, xmmword ptr kAddY16
1277 
1278  convertloop:
1279     movdqu     xmm0, [eax]
1280     movdqu     xmm1, [eax + 16]
1281     movdqu     xmm2, [eax + 32]
1282     movdqu     xmm3, [eax + 48]
1283     pmaddubsw  xmm0, xmm4
1284     pmaddubsw  xmm1, xmm4
1285     pmaddubsw  xmm2, xmm4
1286     pmaddubsw  xmm3, xmm4
1287     lea        eax, [eax + 64]
1288     phaddw     xmm0, xmm1
1289     phaddw     xmm2, xmm3
1290     psrlw      xmm0, 7
1291     psrlw      xmm2, 7
1292     packuswb   xmm0, xmm2
1293     paddb      xmm0, xmm5
1294     movdqu     [edx], xmm0
1295     lea        edx, [edx + 16]
1296     sub        ecx, 16
1297     jg         convertloop
1298     ret
1299   }
1300 }
1301 
1302 __declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
1303                                         uint8* dst_y,
1304                                         int width) {
1305   __asm {
1306     mov        eax, [esp + 4] /* src_argb */
1307     mov        edx, [esp + 8] /* dst_y */
1308     mov        ecx, [esp + 12] /* width */
1309     movdqa     xmm4, xmmword ptr kABGRToY
1310     movdqa     xmm5, xmmword ptr kAddY16
1311 
1312  convertloop:
1313     movdqu     xmm0, [eax]
1314     movdqu     xmm1, [eax + 16]
1315     movdqu     xmm2, [eax + 32]
1316     movdqu     xmm3, [eax + 48]
1317     pmaddubsw  xmm0, xmm4
1318     pmaddubsw  xmm1, xmm4
1319     pmaddubsw  xmm2, xmm4
1320     pmaddubsw  xmm3, xmm4
1321     lea        eax, [eax + 64]
1322     phaddw     xmm0, xmm1
1323     phaddw     xmm2, xmm3
1324     psrlw      xmm0, 7
1325     psrlw      xmm2, 7
1326     packuswb   xmm0, xmm2
1327     paddb      xmm0, xmm5
1328     movdqu     [edx], xmm0
1329     lea        edx, [edx + 16]
1330     sub        ecx, 16
1331     jg         convertloop
1332     ret
1333   }
1334 }
1335 
1336 __declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
1337                                         uint8* dst_y,
1338                                         int width) {
1339   __asm {
1340     mov        eax, [esp + 4] /* src_argb */
1341     mov        edx, [esp + 8] /* dst_y */
1342     mov        ecx, [esp + 12] /* width */
1343     movdqa     xmm4, xmmword ptr kRGBAToY
1344     movdqa     xmm5, xmmword ptr kAddY16
1345 
1346  convertloop:
1347     movdqu     xmm0, [eax]
1348     movdqu     xmm1, [eax + 16]
1349     movdqu     xmm2, [eax + 32]
1350     movdqu     xmm3, [eax + 48]
1351     pmaddubsw  xmm0, xmm4
1352     pmaddubsw  xmm1, xmm4
1353     pmaddubsw  xmm2, xmm4
1354     pmaddubsw  xmm3, xmm4
1355     lea        eax, [eax + 64]
1356     phaddw     xmm0, xmm1
1357     phaddw     xmm2, xmm3
1358     psrlw      xmm0, 7
1359     psrlw      xmm2, 7
1360     packuswb   xmm0, xmm2
1361     paddb      xmm0, xmm5
1362     movdqu     [edx], xmm0
1363     lea        edx, [edx + 16]
1364     sub        ecx, 16
1365     jg         convertloop
1366     ret
1367   }
1368 }
1369 
1370 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
1371                                          int src_stride_argb,
1372                                          uint8* dst_u,
1373                                          uint8* dst_v,
1374                                          int width) {
1375   __asm {
1376     push       esi
1377     push       edi
1378     mov        eax, [esp + 8 + 4]  // src_argb
1379     mov        esi, [esp + 8 + 8]  // src_stride_argb
1380     mov        edx, [esp + 8 + 12]  // dst_u
1381     mov        edi, [esp + 8 + 16]  // dst_v
1382     mov        ecx, [esp + 8 + 20]  // width
1383     movdqa     xmm5, xmmword ptr kAddUV128
1384     movdqa     xmm6, xmmword ptr kARGBToV
1385     movdqa     xmm7, xmmword ptr kARGBToU
1386     sub        edi, edx  // stride from u to v
1387 
1388  convertloop:
1389          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1390     movdqu     xmm0, [eax]
1391     movdqu     xmm4, [eax + esi]
1392     pavgb      xmm0, xmm4
1393     movdqu     xmm1, [eax + 16]
1394     movdqu     xmm4, [eax + esi + 16]
1395     pavgb      xmm1, xmm4
1396     movdqu     xmm2, [eax + 32]
1397     movdqu     xmm4, [eax + esi + 32]
1398     pavgb      xmm2, xmm4
1399     movdqu     xmm3, [eax + 48]
1400     movdqu     xmm4, [eax + esi + 48]
1401     pavgb      xmm3, xmm4
1402 
1403     lea        eax,  [eax + 64]
1404     movdqa     xmm4, xmm0
1405     shufps     xmm0, xmm1, 0x88
1406     shufps     xmm4, xmm1, 0xdd
1407     pavgb      xmm0, xmm4
1408     movdqa     xmm4, xmm2
1409     shufps     xmm2, xmm3, 0x88
1410     shufps     xmm4, xmm3, 0xdd
1411     pavgb      xmm2, xmm4
1412 
1413     // step 2 - convert to U and V
1414     // from here down is very similar to Y code except
1415     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1416     movdqa     xmm1, xmm0
1417     movdqa     xmm3, xmm2
1418     pmaddubsw  xmm0, xmm7  // U
1419     pmaddubsw  xmm2, xmm7
1420     pmaddubsw  xmm1, xmm6  // V
1421     pmaddubsw  xmm3, xmm6
1422     phaddw     xmm0, xmm2
1423     phaddw     xmm1, xmm3
1424     psraw      xmm0, 8
1425     psraw      xmm1, 8
1426     packsswb   xmm0, xmm1
1427     paddb      xmm0, xmm5  // -> unsigned
1428 
1429     // step 3 - store 8 U and 8 V values
1430     movlps     qword ptr [edx], xmm0  // U
1431     movhps     qword ptr [edx + edi], xmm0  // V
1432     lea        edx, [edx + 8]
1433     sub        ecx, 16
1434     jg         convertloop
1435 
1436     pop        edi
1437     pop        esi
1438     ret
1439   }
1440 }
1441 
1442 __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
1443                                           int src_stride_argb,
1444                                           uint8* dst_u,
1445                                           uint8* dst_v,
1446                                           int width) {
1447   __asm {
1448     push       esi
1449     push       edi
1450     mov        eax, [esp + 8 + 4]  // src_argb
1451     mov        esi, [esp + 8 + 8]  // src_stride_argb
1452     mov        edx, [esp + 8 + 12]  // dst_u
1453     mov        edi, [esp + 8 + 16]  // dst_v
1454     mov        ecx, [esp + 8 + 20]  // width
1455     movdqa     xmm5, xmmword ptr kAddUVJ128
1456     movdqa     xmm6, xmmword ptr kARGBToVJ
1457     movdqa     xmm7, xmmword ptr kARGBToUJ
1458     sub        edi, edx  // stride from u to v
1459 
1460  convertloop:
1461          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1462     movdqu     xmm0, [eax]
1463     movdqu     xmm4, [eax + esi]
1464     pavgb      xmm0, xmm4
1465     movdqu     xmm1, [eax + 16]
1466     movdqu     xmm4, [eax + esi + 16]
1467     pavgb      xmm1, xmm4
1468     movdqu     xmm2, [eax + 32]
1469     movdqu     xmm4, [eax + esi + 32]
1470     pavgb      xmm2, xmm4
1471     movdqu     xmm3, [eax + 48]
1472     movdqu     xmm4, [eax + esi + 48]
1473     pavgb      xmm3, xmm4
1474 
1475     lea        eax,  [eax + 64]
1476     movdqa     xmm4, xmm0
1477     shufps     xmm0, xmm1, 0x88
1478     shufps     xmm4, xmm1, 0xdd
1479     pavgb      xmm0, xmm4
1480     movdqa     xmm4, xmm2
1481     shufps     xmm2, xmm3, 0x88
1482     shufps     xmm4, xmm3, 0xdd
1483     pavgb      xmm2, xmm4
1484 
1485     // step 2 - convert to U and V
1486     // from here down is very similar to Y code except
1487     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1488     movdqa     xmm1, xmm0
1489     movdqa     xmm3, xmm2
1490     pmaddubsw  xmm0, xmm7  // U
1491     pmaddubsw  xmm2, xmm7
1492     pmaddubsw  xmm1, xmm6  // V
1493     pmaddubsw  xmm3, xmm6
1494     phaddw     xmm0, xmm2
1495     phaddw     xmm1, xmm3
1496     paddw      xmm0, xmm5  // +.5 rounding -> unsigned
1497     paddw      xmm1, xmm5
1498     psraw      xmm0, 8
1499     psraw      xmm1, 8
1500     packsswb   xmm0, xmm1
1501 
1502     // step 3 - store 8 U and 8 V values
1503     movlps     qword ptr [edx], xmm0  // U
1504     movhps     qword ptr [edx + edi], xmm0  // V
1505     lea        edx, [edx + 8]
1506     sub        ecx, 16
1507     jg         convertloop
1508 
1509     pop        edi
1510     pop        esi
1511     ret
1512   }
1513 }
1514 
1515 #ifdef HAS_ARGBTOUVROW_AVX2
1516 __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
1517                                         int src_stride_argb,
1518                                         uint8* dst_u,
1519                                         uint8* dst_v,
1520                                         int width) {
1521   __asm {
1522     push       esi
1523     push       edi
1524     mov        eax, [esp + 8 + 4]  // src_argb
1525     mov        esi, [esp + 8 + 8]  // src_stride_argb
1526     mov        edx, [esp + 8 + 12]  // dst_u
1527     mov        edi, [esp + 8 + 16]  // dst_v
1528     mov        ecx, [esp + 8 + 20]  // width
1529     vbroadcastf128 ymm5, xmmword ptr kAddUV128
1530     vbroadcastf128 ymm6, xmmword ptr kARGBToV
1531     vbroadcastf128 ymm7, xmmword ptr kARGBToU
1532     sub        edi, edx   // stride from u to v
1533 
1534  convertloop:
1535         /* step 1 - subsample 32x2 argb pixels to 16x1 */
1536     vmovdqu    ymm0, [eax]
1537     vmovdqu    ymm1, [eax + 32]
1538     vmovdqu    ymm2, [eax + 64]
1539     vmovdqu    ymm3, [eax + 96]
1540     vpavgb     ymm0, ymm0, [eax + esi]
1541     vpavgb     ymm1, ymm1, [eax + esi + 32]
1542     vpavgb     ymm2, ymm2, [eax + esi + 64]
1543     vpavgb     ymm3, ymm3, [eax + esi + 96]
1544     lea        eax,  [eax + 128]
1545     vshufps    ymm4, ymm0, ymm1, 0x88
1546     vshufps    ymm0, ymm0, ymm1, 0xdd
1547     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1548     vshufps    ymm4, ymm2, ymm3, 0x88
1549     vshufps    ymm2, ymm2, ymm3, 0xdd
1550     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1551 
1552     // step 2 - convert to U and V
1553     // from here down is very similar to Y code except
1554     // instead of 32 different pixels, its 16 pixels of U and 16 of V
1555     vpmaddubsw ymm1, ymm0, ymm7  // U
1556     vpmaddubsw ymm3, ymm2, ymm7
1557     vpmaddubsw ymm0, ymm0, ymm6  // V
1558     vpmaddubsw ymm2, ymm2, ymm6
1559     vphaddw    ymm1, ymm1, ymm3  // mutates
1560     vphaddw    ymm0, ymm0, ymm2
1561     vpsraw     ymm1, ymm1, 8
1562     vpsraw     ymm0, ymm0, 8
1563     vpacksswb  ymm0, ymm1, ymm0  // mutates
1564     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1565     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1566     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1567 
1568     // step 3 - store 16 U and 16 V values
1569     vextractf128 [edx], ymm0, 0  // U
1570     vextractf128 [edx + edi], ymm0, 1  // V
1571     lea        edx, [edx + 16]
1572     sub        ecx, 32
1573     jg         convertloop
1574 
1575     pop        edi
1576     pop        esi
1577     vzeroupper
1578     ret
1579   }
1580 }
1581 #endif  // HAS_ARGBTOUVROW_AVX2
1582 
1583 #ifdef HAS_ARGBTOUVJROW_AVX2
1584 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
1585                                          int src_stride_argb,
1586                                          uint8* dst_u,
1587                                          uint8* dst_v,
1588                                          int width) {
1589   __asm {
1590     push       esi
1591     push       edi
1592     mov        eax, [esp + 8 + 4]  // src_argb
1593     mov        esi, [esp + 8 + 8]  // src_stride_argb
1594     mov        edx, [esp + 8 + 12]  // dst_u
1595     mov        edi, [esp + 8 + 16]  // dst_v
1596     mov        ecx, [esp + 8 + 20]  // width
1597     vbroadcastf128 ymm5, xmmword ptr kAddUV128
1598     vbroadcastf128 ymm6, xmmword ptr kARGBToV
1599     vbroadcastf128 ymm7, xmmword ptr kARGBToU
1600     sub        edi, edx   // stride from u to v
1601 
1602  convertloop:
1603         /* step 1 - subsample 32x2 argb pixels to 16x1 */
1604     vmovdqu    ymm0, [eax]
1605     vmovdqu    ymm1, [eax + 32]
1606     vmovdqu    ymm2, [eax + 64]
1607     vmovdqu    ymm3, [eax + 96]
1608     vpavgb     ymm0, ymm0, [eax + esi]
1609     vpavgb     ymm1, ymm1, [eax + esi + 32]
1610     vpavgb     ymm2, ymm2, [eax + esi + 64]
1611     vpavgb     ymm3, ymm3, [eax + esi + 96]
1612     lea        eax,  [eax + 128]
1613     vshufps    ymm4, ymm0, ymm1, 0x88
1614     vshufps    ymm0, ymm0, ymm1, 0xdd
1615     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1616     vshufps    ymm4, ymm2, ymm3, 0x88
1617     vshufps    ymm2, ymm2, ymm3, 0xdd
1618     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1619 
1620     // step 2 - convert to U and V
1621     // from here down is very similar to Y code except
1622     // instead of 32 different pixels, its 16 pixels of U and 16 of V
1623     vpmaddubsw ymm1, ymm0, ymm7  // U
1624     vpmaddubsw ymm3, ymm2, ymm7
1625     vpmaddubsw ymm0, ymm0, ymm6  // V
1626     vpmaddubsw ymm2, ymm2, ymm6
1627     vphaddw    ymm1, ymm1, ymm3  // mutates
1628     vphaddw    ymm0, ymm0, ymm2
1629     vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
1630     vpaddw     ymm0, ymm0, ymm5
1631     vpsraw     ymm1, ymm1, 8
1632     vpsraw     ymm0, ymm0, 8
1633     vpacksswb  ymm0, ymm1, ymm0  // mutates
1634     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1635     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1636 
1637     // step 3 - store 16 U and 16 V values
1638     vextractf128 [edx], ymm0, 0  // U
1639     vextractf128 [edx + edi], ymm0, 1  // V
1640     lea        edx, [edx + 16]
1641     sub        ecx, 32
1642     jg         convertloop
1643 
1644     pop        edi
1645     pop        esi
1646     vzeroupper
1647     ret
1648   }
1649 }
1650 #endif  // HAS_ARGBTOUVJROW_AVX2
1651 
1652 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1653                                             uint8* dst_u,
1654                                             uint8* dst_v,
1655                                             int width) {
1656   __asm {
1657     push       edi
1658     mov        eax, [esp + 4 + 4]  // src_argb
1659     mov        edx, [esp + 4 + 8]  // dst_u
1660     mov        edi, [esp + 4 + 12]  // dst_v
1661     mov        ecx, [esp + 4 + 16]  // width
1662     movdqa     xmm5, xmmword ptr kAddUV128
1663     movdqa     xmm6, xmmword ptr kARGBToV
1664     movdqa     xmm7, xmmword ptr kARGBToU
1665     sub        edi, edx    // stride from u to v
1666 
1667  convertloop:
1668         /* convert to U and V */
1669     movdqu     xmm0, [eax]  // U
1670     movdqu     xmm1, [eax + 16]
1671     movdqu     xmm2, [eax + 32]
1672     movdqu     xmm3, [eax + 48]
1673     pmaddubsw  xmm0, xmm7
1674     pmaddubsw  xmm1, xmm7
1675     pmaddubsw  xmm2, xmm7
1676     pmaddubsw  xmm3, xmm7
1677     phaddw     xmm0, xmm1
1678     phaddw     xmm2, xmm3
1679     psraw      xmm0, 8
1680     psraw      xmm2, 8
1681     packsswb   xmm0, xmm2
1682     paddb      xmm0, xmm5
1683     movdqu     [edx], xmm0
1684 
1685     movdqu     xmm0, [eax]  // V
1686     movdqu     xmm1, [eax + 16]
1687     movdqu     xmm2, [eax + 32]
1688     movdqu     xmm3, [eax + 48]
1689     pmaddubsw  xmm0, xmm6
1690     pmaddubsw  xmm1, xmm6
1691     pmaddubsw  xmm2, xmm6
1692     pmaddubsw  xmm3, xmm6
1693     phaddw     xmm0, xmm1
1694     phaddw     xmm2, xmm3
1695     psraw      xmm0, 8
1696     psraw      xmm2, 8
1697     packsswb   xmm0, xmm2
1698     paddb      xmm0, xmm5
1699     lea        eax,  [eax + 64]
1700     movdqu     [edx + edi], xmm0
1701     lea        edx,  [edx + 16]
1702     sub        ecx,  16
1703     jg         convertloop
1704 
1705     pop        edi
1706     ret
1707   }
1708 }
1709 
1710 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
1711                                          int src_stride_argb,
1712                                          uint8* dst_u,
1713                                          uint8* dst_v,
1714                                          int width) {
1715   __asm {
1716     push       esi
1717     push       edi
1718     mov        eax, [esp + 8 + 4]  // src_argb
1719     mov        esi, [esp + 8 + 8]  // src_stride_argb
1720     mov        edx, [esp + 8 + 12]  // dst_u
1721     mov        edi, [esp + 8 + 16]  // dst_v
1722     mov        ecx, [esp + 8 + 20]  // width
1723     movdqa     xmm5, xmmword ptr kAddUV128
1724     movdqa     xmm6, xmmword ptr kBGRAToV
1725     movdqa     xmm7, xmmword ptr kBGRAToU
1726     sub        edi, edx  // stride from u to v
1727 
1728  convertloop:
1729          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1730     movdqu     xmm0, [eax]
1731     movdqu     xmm4, [eax + esi]
1732     pavgb      xmm0, xmm4
1733     movdqu     xmm1, [eax + 16]
1734     movdqu     xmm4, [eax + esi + 16]
1735     pavgb      xmm1, xmm4
1736     movdqu     xmm2, [eax + 32]
1737     movdqu     xmm4, [eax + esi + 32]
1738     pavgb      xmm2, xmm4
1739     movdqu     xmm3, [eax + 48]
1740     movdqu     xmm4, [eax + esi + 48]
1741     pavgb      xmm3, xmm4
1742 
1743     lea        eax,  [eax + 64]
1744     movdqa     xmm4, xmm0
1745     shufps     xmm0, xmm1, 0x88
1746     shufps     xmm4, xmm1, 0xdd
1747     pavgb      xmm0, xmm4
1748     movdqa     xmm4, xmm2
1749     shufps     xmm2, xmm3, 0x88
1750     shufps     xmm4, xmm3, 0xdd
1751     pavgb      xmm2, xmm4
1752 
1753     // step 2 - convert to U and V
1754     // from here down is very similar to Y code except
1755     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1756     movdqa     xmm1, xmm0
1757     movdqa     xmm3, xmm2
1758     pmaddubsw  xmm0, xmm7  // U
1759     pmaddubsw  xmm2, xmm7
1760     pmaddubsw  xmm1, xmm6  // V
1761     pmaddubsw  xmm3, xmm6
1762     phaddw     xmm0, xmm2
1763     phaddw     xmm1, xmm3
1764     psraw      xmm0, 8
1765     psraw      xmm1, 8
1766     packsswb   xmm0, xmm1
1767     paddb      xmm0, xmm5  // -> unsigned
1768 
1769     // step 3 - store 8 U and 8 V values
1770     movlps     qword ptr [edx], xmm0  // U
1771     movhps     qword ptr [edx + edi], xmm0  // V
1772     lea        edx, [edx + 8]
1773     sub        ecx, 16
1774     jg         convertloop
1775 
1776     pop        edi
1777     pop        esi
1778     ret
1779   }
1780 }
1781 
1782 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
1783                                          int src_stride_argb,
1784                                          uint8* dst_u,
1785                                          uint8* dst_v,
1786                                          int width) {
1787   __asm {
1788     push       esi
1789     push       edi
1790     mov        eax, [esp + 8 + 4]  // src_argb
1791     mov        esi, [esp + 8 + 8]  // src_stride_argb
1792     mov        edx, [esp + 8 + 12]  // dst_u
1793     mov        edi, [esp + 8 + 16]  // dst_v
1794     mov        ecx, [esp + 8 + 20]  // width
1795     movdqa     xmm5, xmmword ptr kAddUV128
1796     movdqa     xmm6, xmmword ptr kABGRToV
1797     movdqa     xmm7, xmmword ptr kABGRToU
1798     sub        edi, edx  // stride from u to v
1799 
1800  convertloop:
1801          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1802     movdqu     xmm0, [eax]
1803     movdqu     xmm4, [eax + esi]
1804     pavgb      xmm0, xmm4
1805     movdqu     xmm1, [eax + 16]
1806     movdqu     xmm4, [eax + esi + 16]
1807     pavgb      xmm1, xmm4
1808     movdqu     xmm2, [eax + 32]
1809     movdqu     xmm4, [eax + esi + 32]
1810     pavgb      xmm2, xmm4
1811     movdqu     xmm3, [eax + 48]
1812     movdqu     xmm4, [eax + esi + 48]
1813     pavgb      xmm3, xmm4
1814 
1815     lea        eax,  [eax + 64]
1816     movdqa     xmm4, xmm0
1817     shufps     xmm0, xmm1, 0x88
1818     shufps     xmm4, xmm1, 0xdd
1819     pavgb      xmm0, xmm4
1820     movdqa     xmm4, xmm2
1821     shufps     xmm2, xmm3, 0x88
1822     shufps     xmm4, xmm3, 0xdd
1823     pavgb      xmm2, xmm4
1824 
1825     // step 2 - convert to U and V
1826     // from here down is very similar to Y code except
1827     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1828     movdqa     xmm1, xmm0
1829     movdqa     xmm3, xmm2
1830     pmaddubsw  xmm0, xmm7  // U
1831     pmaddubsw  xmm2, xmm7
1832     pmaddubsw  xmm1, xmm6  // V
1833     pmaddubsw  xmm3, xmm6
1834     phaddw     xmm0, xmm2
1835     phaddw     xmm1, xmm3
1836     psraw      xmm0, 8
1837     psraw      xmm1, 8
1838     packsswb   xmm0, xmm1
1839     paddb      xmm0, xmm5  // -> unsigned
1840 
1841     // step 3 - store 8 U and 8 V values
1842     movlps     qword ptr [edx], xmm0  // U
1843     movhps     qword ptr [edx + edi], xmm0  // V
1844     lea        edx, [edx + 8]
1845     sub        ecx, 16
1846     jg         convertloop
1847 
1848     pop        edi
1849     pop        esi
1850     ret
1851   }
1852 }
1853 
1854 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
1855                                          int src_stride_argb,
1856                                          uint8* dst_u,
1857                                          uint8* dst_v,
1858                                          int width) {
1859   __asm {
1860     push       esi
1861     push       edi
1862     mov        eax, [esp + 8 + 4]  // src_argb
1863     mov        esi, [esp + 8 + 8]  // src_stride_argb
1864     mov        edx, [esp + 8 + 12]  // dst_u
1865     mov        edi, [esp + 8 + 16]  // dst_v
1866     mov        ecx, [esp + 8 + 20]  // width
1867     movdqa     xmm5, xmmword ptr kAddUV128
1868     movdqa     xmm6, xmmword ptr kRGBAToV
1869     movdqa     xmm7, xmmword ptr kRGBAToU
1870     sub        edi, edx  // stride from u to v
1871 
1872  convertloop:
1873          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874     movdqu     xmm0, [eax]
1875     movdqu     xmm4, [eax + esi]
1876     pavgb      xmm0, xmm4
1877     movdqu     xmm1, [eax + 16]
1878     movdqu     xmm4, [eax + esi + 16]
1879     pavgb      xmm1, xmm4
1880     movdqu     xmm2, [eax + 32]
1881     movdqu     xmm4, [eax + esi + 32]
1882     pavgb      xmm2, xmm4
1883     movdqu     xmm3, [eax + 48]
1884     movdqu     xmm4, [eax + esi + 48]
1885     pavgb      xmm3, xmm4
1886 
1887     lea        eax,  [eax + 64]
1888     movdqa     xmm4, xmm0
1889     shufps     xmm0, xmm1, 0x88
1890     shufps     xmm4, xmm1, 0xdd
1891     pavgb      xmm0, xmm4
1892     movdqa     xmm4, xmm2
1893     shufps     xmm2, xmm3, 0x88
1894     shufps     xmm4, xmm3, 0xdd
1895     pavgb      xmm2, xmm4
1896 
1897     // step 2 - convert to U and V
1898     // from here down is very similar to Y code except
1899     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1900     movdqa     xmm1, xmm0
1901     movdqa     xmm3, xmm2
1902     pmaddubsw  xmm0, xmm7  // U
1903     pmaddubsw  xmm2, xmm7
1904     pmaddubsw  xmm1, xmm6  // V
1905     pmaddubsw  xmm3, xmm6
1906     phaddw     xmm0, xmm2
1907     phaddw     xmm1, xmm3
1908     psraw      xmm0, 8
1909     psraw      xmm1, 8
1910     packsswb   xmm0, xmm1
1911     paddb      xmm0, xmm5  // -> unsigned
1912 
1913     // step 3 - store 8 U and 8 V values
1914     movlps     qword ptr [edx], xmm0  // U
1915     movhps     qword ptr [edx + edi], xmm0  // V
1916     lea        edx, [edx + 8]
1917     sub        ecx, 16
1918     jg         convertloop
1919 
1920     pop        edi
1921     pop        esi
1922     ret
1923   }
1924 }
1925 #endif  // HAS_ARGBTOYROW_SSSE3
1926 
1927 // Read 16 UV from 444
1928 #define READYUV444_AVX2 \
1929   __asm {                                                \
1930     __asm vmovdqu    xmm0, [esi] /* U */                      \
1931     __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
1932     __asm lea        esi,  [esi + 16]                                          \
1933     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1934     __asm vpermq     ymm1, ymm1, 0xd8                                          \
1935     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1936     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1937     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1938     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1939     __asm lea        eax, [eax + 16]}
1940 
1941 // Read 8 UV from 422, upsample to 16 UV.
1942 #define READYUV422_AVX2 \
1943   __asm {                                                \
1944     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
1945     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
1946     __asm lea        esi,  [esi + 8]                                           \
1947     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1948     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1949     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1950     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1951     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1952     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1953     __asm lea        eax, [eax + 16]}
1954 
1955 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
1956 #define READYUVA422_AVX2 \
1957   __asm {                                               \
1958     __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
1959     __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
1960     __asm lea        esi,  [esi + 8]                                           \
1961     __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
1962     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1963     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1964     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1965     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1966     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1967     __asm lea        eax, [eax + 16]                                           \
1968     __asm vmovdqu    xmm5, [ebp] /* A */                      \
1969     __asm vpermq     ymm5, ymm5, 0xd8                                          \
1970     __asm lea        ebp, [ebp + 16]}
1971 
1972 // Read 8 UV from NV12, upsample to 16 UV.
1973 #define READNV12_AVX2 \
1974   __asm {                                                  \
1975     __asm vmovdqu    xmm0, [esi] /* UV */                     \
1976     __asm lea        esi,  [esi + 16]                                          \
1977     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1978     __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
1979     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1980     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1981     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1982     __asm lea        eax, [eax + 16]}
1983 
1984 // Read 8 UV from NV21, upsample to 16 UV.
1985 #define READNV21_AVX2 \
1986   __asm {                                                  \
1987     __asm vmovdqu    xmm0, [esi] /* UV */                     \
1988     __asm lea        esi,  [esi + 16]                                          \
1989     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1990     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
1991     __asm vmovdqu    xmm4, [eax] /* Y */                      \
1992     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1993     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1994     __asm lea        eax, [eax + 16]}
1995 
1996 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
1997 #define READYUY2_AVX2 \
1998   __asm {                                                  \
1999     __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
2000     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
2001     __asm vmovdqu    ymm0, [eax] /* UV */                             \
2002     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
2003     __asm lea        eax, [eax + 32]}
2004 
2005 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2006 #define READUYVY_AVX2 \
2007   __asm {                                                  \
2008     __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
2009     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
2010     __asm vmovdqu    ymm0, [eax] /* UV */                             \
2011     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
2012     __asm lea        eax, [eax + 32]}
2013 
2014 // Convert 16 pixels: 16 UV and 16 Y.
2015 #define YUVTORGB_AVX2(YuvConstants) \
2016   __asm {                                    \
2017     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
2018     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
2019     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
2020     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
2021     __asm vpsubw     ymm2, ymm3, ymm2                                          \
2022     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
2023     __asm vpsubw     ymm1, ymm3, ymm1                                          \
2024     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
2025     __asm vpsubw     ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */                       \
2026     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
2027     __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
2028     __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
2029     __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
2030     __asm vpsraw     ymm0, ymm0, 6                                             \
2031     __asm vpsraw     ymm1, ymm1, 6                                             \
2032     __asm vpsraw     ymm2, ymm2, 6                                             \
2033     __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
2034     __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
2035     __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
2036   }
2037 
2038 // Store 16 ARGB values.
2039 #define STOREARGB_AVX2 \
2040   __asm {                                                 \
2041     __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
2042     __asm vpermq     ymm0, ymm0, 0xd8                                          \
2043     __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
2044     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2045     __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
2046     __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
2047     __asm vmovdqu    0[edx], ymm1                                              \
2048     __asm vmovdqu    32[edx], ymm0                                             \
2049     __asm lea        edx,  [edx + 64]}
2050 
2051 // Store 16 RGBA values.
2052 #define STORERGBA_AVX2 \
2053   __asm {                                                 \
2054     __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
2055     __asm vpermq     ymm1, ymm1, 0xd8                                          \
2056     __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
2057     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2058     __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
2059     __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
2060     __asm vmovdqu    [edx], ymm0                                               \
2061     __asm vmovdqu    [edx + 32], ymm1                                          \
2062     __asm lea        edx,  [edx + 64]}
2063 
2064 #ifdef HAS_I422TOARGBROW_AVX2
2065 // 16 pixels
2066 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2067 __declspec(naked) void I422ToARGBRow_AVX2(
2068     const uint8* y_buf,
2069     const uint8* u_buf,
2070     const uint8* v_buf,
2071     uint8* dst_argb,
2072     const struct YuvConstants* yuvconstants,
2073     int width) {
2074   __asm {
2075     push       esi
2076     push       edi
2077     push       ebx
2078     mov        eax, [esp + 12 + 4]  // Y
2079     mov        esi, [esp + 12 + 8]  // U
2080     mov        edi, [esp + 12 + 12]  // V
2081     mov        edx, [esp + 12 + 16]  // argb
2082     mov        ebx, [esp + 12 + 20]  // yuvconstants
2083     mov        ecx, [esp + 12 + 24]  // width
2084     sub        edi, esi
2085     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2086 
2087  convertloop:
2088     READYUV422_AVX2
2089     YUVTORGB_AVX2(ebx)
2090     STOREARGB_AVX2
2091 
2092     sub        ecx, 16
2093     jg         convertloop
2094 
2095     pop        ebx
2096     pop        edi
2097     pop        esi
2098     vzeroupper
2099     ret
2100   }
2101 }
2102 #endif  // HAS_I422TOARGBROW_AVX2
2103 
2104 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2105 // 16 pixels
2106 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2107 __declspec(naked) void I422AlphaToARGBRow_AVX2(
2108     const uint8* y_buf,
2109     const uint8* u_buf,
2110     const uint8* v_buf,
2111     const uint8* a_buf,
2112     uint8* dst_argb,
2113     const struct YuvConstants* yuvconstants,
2114     int width) {
2115   __asm {
2116     push       esi
2117     push       edi
2118     push       ebx
2119     push       ebp
2120     mov        eax, [esp + 16 + 4]  // Y
2121     mov        esi, [esp + 16 + 8]  // U
2122     mov        edi, [esp + 16 + 12]  // V
2123     mov        ebp, [esp + 16 + 16]  // A
2124     mov        edx, [esp + 16 + 20]  // argb
2125     mov        ebx, [esp + 16 + 24]  // yuvconstants
2126     mov        ecx, [esp + 16 + 28]  // width
2127     sub        edi, esi
2128 
2129  convertloop:
2130     READYUVA422_AVX2
2131     YUVTORGB_AVX2(ebx)
2132     STOREARGB_AVX2
2133 
2134     sub        ecx, 16
2135     jg         convertloop
2136 
2137     pop        ebp
2138     pop        ebx
2139     pop        edi
2140     pop        esi
2141     vzeroupper
2142     ret
2143   }
2144 }
2145 #endif  // HAS_I422ALPHATOARGBROW_AVX2
2146 
2147 #ifdef HAS_I444TOARGBROW_AVX2
2148 // 16 pixels
2149 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2150 __declspec(naked) void I444ToARGBRow_AVX2(
2151     const uint8* y_buf,
2152     const uint8* u_buf,
2153     const uint8* v_buf,
2154     uint8* dst_argb,
2155     const struct YuvConstants* yuvconstants,
2156     int width) {
2157   __asm {
2158     push       esi
2159     push       edi
2160     push       ebx
2161     mov        eax, [esp + 12 + 4]  // Y
2162     mov        esi, [esp + 12 + 8]  // U
2163     mov        edi, [esp + 12 + 12]  // V
2164     mov        edx, [esp + 12 + 16]  // argb
2165     mov        ebx, [esp + 12 + 20]  // yuvconstants
2166     mov        ecx, [esp + 12 + 24]  // width
2167     sub        edi, esi
2168     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2169  convertloop:
2170     READYUV444_AVX2
2171     YUVTORGB_AVX2(ebx)
2172     STOREARGB_AVX2
2173 
2174     sub        ecx, 16
2175     jg         convertloop
2176 
2177     pop        ebx
2178     pop        edi
2179     pop        esi
2180     vzeroupper
2181     ret
2182   }
2183 }
2184 #endif  // HAS_I444TOARGBROW_AVX2
2185 
2186 #ifdef HAS_NV12TOARGBROW_AVX2
2187 // 16 pixels.
2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2189 __declspec(naked) void NV12ToARGBRow_AVX2(
2190     const uint8* y_buf,
2191     const uint8* uv_buf,
2192     uint8* dst_argb,
2193     const struct YuvConstants* yuvconstants,
2194     int width) {
2195   __asm {
2196     push       esi
2197     push       ebx
2198     mov        eax, [esp + 8 + 4]  // Y
2199     mov        esi, [esp + 8 + 8]  // UV
2200     mov        edx, [esp + 8 + 12]  // argb
2201     mov        ebx, [esp + 8 + 16]  // yuvconstants
2202     mov        ecx, [esp + 8 + 20]  // width
2203     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2204 
2205  convertloop:
2206     READNV12_AVX2
2207     YUVTORGB_AVX2(ebx)
2208     STOREARGB_AVX2
2209 
2210     sub        ecx, 16
2211     jg         convertloop
2212 
2213     pop        ebx
2214     pop        esi
2215     vzeroupper
2216     ret
2217   }
2218 }
2219 #endif  // HAS_NV12TOARGBROW_AVX2
2220 
2221 #ifdef HAS_NV21TOARGBROW_AVX2
2222 // 16 pixels.
2223 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2224 __declspec(naked) void NV21ToARGBRow_AVX2(
2225     const uint8* y_buf,
2226     const uint8* vu_buf,
2227     uint8* dst_argb,
2228     const struct YuvConstants* yuvconstants,
2229     int width) {
2230   __asm {
2231     push       esi
2232     push       ebx
2233     mov        eax, [esp + 8 + 4]  // Y
2234     mov        esi, [esp + 8 + 8]  // VU
2235     mov        edx, [esp + 8 + 12]  // argb
2236     mov        ebx, [esp + 8 + 16]  // yuvconstants
2237     mov        ecx, [esp + 8 + 20]  // width
2238     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2239 
2240  convertloop:
2241     READNV21_AVX2
2242     YUVTORGB_AVX2(ebx)
2243     STOREARGB_AVX2
2244 
2245     sub        ecx, 16
2246     jg         convertloop
2247 
2248     pop        ebx
2249     pop        esi
2250     vzeroupper
2251     ret
2252   }
2253 }
2254 #endif  // HAS_NV21TOARGBROW_AVX2
2255 
2256 #ifdef HAS_YUY2TOARGBROW_AVX2
2257 // 16 pixels.
2258 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2259 __declspec(naked) void YUY2ToARGBRow_AVX2(
2260     const uint8* src_yuy2,
2261     uint8* dst_argb,
2262     const struct YuvConstants* yuvconstants,
2263     int width) {
2264   __asm {
2265     push       ebx
2266     mov        eax, [esp + 4 + 4]  // yuy2
2267     mov        edx, [esp + 4 + 8]  // argb
2268     mov        ebx, [esp + 4 + 12]  // yuvconstants
2269     mov        ecx, [esp + 4 + 16]  // width
2270     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2271 
2272  convertloop:
2273     READYUY2_AVX2
2274     YUVTORGB_AVX2(ebx)
2275     STOREARGB_AVX2
2276 
2277     sub        ecx, 16
2278     jg         convertloop
2279 
2280     pop        ebx
2281     vzeroupper
2282     ret
2283   }
2284 }
2285 #endif  // HAS_YUY2TOARGBROW_AVX2
2286 
2287 #ifdef HAS_UYVYTOARGBROW_AVX2
2288 // 16 pixels.
2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2290 __declspec(naked) void UYVYToARGBRow_AVX2(
2291     const uint8* src_uyvy,
2292     uint8* dst_argb,
2293     const struct YuvConstants* yuvconstants,
2294     int width) {
2295   __asm {
2296     push       ebx
2297     mov        eax, [esp + 4 + 4]  // uyvy
2298     mov        edx, [esp + 4 + 8]  // argb
2299     mov        ebx, [esp + 4 + 12]  // yuvconstants
2300     mov        ecx, [esp + 4 + 16]  // width
2301     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2302 
2303  convertloop:
2304     READUYVY_AVX2
2305     YUVTORGB_AVX2(ebx)
2306     STOREARGB_AVX2
2307 
2308     sub        ecx, 16
2309     jg         convertloop
2310 
2311     pop        ebx
2312     vzeroupper
2313     ret
2314   }
2315 }
2316 #endif  // HAS_UYVYTOARGBROW_AVX2
2317 
2318 #ifdef HAS_I422TORGBAROW_AVX2
2319 // 16 pixels
2320 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2321 __declspec(naked) void I422ToRGBARow_AVX2(
2322     const uint8* y_buf,
2323     const uint8* u_buf,
2324     const uint8* v_buf,
2325     uint8* dst_argb,
2326     const struct YuvConstants* yuvconstants,
2327     int width) {
2328   __asm {
2329     push       esi
2330     push       edi
2331     push       ebx
2332     mov        eax, [esp + 12 + 4]  // Y
2333     mov        esi, [esp + 12 + 8]  // U
2334     mov        edi, [esp + 12 + 12]  // V
2335     mov        edx, [esp + 12 + 16]  // abgr
2336     mov        ebx, [esp + 12 + 20]  // yuvconstants
2337     mov        ecx, [esp + 12 + 24]  // width
2338     sub        edi, esi
2339     vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
2340 
2341  convertloop:
2342     READYUV422_AVX2
2343     YUVTORGB_AVX2(ebx)
2344     STORERGBA_AVX2
2345 
2346     sub        ecx, 16
2347     jg         convertloop
2348 
2349     pop        ebx
2350     pop        edi
2351     pop        esi
2352     vzeroupper
2353     ret
2354   }
2355 }
2356 #endif  // HAS_I422TORGBAROW_AVX2
2357 
2358 #if defined(HAS_I422TOARGBROW_SSSE3)
2359 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2360 // Allows a conversion with half size scaling.
2361 
2362 // Read 8 UV from 444.
2363 #define READYUV444 \
2364   __asm {                                                     \
2365     __asm movq       xmm0, qword ptr [esi] /* U */                             \
2366     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
2367     __asm lea        esi,  [esi + 8]                                           \
2368     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2369     __asm movq       xmm4, qword ptr [eax]                                     \
2370     __asm punpcklbw  xmm4, xmm4                                                \
2371     __asm lea        eax, [eax + 8]}
2372 
2373 // Read 4 UV from 422, upsample to 8 UV.
2374 #define READYUV422 \
2375   __asm {                                                     \
2376     __asm movd       xmm0, [esi] /* U */                              \
2377     __asm movd       xmm1, [esi + edi] /* V */                              \
2378     __asm lea        esi,  [esi + 4]                                           \
2379     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2380     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2381     __asm movq       xmm4, qword ptr [eax]                                     \
2382     __asm punpcklbw  xmm4, xmm4                                                \
2383     __asm lea        eax, [eax + 8]}
2384 
2385 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
2386 #define READYUVA422 \
2387   __asm {                                                    \
2388     __asm movd       xmm0, [esi] /* U */                              \
2389     __asm movd       xmm1, [esi + edi] /* V */                              \
2390     __asm lea        esi,  [esi + 4]                                           \
2391     __asm punpcklbw  xmm0, xmm1 /* UV */                             \
2392     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2393     __asm movq       xmm4, qword ptr [eax] /* Y */                           \
2394     __asm punpcklbw  xmm4, xmm4                                                \
2395     __asm lea        eax, [eax + 8]                                            \
2396     __asm movq       xmm5, qword ptr [ebp] /* A */                           \
2397     __asm lea        ebp, [ebp + 8]}
2398 
2399 // Read 4 UV from NV12, upsample to 8 UV.
2400 #define READNV12 \
2401   __asm {                                                       \
2402     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2403     __asm lea        esi,  [esi + 8]                                           \
2404     __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
2405     __asm movq       xmm4, qword ptr [eax]                                     \
2406     __asm punpcklbw  xmm4, xmm4                                                \
2407     __asm lea        eax, [eax + 8]}
2408 
2409 // Read 4 VU from NV21, upsample to 8 UV.
2410 #define READNV21 \
2411   __asm {                                                       \
2412     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2413     __asm lea        esi,  [esi + 8]                                           \
2414     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
2415     __asm movq       xmm4, qword ptr [eax]                                     \
2416     __asm punpcklbw  xmm4, xmm4                                                \
2417     __asm lea        eax, [eax + 8]}
2418 
2419 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2420 #define READYUY2 \
2421   __asm {                                                       \
2422     __asm movdqu     xmm4, [eax] /* YUY2 */                           \
2423     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
2424     __asm movdqu     xmm0, [eax] /* UV */                             \
2425     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
2426     __asm lea        eax, [eax + 16]}
2427 
2428 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2429 #define READUYVY \
2430   __asm {                                                       \
2431     __asm movdqu     xmm4, [eax] /* UYVY */                           \
2432     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
2433     __asm movdqu     xmm0, [eax] /* UV */                             \
2434     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
2435     __asm lea        eax, [eax + 16]}
2436 
2437 // Convert 8 pixels: 8 UV and 8 Y.
2438 #define YUVTORGB(YuvConstants) \
2439   __asm {                                         \
2440     __asm movdqa     xmm1, xmm0                                                \
2441     __asm movdqa     xmm2, xmm0                                                \
2442     __asm movdqa     xmm3, xmm0                                                \
2443     __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
2444     __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
2445     __asm psubw      xmm0, xmm1                                                \
2446     __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
2447     __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
2448     __asm psubw      xmm1, xmm2                                                \
2449     __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
2450     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
2451     __asm psubw      xmm2, xmm3                                                \
2452     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
2453     __asm paddsw     xmm0, xmm4 /* B += Y */                         \
2454     __asm paddsw     xmm1, xmm4 /* G += Y */                         \
2455     __asm paddsw     xmm2, xmm4 /* R += Y */                         \
2456     __asm psraw      xmm0, 6                                                   \
2457     __asm psraw      xmm1, 6                                                   \
2458     __asm psraw      xmm2, 6                                                   \
2459     __asm packuswb   xmm0, xmm0 /* B */                              \
2460     __asm packuswb   xmm1, xmm1 /* G */                              \
2461     __asm packuswb   xmm2, xmm2 /* R */             \
2462   }
2463 
2464 // Store 8 ARGB values.
2465 #define STOREARGB \
2466   __asm {                                                      \
2467     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2468     __asm punpcklbw  xmm2, xmm5 /* RA */                             \
2469     __asm movdqa     xmm1, xmm0                                                \
2470     __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
2471     __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
2472     __asm movdqu     0[edx], xmm0                                              \
2473     __asm movdqu     16[edx], xmm1                                             \
2474     __asm lea        edx,  [edx + 32]}
2475 
2476 // Store 8 BGRA values.
2477 #define STOREBGRA \
2478   __asm {                                                      \
2479     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
2480     __asm punpcklbw  xmm1, xmm0 /* GB */                             \
2481     __asm punpcklbw  xmm5, xmm2 /* AR */                             \
2482     __asm movdqa     xmm0, xmm5                                                \
2483     __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
2484     __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
2485     __asm movdqu     0[edx], xmm5                                              \
2486     __asm movdqu     16[edx], xmm0                                             \
2487     __asm lea        edx,  [edx + 32]}
2488 
2489 // Store 8 RGBA values.
2490 #define STORERGBA \
2491   __asm {                                                      \
2492     __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
2493     __asm punpcklbw  xmm1, xmm2 /* GR */                             \
2494     __asm punpcklbw  xmm5, xmm0 /* AB */                             \
2495     __asm movdqa     xmm0, xmm5                                                \
2496     __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
2497     __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
2498     __asm movdqu     0[edx], xmm5                                              \
2499     __asm movdqu     16[edx], xmm0                                             \
2500     __asm lea        edx,  [edx + 32]}
2501 
2502 // Store 8 RGB24 values.
2503 #define STORERGB24 \
2504   __asm {/* Weave into RRGB */                                                      \
2505     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2506     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
2507     __asm movdqa     xmm1, xmm0                                                \
2508     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
2509     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */                                                        \
2510     __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2511     __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
2512     __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2513     __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
2514     __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
2515     __asm lea        edx,  [edx + 24]}
2516 
2517 // Store 8 RGB565 values.
2518 #define STORERGB565 \
2519   __asm {/* Weave into RRGB */                                                      \
2520     __asm punpcklbw  xmm0, xmm1 /* BG */                             \
2521     __asm punpcklbw  xmm2, xmm2 /* RR */                             \
2522     __asm movdqa     xmm1, xmm0                                                \
2523     __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
2524     __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */                                                       \
2525     __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
2526     __asm movdqa     xmm2, xmm0 /* G */                                     \
2527     __asm pslld      xmm0, 8 /* R */                                     \
2528     __asm psrld      xmm3, 3 /* B */                                     \
2529     __asm psrld      xmm2, 5 /* G */                                     \
2530     __asm psrad      xmm0, 16 /* R */                                     \
2531     __asm pand       xmm3, xmm5 /* B */                                     \
2532     __asm pand       xmm2, xmm6 /* G */                                     \
2533     __asm pand       xmm0, xmm7 /* R */                                     \
2534     __asm por        xmm3, xmm2 /* BG */                                    \
2535     __asm por        xmm0, xmm3 /* BGR */                                   \
2536     __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
2537     __asm movdqa     xmm2, xmm1 /* G */                                     \
2538     __asm pslld      xmm1, 8 /* R */                                     \
2539     __asm psrld      xmm3, 3 /* B */                                     \
2540     __asm psrld      xmm2, 5 /* G */                                     \
2541     __asm psrad      xmm1, 16 /* R */                                     \
2542     __asm pand       xmm3, xmm5 /* B */                                     \
2543     __asm pand       xmm2, xmm6 /* G */                                     \
2544     __asm pand       xmm1, xmm7 /* R */                                     \
2545     __asm por        xmm3, xmm2 /* BG */                                    \
2546     __asm por        xmm1, xmm3 /* BGR */                                   \
2547     __asm packssdw   xmm0, xmm1                                                \
2548     __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
2549     __asm lea        edx, [edx + 16]}
2550 
2551 // 8 pixels.
2552 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2553 __declspec(naked) void I444ToARGBRow_SSSE3(
2554     const uint8* y_buf,
2555     const uint8* u_buf,
2556     const uint8* v_buf,
2557     uint8* dst_argb,
2558     const struct YuvConstants* yuvconstants,
2559     int width) {
2560   __asm {
2561     push       esi
2562     push       edi
2563     push       ebx
2564     mov        eax, [esp + 12 + 4]  // Y
2565     mov        esi, [esp + 12 + 8]  // U
2566     mov        edi, [esp + 12 + 12]  // V
2567     mov        edx, [esp + 12 + 16]  // argb
2568     mov        ebx, [esp + 12 + 20]  // yuvconstants
2569     mov        ecx, [esp + 12 + 24]  // width
2570     sub        edi, esi
2571     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2572 
2573  convertloop:
2574     READYUV444
2575     YUVTORGB(ebx)
2576     STOREARGB
2577 
2578     sub        ecx, 8
2579     jg         convertloop
2580 
2581     pop        ebx
2582     pop        edi
2583     pop        esi
2584     ret
2585   }
2586 }
2587 
2588 // 8 pixels.
2589 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2590 __declspec(naked) void I422ToRGB24Row_SSSE3(
2591     const uint8* y_buf,
2592     const uint8* u_buf,
2593     const uint8* v_buf,
2594     uint8* dst_rgb24,
2595     const struct YuvConstants* yuvconstants,
2596     int width) {
2597   __asm {
2598     push       esi
2599     push       edi
2600     push       ebx
2601     mov        eax, [esp + 12 + 4]  // Y
2602     mov        esi, [esp + 12 + 8]  // U
2603     mov        edi, [esp + 12 + 12]  // V
2604     mov        edx, [esp + 12 + 16]  // argb
2605     mov        ebx, [esp + 12 + 20]  // yuvconstants
2606     mov        ecx, [esp + 12 + 24]  // width
2607     sub        edi, esi
2608     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2609     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2610 
2611  convertloop:
2612     READYUV422
2613     YUVTORGB(ebx)
2614     STORERGB24
2615 
2616     sub        ecx, 8
2617     jg         convertloop
2618 
2619     pop        ebx
2620     pop        edi
2621     pop        esi
2622     ret
2623   }
2624 }
2625 
2626 // 8 pixels
2627 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2628 __declspec(naked) void I422ToRGB565Row_SSSE3(
2629     const uint8* y_buf,
2630     const uint8* u_buf,
2631     const uint8* v_buf,
2632     uint8* rgb565_buf,
2633     const struct YuvConstants* yuvconstants,
2634     int width) {
2635   __asm {
2636     push       esi
2637     push       edi
2638     push       ebx
2639     mov        eax, [esp + 12 + 4]  // Y
2640     mov        esi, [esp + 12 + 8]  // U
2641     mov        edi, [esp + 12 + 12]  // V
2642     mov        edx, [esp + 12 + 16]  // argb
2643     mov        ebx, [esp + 12 + 20]  // yuvconstants
2644     mov        ecx, [esp + 12 + 24]  // width
2645     sub        edi, esi
2646     pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
2647     psrld      xmm5, 27
2648     pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
2649     psrld      xmm6, 26
2650     pslld      xmm6, 5
2651     pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
2652     pslld      xmm7, 11
2653 
2654  convertloop:
2655     READYUV422
2656     YUVTORGB(ebx)
2657     STORERGB565
2658 
2659     sub        ecx, 8
2660     jg         convertloop
2661 
2662     pop        ebx
2663     pop        edi
2664     pop        esi
2665     ret
2666   }
2667 }
2668 
2669 // 8 pixels.
2670 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2671 __declspec(naked) void I422ToARGBRow_SSSE3(
2672     const uint8* y_buf,
2673     const uint8* u_buf,
2674     const uint8* v_buf,
2675     uint8* dst_argb,
2676     const struct YuvConstants* yuvconstants,
2677     int width) {
2678   __asm {
2679     push       esi
2680     push       edi
2681     push       ebx
2682     mov        eax, [esp + 12 + 4]  // Y
2683     mov        esi, [esp + 12 + 8]  // U
2684     mov        edi, [esp + 12 + 12]  // V
2685     mov        edx, [esp + 12 + 16]  // argb
2686     mov        ebx, [esp + 12 + 20]  // yuvconstants
2687     mov        ecx, [esp + 12 + 24]  // width
2688     sub        edi, esi
2689     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2690 
2691  convertloop:
2692     READYUV422
2693     YUVTORGB(ebx)
2694     STOREARGB
2695 
2696     sub        ecx, 8
2697     jg         convertloop
2698 
2699     pop        ebx
2700     pop        edi
2701     pop        esi
2702     ret
2703   }
2704 }
2705 
2706 // 8 pixels.
2707 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2708 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
2709     const uint8* y_buf,
2710     const uint8* u_buf,
2711     const uint8* v_buf,
2712     const uint8* a_buf,
2713     uint8* dst_argb,
2714     const struct YuvConstants* yuvconstants,
2715     int width) {
2716   __asm {
2717     push       esi
2718     push       edi
2719     push       ebx
2720     push       ebp
2721     mov        eax, [esp + 16 + 4]  // Y
2722     mov        esi, [esp + 16 + 8]  // U
2723     mov        edi, [esp + 16 + 12]  // V
2724     mov        ebp, [esp + 16 + 16]  // A
2725     mov        edx, [esp + 16 + 20]  // argb
2726     mov        ebx, [esp + 16 + 24]  // yuvconstants
2727     mov        ecx, [esp + 16 + 28]  // width
2728     sub        edi, esi
2729 
2730  convertloop:
2731     READYUVA422
2732     YUVTORGB(ebx)
2733     STOREARGB
2734 
2735     sub        ecx, 8
2736     jg         convertloop
2737 
2738     pop        ebp
2739     pop        ebx
2740     pop        edi
2741     pop        esi
2742     ret
2743   }
2744 }
2745 
2746 // 8 pixels.
2747 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2748 __declspec(naked) void NV12ToARGBRow_SSSE3(
2749     const uint8* y_buf,
2750     const uint8* uv_buf,
2751     uint8* dst_argb,
2752     const struct YuvConstants* yuvconstants,
2753     int width) {
2754   __asm {
2755     push       esi
2756     push       ebx
2757     mov        eax, [esp + 8 + 4]  // Y
2758     mov        esi, [esp + 8 + 8]  // UV
2759     mov        edx, [esp + 8 + 12]  // argb
2760     mov        ebx, [esp + 8 + 16]  // yuvconstants
2761     mov        ecx, [esp + 8 + 20]  // width
2762     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2763 
2764  convertloop:
2765     READNV12
2766     YUVTORGB(ebx)
2767     STOREARGB
2768 
2769     sub        ecx, 8
2770     jg         convertloop
2771 
2772     pop        ebx
2773     pop        esi
2774     ret
2775   }
2776 }
2777 
2778 // 8 pixels.
2779 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2780 __declspec(naked) void NV21ToARGBRow_SSSE3(
2781     const uint8* y_buf,
2782     const uint8* vu_buf,
2783     uint8* dst_argb,
2784     const struct YuvConstants* yuvconstants,
2785     int width) {
2786   __asm {
2787     push       esi
2788     push       ebx
2789     mov        eax, [esp + 8 + 4]  // Y
2790     mov        esi, [esp + 8 + 8]  // VU
2791     mov        edx, [esp + 8 + 12]  // argb
2792     mov        ebx, [esp + 8 + 16]  // yuvconstants
2793     mov        ecx, [esp + 8 + 20]  // width
2794     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2795 
2796  convertloop:
2797     READNV21
2798     YUVTORGB(ebx)
2799     STOREARGB
2800 
2801     sub        ecx, 8
2802     jg         convertloop
2803 
2804     pop        ebx
2805     pop        esi
2806     ret
2807   }
2808 }
2809 
2810 // 8 pixels.
2811 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2812 __declspec(naked) void YUY2ToARGBRow_SSSE3(
2813     const uint8* src_yuy2,
2814     uint8* dst_argb,
2815     const struct YuvConstants* yuvconstants,
2816     int width) {
2817   __asm {
2818     push       ebx
2819     mov        eax, [esp + 4 + 4]  // yuy2
2820     mov        edx, [esp + 4 + 8]  // argb
2821     mov        ebx, [esp + 4 + 12]  // yuvconstants
2822     mov        ecx, [esp + 4 + 16]  // width
2823     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2824 
2825  convertloop:
2826     READYUY2
2827     YUVTORGB(ebx)
2828     STOREARGB
2829 
2830     sub        ecx, 8
2831     jg         convertloop
2832 
2833     pop        ebx
2834     ret
2835   }
2836 }
2837 
2838 // 8 pixels.
2839 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2840 __declspec(naked) void UYVYToARGBRow_SSSE3(
2841     const uint8* src_uyvy,
2842     uint8* dst_argb,
2843     const struct YuvConstants* yuvconstants,
2844     int width) {
2845   __asm {
2846     push       ebx
2847     mov        eax, [esp + 4 + 4]  // uyvy
2848     mov        edx, [esp + 4 + 8]  // argb
2849     mov        ebx, [esp + 4 + 12]  // yuvconstants
2850     mov        ecx, [esp + 4 + 16]  // width
2851     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2852 
2853  convertloop:
2854     READUYVY
2855     YUVTORGB(ebx)
2856     STOREARGB
2857 
2858     sub        ecx, 8
2859     jg         convertloop
2860 
2861     pop        ebx
2862     ret
2863   }
2864 }
2865 
2866 __declspec(naked) void I422ToRGBARow_SSSE3(
2867     const uint8* y_buf,
2868     const uint8* u_buf,
2869     const uint8* v_buf,
2870     uint8* dst_rgba,
2871     const struct YuvConstants* yuvconstants,
2872     int width) {
2873   __asm {
2874     push       esi
2875     push       edi
2876     push       ebx
2877     mov        eax, [esp + 12 + 4]  // Y
2878     mov        esi, [esp + 12 + 8]  // U
2879     mov        edi, [esp + 12 + 12]  // V
2880     mov        edx, [esp + 12 + 16]  // argb
2881     mov        ebx, [esp + 12 + 20]  // yuvconstants
2882     mov        ecx, [esp + 12 + 24]  // width
2883     sub        edi, esi
2884 
2885  convertloop:
2886     READYUV422
2887     YUVTORGB(ebx)
2888     STORERGBA
2889 
2890     sub        ecx, 8
2891     jg         convertloop
2892 
2893     pop        ebx
2894     pop        edi
2895     pop        esi
2896     ret
2897   }
2898 }
2899 #endif  // HAS_I422TOARGBROW_SSSE3
2900 
2901 #ifdef HAS_I400TOARGBROW_SSE2
2902 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2903 __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
2904                                           uint8* rgb_buf,
2905                                           int width) {
2906   __asm {
2907     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
2908     movd       xmm2, eax
2909     pshufd     xmm2, xmm2,0
2910     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
2911     movd       xmm3, eax
2912     pshufd     xmm3, xmm3, 0
2913     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
2914     pslld      xmm4, 24
2915 
2916     mov        eax, [esp + 4]  // Y
2917     mov        edx, [esp + 8]  // rgb
2918     mov        ecx, [esp + 12]  // width
2919 
2920  convertloop:
2921         // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2922     movq       xmm0, qword ptr [eax]
2923     lea        eax, [eax + 8]
2924     punpcklbw  xmm0, xmm0  // Y.Y
2925     pmulhuw    xmm0, xmm2
2926     psubusw    xmm0, xmm3
2927     psrlw      xmm0, 6
2928     packuswb   xmm0, xmm0        // G
2929 
2930     // Step 2: Weave into ARGB
2931     punpcklbw  xmm0, xmm0  // GG
2932     movdqa     xmm1, xmm0
2933     punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
2934     punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
2935     por        xmm0, xmm4
2936     por        xmm1, xmm4
2937     movdqu     [edx], xmm0
2938     movdqu     [edx + 16], xmm1
2939     lea        edx,  [edx + 32]
2940     sub        ecx, 8
2941     jg         convertloop
2942     ret
2943   }
2944 }
2945 #endif  // HAS_I400TOARGBROW_SSE2
2946 
2947 #ifdef HAS_I400TOARGBROW_AVX2
2948 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2949 // note: vpunpcklbw mutates and vpackuswb unmutates.
2950 __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
2951                                           uint8* rgb_buf,
2952                                           int width) {
2953   __asm {
2954     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
2955     vmovd      xmm2, eax
2956     vbroadcastss ymm2, xmm2
2957     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
2958     vmovd      xmm3, eax
2959     vbroadcastss ymm3, xmm3
2960     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
2961     vpslld     ymm4, ymm4, 24
2962 
2963     mov        eax, [esp + 4]  // Y
2964     mov        edx, [esp + 8]  // rgb
2965     mov        ecx, [esp + 12]  // width
2966 
2967  convertloop:
2968         // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
2969     vmovdqu    xmm0, [eax]
2970     lea        eax, [eax + 16]
2971     vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
2972     vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
2973     vpmulhuw   ymm0, ymm0, ymm2
2974     vpsubusw   ymm0, ymm0, ymm3
2975     vpsrlw     ymm0, ymm0, 6
2976     vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
2977 
2978     // TODO(fbarchard): Weave alpha with unpack.
2979     // Step 2: Weave into ARGB
2980     vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
2981     vpermq     ymm1, ymm1, 0xd8
2982     vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
2983     vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
2984     vpor       ymm0, ymm0, ymm4
2985     vpor       ymm1, ymm1, ymm4
2986     vmovdqu    [edx], ymm0
2987     vmovdqu    [edx + 32], ymm1
2988     lea        edx,  [edx + 64]
2989     sub        ecx, 16
2990     jg         convertloop
2991     vzeroupper
2992     ret
2993   }
2994 }
2995 #endif  // HAS_I400TOARGBROW_AVX2
2996 
2997 #ifdef HAS_MIRRORROW_SSSE3
2998 // Shuffle table for reversing the bytes.
2999 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3000                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
3001 
3002 // TODO(fbarchard): Replace lea with -16 offset.
3003 __declspec(naked) void MirrorRow_SSSE3(const uint8* src,
3004                                        uint8* dst,
3005                                        int width) {
3006   __asm {
3007     mov       eax, [esp + 4]  // src
3008     mov       edx, [esp + 8]  // dst
3009     mov       ecx, [esp + 12]  // width
3010     movdqa    xmm5, xmmword ptr kShuffleMirror
3011 
3012  convertloop:
3013     movdqu    xmm0, [eax - 16 + ecx]
3014     pshufb    xmm0, xmm5
3015     movdqu    [edx], xmm0
3016     lea       edx, [edx + 16]
3017     sub       ecx, 16
3018     jg        convertloop
3019     ret
3020   }
3021 }
3022 #endif  // HAS_MIRRORROW_SSSE3
3023 
3024 #ifdef HAS_MIRRORROW_AVX2
3025 __declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3026   __asm {
3027     mov       eax, [esp + 4]  // src
3028     mov       edx, [esp + 8]  // dst
3029     mov       ecx, [esp + 12]  // width
3030     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3031 
3032  convertloop:
3033     vmovdqu   ymm0, [eax - 32 + ecx]
3034     vpshufb   ymm0, ymm0, ymm5
3035     vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
3036     vmovdqu   [edx], ymm0
3037     lea       edx, [edx + 32]
3038     sub       ecx, 32
3039     jg        convertloop
3040     vzeroupper
3041     ret
3042   }
3043 }
3044 #endif  // HAS_MIRRORROW_AVX2
3045 
3046 #ifdef HAS_MIRRORUVROW_SSSE3
3047 // Shuffle table for reversing the bytes of UV channels.
3048 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3049                                        15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3050 
3051 __declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
3052                                          uint8* dst_u,
3053                                          uint8* dst_v,
3054                                          int width) {
3055   __asm {
3056     push      edi
3057     mov       eax, [esp + 4 + 4]  // src
3058     mov       edx, [esp + 4 + 8]  // dst_u
3059     mov       edi, [esp + 4 + 12]  // dst_v
3060     mov       ecx, [esp + 4 + 16]  // width
3061     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
3062     lea       eax, [eax + ecx * 2 - 16]
3063     sub       edi, edx
3064 
3065  convertloop:
3066     movdqu    xmm0, [eax]
3067     lea       eax, [eax - 16]
3068     pshufb    xmm0, xmm1
3069     movlpd    qword ptr [edx], xmm0
3070     movhpd    qword ptr [edx + edi], xmm0
3071     lea       edx, [edx + 8]
3072     sub       ecx, 8
3073     jg        convertloop
3074 
3075     pop       edi
3076     ret
3077   }
3078 }
3079 #endif  // HAS_MIRRORUVROW_SSSE3
3080 
3081 #ifdef HAS_ARGBMIRRORROW_SSE2
3082 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
3083                                           uint8* dst,
3084                                           int width) {
3085   __asm {
3086     mov       eax, [esp + 4]  // src
3087     mov       edx, [esp + 8]  // dst
3088     mov       ecx, [esp + 12]  // width
3089     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3090 
3091  convertloop:
3092     movdqu    xmm0, [eax]
3093     lea       eax, [eax - 16]
3094     pshufd    xmm0, xmm0, 0x1b
3095     movdqu    [edx], xmm0
3096     lea       edx, [edx + 16]
3097     sub       ecx, 4
3098     jg        convertloop
3099     ret
3100   }
3101 }
3102 #endif  // HAS_ARGBMIRRORROW_SSE2
3103 
3104 #ifdef HAS_ARGBMIRRORROW_AVX2
3105 // Shuffle table for reversing the bytes.
3106 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3107 
3108 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
3109                                           uint8* dst,
3110                                           int width) {
3111   __asm {
3112     mov       eax, [esp + 4]  // src
3113     mov       edx, [esp + 8]  // dst
3114     mov       ecx, [esp + 12]  // width
3115     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3116 
3117  convertloop:
3118     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3119     vmovdqu   [edx], ymm0
3120     lea       edx, [edx + 32]
3121     sub       ecx, 8
3122     jg        convertloop
3123     vzeroupper
3124     ret
3125   }
3126 }
3127 #endif  // HAS_ARGBMIRRORROW_AVX2
3128 
3129 #ifdef HAS_SPLITUVROW_SSE2
3130 __declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
3131                                        uint8* dst_u,
3132                                        uint8* dst_v,
3133                                        int width) {
3134   __asm {
3135     push       edi
3136     mov        eax, [esp + 4 + 4]  // src_uv
3137     mov        edx, [esp + 4 + 8]  // dst_u
3138     mov        edi, [esp + 4 + 12]  // dst_v
3139     mov        ecx, [esp + 4 + 16]  // width
3140     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3141     psrlw      xmm5, 8
3142     sub        edi, edx
3143 
3144   convertloop:
3145     movdqu     xmm0, [eax]
3146     movdqu     xmm1, [eax + 16]
3147     lea        eax,  [eax + 32]
3148     movdqa     xmm2, xmm0
3149     movdqa     xmm3, xmm1
3150     pand       xmm0, xmm5  // even bytes
3151     pand       xmm1, xmm5
3152     packuswb   xmm0, xmm1
3153     psrlw      xmm2, 8  // odd bytes
3154     psrlw      xmm3, 8
3155     packuswb   xmm2, xmm3
3156     movdqu     [edx], xmm0
3157     movdqu     [edx + edi], xmm2
3158     lea        edx, [edx + 16]
3159     sub        ecx, 16
3160     jg         convertloop
3161 
3162     pop        edi
3163     ret
3164   }
3165 }
3166 
3167 #endif  // HAS_SPLITUVROW_SSE2
3168 
3169 #ifdef HAS_SPLITUVROW_AVX2
3170 __declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
3171                                        uint8* dst_u,
3172                                        uint8* dst_v,
3173                                        int width) {
3174   __asm {
3175     push       edi
3176     mov        eax, [esp + 4 + 4]  // src_uv
3177     mov        edx, [esp + 4 + 8]  // dst_u
3178     mov        edi, [esp + 4 + 12]  // dst_v
3179     mov        ecx, [esp + 4 + 16]  // width
3180     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3181     vpsrlw     ymm5, ymm5, 8
3182     sub        edi, edx
3183 
3184   convertloop:
3185     vmovdqu    ymm0, [eax]
3186     vmovdqu    ymm1, [eax + 32]
3187     lea        eax,  [eax + 64]
3188     vpsrlw     ymm2, ymm0, 8  // odd bytes
3189     vpsrlw     ymm3, ymm1, 8
3190     vpand      ymm0, ymm0, ymm5  // even bytes
3191     vpand      ymm1, ymm1, ymm5
3192     vpackuswb  ymm0, ymm0, ymm1
3193     vpackuswb  ymm2, ymm2, ymm3
3194     vpermq     ymm0, ymm0, 0xd8
3195     vpermq     ymm2, ymm2, 0xd8
3196     vmovdqu    [edx], ymm0
3197     vmovdqu    [edx + edi], ymm2
3198     lea        edx, [edx + 32]
3199     sub        ecx, 32
3200     jg         convertloop
3201 
3202     pop        edi
3203     vzeroupper
3204     ret
3205   }
3206 }
3207 #endif  // HAS_SPLITUVROW_AVX2
3208 
3209 #ifdef HAS_MERGEUVROW_SSE2
3210 __declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
3211                                        const uint8* src_v,
3212                                        uint8* dst_uv,
3213                                        int width) {
3214   __asm {
3215     push       edi
3216     mov        eax, [esp + 4 + 4]  // src_u
3217     mov        edx, [esp + 4 + 8]  // src_v
3218     mov        edi, [esp + 4 + 12]  // dst_uv
3219     mov        ecx, [esp + 4 + 16]  // width
3220     sub        edx, eax
3221 
3222   convertloop:
3223     movdqu     xmm0, [eax]  // read 16 U's
3224     movdqu     xmm1, [eax + edx]  // and 16 V's
3225     lea        eax,  [eax + 16]
3226     movdqa     xmm2, xmm0
3227     punpcklbw  xmm0, xmm1  // first 8 UV pairs
3228     punpckhbw  xmm2, xmm1  // next 8 UV pairs
3229     movdqu     [edi], xmm0
3230     movdqu     [edi + 16], xmm2
3231     lea        edi, [edi + 32]
3232     sub        ecx, 16
3233     jg         convertloop
3234 
3235     pop        edi
3236     ret
3237   }
3238 }
3239 #endif  //  HAS_MERGEUVROW_SSE2
3240 
3241 #ifdef HAS_MERGEUVROW_AVX2
3242 __declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
3243                                        const uint8* src_v,
3244                                        uint8* dst_uv,
3245                                        int width) {
3246   __asm {
3247     push       edi
3248     mov        eax, [esp + 4 + 4]  // src_u
3249     mov        edx, [esp + 4 + 8]  // src_v
3250     mov        edi, [esp + 4 + 12]  // dst_uv
3251     mov        ecx, [esp + 4 + 16]  // width
3252     sub        edx, eax
3253 
3254   convertloop:
3255     vmovdqu    ymm0, [eax]  // read 32 U's
3256     vmovdqu    ymm1, [eax + edx]  // and 32 V's
3257     lea        eax,  [eax + 32]
3258     vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
3259     vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
3260     vextractf128 [edi], ymm2, 0  // bytes 0..15
3261     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
3262     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
3263     vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
3264     lea        edi, [edi + 64]
3265     sub        ecx, 32
3266     jg         convertloop
3267 
3268     pop        edi
3269     vzeroupper
3270     ret
3271   }
3272 }
3273 #endif  //  HAS_MERGEUVROW_AVX2
3274 
3275 #ifdef HAS_COPYROW_SSE2
3276 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
3277 __declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3278   __asm {
3279     mov        eax, [esp + 4]  // src
3280     mov        edx, [esp + 8]  // dst
3281     mov        ecx, [esp + 12]  // count
3282     test       eax, 15
3283     jne        convertloopu
3284     test       edx, 15
3285     jne        convertloopu
3286 
3287   convertloopa:
3288     movdqa     xmm0, [eax]
3289     movdqa     xmm1, [eax + 16]
3290     lea        eax, [eax + 32]
3291     movdqa     [edx], xmm0
3292     movdqa     [edx + 16], xmm1
3293     lea        edx, [edx + 32]
3294     sub        ecx, 32
3295     jg         convertloopa
3296     ret
3297 
3298   convertloopu:
3299     movdqu     xmm0, [eax]
3300     movdqu     xmm1, [eax + 16]
3301     lea        eax, [eax + 32]
3302     movdqu     [edx], xmm0
3303     movdqu     [edx + 16], xmm1
3304     lea        edx, [edx + 32]
3305     sub        ecx, 32
3306     jg         convertloopu
3307     ret
3308   }
3309 }
3310 #endif  // HAS_COPYROW_SSE2
3311 
3312 #ifdef HAS_COPYROW_AVX
3313 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
3314 __declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3315   __asm {
3316     mov        eax, [esp + 4]  // src
3317     mov        edx, [esp + 8]  // dst
3318     mov        ecx, [esp + 12]  // count
3319 
3320   convertloop:
3321     vmovdqu    ymm0, [eax]
3322     vmovdqu    ymm1, [eax + 32]
3323     lea        eax, [eax + 64]
3324     vmovdqu    [edx], ymm0
3325     vmovdqu    [edx + 32], ymm1
3326     lea        edx, [edx + 64]
3327     sub        ecx, 64
3328     jg         convertloop
3329 
3330     vzeroupper
3331     ret
3332   }
3333 }
3334 #endif  // HAS_COPYROW_AVX
3335 
3336 // Multiple of 1.
3337 __declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3338   __asm {
3339     mov        eax, esi
3340     mov        edx, edi
3341     mov        esi, [esp + 4]  // src
3342     mov        edi, [esp + 8]  // dst
3343     mov        ecx, [esp + 12]  // count
3344     rep movsb
3345     mov        edi, edx
3346     mov        esi, eax
3347     ret
3348   }
3349 }
3350 
3351 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3352 // width in pixels
3353 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
3354                                              uint8* dst,
3355                                              int width) {
3356   __asm {
3357     mov        eax, [esp + 4]  // src
3358     mov        edx, [esp + 8]  // dst
3359     mov        ecx, [esp + 12]  // count
3360     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3361     pslld      xmm0, 24
3362     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3363     psrld      xmm1, 8
3364 
3365   convertloop:
3366     movdqu     xmm2, [eax]
3367     movdqu     xmm3, [eax + 16]
3368     lea        eax, [eax + 32]
3369     movdqu     xmm4, [edx]
3370     movdqu     xmm5, [edx + 16]
3371     pand       xmm2, xmm0
3372     pand       xmm3, xmm0
3373     pand       xmm4, xmm1
3374     pand       xmm5, xmm1
3375     por        xmm2, xmm4
3376     por        xmm3, xmm5
3377     movdqu     [edx], xmm2
3378     movdqu     [edx + 16], xmm3
3379     lea        edx, [edx + 32]
3380     sub        ecx, 8
3381     jg         convertloop
3382 
3383     ret
3384   }
3385 }
3386 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
3387 
3388 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3389 // width in pixels
3390 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
3391                                              uint8* dst,
3392                                              int width) {
3393   __asm {
3394     mov        eax, [esp + 4]  // src
3395     mov        edx, [esp + 8]  // dst
3396     mov        ecx, [esp + 12]  // count
3397     vpcmpeqb   ymm0, ymm0, ymm0
3398     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3399 
3400   convertloop:
3401     vmovdqu    ymm1, [eax]
3402     vmovdqu    ymm2, [eax + 32]
3403     lea        eax, [eax + 64]
3404     vpblendvb  ymm1, ymm1, [edx], ymm0
3405     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3406     vmovdqu    [edx], ymm1
3407     vmovdqu    [edx + 32], ymm2
3408     lea        edx, [edx + 64]
3409     sub        ecx, 16
3410     jg         convertloop
3411 
3412     vzeroupper
3413     ret
3414   }
3415 }
3416 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
3417 
3418 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3419 // width in pixels
3420 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
3421                                                 uint8* dst_a,
3422                                                 int width) {
3423   __asm {
3424     mov        eax, [esp + 4]  // src_argb
3425     mov        edx, [esp + 8]  // dst_a
3426     mov        ecx, [esp + 12]  // width
3427 
3428   extractloop:
3429     movdqu     xmm0, [eax]
3430     movdqu     xmm1, [eax + 16]
3431     lea        eax, [eax + 32]
3432     psrld      xmm0, 24
3433     psrld      xmm1, 24
3434     packssdw   xmm0, xmm1
3435     packuswb   xmm0, xmm0
3436     movq       qword ptr [edx], xmm0
3437     lea        edx, [edx + 8]
3438     sub        ecx, 8
3439     jg         extractloop
3440 
3441     ret
3442   }
3443 }
3444 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
3445 
3446 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3447 // width in pixels
3448 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
3449                                                 uint8* dst_a,
3450                                                 int width) {
3451   __asm {
3452     mov        eax, [esp + 4]  // src_argb
3453     mov        edx, [esp + 8]  // dst_a
3454     mov        ecx, [esp + 12]  // width
3455     vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
3456 
3457   extractloop:
3458     vmovdqu    ymm0, [eax]
3459     vmovdqu    ymm1, [eax + 32]
3460     vpsrld     ymm0, ymm0, 24
3461     vpsrld     ymm1, ymm1, 24
3462     vmovdqu    ymm2, [eax + 64]
3463     vmovdqu    ymm3, [eax + 96]
3464     lea        eax, [eax + 128]
3465     vpackssdw  ymm0, ymm0, ymm1  // mutates
3466     vpsrld     ymm2, ymm2, 24
3467     vpsrld     ymm3, ymm3, 24
3468     vpackssdw  ymm2, ymm2, ymm3  // mutates
3469     vpackuswb  ymm0, ymm0, ymm2  // mutates
3470     vpermd     ymm0, ymm4, ymm0  // unmutate
3471     vmovdqu    [edx], ymm0
3472     lea        edx, [edx + 32]
3473     sub        ecx, 32
3474     jg         extractloop
3475 
3476     vzeroupper
3477     ret
3478   }
3479 }
3480 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
3481 
3482 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3483 // width in pixels
3484 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
3485                                                 uint8* dst,
3486                                                 int width) {
3487   __asm {
3488     mov        eax, [esp + 4]  // src
3489     mov        edx, [esp + 8]  // dst
3490     mov        ecx, [esp + 12]  // count
3491     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3492     pslld      xmm0, 24
3493     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3494     psrld      xmm1, 8
3495 
3496   convertloop:
3497     movq       xmm2, qword ptr [eax]  // 8 Y's
3498     lea        eax, [eax + 8]
3499     punpcklbw  xmm2, xmm2
3500     punpckhwd  xmm3, xmm2
3501     punpcklwd  xmm2, xmm2
3502     movdqu     xmm4, [edx]
3503     movdqu     xmm5, [edx + 16]
3504     pand       xmm2, xmm0
3505     pand       xmm3, xmm0
3506     pand       xmm4, xmm1
3507     pand       xmm5, xmm1
3508     por        xmm2, xmm4
3509     por        xmm3, xmm5
3510     movdqu     [edx], xmm2
3511     movdqu     [edx + 16], xmm3
3512     lea        edx, [edx + 32]
3513     sub        ecx, 8
3514     jg         convertloop
3515 
3516     ret
3517   }
3518 }
3519 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3520 
3521 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3522 // width in pixels
3523 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
3524                                                 uint8* dst,
3525                                                 int width) {
3526   __asm {
3527     mov        eax, [esp + 4]  // src
3528     mov        edx, [esp + 8]  // dst
3529     mov        ecx, [esp + 12]  // count
3530     vpcmpeqb   ymm0, ymm0, ymm0
3531     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3532 
3533   convertloop:
3534     vpmovzxbd  ymm1, qword ptr [eax]
3535     vpmovzxbd  ymm2, qword ptr [eax + 8]
3536     lea        eax, [eax + 16]
3537     vpslld     ymm1, ymm1, 24
3538     vpslld     ymm2, ymm2, 24
3539     vpblendvb  ymm1, ymm1, [edx], ymm0
3540     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3541     vmovdqu    [edx], ymm1
3542     vmovdqu    [edx + 32], ymm2
3543     lea        edx, [edx + 64]
3544     sub        ecx, 16
3545     jg         convertloop
3546 
3547     vzeroupper
3548     ret
3549   }
3550 }
3551 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3552 
3553 #ifdef HAS_SETROW_X86
3554 // Write 'count' bytes using an 8 bit value repeated.
3555 // Count should be multiple of 4.
3556 __declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
3557   __asm {
3558     movzx      eax, byte ptr [esp + 8]  // v8
3559     mov        edx, 0x01010101  // Duplicate byte to all bytes.
3560     mul        edx  // overwrites edx with upper part of result.
3561     mov        edx, edi
3562     mov        edi, [esp + 4]  // dst
3563     mov        ecx, [esp + 12]  // count
3564     shr        ecx, 2
3565     rep stosd
3566     mov        edi, edx
3567     ret
3568   }
3569 }
3570 
3571 // Write 'count' bytes using an 8 bit value repeated.
3572 __declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3573   __asm {
3574     mov        edx, edi
3575     mov        edi, [esp + 4]  // dst
3576     mov        eax, [esp + 8]  // v8
3577     mov        ecx, [esp + 12]  // count
3578     rep stosb
3579     mov        edi, edx
3580     ret
3581   }
3582 }
3583 
3584 // Write 'count' 32 bit values.
3585 __declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3586   __asm {
3587     mov        edx, edi
3588     mov        edi, [esp + 4]  // dst
3589     mov        eax, [esp + 8]  // v32
3590     mov        ecx, [esp + 12]  // count
3591     rep stosd
3592     mov        edi, edx
3593     ret
3594   }
3595 }
3596 #endif  // HAS_SETROW_X86
3597 
3598 #ifdef HAS_YUY2TOYROW_AVX2
3599 __declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3600                                        uint8* dst_y,
3601                                        int width) {
3602   __asm {
3603     mov        eax, [esp + 4]  // src_yuy2
3604     mov        edx, [esp + 8]  // dst_y
3605     mov        ecx, [esp + 12]  // width
3606     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3607     vpsrlw     ymm5, ymm5, 8
3608 
3609   convertloop:
3610     vmovdqu    ymm0, [eax]
3611     vmovdqu    ymm1, [eax + 32]
3612     lea        eax,  [eax + 64]
3613     vpand      ymm0, ymm0, ymm5  // even bytes are Y
3614     vpand      ymm1, ymm1, ymm5
3615     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3616     vpermq     ymm0, ymm0, 0xd8
3617     vmovdqu    [edx], ymm0
3618     lea        edx, [edx + 32]
3619     sub        ecx, 32
3620     jg         convertloop
3621     vzeroupper
3622     ret
3623   }
3624 }
3625 
3626 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
3627                                         int stride_yuy2,
3628                                         uint8* dst_u,
3629                                         uint8* dst_v,
3630                                         int width) {
3631   __asm {
3632     push       esi
3633     push       edi
3634     mov        eax, [esp + 8 + 4]  // src_yuy2
3635     mov        esi, [esp + 8 + 8]  // stride_yuy2
3636     mov        edx, [esp + 8 + 12]  // dst_u
3637     mov        edi, [esp + 8 + 16]  // dst_v
3638     mov        ecx, [esp + 8 + 20]  // width
3639     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3640     vpsrlw     ymm5, ymm5, 8
3641     sub        edi, edx
3642 
3643   convertloop:
3644     vmovdqu    ymm0, [eax]
3645     vmovdqu    ymm1, [eax + 32]
3646     vpavgb     ymm0, ymm0, [eax + esi]
3647     vpavgb     ymm1, ymm1, [eax + esi + 32]
3648     lea        eax,  [eax + 64]
3649     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3650     vpsrlw     ymm1, ymm1, 8
3651     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3652     vpermq     ymm0, ymm0, 0xd8
3653     vpand      ymm1, ymm0, ymm5  // U
3654     vpsrlw     ymm0, ymm0, 8  // V
3655     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3656     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3657     vpermq     ymm1, ymm1, 0xd8
3658     vpermq     ymm0, ymm0, 0xd8
3659     vextractf128 [edx], ymm1, 0  // U
3660     vextractf128 [edx + edi], ymm0, 0  // V
3661     lea        edx, [edx + 16]
3662     sub        ecx, 32
3663     jg         convertloop
3664 
3665     pop        edi
3666     pop        esi
3667     vzeroupper
3668     ret
3669   }
3670 }
3671 
3672 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3673                                            uint8* dst_u,
3674                                            uint8* dst_v,
3675                                            int width) {
3676   __asm {
3677     push       edi
3678     mov        eax, [esp + 4 + 4]  // src_yuy2
3679     mov        edx, [esp + 4 + 8]  // dst_u
3680     mov        edi, [esp + 4 + 12]  // dst_v
3681     mov        ecx, [esp + 4 + 16]  // width
3682     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3683     vpsrlw     ymm5, ymm5, 8
3684     sub        edi, edx
3685 
3686   convertloop:
3687     vmovdqu    ymm0, [eax]
3688     vmovdqu    ymm1, [eax + 32]
3689     lea        eax,  [eax + 64]
3690     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3691     vpsrlw     ymm1, ymm1, 8
3692     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3693     vpermq     ymm0, ymm0, 0xd8
3694     vpand      ymm1, ymm0, ymm5  // U
3695     vpsrlw     ymm0, ymm0, 8  // V
3696     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3697     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3698     vpermq     ymm1, ymm1, 0xd8
3699     vpermq     ymm0, ymm0, 0xd8
3700     vextractf128 [edx], ymm1, 0  // U
3701     vextractf128 [edx + edi], ymm0, 0  // V
3702     lea        edx, [edx + 16]
3703     sub        ecx, 32
3704     jg         convertloop
3705 
3706     pop        edi
3707     vzeroupper
3708     ret
3709   }
3710 }
3711 
3712 __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
3713                                        uint8* dst_y,
3714                                        int width) {
3715   __asm {
3716     mov        eax, [esp + 4]  // src_uyvy
3717     mov        edx, [esp + 8]  // dst_y
3718     mov        ecx, [esp + 12]  // width
3719 
3720   convertloop:
3721     vmovdqu    ymm0, [eax]
3722     vmovdqu    ymm1, [eax + 32]
3723     lea        eax,  [eax + 64]
3724     vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
3725     vpsrlw     ymm1, ymm1, 8
3726     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3727     vpermq     ymm0, ymm0, 0xd8
3728     vmovdqu    [edx], ymm0
3729     lea        edx, [edx + 32]
3730     sub        ecx, 32
3731     jg         convertloop
3732     vzeroupper
3733     ret
3734   }
3735 }
3736 
3737 __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
3738                                         int stride_uyvy,
3739                                         uint8* dst_u,
3740                                         uint8* dst_v,
3741                                         int width) {
3742   __asm {
3743     push       esi
3744     push       edi
3745     mov        eax, [esp + 8 + 4]  // src_yuy2
3746     mov        esi, [esp + 8 + 8]  // stride_yuy2
3747     mov        edx, [esp + 8 + 12]  // dst_u
3748     mov        edi, [esp + 8 + 16]  // dst_v
3749     mov        ecx, [esp + 8 + 20]  // width
3750     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3751     vpsrlw     ymm5, ymm5, 8
3752     sub        edi, edx
3753 
3754   convertloop:
3755     vmovdqu    ymm0, [eax]
3756     vmovdqu    ymm1, [eax + 32]
3757     vpavgb     ymm0, ymm0, [eax + esi]
3758     vpavgb     ymm1, ymm1, [eax + esi + 32]
3759     lea        eax,  [eax + 64]
3760     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
3761     vpand      ymm1, ymm1, ymm5
3762     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3763     vpermq     ymm0, ymm0, 0xd8
3764     vpand      ymm1, ymm0, ymm5  // U
3765     vpsrlw     ymm0, ymm0, 8  // V
3766     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3767     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3768     vpermq     ymm1, ymm1, 0xd8
3769     vpermq     ymm0, ymm0, 0xd8
3770     vextractf128 [edx], ymm1, 0  // U
3771     vextractf128 [edx + edi], ymm0, 0  // V
3772     lea        edx, [edx + 16]
3773     sub        ecx, 32
3774     jg         convertloop
3775 
3776     pop        edi
3777     pop        esi
3778     vzeroupper
3779     ret
3780   }
3781 }
3782 
3783 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3784                                            uint8* dst_u,
3785                                            uint8* dst_v,
3786                                            int width) {
3787   __asm {
3788     push       edi
3789     mov        eax, [esp + 4 + 4]  // src_yuy2
3790     mov        edx, [esp + 4 + 8]  // dst_u
3791     mov        edi, [esp + 4 + 12]  // dst_v
3792     mov        ecx, [esp + 4 + 16]  // width
3793     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3794     vpsrlw     ymm5, ymm5, 8
3795     sub        edi, edx
3796 
3797   convertloop:
3798     vmovdqu    ymm0, [eax]
3799     vmovdqu    ymm1, [eax + 32]
3800     lea        eax,  [eax + 64]
3801     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
3802     vpand      ymm1, ymm1, ymm5
3803     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3804     vpermq     ymm0, ymm0, 0xd8
3805     vpand      ymm1, ymm0, ymm5  // U
3806     vpsrlw     ymm0, ymm0, 8  // V
3807     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3808     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3809     vpermq     ymm1, ymm1, 0xd8
3810     vpermq     ymm0, ymm0, 0xd8
3811     vextractf128 [edx], ymm1, 0  // U
3812     vextractf128 [edx + edi], ymm0, 0  // V
3813     lea        edx, [edx + 16]
3814     sub        ecx, 32
3815     jg         convertloop
3816 
3817     pop        edi
3818     vzeroupper
3819     ret
3820   }
3821 }
3822 #endif  // HAS_YUY2TOYROW_AVX2
3823 
3824 #ifdef HAS_YUY2TOYROW_SSE2
3825 __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3826                                        uint8* dst_y,
3827                                        int width) {
3828   __asm {
3829     mov        eax, [esp + 4]  // src_yuy2
3830     mov        edx, [esp + 8]  // dst_y
3831     mov        ecx, [esp + 12]  // width
3832     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3833     psrlw      xmm5, 8
3834 
3835   convertloop:
3836     movdqu     xmm0, [eax]
3837     movdqu     xmm1, [eax + 16]
3838     lea        eax,  [eax + 32]
3839     pand       xmm0, xmm5  // even bytes are Y
3840     pand       xmm1, xmm5
3841     packuswb   xmm0, xmm1
3842     movdqu     [edx], xmm0
3843     lea        edx, [edx + 16]
3844     sub        ecx, 16
3845     jg         convertloop
3846     ret
3847   }
3848 }
3849 
3850 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
3851                                         int stride_yuy2,
3852                                         uint8* dst_u,
3853                                         uint8* dst_v,
3854                                         int width) {
3855   __asm {
3856     push       esi
3857     push       edi
3858     mov        eax, [esp + 8 + 4]  // src_yuy2
3859     mov        esi, [esp + 8 + 8]  // stride_yuy2
3860     mov        edx, [esp + 8 + 12]  // dst_u
3861     mov        edi, [esp + 8 + 16]  // dst_v
3862     mov        ecx, [esp + 8 + 20]  // width
3863     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3864     psrlw      xmm5, 8
3865     sub        edi, edx
3866 
3867   convertloop:
3868     movdqu     xmm0, [eax]
3869     movdqu     xmm1, [eax + 16]
3870     movdqu     xmm2, [eax + esi]
3871     movdqu     xmm3, [eax + esi + 16]
3872     lea        eax,  [eax + 32]
3873     pavgb      xmm0, xmm2
3874     pavgb      xmm1, xmm3
3875     psrlw      xmm0, 8  // YUYV -> UVUV
3876     psrlw      xmm1, 8
3877     packuswb   xmm0, xmm1
3878     movdqa     xmm1, xmm0
3879     pand       xmm0, xmm5  // U
3880     packuswb   xmm0, xmm0
3881     psrlw      xmm1, 8  // V
3882     packuswb   xmm1, xmm1
3883     movq       qword ptr [edx], xmm0
3884     movq       qword ptr [edx + edi], xmm1
3885     lea        edx, [edx + 8]
3886     sub        ecx, 16
3887     jg         convertloop
3888 
3889     pop        edi
3890     pop        esi
3891     ret
3892   }
3893 }
3894 
3895 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3896                                            uint8* dst_u,
3897                                            uint8* dst_v,
3898                                            int width) {
3899   __asm {
3900     push       edi
3901     mov        eax, [esp + 4 + 4]  // src_yuy2
3902     mov        edx, [esp + 4 + 8]  // dst_u
3903     mov        edi, [esp + 4 + 12]  // dst_v
3904     mov        ecx, [esp + 4 + 16]  // width
3905     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3906     psrlw      xmm5, 8
3907     sub        edi, edx
3908 
3909   convertloop:
3910     movdqu     xmm0, [eax]
3911     movdqu     xmm1, [eax + 16]
3912     lea        eax,  [eax + 32]
3913     psrlw      xmm0, 8  // YUYV -> UVUV
3914     psrlw      xmm1, 8
3915     packuswb   xmm0, xmm1
3916     movdqa     xmm1, xmm0
3917     pand       xmm0, xmm5  // U
3918     packuswb   xmm0, xmm0
3919     psrlw      xmm1, 8  // V
3920     packuswb   xmm1, xmm1
3921     movq       qword ptr [edx], xmm0
3922     movq       qword ptr [edx + edi], xmm1
3923     lea        edx, [edx + 8]
3924     sub        ecx, 16
3925     jg         convertloop
3926 
3927     pop        edi
3928     ret
3929   }
3930 }
3931 
3932 __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
3933                                        uint8* dst_y,
3934                                        int width) {
3935   __asm {
3936     mov        eax, [esp + 4]  // src_uyvy
3937     mov        edx, [esp + 8]  // dst_y
3938     mov        ecx, [esp + 12]  // width
3939 
3940   convertloop:
3941     movdqu     xmm0, [eax]
3942     movdqu     xmm1, [eax + 16]
3943     lea        eax,  [eax + 32]
3944     psrlw      xmm0, 8  // odd bytes are Y
3945     psrlw      xmm1, 8
3946     packuswb   xmm0, xmm1
3947     movdqu     [edx], xmm0
3948     lea        edx, [edx + 16]
3949     sub        ecx, 16
3950     jg         convertloop
3951     ret
3952   }
3953 }
3954 
3955 __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
3956                                         int stride_uyvy,
3957                                         uint8* dst_u,
3958                                         uint8* dst_v,
3959                                         int width) {
3960   __asm {
3961     push       esi
3962     push       edi
3963     mov        eax, [esp + 8 + 4]  // src_yuy2
3964     mov        esi, [esp + 8 + 8]  // stride_yuy2
3965     mov        edx, [esp + 8 + 12]  // dst_u
3966     mov        edi, [esp + 8 + 16]  // dst_v
3967     mov        ecx, [esp + 8 + 20]  // width
3968     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3969     psrlw      xmm5, 8
3970     sub        edi, edx
3971 
3972   convertloop:
3973     movdqu     xmm0, [eax]
3974     movdqu     xmm1, [eax + 16]
3975     movdqu     xmm2, [eax + esi]
3976     movdqu     xmm3, [eax + esi + 16]
3977     lea        eax,  [eax + 32]
3978     pavgb      xmm0, xmm2
3979     pavgb      xmm1, xmm3
3980     pand       xmm0, xmm5  // UYVY -> UVUV
3981     pand       xmm1, xmm5
3982     packuswb   xmm0, xmm1
3983     movdqa     xmm1, xmm0
3984     pand       xmm0, xmm5  // U
3985     packuswb   xmm0, xmm0
3986     psrlw      xmm1, 8  // V
3987     packuswb   xmm1, xmm1
3988     movq       qword ptr [edx], xmm0
3989     movq       qword ptr [edx + edi], xmm1
3990     lea        edx, [edx + 8]
3991     sub        ecx, 16
3992     jg         convertloop
3993 
3994     pop        edi
3995     pop        esi
3996     ret
3997   }
3998 }
3999 
4000 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4001                                            uint8* dst_u,
4002                                            uint8* dst_v,
4003                                            int width) {
4004   __asm {
4005     push       edi
4006     mov        eax, [esp + 4 + 4]  // src_yuy2
4007     mov        edx, [esp + 4 + 8]  // dst_u
4008     mov        edi, [esp + 4 + 12]  // dst_v
4009     mov        ecx, [esp + 4 + 16]  // width
4010     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4011     psrlw      xmm5, 8
4012     sub        edi, edx
4013 
4014   convertloop:
4015     movdqu     xmm0, [eax]
4016     movdqu     xmm1, [eax + 16]
4017     lea        eax,  [eax + 32]
4018     pand       xmm0, xmm5  // UYVY -> UVUV
4019     pand       xmm1, xmm5
4020     packuswb   xmm0, xmm1
4021     movdqa     xmm1, xmm0
4022     pand       xmm0, xmm5  // U
4023     packuswb   xmm0, xmm0
4024     psrlw      xmm1, 8  // V
4025     packuswb   xmm1, xmm1
4026     movq       qword ptr [edx], xmm0
4027     movq       qword ptr [edx + edi], xmm1
4028     lea        edx, [edx + 8]
4029     sub        ecx, 16
4030     jg         convertloop
4031 
4032     pop        edi
4033     ret
4034   }
4035 }
4036 #endif  // HAS_YUY2TOYROW_SSE2
4037 
4038 #ifdef HAS_BLENDPLANEROW_SSSE3
4039 // Blend 8 pixels at a time.
4040 // unsigned version of math
4041 // =((A2*C2)+(B2*(255-C2))+255)/256
4042 // signed version of math
4043 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4044 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
4045                                            const uint8* src1,
4046                                            const uint8* alpha,
4047                                            uint8* dst,
4048                                            int width) {
4049   __asm {
4050     push       esi
4051     push       edi
4052     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
4053     psllw      xmm5, 8
4054     mov        eax, 0x80808080  // 128 for biasing image to signed.
4055     movd       xmm6, eax
4056     pshufd     xmm6, xmm6, 0x00
4057 
4058     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
4059     movd       xmm7, eax
4060     pshufd     xmm7, xmm7, 0x00
4061     mov        eax, [esp + 8 + 4]  // src0
4062     mov        edx, [esp + 8 + 8]  // src1
4063     mov        esi, [esp + 8 + 12]  // alpha
4064     mov        edi, [esp + 8 + 16]  // dst
4065     mov        ecx, [esp + 8 + 20]  // width
4066     sub        eax, esi
4067     sub        edx, esi
4068     sub        edi, esi
4069 
4070     // 8 pixel loop.
4071   convertloop8:
4072     movq       xmm0, qword ptr [esi]  // alpha
4073     punpcklbw  xmm0, xmm0
4074     pxor       xmm0, xmm5  // a, 255-a
4075     movq       xmm1, qword ptr [eax + esi]  // src0
4076     movq       xmm2, qword ptr [edx + esi]  // src1
4077     punpcklbw  xmm1, xmm2
4078     psubb      xmm1, xmm6  // bias src0/1 - 128
4079     pmaddubsw  xmm0, xmm1
4080     paddw      xmm0, xmm7  // unbias result - 32768 and round.
4081     psrlw      xmm0, 8
4082     packuswb   xmm0, xmm0
4083     movq       qword ptr [edi + esi], xmm0
4084     lea        esi, [esi + 8]
4085     sub        ecx, 8
4086     jg         convertloop8
4087 
4088     pop        edi
4089     pop        esi
4090     ret
4091   }
4092 }
4093 #endif  // HAS_BLENDPLANEROW_SSSE3
4094 
4095 #ifdef HAS_BLENDPLANEROW_AVX2
4096 // Blend 32 pixels at a time.
4097 // unsigned version of math
4098 // =((A2*C2)+(B2*(255-C2))+255)/256
4099 // signed version of math
4100 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4101 __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
4102                                           const uint8* src1,
4103                                           const uint8* alpha,
4104                                           uint8* dst,
4105                                           int width) {
4106   __asm {
4107     push        esi
4108     push        edi
4109     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
4110     vpsllw      ymm5, ymm5, 8
4111     mov         eax, 0x80808080  // 128 for biasing image to signed.
4112     vmovd       xmm6, eax
4113     vbroadcastss ymm6, xmm6
4114     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
4115     vmovd       xmm7, eax
4116     vbroadcastss ymm7, xmm7
4117     mov         eax, [esp + 8 + 4]  // src0
4118     mov         edx, [esp + 8 + 8]  // src1
4119     mov         esi, [esp + 8 + 12]  // alpha
4120     mov         edi, [esp + 8 + 16]  // dst
4121     mov         ecx, [esp + 8 + 20]  // width
4122     sub         eax, esi
4123     sub         edx, esi
4124     sub         edi, esi
4125 
4126     // 32 pixel loop.
4127   convertloop32:
4128     vmovdqu     ymm0, [esi]  // alpha
4129     vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
4130     vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
4131     vpxor       ymm3, ymm3, ymm5  // a, 255-a
4132     vpxor       ymm0, ymm0, ymm5  // a, 255-a
4133     vmovdqu     ymm1, [eax + esi]  // src0
4134     vmovdqu     ymm2, [edx + esi]  // src1
4135     vpunpckhbw  ymm4, ymm1, ymm2
4136     vpunpcklbw  ymm1, ymm1, ymm2
4137     vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
4138     vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
4139     vpmaddubsw  ymm3, ymm3, ymm4
4140     vpmaddubsw  ymm0, ymm0, ymm1
4141     vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
4142     vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
4143     vpsrlw      ymm3, ymm3, 8
4144     vpsrlw      ymm0, ymm0, 8
4145     vpackuswb   ymm0, ymm0, ymm3
4146     vmovdqu     [edi + esi], ymm0
4147     lea         esi, [esi + 32]
4148     sub         ecx, 32
4149     jg          convertloop32
4150 
4151     pop         edi
4152     pop         esi
4153     vzeroupper
4154     ret
4155   }
4156 }
4157 #endif  // HAS_BLENDPLANEROW_AVX2
4158 
4159 #ifdef HAS_ARGBBLENDROW_SSSE3
4160 // Shuffle table for isolating alpha.
4161 static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
4162                                     11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4163 
4164 // Blend 8 pixels at a time.
4165 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
4166                                           const uint8* src_argb1,
4167                                           uint8* dst_argb,
4168                                           int width) {
4169   __asm {
4170     push       esi
4171     mov        eax, [esp + 4 + 4]  // src_argb0
4172     mov        esi, [esp + 4 + 8]  // src_argb1
4173     mov        edx, [esp + 4 + 12]  // dst_argb
4174     mov        ecx, [esp + 4 + 16]  // width
4175     pcmpeqb    xmm7, xmm7  // generate constant 0x0001
4176     psrlw      xmm7, 15
4177     pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
4178     psrlw      xmm6, 8
4179     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
4180     psllw      xmm5, 8
4181     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
4182     pslld      xmm4, 24
4183     sub        ecx, 4
4184     jl         convertloop4b  // less than 4 pixels?
4185 
4186     // 4 pixel loop.
4187   convertloop4:
4188     movdqu     xmm3, [eax]  // src argb
4189     lea        eax, [eax + 16]
4190     movdqa     xmm0, xmm3  // src argb
4191     pxor       xmm3, xmm4  // ~alpha
4192     movdqu     xmm2, [esi]  // _r_b
4193     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
4194     pand       xmm2, xmm6  // _r_b
4195     paddw      xmm3, xmm7  // 256 - alpha
4196     pmullw     xmm2, xmm3  // _r_b * alpha
4197     movdqu     xmm1, [esi]  // _a_g
4198     lea        esi, [esi + 16]
4199     psrlw      xmm1, 8  // _a_g
4200     por        xmm0, xmm4  // set alpha to 255
4201     pmullw     xmm1, xmm3  // _a_g * alpha
4202     psrlw      xmm2, 8  // _r_b convert to 8 bits again
4203     paddusb    xmm0, xmm2  // + src argb
4204     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
4205     paddusb    xmm0, xmm1  // + src argb
4206     movdqu     [edx], xmm0
4207     lea        edx, [edx + 16]
4208     sub        ecx, 4
4209     jge        convertloop4
4210 
4211   convertloop4b:
4212     add        ecx, 4 - 1
4213     jl         convertloop1b
4214 
4215     // 1 pixel loop.
4216   convertloop1:
4217     movd       xmm3, [eax]  // src argb
4218     lea        eax, [eax + 4]
4219     movdqa     xmm0, xmm3  // src argb
4220     pxor       xmm3, xmm4  // ~alpha
4221     movd       xmm2, [esi]  // _r_b
4222     pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
4223     pand       xmm2, xmm6  // _r_b
4224     paddw      xmm3, xmm7  // 256 - alpha
4225     pmullw     xmm2, xmm3  // _r_b * alpha
4226     movd       xmm1, [esi]  // _a_g
4227     lea        esi, [esi + 4]
4228     psrlw      xmm1, 8  // _a_g
4229     por        xmm0, xmm4  // set alpha to 255
4230     pmullw     xmm1, xmm3  // _a_g * alpha
4231     psrlw      xmm2, 8  // _r_b convert to 8 bits again
4232     paddusb    xmm0, xmm2  // + src argb
4233     pand       xmm1, xmm5  // a_g_ convert to 8 bits again
4234     paddusb    xmm0, xmm1  // + src argb
4235     movd       [edx], xmm0
4236     lea        edx, [edx + 4]
4237     sub        ecx, 1
4238     jge        convertloop1
4239 
4240   convertloop1b:
4241     pop        esi
4242     ret
4243   }
4244 }
4245 #endif  // HAS_ARGBBLENDROW_SSSE3
4246 
4247 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4248 // Shuffle table duplicating alpha.
4249 static const uvec8 kShuffleAlpha0 = {
4250     3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4251 };
4252 static const uvec8 kShuffleAlpha1 = {
4253     11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4254     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4255 };
4256 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
4257                                               uint8* dst_argb,
4258                                               int width) {
4259   __asm {
4260     mov        eax, [esp + 4]  // src_argb0
4261     mov        edx, [esp + 8]  // dst_argb
4262     mov        ecx, [esp + 12]  // width
4263     pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
4264     pslld      xmm3, 24
4265     movdqa     xmm4, xmmword ptr kShuffleAlpha0
4266     movdqa     xmm5, xmmword ptr kShuffleAlpha1
4267 
4268  convertloop:
4269     movdqu     xmm0, [eax]  // read 4 pixels
4270     pshufb     xmm0, xmm4  // isolate first 2 alphas
4271     movdqu     xmm1, [eax]  // read 4 pixels
4272     punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
4273     pmulhuw    xmm0, xmm1  // rgb * a
4274     movdqu     xmm1, [eax]  // read 4 pixels
4275     pshufb     xmm1, xmm5  // isolate next 2 alphas
4276     movdqu     xmm2, [eax]  // read 4 pixels
4277     punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
4278     pmulhuw    xmm1, xmm2  // rgb * a
4279     movdqu     xmm2, [eax]  // mask original alpha
4280     lea        eax, [eax + 16]
4281     pand       xmm2, xmm3
4282     psrlw      xmm0, 8
4283     psrlw      xmm1, 8
4284     packuswb   xmm0, xmm1
4285     por        xmm0, xmm2  // copy original alpha
4286     movdqu     [edx], xmm0
4287     lea        edx, [edx + 16]
4288     sub        ecx, 4
4289     jg         convertloop
4290 
4291     ret
4292   }
4293 }
4294 #endif  // HAS_ARGBATTENUATEROW_SSSE3
4295 
4296 #ifdef HAS_ARGBATTENUATEROW_AVX2
4297 // Shuffle table duplicating alpha.
4298 static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
4299                                          128u, 128u, 14u,  15u, 14u, 15u,
4300                                          14u,  15u,  128u, 128u};
4301 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
4302                                              uint8* dst_argb,
4303                                              int width) {
4304   __asm {
4305     mov        eax, [esp + 4]  // src_argb0
4306     mov        edx, [esp + 8]  // dst_argb
4307     mov        ecx, [esp + 12]  // width
4308     sub        edx, eax
4309     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4310     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
4311     vpslld     ymm5, ymm5, 24
4312 
4313  convertloop:
4314     vmovdqu    ymm6, [eax]  // read 8 pixels.
4315     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4316     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4317     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
4318     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
4319     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
4320     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
4321     vpand      ymm6, ymm6, ymm5  // isolate alpha
4322     vpsrlw     ymm0, ymm0, 8
4323     vpsrlw     ymm1, ymm1, 8
4324     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4325     vpor       ymm0, ymm0, ymm6  // copy original alpha
4326     vmovdqu    [eax + edx], ymm0
4327     lea        eax, [eax + 32]
4328     sub        ecx, 8
4329     jg         convertloop
4330 
4331     vzeroupper
4332     ret
4333   }
4334 }
4335 #endif  // HAS_ARGBATTENUATEROW_AVX2
4336 
4337 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4338 // Unattenuate 4 pixels at a time.
4339 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
4340                                                uint8* dst_argb,
4341                                                int width) {
4342   __asm {
4343     push       ebx
4344     push       esi
4345     push       edi
4346     mov        eax, [esp + 12 + 4]  // src_argb
4347     mov        edx, [esp + 12 + 8]  // dst_argb
4348     mov        ecx, [esp + 12 + 12]  // width
4349     lea        ebx, fixed_invtbl8
4350 
4351  convertloop:
4352     movdqu     xmm0, [eax]  // read 4 pixels
4353     movzx      esi, byte ptr [eax + 3]  // first alpha
4354     movzx      edi, byte ptr [eax + 7]  // second alpha
4355     punpcklbw  xmm0, xmm0  // first 2
4356     movd       xmm2, dword ptr [ebx + esi * 4]
4357     movd       xmm3, dword ptr [ebx + edi * 4]
4358     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
4359     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
4360     movlhps    xmm2, xmm3
4361     pmulhuw    xmm0, xmm2  // rgb * a
4362 
4363     movdqu     xmm1, [eax]  // read 4 pixels
4364     movzx      esi, byte ptr [eax + 11]  // third alpha
4365     movzx      edi, byte ptr [eax + 15]  // forth alpha
4366     punpckhbw  xmm1, xmm1  // next 2
4367     movd       xmm2, dword ptr [ebx + esi * 4]
4368     movd       xmm3, dword ptr [ebx + edi * 4]
4369     pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
4370     pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
4371     movlhps    xmm2, xmm3
4372     pmulhuw    xmm1, xmm2  // rgb * a
4373     lea        eax, [eax + 16]
4374     packuswb   xmm0, xmm1
4375     movdqu     [edx], xmm0
4376     lea        edx, [edx + 16]
4377     sub        ecx, 4
4378     jg         convertloop
4379 
4380     pop        edi
4381     pop        esi
4382     pop        ebx
4383     ret
4384   }
4385 }
4386 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
4387 
4388 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4389 // Shuffle table duplicating alpha.
4390 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4391     0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4392 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4393 // USE_GATHER is not on by default, due to being a slow instruction.
4394 #ifdef USE_GATHER
4395 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
4396                                                uint8* dst_argb,
4397                                                int width) {
4398   __asm {
4399     mov        eax, [esp + 4]  // src_argb0
4400     mov        edx, [esp + 8]  // dst_argb
4401     mov        ecx, [esp + 12]  // width
4402     sub        edx, eax
4403     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4404 
4405  convertloop:
4406     vmovdqu    ymm6, [eax]  // read 8 pixels.
4407     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
4408     vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
4409     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4410     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4411     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
4412     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4413     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4414     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
4415     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
4416     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4417     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4418     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4419     vmovdqu    [eax + edx], ymm0
4420     lea        eax, [eax + 32]
4421     sub        ecx, 8
4422     jg         convertloop
4423 
4424     vzeroupper
4425     ret
4426   }
4427 }
4428 #else   // USE_GATHER
4429 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
4430                                                uint8* dst_argb,
4431                                                int width) {
4432   __asm {
4433 
4434     push       ebx
4435     push       esi
4436     push       edi
4437     mov        eax, [esp + 12 + 4]  // src_argb
4438     mov        edx, [esp + 12 + 8]  // dst_argb
4439     mov        ecx, [esp + 12 + 12]  // width
4440     sub        edx, eax
4441     lea        ebx, fixed_invtbl8
4442     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4443 
4444  convertloop:
4445         // replace VPGATHER
4446     movzx      esi, byte ptr [eax + 3]  // alpha0
4447     movzx      edi, byte ptr [eax + 7]  // alpha1
4448     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
4449     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
4450     movzx      esi, byte ptr [eax + 11]  // alpha2
4451     movzx      edi, byte ptr [eax + 15]  // alpha3
4452     vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
4453     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
4454     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
4455     movzx      esi, byte ptr [eax + 19]  // alpha4
4456     movzx      edi, byte ptr [eax + 23]  // alpha5
4457     vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
4458     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
4459     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
4460     movzx      esi, byte ptr [eax + 27]  // alpha6
4461     movzx      edi, byte ptr [eax + 31]  // alpha7
4462     vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
4463     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
4464     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
4465     vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
4466     vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
4467     vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
4468     vinserti128 ymm3, ymm3, xmm0, 1                // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4469     // end of VPGATHER
4470 
4471     vmovdqu    ymm6, [eax]  // read 8 pixels.
4472     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4473     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4474     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4475     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4476     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
4477     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
4478     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4479     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4480     vpackuswb  ymm0, ymm0, ymm1             // unmutated.
4481     vmovdqu    [eax + edx], ymm0
4482     lea        eax, [eax + 32]
4483     sub        ecx, 8
4484     jg         convertloop
4485 
4486     pop        edi
4487     pop        esi
4488     pop        ebx
4489     vzeroupper
4490     ret
4491   }
4492 }
4493 #endif  // USE_GATHER
4494 #endif  // HAS_ARGBATTENUATEROW_AVX2
4495 
4496 #ifdef HAS_ARGBGRAYROW_SSSE3
4497 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4498 __declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
4499                                          uint8* dst_argb,
4500                                          int width) {
4501   __asm {
4502     mov        eax, [esp + 4] /* src_argb */
4503     mov        edx, [esp + 8] /* dst_argb */
4504     mov        ecx, [esp + 12] /* width */
4505     movdqa     xmm4, xmmword ptr kARGBToYJ
4506     movdqa     xmm5, xmmword ptr kAddYJ64
4507 
4508  convertloop:
4509     movdqu     xmm0, [eax]  // G
4510     movdqu     xmm1, [eax + 16]
4511     pmaddubsw  xmm0, xmm4
4512     pmaddubsw  xmm1, xmm4
4513     phaddw     xmm0, xmm1
4514     paddw      xmm0, xmm5  // Add .5 for rounding.
4515     psrlw      xmm0, 7
4516     packuswb   xmm0, xmm0  // 8 G bytes
4517     movdqu     xmm2, [eax]  // A
4518     movdqu     xmm3, [eax + 16]
4519     lea        eax, [eax + 32]
4520     psrld      xmm2, 24
4521     psrld      xmm3, 24
4522     packuswb   xmm2, xmm3
4523     packuswb   xmm2, xmm2  // 8 A bytes
4524     movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
4525     punpcklbw  xmm0, xmm0  // 8 GG words
4526     punpcklbw  xmm3, xmm2  // 8 GA words
4527     movdqa     xmm1, xmm0
4528     punpcklwd  xmm0, xmm3  // GGGA first 4
4529     punpckhwd  xmm1, xmm3  // GGGA next 4
4530     movdqu     [edx], xmm0
4531     movdqu     [edx + 16], xmm1
4532     lea        edx, [edx + 32]
4533     sub        ecx, 8
4534     jg         convertloop
4535     ret
4536   }
4537 }
4538 #endif  // HAS_ARGBGRAYROW_SSSE3
4539 
4540 #ifdef HAS_ARGBSEPIAROW_SSSE3
4541 //    b = (r * 35 + g * 68 + b * 17) >> 7
4542 //    g = (r * 45 + g * 88 + b * 22) >> 7
4543 //    r = (r * 50 + g * 98 + b * 24) >> 7
4544 // Constant for ARGB color to sepia tone.
4545 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
4546                                    17, 68, 35, 0, 17, 68, 35, 0};
4547 
4548 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
4549                                    22, 88, 45, 0, 22, 88, 45, 0};
4550 
4551 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
4552                                    24, 98, 50, 0, 24, 98, 50, 0};
4553 
4554 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4555 __declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4556   __asm {
4557     mov        eax, [esp + 4] /* dst_argb */
4558     mov        ecx, [esp + 8] /* width */
4559     movdqa     xmm2, xmmword ptr kARGBToSepiaB
4560     movdqa     xmm3, xmmword ptr kARGBToSepiaG
4561     movdqa     xmm4, xmmword ptr kARGBToSepiaR
4562 
4563  convertloop:
4564     movdqu     xmm0, [eax]  // B
4565     movdqu     xmm6, [eax + 16]
4566     pmaddubsw  xmm0, xmm2
4567     pmaddubsw  xmm6, xmm2
4568     phaddw     xmm0, xmm6
4569     psrlw      xmm0, 7
4570     packuswb   xmm0, xmm0  // 8 B values
4571     movdqu     xmm5, [eax]  // G
4572     movdqu     xmm1, [eax + 16]
4573     pmaddubsw  xmm5, xmm3
4574     pmaddubsw  xmm1, xmm3
4575     phaddw     xmm5, xmm1
4576     psrlw      xmm5, 7
4577     packuswb   xmm5, xmm5  // 8 G values
4578     punpcklbw  xmm0, xmm5  // 8 BG values
4579     movdqu     xmm5, [eax]  // R
4580     movdqu     xmm1, [eax + 16]
4581     pmaddubsw  xmm5, xmm4
4582     pmaddubsw  xmm1, xmm4
4583     phaddw     xmm5, xmm1
4584     psrlw      xmm5, 7
4585     packuswb   xmm5, xmm5  // 8 R values
4586     movdqu     xmm6, [eax]  // A
4587     movdqu     xmm1, [eax + 16]
4588     psrld      xmm6, 24
4589     psrld      xmm1, 24
4590     packuswb   xmm6, xmm1
4591     packuswb   xmm6, xmm6  // 8 A values
4592     punpcklbw  xmm5, xmm6  // 8 RA values
4593     movdqa     xmm1, xmm0  // Weave BG, RA together
4594     punpcklwd  xmm0, xmm5  // BGRA first 4
4595     punpckhwd  xmm1, xmm5  // BGRA next 4
4596     movdqu     [eax], xmm0
4597     movdqu     [eax + 16], xmm1
4598     lea        eax, [eax + 32]
4599     sub        ecx, 8
4600     jg         convertloop
4601     ret
4602   }
4603 }
4604 #endif  // HAS_ARGBSEPIAROW_SSSE3
4605 
4606 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4607 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
4608 // Same as Sepia except matrix is provided.
4609 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4610 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
4611 __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
4612                                                 uint8* dst_argb,
4613                                                 const int8* matrix_argb,
4614                                                 int width) {
4615   __asm {
4616     mov        eax, [esp + 4] /* src_argb */
4617     mov        edx, [esp + 8] /* dst_argb */
4618     mov        ecx, [esp + 12] /* matrix_argb */
4619     movdqu     xmm5, [ecx]
4620     pshufd     xmm2, xmm5, 0x00
4621     pshufd     xmm3, xmm5, 0x55
4622     pshufd     xmm4, xmm5, 0xaa
4623     pshufd     xmm5, xmm5, 0xff
4624     mov        ecx, [esp + 16] /* width */
4625 
4626  convertloop:
4627     movdqu     xmm0, [eax]  // B
4628     movdqu     xmm7, [eax + 16]
4629     pmaddubsw  xmm0, xmm2
4630     pmaddubsw  xmm7, xmm2
4631     movdqu     xmm6, [eax]  // G
4632     movdqu     xmm1, [eax + 16]
4633     pmaddubsw  xmm6, xmm3
4634     pmaddubsw  xmm1, xmm3
4635     phaddsw    xmm0, xmm7  // B
4636     phaddsw    xmm6, xmm1  // G
4637     psraw      xmm0, 6  // B
4638     psraw      xmm6, 6  // G
4639     packuswb   xmm0, xmm0  // 8 B values
4640     packuswb   xmm6, xmm6  // 8 G values
4641     punpcklbw  xmm0, xmm6  // 8 BG values
4642     movdqu     xmm1, [eax]  // R
4643     movdqu     xmm7, [eax + 16]
4644     pmaddubsw  xmm1, xmm4
4645     pmaddubsw  xmm7, xmm4
4646     phaddsw    xmm1, xmm7  // R
4647     movdqu     xmm6, [eax]  // A
4648     movdqu     xmm7, [eax + 16]
4649     pmaddubsw  xmm6, xmm5
4650     pmaddubsw  xmm7, xmm5
4651     phaddsw    xmm6, xmm7  // A
4652     psraw      xmm1, 6  // R
4653     psraw      xmm6, 6  // A
4654     packuswb   xmm1, xmm1  // 8 R values
4655     packuswb   xmm6, xmm6  // 8 A values
4656     punpcklbw  xmm1, xmm6  // 8 RA values
4657     movdqa     xmm6, xmm0  // Weave BG, RA together
4658     punpcklwd  xmm0, xmm1  // BGRA first 4
4659     punpckhwd  xmm6, xmm1  // BGRA next 4
4660     movdqu     [edx], xmm0
4661     movdqu     [edx + 16], xmm6
4662     lea        eax, [eax + 32]
4663     lea        edx, [edx + 32]
4664     sub        ecx, 8
4665     jg         convertloop
4666     ret
4667   }
4668 }
4669 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4670 
4671 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4672 // Quantize 4 ARGB pixels (16 bytes).
4673 __declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
4674                                             int scale,
4675                                             int interval_size,
4676                                             int interval_offset,
4677                                             int width) {
4678   __asm {
4679     mov        eax, [esp + 4] /* dst_argb */
4680     movd       xmm2, [esp + 8] /* scale */
4681     movd       xmm3, [esp + 12] /* interval_size */
4682     movd       xmm4, [esp + 16] /* interval_offset */
4683     mov        ecx, [esp + 20] /* width */
4684     pshuflw    xmm2, xmm2, 040h
4685     pshufd     xmm2, xmm2, 044h
4686     pshuflw    xmm3, xmm3, 040h
4687     pshufd     xmm3, xmm3, 044h
4688     pshuflw    xmm4, xmm4, 040h
4689     pshufd     xmm4, xmm4, 044h
4690     pxor       xmm5, xmm5  // constant 0
4691     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
4692     pslld      xmm6, 24
4693 
4694  convertloop:
4695     movdqu     xmm0, [eax]  // read 4 pixels
4696     punpcklbw  xmm0, xmm5  // first 2 pixels
4697     pmulhuw    xmm0, xmm2  // pixel * scale >> 16
4698     movdqu     xmm1, [eax]  // read 4 pixels
4699     punpckhbw  xmm1, xmm5  // next 2 pixels
4700     pmulhuw    xmm1, xmm2
4701     pmullw     xmm0, xmm3  // * interval_size
4702     movdqu     xmm7, [eax]  // read 4 pixels
4703     pmullw     xmm1, xmm3
4704     pand       xmm7, xmm6  // mask alpha
4705     paddw      xmm0, xmm4  // + interval_size / 2
4706     paddw      xmm1, xmm4
4707     packuswb   xmm0, xmm1
4708     por        xmm0, xmm7
4709     movdqu     [eax], xmm0
4710     lea        eax, [eax + 16]
4711     sub        ecx, 4
4712     jg         convertloop
4713     ret
4714   }
4715 }
4716 #endif  // HAS_ARGBQUANTIZEROW_SSE2
4717 
4718 #ifdef HAS_ARGBSHADEROW_SSE2
4719 // Shade 4 pixels at a time by specified value.
4720 __declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
4721                                          uint8* dst_argb,
4722                                          int width,
4723                                          uint32 value) {
4724   __asm {
4725     mov        eax, [esp + 4]  // src_argb
4726     mov        edx, [esp + 8]  // dst_argb
4727     mov        ecx, [esp + 12]  // width
4728     movd       xmm2, [esp + 16]  // value
4729     punpcklbw  xmm2, xmm2
4730     punpcklqdq xmm2, xmm2
4731 
4732  convertloop:
4733     movdqu     xmm0, [eax]  // read 4 pixels
4734     lea        eax, [eax + 16]
4735     movdqa     xmm1, xmm0
4736     punpcklbw  xmm0, xmm0  // first 2
4737     punpckhbw  xmm1, xmm1  // next 2
4738     pmulhuw    xmm0, xmm2  // argb * value
4739     pmulhuw    xmm1, xmm2  // argb * value
4740     psrlw      xmm0, 8
4741     psrlw      xmm1, 8
4742     packuswb   xmm0, xmm1
4743     movdqu     [edx], xmm0
4744     lea        edx, [edx + 16]
4745     sub        ecx, 4
4746     jg         convertloop
4747 
4748     ret
4749   }
4750 }
4751 #endif  // HAS_ARGBSHADEROW_SSE2
4752 
4753 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4754 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4755 __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
4756                                             const uint8* src_argb1,
4757                                             uint8* dst_argb,
4758                                             int width) {
4759   __asm {
4760     push       esi
4761     mov        eax, [esp + 4 + 4]  // src_argb0
4762     mov        esi, [esp + 4 + 8]  // src_argb1
4763     mov        edx, [esp + 4 + 12]  // dst_argb
4764     mov        ecx, [esp + 4 + 16]  // width
4765     pxor       xmm5, xmm5  // constant 0
4766 
4767  convertloop:
4768     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
4769     movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
4770     movdqu     xmm1, xmm0
4771     movdqu     xmm3, xmm2
4772     punpcklbw  xmm0, xmm0  // first 2
4773     punpckhbw  xmm1, xmm1  // next 2
4774     punpcklbw  xmm2, xmm5  // first 2
4775     punpckhbw  xmm3, xmm5  // next 2
4776     pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
4777     pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
4778     lea        eax, [eax + 16]
4779     lea        esi, [esi + 16]
4780     packuswb   xmm0, xmm1
4781     movdqu     [edx], xmm0
4782     lea        edx, [edx + 16]
4783     sub        ecx, 4
4784     jg         convertloop
4785 
4786     pop        esi
4787     ret
4788   }
4789 }
4790 #endif  // HAS_ARGBMULTIPLYROW_SSE2
4791 
4792 #ifdef HAS_ARGBADDROW_SSE2
4793 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4794 // TODO(fbarchard): Port this to posix, neon and other math functions.
4795 __declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
4796                                        const uint8* src_argb1,
4797                                        uint8* dst_argb,
4798                                        int width) {
4799   __asm {
4800     push       esi
4801     mov        eax, [esp + 4 + 4]  // src_argb0
4802     mov        esi, [esp + 4 + 8]  // src_argb1
4803     mov        edx, [esp + 4 + 12]  // dst_argb
4804     mov        ecx, [esp + 4 + 16]  // width
4805 
4806     sub        ecx, 4
4807     jl         convertloop49
4808 
4809  convertloop4:
4810     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
4811     lea        eax, [eax + 16]
4812     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
4813     lea        esi, [esi + 16]
4814     paddusb    xmm0, xmm1  // src_argb0 + src_argb1
4815     movdqu     [edx], xmm0
4816     lea        edx, [edx + 16]
4817     sub        ecx, 4
4818     jge        convertloop4
4819 
4820  convertloop49:
4821     add        ecx, 4 - 1
4822     jl         convertloop19
4823 
4824  convertloop1:
4825     movd       xmm0, [eax]  // read 1 pixels from src_argb0
4826     lea        eax, [eax + 4]
4827     movd       xmm1, [esi]  // read 1 pixels from src_argb1
4828     lea        esi, [esi + 4]
4829     paddusb    xmm0, xmm1  // src_argb0 + src_argb1
4830     movd       [edx], xmm0
4831     lea        edx, [edx + 4]
4832     sub        ecx, 1
4833     jge        convertloop1
4834 
4835  convertloop19:
4836     pop        esi
4837     ret
4838   }
4839 }
4840 #endif  // HAS_ARGBADDROW_SSE2
4841 
4842 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4843 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4844 __declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
4845                                             const uint8* src_argb1,
4846                                             uint8* dst_argb,
4847                                             int width) {
4848   __asm {
4849     push       esi
4850     mov        eax, [esp + 4 + 4]  // src_argb0
4851     mov        esi, [esp + 4 + 8]  // src_argb1
4852     mov        edx, [esp + 4 + 12]  // dst_argb
4853     mov        ecx, [esp + 4 + 16]  // width
4854 
4855  convertloop:
4856     movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
4857     lea        eax, [eax + 16]
4858     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
4859     lea        esi, [esi + 16]
4860     psubusb    xmm0, xmm1  // src_argb0 - src_argb1
4861     movdqu     [edx], xmm0
4862     lea        edx, [edx + 16]
4863     sub        ecx, 4
4864     jg         convertloop
4865 
4866     pop        esi
4867     ret
4868   }
4869 }
4870 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4871 
4872 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4873 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4874 __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
4875                                             const uint8* src_argb1,
4876                                             uint8* dst_argb,
4877                                             int width) {
4878   __asm {
4879     push       esi
4880     mov        eax, [esp + 4 + 4]  // src_argb0
4881     mov        esi, [esp + 4 + 8]  // src_argb1
4882     mov        edx, [esp + 4 + 12]  // dst_argb
4883     mov        ecx, [esp + 4 + 16]  // width
4884     vpxor      ymm5, ymm5, ymm5  // constant 0
4885 
4886  convertloop:
4887     vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
4888     lea        eax, [eax + 32]
4889     vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
4890     lea        esi, [esi + 32]
4891     vpunpcklbw ymm0, ymm1, ymm1  // low 4
4892     vpunpckhbw ymm1, ymm1, ymm1  // high 4
4893     vpunpcklbw ymm2, ymm3, ymm5  // low 4
4894     vpunpckhbw ymm3, ymm3, ymm5  // high 4
4895     vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
4896     vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
4897     vpackuswb  ymm0, ymm0, ymm1
4898     vmovdqu    [edx], ymm0
4899     lea        edx, [edx + 32]
4900     sub        ecx, 8
4901     jg         convertloop
4902 
4903     pop        esi
4904     vzeroupper
4905     ret
4906   }
4907 }
4908 #endif  // HAS_ARGBMULTIPLYROW_AVX2
4909 
4910 #ifdef HAS_ARGBADDROW_AVX2
4911 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4912 __declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
4913                                        const uint8* src_argb1,
4914                                        uint8* dst_argb,
4915                                        int width) {
4916   __asm {
4917     push       esi
4918     mov        eax, [esp + 4 + 4]  // src_argb0
4919     mov        esi, [esp + 4 + 8]  // src_argb1
4920     mov        edx, [esp + 4 + 12]  // dst_argb
4921     mov        ecx, [esp + 4 + 16]  // width
4922 
4923  convertloop:
4924     vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
4925     lea        eax, [eax + 32]
4926     vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
4927     lea        esi, [esi + 32]
4928     vmovdqu    [edx], ymm0
4929     lea        edx, [edx + 32]
4930     sub        ecx, 8
4931     jg         convertloop
4932 
4933     pop        esi
4934     vzeroupper
4935     ret
4936   }
4937 }
4938 #endif  // HAS_ARGBADDROW_AVX2
4939 
4940 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4941 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4942 __declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
4943                                             const uint8* src_argb1,
4944                                             uint8* dst_argb,
4945                                             int width) {
4946   __asm {
4947     push       esi
4948     mov        eax, [esp + 4 + 4]  // src_argb0
4949     mov        esi, [esp + 4 + 8]  // src_argb1
4950     mov        edx, [esp + 4 + 12]  // dst_argb
4951     mov        ecx, [esp + 4 + 16]  // width
4952 
4953  convertloop:
4954     vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
4955     lea        eax, [eax + 32]
4956     vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
4957     lea        esi, [esi + 32]
4958     vmovdqu    [edx], ymm0
4959     lea        edx, [edx + 32]
4960     sub        ecx, 8
4961     jg         convertloop
4962 
4963     pop        esi
4964     vzeroupper
4965     ret
4966   }
4967 }
4968 #endif  // HAS_ARGBSUBTRACTROW_AVX2
4969 
4970 #ifdef HAS_SOBELXROW_SSE2
4971 // SobelX as a matrix is
4972 // -1  0  1
4973 // -2  0  2
4974 // -1  0  1
4975 __declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
4976                                       const uint8* src_y1,
4977                                       const uint8* src_y2,
4978                                       uint8* dst_sobelx,
4979                                       int width) {
4980   __asm {
4981     push       esi
4982     push       edi
4983     mov        eax, [esp + 8 + 4]  // src_y0
4984     mov        esi, [esp + 8 + 8]  // src_y1
4985     mov        edi, [esp + 8 + 12]  // src_y2
4986     mov        edx, [esp + 8 + 16]  // dst_sobelx
4987     mov        ecx, [esp + 8 + 20]  // width
4988     sub        esi, eax
4989     sub        edi, eax
4990     sub        edx, eax
4991     pxor       xmm5, xmm5  // constant 0
4992 
4993  convertloop:
4994     movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
4995     movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
4996     punpcklbw  xmm0, xmm5
4997     punpcklbw  xmm1, xmm5
4998     psubw      xmm0, xmm1
4999     movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
5000     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5001     punpcklbw  xmm1, xmm5
5002     punpcklbw  xmm2, xmm5
5003     psubw      xmm1, xmm2
5004     movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
5005     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
5006     punpcklbw  xmm2, xmm5
5007     punpcklbw  xmm3, xmm5
5008     psubw      xmm2, xmm3
5009     paddw      xmm0, xmm2
5010     paddw      xmm0, xmm1
5011     paddw      xmm0, xmm1
5012     pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5013     psubw      xmm1, xmm0
5014     pmaxsw     xmm0, xmm1
5015     packuswb   xmm0, xmm0
5016     movq       qword ptr [eax + edx], xmm0
5017     lea        eax, [eax + 8]
5018     sub        ecx, 8
5019     jg         convertloop
5020 
5021     pop        edi
5022     pop        esi
5023     ret
5024   }
5025 }
5026 #endif  // HAS_SOBELXROW_SSE2
5027 
5028 #ifdef HAS_SOBELYROW_SSE2
5029 // SobelY as a matrix is
5030 // -1 -2 -1
5031 //  0  0  0
5032 //  1  2  1
5033 __declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
5034                                       const uint8* src_y1,
5035                                       uint8* dst_sobely,
5036                                       int width) {
5037   __asm {
5038     push       esi
5039     mov        eax, [esp + 4 + 4]  // src_y0
5040     mov        esi, [esp + 4 + 8]  // src_y1
5041     mov        edx, [esp + 4 + 12]  // dst_sobely
5042     mov        ecx, [esp + 4 + 16]  // width
5043     sub        esi, eax
5044     sub        edx, eax
5045     pxor       xmm5, xmm5  // constant 0
5046 
5047  convertloop:
5048     movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
5049     movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
5050     punpcklbw  xmm0, xmm5
5051     punpcklbw  xmm1, xmm5
5052     psubw      xmm0, xmm1
5053     movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
5054     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
5055     punpcklbw  xmm1, xmm5
5056     punpcklbw  xmm2, xmm5
5057     psubw      xmm1, xmm2
5058     movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
5059     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5060     punpcklbw  xmm2, xmm5
5061     punpcklbw  xmm3, xmm5
5062     psubw      xmm2, xmm3
5063     paddw      xmm0, xmm2
5064     paddw      xmm0, xmm1
5065     paddw      xmm0, xmm1
5066     pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5067     psubw      xmm1, xmm0
5068     pmaxsw     xmm0, xmm1
5069     packuswb   xmm0, xmm0
5070     movq       qword ptr [eax + edx], xmm0
5071     lea        eax, [eax + 8]
5072     sub        ecx, 8
5073     jg         convertloop
5074 
5075     pop        esi
5076     ret
5077   }
5078 }
5079 #endif  // HAS_SOBELYROW_SSE2
5080 
5081 #ifdef HAS_SOBELROW_SSE2
5082 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5083 // A = 255
5084 // R = Sobel
5085 // G = Sobel
5086 // B = Sobel
5087 __declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
5088                                      const uint8* src_sobely,
5089                                      uint8* dst_argb,
5090                                      int width) {
5091   __asm {
5092     push       esi
5093     mov        eax, [esp + 4 + 4]  // src_sobelx
5094     mov        esi, [esp + 4 + 8]  // src_sobely
5095     mov        edx, [esp + 4 + 12]  // dst_argb
5096     mov        ecx, [esp + 4 + 16]  // width
5097     sub        esi, eax
5098     pcmpeqb    xmm5, xmm5  // alpha 255
5099     pslld      xmm5, 24  // 0xff000000
5100 
5101  convertloop:
5102     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
5103     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
5104     lea        eax, [eax + 16]
5105     paddusb    xmm0, xmm1  // sobel = sobelx + sobely
5106     movdqa     xmm2, xmm0  // GG
5107     punpcklbw  xmm2, xmm0  // First 8
5108     punpckhbw  xmm0, xmm0  // Next 8
5109     movdqa     xmm1, xmm2  // GGGG
5110     punpcklwd  xmm1, xmm2  // First 4
5111     punpckhwd  xmm2, xmm2  // Next 4
5112     por        xmm1, xmm5  // GGGA
5113     por        xmm2, xmm5
5114     movdqa     xmm3, xmm0  // GGGG
5115     punpcklwd  xmm3, xmm0  // Next 4
5116     punpckhwd  xmm0, xmm0  // Last 4
5117     por        xmm3, xmm5  // GGGA
5118     por        xmm0, xmm5
5119     movdqu     [edx], xmm1
5120     movdqu     [edx + 16], xmm2
5121     movdqu     [edx + 32], xmm3
5122     movdqu     [edx + 48], xmm0
5123     lea        edx, [edx + 64]
5124     sub        ecx, 16
5125     jg         convertloop
5126 
5127     pop        esi
5128     ret
5129   }
5130 }
5131 #endif  // HAS_SOBELROW_SSE2
5132 
5133 #ifdef HAS_SOBELTOPLANEROW_SSE2
5134 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5135 __declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
5136                                             const uint8* src_sobely,
5137                                             uint8* dst_y,
5138                                             int width) {
5139   __asm {
5140     push       esi
5141     mov        eax, [esp + 4 + 4]  // src_sobelx
5142     mov        esi, [esp + 4 + 8]  // src_sobely
5143     mov        edx, [esp + 4 + 12]  // dst_argb
5144     mov        ecx, [esp + 4 + 16]  // width
5145     sub        esi, eax
5146 
5147  convertloop:
5148     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
5149     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
5150     lea        eax, [eax + 16]
5151     paddusb    xmm0, xmm1  // sobel = sobelx + sobely
5152     movdqu     [edx], xmm0
5153     lea        edx, [edx + 16]
5154     sub        ecx, 16
5155     jg         convertloop
5156 
5157     pop        esi
5158     ret
5159   }
5160 }
5161 #endif  // HAS_SOBELTOPLANEROW_SSE2
5162 
5163 #ifdef HAS_SOBELXYROW_SSE2
5164 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5165 // A = 255
5166 // R = Sobel X
5167 // G = Sobel
5168 // B = Sobel Y
5169 __declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
5170                                        const uint8* src_sobely,
5171                                        uint8* dst_argb,
5172                                        int width) {
5173   __asm {
5174     push       esi
5175     mov        eax, [esp + 4 + 4]  // src_sobelx
5176     mov        esi, [esp + 4 + 8]  // src_sobely
5177     mov        edx, [esp + 4 + 12]  // dst_argb
5178     mov        ecx, [esp + 4 + 16]  // width
5179     sub        esi, eax
5180     pcmpeqb    xmm5, xmm5  // alpha 255
5181 
5182  convertloop:
5183     movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
5184     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
5185     lea        eax, [eax + 16]
5186     movdqa     xmm2, xmm0
5187     paddusb    xmm2, xmm1  // sobel = sobelx + sobely
5188     movdqa     xmm3, xmm0  // XA
5189     punpcklbw  xmm3, xmm5
5190     punpckhbw  xmm0, xmm5
5191     movdqa     xmm4, xmm1  // YS
5192     punpcklbw  xmm4, xmm2
5193     punpckhbw  xmm1, xmm2
5194     movdqa     xmm6, xmm4  // YSXA
5195     punpcklwd  xmm6, xmm3  // First 4
5196     punpckhwd  xmm4, xmm3  // Next 4
5197     movdqa     xmm7, xmm1  // YSXA
5198     punpcklwd  xmm7, xmm0  // Next 4
5199     punpckhwd  xmm1, xmm0  // Last 4
5200     movdqu     [edx], xmm6
5201     movdqu     [edx + 16], xmm4
5202     movdqu     [edx + 32], xmm7
5203     movdqu     [edx + 48], xmm1
5204     lea        edx, [edx + 64]
5205     sub        ecx, 16
5206     jg         convertloop
5207 
5208     pop        esi
5209     ret
5210   }
5211 }
5212 #endif  // HAS_SOBELXYROW_SSE2
5213 
5214 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5215 // Consider float CumulativeSum.
5216 // Consider calling CumulativeSum one row at time as needed.
5217 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5218 // Convert cumulative sum for an area to an average for 1 pixel.
5219 // topleft is pointer to top left of CumulativeSum buffer for area.
5220 // botleft is pointer to bottom left of CumulativeSum buffer.
5221 // width is offset from left to right of area in CumulativeSum buffer measured
5222 //   in number of ints.
5223 // area is the number of pixels in the area being averaged.
5224 // dst points to pixel to store result to.
5225 // count is number of averaged pixels to produce.
5226 // Does 4 pixels at a time.
5227 // This function requires alignment on accumulation buffer pointers.
5228 void CumulativeSumToAverageRow_SSE2(const int32* topleft,
5229                                     const int32* botleft,
5230                                     int width,
5231                                     int area,
5232                                     uint8* dst,
5233                                     int count) {
5234   __asm {
5235     mov        eax, topleft  // eax topleft
5236     mov        esi, botleft  // esi botleft
5237     mov        edx, width
5238     movd       xmm5, area
5239     mov        edi, dst
5240     mov        ecx, count
5241     cvtdq2ps   xmm5, xmm5
5242     rcpss      xmm4, xmm5  // 1.0f / area
5243     pshufd     xmm4, xmm4, 0
5244     sub        ecx, 4
5245     jl         l4b
5246 
5247     cmp        area, 128  // 128 pixels will not overflow 15 bits.
5248     ja         l4
5249 
5250     pshufd     xmm5, xmm5, 0  // area
5251     pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
5252     psrld      xmm6, 16
5253     cvtdq2ps   xmm6, xmm6
5254     addps      xmm5, xmm6  // (65536.0 + area - 1)
5255     mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
5256     cvtps2dq   xmm5, xmm5  // 0.16 fixed point
5257     packssdw   xmm5, xmm5  // 16 bit shorts
5258 
5259     // 4 pixel loop small blocks.
5260   s4:
5261         // top left
5262     movdqu     xmm0, [eax]
5263     movdqu     xmm1, [eax + 16]
5264     movdqu     xmm2, [eax + 32]
5265     movdqu     xmm3, [eax + 48]
5266 
5267     // - top right
5268     psubd      xmm0, [eax + edx * 4]
5269     psubd      xmm1, [eax + edx * 4 + 16]
5270     psubd      xmm2, [eax + edx * 4 + 32]
5271     psubd      xmm3, [eax + edx * 4 + 48]
5272     lea        eax, [eax + 64]
5273 
5274     // - bottom left
5275     psubd      xmm0, [esi]
5276     psubd      xmm1, [esi + 16]
5277     psubd      xmm2, [esi + 32]
5278     psubd      xmm3, [esi + 48]
5279 
5280     // + bottom right
5281     paddd      xmm0, [esi + edx * 4]
5282     paddd      xmm1, [esi + edx * 4 + 16]
5283     paddd      xmm2, [esi + edx * 4 + 32]
5284     paddd      xmm3, [esi + edx * 4 + 48]
5285     lea        esi, [esi + 64]
5286 
5287     packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
5288     packssdw   xmm2, xmm3
5289 
5290     pmulhuw    xmm0, xmm5
5291     pmulhuw    xmm2, xmm5
5292 
5293     packuswb   xmm0, xmm2
5294     movdqu     [edi], xmm0
5295     lea        edi, [edi + 16]
5296     sub        ecx, 4
5297     jge        s4
5298 
5299     jmp        l4b
5300 
5301     // 4 pixel loop
5302   l4:
5303         // top left
5304     movdqu     xmm0, [eax]
5305     movdqu     xmm1, [eax + 16]
5306     movdqu     xmm2, [eax + 32]
5307     movdqu     xmm3, [eax + 48]
5308 
5309     // - top right
5310     psubd      xmm0, [eax + edx * 4]
5311     psubd      xmm1, [eax + edx * 4 + 16]
5312     psubd      xmm2, [eax + edx * 4 + 32]
5313     psubd      xmm3, [eax + edx * 4 + 48]
5314     lea        eax, [eax + 64]
5315 
5316     // - bottom left
5317     psubd      xmm0, [esi]
5318     psubd      xmm1, [esi + 16]
5319     psubd      xmm2, [esi + 32]
5320     psubd      xmm3, [esi + 48]
5321 
5322     // + bottom right
5323     paddd      xmm0, [esi + edx * 4]
5324     paddd      xmm1, [esi + edx * 4 + 16]
5325     paddd      xmm2, [esi + edx * 4 + 32]
5326     paddd      xmm3, [esi + edx * 4 + 48]
5327     lea        esi, [esi + 64]
5328 
5329     cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
5330     cvtdq2ps   xmm1, xmm1
5331     mulps      xmm0, xmm4
5332     mulps      xmm1, xmm4
5333     cvtdq2ps   xmm2, xmm2
5334     cvtdq2ps   xmm3, xmm3
5335     mulps      xmm2, xmm4
5336     mulps      xmm3, xmm4
5337     cvtps2dq   xmm0, xmm0
5338     cvtps2dq   xmm1, xmm1
5339     cvtps2dq   xmm2, xmm2
5340     cvtps2dq   xmm3, xmm3
5341     packssdw   xmm0, xmm1
5342     packssdw   xmm2, xmm3
5343     packuswb   xmm0, xmm2
5344     movdqu     [edi], xmm0
5345     lea        edi, [edi + 16]
5346     sub        ecx, 4
5347     jge        l4
5348 
5349   l4b:
5350     add        ecx, 4 - 1
5351     jl         l1b
5352 
5353     // 1 pixel loop
5354   l1:
5355     movdqu     xmm0, [eax]
5356     psubd      xmm0, [eax + edx * 4]
5357     lea        eax, [eax + 16]
5358     psubd      xmm0, [esi]
5359     paddd      xmm0, [esi + edx * 4]
5360     lea        esi, [esi + 16]
5361     cvtdq2ps   xmm0, xmm0
5362     mulps      xmm0, xmm4
5363     cvtps2dq   xmm0, xmm0
5364     packssdw   xmm0, xmm0
5365     packuswb   xmm0, xmm0
5366     movd       dword ptr [edi], xmm0
5367     lea        edi, [edi + 4]
5368     sub        ecx, 1
5369     jge        l1
5370   l1b:
5371   }
5372 }
5373 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5374 
5375 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5376 // Creates a table of cumulative sums where each value is a sum of all values
5377 // above and to the left of the value.
5378 void ComputeCumulativeSumRow_SSE2(const uint8* row,
5379                                   int32* cumsum,
5380                                   const int32* previous_cumsum,
5381                                   int width) {
5382   __asm {
5383     mov        eax, row
5384     mov        edx, cumsum
5385     mov        esi, previous_cumsum
5386     mov        ecx, width
5387     pxor       xmm0, xmm0
5388     pxor       xmm1, xmm1
5389 
5390     sub        ecx, 4
5391     jl         l4b
5392     test       edx, 15
5393     jne        l4b
5394 
5395     // 4 pixel loop
5396   l4:
5397     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
5398     lea        eax, [eax + 16]
5399     movdqa     xmm4, xmm2
5400 
5401     punpcklbw  xmm2, xmm1
5402     movdqa     xmm3, xmm2
5403     punpcklwd  xmm2, xmm1
5404     punpckhwd  xmm3, xmm1
5405 
5406     punpckhbw  xmm4, xmm1
5407     movdqa     xmm5, xmm4
5408     punpcklwd  xmm4, xmm1
5409     punpckhwd  xmm5, xmm1
5410 
5411     paddd      xmm0, xmm2
5412     movdqu     xmm2, [esi]  // previous row above.
5413     paddd      xmm2, xmm0
5414 
5415     paddd      xmm0, xmm3
5416     movdqu     xmm3, [esi + 16]
5417     paddd      xmm3, xmm0
5418 
5419     paddd      xmm0, xmm4
5420     movdqu     xmm4, [esi + 32]
5421     paddd      xmm4, xmm0
5422 
5423     paddd      xmm0, xmm5
5424     movdqu     xmm5, [esi + 48]
5425     lea        esi, [esi + 64]
5426     paddd      xmm5, xmm0
5427 
5428     movdqu     [edx], xmm2
5429     movdqu     [edx + 16], xmm3
5430     movdqu     [edx + 32], xmm4
5431     movdqu     [edx + 48], xmm5
5432 
5433     lea        edx, [edx + 64]
5434     sub        ecx, 4
5435     jge        l4
5436 
5437   l4b:
5438     add        ecx, 4 - 1
5439     jl         l1b
5440 
5441     // 1 pixel loop
5442   l1:
5443     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
5444     lea        eax, [eax + 4]
5445     punpcklbw  xmm2, xmm1
5446     punpcklwd  xmm2, xmm1
5447     paddd      xmm0, xmm2
5448     movdqu     xmm2, [esi]
5449     lea        esi, [esi + 16]
5450     paddd      xmm2, xmm0
5451     movdqu     [edx], xmm2
5452     lea        edx, [edx + 16]
5453     sub        ecx, 1
5454     jge        l1
5455 
5456  l1b:
5457   }
5458 }
5459 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
5460 
5461 #ifdef HAS_ARGBAFFINEROW_SSE2
5462 // Copy ARGB pixels from source image with slope to a row of destination.
5463 __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
5464                                                      int src_argb_stride,
5465                                                      uint8* dst_argb,
5466                                                      const float* uv_dudv,
5467                                                      int width) {
5468   __asm {
5469     push       esi
5470     push       edi
5471     mov        eax, [esp + 12]  // src_argb
5472     mov        esi, [esp + 16]  // stride
5473     mov        edx, [esp + 20]  // dst_argb
5474     mov        ecx, [esp + 24]  // pointer to uv_dudv
5475     movq       xmm2, qword ptr [ecx]  // uv
5476     movq       xmm7, qword ptr [ecx + 8]  // dudv
5477     mov        ecx, [esp + 28]  // width
5478     shl        esi, 16  // 4, stride
5479     add        esi, 4
5480     movd       xmm5, esi
5481     sub        ecx, 4
5482     jl         l4b
5483 
5484     // setup for 4 pixel loop
5485     pshufd     xmm7, xmm7, 0x44  // dup dudv
5486     pshufd     xmm5, xmm5, 0  // dup 4, stride
5487     movdqa     xmm0, xmm2  // x0, y0, x1, y1
5488     addps      xmm0, xmm7
5489     movlhps    xmm2, xmm0
5490     movdqa     xmm4, xmm7
5491     addps      xmm4, xmm4  // dudv *= 2
5492     movdqa     xmm3, xmm2  // x2, y2, x3, y3
5493     addps      xmm3, xmm4
5494     addps      xmm4, xmm4  // dudv *= 4
5495 
5496     // 4 pixel loop
5497   l4:
5498     cvttps2dq  xmm0, xmm2  // x, y float to int first 2
5499     cvttps2dq  xmm1, xmm3  // x, y float to int next 2
5500     packssdw   xmm0, xmm1  // x, y as 8 shorts
5501     pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
5502     movd       esi, xmm0
5503     pshufd     xmm0, xmm0, 0x39  // shift right
5504     movd       edi, xmm0
5505     pshufd     xmm0, xmm0, 0x39  // shift right
5506     movd       xmm1, [eax + esi]  // read pixel 0
5507     movd       xmm6, [eax + edi]  // read pixel 1
5508     punpckldq  xmm1, xmm6  // combine pixel 0 and 1
5509     addps      xmm2, xmm4  // x, y += dx, dy first 2
5510     movq       qword ptr [edx], xmm1
5511     movd       esi, xmm0
5512     pshufd     xmm0, xmm0, 0x39  // shift right
5513     movd       edi, xmm0
5514     movd       xmm6, [eax + esi]  // read pixel 2
5515     movd       xmm0, [eax + edi]  // read pixel 3
5516     punpckldq  xmm6, xmm0  // combine pixel 2 and 3
5517     addps      xmm3, xmm4  // x, y += dx, dy next 2
5518     movq       qword ptr 8[edx], xmm6
5519     lea        edx, [edx + 16]
5520     sub        ecx, 4
5521     jge        l4
5522 
5523   l4b:
5524     add        ecx, 4 - 1
5525     jl         l1b
5526 
5527     // 1 pixel loop
5528   l1:
5529     cvttps2dq  xmm0, xmm2  // x, y float to int
5530     packssdw   xmm0, xmm0  // x, y as shorts
5531     pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
5532     addps      xmm2, xmm7  // x, y += dx, dy
5533     movd       esi, xmm0
5534     movd       xmm0, [eax + esi]  // copy a pixel
5535     movd       [edx], xmm0
5536     lea        edx, [edx + 4]
5537     sub        ecx, 1
5538     jge        l1
5539   l1b:
5540     pop        edi
5541     pop        esi
5542     ret
5543   }
5544 }
5545 #endif  // HAS_ARGBAFFINEROW_SSE2
5546 
5547 #ifdef HAS_INTERPOLATEROW_AVX2
5548 // Bilinear filter 32x2 -> 32x1
5549 __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
5550                                            const uint8* src_ptr,
5551                                            ptrdiff_t src_stride,
5552                                            int dst_width,
5553                                            int source_y_fraction) {
5554   __asm {
5555     push       esi
5556     push       edi
5557     mov        edi, [esp + 8 + 4]  // dst_ptr
5558     mov        esi, [esp + 8 + 8]  // src_ptr
5559     mov        edx, [esp + 8 + 12]  // src_stride
5560     mov        ecx, [esp + 8 + 16]  // dst_width
5561     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5562     // Dispatch to specialized filters if applicable.
5563     cmp        eax, 0
5564     je         xloop100  // 0 / 256.  Blend 100 / 0.
5565     sub        edi, esi
5566     cmp        eax, 128
5567     je         xloop50  // 128 /256 is 0.50.  Blend 50 / 50.
5568 
5569     vmovd      xmm0, eax  // high fraction 0..255
5570     neg        eax
5571     add        eax, 256
5572     vmovd      xmm5, eax  // low fraction 256..1
5573     vpunpcklbw xmm5, xmm5, xmm0
5574     vpunpcklwd xmm5, xmm5, xmm5
5575     vbroadcastss ymm5, xmm5
5576 
5577     mov        eax, 0x80808080  // 128b for bias and rounding.
5578     vmovd      xmm4, eax
5579     vbroadcastss ymm4, xmm4
5580 
5581   xloop:
5582     vmovdqu    ymm0, [esi]
5583     vmovdqu    ymm2, [esi + edx]
5584     vpunpckhbw ymm1, ymm0, ymm2  // mutates
5585     vpunpcklbw ymm0, ymm0, ymm2
5586     vpsubb     ymm1, ymm1, ymm4  // bias to signed image
5587     vpsubb     ymm0, ymm0, ymm4
5588     vpmaddubsw ymm1, ymm5, ymm1
5589     vpmaddubsw ymm0, ymm5, ymm0
5590     vpaddw     ymm1, ymm1, ymm4  // unbias and round
5591     vpaddw     ymm0, ymm0, ymm4
5592     vpsrlw     ymm1, ymm1, 8
5593     vpsrlw     ymm0, ymm0, 8
5594     vpackuswb  ymm0, ymm0, ymm1            // unmutates
5595     vmovdqu    [esi + edi], ymm0
5596     lea        esi, [esi + 32]
5597     sub        ecx, 32
5598     jg         xloop
5599     jmp        xloop99
5600 
5601     // Blend 50 / 50.
5602  xloop50:
5603    vmovdqu    ymm0, [esi]
5604    vpavgb     ymm0, ymm0, [esi + edx]
5605    vmovdqu    [esi + edi], ymm0
5606    lea        esi, [esi + 32]
5607    sub        ecx, 32
5608    jg         xloop50
5609    jmp        xloop99
5610 
5611     // Blend 100 / 0 - Copy row unchanged.
5612  xloop100:
5613    rep movsb
5614 
5615   xloop99:
5616     pop        edi
5617     pop        esi
5618     vzeroupper
5619     ret
5620   }
5621 }
5622 #endif  // HAS_INTERPOLATEROW_AVX2
5623 
5624 // Bilinear filter 16x2 -> 16x1
5625 // TODO(fbarchard): Consider allowing 256 using memcpy.
5626 __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
5627                                             const uint8* src_ptr,
5628                                             ptrdiff_t src_stride,
5629                                             int dst_width,
5630                                             int source_y_fraction) {
5631   __asm {
5632     push       esi
5633     push       edi
5634 
5635     mov        edi, [esp + 8 + 4]  // dst_ptr
5636     mov        esi, [esp + 8 + 8]  // src_ptr
5637     mov        edx, [esp + 8 + 12]  // src_stride
5638     mov        ecx, [esp + 8 + 16]  // dst_width
5639     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5640     sub        edi, esi
5641     // Dispatch to specialized filters if applicable.
5642     cmp        eax, 0
5643     je         xloop100  // 0 /256.  Blend 100 / 0.
5644     cmp        eax, 128
5645     je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
5646 
5647     movd       xmm0, eax  // high fraction 0..255
5648     neg        eax
5649     add        eax, 256
5650     movd       xmm5, eax  // low fraction 255..1
5651     punpcklbw  xmm5, xmm0
5652     punpcklwd  xmm5, xmm5
5653     pshufd     xmm5, xmm5, 0
5654     mov        eax, 0x80808080  // 128 for biasing image to signed.
5655     movd       xmm4, eax
5656     pshufd     xmm4, xmm4, 0x00
5657 
5658   xloop:
5659     movdqu     xmm0, [esi]
5660     movdqu     xmm2, [esi + edx]
5661     movdqu     xmm1, xmm0
5662     punpcklbw  xmm0, xmm2
5663     punpckhbw  xmm1, xmm2
5664     psubb      xmm0, xmm4            // bias image by -128
5665     psubb      xmm1, xmm4
5666     movdqa     xmm2, xmm5
5667     movdqa     xmm3, xmm5
5668     pmaddubsw  xmm2, xmm0
5669     pmaddubsw  xmm3, xmm1
5670     paddw      xmm2, xmm4
5671     paddw      xmm3, xmm4
5672     psrlw      xmm2, 8
5673     psrlw      xmm3, 8
5674     packuswb   xmm2, xmm3
5675     movdqu     [esi + edi], xmm2
5676     lea        esi, [esi + 16]
5677     sub        ecx, 16
5678     jg         xloop
5679     jmp        xloop99
5680 
5681     // Blend 50 / 50.
5682   xloop50:
5683     movdqu     xmm0, [esi]
5684     movdqu     xmm1, [esi + edx]
5685     pavgb      xmm0, xmm1
5686     movdqu     [esi + edi], xmm0
5687     lea        esi, [esi + 16]
5688     sub        ecx, 16
5689     jg         xloop50
5690     jmp        xloop99
5691 
5692     // Blend 100 / 0 - Copy row unchanged.
5693   xloop100:
5694     movdqu     xmm0, [esi]
5695     movdqu     [esi + edi], xmm0
5696     lea        esi, [esi + 16]
5697     sub        ecx, 16
5698     jg         xloop100
5699 
5700   xloop99:
5701     pop        edi
5702     pop        esi
5703     ret
5704   }
5705 }
5706 
5707 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5708 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
5709                                             uint8* dst_argb,
5710                                             const uint8* shuffler,
5711                                             int width) {
5712   __asm {
5713     mov        eax, [esp + 4]  // src_argb
5714     mov        edx, [esp + 8]  // dst_argb
5715     mov        ecx, [esp + 12]  // shuffler
5716     movdqu     xmm5, [ecx]
5717     mov        ecx, [esp + 16]  // width
5718 
5719   wloop:
5720     movdqu     xmm0, [eax]
5721     movdqu     xmm1, [eax + 16]
5722     lea        eax, [eax + 32]
5723     pshufb     xmm0, xmm5
5724     pshufb     xmm1, xmm5
5725     movdqu     [edx], xmm0
5726     movdqu     [edx + 16], xmm1
5727     lea        edx, [edx + 32]
5728     sub        ecx, 8
5729     jg         wloop
5730     ret
5731   }
5732 }
5733 
5734 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5735 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
5736                                            uint8* dst_argb,
5737                                            const uint8* shuffler,
5738                                            int width) {
5739   __asm {
5740     mov        eax, [esp + 4]  // src_argb
5741     mov        edx, [esp + 8]  // dst_argb
5742     mov        ecx, [esp + 12]  // shuffler
5743     vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
5744     mov        ecx, [esp + 16]  // width
5745 
5746   wloop:
5747     vmovdqu    ymm0, [eax]
5748     vmovdqu    ymm1, [eax + 32]
5749     lea        eax, [eax + 64]
5750     vpshufb    ymm0, ymm0, ymm5
5751     vpshufb    ymm1, ymm1, ymm5
5752     vmovdqu    [edx], ymm0
5753     vmovdqu    [edx + 32], ymm1
5754     lea        edx, [edx + 64]
5755     sub        ecx, 16
5756     jg         wloop
5757 
5758     vzeroupper
5759     ret
5760   }
5761 }
5762 #endif  // HAS_ARGBSHUFFLEROW_AVX2
5763 
5764 __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
5765                                            uint8* dst_argb,
5766                                            const uint8* shuffler,
5767                                            int width) {
5768   __asm {
5769     push       ebx
5770     push       esi
5771     mov        eax, [esp + 8 + 4]  // src_argb
5772     mov        edx, [esp + 8 + 8]  // dst_argb
5773     mov        esi, [esp + 8 + 12]  // shuffler
5774     mov        ecx, [esp + 8 + 16]  // width
5775     pxor       xmm5, xmm5
5776 
5777     mov        ebx, [esi]  // shuffler
5778     cmp        ebx, 0x03000102
5779     je         shuf_3012
5780     cmp        ebx, 0x00010203
5781     je         shuf_0123
5782     cmp        ebx, 0x00030201
5783     je         shuf_0321
5784     cmp        ebx, 0x02010003
5785     je         shuf_2103
5786 
5787     // TODO(fbarchard): Use one source pointer and 3 offsets.
5788   shuf_any1:
5789     movzx      ebx, byte ptr [esi]
5790     movzx      ebx, byte ptr [eax + ebx]
5791     mov        [edx], bl
5792     movzx      ebx, byte ptr [esi + 1]
5793     movzx      ebx, byte ptr [eax + ebx]
5794     mov        [edx + 1], bl
5795     movzx      ebx, byte ptr [esi + 2]
5796     movzx      ebx, byte ptr [eax + ebx]
5797     mov        [edx + 2], bl
5798     movzx      ebx, byte ptr [esi + 3]
5799     movzx      ebx, byte ptr [eax + ebx]
5800     mov        [edx + 3], bl
5801     lea        eax, [eax + 4]
5802     lea        edx, [edx + 4]
5803     sub        ecx, 1
5804     jg         shuf_any1
5805     jmp        shuf99
5806 
5807   shuf_0123:
5808     movdqu     xmm0, [eax]
5809     lea        eax, [eax + 16]
5810     movdqa     xmm1, xmm0
5811     punpcklbw  xmm0, xmm5
5812     punpckhbw  xmm1, xmm5
5813     pshufhw    xmm0, xmm0, 01Bh  // 1B = 00011011 = 0x0123 = BGRAToARGB
5814     pshuflw    xmm0, xmm0, 01Bh
5815     pshufhw    xmm1, xmm1, 01Bh
5816     pshuflw    xmm1, xmm1, 01Bh
5817     packuswb   xmm0, xmm1
5818     movdqu     [edx], xmm0
5819     lea        edx, [edx + 16]
5820     sub        ecx, 4
5821     jg         shuf_0123
5822     jmp        shuf99
5823 
5824   shuf_0321:
5825     movdqu     xmm0, [eax]
5826     lea        eax, [eax + 16]
5827     movdqa     xmm1, xmm0
5828     punpcklbw  xmm0, xmm5
5829     punpckhbw  xmm1, xmm5
5830     pshufhw    xmm0, xmm0, 039h  // 39 = 00111001 = 0x0321 = RGBAToARGB
5831     pshuflw    xmm0, xmm0, 039h
5832     pshufhw    xmm1, xmm1, 039h
5833     pshuflw    xmm1, xmm1, 039h
5834     packuswb   xmm0, xmm1
5835     movdqu     [edx], xmm0
5836     lea        edx, [edx + 16]
5837     sub        ecx, 4
5838     jg         shuf_0321
5839     jmp        shuf99
5840 
5841   shuf_2103:
5842     movdqu     xmm0, [eax]
5843     lea        eax, [eax + 16]
5844     movdqa     xmm1, xmm0
5845     punpcklbw  xmm0, xmm5
5846     punpckhbw  xmm1, xmm5
5847     pshufhw    xmm0, xmm0, 093h  // 93 = 10010011 = 0x2103 = ARGBToRGBA
5848     pshuflw    xmm0, xmm0, 093h
5849     pshufhw    xmm1, xmm1, 093h
5850     pshuflw    xmm1, xmm1, 093h
5851     packuswb   xmm0, xmm1
5852     movdqu     [edx], xmm0
5853     lea        edx, [edx + 16]
5854     sub        ecx, 4
5855     jg         shuf_2103
5856     jmp        shuf99
5857 
5858   shuf_3012:
5859     movdqu     xmm0, [eax]
5860     lea        eax, [eax + 16]
5861     movdqa     xmm1, xmm0
5862     punpcklbw  xmm0, xmm5
5863     punpckhbw  xmm1, xmm5
5864     pshufhw    xmm0, xmm0, 0C6h  // C6 = 11000110 = 0x3012 = ABGRToARGB
5865     pshuflw    xmm0, xmm0, 0C6h
5866     pshufhw    xmm1, xmm1, 0C6h
5867     pshuflw    xmm1, xmm1, 0C6h
5868     packuswb   xmm0, xmm1
5869     movdqu     [edx], xmm0
5870     lea        edx, [edx + 16]
5871     sub        ecx, 4
5872     jg         shuf_3012
5873 
5874   shuf99:
5875     pop        esi
5876     pop        ebx
5877     ret
5878   }
5879 }
5880 
5881 // YUY2 - Macro-pixel = 2 image pixels
5882 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5883 
5884 // UYVY - Macro-pixel = 2 image pixels
5885 // U0Y0V0Y1
5886 
5887 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
5888                                           const uint8* src_u,
5889                                           const uint8* src_v,
5890                                           uint8* dst_frame,
5891                                           int width) {
5892   __asm {
5893     push       esi
5894     push       edi
5895     mov        eax, [esp + 8 + 4]  // src_y
5896     mov        esi, [esp + 8 + 8]  // src_u
5897     mov        edx, [esp + 8 + 12]  // src_v
5898     mov        edi, [esp + 8 + 16]  // dst_frame
5899     mov        ecx, [esp + 8 + 20]  // width
5900     sub        edx, esi
5901 
5902   convertloop:
5903     movq       xmm2, qword ptr [esi]  // U
5904     movq       xmm3, qword ptr [esi + edx]  // V
5905     lea        esi, [esi + 8]
5906     punpcklbw  xmm2, xmm3  // UV
5907     movdqu     xmm0, [eax]  // Y
5908     lea        eax, [eax + 16]
5909     movdqa     xmm1, xmm0
5910     punpcklbw  xmm0, xmm2  // YUYV
5911     punpckhbw  xmm1, xmm2
5912     movdqu     [edi], xmm0
5913     movdqu     [edi + 16], xmm1
5914     lea        edi, [edi + 32]
5915     sub        ecx, 16
5916     jg         convertloop
5917 
5918     pop        edi
5919     pop        esi
5920     ret
5921   }
5922 }
5923 
5924 __declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
5925                                           const uint8* src_u,
5926                                           const uint8* src_v,
5927                                           uint8* dst_frame,
5928                                           int width) {
5929   __asm {
5930     push       esi
5931     push       edi
5932     mov        eax, [esp + 8 + 4]  // src_y
5933     mov        esi, [esp + 8 + 8]  // src_u
5934     mov        edx, [esp + 8 + 12]  // src_v
5935     mov        edi, [esp + 8 + 16]  // dst_frame
5936     mov        ecx, [esp + 8 + 20]  // width
5937     sub        edx, esi
5938 
5939   convertloop:
5940     movq       xmm2, qword ptr [esi]  // U
5941     movq       xmm3, qword ptr [esi + edx]  // V
5942     lea        esi, [esi + 8]
5943     punpcklbw  xmm2, xmm3  // UV
5944     movdqu     xmm0, [eax]  // Y
5945     movdqa     xmm1, xmm2
5946     lea        eax, [eax + 16]
5947     punpcklbw  xmm1, xmm0  // UYVY
5948     punpckhbw  xmm2, xmm0
5949     movdqu     [edi], xmm1
5950     movdqu     [edi + 16], xmm2
5951     lea        edi, [edi + 32]
5952     sub        ecx, 16
5953     jg         convertloop
5954 
5955     pop        edi
5956     pop        esi
5957     ret
5958   }
5959 }
5960 
5961 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
5962 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5963                                               uint8* dst_argb,
5964                                               const float* poly,
5965                                               int width) {
5966   __asm {
5967     push       esi
5968     mov        eax, [esp + 4 + 4] /* src_argb */
5969     mov        edx, [esp + 4 + 8] /* dst_argb */
5970     mov        esi, [esp + 4 + 12] /* poly */
5971     mov        ecx, [esp + 4 + 16] /* width */
5972     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
5973 
5974     // 2 pixel loop.
5975  convertloop:
5976         //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
5977         //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
5978     movq       xmm0, qword ptr [eax]  // BGRABGRA
5979     lea        eax, [eax + 8]
5980     punpcklbw  xmm0, xmm3
5981     movdqa     xmm4, xmm0
5982     punpcklwd  xmm0, xmm3  // pixel 0
5983     punpckhwd  xmm4, xmm3  // pixel 1
5984     cvtdq2ps   xmm0, xmm0  // 4 floats
5985     cvtdq2ps   xmm4, xmm4
5986     movdqa     xmm1, xmm0  // X
5987     movdqa     xmm5, xmm4
5988     mulps      xmm0, [esi + 16]  // C1 * X
5989     mulps      xmm4, [esi + 16]
5990     addps      xmm0, [esi]  // result = C0 + C1 * X
5991     addps      xmm4, [esi]
5992     movdqa     xmm2, xmm1
5993     movdqa     xmm6, xmm5
5994     mulps      xmm2, xmm1  // X * X
5995     mulps      xmm6, xmm5
5996     mulps      xmm1, xmm2  // X * X * X
5997     mulps      xmm5, xmm6
5998     mulps      xmm2, [esi + 32]  // C2 * X * X
5999     mulps      xmm6, [esi + 32]
6000     mulps      xmm1, [esi + 48]  // C3 * X * X * X
6001     mulps      xmm5, [esi + 48]
6002     addps      xmm0, xmm2  // result += C2 * X * X
6003     addps      xmm4, xmm6
6004     addps      xmm0, xmm1  // result += C3 * X * X * X
6005     addps      xmm4, xmm5
6006     cvttps2dq  xmm0, xmm0
6007     cvttps2dq  xmm4, xmm4
6008     packuswb   xmm0, xmm4
6009     packuswb   xmm0, xmm0
6010     movq       qword ptr [edx], xmm0
6011     lea        edx, [edx + 8]
6012     sub        ecx, 2
6013     jg         convertloop
6014     pop        esi
6015     ret
6016   }
6017 }
6018 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
6019 
6020 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6021 __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
6022                                               uint8* dst_argb,
6023                                               const float* poly,
6024                                               int width) {
6025   __asm {
6026     mov        eax, [esp + 4] /* src_argb */
6027     mov        edx, [esp + 8] /* dst_argb */
6028     mov        ecx, [esp + 12] /* poly */
6029     vbroadcastf128 ymm4, [ecx]  // C0
6030     vbroadcastf128 ymm5, [ecx + 16]  // C1
6031     vbroadcastf128 ymm6, [ecx + 32]  // C2
6032     vbroadcastf128 ymm7, [ecx + 48]  // C3
6033     mov        ecx, [esp + 16] /* width */
6034 
6035     // 2 pixel loop.
6036  convertloop:
6037     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
6038     lea         eax, [eax + 8]
6039     vcvtdq2ps   ymm0, ymm0  // X 8 floats
6040     vmulps      ymm2, ymm0, ymm0  // X * X
6041     vmulps      ymm3, ymm0, ymm7  // C3 * X
6042     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
6043     vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
6044     vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
6045     vcvttps2dq  ymm0, ymm0
6046     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
6047     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
6048     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
6049     vmovq       qword ptr [edx], xmm0
6050     lea         edx, [edx + 8]
6051     sub         ecx, 2
6052     jg          convertloop
6053     vzeroupper
6054     ret
6055   }
6056 }
6057 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
6058 
6059 #ifdef HAS_HALFFLOATROW_SSE2
6060 static float kExpBias = 1.9259299444e-34f;
6061 __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
6062                                          uint16* dst,
6063                                          float scale,
6064                                          int width) {
6065   __asm {
6066     mov        eax, [esp + 4] /* src */
6067     mov        edx, [esp + 8] /* dst */
6068     movd       xmm4, dword ptr [esp + 12] /* scale */
6069     mov        ecx, [esp + 16] /* width */
6070     mulss      xmm4, kExpBias
6071     pshufd     xmm4, xmm4, 0
6072     pxor       xmm5, xmm5
6073     sub        edx, eax
6074 
6075     // 8 pixel loop.
6076  convertloop:
6077     movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
6078     add         eax, 16
6079     movdqa      xmm3, xmm2
6080     punpcklwd   xmm2, xmm5
6081     cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
6082     punpckhwd   xmm3, xmm5
6083     cvtdq2ps    xmm3, xmm3
6084     mulps       xmm2, xmm4
6085     mulps       xmm3, xmm4
6086     psrld       xmm2, 13
6087     psrld       xmm3, 13
6088     packssdw    xmm2, xmm3
6089     movdqu      [eax + edx - 16], xmm2
6090     sub         ecx, 8
6091     jg          convertloop
6092     ret
6093   }
6094 }
6095 #endif  // HAS_HALFFLOATROW_SSE2
6096 
6097 #ifdef HAS_HALFFLOATROW_AVX2
6098 __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
6099                                          uint16* dst,
6100                                          float scale,
6101                                          int width) {
6102   __asm {
6103     mov        eax, [esp + 4] /* src */
6104     mov        edx, [esp + 8] /* dst */
6105     movd       xmm4, dword ptr [esp + 12] /* scale */
6106     mov        ecx, [esp + 16] /* width */
6107 
6108     vmulss     xmm4, xmm4, kExpBias
6109     vbroadcastss ymm4, xmm4
6110     vpxor      ymm5, ymm5, ymm5
6111     sub        edx, eax
6112 
6113     // 16 pixel loop.
6114  convertloop:
6115     vmovdqu     ymm2, [eax]  // 16 shorts
6116     add         eax, 32
6117     vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
6118     vpunpcklwd  ymm2, ymm2, ymm5
6119     vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
6120     vcvtdq2ps   ymm2, ymm2
6121     vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
6122     vmulps      ymm2, ymm2, ymm4
6123     vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
6124     vpsrld      ymm2, ymm2, 13
6125     vpackssdw   ymm2, ymm2, ymm3
6126     vmovdqu     [eax + edx - 32], ymm2
6127     sub         ecx, 16
6128     jg          convertloop
6129     vzeroupper
6130     ret
6131   }
6132 }
6133 #endif  // HAS_HALFFLOATROW_AVX2
6134 
6135 #ifdef HAS_HALFFLOATROW_F16C
6136 __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
6137                                          uint16* dst,
6138                                          float scale,
6139                                          int width) {
6140   __asm {
6141     mov        eax, [esp + 4] /* src */
6142     mov        edx, [esp + 8] /* dst */
6143     vbroadcastss ymm4, [esp + 12] /* scale */
6144     mov        ecx, [esp + 16] /* width */
6145     sub        edx, eax
6146 
6147     // 16 pixel loop.
6148  convertloop:
6149     vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
6150     vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
6151     add         eax, 32
6152     vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
6153     vcvtdq2ps   ymm3, ymm3
6154     vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
6155     vmulps      ymm3, ymm3, ymm4
6156     vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
6157     vcvtps2ph   xmm3, ymm3, 3
6158     vmovdqu     [eax + edx + 32], xmm2
6159     vmovdqu     [eax + edx + 32 + 16], xmm3
6160     sub         ecx, 16
6161     jg          convertloop
6162     vzeroupper
6163     ret
6164   }
6165 }
6166 #endif  // HAS_HALFFLOATROW_F16C
6167 
6168 #ifdef HAS_ARGBCOLORTABLEROW_X86
6169 // Tranform ARGB pixels with color table.
6170 __declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
6171                                              const uint8* table_argb,
6172                                              int width) {
6173   __asm {
6174     push       esi
6175     mov        eax, [esp + 4 + 4] /* dst_argb */
6176     mov        esi, [esp + 4 + 8] /* table_argb */
6177     mov        ecx, [esp + 4 + 12] /* width */
6178 
6179     // 1 pixel loop.
6180   convertloop:
6181     movzx      edx, byte ptr [eax]
6182     lea        eax, [eax + 4]
6183     movzx      edx, byte ptr [esi + edx * 4]
6184     mov        byte ptr [eax - 4], dl
6185     movzx      edx, byte ptr [eax - 4 + 1]
6186     movzx      edx, byte ptr [esi + edx * 4 + 1]
6187     mov        byte ptr [eax - 4 + 1], dl
6188     movzx      edx, byte ptr [eax - 4 + 2]
6189     movzx      edx, byte ptr [esi + edx * 4 + 2]
6190     mov        byte ptr [eax - 4 + 2], dl
6191     movzx      edx, byte ptr [eax - 4 + 3]
6192     movzx      edx, byte ptr [esi + edx * 4 + 3]
6193     mov        byte ptr [eax - 4 + 3], dl
6194     dec        ecx
6195     jg         convertloop
6196     pop        esi
6197     ret
6198   }
6199 }
6200 #endif  // HAS_ARGBCOLORTABLEROW_X86
6201 
6202 #ifdef HAS_RGBCOLORTABLEROW_X86
6203 // Tranform RGB pixels with color table.
6204 __declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
6205                                             const uint8* table_argb,
6206                                             int width) {
6207   __asm {
6208     push       esi
6209     mov        eax, [esp + 4 + 4] /* dst_argb */
6210     mov        esi, [esp + 4 + 8] /* table_argb */
6211     mov        ecx, [esp + 4 + 12] /* width */
6212 
6213     // 1 pixel loop.
6214   convertloop:
6215     movzx      edx, byte ptr [eax]
6216     lea        eax, [eax + 4]
6217     movzx      edx, byte ptr [esi + edx * 4]
6218     mov        byte ptr [eax - 4], dl
6219     movzx      edx, byte ptr [eax - 4 + 1]
6220     movzx      edx, byte ptr [esi + edx * 4 + 1]
6221     mov        byte ptr [eax - 4 + 1], dl
6222     movzx      edx, byte ptr [eax - 4 + 2]
6223     movzx      edx, byte ptr [esi + edx * 4 + 2]
6224     mov        byte ptr [eax - 4 + 2], dl
6225     dec        ecx
6226     jg         convertloop
6227 
6228     pop        esi
6229     ret
6230   }
6231 }
6232 #endif  // HAS_RGBCOLORTABLEROW_X86
6233 
6234 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6235 // Tranform RGB pixels with luma table.
6236 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
6237                                                    uint8* dst_argb,
6238                                                    int width,
6239                                                    const uint8* luma,
6240                                                    uint32 lumacoeff) {
6241   __asm {
6242     push       esi
6243     push       edi
6244     mov        eax, [esp + 8 + 4] /* src_argb */
6245     mov        edi, [esp + 8 + 8] /* dst_argb */
6246     mov        ecx, [esp + 8 + 12] /* width */
6247     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
6248     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
6249     pshufd     xmm2, xmm2, 0
6250     pshufd     xmm3, xmm3, 0
6251     pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
6252     psllw      xmm4, 8
6253     pxor       xmm5, xmm5
6254 
6255     // 4 pixel loop.
6256   convertloop:
6257     movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
6258     pmaddubsw  xmm0, xmm3
6259     phaddw     xmm0, xmm0
6260     pand       xmm0, xmm4  // mask out low bits
6261     punpcklwd  xmm0, xmm5
6262     paddd      xmm0, xmm2  // add table base
6263     movd       esi, xmm0
6264     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6265 
6266     movzx      edx, byte ptr [eax]
6267     movzx      edx, byte ptr [esi + edx]
6268     mov        byte ptr [edi], dl
6269     movzx      edx, byte ptr [eax + 1]
6270     movzx      edx, byte ptr [esi + edx]
6271     mov        byte ptr [edi + 1], dl
6272     movzx      edx, byte ptr [eax + 2]
6273     movzx      edx, byte ptr [esi + edx]
6274     mov        byte ptr [edi + 2], dl
6275     movzx      edx, byte ptr [eax + 3]  // copy alpha.
6276     mov        byte ptr [edi + 3], dl
6277 
6278     movd       esi, xmm0
6279     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6280 
6281     movzx      edx, byte ptr [eax + 4]
6282     movzx      edx, byte ptr [esi + edx]
6283     mov        byte ptr [edi + 4], dl
6284     movzx      edx, byte ptr [eax + 5]
6285     movzx      edx, byte ptr [esi + edx]
6286     mov        byte ptr [edi + 5], dl
6287     movzx      edx, byte ptr [eax + 6]
6288     movzx      edx, byte ptr [esi + edx]
6289     mov        byte ptr [edi + 6], dl
6290     movzx      edx, byte ptr [eax + 7]  // copy alpha.
6291     mov        byte ptr [edi + 7], dl
6292 
6293     movd       esi, xmm0
6294     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6295 
6296     movzx      edx, byte ptr [eax + 8]
6297     movzx      edx, byte ptr [esi + edx]
6298     mov        byte ptr [edi + 8], dl
6299     movzx      edx, byte ptr [eax + 9]
6300     movzx      edx, byte ptr [esi + edx]
6301     mov        byte ptr [edi + 9], dl
6302     movzx      edx, byte ptr [eax + 10]
6303     movzx      edx, byte ptr [esi + edx]
6304     mov        byte ptr [edi + 10], dl
6305     movzx      edx, byte ptr [eax + 11]  // copy alpha.
6306     mov        byte ptr [edi + 11], dl
6307 
6308     movd       esi, xmm0
6309 
6310     movzx      edx, byte ptr [eax + 12]
6311     movzx      edx, byte ptr [esi + edx]
6312     mov        byte ptr [edi + 12], dl
6313     movzx      edx, byte ptr [eax + 13]
6314     movzx      edx, byte ptr [esi + edx]
6315     mov        byte ptr [edi + 13], dl
6316     movzx      edx, byte ptr [eax + 14]
6317     movzx      edx, byte ptr [esi + edx]
6318     mov        byte ptr [edi + 14], dl
6319     movzx      edx, byte ptr [eax + 15]  // copy alpha.
6320     mov        byte ptr [edi + 15], dl
6321 
6322     lea        eax, [eax + 16]
6323     lea        edi, [edi + 16]
6324     sub        ecx, 4
6325     jg         convertloop
6326 
6327     pop        edi
6328     pop        esi
6329     ret
6330   }
6331 }
6332 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6333 
6334 #endif  // defined(_M_X64)
6335 
6336 #ifdef __cplusplus
6337 }  // extern "C"
6338 }  // namespace libyuv
6339 #endif
6340 
6341 #endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6342