1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 // This module is for Visual C 32/64 bit and clangcl 32 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15     (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16 
17 #if defined(_M_X64)
18 #include <emmintrin.h>
19 #include <tmmintrin.h>  // For _mm_maddubs_epi16
20 #endif
21 
22 #ifdef __cplusplus
23 namespace libyuv {
24 extern "C" {
25 #endif
26 
27 // 64 bit
28 #if defined(_M_X64)
29 
30 // Read 4 UV from 422, upsample to 8 UV.
31 #define READYUV422                                                             \
32     xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
33     xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
34     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
35     xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
36     u_buf += 4;                                                                \
37     xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
38     xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
39     y_buf += 8;
40 
41 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
42 #define READYUVA422                                                            \
43     xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
44     xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
45     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
46     xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
47     u_buf += 4;                                                                \
48     xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
49     xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
50     y_buf += 8;                                                                \
51     xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
52     a_buf += 8;
53 
54 // Convert 8 pixels: 8 UV and 8 Y.
55 #define YUVTORGB(yuvconstants)                                                 \
56     xmm1 = _mm_loadu_si128(&xmm0);                                             \
57     xmm2 = _mm_loadu_si128(&xmm0);                                             \
58     xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
59     xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
60     xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
61     xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
62     xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
63     xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
64     xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
65     xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
66     xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
67     xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
68     xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
69     xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
70     xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
71     xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
72     xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
73     xmm2 = _mm_packus_epi16(xmm2, xmm2);
74 
75 // Store 8 ARGB values.
76 #define STOREARGB                                                              \
77     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
78     xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
79     xmm1 = _mm_loadu_si128(&xmm0);                                             \
80     xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
81     xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
82     _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
83     _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
84     dst_argb += 32;
85 
86 
87 #if defined(HAS_I422TOARGBROW_SSSE3)
I422ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)88 void I422ToARGBRow_SSSE3(const uint8* y_buf,
89                          const uint8* u_buf,
90                          const uint8* v_buf,
91                          uint8* dst_argb,
92                          const struct YuvConstants* yuvconstants,
93                          int width) {
94   __m128i xmm0, xmm1, xmm2, xmm4;
95   const __m128i xmm5 = _mm_set1_epi8(-1);
96   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
97   while (width > 0) {
98     READYUV422
99     YUVTORGB(yuvconstants)
100     STOREARGB
101     width -= 8;
102   }
103 }
104 #endif
105 
106 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
I422AlphaToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,const uint8 * a_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)107 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
108                               const uint8* u_buf,
109                               const uint8* v_buf,
110                               const uint8* a_buf,
111                               uint8* dst_argb,
112                               const struct YuvConstants* yuvconstants,
113                               int width) {
114   __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
115   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
116   while (width > 0) {
117     READYUVA422
118     YUVTORGB(yuvconstants)
119     STOREARGB
120     width -= 8;
121   }
122 }
123 #endif
124 
125 // 32 bit
126 #else  // defined(_M_X64)
127 #ifdef HAS_ARGBTOYROW_SSSE3
128 
129 // Constants for ARGB.
130 static const vec8 kARGBToY = {
131   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
132 };
133 
134 // JPeg full range.
135 static const vec8 kARGBToYJ = {
136   15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
137 };
138 
139 static const vec8 kARGBToU = {
140   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
141 };
142 
143 static const vec8 kARGBToUJ = {
144   127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
145 };
146 
147 static const vec8 kARGBToV = {
148   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
149 };
150 
151 static const vec8 kARGBToVJ = {
152   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
153 };
154 
155 // vpshufb for vphaddw + vpackuswb packed to shorts.
156 static const lvec8 kShufARGBToUV_AVX = {
157   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
158   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
159 };
160 
161 // Constants for BGRA.
162 static const vec8 kBGRAToY = {
163   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
164 };
165 
166 static const vec8 kBGRAToU = {
167   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
168 };
169 
170 static const vec8 kBGRAToV = {
171   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
172 };
173 
174 // Constants for ABGR.
175 static const vec8 kABGRToY = {
176   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
177 };
178 
179 static const vec8 kABGRToU = {
180   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
181 };
182 
183 static const vec8 kABGRToV = {
184   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
185 };
186 
187 // Constants for RGBA.
188 static const vec8 kRGBAToY = {
189   0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
190 };
191 
192 static const vec8 kRGBAToU = {
193   0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
194 };
195 
196 static const vec8 kRGBAToV = {
197   0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
198 };
199 
200 static const uvec8 kAddY16 = {
201   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
202 };
203 
204 // 7 bit fixed point 0.5.
205 static const vec16 kAddYJ64 = {
206   64, 64, 64, 64, 64, 64, 64, 64
207 };
208 
209 static const uvec8 kAddUV128 = {
210   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
211   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
212 };
213 
214 static const uvec16 kAddUVJ128 = {
215   0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
216 };
217 
218 // Shuffle table for converting RGB24 to ARGB.
219 static const uvec8 kShuffleMaskRGB24ToARGB = {
220   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
221 };
222 
223 // Shuffle table for converting RAW to ARGB.
224 static const uvec8 kShuffleMaskRAWToARGB = {
225   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
226 };
227 
228 // Shuffle table for converting RAW to RGB24.  First 8.
229 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
230   2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
231   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
232 };
233 
234 // Shuffle table for converting RAW to RGB24.  Middle 8.
235 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
236   2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
237   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
238 };
239 
240 // Shuffle table for converting RAW to RGB24.  Last 8.
241 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
242   8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
243   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
244 };
245 
246 // Shuffle table for converting ARGB to RGB24.
247 static const uvec8 kShuffleMaskARGBToRGB24 = {
248   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
249 };
250 
251 // Shuffle table for converting ARGB to RAW.
252 static const uvec8 kShuffleMaskARGBToRAW = {
253   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
254 };
255 
256 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
257 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
258   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
259 };
260 
261 // YUY2 shuf 16 Y to 32 Y.
262 static const lvec8 kShuffleYUY2Y = {
263   0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
264   0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
265 };
266 
267 // YUY2 shuf 8 UV to 16 UV.
268 static const lvec8 kShuffleYUY2UV = {
269   1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
270   1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
271 };
272 
273 // UYVY shuf 16 Y to 32 Y.
274 static const lvec8 kShuffleUYVYY = {
275   1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
276   1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
277 };
278 
279 // UYVY shuf 8 UV to 16 UV.
280 static const lvec8 kShuffleUYVYUV = {
281   0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
282   0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
283 };
284 
285 // NV21 shuf 8 VU to 16 UV.
286 static const lvec8 kShuffleNV21 = {
287   1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
288   1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
289 };
290 
291 // Duplicates gray value 3 times and fills in alpha opaque.
292 __declspec(naked)
293 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
294   __asm {
295     mov        eax, [esp + 4]        // src_y
296     mov        edx, [esp + 8]        // dst_argb
297     mov        ecx, [esp + 12]       // width
298     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
299     pslld      xmm5, 24
300 
301   convertloop:
302     movq       xmm0, qword ptr [eax]
303     lea        eax,  [eax + 8]
304     punpcklbw  xmm0, xmm0
305     movdqa     xmm1, xmm0
306     punpcklwd  xmm0, xmm0
307     punpckhwd  xmm1, xmm1
308     por        xmm0, xmm5
309     por        xmm1, xmm5
310     movdqu     [edx], xmm0
311     movdqu     [edx + 16], xmm1
312     lea        edx, [edx + 32]
313     sub        ecx, 8
314     jg         convertloop
315     ret
316   }
317 }
318 
319 #ifdef HAS_J400TOARGBROW_AVX2
320 // Duplicates gray value 3 times and fills in alpha opaque.
321 __declspec(naked)
322 void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
323   __asm {
324     mov         eax, [esp + 4]        // src_y
325     mov         edx, [esp + 8]        // dst_argb
326     mov         ecx, [esp + 12]       // width
327     vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
328     vpslld      ymm5, ymm5, 24
329 
330   convertloop:
331     vmovdqu     xmm0, [eax]
332     lea         eax,  [eax + 16]
333     vpermq      ymm0, ymm0, 0xd8
334     vpunpcklbw  ymm0, ymm0, ymm0
335     vpermq      ymm0, ymm0, 0xd8
336     vpunpckhwd  ymm1, ymm0, ymm0
337     vpunpcklwd  ymm0, ymm0, ymm0
338     vpor        ymm0, ymm0, ymm5
339     vpor        ymm1, ymm1, ymm5
340     vmovdqu     [edx], ymm0
341     vmovdqu     [edx + 32], ymm1
342     lea         edx, [edx + 64]
343     sub         ecx, 16
344     jg          convertloop
345     vzeroupper
346     ret
347   }
348 }
349 #endif  // HAS_J400TOARGBROW_AVX2
350 
351 __declspec(naked)
352 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
353   __asm {
354     mov       eax, [esp + 4]   // src_rgb24
355     mov       edx, [esp + 8]   // dst_argb
356     mov       ecx, [esp + 12]  // width
357     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
358     pslld     xmm5, 24
359     movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
360 
361  convertloop:
362     movdqu    xmm0, [eax]
363     movdqu    xmm1, [eax + 16]
364     movdqu    xmm3, [eax + 32]
365     lea       eax, [eax + 48]
366     movdqa    xmm2, xmm3
367     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
368     pshufb    xmm2, xmm4
369     por       xmm2, xmm5
370     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
371     pshufb    xmm0, xmm4
372     movdqu    [edx + 32], xmm2
373     por       xmm0, xmm5
374     pshufb    xmm1, xmm4
375     movdqu    [edx], xmm0
376     por       xmm1, xmm5
377     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
378     pshufb    xmm3, xmm4
379     movdqu    [edx + 16], xmm1
380     por       xmm3, xmm5
381     movdqu    [edx + 48], xmm3
382     lea       edx, [edx + 64]
383     sub       ecx, 16
384     jg        convertloop
385     ret
386   }
387 }
388 
389 __declspec(naked)
390 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
391                         int width) {
392   __asm {
393     mov       eax, [esp + 4]   // src_raw
394     mov       edx, [esp + 8]   // dst_argb
395     mov       ecx, [esp + 12]  // width
396     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
397     pslld     xmm5, 24
398     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
399 
400  convertloop:
401     movdqu    xmm0, [eax]
402     movdqu    xmm1, [eax + 16]
403     movdqu    xmm3, [eax + 32]
404     lea       eax, [eax + 48]
405     movdqa    xmm2, xmm3
406     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
407     pshufb    xmm2, xmm4
408     por       xmm2, xmm5
409     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
410     pshufb    xmm0, xmm4
411     movdqu    [edx + 32], xmm2
412     por       xmm0, xmm5
413     pshufb    xmm1, xmm4
414     movdqu    [edx], xmm0
415     por       xmm1, xmm5
416     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
417     pshufb    xmm3, xmm4
418     movdqu    [edx + 16], xmm1
419     por       xmm3, xmm5
420     movdqu    [edx + 48], xmm3
421     lea       edx, [edx + 64]
422     sub       ecx, 16
423     jg        convertloop
424     ret
425   }
426 }
427 
428 __declspec(naked)
429 void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
430   __asm {
431     mov       eax, [esp + 4]   // src_raw
432     mov       edx, [esp + 8]   // dst_rgb24
433     mov       ecx, [esp + 12]  // width
434     movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
435     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
436     movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
437 
438  convertloop:
439     movdqu    xmm0, [eax]
440     movdqu    xmm1, [eax + 4]
441     movdqu    xmm2, [eax + 8]
442     lea       eax, [eax + 24]
443     pshufb    xmm0, xmm3
444     pshufb    xmm1, xmm4
445     pshufb    xmm2, xmm5
446     movq      qword ptr [edx], xmm0
447     movq      qword ptr [edx + 8], xmm1
448     movq      qword ptr [edx + 16], xmm2
449     lea       edx, [edx + 24]
450     sub       ecx, 8
451     jg        convertloop
452     ret
453   }
454 }
455 
456 // pmul method to replicate bits.
457 // Math to replicate bits:
458 // (v << 8) | (v << 3)
459 // v * 256 + v * 8
460 // v * (256 + 8)
461 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
462 // 20 instructions.
463 __declspec(naked)
464 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
465                           int width) {
466   __asm {
467     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
468     movd      xmm5, eax
469     pshufd    xmm5, xmm5, 0
470     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
471     movd      xmm6, eax
472     pshufd    xmm6, xmm6, 0
473     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
474     psllw     xmm3, 11
475     pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
476     psllw     xmm4, 10
477     psrlw     xmm4, 5
478     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
479     psllw     xmm7, 8
480 
481     mov       eax, [esp + 4]   // src_rgb565
482     mov       edx, [esp + 8]   // dst_argb
483     mov       ecx, [esp + 12]  // width
484     sub       edx, eax
485     sub       edx, eax
486 
487  convertloop:
488     movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
489     movdqa    xmm1, xmm0
490     movdqa    xmm2, xmm0
491     pand      xmm1, xmm3    // R in upper 5 bits
492     psllw     xmm2, 11      // B in upper 5 bits
493     pmulhuw   xmm1, xmm5    // * (256 + 8)
494     pmulhuw   xmm2, xmm5    // * (256 + 8)
495     psllw     xmm1, 8
496     por       xmm1, xmm2    // RB
497     pand      xmm0, xmm4    // G in middle 6 bits
498     pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
499     por       xmm0, xmm7    // AG
500     movdqa    xmm2, xmm1
501     punpcklbw xmm1, xmm0
502     punpckhbw xmm2, xmm0
503     movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
504     movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
505     lea       eax, [eax + 16]
506     sub       ecx, 8
507     jg        convertloop
508     ret
509   }
510 }
511 
512 #ifdef HAS_RGB565TOARGBROW_AVX2
513 // pmul method to replicate bits.
514 // Math to replicate bits:
515 // (v << 8) | (v << 3)
516 // v * 256 + v * 8
517 // v * (256 + 8)
518 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
519 __declspec(naked)
520 void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
521                           int width) {
522   __asm {
523     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
524     vmovd      xmm5, eax
525     vbroadcastss ymm5, xmm5
526     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
527     vmovd      xmm6, eax
528     vbroadcastss ymm6, xmm6
529     vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
530     vpsllw     ymm3, ymm3, 11
531     vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
532     vpsllw     ymm4, ymm4, 10
533     vpsrlw     ymm4, ymm4, 5
534     vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
535     vpsllw     ymm7, ymm7, 8
536 
537     mov        eax, [esp + 4]   // src_rgb565
538     mov        edx, [esp + 8]   // dst_argb
539     mov        ecx, [esp + 12]  // width
540     sub        edx, eax
541     sub        edx, eax
542 
543  convertloop:
544     vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
545     vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
546     vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
547     vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
548     vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
549     vpsllw     ymm1, ymm1, 8
550     vpor       ymm1, ymm1, ymm2    // RB
551     vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
552     vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
553     vpor       ymm0, ymm0, ymm7    // AG
554     vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
555     vpermq     ymm1, ymm1, 0xd8
556     vpunpckhbw ymm2, ymm1, ymm0
557     vpunpcklbw ymm1, ymm1, ymm0
558     vmovdqu    [eax * 2 + edx], ymm1  // store 4 pixels of ARGB
559     vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 4 pixels of ARGB
560     lea       eax, [eax + 32]
561     sub       ecx, 16
562     jg        convertloop
563     vzeroupper
564     ret
565   }
566 }
567 #endif  // HAS_RGB565TOARGBROW_AVX2
568 
569 #ifdef HAS_ARGB1555TOARGBROW_AVX2
570 __declspec(naked)
571 void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
572                             int width) {
573   __asm {
574     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
575     vmovd      xmm5, eax
576     vbroadcastss ymm5, xmm5
577     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
578     vmovd      xmm6, eax
579     vbroadcastss ymm6, xmm6
580     vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
581     vpsllw     ymm3, ymm3, 11
582     vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
583     vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
584     vpsllw     ymm7, ymm7, 8
585 
586     mov        eax,  [esp + 4]   // src_argb1555
587     mov        edx,  [esp + 8]   // dst_argb
588     mov        ecx,  [esp + 12]  // width
589     sub        edx,  eax
590     sub        edx,  eax
591 
592  convertloop:
593     vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
594     vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
595     vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
596     vpand      ymm1, ymm1, ymm3
597     vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
598     vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
599     vpsllw     ymm1, ymm1, 8
600     vpor       ymm1, ymm1, ymm2    // RB
601     vpsraw     ymm2, ymm0, 8       // A
602     vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
603     vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
604     vpand      ymm2, ymm2, ymm7
605     vpor       ymm0, ymm0, ymm2    // AG
606     vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
607     vpermq     ymm1, ymm1, 0xd8
608     vpunpckhbw ymm2, ymm1, ymm0
609     vpunpcklbw ymm1, ymm1, ymm0
610     vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
611     vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
612     lea       eax, [eax + 32]
613     sub       ecx, 16
614     jg        convertloop
615     vzeroupper
616     ret
617   }
618 }
619 #endif  // HAS_ARGB1555TOARGBROW_AVX2
620 
621 #ifdef HAS_ARGB4444TOARGBROW_AVX2
622 __declspec(naked)
623 void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
624                             int width) {
625   __asm {
626     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
627     vmovd     xmm4, eax
628     vbroadcastss ymm4, xmm4
629     vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
630     mov       eax,  [esp + 4]   // src_argb4444
631     mov       edx,  [esp + 8]   // dst_argb
632     mov       ecx,  [esp + 12]  // width
633     sub       edx,  eax
634     sub       edx,  eax
635 
636  convertloop:
637     vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
638     vpand      ymm2, ymm0, ymm5    // mask high nibbles
639     vpand      ymm0, ymm0, ymm4    // mask low nibbles
640     vpsrlw     ymm3, ymm2, 4
641     vpsllw     ymm1, ymm0, 4
642     vpor       ymm2, ymm2, ymm3
643     vpor       ymm0, ymm0, ymm1
644     vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
645     vpermq     ymm2, ymm2, 0xd8
646     vpunpckhbw ymm1, ymm0, ymm2
647     vpunpcklbw ymm0, ymm0, ymm2
648     vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
649     vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
650     lea       eax, [eax + 32]
651     sub       ecx, 16
652     jg        convertloop
653     vzeroupper
654     ret
655   }
656 }
657 #endif  // HAS_ARGB4444TOARGBROW_AVX2
658 
659 // 24 instructions
660 __declspec(naked)
661 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
662                             int width) {
663   __asm {
664     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
665     movd      xmm5, eax
666     pshufd    xmm5, xmm5, 0
667     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
668     movd      xmm6, eax
669     pshufd    xmm6, xmm6, 0
670     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
671     psllw     xmm3, 11
672     movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
673     psrlw     xmm4, 6
674     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
675     psllw     xmm7, 8
676 
677     mov       eax, [esp + 4]   // src_argb1555
678     mov       edx, [esp + 8]   // dst_argb
679     mov       ecx, [esp + 12]  // width
680     sub       edx, eax
681     sub       edx, eax
682 
683  convertloop:
684     movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
685     movdqa    xmm1, xmm0
686     movdqa    xmm2, xmm0
687     psllw     xmm1, 1       // R in upper 5 bits
688     psllw     xmm2, 11      // B in upper 5 bits
689     pand      xmm1, xmm3
690     pmulhuw   xmm2, xmm5    // * (256 + 8)
691     pmulhuw   xmm1, xmm5    // * (256 + 8)
692     psllw     xmm1, 8
693     por       xmm1, xmm2    // RB
694     movdqa    xmm2, xmm0
695     pand      xmm0, xmm4    // G in middle 5 bits
696     psraw     xmm2, 8       // A
697     pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
698     pand      xmm2, xmm7
699     por       xmm0, xmm2    // AG
700     movdqa    xmm2, xmm1
701     punpcklbw xmm1, xmm0
702     punpckhbw xmm2, xmm0
703     movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
704     movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
705     lea       eax, [eax + 16]
706     sub       ecx, 8
707     jg        convertloop
708     ret
709   }
710 }
711 
712 // 18 instructions.
713 __declspec(naked)
714 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
715                             int width) {
716   __asm {
717     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
718     movd      xmm4, eax
719     pshufd    xmm4, xmm4, 0
720     movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
721     pslld     xmm5, 4
722     mov       eax, [esp + 4]   // src_argb4444
723     mov       edx, [esp + 8]   // dst_argb
724     mov       ecx, [esp + 12]  // width
725     sub       edx, eax
726     sub       edx, eax
727 
728  convertloop:
729     movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
730     movdqa    xmm2, xmm0
731     pand      xmm0, xmm4    // mask low nibbles
732     pand      xmm2, xmm5    // mask high nibbles
733     movdqa    xmm1, xmm0
734     movdqa    xmm3, xmm2
735     psllw     xmm1, 4
736     psrlw     xmm3, 4
737     por       xmm0, xmm1
738     por       xmm2, xmm3
739     movdqa    xmm1, xmm0
740     punpcklbw xmm0, xmm2
741     punpckhbw xmm1, xmm2
742     movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
743     movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
744     lea       eax, [eax + 16]
745     sub       ecx, 8
746     jg        convertloop
747     ret
748   }
749 }
750 
751 __declspec(naked)
752 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
753   __asm {
754     mov       eax, [esp + 4]   // src_argb
755     mov       edx, [esp + 8]   // dst_rgb
756     mov       ecx, [esp + 12]  // width
757     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
758 
759  convertloop:
760     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
761     movdqu    xmm1, [eax + 16]
762     movdqu    xmm2, [eax + 32]
763     movdqu    xmm3, [eax + 48]
764     lea       eax, [eax + 64]
765     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
766     pshufb    xmm1, xmm6
767     pshufb    xmm2, xmm6
768     pshufb    xmm3, xmm6
769     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
770     psrldq    xmm1, 4      // 8 bytes from 1
771     pslldq    xmm4, 12     // 4 bytes from 1 for 0
772     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
773     por       xmm0, xmm4   // 4 bytes from 1 for 0
774     pslldq    xmm5, 8      // 8 bytes from 2 for 1
775     movdqu    [edx], xmm0  // store 0
776     por       xmm1, xmm5   // 8 bytes from 2 for 1
777     psrldq    xmm2, 8      // 4 bytes from 2
778     pslldq    xmm3, 4      // 12 bytes from 3 for 2
779     por       xmm2, xmm3   // 12 bytes from 3 for 2
780     movdqu    [edx + 16], xmm1   // store 1
781     movdqu    [edx + 32], xmm2   // store 2
782     lea       edx, [edx + 48]
783     sub       ecx, 16
784     jg        convertloop
785     ret
786   }
787 }
788 
789 __declspec(naked)
790 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
791   __asm {
792     mov       eax, [esp + 4]   // src_argb
793     mov       edx, [esp + 8]   // dst_rgb
794     mov       ecx, [esp + 12]  // width
795     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
796 
797  convertloop:
798     movdqu    xmm0, [eax]   // fetch 16 pixels of argb
799     movdqu    xmm1, [eax + 16]
800     movdqu    xmm2, [eax + 32]
801     movdqu    xmm3, [eax + 48]
802     lea       eax, [eax + 64]
803     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
804     pshufb    xmm1, xmm6
805     pshufb    xmm2, xmm6
806     pshufb    xmm3, xmm6
807     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
808     psrldq    xmm1, 4      // 8 bytes from 1
809     pslldq    xmm4, 12     // 4 bytes from 1 for 0
810     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
811     por       xmm0, xmm4   // 4 bytes from 1 for 0
812     pslldq    xmm5, 8      // 8 bytes from 2 for 1
813     movdqu    [edx], xmm0  // store 0
814     por       xmm1, xmm5   // 8 bytes from 2 for 1
815     psrldq    xmm2, 8      // 4 bytes from 2
816     pslldq    xmm3, 4      // 12 bytes from 3 for 2
817     por       xmm2, xmm3   // 12 bytes from 3 for 2
818     movdqu    [edx + 16], xmm1   // store 1
819     movdqu    [edx + 32], xmm2   // store 2
820     lea       edx, [edx + 48]
821     sub       ecx, 16
822     jg        convertloop
823     ret
824   }
825 }
826 
827 __declspec(naked)
828 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
829   __asm {
830     mov       eax, [esp + 4]   // src_argb
831     mov       edx, [esp + 8]   // dst_rgb
832     mov       ecx, [esp + 12]  // width
833     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
834     psrld     xmm3, 27
835     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
836     psrld     xmm4, 26
837     pslld     xmm4, 5
838     pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
839     pslld     xmm5, 11
840 
841  convertloop:
842     movdqu    xmm0, [eax]   // fetch 4 pixels of argb
843     movdqa    xmm1, xmm0    // B
844     movdqa    xmm2, xmm0    // G
845     pslld     xmm0, 8       // R
846     psrld     xmm1, 3       // B
847     psrld     xmm2, 5       // G
848     psrad     xmm0, 16      // R
849     pand      xmm1, xmm3    // B
850     pand      xmm2, xmm4    // G
851     pand      xmm0, xmm5    // R
852     por       xmm1, xmm2    // BG
853     por       xmm0, xmm1    // BGR
854     packssdw  xmm0, xmm0
855     lea       eax, [eax + 16]
856     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
857     lea       edx, [edx + 8]
858     sub       ecx, 4
859     jg        convertloop
860     ret
861   }
862 }
863 
864 __declspec(naked)
865 void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
866                                 const uint32 dither4, int width) {
867   __asm {
868 
869     mov       eax, [esp + 4]   // src_argb
870     mov       edx, [esp + 8]   // dst_rgb
871     movd      xmm6, [esp + 12] // dither4
872     mov       ecx, [esp + 16]  // width
873     punpcklbw xmm6, xmm6       // make dither 16 bytes
874     movdqa    xmm7, xmm6
875     punpcklwd xmm6, xmm6
876     punpckhwd xmm7, xmm7
877     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
878     psrld     xmm3, 27
879     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
880     psrld     xmm4, 26
881     pslld     xmm4, 5
882     pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
883     pslld     xmm5, 11
884 
885  convertloop:
886     movdqu    xmm0, [eax]   // fetch 4 pixels of argb
887     paddusb   xmm0, xmm6    // add dither
888     movdqa    xmm1, xmm0    // B
889     movdqa    xmm2, xmm0    // G
890     pslld     xmm0, 8       // R
891     psrld     xmm1, 3       // B
892     psrld     xmm2, 5       // G
893     psrad     xmm0, 16      // R
894     pand      xmm1, xmm3    // B
895     pand      xmm2, xmm4    // G
896     pand      xmm0, xmm5    // R
897     por       xmm1, xmm2    // BG
898     por       xmm0, xmm1    // BGR
899     packssdw  xmm0, xmm0
900     lea       eax, [eax + 16]
901     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
902     lea       edx, [edx + 8]
903     sub       ecx, 4
904     jg        convertloop
905     ret
906   }
907 }
908 
909 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
910 __declspec(naked)
911 void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
912                                 const uint32 dither4, int width) {
913   __asm {
914     mov        eax, [esp + 4]      // src_argb
915     mov        edx, [esp + 8]      // dst_rgb
916     vbroadcastss xmm6, [esp + 12]  // dither4
917     mov        ecx, [esp + 16]     // width
918     vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
919     vpermq     ymm6, ymm6, 0xd8
920     vpunpcklwd ymm6, ymm6, ymm6
921     vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
922     vpsrld     ymm3, ymm3, 27
923     vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
924     vpsrld     ymm4, ymm4, 26
925     vpslld     ymm4, ymm4, 5
926     vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
927 
928  convertloop:
929     vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
930     vpaddusb   ymm0, ymm0, ymm6    // add dither
931     vpsrld     ymm2, ymm0, 5       // G
932     vpsrld     ymm1, ymm0, 3       // B
933     vpsrld     ymm0, ymm0, 8       // R
934     vpand      ymm2, ymm2, ymm4    // G
935     vpand      ymm1, ymm1, ymm3    // B
936     vpand      ymm0, ymm0, ymm5    // R
937     vpor       ymm1, ymm1, ymm2    // BG
938     vpor       ymm0, ymm0, ymm1    // BGR
939     vpackusdw  ymm0, ymm0, ymm0
940     vpermq     ymm0, ymm0, 0xd8
941     lea        eax, [eax + 32]
942     vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
943     lea        edx, [edx + 16]
944     sub        ecx, 8
945     jg         convertloop
946     vzeroupper
947     ret
948   }
949 }
950 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
951 
952 // TODO(fbarchard): Improve sign extension/packing.
953 __declspec(naked)
954 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
955   __asm {
956     mov       eax, [esp + 4]   // src_argb
957     mov       edx, [esp + 8]   // dst_rgb
958     mov       ecx, [esp + 12]  // width
959     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
960     psrld     xmm4, 27
961     movdqa    xmm5, xmm4       // generate mask 0x000003e0
962     pslld     xmm5, 5
963     movdqa    xmm6, xmm4       // generate mask 0x00007c00
964     pslld     xmm6, 10
965     pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
966     pslld     xmm7, 15
967 
968  convertloop:
969     movdqu    xmm0, [eax]   // fetch 4 pixels of argb
970     movdqa    xmm1, xmm0    // B
971     movdqa    xmm2, xmm0    // G
972     movdqa    xmm3, xmm0    // R
973     psrad     xmm0, 16      // A
974     psrld     xmm1, 3       // B
975     psrld     xmm2, 6       // G
976     psrld     xmm3, 9       // R
977     pand      xmm0, xmm7    // A
978     pand      xmm1, xmm4    // B
979     pand      xmm2, xmm5    // G
980     pand      xmm3, xmm6    // R
981     por       xmm0, xmm1    // BA
982     por       xmm2, xmm3    // GR
983     por       xmm0, xmm2    // BGRA
984     packssdw  xmm0, xmm0
985     lea       eax, [eax + 16]
986     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
987     lea       edx, [edx + 8]
988     sub       ecx, 4
989     jg        convertloop
990     ret
991   }
992 }
993 
994 __declspec(naked)
995 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
996   __asm {
997     mov       eax, [esp + 4]   // src_argb
998     mov       edx, [esp + 8]   // dst_rgb
999     mov       ecx, [esp + 12]  // width
1000     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
1001     psllw     xmm4, 12
1002     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
1003     psrlw     xmm3, 8
1004 
1005  convertloop:
1006     movdqu    xmm0, [eax]   // fetch 4 pixels of argb
1007     movdqa    xmm1, xmm0
1008     pand      xmm0, xmm3    // low nibble
1009     pand      xmm1, xmm4    // high nibble
1010     psrld     xmm0, 4
1011     psrld     xmm1, 8
1012     por       xmm0, xmm1
1013     packuswb  xmm0, xmm0
1014     lea       eax, [eax + 16]
1015     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
1016     lea       edx, [edx + 8]
1017     sub       ecx, 4
1018     jg        convertloop
1019     ret
1020   }
1021 }
1022 
1023 #ifdef HAS_ARGBTORGB565ROW_AVX2
1024 __declspec(naked)
1025 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1026   __asm {
1027     mov        eax, [esp + 4]      // src_argb
1028     mov        edx, [esp + 8]      // dst_rgb
1029     mov        ecx, [esp + 12]     // width
1030     vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
1031     vpsrld     ymm3, ymm3, 27
1032     vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
1033     vpsrld     ymm4, ymm4, 26
1034     vpslld     ymm4, ymm4, 5
1035     vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
1036 
1037  convertloop:
1038     vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1039     vpsrld     ymm2, ymm0, 5       // G
1040     vpsrld     ymm1, ymm0, 3       // B
1041     vpsrld     ymm0, ymm0, 8       // R
1042     vpand      ymm2, ymm2, ymm4    // G
1043     vpand      ymm1, ymm1, ymm3    // B
1044     vpand      ymm0, ymm0, ymm5    // R
1045     vpor       ymm1, ymm1, ymm2    // BG
1046     vpor       ymm0, ymm0, ymm1    // BGR
1047     vpackusdw  ymm0, ymm0, ymm0
1048     vpermq     ymm0, ymm0, 0xd8
1049     lea        eax, [eax + 32]
1050     vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
1051     lea        edx, [edx + 16]
1052     sub        ecx, 8
1053     jg         convertloop
1054     vzeroupper
1055     ret
1056   }
1057 }
1058 #endif  // HAS_ARGBTORGB565ROW_AVX2
1059 
1060 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
1061 __declspec(naked)
1062 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1063   __asm {
1064     mov        eax, [esp + 4]      // src_argb
1065     mov        edx, [esp + 8]      // dst_rgb
1066     mov        ecx, [esp + 12]     // width
1067     vpcmpeqb   ymm4, ymm4, ymm4
1068     vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
1069     vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
1070     vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
1071     vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
1072     vpslld     ymm7, ymm7, 15
1073 
1074  convertloop:
1075     vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1076     vpsrld     ymm3, ymm0, 9       // R
1077     vpsrld     ymm2, ymm0, 6       // G
1078     vpsrld     ymm1, ymm0, 3       // B
1079     vpsrad     ymm0, ymm0, 16      // A
1080     vpand      ymm3, ymm3, ymm6    // R
1081     vpand      ymm2, ymm2, ymm5    // G
1082     vpand      ymm1, ymm1, ymm4    // B
1083     vpand      ymm0, ymm0, ymm7    // A
1084     vpor       ymm0, ymm0, ymm1    // BA
1085     vpor       ymm2, ymm2, ymm3    // GR
1086     vpor       ymm0, ymm0, ymm2    // BGRA
1087     vpackssdw  ymm0, ymm0, ymm0
1088     vpermq     ymm0, ymm0, 0xd8
1089     lea        eax, [eax + 32]
1090     vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
1091     lea        edx, [edx + 16]
1092     sub        ecx, 8
1093     jg         convertloop
1094     vzeroupper
1095     ret
1096   }
1097 }
1098 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
1099 
1100 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
1101 __declspec(naked)
1102 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
1103   __asm {
1104     mov        eax, [esp + 4]   // src_argb
1105     mov        edx, [esp + 8]   // dst_rgb
1106     mov        ecx, [esp + 12]  // width
1107     vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
1108     vpsllw     ymm4, ymm4, 12
1109     vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
1110 
1111  convertloop:
1112     vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
1113     vpand      ymm1, ymm0, ymm4    // high nibble
1114     vpand      ymm0, ymm0, ymm3    // low nibble
1115     vpsrld     ymm1, ymm1, 8
1116     vpsrld     ymm0, ymm0, 4
1117     vpor       ymm0, ymm0, ymm1
1118     vpackuswb  ymm0, ymm0, ymm0
1119     vpermq     ymm0, ymm0, 0xd8
1120     lea        eax, [eax + 32]
1121     vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
1122     lea        edx, [edx + 16]
1123     sub        ecx, 8
1124     jg         convertloop
1125     vzeroupper
1126     ret
1127   }
1128 }
1129 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
1130 
1131 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1132 __declspec(naked)
1133 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1134   __asm {
1135     mov        eax, [esp + 4]   /* src_argb */
1136     mov        edx, [esp + 8]   /* dst_y */
1137     mov        ecx, [esp + 12]  /* width */
1138     movdqa     xmm4, xmmword ptr kARGBToY
1139     movdqa     xmm5, xmmword ptr kAddY16
1140 
1141  convertloop:
1142     movdqu     xmm0, [eax]
1143     movdqu     xmm1, [eax + 16]
1144     movdqu     xmm2, [eax + 32]
1145     movdqu     xmm3, [eax + 48]
1146     pmaddubsw  xmm0, xmm4
1147     pmaddubsw  xmm1, xmm4
1148     pmaddubsw  xmm2, xmm4
1149     pmaddubsw  xmm3, xmm4
1150     lea        eax, [eax + 64]
1151     phaddw     xmm0, xmm1
1152     phaddw     xmm2, xmm3
1153     psrlw      xmm0, 7
1154     psrlw      xmm2, 7
1155     packuswb   xmm0, xmm2
1156     paddb      xmm0, xmm5
1157     movdqu     [edx], xmm0
1158     lea        edx, [edx + 16]
1159     sub        ecx, 16
1160     jg         convertloop
1161     ret
1162   }
1163 }
1164 
1165 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1166 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1167 __declspec(naked)
1168 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1169   __asm {
1170     mov        eax, [esp + 4]   /* src_argb */
1171     mov        edx, [esp + 8]   /* dst_y */
1172     mov        ecx, [esp + 12]  /* width */
1173     movdqa     xmm4, xmmword ptr kARGBToYJ
1174     movdqa     xmm5, xmmword ptr kAddYJ64
1175 
1176  convertloop:
1177     movdqu     xmm0, [eax]
1178     movdqu     xmm1, [eax + 16]
1179     movdqu     xmm2, [eax + 32]
1180     movdqu     xmm3, [eax + 48]
1181     pmaddubsw  xmm0, xmm4
1182     pmaddubsw  xmm1, xmm4
1183     pmaddubsw  xmm2, xmm4
1184     pmaddubsw  xmm3, xmm4
1185     lea        eax, [eax + 64]
1186     phaddw     xmm0, xmm1
1187     phaddw     xmm2, xmm3
1188     paddw      xmm0, xmm5  // Add .5 for rounding.
1189     paddw      xmm2, xmm5
1190     psrlw      xmm0, 7
1191     psrlw      xmm2, 7
1192     packuswb   xmm0, xmm2
1193     movdqu     [edx], xmm0
1194     lea        edx, [edx + 16]
1195     sub        ecx, 16
1196     jg         convertloop
1197     ret
1198   }
1199 }
1200 
1201 #ifdef HAS_ARGBTOYROW_AVX2
1202 // vpermd for vphaddw + vpackuswb vpermd.
1203 static const lvec32 kPermdARGBToY_AVX = {
1204   0, 4, 1, 5, 2, 6, 3, 7
1205 };
1206 
1207 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1208 __declspec(naked)
1209 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
1210   __asm {
1211     mov        eax, [esp + 4]   /* src_argb */
1212     mov        edx, [esp + 8]   /* dst_y */
1213     mov        ecx, [esp + 12]  /* width */
1214     vbroadcastf128 ymm4, xmmword ptr kARGBToY
1215     vbroadcastf128 ymm5, xmmword ptr kAddY16
1216     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1217 
1218  convertloop:
1219     vmovdqu    ymm0, [eax]
1220     vmovdqu    ymm1, [eax + 32]
1221     vmovdqu    ymm2, [eax + 64]
1222     vmovdqu    ymm3, [eax + 96]
1223     vpmaddubsw ymm0, ymm0, ymm4
1224     vpmaddubsw ymm1, ymm1, ymm4
1225     vpmaddubsw ymm2, ymm2, ymm4
1226     vpmaddubsw ymm3, ymm3, ymm4
1227     lea        eax, [eax + 128]
1228     vphaddw    ymm0, ymm0, ymm1  // mutates.
1229     vphaddw    ymm2, ymm2, ymm3
1230     vpsrlw     ymm0, ymm0, 7
1231     vpsrlw     ymm2, ymm2, 7
1232     vpackuswb  ymm0, ymm0, ymm2  // mutates.
1233     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1234     vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
1235     vmovdqu    [edx], ymm0
1236     lea        edx, [edx + 32]
1237     sub        ecx, 32
1238     jg         convertloop
1239     vzeroupper
1240     ret
1241   }
1242 }
1243 #endif  //  HAS_ARGBTOYROW_AVX2
1244 
1245 #ifdef HAS_ARGBTOYJROW_AVX2
1246 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1247 __declspec(naked)
1248 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
1249   __asm {
1250     mov        eax, [esp + 4]   /* src_argb */
1251     mov        edx, [esp + 8]   /* dst_y */
1252     mov        ecx, [esp + 12]  /* width */
1253     vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1254     vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1255     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
1256 
1257  convertloop:
1258     vmovdqu    ymm0, [eax]
1259     vmovdqu    ymm1, [eax + 32]
1260     vmovdqu    ymm2, [eax + 64]
1261     vmovdqu    ymm3, [eax + 96]
1262     vpmaddubsw ymm0, ymm0, ymm4
1263     vpmaddubsw ymm1, ymm1, ymm4
1264     vpmaddubsw ymm2, ymm2, ymm4
1265     vpmaddubsw ymm3, ymm3, ymm4
1266     lea        eax, [eax + 128]
1267     vphaddw    ymm0, ymm0, ymm1  // mutates.
1268     vphaddw    ymm2, ymm2, ymm3
1269     vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
1270     vpaddw     ymm2, ymm2, ymm5
1271     vpsrlw     ymm0, ymm0, 7
1272     vpsrlw     ymm2, ymm2, 7
1273     vpackuswb  ymm0, ymm0, ymm2  // mutates.
1274     vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1275     vmovdqu    [edx], ymm0
1276     lea        edx, [edx + 32]
1277     sub        ecx, 32
1278     jg         convertloop
1279 
1280     vzeroupper
1281     ret
1282   }
1283 }
1284 #endif  //  HAS_ARGBTOYJROW_AVX2
1285 
1286 __declspec(naked)
1287 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1288   __asm {
1289     mov        eax, [esp + 4]   /* src_argb */
1290     mov        edx, [esp + 8]   /* dst_y */
1291     mov        ecx, [esp + 12]  /* width */
1292     movdqa     xmm4, xmmword ptr kBGRAToY
1293     movdqa     xmm5, xmmword ptr kAddY16
1294 
1295  convertloop:
1296     movdqu     xmm0, [eax]
1297     movdqu     xmm1, [eax + 16]
1298     movdqu     xmm2, [eax + 32]
1299     movdqu     xmm3, [eax + 48]
1300     pmaddubsw  xmm0, xmm4
1301     pmaddubsw  xmm1, xmm4
1302     pmaddubsw  xmm2, xmm4
1303     pmaddubsw  xmm3, xmm4
1304     lea        eax, [eax + 64]
1305     phaddw     xmm0, xmm1
1306     phaddw     xmm2, xmm3
1307     psrlw      xmm0, 7
1308     psrlw      xmm2, 7
1309     packuswb   xmm0, xmm2
1310     paddb      xmm0, xmm5
1311     movdqu     [edx], xmm0
1312     lea        edx, [edx + 16]
1313     sub        ecx, 16
1314     jg         convertloop
1315     ret
1316   }
1317 }
1318 
1319 __declspec(naked)
1320 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1321   __asm {
1322     mov        eax, [esp + 4]   /* src_argb */
1323     mov        edx, [esp + 8]   /* dst_y */
1324     mov        ecx, [esp + 12]  /* width */
1325     movdqa     xmm4, xmmword ptr kABGRToY
1326     movdqa     xmm5, xmmword ptr kAddY16
1327 
1328  convertloop:
1329     movdqu     xmm0, [eax]
1330     movdqu     xmm1, [eax + 16]
1331     movdqu     xmm2, [eax + 32]
1332     movdqu     xmm3, [eax + 48]
1333     pmaddubsw  xmm0, xmm4
1334     pmaddubsw  xmm1, xmm4
1335     pmaddubsw  xmm2, xmm4
1336     pmaddubsw  xmm3, xmm4
1337     lea        eax, [eax + 64]
1338     phaddw     xmm0, xmm1
1339     phaddw     xmm2, xmm3
1340     psrlw      xmm0, 7
1341     psrlw      xmm2, 7
1342     packuswb   xmm0, xmm2
1343     paddb      xmm0, xmm5
1344     movdqu     [edx], xmm0
1345     lea        edx, [edx + 16]
1346     sub        ecx, 16
1347     jg         convertloop
1348     ret
1349   }
1350 }
1351 
1352 __declspec(naked)
1353 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
1354   __asm {
1355     mov        eax, [esp + 4]   /* src_argb */
1356     mov        edx, [esp + 8]   /* dst_y */
1357     mov        ecx, [esp + 12]  /* width */
1358     movdqa     xmm4, xmmword ptr kRGBAToY
1359     movdqa     xmm5, xmmword ptr kAddY16
1360 
1361  convertloop:
1362     movdqu     xmm0, [eax]
1363     movdqu     xmm1, [eax + 16]
1364     movdqu     xmm2, [eax + 32]
1365     movdqu     xmm3, [eax + 48]
1366     pmaddubsw  xmm0, xmm4
1367     pmaddubsw  xmm1, xmm4
1368     pmaddubsw  xmm2, xmm4
1369     pmaddubsw  xmm3, xmm4
1370     lea        eax, [eax + 64]
1371     phaddw     xmm0, xmm1
1372     phaddw     xmm2, xmm3
1373     psrlw      xmm0, 7
1374     psrlw      xmm2, 7
1375     packuswb   xmm0, xmm2
1376     paddb      xmm0, xmm5
1377     movdqu     [edx], xmm0
1378     lea        edx, [edx + 16]
1379     sub        ecx, 16
1380     jg         convertloop
1381     ret
1382   }
1383 }
1384 
1385 __declspec(naked)
1386 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1387                        uint8* dst_u, uint8* dst_v, int width) {
1388   __asm {
1389     push       esi
1390     push       edi
1391     mov        eax, [esp + 8 + 4]   // src_argb
1392     mov        esi, [esp + 8 + 8]   // src_stride_argb
1393     mov        edx, [esp + 8 + 12]  // dst_u
1394     mov        edi, [esp + 8 + 16]  // dst_v
1395     mov        ecx, [esp + 8 + 20]  // width
1396     movdqa     xmm5, xmmword ptr kAddUV128
1397     movdqa     xmm6, xmmword ptr kARGBToV
1398     movdqa     xmm7, xmmword ptr kARGBToU
1399     sub        edi, edx             // stride from u to v
1400 
1401  convertloop:
1402     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1403     movdqu     xmm0, [eax]
1404     movdqu     xmm4, [eax + esi]
1405     pavgb      xmm0, xmm4
1406     movdqu     xmm1, [eax + 16]
1407     movdqu     xmm4, [eax + esi + 16]
1408     pavgb      xmm1, xmm4
1409     movdqu     xmm2, [eax + 32]
1410     movdqu     xmm4, [eax + esi + 32]
1411     pavgb      xmm2, xmm4
1412     movdqu     xmm3, [eax + 48]
1413     movdqu     xmm4, [eax + esi + 48]
1414     pavgb      xmm3, xmm4
1415 
1416     lea        eax,  [eax + 64]
1417     movdqa     xmm4, xmm0
1418     shufps     xmm0, xmm1, 0x88
1419     shufps     xmm4, xmm1, 0xdd
1420     pavgb      xmm0, xmm4
1421     movdqa     xmm4, xmm2
1422     shufps     xmm2, xmm3, 0x88
1423     shufps     xmm4, xmm3, 0xdd
1424     pavgb      xmm2, xmm4
1425 
1426     // step 2 - convert to U and V
1427     // from here down is very similar to Y code except
1428     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1429     movdqa     xmm1, xmm0
1430     movdqa     xmm3, xmm2
1431     pmaddubsw  xmm0, xmm7  // U
1432     pmaddubsw  xmm2, xmm7
1433     pmaddubsw  xmm1, xmm6  // V
1434     pmaddubsw  xmm3, xmm6
1435     phaddw     xmm0, xmm2
1436     phaddw     xmm1, xmm3
1437     psraw      xmm0, 8
1438     psraw      xmm1, 8
1439     packsswb   xmm0, xmm1
1440     paddb      xmm0, xmm5            // -> unsigned
1441 
1442     // step 3 - store 8 U and 8 V values
1443     movlps     qword ptr [edx], xmm0 // U
1444     movhps     qword ptr [edx + edi], xmm0 // V
1445     lea        edx, [edx + 8]
1446     sub        ecx, 16
1447     jg         convertloop
1448 
1449     pop        edi
1450     pop        esi
1451     ret
1452   }
1453 }
1454 
1455 __declspec(naked)
1456 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1457                         uint8* dst_u, uint8* dst_v, int width) {
1458   __asm {
1459     push       esi
1460     push       edi
1461     mov        eax, [esp + 8 + 4]   // src_argb
1462     mov        esi, [esp + 8 + 8]   // src_stride_argb
1463     mov        edx, [esp + 8 + 12]  // dst_u
1464     mov        edi, [esp + 8 + 16]  // dst_v
1465     mov        ecx, [esp + 8 + 20]  // width
1466     movdqa     xmm5, xmmword ptr kAddUVJ128
1467     movdqa     xmm6, xmmword ptr kARGBToVJ
1468     movdqa     xmm7, xmmword ptr kARGBToUJ
1469     sub        edi, edx             // stride from u to v
1470 
1471  convertloop:
1472     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1473     movdqu     xmm0, [eax]
1474     movdqu     xmm4, [eax + esi]
1475     pavgb      xmm0, xmm4
1476     movdqu     xmm1, [eax + 16]
1477     movdqu     xmm4, [eax + esi + 16]
1478     pavgb      xmm1, xmm4
1479     movdqu     xmm2, [eax + 32]
1480     movdqu     xmm4, [eax + esi + 32]
1481     pavgb      xmm2, xmm4
1482     movdqu     xmm3, [eax + 48]
1483     movdqu     xmm4, [eax + esi + 48]
1484     pavgb      xmm3, xmm4
1485 
1486     lea        eax,  [eax + 64]
1487     movdqa     xmm4, xmm0
1488     shufps     xmm0, xmm1, 0x88
1489     shufps     xmm4, xmm1, 0xdd
1490     pavgb      xmm0, xmm4
1491     movdqa     xmm4, xmm2
1492     shufps     xmm2, xmm3, 0x88
1493     shufps     xmm4, xmm3, 0xdd
1494     pavgb      xmm2, xmm4
1495 
1496     // step 2 - convert to U and V
1497     // from here down is very similar to Y code except
1498     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1499     movdqa     xmm1, xmm0
1500     movdqa     xmm3, xmm2
1501     pmaddubsw  xmm0, xmm7  // U
1502     pmaddubsw  xmm2, xmm7
1503     pmaddubsw  xmm1, xmm6  // V
1504     pmaddubsw  xmm3, xmm6
1505     phaddw     xmm0, xmm2
1506     phaddw     xmm1, xmm3
1507     paddw      xmm0, xmm5  // +.5 rounding -> unsigned
1508     paddw      xmm1, xmm5
1509     psraw      xmm0, 8
1510     psraw      xmm1, 8
1511     packsswb   xmm0, xmm1
1512 
1513     // step 3 - store 8 U and 8 V values
1514     movlps     qword ptr [edx], xmm0 // U
1515     movhps     qword ptr [edx + edi], xmm0 // V
1516     lea        edx, [edx + 8]
1517     sub        ecx, 16
1518     jg         convertloop
1519 
1520     pop        edi
1521     pop        esi
1522     ret
1523   }
1524 }
1525 
1526 #ifdef HAS_ARGBTOUVROW_AVX2
1527 __declspec(naked)
1528 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1529                       uint8* dst_u, uint8* dst_v, int width) {
1530   __asm {
1531     push       esi
1532     push       edi
1533     mov        eax, [esp + 8 + 4]   // src_argb
1534     mov        esi, [esp + 8 + 8]   // src_stride_argb
1535     mov        edx, [esp + 8 + 12]  // dst_u
1536     mov        edi, [esp + 8 + 16]  // dst_v
1537     mov        ecx, [esp + 8 + 20]  // width
1538     vbroadcastf128 ymm5, xmmword ptr kAddUV128
1539     vbroadcastf128 ymm6, xmmword ptr kARGBToV
1540     vbroadcastf128 ymm7, xmmword ptr kARGBToU
1541     sub        edi, edx             // stride from u to v
1542 
1543  convertloop:
1544     /* step 1 - subsample 32x2 argb pixels to 16x1 */
1545     vmovdqu    ymm0, [eax]
1546     vmovdqu    ymm1, [eax + 32]
1547     vmovdqu    ymm2, [eax + 64]
1548     vmovdqu    ymm3, [eax + 96]
1549     vpavgb     ymm0, ymm0, [eax + esi]
1550     vpavgb     ymm1, ymm1, [eax + esi + 32]
1551     vpavgb     ymm2, ymm2, [eax + esi + 64]
1552     vpavgb     ymm3, ymm3, [eax + esi + 96]
1553     lea        eax,  [eax + 128]
1554     vshufps    ymm4, ymm0, ymm1, 0x88
1555     vshufps    ymm0, ymm0, ymm1, 0xdd
1556     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1557     vshufps    ymm4, ymm2, ymm3, 0x88
1558     vshufps    ymm2, ymm2, ymm3, 0xdd
1559     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1560 
1561     // step 2 - convert to U and V
1562     // from here down is very similar to Y code except
1563     // instead of 32 different pixels, its 16 pixels of U and 16 of V
1564     vpmaddubsw ymm1, ymm0, ymm7  // U
1565     vpmaddubsw ymm3, ymm2, ymm7
1566     vpmaddubsw ymm0, ymm0, ymm6  // V
1567     vpmaddubsw ymm2, ymm2, ymm6
1568     vphaddw    ymm1, ymm1, ymm3  // mutates
1569     vphaddw    ymm0, ymm0, ymm2
1570     vpsraw     ymm1, ymm1, 8
1571     vpsraw     ymm0, ymm0, 8
1572     vpacksswb  ymm0, ymm1, ymm0  // mutates
1573     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1574     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1575     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1576 
1577     // step 3 - store 16 U and 16 V values
1578     vextractf128 [edx], ymm0, 0 // U
1579     vextractf128 [edx + edi], ymm0, 1 // V
1580     lea        edx, [edx + 16]
1581     sub        ecx, 32
1582     jg         convertloop
1583 
1584     pop        edi
1585     pop        esi
1586     vzeroupper
1587     ret
1588   }
1589 }
1590 #endif  // HAS_ARGBTOUVROW_AVX2
1591 
1592 #ifdef HAS_ARGBTOUVJROW_AVX2
1593 __declspec(naked)
1594 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1595                       uint8* dst_u, uint8* dst_v, int width) {
1596   __asm {
1597     push       esi
1598     push       edi
1599     mov        eax, [esp + 8 + 4]   // src_argb
1600     mov        esi, [esp + 8 + 8]   // src_stride_argb
1601     mov        edx, [esp + 8 + 12]  // dst_u
1602     mov        edi, [esp + 8 + 16]  // dst_v
1603     mov        ecx, [esp + 8 + 20]  // width
1604     vbroadcastf128 ymm5, xmmword ptr kAddUV128
1605     vbroadcastf128 ymm6, xmmword ptr kARGBToV
1606     vbroadcastf128 ymm7, xmmword ptr kARGBToU
1607     sub        edi, edx             // stride from u to v
1608 
1609  convertloop:
1610     /* step 1 - subsample 32x2 argb pixels to 16x1 */
1611     vmovdqu    ymm0, [eax]
1612     vmovdqu    ymm1, [eax + 32]
1613     vmovdqu    ymm2, [eax + 64]
1614     vmovdqu    ymm3, [eax + 96]
1615     vpavgb     ymm0, ymm0, [eax + esi]
1616     vpavgb     ymm1, ymm1, [eax + esi + 32]
1617     vpavgb     ymm2, ymm2, [eax + esi + 64]
1618     vpavgb     ymm3, ymm3, [eax + esi + 96]
1619     lea        eax,  [eax + 128]
1620     vshufps    ymm4, ymm0, ymm1, 0x88
1621     vshufps    ymm0, ymm0, ymm1, 0xdd
1622     vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1623     vshufps    ymm4, ymm2, ymm3, 0x88
1624     vshufps    ymm2, ymm2, ymm3, 0xdd
1625     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1626 
1627     // step 2 - convert to U and V
1628     // from here down is very similar to Y code except
1629     // instead of 32 different pixels, its 16 pixels of U and 16 of V
1630     vpmaddubsw ymm1, ymm0, ymm7  // U
1631     vpmaddubsw ymm3, ymm2, ymm7
1632     vpmaddubsw ymm0, ymm0, ymm6  // V
1633     vpmaddubsw ymm2, ymm2, ymm6
1634     vphaddw    ymm1, ymm1, ymm3  // mutates
1635     vphaddw    ymm0, ymm0, ymm2
1636     vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
1637     vpaddw     ymm0, ymm0, ymm5
1638     vpsraw     ymm1, ymm1, 8
1639     vpsraw     ymm0, ymm0, 8
1640     vpacksswb  ymm0, ymm1, ymm0  // mutates
1641     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1642     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
1643 
1644     // step 3 - store 16 U and 16 V values
1645     vextractf128 [edx], ymm0, 0 // U
1646     vextractf128 [edx + edi], ymm0, 1 // V
1647     lea        edx, [edx + 16]
1648     sub        ecx, 32
1649     jg         convertloop
1650 
1651     pop        edi
1652     pop        esi
1653     vzeroupper
1654     ret
1655   }
1656 }
1657 #endif  // HAS_ARGBTOUVJROW_AVX2
1658 
1659 __declspec(naked)
1660 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1661                           uint8* dst_u, uint8* dst_v, int width) {
1662   __asm {
1663     push       edi
1664     mov        eax, [esp + 4 + 4]   // src_argb
1665     mov        edx, [esp + 4 + 8]   // dst_u
1666     mov        edi, [esp + 4 + 12]  // dst_v
1667     mov        ecx, [esp + 4 + 16]  // width
1668     movdqa     xmm5, xmmword ptr kAddUV128
1669     movdqa     xmm6, xmmword ptr kARGBToV
1670     movdqa     xmm7, xmmword ptr kARGBToU
1671     sub        edi, edx             // stride from u to v
1672 
1673  convertloop:
1674     /* convert to U and V */
1675     movdqu     xmm0, [eax]          // U
1676     movdqu     xmm1, [eax + 16]
1677     movdqu     xmm2, [eax + 32]
1678     movdqu     xmm3, [eax + 48]
1679     pmaddubsw  xmm0, xmm7
1680     pmaddubsw  xmm1, xmm7
1681     pmaddubsw  xmm2, xmm7
1682     pmaddubsw  xmm3, xmm7
1683     phaddw     xmm0, xmm1
1684     phaddw     xmm2, xmm3
1685     psraw      xmm0, 8
1686     psraw      xmm2, 8
1687     packsswb   xmm0, xmm2
1688     paddb      xmm0, xmm5
1689     movdqu     [edx], xmm0
1690 
1691     movdqu     xmm0, [eax]          // V
1692     movdqu     xmm1, [eax + 16]
1693     movdqu     xmm2, [eax + 32]
1694     movdqu     xmm3, [eax + 48]
1695     pmaddubsw  xmm0, xmm6
1696     pmaddubsw  xmm1, xmm6
1697     pmaddubsw  xmm2, xmm6
1698     pmaddubsw  xmm3, xmm6
1699     phaddw     xmm0, xmm1
1700     phaddw     xmm2, xmm3
1701     psraw      xmm0, 8
1702     psraw      xmm2, 8
1703     packsswb   xmm0, xmm2
1704     paddb      xmm0, xmm5
1705     lea        eax,  [eax + 64]
1706     movdqu     [edx + edi], xmm0
1707     lea        edx,  [edx + 16]
1708     sub        ecx,  16
1709     jg         convertloop
1710 
1711     pop        edi
1712     ret
1713   }
1714 }
1715 
1716 __declspec(naked)
1717 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1718                        uint8* dst_u, uint8* dst_v, int width) {
1719   __asm {
1720     push       esi
1721     push       edi
1722     mov        eax, [esp + 8 + 4]   // src_argb
1723     mov        esi, [esp + 8 + 8]   // src_stride_argb
1724     mov        edx, [esp + 8 + 12]  // dst_u
1725     mov        edi, [esp + 8 + 16]  // dst_v
1726     mov        ecx, [esp + 8 + 20]  // width
1727     movdqa     xmm5, xmmword ptr kAddUV128
1728     movdqa     xmm6, xmmword ptr kBGRAToV
1729     movdqa     xmm7, xmmword ptr kBGRAToU
1730     sub        edi, edx             // stride from u to v
1731 
1732  convertloop:
1733     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1734     movdqu     xmm0, [eax]
1735     movdqu     xmm4, [eax + esi]
1736     pavgb      xmm0, xmm4
1737     movdqu     xmm1, [eax + 16]
1738     movdqu     xmm4, [eax + esi + 16]
1739     pavgb      xmm1, xmm4
1740     movdqu     xmm2, [eax + 32]
1741     movdqu     xmm4, [eax + esi + 32]
1742     pavgb      xmm2, xmm4
1743     movdqu     xmm3, [eax + 48]
1744     movdqu     xmm4, [eax + esi + 48]
1745     pavgb      xmm3, xmm4
1746 
1747     lea        eax,  [eax + 64]
1748     movdqa     xmm4, xmm0
1749     shufps     xmm0, xmm1, 0x88
1750     shufps     xmm4, xmm1, 0xdd
1751     pavgb      xmm0, xmm4
1752     movdqa     xmm4, xmm2
1753     shufps     xmm2, xmm3, 0x88
1754     shufps     xmm4, xmm3, 0xdd
1755     pavgb      xmm2, xmm4
1756 
1757     // step 2 - convert to U and V
1758     // from here down is very similar to Y code except
1759     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1760     movdqa     xmm1, xmm0
1761     movdqa     xmm3, xmm2
1762     pmaddubsw  xmm0, xmm7  // U
1763     pmaddubsw  xmm2, xmm7
1764     pmaddubsw  xmm1, xmm6  // V
1765     pmaddubsw  xmm3, xmm6
1766     phaddw     xmm0, xmm2
1767     phaddw     xmm1, xmm3
1768     psraw      xmm0, 8
1769     psraw      xmm1, 8
1770     packsswb   xmm0, xmm1
1771     paddb      xmm0, xmm5            // -> unsigned
1772 
1773     // step 3 - store 8 U and 8 V values
1774     movlps     qword ptr [edx], xmm0 // U
1775     movhps     qword ptr [edx + edi], xmm0 // V
1776     lea        edx, [edx + 8]
1777     sub        ecx, 16
1778     jg         convertloop
1779 
1780     pop        edi
1781     pop        esi
1782     ret
1783   }
1784 }
1785 
1786 __declspec(naked)
1787 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1788                        uint8* dst_u, uint8* dst_v, int width) {
1789   __asm {
1790     push       esi
1791     push       edi
1792     mov        eax, [esp + 8 + 4]   // src_argb
1793     mov        esi, [esp + 8 + 8]   // src_stride_argb
1794     mov        edx, [esp + 8 + 12]  // dst_u
1795     mov        edi, [esp + 8 + 16]  // dst_v
1796     mov        ecx, [esp + 8 + 20]  // width
1797     movdqa     xmm5, xmmword ptr kAddUV128
1798     movdqa     xmm6, xmmword ptr kABGRToV
1799     movdqa     xmm7, xmmword ptr kABGRToU
1800     sub        edi, edx             // stride from u to v
1801 
1802  convertloop:
1803     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1804     movdqu     xmm0, [eax]
1805     movdqu     xmm4, [eax + esi]
1806     pavgb      xmm0, xmm4
1807     movdqu     xmm1, [eax + 16]
1808     movdqu     xmm4, [eax + esi + 16]
1809     pavgb      xmm1, xmm4
1810     movdqu     xmm2, [eax + 32]
1811     movdqu     xmm4, [eax + esi + 32]
1812     pavgb      xmm2, xmm4
1813     movdqu     xmm3, [eax + 48]
1814     movdqu     xmm4, [eax + esi + 48]
1815     pavgb      xmm3, xmm4
1816 
1817     lea        eax,  [eax + 64]
1818     movdqa     xmm4, xmm0
1819     shufps     xmm0, xmm1, 0x88
1820     shufps     xmm4, xmm1, 0xdd
1821     pavgb      xmm0, xmm4
1822     movdqa     xmm4, xmm2
1823     shufps     xmm2, xmm3, 0x88
1824     shufps     xmm4, xmm3, 0xdd
1825     pavgb      xmm2, xmm4
1826 
1827     // step 2 - convert to U and V
1828     // from here down is very similar to Y code except
1829     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1830     movdqa     xmm1, xmm0
1831     movdqa     xmm3, xmm2
1832     pmaddubsw  xmm0, xmm7  // U
1833     pmaddubsw  xmm2, xmm7
1834     pmaddubsw  xmm1, xmm6  // V
1835     pmaddubsw  xmm3, xmm6
1836     phaddw     xmm0, xmm2
1837     phaddw     xmm1, xmm3
1838     psraw      xmm0, 8
1839     psraw      xmm1, 8
1840     packsswb   xmm0, xmm1
1841     paddb      xmm0, xmm5            // -> unsigned
1842 
1843     // step 3 - store 8 U and 8 V values
1844     movlps     qword ptr [edx], xmm0 // U
1845     movhps     qword ptr [edx + edi], xmm0 // V
1846     lea        edx, [edx + 8]
1847     sub        ecx, 16
1848     jg         convertloop
1849 
1850     pop        edi
1851     pop        esi
1852     ret
1853   }
1854 }
1855 
1856 __declspec(naked)
1857 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1858                        uint8* dst_u, uint8* dst_v, int width) {
1859   __asm {
1860     push       esi
1861     push       edi
1862     mov        eax, [esp + 8 + 4]   // src_argb
1863     mov        esi, [esp + 8 + 8]   // src_stride_argb
1864     mov        edx, [esp + 8 + 12]  // dst_u
1865     mov        edi, [esp + 8 + 16]  // dst_v
1866     mov        ecx, [esp + 8 + 20]  // width
1867     movdqa     xmm5, xmmword ptr kAddUV128
1868     movdqa     xmm6, xmmword ptr kRGBAToV
1869     movdqa     xmm7, xmmword ptr kRGBAToU
1870     sub        edi, edx             // stride from u to v
1871 
1872  convertloop:
1873     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874     movdqu     xmm0, [eax]
1875     movdqu     xmm4, [eax + esi]
1876     pavgb      xmm0, xmm4
1877     movdqu     xmm1, [eax + 16]
1878     movdqu     xmm4, [eax + esi + 16]
1879     pavgb      xmm1, xmm4
1880     movdqu     xmm2, [eax + 32]
1881     movdqu     xmm4, [eax + esi + 32]
1882     pavgb      xmm2, xmm4
1883     movdqu     xmm3, [eax + 48]
1884     movdqu     xmm4, [eax + esi + 48]
1885     pavgb      xmm3, xmm4
1886 
1887     lea        eax,  [eax + 64]
1888     movdqa     xmm4, xmm0
1889     shufps     xmm0, xmm1, 0x88
1890     shufps     xmm4, xmm1, 0xdd
1891     pavgb      xmm0, xmm4
1892     movdqa     xmm4, xmm2
1893     shufps     xmm2, xmm3, 0x88
1894     shufps     xmm4, xmm3, 0xdd
1895     pavgb      xmm2, xmm4
1896 
1897     // step 2 - convert to U and V
1898     // from here down is very similar to Y code except
1899     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1900     movdqa     xmm1, xmm0
1901     movdqa     xmm3, xmm2
1902     pmaddubsw  xmm0, xmm7  // U
1903     pmaddubsw  xmm2, xmm7
1904     pmaddubsw  xmm1, xmm6  // V
1905     pmaddubsw  xmm3, xmm6
1906     phaddw     xmm0, xmm2
1907     phaddw     xmm1, xmm3
1908     psraw      xmm0, 8
1909     psraw      xmm1, 8
1910     packsswb   xmm0, xmm1
1911     paddb      xmm0, xmm5            // -> unsigned
1912 
1913     // step 3 - store 8 U and 8 V values
1914     movlps     qword ptr [edx], xmm0 // U
1915     movhps     qword ptr [edx + edi], xmm0 // V
1916     lea        edx, [edx + 8]
1917     sub        ecx, 16
1918     jg         convertloop
1919 
1920     pop        edi
1921     pop        esi
1922     ret
1923   }
1924 }
1925 #endif  // HAS_ARGBTOYROW_SSSE3
1926 
1927 // Read 16 UV from 444
1928 #define READYUV444_AVX2 __asm {                                                \
1929     __asm vmovdqu    xmm0, [esi]                  /* U */                      \
1930     __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
1931     __asm lea        esi,  [esi + 16]                                          \
1932     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1933     __asm vpermq     ymm1, ymm1, 0xd8                                          \
1934     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1935     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1936     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1937     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1938     __asm lea        eax, [eax + 16]                                           \
1939   }
1940 
1941 // Read 8 UV from 422, upsample to 16 UV.
1942 #define READYUV422_AVX2 __asm {                                                \
1943     __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
1944     __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
1945     __asm lea        esi,  [esi + 8]                                           \
1946     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1947     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1948     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1949     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1950     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1951     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1952     __asm lea        eax, [eax + 16]                                           \
1953   }
1954 
1955 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
1956 #define READYUVA422_AVX2 __asm {                                               \
1957     __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
1958     __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
1959     __asm lea        esi,  [esi + 8]                                           \
1960     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1961     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1962     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1963     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1964     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1965     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1966     __asm lea        eax, [eax + 16]                                           \
1967     __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
1968     __asm vpermq     ymm5, ymm5, 0xd8                                          \
1969     __asm lea        ebp, [ebp + 16]                                           \
1970   }
1971 
1972 // Read 4 UV from 411, upsample to 16 UV.
1973 #define READYUV411_AVX2 __asm {                                                \
1974     __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
1975     __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
1976     __asm lea        esi,  [esi + 4]                                           \
1977     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
1978     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1979     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1980     __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
1981     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1982     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1983     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1984     __asm lea        eax, [eax + 16]                                           \
1985   }
1986 
1987 // Read 8 UV from NV12, upsample to 16 UV.
1988 #define READNV12_AVX2 __asm {                                                  \
1989     __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
1990     __asm lea        esi,  [esi + 16]                                          \
1991     __asm vpermq     ymm0, ymm0, 0xd8                                          \
1992     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
1993     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
1994     __asm vpermq     ymm4, ymm4, 0xd8                                          \
1995     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
1996     __asm lea        eax, [eax + 16]                                           \
1997   }
1998 
1999 // Read 8 UV from NV21, upsample to 16 UV.
2000 #define READNV21_AVX2 __asm {                                                  \
2001     __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
2002     __asm lea        esi,  [esi + 16]                                          \
2003     __asm vpermq     ymm0, ymm0, 0xd8                                          \
2004     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
2005     __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
2006     __asm vpermq     ymm4, ymm4, 0xd8                                          \
2007     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
2008     __asm lea        eax, [eax + 16]                                           \
2009   }
2010 
2011 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2012 #define READYUY2_AVX2 __asm {                                                  \
2013     __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
2014     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
2015     __asm vmovdqu    ymm0, [eax]          /* UV */                             \
2016     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
2017     __asm lea        eax, [eax + 32]                                           \
2018   }
2019 
2020 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2021 #define READUYVY_AVX2 __asm {                                                  \
2022     __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
2023     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
2024     __asm vmovdqu    ymm0, [eax]          /* UV */                             \
2025     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
2026     __asm lea        eax, [eax + 32]                                           \
2027   }
2028 
2029 // Convert 16 pixels: 16 UV and 16 Y.
2030 #define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
2031     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
2032     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
2033     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
2034     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
2035     __asm vpsubw     ymm2, ymm3, ymm2                                          \
2036     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
2037     __asm vpsubw     ymm1, ymm3, ymm1                                          \
2038     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
2039     __asm vpsubw     ymm0, ymm3, ymm0                                          \
2040     /* Step 2: Find Y contribution to 16 R,G,B values */                       \
2041     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
2042     __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
2043     __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
2044     __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
2045     __asm vpsraw     ymm0, ymm0, 6                                             \
2046     __asm vpsraw     ymm1, ymm1, 6                                             \
2047     __asm vpsraw     ymm2, ymm2, 6                                             \
2048     __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
2049     __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
2050     __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
2051   }
2052 
2053 // Store 16 ARGB values.
2054 #define STOREARGB_AVX2 __asm {                                                 \
2055     __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
2056     __asm vpermq     ymm0, ymm0, 0xd8                                          \
2057     __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
2058     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2059     __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
2060     __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
2061     __asm vmovdqu    0[edx], ymm1                                              \
2062     __asm vmovdqu    32[edx], ymm0                                             \
2063     __asm lea        edx,  [edx + 64]                                          \
2064   }
2065 
2066 // Store 16 RGBA values.
2067 #define STORERGBA_AVX2 __asm {                                                 \
2068     __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
2069     __asm vpermq     ymm1, ymm1, 0xd8                                          \
2070     __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
2071     __asm vpermq     ymm2, ymm2, 0xd8                                          \
2072     __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
2073     __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
2074     __asm vmovdqu    [edx], ymm0                                               \
2075     __asm vmovdqu    [edx + 32], ymm1                                          \
2076     __asm lea        edx,  [edx + 64]                                          \
2077   }
2078 
2079 #ifdef HAS_I422TOARGBROW_AVX2
2080 // 16 pixels
2081 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2082 __declspec(naked)
2083 void I422ToARGBRow_AVX2(const uint8* y_buf,
2084                         const uint8* u_buf,
2085                         const uint8* v_buf,
2086                         uint8* dst_argb,
2087                         const struct YuvConstants* yuvconstants,
2088                         int width) {
2089   __asm {
2090     push       esi
2091     push       edi
2092     push       ebx
2093     mov        eax, [esp + 12 + 4]   // Y
2094     mov        esi, [esp + 12 + 8]   // U
2095     mov        edi, [esp + 12 + 12]  // V
2096     mov        edx, [esp + 12 + 16]  // argb
2097     mov        ebx, [esp + 12 + 20]  // yuvconstants
2098     mov        ecx, [esp + 12 + 24]  // width
2099     sub        edi, esi
2100     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2101 
2102  convertloop:
2103     READYUV422_AVX2
2104     YUVTORGB_AVX2(ebx)
2105     STOREARGB_AVX2
2106 
2107     sub        ecx, 16
2108     jg         convertloop
2109 
2110     pop        ebx
2111     pop        edi
2112     pop        esi
2113     vzeroupper
2114     ret
2115   }
2116 }
2117 #endif  // HAS_I422TOARGBROW_AVX2
2118 
2119 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2120 // 16 pixels
2121 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2122 __declspec(naked)
2123 void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2124                              const uint8* u_buf,
2125                              const uint8* v_buf,
2126                              const uint8* a_buf,
2127                              uint8* dst_argb,
2128                              const struct YuvConstants* yuvconstants,
2129                              int width) {
2130   __asm {
2131     push       esi
2132     push       edi
2133     push       ebx
2134     push       ebp
2135     mov        eax, [esp + 16 + 4]   // Y
2136     mov        esi, [esp + 16 + 8]   // U
2137     mov        edi, [esp + 16 + 12]  // V
2138     mov        ebp, [esp + 16 + 16]  // A
2139     mov        edx, [esp + 16 + 20]  // argb
2140     mov        ebx, [esp + 16 + 24]  // yuvconstants
2141     mov        ecx, [esp + 16 + 28]  // width
2142     sub        edi, esi
2143 
2144  convertloop:
2145     READYUVA422_AVX2
2146     YUVTORGB_AVX2(ebx)
2147     STOREARGB_AVX2
2148 
2149     sub        ecx, 16
2150     jg         convertloop
2151 
2152     pop        ebp
2153     pop        ebx
2154     pop        edi
2155     pop        esi
2156     vzeroupper
2157     ret
2158   }
2159 }
2160 #endif  // HAS_I422ALPHATOARGBROW_AVX2
2161 
2162 #ifdef HAS_I444TOARGBROW_AVX2
2163 // 16 pixels
2164 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2165 __declspec(naked)
2166 void I444ToARGBRow_AVX2(const uint8* y_buf,
2167                         const uint8* u_buf,
2168                         const uint8* v_buf,
2169                         uint8* dst_argb,
2170                         const struct YuvConstants* yuvconstants,
2171                         int width) {
2172   __asm {
2173     push       esi
2174     push       edi
2175     push       ebx
2176     mov        eax, [esp + 12 + 4]   // Y
2177     mov        esi, [esp + 12 + 8]   // U
2178     mov        edi, [esp + 12 + 12]  // V
2179     mov        edx, [esp + 12 + 16]  // argb
2180     mov        ebx, [esp + 12 + 20]  // yuvconstants
2181     mov        ecx, [esp + 12 + 24]  // width
2182     sub        edi, esi
2183     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2184  convertloop:
2185     READYUV444_AVX2
2186     YUVTORGB_AVX2(ebx)
2187     STOREARGB_AVX2
2188 
2189     sub        ecx, 16
2190     jg         convertloop
2191 
2192     pop        ebx
2193     pop        edi
2194     pop        esi
2195     vzeroupper
2196     ret
2197   }
2198 }
2199 #endif  // HAS_I444TOARGBROW_AVX2
2200 
2201 #ifdef HAS_I411TOARGBROW_AVX2
2202 // 16 pixels
2203 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2204 __declspec(naked)
2205 void I411ToARGBRow_AVX2(const uint8* y_buf,
2206                         const uint8* u_buf,
2207                         const uint8* v_buf,
2208                         uint8* dst_argb,
2209                         const struct YuvConstants* yuvconstants,
2210                         int width) {
2211   __asm {
2212     push       esi
2213     push       edi
2214     push       ebx
2215     mov        eax, [esp + 12 + 4]   // Y
2216     mov        esi, [esp + 12 + 8]   // U
2217     mov        edi, [esp + 12 + 12]  // V
2218     mov        edx, [esp + 12 + 16]  // abgr
2219     mov        ebx, [esp + 12 + 20]  // yuvconstants
2220     mov        ecx, [esp + 12 + 24]  // width
2221     sub        edi, esi
2222     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2223 
2224  convertloop:
2225     READYUV411_AVX2
2226     YUVTORGB_AVX2(ebx)
2227     STOREARGB_AVX2
2228 
2229     sub        ecx, 16
2230     jg         convertloop
2231 
2232     pop        ebx
2233     pop        edi
2234     pop        esi
2235     vzeroupper
2236     ret
2237   }
2238 }
2239 #endif  // HAS_I411TOARGBROW_AVX2
2240 
2241 #ifdef HAS_NV12TOARGBROW_AVX2
2242 // 16 pixels.
2243 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2244 __declspec(naked)
2245 void NV12ToARGBRow_AVX2(const uint8* y_buf,
2246                         const uint8* uv_buf,
2247                         uint8* dst_argb,
2248                         const struct YuvConstants* yuvconstants,
2249                         int width) {
2250   __asm {
2251     push       esi
2252     push       ebx
2253     mov        eax, [esp + 8 + 4]   // Y
2254     mov        esi, [esp + 8 + 8]   // UV
2255     mov        edx, [esp + 8 + 12]  // argb
2256     mov        ebx, [esp + 8 + 16]  // yuvconstants
2257     mov        ecx, [esp + 8 + 20]  // width
2258     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2259 
2260  convertloop:
2261     READNV12_AVX2
2262     YUVTORGB_AVX2(ebx)
2263     STOREARGB_AVX2
2264 
2265     sub        ecx, 16
2266     jg         convertloop
2267 
2268     pop        ebx
2269     pop        esi
2270     vzeroupper
2271     ret
2272   }
2273 }
2274 #endif  // HAS_NV12TOARGBROW_AVX2
2275 
2276 #ifdef HAS_NV21TOARGBROW_AVX2
2277 // 16 pixels.
2278 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2279 __declspec(naked)
2280 void NV21ToARGBRow_AVX2(const uint8* y_buf,
2281                         const uint8* vu_buf,
2282                         uint8* dst_argb,
2283                         const struct YuvConstants* yuvconstants,
2284                         int width) {
2285   __asm {
2286     push       esi
2287     push       ebx
2288     mov        eax, [esp + 8 + 4]   // Y
2289     mov        esi, [esp + 8 + 8]   // VU
2290     mov        edx, [esp + 8 + 12]  // argb
2291     mov        ebx, [esp + 8 + 16]  // yuvconstants
2292     mov        ecx, [esp + 8 + 20]  // width
2293     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2294 
2295  convertloop:
2296     READNV21_AVX2
2297     YUVTORGB_AVX2(ebx)
2298     STOREARGB_AVX2
2299 
2300     sub        ecx, 16
2301     jg         convertloop
2302 
2303     pop        ebx
2304     pop        esi
2305     vzeroupper
2306     ret
2307   }
2308 }
2309 #endif  // HAS_NV21TOARGBROW_AVX2
2310 
2311 #ifdef HAS_YUY2TOARGBROW_AVX2
2312 // 16 pixels.
2313 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2314 __declspec(naked)
2315 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
2316                         uint8* dst_argb,
2317                         const struct YuvConstants* yuvconstants,
2318                         int width) {
2319   __asm {
2320     push       ebx
2321     mov        eax, [esp + 4 + 4]   // yuy2
2322     mov        edx, [esp + 4 + 8]   // argb
2323     mov        ebx, [esp + 4 + 12]  // yuvconstants
2324     mov        ecx, [esp + 4 + 16]  // width
2325     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2326 
2327  convertloop:
2328     READYUY2_AVX2
2329     YUVTORGB_AVX2(ebx)
2330     STOREARGB_AVX2
2331 
2332     sub        ecx, 16
2333     jg         convertloop
2334 
2335     pop        ebx
2336     vzeroupper
2337     ret
2338   }
2339 }
2340 #endif  // HAS_YUY2TOARGBROW_AVX2
2341 
2342 #ifdef HAS_UYVYTOARGBROW_AVX2
2343 // 16 pixels.
2344 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2345 __declspec(naked)
2346 void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
2347                         uint8* dst_argb,
2348                         const struct YuvConstants* yuvconstants,
2349                         int width) {
2350   __asm {
2351     push       ebx
2352     mov        eax, [esp + 4 + 4]   // uyvy
2353     mov        edx, [esp + 4 + 8]   // argb
2354     mov        ebx, [esp + 4 + 12]  // yuvconstants
2355     mov        ecx, [esp + 4 + 16]  // width
2356     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2357 
2358  convertloop:
2359     READUYVY_AVX2
2360     YUVTORGB_AVX2(ebx)
2361     STOREARGB_AVX2
2362 
2363     sub        ecx, 16
2364     jg         convertloop
2365 
2366     pop        ebx
2367     vzeroupper
2368     ret
2369   }
2370 }
2371 #endif  // HAS_UYVYTOARGBROW_AVX2
2372 
2373 #ifdef HAS_I422TORGBAROW_AVX2
2374 // 16 pixels
2375 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2376 __declspec(naked)
2377 void I422ToRGBARow_AVX2(const uint8* y_buf,
2378                         const uint8* u_buf,
2379                         const uint8* v_buf,
2380                         uint8* dst_argb,
2381                         const struct YuvConstants* yuvconstants,
2382                         int width) {
2383   __asm {
2384     push       esi
2385     push       edi
2386     push       ebx
2387     mov        eax, [esp + 12 + 4]   // Y
2388     mov        esi, [esp + 12 + 8]   // U
2389     mov        edi, [esp + 12 + 12]  // V
2390     mov        edx, [esp + 12 + 16]  // abgr
2391     mov        ebx, [esp + 12 + 20]  // yuvconstants
2392     mov        ecx, [esp + 12 + 24]  // width
2393     sub        edi, esi
2394     vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2395 
2396  convertloop:
2397     READYUV422_AVX2
2398     YUVTORGB_AVX2(ebx)
2399     STORERGBA_AVX2
2400 
2401     sub        ecx, 16
2402     jg         convertloop
2403 
2404     pop        ebx
2405     pop        edi
2406     pop        esi
2407     vzeroupper
2408     ret
2409   }
2410 }
2411 #endif  // HAS_I422TORGBAROW_AVX2
2412 
2413 #if defined(HAS_I422TOARGBROW_SSSE3)
2414 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2415 // Allows a conversion with half size scaling.
2416 
2417 // Read 8 UV from 444.
2418 #define READYUV444 __asm {                                                     \
2419     __asm movq       xmm0, qword ptr [esi] /* U */                             \
2420     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
2421     __asm lea        esi,  [esi + 8]                                           \
2422     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2423     __asm movq       xmm4, qword ptr [eax]                                     \
2424     __asm punpcklbw  xmm4, xmm4                                                \
2425     __asm lea        eax, [eax + 8]                                            \
2426   }
2427 
2428 // Read 4 UV from 422, upsample to 8 UV.
2429 #define READYUV422 __asm {                                                     \
2430     __asm movd       xmm0, [esi]          /* U */                              \
2431     __asm movd       xmm1, [esi + edi]    /* V */                              \
2432     __asm lea        esi,  [esi + 4]                                           \
2433     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2434     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2435     __asm movq       xmm4, qword ptr [eax]                                     \
2436     __asm punpcklbw  xmm4, xmm4                                                \
2437     __asm lea        eax, [eax + 8]                                            \
2438   }
2439 
2440 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
2441 #define READYUVA422 __asm {                                                    \
2442     __asm movd       xmm0, [esi]          /* U */                              \
2443     __asm movd       xmm1, [esi + edi]    /* V */                              \
2444     __asm lea        esi,  [esi + 4]                                           \
2445     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2446     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2447     __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
2448     __asm punpcklbw  xmm4, xmm4                                                \
2449     __asm lea        eax, [eax + 8]                                            \
2450     __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
2451     __asm lea        ebp, [ebp + 8]                                            \
2452   }
2453 
2454 // Read 2 UV from 411, upsample to 8 UV.
2455 // drmemory fails with memory fault if pinsrw used. libyuv bug: 525
2456 //  __asm pinsrw     xmm0, [esi], 0        /* U */
2457 //  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
2458 #define READYUV411_EBX __asm {                                                 \
2459     __asm movzx      ebx, word ptr [esi]        /* U */                        \
2460     __asm movd       xmm0, ebx                                                 \
2461     __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
2462     __asm movd       xmm1, ebx                                                 \
2463     __asm lea        esi,  [esi + 2]                                           \
2464     __asm punpcklbw  xmm0, xmm1            /* UV */                            \
2465     __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
2466     __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
2467     __asm movq       xmm4, qword ptr [eax]                                     \
2468     __asm punpcklbw  xmm4, xmm4                                                \
2469     __asm lea        eax, [eax + 8]                                            \
2470   }
2471 
2472 // Read 4 UV from NV12, upsample to 8 UV.
2473 #define READNV12 __asm {                                                       \
2474     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2475     __asm lea        esi,  [esi + 8]                                           \
2476     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2477     __asm movq       xmm4, qword ptr [eax]                                     \
2478     __asm punpcklbw  xmm4, xmm4                                                \
2479     __asm lea        eax, [eax + 8]                                            \
2480   }
2481 
2482 // Read 4 VU from NV21, upsample to 8 UV.
2483 #define READNV21 __asm {                                                       \
2484     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
2485     __asm lea        esi,  [esi + 8]                                           \
2486     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
2487     __asm movq       xmm4, qword ptr [eax]                                     \
2488     __asm punpcklbw  xmm4, xmm4                                                \
2489     __asm lea        eax, [eax + 8]                                            \
2490   }
2491 
2492 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2493 #define READYUY2 __asm {                                                       \
2494     __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
2495     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
2496     __asm movdqu     xmm0, [eax]          /* UV */                             \
2497     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
2498     __asm lea        eax, [eax + 16]                                           \
2499   }
2500 
2501 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2502 #define READUYVY __asm {                                                       \
2503     __asm movdqu     xmm4, [eax]          /* UYVY */                           \
2504     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
2505     __asm movdqu     xmm0, [eax]          /* UV */                             \
2506     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
2507     __asm lea        eax, [eax + 16]                                           \
2508   }
2509 
2510 // Convert 8 pixels: 8 UV and 8 Y.
2511 #define YUVTORGB(YuvConstants) __asm {                                         \
2512     __asm movdqa     xmm1, xmm0                                                \
2513     __asm movdqa     xmm2, xmm0                                                \
2514     __asm movdqa     xmm3, xmm0                                                \
2515     __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
2516     __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
2517     __asm psubw      xmm0, xmm1                                                \
2518     __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
2519     __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
2520     __asm psubw      xmm1, xmm2                                                \
2521     __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
2522     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
2523     __asm psubw      xmm2, xmm3                                                \
2524     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
2525     __asm paddsw     xmm0, xmm4           /* B += Y */                         \
2526     __asm paddsw     xmm1, xmm4           /* G += Y */                         \
2527     __asm paddsw     xmm2, xmm4           /* R += Y */                         \
2528     __asm psraw      xmm0, 6                                                   \
2529     __asm psraw      xmm1, 6                                                   \
2530     __asm psraw      xmm2, 6                                                   \
2531     __asm packuswb   xmm0, xmm0           /* B */                              \
2532     __asm packuswb   xmm1, xmm1           /* G */                              \
2533     __asm packuswb   xmm2, xmm2           /* R */                              \
2534   }
2535 
2536 // Store 8 ARGB values.
2537 #define STOREARGB __asm {                                                      \
2538     __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2539     __asm punpcklbw  xmm2, xmm5           /* RA */                             \
2540     __asm movdqa     xmm1, xmm0                                                \
2541     __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
2542     __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
2543     __asm movdqu     0[edx], xmm0                                              \
2544     __asm movdqu     16[edx], xmm1                                             \
2545     __asm lea        edx,  [edx + 32]                                          \
2546   }
2547 
2548 // Store 8 BGRA values.
2549 #define STOREBGRA __asm {                                                      \
2550     __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
2551     __asm punpcklbw  xmm1, xmm0           /* GB */                             \
2552     __asm punpcklbw  xmm5, xmm2           /* AR */                             \
2553     __asm movdqa     xmm0, xmm5                                                \
2554     __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
2555     __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
2556     __asm movdqu     0[edx], xmm5                                              \
2557     __asm movdqu     16[edx], xmm0                                             \
2558     __asm lea        edx,  [edx + 32]                                          \
2559   }
2560 
2561 // Store 8 RGBA values.
2562 #define STORERGBA __asm {                                                      \
2563     __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
2564     __asm punpcklbw  xmm1, xmm2           /* GR */                             \
2565     __asm punpcklbw  xmm5, xmm0           /* AB */                             \
2566     __asm movdqa     xmm0, xmm5                                                \
2567     __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
2568     __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
2569     __asm movdqu     0[edx], xmm5                                              \
2570     __asm movdqu     16[edx], xmm0                                             \
2571     __asm lea        edx,  [edx + 32]                                          \
2572   }
2573 
2574 // Store 8 RGB24 values.
2575 #define STORERGB24 __asm {                                                     \
2576     /* Weave into RRGB */                                                      \
2577     __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2578     __asm punpcklbw  xmm2, xmm2           /* RR */                             \
2579     __asm movdqa     xmm1, xmm0                                                \
2580     __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
2581     __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
2582     /* RRGB -> RGB24 */                                                        \
2583     __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
2584     __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
2585     __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
2586     __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
2587     __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
2588     __asm lea        edx,  [edx + 24]                                          \
2589   }
2590 
2591 // Store 8 RGB565 values.
2592 #define STORERGB565 __asm {                                                    \
2593     /* Weave into RRGB */                                                      \
2594     __asm punpcklbw  xmm0, xmm1           /* BG */                             \
2595     __asm punpcklbw  xmm2, xmm2           /* RR */                             \
2596     __asm movdqa     xmm1, xmm0                                                \
2597     __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
2598     __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
2599     /* RRGB -> RGB565 */                                                       \
2600     __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
2601     __asm movdqa     xmm2, xmm0    /* G */                                     \
2602     __asm pslld      xmm0, 8       /* R */                                     \
2603     __asm psrld      xmm3, 3       /* B */                                     \
2604     __asm psrld      xmm2, 5       /* G */                                     \
2605     __asm psrad      xmm0, 16      /* R */                                     \
2606     __asm pand       xmm3, xmm5    /* B */                                     \
2607     __asm pand       xmm2, xmm6    /* G */                                     \
2608     __asm pand       xmm0, xmm7    /* R */                                     \
2609     __asm por        xmm3, xmm2    /* BG */                                    \
2610     __asm por        xmm0, xmm3    /* BGR */                                   \
2611     __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
2612     __asm movdqa     xmm2, xmm1    /* G */                                     \
2613     __asm pslld      xmm1, 8       /* R */                                     \
2614     __asm psrld      xmm3, 3       /* B */                                     \
2615     __asm psrld      xmm2, 5       /* G */                                     \
2616     __asm psrad      xmm1, 16      /* R */                                     \
2617     __asm pand       xmm3, xmm5    /* B */                                     \
2618     __asm pand       xmm2, xmm6    /* G */                                     \
2619     __asm pand       xmm1, xmm7    /* R */                                     \
2620     __asm por        xmm3, xmm2    /* BG */                                    \
2621     __asm por        xmm1, xmm3    /* BGR */                                   \
2622     __asm packssdw   xmm0, xmm1                                                \
2623     __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
2624     __asm lea        edx, [edx + 16]                                           \
2625   }
2626 
2627 // 8 pixels.
2628 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2629 __declspec(naked)
2630 void I444ToARGBRow_SSSE3(const uint8* y_buf,
2631                          const uint8* u_buf,
2632                          const uint8* v_buf,
2633                          uint8* dst_argb,
2634                          const struct YuvConstants* yuvconstants,
2635                          int width) {
2636   __asm {
2637     push       esi
2638     push       edi
2639     push       ebx
2640     mov        eax, [esp + 12 + 4]   // Y
2641     mov        esi, [esp + 12 + 8]   // U
2642     mov        edi, [esp + 12 + 12]  // V
2643     mov        edx, [esp + 12 + 16]  // argb
2644     mov        ebx, [esp + 12 + 20]  // yuvconstants
2645     mov        ecx, [esp + 12 + 24]  // width
2646     sub        edi, esi
2647     pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
2648 
2649  convertloop:
2650     READYUV444
2651     YUVTORGB(ebx)
2652     STOREARGB
2653 
2654     sub        ecx, 8
2655     jg         convertloop
2656 
2657     pop        ebx
2658     pop        edi
2659     pop        esi
2660     ret
2661   }
2662 }
2663 
2664 // 8 pixels.
2665 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2666 __declspec(naked)
2667 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2668                           const uint8* u_buf,
2669                           const uint8* v_buf,
2670                           uint8* dst_rgb24,
2671                           const struct YuvConstants* yuvconstants,
2672                           int width) {
2673   __asm {
2674     push       esi
2675     push       edi
2676     push       ebx
2677     mov        eax, [esp + 12 + 4]   // Y
2678     mov        esi, [esp + 12 + 8]   // U
2679     mov        edi, [esp + 12 + 12]  // V
2680     mov        edx, [esp + 12 + 16]  // argb
2681     mov        ebx, [esp + 12 + 20]  // yuvconstants
2682     mov        ecx, [esp + 12 + 24]  // width
2683     sub        edi, esi
2684     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2685     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2686 
2687  convertloop:
2688     READYUV422
2689     YUVTORGB(ebx)
2690     STORERGB24
2691 
2692     sub        ecx, 8
2693     jg         convertloop
2694 
2695     pop        ebx
2696     pop        edi
2697     pop        esi
2698     ret
2699   }
2700 }
2701 
2702 // 8 pixels
2703 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2704 __declspec(naked)
2705 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2706                            const uint8* u_buf,
2707                            const uint8* v_buf,
2708                            uint8* rgb565_buf,
2709                            const struct YuvConstants* yuvconstants,
2710                            int width) {
2711   __asm {
2712     push       esi
2713     push       edi
2714     push       ebx
2715     mov        eax, [esp + 12 + 4]   // Y
2716     mov        esi, [esp + 12 + 8]   // U
2717     mov        edi, [esp + 12 + 12]  // V
2718     mov        edx, [esp + 12 + 16]  // argb
2719     mov        ebx, [esp + 12 + 20]  // yuvconstants
2720     mov        ecx, [esp + 12 + 24]  // width
2721     sub        edi, esi
2722     pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
2723     psrld      xmm5, 27
2724     pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
2725     psrld      xmm6, 26
2726     pslld      xmm6, 5
2727     pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
2728     pslld      xmm7, 11
2729 
2730  convertloop:
2731     READYUV422
2732     YUVTORGB(ebx)
2733     STORERGB565
2734 
2735     sub        ecx, 8
2736     jg         convertloop
2737 
2738     pop        ebx
2739     pop        edi
2740     pop        esi
2741     ret
2742   }
2743 }
2744 
2745 // 8 pixels.
2746 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2747 __declspec(naked)
2748 void I422ToARGBRow_SSSE3(const uint8* y_buf,
2749                          const uint8* u_buf,
2750                          const uint8* v_buf,
2751                          uint8* dst_argb,
2752                          const struct YuvConstants* yuvconstants,
2753                          int width) {
2754   __asm {
2755     push       esi
2756     push       edi
2757     push       ebx
2758     mov        eax, [esp + 12 + 4]   // Y
2759     mov        esi, [esp + 12 + 8]   // U
2760     mov        edi, [esp + 12 + 12]  // V
2761     mov        edx, [esp + 12 + 16]  // argb
2762     mov        ebx, [esp + 12 + 20]  // yuvconstants
2763     mov        ecx, [esp + 12 + 24]  // width
2764     sub        edi, esi
2765     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2766 
2767  convertloop:
2768     READYUV422
2769     YUVTORGB(ebx)
2770     STOREARGB
2771 
2772     sub        ecx, 8
2773     jg         convertloop
2774 
2775     pop        ebx
2776     pop        edi
2777     pop        esi
2778     ret
2779   }
2780 }
2781 
2782 // 8 pixels.
2783 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2784 __declspec(naked)
2785 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
2786                               const uint8* u_buf,
2787                               const uint8* v_buf,
2788                               const uint8* a_buf,
2789                               uint8* dst_argb,
2790                               const struct YuvConstants* yuvconstants,
2791                               int width) {
2792   __asm {
2793     push       esi
2794     push       edi
2795     push       ebx
2796     push       ebp
2797     mov        eax, [esp + 16 + 4]   // Y
2798     mov        esi, [esp + 16 + 8]   // U
2799     mov        edi, [esp + 16 + 12]  // V
2800     mov        ebp, [esp + 16 + 16]  // A
2801     mov        edx, [esp + 16 + 20]  // argb
2802     mov        ebx, [esp + 16 + 24]  // yuvconstants
2803     mov        ecx, [esp + 16 + 28]  // width
2804     sub        edi, esi
2805 
2806  convertloop:
2807     READYUVA422
2808     YUVTORGB(ebx)
2809     STOREARGB
2810 
2811     sub        ecx, 8
2812     jg         convertloop
2813 
2814     pop        ebp
2815     pop        ebx
2816     pop        edi
2817     pop        esi
2818     ret
2819   }
2820 }
2821 
2822 // 8 pixels.
2823 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2824 // Similar to I420 but duplicate UV once more.
2825 __declspec(naked)
2826 void I411ToARGBRow_SSSE3(const uint8* y_buf,
2827                          const uint8* u_buf,
2828                          const uint8* v_buf,
2829                          uint8* dst_argb,
2830                          const struct YuvConstants* yuvconstants,
2831                          int width) {
2832   __asm {
2833     push       esi
2834     push       edi
2835     push       ebx
2836     push       ebp
2837     mov        eax, [esp + 16 + 4]   // Y
2838     mov        esi, [esp + 16 + 8]   // U
2839     mov        edi, [esp + 16 + 12]  // V
2840     mov        edx, [esp + 16 + 16]  // abgr
2841     mov        ebp, [esp + 16 + 20]  // yuvconstants
2842     mov        ecx, [esp + 16 + 24]  // width
2843     sub        edi, esi
2844     pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
2845 
2846  convertloop:
2847     READYUV411_EBX
2848     YUVTORGB(ebp)
2849     STOREARGB
2850 
2851     sub        ecx, 8
2852     jg         convertloop
2853 
2854     pop        ebp
2855     pop        ebx
2856     pop        edi
2857     pop        esi
2858     ret
2859   }
2860 }
2861 
2862 // 8 pixels.
2863 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2864 __declspec(naked)
2865 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2866                          const uint8* uv_buf,
2867                          uint8* dst_argb,
2868                          const struct YuvConstants* yuvconstants,
2869                          int width) {
2870   __asm {
2871     push       esi
2872     push       ebx
2873     mov        eax, [esp + 8 + 4]   // Y
2874     mov        esi, [esp + 8 + 8]   // UV
2875     mov        edx, [esp + 8 + 12]  // argb
2876     mov        ebx, [esp + 8 + 16]  // yuvconstants
2877     mov        ecx, [esp + 8 + 20]  // width
2878     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2879 
2880  convertloop:
2881     READNV12
2882     YUVTORGB(ebx)
2883     STOREARGB
2884 
2885     sub        ecx, 8
2886     jg         convertloop
2887 
2888     pop        ebx
2889     pop        esi
2890     ret
2891   }
2892 }
2893 
2894 // 8 pixels.
2895 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2896 __declspec(naked)
2897 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2898                          const uint8* vu_buf,
2899                          uint8* dst_argb,
2900                          const struct YuvConstants* yuvconstants,
2901                          int width) {
2902   __asm {
2903     push       esi
2904     push       ebx
2905     mov        eax, [esp + 8 + 4]   // Y
2906     mov        esi, [esp + 8 + 8]   // VU
2907     mov        edx, [esp + 8 + 12]  // argb
2908     mov        ebx, [esp + 8 + 16]  // yuvconstants
2909     mov        ecx, [esp + 8 + 20]  // width
2910     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2911 
2912  convertloop:
2913     READNV21
2914     YUVTORGB(ebx)
2915     STOREARGB
2916 
2917     sub        ecx, 8
2918     jg         convertloop
2919 
2920     pop        ebx
2921     pop        esi
2922     ret
2923   }
2924 }
2925 
2926 // 8 pixels.
2927 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2928 __declspec(naked)
2929 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
2930                          uint8* dst_argb,
2931                          const struct YuvConstants* yuvconstants,
2932                          int width) {
2933   __asm {
2934     push       ebx
2935     mov        eax, [esp + 4 + 4]   // yuy2
2936     mov        edx, [esp + 4 + 8]   // argb
2937     mov        ebx, [esp + 4 + 12]  // yuvconstants
2938     mov        ecx, [esp + 4 + 16]  // width
2939     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2940 
2941  convertloop:
2942     READYUY2
2943     YUVTORGB(ebx)
2944     STOREARGB
2945 
2946     sub        ecx, 8
2947     jg         convertloop
2948 
2949     pop        ebx
2950     ret
2951   }
2952 }
2953 
2954 // 8 pixels.
2955 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2956 __declspec(naked)
2957 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
2958                          uint8* dst_argb,
2959                          const struct YuvConstants* yuvconstants,
2960                          int width) {
2961   __asm {
2962     push       ebx
2963     mov        eax, [esp + 4 + 4]   // uyvy
2964     mov        edx, [esp + 4 + 8]   // argb
2965     mov        ebx, [esp + 4 + 12]  // yuvconstants
2966     mov        ecx, [esp + 4 + 16]  // width
2967     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2968 
2969  convertloop:
2970     READUYVY
2971     YUVTORGB(ebx)
2972     STOREARGB
2973 
2974     sub        ecx, 8
2975     jg         convertloop
2976 
2977     pop        ebx
2978     ret
2979   }
2980 }
2981 
2982 __declspec(naked)
2983 void I422ToRGBARow_SSSE3(const uint8* y_buf,
2984                          const uint8* u_buf,
2985                          const uint8* v_buf,
2986                          uint8* dst_rgba,
2987                          const struct YuvConstants* yuvconstants,
2988                          int width) {
2989   __asm {
2990     push       esi
2991     push       edi
2992     push       ebx
2993     mov        eax, [esp + 12 + 4]   // Y
2994     mov        esi, [esp + 12 + 8]   // U
2995     mov        edi, [esp + 12 + 12]  // V
2996     mov        edx, [esp + 12 + 16]  // argb
2997     mov        ebx, [esp + 12 + 20]  // yuvconstants
2998     mov        ecx, [esp + 12 + 24]  // width
2999     sub        edi, esi
3000 
3001  convertloop:
3002     READYUV422
3003     YUVTORGB(ebx)
3004     STORERGBA
3005 
3006     sub        ecx, 8
3007     jg         convertloop
3008 
3009     pop        ebx
3010     pop        edi
3011     pop        esi
3012     ret
3013   }
3014 }
3015 #endif  // HAS_I422TOARGBROW_SSSE3
3016 
3017 #ifdef HAS_I400TOARGBROW_SSE2
3018 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
3019 __declspec(naked)
3020 void I400ToARGBRow_SSE2(const uint8* y_buf,
3021                         uint8* rgb_buf,
3022                         int width) {
3023   __asm {
3024     mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
3025     movd       xmm2, eax
3026     pshufd     xmm2, xmm2,0
3027     mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
3028     movd       xmm3, eax
3029     pshufd     xmm3, xmm3, 0
3030     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
3031     pslld      xmm4, 24
3032 
3033     mov        eax, [esp + 4]       // Y
3034     mov        edx, [esp + 8]       // rgb
3035     mov        ecx, [esp + 12]      // width
3036 
3037  convertloop:
3038     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3039     movq       xmm0, qword ptr [eax]
3040     lea        eax, [eax + 8]
3041     punpcklbw  xmm0, xmm0           // Y.Y
3042     pmulhuw    xmm0, xmm2
3043     psubusw    xmm0, xmm3
3044     psrlw      xmm0, 6
3045     packuswb   xmm0, xmm0           // G
3046 
3047     // Step 2: Weave into ARGB
3048     punpcklbw  xmm0, xmm0           // GG
3049     movdqa     xmm1, xmm0
3050     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
3051     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
3052     por        xmm0, xmm4
3053     por        xmm1, xmm4
3054     movdqu     [edx], xmm0
3055     movdqu     [edx + 16], xmm1
3056     lea        edx,  [edx + 32]
3057     sub        ecx, 8
3058     jg         convertloop
3059     ret
3060   }
3061 }
3062 #endif  // HAS_I400TOARGBROW_SSE2
3063 
3064 #ifdef HAS_I400TOARGBROW_AVX2
3065 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
3066 // note: vpunpcklbw mutates and vpackuswb unmutates.
3067 __declspec(naked)
3068 void I400ToARGBRow_AVX2(const uint8* y_buf,
3069                         uint8* rgb_buf,
3070                         int width) {
3071   __asm {
3072     mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
3073     vmovd      xmm2, eax
3074     vbroadcastss ymm2, xmm2
3075     mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
3076     vmovd      xmm3, eax
3077     vbroadcastss ymm3, xmm3
3078     vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
3079     vpslld     ymm4, ymm4, 24
3080 
3081     mov        eax, [esp + 4]       // Y
3082     mov        edx, [esp + 8]       // rgb
3083     mov        ecx, [esp + 12]      // width
3084 
3085  convertloop:
3086     // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
3087     vmovdqu    xmm0, [eax]
3088     lea        eax, [eax + 16]
3089     vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
3090     vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
3091     vpmulhuw   ymm0, ymm0, ymm2
3092     vpsubusw   ymm0, ymm0, ymm3
3093     vpsrlw     ymm0, ymm0, 6
3094     vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
3095 
3096     // TODO(fbarchard): Weave alpha with unpack.
3097     // Step 2: Weave into ARGB
3098     vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
3099     vpermq     ymm1, ymm1, 0xd8
3100     vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
3101     vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
3102     vpor       ymm0, ymm0, ymm4
3103     vpor       ymm1, ymm1, ymm4
3104     vmovdqu    [edx], ymm0
3105     vmovdqu    [edx + 32], ymm1
3106     lea        edx,  [edx + 64]
3107     sub        ecx, 16
3108     jg         convertloop
3109     vzeroupper
3110     ret
3111   }
3112 }
3113 #endif  // HAS_I400TOARGBROW_AVX2
3114 
3115 #ifdef HAS_MIRRORROW_SSSE3
3116 // Shuffle table for reversing the bytes.
3117 static const uvec8 kShuffleMirror = {
3118   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3119 };
3120 
3121 // TODO(fbarchard): Replace lea with -16 offset.
3122 __declspec(naked)
3123 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3124   __asm {
3125     mov       eax, [esp + 4]   // src
3126     mov       edx, [esp + 8]   // dst
3127     mov       ecx, [esp + 12]  // width
3128     movdqa    xmm5, xmmword ptr kShuffleMirror
3129 
3130  convertloop:
3131     movdqu    xmm0, [eax - 16 + ecx]
3132     pshufb    xmm0, xmm5
3133     movdqu    [edx], xmm0
3134     lea       edx, [edx + 16]
3135     sub       ecx, 16
3136     jg        convertloop
3137     ret
3138   }
3139 }
3140 #endif  // HAS_MIRRORROW_SSSE3
3141 
3142 #ifdef HAS_MIRRORROW_AVX2
3143 __declspec(naked)
3144 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3145   __asm {
3146     mov       eax, [esp + 4]   // src
3147     mov       edx, [esp + 8]   // dst
3148     mov       ecx, [esp + 12]  // width
3149     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3150 
3151  convertloop:
3152     vmovdqu   ymm0, [eax - 32 + ecx]
3153     vpshufb   ymm0, ymm0, ymm5
3154     vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
3155     vmovdqu   [edx], ymm0
3156     lea       edx, [edx + 32]
3157     sub       ecx, 32
3158     jg        convertloop
3159     vzeroupper
3160     ret
3161   }
3162 }
3163 #endif  // HAS_MIRRORROW_AVX2
3164 
3165 #ifdef HAS_MIRRORUVROW_SSSE3
3166 // Shuffle table for reversing the bytes of UV channels.
3167 static const uvec8 kShuffleMirrorUV = {
3168   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3169 };
3170 
3171 __declspec(naked)
3172 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3173                        int width) {
3174   __asm {
3175     push      edi
3176     mov       eax, [esp + 4 + 4]   // src
3177     mov       edx, [esp + 4 + 8]   // dst_u
3178     mov       edi, [esp + 4 + 12]  // dst_v
3179     mov       ecx, [esp + 4 + 16]  // width
3180     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
3181     lea       eax, [eax + ecx * 2 - 16]
3182     sub       edi, edx
3183 
3184  convertloop:
3185     movdqu    xmm0, [eax]
3186     lea       eax, [eax - 16]
3187     pshufb    xmm0, xmm1
3188     movlpd    qword ptr [edx], xmm0
3189     movhpd    qword ptr [edx + edi], xmm0
3190     lea       edx, [edx + 8]
3191     sub       ecx, 8
3192     jg        convertloop
3193 
3194     pop       edi
3195     ret
3196   }
3197 }
3198 #endif  // HAS_MIRRORUVROW_SSSE3
3199 
3200 #ifdef HAS_ARGBMIRRORROW_SSE2
3201 __declspec(naked)
3202 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3203   __asm {
3204     mov       eax, [esp + 4]   // src
3205     mov       edx, [esp + 8]   // dst
3206     mov       ecx, [esp + 12]  // width
3207     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3208 
3209  convertloop:
3210     movdqu    xmm0, [eax]
3211     lea       eax, [eax - 16]
3212     pshufd    xmm0, xmm0, 0x1b
3213     movdqu    [edx], xmm0
3214     lea       edx, [edx + 16]
3215     sub       ecx, 4
3216     jg        convertloop
3217     ret
3218   }
3219 }
3220 #endif  // HAS_ARGBMIRRORROW_SSE2
3221 
3222 #ifdef HAS_ARGBMIRRORROW_AVX2
3223 // Shuffle table for reversing the bytes.
3224 static const ulvec32 kARGBShuffleMirror_AVX2 = {
3225   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3226 };
3227 
3228 __declspec(naked)
3229 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3230   __asm {
3231     mov       eax, [esp + 4]   // src
3232     mov       edx, [esp + 8]   // dst
3233     mov       ecx, [esp + 12]  // width
3234     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3235 
3236  convertloop:
3237     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3238     vmovdqu   [edx], ymm0
3239     lea       edx, [edx + 32]
3240     sub       ecx, 8
3241     jg        convertloop
3242     vzeroupper
3243     ret
3244   }
3245 }
3246 #endif  // HAS_ARGBMIRRORROW_AVX2
3247 
3248 #ifdef HAS_SPLITUVROW_SSE2
3249 __declspec(naked)
3250 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3251                      int width) {
3252   __asm {
3253     push       edi
3254     mov        eax, [esp + 4 + 4]    // src_uv
3255     mov        edx, [esp + 4 + 8]    // dst_u
3256     mov        edi, [esp + 4 + 12]   // dst_v
3257     mov        ecx, [esp + 4 + 16]   // width
3258     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3259     psrlw      xmm5, 8
3260     sub        edi, edx
3261 
3262   convertloop:
3263     movdqu     xmm0, [eax]
3264     movdqu     xmm1, [eax + 16]
3265     lea        eax,  [eax + 32]
3266     movdqa     xmm2, xmm0
3267     movdqa     xmm3, xmm1
3268     pand       xmm0, xmm5   // even bytes
3269     pand       xmm1, xmm5
3270     packuswb   xmm0, xmm1
3271     psrlw      xmm2, 8      // odd bytes
3272     psrlw      xmm3, 8
3273     packuswb   xmm2, xmm3
3274     movdqu     [edx], xmm0
3275     movdqu     [edx + edi], xmm2
3276     lea        edx, [edx + 16]
3277     sub        ecx, 16
3278     jg         convertloop
3279 
3280     pop        edi
3281     ret
3282   }
3283 }
3284 
3285 #endif  // HAS_SPLITUVROW_SSE2
3286 
3287 #ifdef HAS_SPLITUVROW_AVX2
3288 __declspec(naked)
3289 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3290                      int width) {
3291   __asm {
3292     push       edi
3293     mov        eax, [esp + 4 + 4]    // src_uv
3294     mov        edx, [esp + 4 + 8]    // dst_u
3295     mov        edi, [esp + 4 + 12]   // dst_v
3296     mov        ecx, [esp + 4 + 16]   // width
3297     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3298     vpsrlw     ymm5, ymm5, 8
3299     sub        edi, edx
3300 
3301   convertloop:
3302     vmovdqu    ymm0, [eax]
3303     vmovdqu    ymm1, [eax + 32]
3304     lea        eax,  [eax + 64]
3305     vpsrlw     ymm2, ymm0, 8      // odd bytes
3306     vpsrlw     ymm3, ymm1, 8
3307     vpand      ymm0, ymm0, ymm5   // even bytes
3308     vpand      ymm1, ymm1, ymm5
3309     vpackuswb  ymm0, ymm0, ymm1
3310     vpackuswb  ymm2, ymm2, ymm3
3311     vpermq     ymm0, ymm0, 0xd8
3312     vpermq     ymm2, ymm2, 0xd8
3313     vmovdqu    [edx], ymm0
3314     vmovdqu    [edx + edi], ymm2
3315     lea        edx, [edx + 32]
3316     sub        ecx, 32
3317     jg         convertloop
3318 
3319     pop        edi
3320     vzeroupper
3321     ret
3322   }
3323 }
3324 #endif  // HAS_SPLITUVROW_AVX2
3325 
3326 #ifdef HAS_MERGEUVROW_SSE2
3327 __declspec(naked)
3328 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3329                      int width) {
3330   __asm {
3331     push       edi
3332     mov        eax, [esp + 4 + 4]    // src_u
3333     mov        edx, [esp + 4 + 8]    // src_v
3334     mov        edi, [esp + 4 + 12]   // dst_uv
3335     mov        ecx, [esp + 4 + 16]   // width
3336     sub        edx, eax
3337 
3338   convertloop:
3339     movdqu     xmm0, [eax]      // read 16 U's
3340     movdqu     xmm1, [eax + edx]  // and 16 V's
3341     lea        eax,  [eax + 16]
3342     movdqa     xmm2, xmm0
3343     punpcklbw  xmm0, xmm1       // first 8 UV pairs
3344     punpckhbw  xmm2, xmm1       // next 8 UV pairs
3345     movdqu     [edi], xmm0
3346     movdqu     [edi + 16], xmm2
3347     lea        edi, [edi + 32]
3348     sub        ecx, 16
3349     jg         convertloop
3350 
3351     pop        edi
3352     ret
3353   }
3354 }
3355 #endif  //  HAS_MERGEUVROW_SSE2
3356 
3357 #ifdef HAS_MERGEUVROW_AVX2
3358 __declspec(naked)
3359 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3360                      int width) {
3361   __asm {
3362     push       edi
3363     mov        eax, [esp + 4 + 4]    // src_u
3364     mov        edx, [esp + 4 + 8]    // src_v
3365     mov        edi, [esp + 4 + 12]   // dst_uv
3366     mov        ecx, [esp + 4 + 16]   // width
3367     sub        edx, eax
3368 
3369   convertloop:
3370     vmovdqu    ymm0, [eax]           // read 32 U's
3371     vmovdqu    ymm1, [eax + edx]     // and 32 V's
3372     lea        eax,  [eax + 32]
3373     vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
3374     vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
3375     vextractf128 [edi], ymm2, 0       // bytes 0..15
3376     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
3377     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
3378     vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
3379     lea        edi, [edi + 64]
3380     sub        ecx, 32
3381     jg         convertloop
3382 
3383     pop        edi
3384     vzeroupper
3385     ret
3386   }
3387 }
3388 #endif  //  HAS_MERGEUVROW_AVX2
3389 
3390 #ifdef HAS_COPYROW_SSE2
3391 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
3392 __declspec(naked)
3393 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3394   __asm {
3395     mov        eax, [esp + 4]   // src
3396     mov        edx, [esp + 8]   // dst
3397     mov        ecx, [esp + 12]  // count
3398     test       eax, 15
3399     jne        convertloopu
3400     test       edx, 15
3401     jne        convertloopu
3402 
3403   convertloopa:
3404     movdqa     xmm0, [eax]
3405     movdqa     xmm1, [eax + 16]
3406     lea        eax, [eax + 32]
3407     movdqa     [edx], xmm0
3408     movdqa     [edx + 16], xmm1
3409     lea        edx, [edx + 32]
3410     sub        ecx, 32
3411     jg         convertloopa
3412     ret
3413 
3414   convertloopu:
3415     movdqu     xmm0, [eax]
3416     movdqu     xmm1, [eax + 16]
3417     lea        eax, [eax + 32]
3418     movdqu     [edx], xmm0
3419     movdqu     [edx + 16], xmm1
3420     lea        edx, [edx + 32]
3421     sub        ecx, 32
3422     jg         convertloopu
3423     ret
3424   }
3425 }
3426 #endif  // HAS_COPYROW_SSE2
3427 
3428 #ifdef HAS_COPYROW_AVX
3429 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
3430 __declspec(naked)
3431 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3432   __asm {
3433     mov        eax, [esp + 4]   // src
3434     mov        edx, [esp + 8]   // dst
3435     mov        ecx, [esp + 12]  // count
3436 
3437   convertloop:
3438     vmovdqu    ymm0, [eax]
3439     vmovdqu    ymm1, [eax + 32]
3440     lea        eax, [eax + 64]
3441     vmovdqu    [edx], ymm0
3442     vmovdqu    [edx + 32], ymm1
3443     lea        edx, [edx + 64]
3444     sub        ecx, 64
3445     jg         convertloop
3446 
3447     vzeroupper
3448     ret
3449   }
3450 }
3451 #endif  // HAS_COPYROW_AVX
3452 
3453 // Multiple of 1.
3454 __declspec(naked)
3455 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3456   __asm {
3457     mov        eax, esi
3458     mov        edx, edi
3459     mov        esi, [esp + 4]   // src
3460     mov        edi, [esp + 8]   // dst
3461     mov        ecx, [esp + 12]  // count
3462     rep movsb
3463     mov        edi, edx
3464     mov        esi, eax
3465     ret
3466   }
3467 }
3468 
3469 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3470 // width in pixels
3471 __declspec(naked)
3472 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3473   __asm {
3474     mov        eax, [esp + 4]   // src
3475     mov        edx, [esp + 8]   // dst
3476     mov        ecx, [esp + 12]  // count
3477     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3478     pslld      xmm0, 24
3479     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3480     psrld      xmm1, 8
3481 
3482   convertloop:
3483     movdqu     xmm2, [eax]
3484     movdqu     xmm3, [eax + 16]
3485     lea        eax, [eax + 32]
3486     movdqu     xmm4, [edx]
3487     movdqu     xmm5, [edx + 16]
3488     pand       xmm2, xmm0
3489     pand       xmm3, xmm0
3490     pand       xmm4, xmm1
3491     pand       xmm5, xmm1
3492     por        xmm2, xmm4
3493     por        xmm3, xmm5
3494     movdqu     [edx], xmm2
3495     movdqu     [edx + 16], xmm3
3496     lea        edx, [edx + 32]
3497     sub        ecx, 8
3498     jg         convertloop
3499 
3500     ret
3501   }
3502 }
3503 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
3504 
3505 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3506 // width in pixels
3507 __declspec(naked)
3508 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3509   __asm {
3510     mov        eax, [esp + 4]   // src
3511     mov        edx, [esp + 8]   // dst
3512     mov        ecx, [esp + 12]  // count
3513     vpcmpeqb   ymm0, ymm0, ymm0
3514     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3515 
3516   convertloop:
3517     vmovdqu    ymm1, [eax]
3518     vmovdqu    ymm2, [eax + 32]
3519     lea        eax, [eax + 64]
3520     vpblendvb  ymm1, ymm1, [edx], ymm0
3521     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3522     vmovdqu    [edx], ymm1
3523     vmovdqu    [edx + 32], ymm2
3524     lea        edx, [edx + 64]
3525     sub        ecx, 16
3526     jg         convertloop
3527 
3528     vzeroupper
3529     ret
3530   }
3531 }
3532 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
3533 
3534 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3535 // width in pixels
3536 __declspec(naked)
3537 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
3538   __asm {
3539     mov        eax, [esp + 4]   // src_argb
3540     mov        edx, [esp + 8]   // dst_a
3541     mov        ecx, [esp + 12]  // width
3542 
3543   extractloop:
3544     movdqu     xmm0, [eax]
3545     movdqu     xmm1, [eax + 16]
3546     lea        eax, [eax + 32]
3547     psrld      xmm0, 24
3548     psrld      xmm1, 24
3549     packssdw   xmm0, xmm1
3550     packuswb   xmm0, xmm0
3551     movq       qword ptr [edx], xmm0
3552     lea        edx, [edx + 8]
3553     sub        ecx, 8
3554     jg         extractloop
3555 
3556     ret
3557   }
3558 }
3559 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
3560 
3561 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3562 // width in pixels
3563 __declspec(naked)
3564 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3565   __asm {
3566     mov        eax, [esp + 4]   // src
3567     mov        edx, [esp + 8]   // dst
3568     mov        ecx, [esp + 12]  // count
3569     pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3570     pslld      xmm0, 24
3571     pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3572     psrld      xmm1, 8
3573 
3574   convertloop:
3575     movq       xmm2, qword ptr [eax]  // 8 Y's
3576     lea        eax, [eax + 8]
3577     punpcklbw  xmm2, xmm2
3578     punpckhwd  xmm3, xmm2
3579     punpcklwd  xmm2, xmm2
3580     movdqu     xmm4, [edx]
3581     movdqu     xmm5, [edx + 16]
3582     pand       xmm2, xmm0
3583     pand       xmm3, xmm0
3584     pand       xmm4, xmm1
3585     pand       xmm5, xmm1
3586     por        xmm2, xmm4
3587     por        xmm3, xmm5
3588     movdqu     [edx], xmm2
3589     movdqu     [edx + 16], xmm3
3590     lea        edx, [edx + 32]
3591     sub        ecx, 8
3592     jg         convertloop
3593 
3594     ret
3595   }
3596 }
3597 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3598 
3599 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3600 // width in pixels
3601 __declspec(naked)
3602 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3603   __asm {
3604     mov        eax, [esp + 4]   // src
3605     mov        edx, [esp + 8]   // dst
3606     mov        ecx, [esp + 12]  // count
3607     vpcmpeqb   ymm0, ymm0, ymm0
3608     vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3609 
3610   convertloop:
3611     vpmovzxbd  ymm1, qword ptr [eax]
3612     vpmovzxbd  ymm2, qword ptr [eax + 8]
3613     lea        eax, [eax + 16]
3614     vpslld     ymm1, ymm1, 24
3615     vpslld     ymm2, ymm2, 24
3616     vpblendvb  ymm1, ymm1, [edx], ymm0
3617     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3618     vmovdqu    [edx], ymm1
3619     vmovdqu    [edx + 32], ymm2
3620     lea        edx, [edx + 64]
3621     sub        ecx, 16
3622     jg         convertloop
3623 
3624     vzeroupper
3625     ret
3626   }
3627 }
3628 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3629 
3630 #ifdef HAS_SETROW_X86
3631 // Write 'count' bytes using an 8 bit value repeated.
3632 // Count should be multiple of 4.
3633 __declspec(naked)
3634 void SetRow_X86(uint8* dst, uint8 v8, int count) {
3635   __asm {
3636     movzx      eax, byte ptr [esp + 8]    // v8
3637     mov        edx, 0x01010101  // Duplicate byte to all bytes.
3638     mul        edx              // overwrites edx with upper part of result.
3639     mov        edx, edi
3640     mov        edi, [esp + 4]   // dst
3641     mov        ecx, [esp + 12]  // count
3642     shr        ecx, 2
3643     rep stosd
3644     mov        edi, edx
3645     ret
3646   }
3647 }
3648 
3649 // Write 'count' bytes using an 8 bit value repeated.
3650 __declspec(naked)
3651 void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3652   __asm {
3653     mov        edx, edi
3654     mov        edi, [esp + 4]   // dst
3655     mov        eax, [esp + 8]   // v8
3656     mov        ecx, [esp + 12]  // count
3657     rep stosb
3658     mov        edi, edx
3659     ret
3660   }
3661 }
3662 
3663 // Write 'count' 32 bit values.
3664 __declspec(naked)
3665 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3666   __asm {
3667     mov        edx, edi
3668     mov        edi, [esp + 4]   // dst
3669     mov        eax, [esp + 8]   // v32
3670     mov        ecx, [esp + 12]  // count
3671     rep stosd
3672     mov        edi, edx
3673     ret
3674   }
3675 }
3676 #endif  // HAS_SETROW_X86
3677 
3678 #ifdef HAS_YUY2TOYROW_AVX2
3679 __declspec(naked)
3680 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3681   __asm {
3682     mov        eax, [esp + 4]    // src_yuy2
3683     mov        edx, [esp + 8]    // dst_y
3684     mov        ecx, [esp + 12]   // width
3685     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3686     vpsrlw     ymm5, ymm5, 8
3687 
3688   convertloop:
3689     vmovdqu    ymm0, [eax]
3690     vmovdqu    ymm1, [eax + 32]
3691     lea        eax,  [eax + 64]
3692     vpand      ymm0, ymm0, ymm5   // even bytes are Y
3693     vpand      ymm1, ymm1, ymm5
3694     vpackuswb  ymm0, ymm0, ymm1   // mutates.
3695     vpermq     ymm0, ymm0, 0xd8
3696     vmovdqu    [edx], ymm0
3697     lea        edx, [edx + 32]
3698     sub        ecx, 32
3699     jg         convertloop
3700     vzeroupper
3701     ret
3702   }
3703 }
3704 
3705 __declspec(naked)
3706 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3707                       uint8* dst_u, uint8* dst_v, int width) {
3708   __asm {
3709     push       esi
3710     push       edi
3711     mov        eax, [esp + 8 + 4]    // src_yuy2
3712     mov        esi, [esp + 8 + 8]    // stride_yuy2
3713     mov        edx, [esp + 8 + 12]   // dst_u
3714     mov        edi, [esp + 8 + 16]   // dst_v
3715     mov        ecx, [esp + 8 + 20]   // width
3716     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3717     vpsrlw     ymm5, ymm5, 8
3718     sub        edi, edx
3719 
3720   convertloop:
3721     vmovdqu    ymm0, [eax]
3722     vmovdqu    ymm1, [eax + 32]
3723     vpavgb     ymm0, ymm0, [eax + esi]
3724     vpavgb     ymm1, ymm1, [eax + esi + 32]
3725     lea        eax,  [eax + 64]
3726     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3727     vpsrlw     ymm1, ymm1, 8
3728     vpackuswb  ymm0, ymm0, ymm1   // mutates.
3729     vpermq     ymm0, ymm0, 0xd8
3730     vpand      ymm1, ymm0, ymm5  // U
3731     vpsrlw     ymm0, ymm0, 8     // V
3732     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3733     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3734     vpermq     ymm1, ymm1, 0xd8
3735     vpermq     ymm0, ymm0, 0xd8
3736     vextractf128 [edx], ymm1, 0  // U
3737     vextractf128 [edx + edi], ymm0, 0 // V
3738     lea        edx, [edx + 16]
3739     sub        ecx, 32
3740     jg         convertloop
3741 
3742     pop        edi
3743     pop        esi
3744     vzeroupper
3745     ret
3746   }
3747 }
3748 
3749 __declspec(naked)
3750 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3751                          uint8* dst_u, uint8* dst_v, int width) {
3752   __asm {
3753     push       edi
3754     mov        eax, [esp + 4 + 4]    // src_yuy2
3755     mov        edx, [esp + 4 + 8]    // dst_u
3756     mov        edi, [esp + 4 + 12]   // dst_v
3757     mov        ecx, [esp + 4 + 16]   // width
3758     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3759     vpsrlw     ymm5, ymm5, 8
3760     sub        edi, edx
3761 
3762   convertloop:
3763     vmovdqu    ymm0, [eax]
3764     vmovdqu    ymm1, [eax + 32]
3765     lea        eax,  [eax + 64]
3766     vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3767     vpsrlw     ymm1, ymm1, 8
3768     vpackuswb  ymm0, ymm0, ymm1   // mutates.
3769     vpermq     ymm0, ymm0, 0xd8
3770     vpand      ymm1, ymm0, ymm5  // U
3771     vpsrlw     ymm0, ymm0, 8     // V
3772     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3773     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3774     vpermq     ymm1, ymm1, 0xd8
3775     vpermq     ymm0, ymm0, 0xd8
3776     vextractf128 [edx], ymm1, 0  // U
3777     vextractf128 [edx + edi], ymm0, 0 // V
3778     lea        edx, [edx + 16]
3779     sub        ecx, 32
3780     jg         convertloop
3781 
3782     pop        edi
3783     vzeroupper
3784     ret
3785   }
3786 }
3787 
3788 __declspec(naked)
3789 void UYVYToYRow_AVX2(const uint8* src_uyvy,
3790                      uint8* dst_y, int width) {
3791   __asm {
3792     mov        eax, [esp + 4]    // src_uyvy
3793     mov        edx, [esp + 8]    // dst_y
3794     mov        ecx, [esp + 12]   // width
3795 
3796   convertloop:
3797     vmovdqu    ymm0, [eax]
3798     vmovdqu    ymm1, [eax + 32]
3799     lea        eax,  [eax + 64]
3800     vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
3801     vpsrlw     ymm1, ymm1, 8
3802     vpackuswb  ymm0, ymm0, ymm1   // mutates.
3803     vpermq     ymm0, ymm0, 0xd8
3804     vmovdqu    [edx], ymm0
3805     lea        edx, [edx + 32]
3806     sub        ecx, 32
3807     jg         convertloop
3808     vzeroupper
3809     ret
3810   }
3811 }
3812 
3813 __declspec(naked)
3814 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3815                       uint8* dst_u, uint8* dst_v, int width) {
3816   __asm {
3817     push       esi
3818     push       edi
3819     mov        eax, [esp + 8 + 4]    // src_yuy2
3820     mov        esi, [esp + 8 + 8]    // stride_yuy2
3821     mov        edx, [esp + 8 + 12]   // dst_u
3822     mov        edi, [esp + 8 + 16]   // dst_v
3823     mov        ecx, [esp + 8 + 20]   // width
3824     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3825     vpsrlw     ymm5, ymm5, 8
3826     sub        edi, edx
3827 
3828   convertloop:
3829     vmovdqu    ymm0, [eax]
3830     vmovdqu    ymm1, [eax + 32]
3831     vpavgb     ymm0, ymm0, [eax + esi]
3832     vpavgb     ymm1, ymm1, [eax + esi + 32]
3833     lea        eax,  [eax + 64]
3834     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
3835     vpand      ymm1, ymm1, ymm5
3836     vpackuswb  ymm0, ymm0, ymm1   // mutates.
3837     vpermq     ymm0, ymm0, 0xd8
3838     vpand      ymm1, ymm0, ymm5  // U
3839     vpsrlw     ymm0, ymm0, 8     // V
3840     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3841     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3842     vpermq     ymm1, ymm1, 0xd8
3843     vpermq     ymm0, ymm0, 0xd8
3844     vextractf128 [edx], ymm1, 0  // U
3845     vextractf128 [edx + edi], ymm0, 0 // V
3846     lea        edx, [edx + 16]
3847     sub        ecx, 32
3848     jg         convertloop
3849 
3850     pop        edi
3851     pop        esi
3852     vzeroupper
3853     ret
3854   }
3855 }
3856 
3857 __declspec(naked)
3858 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3859                          uint8* dst_u, uint8* dst_v, int width) {
3860   __asm {
3861     push       edi
3862     mov        eax, [esp + 4 + 4]    // src_yuy2
3863     mov        edx, [esp + 4 + 8]    // dst_u
3864     mov        edi, [esp + 4 + 12]   // dst_v
3865     mov        ecx, [esp + 4 + 16]   // width
3866     vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3867     vpsrlw     ymm5, ymm5, 8
3868     sub        edi, edx
3869 
3870   convertloop:
3871     vmovdqu    ymm0, [eax]
3872     vmovdqu    ymm1, [eax + 32]
3873     lea        eax,  [eax + 64]
3874     vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
3875     vpand      ymm1, ymm1, ymm5
3876     vpackuswb  ymm0, ymm0, ymm1   // mutates.
3877     vpermq     ymm0, ymm0, 0xd8
3878     vpand      ymm1, ymm0, ymm5  // U
3879     vpsrlw     ymm0, ymm0, 8     // V
3880     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3881     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3882     vpermq     ymm1, ymm1, 0xd8
3883     vpermq     ymm0, ymm0, 0xd8
3884     vextractf128 [edx], ymm1, 0  // U
3885     vextractf128 [edx + edi], ymm0, 0 // V
3886     lea        edx, [edx + 16]
3887     sub        ecx, 32
3888     jg         convertloop
3889 
3890     pop        edi
3891     vzeroupper
3892     ret
3893   }
3894 }
3895 #endif  // HAS_YUY2TOYROW_AVX2
3896 
3897 #ifdef HAS_YUY2TOYROW_SSE2
3898 __declspec(naked)
3899 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3900                      uint8* dst_y, int width) {
3901   __asm {
3902     mov        eax, [esp + 4]    // src_yuy2
3903     mov        edx, [esp + 8]    // dst_y
3904     mov        ecx, [esp + 12]   // width
3905     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
3906     psrlw      xmm5, 8
3907 
3908   convertloop:
3909     movdqu     xmm0, [eax]
3910     movdqu     xmm1, [eax + 16]
3911     lea        eax,  [eax + 32]
3912     pand       xmm0, xmm5   // even bytes are Y
3913     pand       xmm1, xmm5
3914     packuswb   xmm0, xmm1
3915     movdqu     [edx], xmm0
3916     lea        edx, [edx + 16]
3917     sub        ecx, 16
3918     jg         convertloop
3919     ret
3920   }
3921 }
3922 
3923 __declspec(naked)
3924 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3925                       uint8* dst_u, uint8* dst_v, int width) {
3926   __asm {
3927     push       esi
3928     push       edi
3929     mov        eax, [esp + 8 + 4]    // src_yuy2
3930     mov        esi, [esp + 8 + 8]    // stride_yuy2
3931     mov        edx, [esp + 8 + 12]   // dst_u
3932     mov        edi, [esp + 8 + 16]   // dst_v
3933     mov        ecx, [esp + 8 + 20]   // width
3934     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3935     psrlw      xmm5, 8
3936     sub        edi, edx
3937 
3938   convertloop:
3939     movdqu     xmm0, [eax]
3940     movdqu     xmm1, [eax + 16]
3941     movdqu     xmm2, [eax + esi]
3942     movdqu     xmm3, [eax + esi + 16]
3943     lea        eax,  [eax + 32]
3944     pavgb      xmm0, xmm2
3945     pavgb      xmm1, xmm3
3946     psrlw      xmm0, 8      // YUYV -> UVUV
3947     psrlw      xmm1, 8
3948     packuswb   xmm0, xmm1
3949     movdqa     xmm1, xmm0
3950     pand       xmm0, xmm5  // U
3951     packuswb   xmm0, xmm0
3952     psrlw      xmm1, 8     // V
3953     packuswb   xmm1, xmm1
3954     movq       qword ptr [edx], xmm0
3955     movq       qword ptr [edx + edi], xmm1
3956     lea        edx, [edx + 8]
3957     sub        ecx, 16
3958     jg         convertloop
3959 
3960     pop        edi
3961     pop        esi
3962     ret
3963   }
3964 }
3965 
3966 __declspec(naked)
3967 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3968                          uint8* dst_u, uint8* dst_v, int width) {
3969   __asm {
3970     push       edi
3971     mov        eax, [esp + 4 + 4]    // src_yuy2
3972     mov        edx, [esp + 4 + 8]    // dst_u
3973     mov        edi, [esp + 4 + 12]   // dst_v
3974     mov        ecx, [esp + 4 + 16]   // width
3975     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3976     psrlw      xmm5, 8
3977     sub        edi, edx
3978 
3979   convertloop:
3980     movdqu     xmm0, [eax]
3981     movdqu     xmm1, [eax + 16]
3982     lea        eax,  [eax + 32]
3983     psrlw      xmm0, 8      // YUYV -> UVUV
3984     psrlw      xmm1, 8
3985     packuswb   xmm0, xmm1
3986     movdqa     xmm1, xmm0
3987     pand       xmm0, xmm5  // U
3988     packuswb   xmm0, xmm0
3989     psrlw      xmm1, 8     // V
3990     packuswb   xmm1, xmm1
3991     movq       qword ptr [edx], xmm0
3992     movq       qword ptr [edx + edi], xmm1
3993     lea        edx, [edx + 8]
3994     sub        ecx, 16
3995     jg         convertloop
3996 
3997     pop        edi
3998     ret
3999   }
4000 }
4001 
4002 __declspec(naked)
4003 void UYVYToYRow_SSE2(const uint8* src_uyvy,
4004                      uint8* dst_y, int width) {
4005   __asm {
4006     mov        eax, [esp + 4]    // src_uyvy
4007     mov        edx, [esp + 8]    // dst_y
4008     mov        ecx, [esp + 12]   // width
4009 
4010   convertloop:
4011     movdqu     xmm0, [eax]
4012     movdqu     xmm1, [eax + 16]
4013     lea        eax,  [eax + 32]
4014     psrlw      xmm0, 8    // odd bytes are Y
4015     psrlw      xmm1, 8
4016     packuswb   xmm0, xmm1
4017     movdqu     [edx], xmm0
4018     lea        edx, [edx + 16]
4019     sub        ecx, 16
4020     jg         convertloop
4021     ret
4022   }
4023 }
4024 
4025 __declspec(naked)
4026 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
4027                       uint8* dst_u, uint8* dst_v, int width) {
4028   __asm {
4029     push       esi
4030     push       edi
4031     mov        eax, [esp + 8 + 4]    // src_yuy2
4032     mov        esi, [esp + 8 + 8]    // stride_yuy2
4033     mov        edx, [esp + 8 + 12]   // dst_u
4034     mov        edi, [esp + 8 + 16]   // dst_v
4035     mov        ecx, [esp + 8 + 20]   // width
4036     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4037     psrlw      xmm5, 8
4038     sub        edi, edx
4039 
4040   convertloop:
4041     movdqu     xmm0, [eax]
4042     movdqu     xmm1, [eax + 16]
4043     movdqu     xmm2, [eax + esi]
4044     movdqu     xmm3, [eax + esi + 16]
4045     lea        eax,  [eax + 32]
4046     pavgb      xmm0, xmm2
4047     pavgb      xmm1, xmm3
4048     pand       xmm0, xmm5   // UYVY -> UVUV
4049     pand       xmm1, xmm5
4050     packuswb   xmm0, xmm1
4051     movdqa     xmm1, xmm0
4052     pand       xmm0, xmm5  // U
4053     packuswb   xmm0, xmm0
4054     psrlw      xmm1, 8     // V
4055     packuswb   xmm1, xmm1
4056     movq       qword ptr [edx], xmm0
4057     movq       qword ptr [edx + edi], xmm1
4058     lea        edx, [edx + 8]
4059     sub        ecx, 16
4060     jg         convertloop
4061 
4062     pop        edi
4063     pop        esi
4064     ret
4065   }
4066 }
4067 
4068 __declspec(naked)
4069 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4070                          uint8* dst_u, uint8* dst_v, int width) {
4071   __asm {
4072     push       edi
4073     mov        eax, [esp + 4 + 4]    // src_yuy2
4074     mov        edx, [esp + 4 + 8]    // dst_u
4075     mov        edi, [esp + 4 + 12]   // dst_v
4076     mov        ecx, [esp + 4 + 16]   // width
4077     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4078     psrlw      xmm5, 8
4079     sub        edi, edx
4080 
4081   convertloop:
4082     movdqu     xmm0, [eax]
4083     movdqu     xmm1, [eax + 16]
4084     lea        eax,  [eax + 32]
4085     pand       xmm0, xmm5   // UYVY -> UVUV
4086     pand       xmm1, xmm5
4087     packuswb   xmm0, xmm1
4088     movdqa     xmm1, xmm0
4089     pand       xmm0, xmm5  // U
4090     packuswb   xmm0, xmm0
4091     psrlw      xmm1, 8     // V
4092     packuswb   xmm1, xmm1
4093     movq       qword ptr [edx], xmm0
4094     movq       qword ptr [edx + edi], xmm1
4095     lea        edx, [edx + 8]
4096     sub        ecx, 16
4097     jg         convertloop
4098 
4099     pop        edi
4100     ret
4101   }
4102 }
4103 #endif  // HAS_YUY2TOYROW_SSE2
4104 
4105 #ifdef HAS_BLENDPLANEROW_SSSE3
4106 // Blend 8 pixels at a time.
4107 // unsigned version of math
4108 // =((A2*C2)+(B2*(255-C2))+255)/256
4109 // signed version of math
4110 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4111 __declspec(naked)
4112 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
4113                          const uint8* alpha, uint8* dst, int width) {
4114   __asm {
4115     push       esi
4116     push       edi
4117     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4118     psllw      xmm5, 8
4119     mov        eax, 0x80808080  // 128 for biasing image to signed.
4120     movd       xmm6, eax
4121     pshufd     xmm6, xmm6, 0x00
4122 
4123     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
4124     movd       xmm7, eax
4125     pshufd     xmm7, xmm7, 0x00
4126     mov        eax, [esp + 8 + 4]   // src0
4127     mov        edx, [esp + 8 + 8]   // src1
4128     mov        esi, [esp + 8 + 12]  // alpha
4129     mov        edi, [esp + 8 + 16]  // dst
4130     mov        ecx, [esp + 8 + 20]  // width
4131     sub        eax, esi
4132     sub        edx, esi
4133     sub        edi, esi
4134 
4135     // 8 pixel loop.
4136   convertloop8:
4137     movq       xmm0, qword ptr [esi]        // alpha
4138     punpcklbw  xmm0, xmm0
4139     pxor       xmm0, xmm5         // a, 255-a
4140     movq       xmm1, qword ptr [eax + esi]  // src0
4141     movq       xmm2, qword ptr [edx + esi]  // src1
4142     punpcklbw  xmm1, xmm2
4143     psubb      xmm1, xmm6         // bias src0/1 - 128
4144     pmaddubsw  xmm0, xmm1
4145     paddw      xmm0, xmm7         // unbias result - 32768 and round.
4146     psrlw      xmm0, 8
4147     packuswb   xmm0, xmm0
4148     movq       qword ptr [edi + esi], xmm0
4149     lea        esi, [esi + 8]
4150     sub        ecx, 8
4151     jg         convertloop8
4152 
4153     pop        edi
4154     pop        esi
4155     ret
4156   }
4157 }
4158 #endif  // HAS_BLENDPLANEROW_SSSE3
4159 
4160 #ifdef HAS_BLENDPLANEROW_AVX2
4161 // Blend 32 pixels at a time.
4162 // unsigned version of math
4163 // =((A2*C2)+(B2*(255-C2))+255)/256
4164 // signed version of math
4165 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4166 __declspec(naked)
4167 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
4168                          const uint8* alpha, uint8* dst, int width) {
4169   __asm {
4170     push        esi
4171     push        edi
4172     vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
4173     vpsllw      ymm5, ymm5, 8
4174     mov         eax, 0x80808080  // 128 for biasing image to signed.
4175     vmovd       xmm6, eax
4176     vbroadcastss ymm6, xmm6
4177     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
4178     vmovd       xmm7, eax
4179     vbroadcastss ymm7, xmm7
4180     mov         eax, [esp + 8 + 4]   // src0
4181     mov         edx, [esp + 8 + 8]   // src1
4182     mov         esi, [esp + 8 + 12]  // alpha
4183     mov         edi, [esp + 8 + 16]  // dst
4184     mov         ecx, [esp + 8 + 20]  // width
4185     sub         eax, esi
4186     sub         edx, esi
4187     sub         edi, esi
4188 
4189     // 32 pixel loop.
4190   convertloop32:
4191     vmovdqu     ymm0, [esi]        // alpha
4192     vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
4193     vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
4194     vpxor       ymm3, ymm3, ymm5   // a, 255-a
4195     vpxor       ymm0, ymm0, ymm5   // a, 255-a
4196     vmovdqu     ymm1, [eax + esi]  // src0
4197     vmovdqu     ymm2, [edx + esi]  // src1
4198     vpunpckhbw  ymm4, ymm1, ymm2
4199     vpunpcklbw  ymm1, ymm1, ymm2
4200     vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
4201     vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
4202     vpmaddubsw  ymm3, ymm3, ymm4
4203     vpmaddubsw  ymm0, ymm0, ymm1
4204     vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
4205     vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
4206     vpsrlw      ymm3, ymm3, 8
4207     vpsrlw      ymm0, ymm0, 8
4208     vpackuswb   ymm0, ymm0, ymm3
4209     vmovdqu     [edi + esi], ymm0
4210     lea         esi, [esi + 32]
4211     sub         ecx, 32
4212     jg          convertloop32
4213 
4214     pop         edi
4215     pop         esi
4216     vzeroupper
4217     ret
4218   }
4219 }
4220 #endif  // HAS_BLENDPLANEROW_AVX2
4221 
4222 #ifdef HAS_ARGBBLENDROW_SSSE3
4223 // Shuffle table for isolating alpha.
4224 static const uvec8 kShuffleAlpha = {
4225   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4226   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4227 };
4228 
4229 // Blend 8 pixels at a time.
4230 __declspec(naked)
4231 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4232                         uint8* dst_argb, int width) {
4233   __asm {
4234     push       esi
4235     mov        eax, [esp + 4 + 4]   // src_argb0
4236     mov        esi, [esp + 4 + 8]   // src_argb1
4237     mov        edx, [esp + 4 + 12]  // dst_argb
4238     mov        ecx, [esp + 4 + 16]  // width
4239     pcmpeqb    xmm7, xmm7       // generate constant 0x0001
4240     psrlw      xmm7, 15
4241     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4242     psrlw      xmm6, 8
4243     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4244     psllw      xmm5, 8
4245     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4246     pslld      xmm4, 24
4247     sub        ecx, 4
4248     jl         convertloop4b    // less than 4 pixels?
4249 
4250     // 4 pixel loop.
4251   convertloop4:
4252     movdqu     xmm3, [eax]      // src argb
4253     lea        eax, [eax + 16]
4254     movdqa     xmm0, xmm3       // src argb
4255     pxor       xmm3, xmm4       // ~alpha
4256     movdqu     xmm2, [esi]      // _r_b
4257     pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
4258     pand       xmm2, xmm6       // _r_b
4259     paddw      xmm3, xmm7       // 256 - alpha
4260     pmullw     xmm2, xmm3       // _r_b * alpha
4261     movdqu     xmm1, [esi]      // _a_g
4262     lea        esi, [esi + 16]
4263     psrlw      xmm1, 8          // _a_g
4264     por        xmm0, xmm4       // set alpha to 255
4265     pmullw     xmm1, xmm3       // _a_g * alpha
4266     psrlw      xmm2, 8          // _r_b convert to 8 bits again
4267     paddusb    xmm0, xmm2       // + src argb
4268     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4269     paddusb    xmm0, xmm1       // + src argb
4270     movdqu     [edx], xmm0
4271     lea        edx, [edx + 16]
4272     sub        ecx, 4
4273     jge        convertloop4
4274 
4275   convertloop4b:
4276     add        ecx, 4 - 1
4277     jl         convertloop1b
4278 
4279     // 1 pixel loop.
4280   convertloop1:
4281     movd       xmm3, [eax]      // src argb
4282     lea        eax, [eax + 4]
4283     movdqa     xmm0, xmm3       // src argb
4284     pxor       xmm3, xmm4       // ~alpha
4285     movd       xmm2, [esi]      // _r_b
4286     pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
4287     pand       xmm2, xmm6       // _r_b
4288     paddw      xmm3, xmm7       // 256 - alpha
4289     pmullw     xmm2, xmm3       // _r_b * alpha
4290     movd       xmm1, [esi]      // _a_g
4291     lea        esi, [esi + 4]
4292     psrlw      xmm1, 8          // _a_g
4293     por        xmm0, xmm4       // set alpha to 255
4294     pmullw     xmm1, xmm3       // _a_g * alpha
4295     psrlw      xmm2, 8          // _r_b convert to 8 bits again
4296     paddusb    xmm0, xmm2       // + src argb
4297     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4298     paddusb    xmm0, xmm1       // + src argb
4299     movd       [edx], xmm0
4300     lea        edx, [edx + 4]
4301     sub        ecx, 1
4302     jge        convertloop1
4303 
4304   convertloop1b:
4305     pop        esi
4306     ret
4307   }
4308 }
4309 #endif  // HAS_ARGBBLENDROW_SSSE3
4310 
4311 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4312 // Shuffle table duplicating alpha.
4313 static const uvec8 kShuffleAlpha0 = {
4314   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4315 };
4316 static const uvec8 kShuffleAlpha1 = {
4317   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4318   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4319 };
4320 __declspec(naked)
4321 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4322   __asm {
4323     mov        eax, [esp + 4]   // src_argb0
4324     mov        edx, [esp + 8]   // dst_argb
4325     mov        ecx, [esp + 12]  // width
4326     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
4327     pslld      xmm3, 24
4328     movdqa     xmm4, xmmword ptr kShuffleAlpha0
4329     movdqa     xmm5, xmmword ptr kShuffleAlpha1
4330 
4331  convertloop:
4332     movdqu     xmm0, [eax]      // read 4 pixels
4333     pshufb     xmm0, xmm4       // isolate first 2 alphas
4334     movdqu     xmm1, [eax]      // read 4 pixels
4335     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
4336     pmulhuw    xmm0, xmm1       // rgb * a
4337     movdqu     xmm1, [eax]      // read 4 pixels
4338     pshufb     xmm1, xmm5       // isolate next 2 alphas
4339     movdqu     xmm2, [eax]      // read 4 pixels
4340     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
4341     pmulhuw    xmm1, xmm2       // rgb * a
4342     movdqu     xmm2, [eax]      // mask original alpha
4343     lea        eax, [eax + 16]
4344     pand       xmm2, xmm3
4345     psrlw      xmm0, 8
4346     psrlw      xmm1, 8
4347     packuswb   xmm0, xmm1
4348     por        xmm0, xmm2       // copy original alpha
4349     movdqu     [edx], xmm0
4350     lea        edx, [edx + 16]
4351     sub        ecx, 4
4352     jg         convertloop
4353 
4354     ret
4355   }
4356 }
4357 #endif  // HAS_ARGBATTENUATEROW_SSSE3
4358 
4359 #ifdef HAS_ARGBATTENUATEROW_AVX2
4360 // Shuffle table duplicating alpha.
4361 static const uvec8 kShuffleAlpha_AVX2 = {
4362   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
4363 };
4364 __declspec(naked)
4365 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4366   __asm {
4367     mov        eax, [esp + 4]   // src_argb0
4368     mov        edx, [esp + 8]   // dst_argb
4369     mov        ecx, [esp + 12]  // width
4370     sub        edx, eax
4371     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4372     vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
4373     vpslld     ymm5, ymm5, 24
4374 
4375  convertloop:
4376     vmovdqu    ymm6, [eax]       // read 8 pixels.
4377     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4378     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4379     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
4380     vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
4381     vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
4382     vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
4383     vpand      ymm6, ymm6, ymm5  // isolate alpha
4384     vpsrlw     ymm0, ymm0, 8
4385     vpsrlw     ymm1, ymm1, 8
4386     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4387     vpor       ymm0, ymm0, ymm6  // copy original alpha
4388     vmovdqu    [eax + edx], ymm0
4389     lea        eax, [eax + 32]
4390     sub        ecx, 8
4391     jg         convertloop
4392 
4393     vzeroupper
4394     ret
4395   }
4396 }
4397 #endif  // HAS_ARGBATTENUATEROW_AVX2
4398 
4399 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4400 // Unattenuate 4 pixels at a time.
4401 __declspec(naked)
4402 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4403                              int width) {
4404   __asm {
4405     push       ebx
4406     push       esi
4407     push       edi
4408     mov        eax, [esp + 12 + 4]   // src_argb
4409     mov        edx, [esp + 12 + 8]   // dst_argb
4410     mov        ecx, [esp + 12 + 12]  // width
4411     lea        ebx, fixed_invtbl8
4412 
4413  convertloop:
4414     movdqu     xmm0, [eax]      // read 4 pixels
4415     movzx      esi, byte ptr [eax + 3]  // first alpha
4416     movzx      edi, byte ptr [eax + 7]  // second alpha
4417     punpcklbw  xmm0, xmm0       // first 2
4418     movd       xmm2, dword ptr [ebx + esi * 4]
4419     movd       xmm3, dword ptr [ebx + edi * 4]
4420     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
4421     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4422     movlhps    xmm2, xmm3
4423     pmulhuw    xmm0, xmm2       // rgb * a
4424 
4425     movdqu     xmm1, [eax]      // read 4 pixels
4426     movzx      esi, byte ptr [eax + 11]  // third alpha
4427     movzx      edi, byte ptr [eax + 15]  // forth alpha
4428     punpckhbw  xmm1, xmm1       // next 2
4429     movd       xmm2, dword ptr [ebx + esi * 4]
4430     movd       xmm3, dword ptr [ebx + edi * 4]
4431     pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
4432     pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4433     movlhps    xmm2, xmm3
4434     pmulhuw    xmm1, xmm2       // rgb * a
4435     lea        eax, [eax + 16]
4436     packuswb   xmm0, xmm1
4437     movdqu     [edx], xmm0
4438     lea        edx, [edx + 16]
4439     sub        ecx, 4
4440     jg         convertloop
4441 
4442     pop        edi
4443     pop        esi
4444     pop        ebx
4445     ret
4446   }
4447 }
4448 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
4449 
4450 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4451 // Shuffle table duplicating alpha.
4452 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4453   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
4454 };
4455 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4456 // USE_GATHER is not on by default, due to being a slow instruction.
4457 #ifdef USE_GATHER
4458 __declspec(naked)
4459 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4460                              int width) {
4461   __asm {
4462     mov        eax, [esp + 4]   // src_argb0
4463     mov        edx, [esp + 8]   // dst_argb
4464     mov        ecx, [esp + 12]  // width
4465     sub        edx, eax
4466     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4467 
4468  convertloop:
4469     vmovdqu    ymm6, [eax]       // read 8 pixels.
4470     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
4471     vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
4472     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4473     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4474     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
4475     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4476     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4477     vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
4478     vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
4479     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4480     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4481     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4482     vmovdqu    [eax + edx], ymm0
4483     lea        eax, [eax + 32]
4484     sub        ecx, 8
4485     jg         convertloop
4486 
4487     vzeroupper
4488     ret
4489   }
4490 }
4491 #else  // USE_GATHER
4492 __declspec(naked)
4493 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4494                              int width) {
4495   __asm {
4496 
4497     push       ebx
4498     push       esi
4499     push       edi
4500     mov        eax, [esp + 12 + 4]   // src_argb
4501     mov        edx, [esp + 12 + 8]   // dst_argb
4502     mov        ecx, [esp + 12 + 12]  // width
4503     sub        edx, eax
4504     lea        ebx, fixed_invtbl8
4505     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4506 
4507  convertloop:
4508     // replace VPGATHER
4509     movzx      esi, byte ptr [eax + 3]                 // alpha0
4510     movzx      edi, byte ptr [eax + 7]                 // alpha1
4511     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
4512     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
4513     movzx      esi, byte ptr [eax + 11]                // alpha2
4514     movzx      edi, byte ptr [eax + 15]                // alpha3
4515     vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
4516     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
4517     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
4518     movzx      esi, byte ptr [eax + 19]                // alpha4
4519     movzx      edi, byte ptr [eax + 23]                // alpha5
4520     vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
4521     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
4522     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
4523     movzx      esi, byte ptr [eax + 27]                // alpha6
4524     movzx      edi, byte ptr [eax + 31]                // alpha7
4525     vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
4526     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
4527     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
4528     vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
4529     vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
4530     vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
4531     vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4532     // end of VPGATHER
4533 
4534     vmovdqu    ymm6, [eax]       // read 8 pixels.
4535     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4536     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4537     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4538     vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4539     vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
4540     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
4541     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4542     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4543     vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4544     vmovdqu    [eax + edx], ymm0
4545     lea        eax, [eax + 32]
4546     sub        ecx, 8
4547     jg         convertloop
4548 
4549     pop        edi
4550     pop        esi
4551     pop        ebx
4552     vzeroupper
4553     ret
4554   }
4555 }
4556 #endif  // USE_GATHER
4557 #endif  // HAS_ARGBATTENUATEROW_AVX2
4558 
4559 #ifdef HAS_ARGBGRAYROW_SSSE3
4560 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4561 __declspec(naked)
4562 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4563   __asm {
4564     mov        eax, [esp + 4]   /* src_argb */
4565     mov        edx, [esp + 8]   /* dst_argb */
4566     mov        ecx, [esp + 12]  /* width */
4567     movdqa     xmm4, xmmword ptr kARGBToYJ
4568     movdqa     xmm5, xmmword ptr kAddYJ64
4569 
4570  convertloop:
4571     movdqu     xmm0, [eax]  // G
4572     movdqu     xmm1, [eax + 16]
4573     pmaddubsw  xmm0, xmm4
4574     pmaddubsw  xmm1, xmm4
4575     phaddw     xmm0, xmm1
4576     paddw      xmm0, xmm5  // Add .5 for rounding.
4577     psrlw      xmm0, 7
4578     packuswb   xmm0, xmm0   // 8 G bytes
4579     movdqu     xmm2, [eax]  // A
4580     movdqu     xmm3, [eax + 16]
4581     lea        eax, [eax + 32]
4582     psrld      xmm2, 24
4583     psrld      xmm3, 24
4584     packuswb   xmm2, xmm3
4585     packuswb   xmm2, xmm2   // 8 A bytes
4586     movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
4587     punpcklbw  xmm0, xmm0   // 8 GG words
4588     punpcklbw  xmm3, xmm2   // 8 GA words
4589     movdqa     xmm1, xmm0
4590     punpcklwd  xmm0, xmm3   // GGGA first 4
4591     punpckhwd  xmm1, xmm3   // GGGA next 4
4592     movdqu     [edx], xmm0
4593     movdqu     [edx + 16], xmm1
4594     lea        edx, [edx + 32]
4595     sub        ecx, 8
4596     jg         convertloop
4597     ret
4598   }
4599 }
4600 #endif  // HAS_ARGBGRAYROW_SSSE3
4601 
4602 #ifdef HAS_ARGBSEPIAROW_SSSE3
4603 //    b = (r * 35 + g * 68 + b * 17) >> 7
4604 //    g = (r * 45 + g * 88 + b * 22) >> 7
4605 //    r = (r * 50 + g * 98 + b * 24) >> 7
4606 // Constant for ARGB color to sepia tone.
4607 static const vec8 kARGBToSepiaB = {
4608   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
4609 };
4610 
4611 static const vec8 kARGBToSepiaG = {
4612   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4613 };
4614 
4615 static const vec8 kARGBToSepiaR = {
4616   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4617 };
4618 
4619 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4620 __declspec(naked)
4621 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4622   __asm {
4623     mov        eax, [esp + 4]   /* dst_argb */
4624     mov        ecx, [esp + 8]   /* width */
4625     movdqa     xmm2, xmmword ptr kARGBToSepiaB
4626     movdqa     xmm3, xmmword ptr kARGBToSepiaG
4627     movdqa     xmm4, xmmword ptr kARGBToSepiaR
4628 
4629  convertloop:
4630     movdqu     xmm0, [eax]  // B
4631     movdqu     xmm6, [eax + 16]
4632     pmaddubsw  xmm0, xmm2
4633     pmaddubsw  xmm6, xmm2
4634     phaddw     xmm0, xmm6
4635     psrlw      xmm0, 7
4636     packuswb   xmm0, xmm0   // 8 B values
4637     movdqu     xmm5, [eax]  // G
4638     movdqu     xmm1, [eax + 16]
4639     pmaddubsw  xmm5, xmm3
4640     pmaddubsw  xmm1, xmm3
4641     phaddw     xmm5, xmm1
4642     psrlw      xmm5, 7
4643     packuswb   xmm5, xmm5   // 8 G values
4644     punpcklbw  xmm0, xmm5   // 8 BG values
4645     movdqu     xmm5, [eax]  // R
4646     movdqu     xmm1, [eax + 16]
4647     pmaddubsw  xmm5, xmm4
4648     pmaddubsw  xmm1, xmm4
4649     phaddw     xmm5, xmm1
4650     psrlw      xmm5, 7
4651     packuswb   xmm5, xmm5   // 8 R values
4652     movdqu     xmm6, [eax]  // A
4653     movdqu     xmm1, [eax + 16]
4654     psrld      xmm6, 24
4655     psrld      xmm1, 24
4656     packuswb   xmm6, xmm1
4657     packuswb   xmm6, xmm6   // 8 A values
4658     punpcklbw  xmm5, xmm6   // 8 RA values
4659     movdqa     xmm1, xmm0   // Weave BG, RA together
4660     punpcklwd  xmm0, xmm5   // BGRA first 4
4661     punpckhwd  xmm1, xmm5   // BGRA next 4
4662     movdqu     [eax], xmm0
4663     movdqu     [eax + 16], xmm1
4664     lea        eax, [eax + 32]
4665     sub        ecx, 8
4666     jg         convertloop
4667     ret
4668   }
4669 }
4670 #endif  // HAS_ARGBSEPIAROW_SSSE3
4671 
4672 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4673 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
4674 // Same as Sepia except matrix is provided.
4675 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4676 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
4677 __declspec(naked)
4678 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4679                               const int8* matrix_argb, int width) {
4680   __asm {
4681     mov        eax, [esp + 4]   /* src_argb */
4682     mov        edx, [esp + 8]   /* dst_argb */
4683     mov        ecx, [esp + 12]  /* matrix_argb */
4684     movdqu     xmm5, [ecx]
4685     pshufd     xmm2, xmm5, 0x00
4686     pshufd     xmm3, xmm5, 0x55
4687     pshufd     xmm4, xmm5, 0xaa
4688     pshufd     xmm5, xmm5, 0xff
4689     mov        ecx, [esp + 16]  /* width */
4690 
4691  convertloop:
4692     movdqu     xmm0, [eax]  // B
4693     movdqu     xmm7, [eax + 16]
4694     pmaddubsw  xmm0, xmm2
4695     pmaddubsw  xmm7, xmm2
4696     movdqu     xmm6, [eax]  // G
4697     movdqu     xmm1, [eax + 16]
4698     pmaddubsw  xmm6, xmm3
4699     pmaddubsw  xmm1, xmm3
4700     phaddsw    xmm0, xmm7   // B
4701     phaddsw    xmm6, xmm1   // G
4702     psraw      xmm0, 6      // B
4703     psraw      xmm6, 6      // G
4704     packuswb   xmm0, xmm0   // 8 B values
4705     packuswb   xmm6, xmm6   // 8 G values
4706     punpcklbw  xmm0, xmm6   // 8 BG values
4707     movdqu     xmm1, [eax]  // R
4708     movdqu     xmm7, [eax + 16]
4709     pmaddubsw  xmm1, xmm4
4710     pmaddubsw  xmm7, xmm4
4711     phaddsw    xmm1, xmm7   // R
4712     movdqu     xmm6, [eax]  // A
4713     movdqu     xmm7, [eax + 16]
4714     pmaddubsw  xmm6, xmm5
4715     pmaddubsw  xmm7, xmm5
4716     phaddsw    xmm6, xmm7   // A
4717     psraw      xmm1, 6      // R
4718     psraw      xmm6, 6      // A
4719     packuswb   xmm1, xmm1   // 8 R values
4720     packuswb   xmm6, xmm6   // 8 A values
4721     punpcklbw  xmm1, xmm6   // 8 RA values
4722     movdqa     xmm6, xmm0   // Weave BG, RA together
4723     punpcklwd  xmm0, xmm1   // BGRA first 4
4724     punpckhwd  xmm6, xmm1   // BGRA next 4
4725     movdqu     [edx], xmm0
4726     movdqu     [edx + 16], xmm6
4727     lea        eax, [eax + 32]
4728     lea        edx, [edx + 32]
4729     sub        ecx, 8
4730     jg         convertloop
4731     ret
4732   }
4733 }
4734 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4735 
4736 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4737 // Quantize 4 ARGB pixels (16 bytes).
4738 __declspec(naked)
4739 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4740                           int interval_offset, int width) {
4741   __asm {
4742     mov        eax, [esp + 4]    /* dst_argb */
4743     movd       xmm2, [esp + 8]   /* scale */
4744     movd       xmm3, [esp + 12]  /* interval_size */
4745     movd       xmm4, [esp + 16]  /* interval_offset */
4746     mov        ecx, [esp + 20]   /* width */
4747     pshuflw    xmm2, xmm2, 040h
4748     pshufd     xmm2, xmm2, 044h
4749     pshuflw    xmm3, xmm3, 040h
4750     pshufd     xmm3, xmm3, 044h
4751     pshuflw    xmm4, xmm4, 040h
4752     pshufd     xmm4, xmm4, 044h
4753     pxor       xmm5, xmm5  // constant 0
4754     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
4755     pslld      xmm6, 24
4756 
4757  convertloop:
4758     movdqu     xmm0, [eax]  // read 4 pixels
4759     punpcklbw  xmm0, xmm5   // first 2 pixels
4760     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
4761     movdqu     xmm1, [eax]  // read 4 pixels
4762     punpckhbw  xmm1, xmm5   // next 2 pixels
4763     pmulhuw    xmm1, xmm2
4764     pmullw     xmm0, xmm3   // * interval_size
4765     movdqu     xmm7, [eax]  // read 4 pixels
4766     pmullw     xmm1, xmm3
4767     pand       xmm7, xmm6   // mask alpha
4768     paddw      xmm0, xmm4   // + interval_size / 2
4769     paddw      xmm1, xmm4
4770     packuswb   xmm0, xmm1
4771     por        xmm0, xmm7
4772     movdqu     [eax], xmm0
4773     lea        eax, [eax + 16]
4774     sub        ecx, 4
4775     jg         convertloop
4776     ret
4777   }
4778 }
4779 #endif  // HAS_ARGBQUANTIZEROW_SSE2
4780 
4781 #ifdef HAS_ARGBSHADEROW_SSE2
4782 // Shade 4 pixels at a time by specified value.
4783 __declspec(naked)
4784 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4785                        uint32 value) {
4786   __asm {
4787     mov        eax, [esp + 4]   // src_argb
4788     mov        edx, [esp + 8]   // dst_argb
4789     mov        ecx, [esp + 12]  // width
4790     movd       xmm2, [esp + 16]  // value
4791     punpcklbw  xmm2, xmm2
4792     punpcklqdq xmm2, xmm2
4793 
4794  convertloop:
4795     movdqu     xmm0, [eax]      // read 4 pixels
4796     lea        eax, [eax + 16]
4797     movdqa     xmm1, xmm0
4798     punpcklbw  xmm0, xmm0       // first 2
4799     punpckhbw  xmm1, xmm1       // next 2
4800     pmulhuw    xmm0, xmm2       // argb * value
4801     pmulhuw    xmm1, xmm2       // argb * value
4802     psrlw      xmm0, 8
4803     psrlw      xmm1, 8
4804     packuswb   xmm0, xmm1
4805     movdqu     [edx], xmm0
4806     lea        edx, [edx + 16]
4807     sub        ecx, 4
4808     jg         convertloop
4809 
4810     ret
4811   }
4812 }
4813 #endif  // HAS_ARGBSHADEROW_SSE2
4814 
4815 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4816 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4817 __declspec(naked)
4818 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4819                           uint8* dst_argb, int width) {
4820   __asm {
4821     push       esi
4822     mov        eax, [esp + 4 + 4]   // src_argb0
4823     mov        esi, [esp + 4 + 8]   // src_argb1
4824     mov        edx, [esp + 4 + 12]  // dst_argb
4825     mov        ecx, [esp + 4 + 16]  // width
4826     pxor       xmm5, xmm5  // constant 0
4827 
4828  convertloop:
4829     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4830     movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
4831     movdqu     xmm1, xmm0
4832     movdqu     xmm3, xmm2
4833     punpcklbw  xmm0, xmm0         // first 2
4834     punpckhbw  xmm1, xmm1         // next 2
4835     punpcklbw  xmm2, xmm5         // first 2
4836     punpckhbw  xmm3, xmm5         // next 2
4837     pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
4838     pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
4839     lea        eax, [eax + 16]
4840     lea        esi, [esi + 16]
4841     packuswb   xmm0, xmm1
4842     movdqu     [edx], xmm0
4843     lea        edx, [edx + 16]
4844     sub        ecx, 4
4845     jg         convertloop
4846 
4847     pop        esi
4848     ret
4849   }
4850 }
4851 #endif  // HAS_ARGBMULTIPLYROW_SSE2
4852 
4853 #ifdef HAS_ARGBADDROW_SSE2
4854 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4855 // TODO(fbarchard): Port this to posix, neon and other math functions.
4856 __declspec(naked)
4857 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4858                      uint8* dst_argb, int width) {
4859   __asm {
4860     push       esi
4861     mov        eax, [esp + 4 + 4]   // src_argb0
4862     mov        esi, [esp + 4 + 8]   // src_argb1
4863     mov        edx, [esp + 4 + 12]  // dst_argb
4864     mov        ecx, [esp + 4 + 16]  // width
4865 
4866     sub        ecx, 4
4867     jl         convertloop49
4868 
4869  convertloop4:
4870     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4871     lea        eax, [eax + 16]
4872     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4873     lea        esi, [esi + 16]
4874     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4875     movdqu     [edx], xmm0
4876     lea        edx, [edx + 16]
4877     sub        ecx, 4
4878     jge        convertloop4
4879 
4880  convertloop49:
4881     add        ecx, 4 - 1
4882     jl         convertloop19
4883 
4884  convertloop1:
4885     movd       xmm0, [eax]        // read 1 pixels from src_argb0
4886     lea        eax, [eax + 4]
4887     movd       xmm1, [esi]        // read 1 pixels from src_argb1
4888     lea        esi, [esi + 4]
4889     paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4890     movd       [edx], xmm0
4891     lea        edx, [edx + 4]
4892     sub        ecx, 1
4893     jge        convertloop1
4894 
4895  convertloop19:
4896     pop        esi
4897     ret
4898   }
4899 }
4900 #endif  // HAS_ARGBADDROW_SSE2
4901 
4902 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4903 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4904 __declspec(naked)
4905 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4906                           uint8* dst_argb, int width) {
4907   __asm {
4908     push       esi
4909     mov        eax, [esp + 4 + 4]   // src_argb0
4910     mov        esi, [esp + 4 + 8]   // src_argb1
4911     mov        edx, [esp + 4 + 12]  // dst_argb
4912     mov        ecx, [esp + 4 + 16]  // width
4913 
4914  convertloop:
4915     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4916     lea        eax, [eax + 16]
4917     movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4918     lea        esi, [esi + 16]
4919     psubusb    xmm0, xmm1         // src_argb0 - src_argb1
4920     movdqu     [edx], xmm0
4921     lea        edx, [edx + 16]
4922     sub        ecx, 4
4923     jg         convertloop
4924 
4925     pop        esi
4926     ret
4927   }
4928 }
4929 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4930 
4931 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4932 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4933 __declspec(naked)
4934 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4935                           uint8* dst_argb, int width) {
4936   __asm {
4937     push       esi
4938     mov        eax, [esp + 4 + 4]   // src_argb0
4939     mov        esi, [esp + 4 + 8]   // src_argb1
4940     mov        edx, [esp + 4 + 12]  // dst_argb
4941     mov        ecx, [esp + 4 + 16]  // width
4942     vpxor      ymm5, ymm5, ymm5     // constant 0
4943 
4944  convertloop:
4945     vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
4946     lea        eax, [eax + 32]
4947     vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
4948     lea        esi, [esi + 32]
4949     vpunpcklbw ymm0, ymm1, ymm1   // low 4
4950     vpunpckhbw ymm1, ymm1, ymm1   // high 4
4951     vpunpcklbw ymm2, ymm3, ymm5   // low 4
4952     vpunpckhbw ymm3, ymm3, ymm5   // high 4
4953     vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
4954     vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
4955     vpackuswb  ymm0, ymm0, ymm1
4956     vmovdqu    [edx], ymm0
4957     lea        edx, [edx + 32]
4958     sub        ecx, 8
4959     jg         convertloop
4960 
4961     pop        esi
4962     vzeroupper
4963     ret
4964   }
4965 }
4966 #endif  // HAS_ARGBMULTIPLYROW_AVX2
4967 
4968 #ifdef HAS_ARGBADDROW_AVX2
4969 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4970 __declspec(naked)
4971 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4972                      uint8* dst_argb, int width) {
4973   __asm {
4974     push       esi
4975     mov        eax, [esp + 4 + 4]   // src_argb0
4976     mov        esi, [esp + 4 + 8]   // src_argb1
4977     mov        edx, [esp + 4 + 12]  // dst_argb
4978     mov        ecx, [esp + 4 + 16]  // width
4979 
4980  convertloop:
4981     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
4982     lea        eax, [eax + 32]
4983     vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
4984     lea        esi, [esi + 32]
4985     vmovdqu    [edx], ymm0
4986     lea        edx, [edx + 32]
4987     sub        ecx, 8
4988     jg         convertloop
4989 
4990     pop        esi
4991     vzeroupper
4992     ret
4993   }
4994 }
4995 #endif  // HAS_ARGBADDROW_AVX2
4996 
4997 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4998 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4999 __declspec(naked)
5000 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5001                           uint8* dst_argb, int width) {
5002   __asm {
5003     push       esi
5004     mov        eax, [esp + 4 + 4]   // src_argb0
5005     mov        esi, [esp + 4 + 8]   // src_argb1
5006     mov        edx, [esp + 4 + 12]  // dst_argb
5007     mov        ecx, [esp + 4 + 16]  // width
5008 
5009  convertloop:
5010     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
5011     lea        eax, [eax + 32]
5012     vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
5013     lea        esi, [esi + 32]
5014     vmovdqu    [edx], ymm0
5015     lea        edx, [edx + 32]
5016     sub        ecx, 8
5017     jg         convertloop
5018 
5019     pop        esi
5020     vzeroupper
5021     ret
5022   }
5023 }
5024 #endif  // HAS_ARGBSUBTRACTROW_AVX2
5025 
5026 #ifdef HAS_SOBELXROW_SSE2
5027 // SobelX as a matrix is
5028 // -1  0  1
5029 // -2  0  2
5030 // -1  0  1
5031 __declspec(naked)
5032 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5033                     const uint8* src_y2, uint8* dst_sobelx, int width) {
5034   __asm {
5035     push       esi
5036     push       edi
5037     mov        eax, [esp + 8 + 4]   // src_y0
5038     mov        esi, [esp + 8 + 8]   // src_y1
5039     mov        edi, [esp + 8 + 12]  // src_y2
5040     mov        edx, [esp + 8 + 16]  // dst_sobelx
5041     mov        ecx, [esp + 8 + 20]  // width
5042     sub        esi, eax
5043     sub        edi, eax
5044     sub        edx, eax
5045     pxor       xmm5, xmm5  // constant 0
5046 
5047  convertloop:
5048     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5049     movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5050     punpcklbw  xmm0, xmm5
5051     punpcklbw  xmm1, xmm5
5052     psubw      xmm0, xmm1
5053     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5054     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5055     punpcklbw  xmm1, xmm5
5056     punpcklbw  xmm2, xmm5
5057     psubw      xmm1, xmm2
5058     movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
5059     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
5060     punpcklbw  xmm2, xmm5
5061     punpcklbw  xmm3, xmm5
5062     psubw      xmm2, xmm3
5063     paddw      xmm0, xmm2
5064     paddw      xmm0, xmm1
5065     paddw      xmm0, xmm1
5066     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5067     psubw      xmm1, xmm0
5068     pmaxsw     xmm0, xmm1
5069     packuswb   xmm0, xmm0
5070     movq       qword ptr [eax + edx], xmm0
5071     lea        eax, [eax + 8]
5072     sub        ecx, 8
5073     jg         convertloop
5074 
5075     pop        edi
5076     pop        esi
5077     ret
5078   }
5079 }
5080 #endif  // HAS_SOBELXROW_SSE2
5081 
5082 #ifdef HAS_SOBELYROW_SSE2
5083 // SobelY as a matrix is
5084 // -1 -2 -1
5085 //  0  0  0
5086 //  1  2  1
5087 __declspec(naked)
5088 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5089                     uint8* dst_sobely, int width) {
5090   __asm {
5091     push       esi
5092     mov        eax, [esp + 4 + 4]   // src_y0
5093     mov        esi, [esp + 4 + 8]   // src_y1
5094     mov        edx, [esp + 4 + 12]  // dst_sobely
5095     mov        ecx, [esp + 4 + 16]  // width
5096     sub        esi, eax
5097     sub        edx, eax
5098     pxor       xmm5, xmm5  // constant 0
5099 
5100  convertloop:
5101     movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5102     movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5103     punpcklbw  xmm0, xmm5
5104     punpcklbw  xmm1, xmm5
5105     psubw      xmm0, xmm1
5106     movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
5107     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
5108     punpcklbw  xmm1, xmm5
5109     punpcklbw  xmm2, xmm5
5110     psubw      xmm1, xmm2
5111     movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5112     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5113     punpcklbw  xmm2, xmm5
5114     punpcklbw  xmm3, xmm5
5115     psubw      xmm2, xmm3
5116     paddw      xmm0, xmm2
5117     paddw      xmm0, xmm1
5118     paddw      xmm0, xmm1
5119     pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5120     psubw      xmm1, xmm0
5121     pmaxsw     xmm0, xmm1
5122     packuswb   xmm0, xmm0
5123     movq       qword ptr [eax + edx], xmm0
5124     lea        eax, [eax + 8]
5125     sub        ecx, 8
5126     jg         convertloop
5127 
5128     pop        esi
5129     ret
5130   }
5131 }
5132 #endif  // HAS_SOBELYROW_SSE2
5133 
5134 #ifdef HAS_SOBELROW_SSE2
5135 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5136 // A = 255
5137 // R = Sobel
5138 // G = Sobel
5139 // B = Sobel
5140 __declspec(naked)
5141 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5142                    uint8* dst_argb, int width) {
5143   __asm {
5144     push       esi
5145     mov        eax, [esp + 4 + 4]   // src_sobelx
5146     mov        esi, [esp + 4 + 8]   // src_sobely
5147     mov        edx, [esp + 4 + 12]  // dst_argb
5148     mov        ecx, [esp + 4 + 16]  // width
5149     sub        esi, eax
5150     pcmpeqb    xmm5, xmm5           // alpha 255
5151     pslld      xmm5, 24             // 0xff000000
5152 
5153  convertloop:
5154     movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5155     movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5156     lea        eax, [eax + 16]
5157     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
5158     movdqa     xmm2, xmm0             // GG
5159     punpcklbw  xmm2, xmm0             // First 8
5160     punpckhbw  xmm0, xmm0             // Next 8
5161     movdqa     xmm1, xmm2             // GGGG
5162     punpcklwd  xmm1, xmm2             // First 4
5163     punpckhwd  xmm2, xmm2             // Next 4
5164     por        xmm1, xmm5             // GGGA
5165     por        xmm2, xmm5
5166     movdqa     xmm3, xmm0             // GGGG
5167     punpcklwd  xmm3, xmm0             // Next 4
5168     punpckhwd  xmm0, xmm0             // Last 4
5169     por        xmm3, xmm5             // GGGA
5170     por        xmm0, xmm5
5171     movdqu     [edx], xmm1
5172     movdqu     [edx + 16], xmm2
5173     movdqu     [edx + 32], xmm3
5174     movdqu     [edx + 48], xmm0
5175     lea        edx, [edx + 64]
5176     sub        ecx, 16
5177     jg         convertloop
5178 
5179     pop        esi
5180     ret
5181   }
5182 }
5183 #endif  // HAS_SOBELROW_SSE2
5184 
5185 #ifdef HAS_SOBELTOPLANEROW_SSE2
5186 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5187 __declspec(naked)
5188 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5189                           uint8* dst_y, int width) {
5190   __asm {
5191     push       esi
5192     mov        eax, [esp + 4 + 4]   // src_sobelx
5193     mov        esi, [esp + 4 + 8]   // src_sobely
5194     mov        edx, [esp + 4 + 12]  // dst_argb
5195     mov        ecx, [esp + 4 + 16]  // width
5196     sub        esi, eax
5197 
5198  convertloop:
5199     movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5200     movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5201     lea        eax, [eax + 16]
5202     paddusb    xmm0, xmm1             // sobel = sobelx + sobely
5203     movdqu     [edx], xmm0
5204     lea        edx, [edx + 16]
5205     sub        ecx, 16
5206     jg         convertloop
5207 
5208     pop        esi
5209     ret
5210   }
5211 }
5212 #endif  // HAS_SOBELTOPLANEROW_SSE2
5213 
5214 #ifdef HAS_SOBELXYROW_SSE2
5215 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5216 // A = 255
5217 // R = Sobel X
5218 // G = Sobel
5219 // B = Sobel Y
5220 __declspec(naked)
5221 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5222                      uint8* dst_argb, int width) {
5223   __asm {
5224     push       esi
5225     mov        eax, [esp + 4 + 4]   // src_sobelx
5226     mov        esi, [esp + 4 + 8]   // src_sobely
5227     mov        edx, [esp + 4 + 12]  // dst_argb
5228     mov        ecx, [esp + 4 + 16]  // width
5229     sub        esi, eax
5230     pcmpeqb    xmm5, xmm5           // alpha 255
5231 
5232  convertloop:
5233     movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
5234     movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5235     lea        eax, [eax + 16]
5236     movdqa     xmm2, xmm0
5237     paddusb    xmm2, xmm1             // sobel = sobelx + sobely
5238     movdqa     xmm3, xmm0             // XA
5239     punpcklbw  xmm3, xmm5
5240     punpckhbw  xmm0, xmm5
5241     movdqa     xmm4, xmm1             // YS
5242     punpcklbw  xmm4, xmm2
5243     punpckhbw  xmm1, xmm2
5244     movdqa     xmm6, xmm4             // YSXA
5245     punpcklwd  xmm6, xmm3             // First 4
5246     punpckhwd  xmm4, xmm3             // Next 4
5247     movdqa     xmm7, xmm1             // YSXA
5248     punpcklwd  xmm7, xmm0             // Next 4
5249     punpckhwd  xmm1, xmm0             // Last 4
5250     movdqu     [edx], xmm6
5251     movdqu     [edx + 16], xmm4
5252     movdqu     [edx + 32], xmm7
5253     movdqu     [edx + 48], xmm1
5254     lea        edx, [edx + 64]
5255     sub        ecx, 16
5256     jg         convertloop
5257 
5258     pop        esi
5259     ret
5260   }
5261 }
5262 #endif  // HAS_SOBELXYROW_SSE2
5263 
5264 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5265 // Consider float CumulativeSum.
5266 // Consider calling CumulativeSum one row at time as needed.
5267 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5268 // Convert cumulative sum for an area to an average for 1 pixel.
5269 // topleft is pointer to top left of CumulativeSum buffer for area.
5270 // botleft is pointer to bottom left of CumulativeSum buffer.
5271 // width is offset from left to right of area in CumulativeSum buffer measured
5272 //   in number of ints.
5273 // area is the number of pixels in the area being averaged.
5274 // dst points to pixel to store result to.
5275 // count is number of averaged pixels to produce.
5276 // Does 4 pixels at a time.
5277 // This function requires alignment on accumulation buffer pointers.
5278 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5279                                     int width, int area, uint8* dst,
5280                                     int count) {
5281   __asm {
5282     mov        eax, topleft  // eax topleft
5283     mov        esi, botleft  // esi botleft
5284     mov        edx, width
5285     movd       xmm5, area
5286     mov        edi, dst
5287     mov        ecx, count
5288     cvtdq2ps   xmm5, xmm5
5289     rcpss      xmm4, xmm5  // 1.0f / area
5290     pshufd     xmm4, xmm4, 0
5291     sub        ecx, 4
5292     jl         l4b
5293 
5294     cmp        area, 128  // 128 pixels will not overflow 15 bits.
5295     ja         l4
5296 
5297     pshufd     xmm5, xmm5, 0        // area
5298     pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
5299     psrld      xmm6, 16
5300     cvtdq2ps   xmm6, xmm6
5301     addps      xmm5, xmm6           // (65536.0 + area - 1)
5302     mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
5303     cvtps2dq   xmm5, xmm5           // 0.16 fixed point
5304     packssdw   xmm5, xmm5           // 16 bit shorts
5305 
5306     // 4 pixel loop small blocks.
5307   s4:
5308     // top left
5309     movdqu     xmm0, [eax]
5310     movdqu     xmm1, [eax + 16]
5311     movdqu     xmm2, [eax + 32]
5312     movdqu     xmm3, [eax + 48]
5313 
5314     // - top right
5315     psubd      xmm0, [eax + edx * 4]
5316     psubd      xmm1, [eax + edx * 4 + 16]
5317     psubd      xmm2, [eax + edx * 4 + 32]
5318     psubd      xmm3, [eax + edx * 4 + 48]
5319     lea        eax, [eax + 64]
5320 
5321     // - bottom left
5322     psubd      xmm0, [esi]
5323     psubd      xmm1, [esi + 16]
5324     psubd      xmm2, [esi + 32]
5325     psubd      xmm3, [esi + 48]
5326 
5327     // + bottom right
5328     paddd      xmm0, [esi + edx * 4]
5329     paddd      xmm1, [esi + edx * 4 + 16]
5330     paddd      xmm2, [esi + edx * 4 + 32]
5331     paddd      xmm3, [esi + edx * 4 + 48]
5332     lea        esi, [esi + 64]
5333 
5334     packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
5335     packssdw   xmm2, xmm3
5336 
5337     pmulhuw    xmm0, xmm5
5338     pmulhuw    xmm2, xmm5
5339 
5340     packuswb   xmm0, xmm2
5341     movdqu     [edi], xmm0
5342     lea        edi, [edi + 16]
5343     sub        ecx, 4
5344     jge        s4
5345 
5346     jmp        l4b
5347 
5348     // 4 pixel loop
5349   l4:
5350     // top left
5351     movdqu     xmm0, [eax]
5352     movdqu     xmm1, [eax + 16]
5353     movdqu     xmm2, [eax + 32]
5354     movdqu     xmm3, [eax + 48]
5355 
5356     // - top right
5357     psubd      xmm0, [eax + edx * 4]
5358     psubd      xmm1, [eax + edx * 4 + 16]
5359     psubd      xmm2, [eax + edx * 4 + 32]
5360     psubd      xmm3, [eax + edx * 4 + 48]
5361     lea        eax, [eax + 64]
5362 
5363     // - bottom left
5364     psubd      xmm0, [esi]
5365     psubd      xmm1, [esi + 16]
5366     psubd      xmm2, [esi + 32]
5367     psubd      xmm3, [esi + 48]
5368 
5369     // + bottom right
5370     paddd      xmm0, [esi + edx * 4]
5371     paddd      xmm1, [esi + edx * 4 + 16]
5372     paddd      xmm2, [esi + edx * 4 + 32]
5373     paddd      xmm3, [esi + edx * 4 + 48]
5374     lea        esi, [esi + 64]
5375 
5376     cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
5377     cvtdq2ps   xmm1, xmm1
5378     mulps      xmm0, xmm4
5379     mulps      xmm1, xmm4
5380     cvtdq2ps   xmm2, xmm2
5381     cvtdq2ps   xmm3, xmm3
5382     mulps      xmm2, xmm4
5383     mulps      xmm3, xmm4
5384     cvtps2dq   xmm0, xmm0
5385     cvtps2dq   xmm1, xmm1
5386     cvtps2dq   xmm2, xmm2
5387     cvtps2dq   xmm3, xmm3
5388     packssdw   xmm0, xmm1
5389     packssdw   xmm2, xmm3
5390     packuswb   xmm0, xmm2
5391     movdqu     [edi], xmm0
5392     lea        edi, [edi + 16]
5393     sub        ecx, 4
5394     jge        l4
5395 
5396   l4b:
5397     add        ecx, 4 - 1
5398     jl         l1b
5399 
5400     // 1 pixel loop
5401   l1:
5402     movdqu     xmm0, [eax]
5403     psubd      xmm0, [eax + edx * 4]
5404     lea        eax, [eax + 16]
5405     psubd      xmm0, [esi]
5406     paddd      xmm0, [esi + edx * 4]
5407     lea        esi, [esi + 16]
5408     cvtdq2ps   xmm0, xmm0
5409     mulps      xmm0, xmm4
5410     cvtps2dq   xmm0, xmm0
5411     packssdw   xmm0, xmm0
5412     packuswb   xmm0, xmm0
5413     movd       dword ptr [edi], xmm0
5414     lea        edi, [edi + 4]
5415     sub        ecx, 1
5416     jge        l1
5417   l1b:
5418   }
5419 }
5420 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5421 
5422 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5423 // Creates a table of cumulative sums where each value is a sum of all values
5424 // above and to the left of the value.
5425 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5426                                   const int32* previous_cumsum, int width) {
5427   __asm {
5428     mov        eax, row
5429     mov        edx, cumsum
5430     mov        esi, previous_cumsum
5431     mov        ecx, width
5432     pxor       xmm0, xmm0
5433     pxor       xmm1, xmm1
5434 
5435     sub        ecx, 4
5436     jl         l4b
5437     test       edx, 15
5438     jne        l4b
5439 
5440     // 4 pixel loop
5441   l4:
5442     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
5443     lea        eax, [eax + 16]
5444     movdqa     xmm4, xmm2
5445 
5446     punpcklbw  xmm2, xmm1
5447     movdqa     xmm3, xmm2
5448     punpcklwd  xmm2, xmm1
5449     punpckhwd  xmm3, xmm1
5450 
5451     punpckhbw  xmm4, xmm1
5452     movdqa     xmm5, xmm4
5453     punpcklwd  xmm4, xmm1
5454     punpckhwd  xmm5, xmm1
5455 
5456     paddd      xmm0, xmm2
5457     movdqu     xmm2, [esi]  // previous row above.
5458     paddd      xmm2, xmm0
5459 
5460     paddd      xmm0, xmm3
5461     movdqu     xmm3, [esi + 16]
5462     paddd      xmm3, xmm0
5463 
5464     paddd      xmm0, xmm4
5465     movdqu     xmm4, [esi + 32]
5466     paddd      xmm4, xmm0
5467 
5468     paddd      xmm0, xmm5
5469     movdqu     xmm5, [esi + 48]
5470     lea        esi, [esi + 64]
5471     paddd      xmm5, xmm0
5472 
5473     movdqu     [edx], xmm2
5474     movdqu     [edx + 16], xmm3
5475     movdqu     [edx + 32], xmm4
5476     movdqu     [edx + 48], xmm5
5477 
5478     lea        edx, [edx + 64]
5479     sub        ecx, 4
5480     jge        l4
5481 
5482   l4b:
5483     add        ecx, 4 - 1
5484     jl         l1b
5485 
5486     // 1 pixel loop
5487   l1:
5488     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
5489     lea        eax, [eax + 4]
5490     punpcklbw  xmm2, xmm1
5491     punpcklwd  xmm2, xmm1
5492     paddd      xmm0, xmm2
5493     movdqu     xmm2, [esi]
5494     lea        esi, [esi + 16]
5495     paddd      xmm2, xmm0
5496     movdqu     [edx], xmm2
5497     lea        edx, [edx + 16]
5498     sub        ecx, 1
5499     jge        l1
5500 
5501  l1b:
5502   }
5503 }
5504 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
5505 
5506 #ifdef HAS_ARGBAFFINEROW_SSE2
5507 // Copy ARGB pixels from source image with slope to a row of destination.
5508 __declspec(naked)
5509 LIBYUV_API
5510 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5511                         uint8* dst_argb, const float* uv_dudv, int width) {
5512   __asm {
5513     push       esi
5514     push       edi
5515     mov        eax, [esp + 12]  // src_argb
5516     mov        esi, [esp + 16]  // stride
5517     mov        edx, [esp + 20]  // dst_argb
5518     mov        ecx, [esp + 24]  // pointer to uv_dudv
5519     movq       xmm2, qword ptr [ecx]  // uv
5520     movq       xmm7, qword ptr [ecx + 8]  // dudv
5521     mov        ecx, [esp + 28]  // width
5522     shl        esi, 16          // 4, stride
5523     add        esi, 4
5524     movd       xmm5, esi
5525     sub        ecx, 4
5526     jl         l4b
5527 
5528     // setup for 4 pixel loop
5529     pshufd     xmm7, xmm7, 0x44  // dup dudv
5530     pshufd     xmm5, xmm5, 0  // dup 4, stride
5531     movdqa     xmm0, xmm2    // x0, y0, x1, y1
5532     addps      xmm0, xmm7
5533     movlhps    xmm2, xmm0
5534     movdqa     xmm4, xmm7
5535     addps      xmm4, xmm4    // dudv *= 2
5536     movdqa     xmm3, xmm2    // x2, y2, x3, y3
5537     addps      xmm3, xmm4
5538     addps      xmm4, xmm4    // dudv *= 4
5539 
5540     // 4 pixel loop
5541   l4:
5542     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
5543     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
5544     packssdw   xmm0, xmm1    // x, y as 8 shorts
5545     pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
5546     movd       esi, xmm0
5547     pshufd     xmm0, xmm0, 0x39  // shift right
5548     movd       edi, xmm0
5549     pshufd     xmm0, xmm0, 0x39  // shift right
5550     movd       xmm1, [eax + esi]  // read pixel 0
5551     movd       xmm6, [eax + edi]  // read pixel 1
5552     punpckldq  xmm1, xmm6     // combine pixel 0 and 1
5553     addps      xmm2, xmm4    // x, y += dx, dy first 2
5554     movq       qword ptr [edx], xmm1
5555     movd       esi, xmm0
5556     pshufd     xmm0, xmm0, 0x39  // shift right
5557     movd       edi, xmm0
5558     movd       xmm6, [eax + esi]  // read pixel 2
5559     movd       xmm0, [eax + edi]  // read pixel 3
5560     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
5561     addps      xmm3, xmm4    // x, y += dx, dy next 2
5562     movq       qword ptr 8[edx], xmm6
5563     lea        edx, [edx + 16]
5564     sub        ecx, 4
5565     jge        l4
5566 
5567   l4b:
5568     add        ecx, 4 - 1
5569     jl         l1b
5570 
5571     // 1 pixel loop
5572   l1:
5573     cvttps2dq  xmm0, xmm2    // x, y float to int
5574     packssdw   xmm0, xmm0    // x, y as shorts
5575     pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
5576     addps      xmm2, xmm7    // x, y += dx, dy
5577     movd       esi, xmm0
5578     movd       xmm0, [eax + esi]  // copy a pixel
5579     movd       [edx], xmm0
5580     lea        edx, [edx + 4]
5581     sub        ecx, 1
5582     jge        l1
5583   l1b:
5584     pop        edi
5585     pop        esi
5586     ret
5587   }
5588 }
5589 #endif  // HAS_ARGBAFFINEROW_SSE2
5590 
5591 #ifdef HAS_INTERPOLATEROW_AVX2
5592 // Bilinear filter 32x2 -> 32x1
5593 __declspec(naked)
5594 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
5595                          ptrdiff_t src_stride, int dst_width,
5596                          int source_y_fraction) {
5597   __asm {
5598     push       esi
5599     push       edi
5600     mov        edi, [esp + 8 + 4]   // dst_ptr
5601     mov        esi, [esp + 8 + 8]   // src_ptr
5602     mov        edx, [esp + 8 + 12]  // src_stride
5603     mov        ecx, [esp + 8 + 16]  // dst_width
5604     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5605     // Dispatch to specialized filters if applicable.
5606     cmp        eax, 0
5607     je         xloop100  // 0 / 256.  Blend 100 / 0.
5608     sub        edi, esi
5609     cmp        eax, 128
5610     je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
5611 
5612     vmovd      xmm0, eax  // high fraction 0..255
5613     neg        eax
5614     add        eax, 256
5615     vmovd      xmm5, eax  // low fraction 256..1
5616     vpunpcklbw xmm5, xmm5, xmm0
5617     vpunpcklwd xmm5, xmm5, xmm5
5618     vbroadcastss ymm5, xmm5
5619 
5620     mov        eax, 0x80808080  // 128b for bias and rounding.
5621     vmovd      xmm4, eax
5622     vbroadcastss ymm4, xmm4
5623 
5624   xloop:
5625     vmovdqu    ymm0, [esi]
5626     vmovdqu    ymm2, [esi + edx]
5627     vpunpckhbw ymm1, ymm0, ymm2  // mutates
5628     vpunpcklbw ymm0, ymm0, ymm2
5629     vpsubb     ymm1, ymm1, ymm4  // bias to signed image
5630     vpsubb     ymm0, ymm0, ymm4
5631     vpmaddubsw ymm1, ymm5, ymm1
5632     vpmaddubsw ymm0, ymm5, ymm0
5633     vpaddw     ymm1, ymm1, ymm4  // unbias and round
5634     vpaddw     ymm0, ymm0, ymm4
5635     vpsrlw     ymm1, ymm1, 8
5636     vpsrlw     ymm0, ymm0, 8
5637     vpackuswb  ymm0, ymm0, ymm1  // unmutates
5638     vmovdqu    [esi + edi], ymm0
5639     lea        esi, [esi + 32]
5640     sub        ecx, 32
5641     jg         xloop
5642     jmp        xloop99
5643 
5644    // Blend 50 / 50.
5645  xloop50:
5646    vmovdqu    ymm0, [esi]
5647    vpavgb     ymm0, ymm0, [esi + edx]
5648    vmovdqu    [esi + edi], ymm0
5649    lea        esi, [esi + 32]
5650    sub        ecx, 32
5651    jg         xloop50
5652    jmp        xloop99
5653 
5654    // Blend 100 / 0 - Copy row unchanged.
5655  xloop100:
5656    rep movsb
5657 
5658   xloop99:
5659     pop        edi
5660     pop        esi
5661     vzeroupper
5662     ret
5663   }
5664 }
5665 #endif  // HAS_INTERPOLATEROW_AVX2
5666 
5667 // Bilinear filter 16x2 -> 16x1
5668 // TODO(fbarchard): Consider allowing 256 using memcpy.
5669 __declspec(naked)
5670 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5671                           ptrdiff_t src_stride, int dst_width,
5672                           int source_y_fraction) {
5673   __asm {
5674     push       esi
5675     push       edi
5676 
5677     mov        edi, [esp + 8 + 4]   // dst_ptr
5678     mov        esi, [esp + 8 + 8]   // src_ptr
5679     mov        edx, [esp + 8 + 12]  // src_stride
5680     mov        ecx, [esp + 8 + 16]  // dst_width
5681     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5682     sub        edi, esi
5683     // Dispatch to specialized filters if applicable.
5684     cmp        eax, 0
5685     je         xloop100  // 0 /256.  Blend 100 / 0.
5686     cmp        eax, 128
5687     je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
5688 
5689     movd       xmm0, eax  // high fraction 0..255
5690     neg        eax
5691     add        eax, 256
5692     movd       xmm5, eax  // low fraction 255..1
5693     punpcklbw  xmm5, xmm0
5694     punpcklwd  xmm5, xmm5
5695     pshufd     xmm5, xmm5, 0
5696     mov        eax, 0x80808080  // 128 for biasing image to signed.
5697     movd       xmm4, eax
5698     pshufd     xmm4, xmm4, 0x00
5699 
5700   xloop:
5701     movdqu     xmm0, [esi]
5702     movdqu     xmm2, [esi + edx]
5703     movdqu     xmm1, xmm0
5704     punpcklbw  xmm0, xmm2
5705     punpckhbw  xmm1, xmm2
5706     psubb      xmm0, xmm4  // bias image by -128
5707     psubb      xmm1, xmm4
5708     movdqa     xmm2, xmm5
5709     movdqa     xmm3, xmm5
5710     pmaddubsw  xmm2, xmm0
5711     pmaddubsw  xmm3, xmm1
5712     paddw      xmm2, xmm4
5713     paddw      xmm3, xmm4
5714     psrlw      xmm2, 8
5715     psrlw      xmm3, 8
5716     packuswb   xmm2, xmm3
5717     movdqu     [esi + edi], xmm2
5718     lea        esi, [esi + 16]
5719     sub        ecx, 16
5720     jg         xloop
5721     jmp        xloop99
5722 
5723     // Blend 50 / 50.
5724   xloop50:
5725     movdqu     xmm0, [esi]
5726     movdqu     xmm1, [esi + edx]
5727     pavgb      xmm0, xmm1
5728     movdqu     [esi + edi], xmm0
5729     lea        esi, [esi + 16]
5730     sub        ecx, 16
5731     jg         xloop50
5732     jmp        xloop99
5733 
5734     // Blend 100 / 0 - Copy row unchanged.
5735   xloop100:
5736     movdqu     xmm0, [esi]
5737     movdqu     [esi + edi], xmm0
5738     lea        esi, [esi + 16]
5739     sub        ecx, 16
5740     jg         xloop100
5741 
5742   xloop99:
5743     pop        edi
5744     pop        esi
5745     ret
5746   }
5747 }
5748 
5749 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5750 __declspec(naked)
5751 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5752                           const uint8* shuffler, int width) {
5753   __asm {
5754     mov        eax, [esp + 4]    // src_argb
5755     mov        edx, [esp + 8]    // dst_argb
5756     mov        ecx, [esp + 12]   // shuffler
5757     movdqu     xmm5, [ecx]
5758     mov        ecx, [esp + 16]   // width
5759 
5760   wloop:
5761     movdqu     xmm0, [eax]
5762     movdqu     xmm1, [eax + 16]
5763     lea        eax, [eax + 32]
5764     pshufb     xmm0, xmm5
5765     pshufb     xmm1, xmm5
5766     movdqu     [edx], xmm0
5767     movdqu     [edx + 16], xmm1
5768     lea        edx, [edx + 32]
5769     sub        ecx, 8
5770     jg         wloop
5771     ret
5772   }
5773 }
5774 
5775 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5776 __declspec(naked)
5777 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5778                          const uint8* shuffler, int width) {
5779   __asm {
5780     mov        eax, [esp + 4]     // src_argb
5781     mov        edx, [esp + 8]     // dst_argb
5782     mov        ecx, [esp + 12]    // shuffler
5783     vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
5784     mov        ecx, [esp + 16]    // width
5785 
5786   wloop:
5787     vmovdqu    ymm0, [eax]
5788     vmovdqu    ymm1, [eax + 32]
5789     lea        eax, [eax + 64]
5790     vpshufb    ymm0, ymm0, ymm5
5791     vpshufb    ymm1, ymm1, ymm5
5792     vmovdqu    [edx], ymm0
5793     vmovdqu    [edx + 32], ymm1
5794     lea        edx, [edx + 64]
5795     sub        ecx, 16
5796     jg         wloop
5797 
5798     vzeroupper
5799     ret
5800   }
5801 }
5802 #endif  // HAS_ARGBSHUFFLEROW_AVX2
5803 
5804 __declspec(naked)
5805 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5806                          const uint8* shuffler, int width) {
5807   __asm {
5808     push       ebx
5809     push       esi
5810     mov        eax, [esp + 8 + 4]    // src_argb
5811     mov        edx, [esp + 8 + 8]    // dst_argb
5812     mov        esi, [esp + 8 + 12]   // shuffler
5813     mov        ecx, [esp + 8 + 16]   // width
5814     pxor       xmm5, xmm5
5815 
5816     mov        ebx, [esi]   // shuffler
5817     cmp        ebx, 0x03000102
5818     je         shuf_3012
5819     cmp        ebx, 0x00010203
5820     je         shuf_0123
5821     cmp        ebx, 0x00030201
5822     je         shuf_0321
5823     cmp        ebx, 0x02010003
5824     je         shuf_2103
5825 
5826   // TODO(fbarchard): Use one source pointer and 3 offsets.
5827   shuf_any1:
5828     movzx      ebx, byte ptr [esi]
5829     movzx      ebx, byte ptr [eax + ebx]
5830     mov        [edx], bl
5831     movzx      ebx, byte ptr [esi + 1]
5832     movzx      ebx, byte ptr [eax + ebx]
5833     mov        [edx + 1], bl
5834     movzx      ebx, byte ptr [esi + 2]
5835     movzx      ebx, byte ptr [eax + ebx]
5836     mov        [edx + 2], bl
5837     movzx      ebx, byte ptr [esi + 3]
5838     movzx      ebx, byte ptr [eax + ebx]
5839     mov        [edx + 3], bl
5840     lea        eax, [eax + 4]
5841     lea        edx, [edx + 4]
5842     sub        ecx, 1
5843     jg         shuf_any1
5844     jmp        shuf99
5845 
5846   shuf_0123:
5847     movdqu     xmm0, [eax]
5848     lea        eax, [eax + 16]
5849     movdqa     xmm1, xmm0
5850     punpcklbw  xmm0, xmm5
5851     punpckhbw  xmm1, xmm5
5852     pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
5853     pshuflw    xmm0, xmm0, 01Bh
5854     pshufhw    xmm1, xmm1, 01Bh
5855     pshuflw    xmm1, xmm1, 01Bh
5856     packuswb   xmm0, xmm1
5857     movdqu     [edx], xmm0
5858     lea        edx, [edx + 16]
5859     sub        ecx, 4
5860     jg         shuf_0123
5861     jmp        shuf99
5862 
5863   shuf_0321:
5864     movdqu     xmm0, [eax]
5865     lea        eax, [eax + 16]
5866     movdqa     xmm1, xmm0
5867     punpcklbw  xmm0, xmm5
5868     punpckhbw  xmm1, xmm5
5869     pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
5870     pshuflw    xmm0, xmm0, 039h
5871     pshufhw    xmm1, xmm1, 039h
5872     pshuflw    xmm1, xmm1, 039h
5873     packuswb   xmm0, xmm1
5874     movdqu     [edx], xmm0
5875     lea        edx, [edx + 16]
5876     sub        ecx, 4
5877     jg         shuf_0321
5878     jmp        shuf99
5879 
5880   shuf_2103:
5881     movdqu     xmm0, [eax]
5882     lea        eax, [eax + 16]
5883     movdqa     xmm1, xmm0
5884     punpcklbw  xmm0, xmm5
5885     punpckhbw  xmm1, xmm5
5886     pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
5887     pshuflw    xmm0, xmm0, 093h
5888     pshufhw    xmm1, xmm1, 093h
5889     pshuflw    xmm1, xmm1, 093h
5890     packuswb   xmm0, xmm1
5891     movdqu     [edx], xmm0
5892     lea        edx, [edx + 16]
5893     sub        ecx, 4
5894     jg         shuf_2103
5895     jmp        shuf99
5896 
5897   shuf_3012:
5898     movdqu     xmm0, [eax]
5899     lea        eax, [eax + 16]
5900     movdqa     xmm1, xmm0
5901     punpcklbw  xmm0, xmm5
5902     punpckhbw  xmm1, xmm5
5903     pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
5904     pshuflw    xmm0, xmm0, 0C6h
5905     pshufhw    xmm1, xmm1, 0C6h
5906     pshuflw    xmm1, xmm1, 0C6h
5907     packuswb   xmm0, xmm1
5908     movdqu     [edx], xmm0
5909     lea        edx, [edx + 16]
5910     sub        ecx, 4
5911     jg         shuf_3012
5912 
5913   shuf99:
5914     pop        esi
5915     pop        ebx
5916     ret
5917   }
5918 }
5919 
5920 // YUY2 - Macro-pixel = 2 image pixels
5921 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5922 
5923 // UYVY - Macro-pixel = 2 image pixels
5924 // U0Y0V0Y1
5925 
5926 __declspec(naked)
5927 void I422ToYUY2Row_SSE2(const uint8* src_y,
5928                         const uint8* src_u,
5929                         const uint8* src_v,
5930                         uint8* dst_frame, int width) {
5931   __asm {
5932     push       esi
5933     push       edi
5934     mov        eax, [esp + 8 + 4]    // src_y
5935     mov        esi, [esp + 8 + 8]    // src_u
5936     mov        edx, [esp + 8 + 12]   // src_v
5937     mov        edi, [esp + 8 + 16]   // dst_frame
5938     mov        ecx, [esp + 8 + 20]   // width
5939     sub        edx, esi
5940 
5941   convertloop:
5942     movq       xmm2, qword ptr [esi] // U
5943     movq       xmm3, qword ptr [esi + edx] // V
5944     lea        esi, [esi + 8]
5945     punpcklbw  xmm2, xmm3 // UV
5946     movdqu     xmm0, [eax] // Y
5947     lea        eax, [eax + 16]
5948     movdqa     xmm1, xmm0
5949     punpcklbw  xmm0, xmm2 // YUYV
5950     punpckhbw  xmm1, xmm2
5951     movdqu     [edi], xmm0
5952     movdqu     [edi + 16], xmm1
5953     lea        edi, [edi + 32]
5954     sub        ecx, 16
5955     jg         convertloop
5956 
5957     pop        edi
5958     pop        esi
5959     ret
5960   }
5961 }
5962 
5963 __declspec(naked)
5964 void I422ToUYVYRow_SSE2(const uint8* src_y,
5965                         const uint8* src_u,
5966                         const uint8* src_v,
5967                         uint8* dst_frame, int width) {
5968   __asm {
5969     push       esi
5970     push       edi
5971     mov        eax, [esp + 8 + 4]    // src_y
5972     mov        esi, [esp + 8 + 8]    // src_u
5973     mov        edx, [esp + 8 + 12]   // src_v
5974     mov        edi, [esp + 8 + 16]   // dst_frame
5975     mov        ecx, [esp + 8 + 20]   // width
5976     sub        edx, esi
5977 
5978   convertloop:
5979     movq       xmm2, qword ptr [esi] // U
5980     movq       xmm3, qword ptr [esi + edx] // V
5981     lea        esi, [esi + 8]
5982     punpcklbw  xmm2, xmm3 // UV
5983     movdqu     xmm0, [eax] // Y
5984     movdqa     xmm1, xmm2
5985     lea        eax, [eax + 16]
5986     punpcklbw  xmm1, xmm0 // UYVY
5987     punpckhbw  xmm2, xmm0
5988     movdqu     [edi], xmm1
5989     movdqu     [edi + 16], xmm2
5990     lea        edi, [edi + 32]
5991     sub        ecx, 16
5992     jg         convertloop
5993 
5994     pop        edi
5995     pop        esi
5996     ret
5997   }
5998 }
5999 
6000 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
6001 __declspec(naked)
6002 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
6003                             uint8* dst_argb, const float* poly,
6004                             int width) {
6005   __asm {
6006     push       esi
6007     mov        eax, [esp + 4 + 4]   /* src_argb */
6008     mov        edx, [esp + 4 + 8]   /* dst_argb */
6009     mov        esi, [esp + 4 + 12]  /* poly */
6010     mov        ecx, [esp + 4 + 16]  /* width */
6011     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
6012 
6013     // 2 pixel loop.
6014  convertloop:
6015 //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
6016 //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
6017     movq       xmm0, qword ptr [eax]  // BGRABGRA
6018     lea        eax, [eax + 8]
6019     punpcklbw  xmm0, xmm3
6020     movdqa     xmm4, xmm0
6021     punpcklwd  xmm0, xmm3  // pixel 0
6022     punpckhwd  xmm4, xmm3  // pixel 1
6023     cvtdq2ps   xmm0, xmm0  // 4 floats
6024     cvtdq2ps   xmm4, xmm4
6025     movdqa     xmm1, xmm0  // X
6026     movdqa     xmm5, xmm4
6027     mulps      xmm0, [esi + 16]  // C1 * X
6028     mulps      xmm4, [esi + 16]
6029     addps      xmm0, [esi]  // result = C0 + C1 * X
6030     addps      xmm4, [esi]
6031     movdqa     xmm2, xmm1
6032     movdqa     xmm6, xmm5
6033     mulps      xmm2, xmm1  // X * X
6034     mulps      xmm6, xmm5
6035     mulps      xmm1, xmm2  // X * X * X
6036     mulps      xmm5, xmm6
6037     mulps      xmm2, [esi + 32]  // C2 * X * X
6038     mulps      xmm6, [esi + 32]
6039     mulps      xmm1, [esi + 48]  // C3 * X * X * X
6040     mulps      xmm5, [esi + 48]
6041     addps      xmm0, xmm2  // result += C2 * X * X
6042     addps      xmm4, xmm6
6043     addps      xmm0, xmm1  // result += C3 * X * X * X
6044     addps      xmm4, xmm5
6045     cvttps2dq  xmm0, xmm0
6046     cvttps2dq  xmm4, xmm4
6047     packuswb   xmm0, xmm4
6048     packuswb   xmm0, xmm0
6049     movq       qword ptr [edx], xmm0
6050     lea        edx, [edx + 8]
6051     sub        ecx, 2
6052     jg         convertloop
6053     pop        esi
6054     ret
6055   }
6056 }
6057 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
6058 
6059 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6060 __declspec(naked)
6061 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
6062                             uint8* dst_argb, const float* poly,
6063                             int width) {
6064   __asm {
6065     mov        eax, [esp + 4]   /* src_argb */
6066     mov        edx, [esp + 8]   /* dst_argb */
6067     mov        ecx, [esp + 12]   /* poly */
6068     vbroadcastf128 ymm4, [ecx]       // C0
6069     vbroadcastf128 ymm5, [ecx + 16]  // C1
6070     vbroadcastf128 ymm6, [ecx + 32]  // C2
6071     vbroadcastf128 ymm7, [ecx + 48]  // C3
6072     mov        ecx, [esp + 16]  /* width */
6073 
6074     // 2 pixel loop.
6075  convertloop:
6076     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
6077     lea         eax, [eax + 8]
6078     vcvtdq2ps   ymm0, ymm0        // X 8 floats
6079     vmulps      ymm2, ymm0, ymm0  // X * X
6080     vmulps      ymm3, ymm0, ymm7  // C3 * X
6081     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
6082     vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
6083     vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
6084     vcvttps2dq  ymm0, ymm0
6085     vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
6086     vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
6087     vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
6088     vmovq       qword ptr [edx], xmm0
6089     lea         edx, [edx + 8]
6090     sub         ecx, 2
6091     jg          convertloop
6092     vzeroupper
6093     ret
6094   }
6095 }
6096 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
6097 
6098 #ifdef HAS_ARGBCOLORTABLEROW_X86
6099 // Tranform ARGB pixels with color table.
6100 __declspec(naked)
6101 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
6102                            int width) {
6103   __asm {
6104     push       esi
6105     mov        eax, [esp + 4 + 4]   /* dst_argb */
6106     mov        esi, [esp + 4 + 8]   /* table_argb */
6107     mov        ecx, [esp + 4 + 12]  /* width */
6108 
6109     // 1 pixel loop.
6110   convertloop:
6111     movzx      edx, byte ptr [eax]
6112     lea        eax, [eax + 4]
6113     movzx      edx, byte ptr [esi + edx * 4]
6114     mov        byte ptr [eax - 4], dl
6115     movzx      edx, byte ptr [eax - 4 + 1]
6116     movzx      edx, byte ptr [esi + edx * 4 + 1]
6117     mov        byte ptr [eax - 4 + 1], dl
6118     movzx      edx, byte ptr [eax - 4 + 2]
6119     movzx      edx, byte ptr [esi + edx * 4 + 2]
6120     mov        byte ptr [eax - 4 + 2], dl
6121     movzx      edx, byte ptr [eax - 4 + 3]
6122     movzx      edx, byte ptr [esi + edx * 4 + 3]
6123     mov        byte ptr [eax - 4 + 3], dl
6124     dec        ecx
6125     jg         convertloop
6126     pop        esi
6127     ret
6128   }
6129 }
6130 #endif  // HAS_ARGBCOLORTABLEROW_X86
6131 
6132 #ifdef HAS_RGBCOLORTABLEROW_X86
6133 // Tranform RGB pixels with color table.
6134 __declspec(naked)
6135 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
6136   __asm {
6137     push       esi
6138     mov        eax, [esp + 4 + 4]   /* dst_argb */
6139     mov        esi, [esp + 4 + 8]   /* table_argb */
6140     mov        ecx, [esp + 4 + 12]  /* width */
6141 
6142     // 1 pixel loop.
6143   convertloop:
6144     movzx      edx, byte ptr [eax]
6145     lea        eax, [eax + 4]
6146     movzx      edx, byte ptr [esi + edx * 4]
6147     mov        byte ptr [eax - 4], dl
6148     movzx      edx, byte ptr [eax - 4 + 1]
6149     movzx      edx, byte ptr [esi + edx * 4 + 1]
6150     mov        byte ptr [eax - 4 + 1], dl
6151     movzx      edx, byte ptr [eax - 4 + 2]
6152     movzx      edx, byte ptr [esi + edx * 4 + 2]
6153     mov        byte ptr [eax - 4 + 2], dl
6154     dec        ecx
6155     jg         convertloop
6156 
6157     pop        esi
6158     ret
6159   }
6160 }
6161 #endif  // HAS_RGBCOLORTABLEROW_X86
6162 
6163 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6164 // Tranform RGB pixels with luma table.
6165 __declspec(naked)
6166 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6167                                  int width,
6168                                  const uint8* luma, uint32 lumacoeff) {
6169   __asm {
6170     push       esi
6171     push       edi
6172     mov        eax, [esp + 8 + 4]   /* src_argb */
6173     mov        edi, [esp + 8 + 8]   /* dst_argb */
6174     mov        ecx, [esp + 8 + 12]  /* width */
6175     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
6176     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
6177     pshufd     xmm2, xmm2, 0
6178     pshufd     xmm3, xmm3, 0
6179     pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
6180     psllw      xmm4, 8
6181     pxor       xmm5, xmm5
6182 
6183     // 4 pixel loop.
6184   convertloop:
6185     movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
6186     pmaddubsw  xmm0, xmm3
6187     phaddw     xmm0, xmm0
6188     pand       xmm0, xmm4  // mask out low bits
6189     punpcklwd  xmm0, xmm5
6190     paddd      xmm0, xmm2  // add table base
6191     movd       esi, xmm0
6192     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6193 
6194     movzx      edx, byte ptr [eax]
6195     movzx      edx, byte ptr [esi + edx]
6196     mov        byte ptr [edi], dl
6197     movzx      edx, byte ptr [eax + 1]
6198     movzx      edx, byte ptr [esi + edx]
6199     mov        byte ptr [edi + 1], dl
6200     movzx      edx, byte ptr [eax + 2]
6201     movzx      edx, byte ptr [esi + edx]
6202     mov        byte ptr [edi + 2], dl
6203     movzx      edx, byte ptr [eax + 3]  // copy alpha.
6204     mov        byte ptr [edi + 3], dl
6205 
6206     movd       esi, xmm0
6207     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6208 
6209     movzx      edx, byte ptr [eax + 4]
6210     movzx      edx, byte ptr [esi + edx]
6211     mov        byte ptr [edi + 4], dl
6212     movzx      edx, byte ptr [eax + 5]
6213     movzx      edx, byte ptr [esi + edx]
6214     mov        byte ptr [edi + 5], dl
6215     movzx      edx, byte ptr [eax + 6]
6216     movzx      edx, byte ptr [esi + edx]
6217     mov        byte ptr [edi + 6], dl
6218     movzx      edx, byte ptr [eax + 7]  // copy alpha.
6219     mov        byte ptr [edi + 7], dl
6220 
6221     movd       esi, xmm0
6222     pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6223 
6224     movzx      edx, byte ptr [eax + 8]
6225     movzx      edx, byte ptr [esi + edx]
6226     mov        byte ptr [edi + 8], dl
6227     movzx      edx, byte ptr [eax + 9]
6228     movzx      edx, byte ptr [esi + edx]
6229     mov        byte ptr [edi + 9], dl
6230     movzx      edx, byte ptr [eax + 10]
6231     movzx      edx, byte ptr [esi + edx]
6232     mov        byte ptr [edi + 10], dl
6233     movzx      edx, byte ptr [eax + 11]  // copy alpha.
6234     mov        byte ptr [edi + 11], dl
6235 
6236     movd       esi, xmm0
6237 
6238     movzx      edx, byte ptr [eax + 12]
6239     movzx      edx, byte ptr [esi + edx]
6240     mov        byte ptr [edi + 12], dl
6241     movzx      edx, byte ptr [eax + 13]
6242     movzx      edx, byte ptr [esi + edx]
6243     mov        byte ptr [edi + 13], dl
6244     movzx      edx, byte ptr [eax + 14]
6245     movzx      edx, byte ptr [esi + edx]
6246     mov        byte ptr [edi + 14], dl
6247     movzx      edx, byte ptr [eax + 15]  // copy alpha.
6248     mov        byte ptr [edi + 15], dl
6249 
6250     lea        eax, [eax + 16]
6251     lea        edi, [edi + 16]
6252     sub        ecx, 4
6253     jg         convertloop
6254 
6255     pop        edi
6256     pop        esi
6257     ret
6258   }
6259 }
6260 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6261 
6262 #endif  // defined(_M_X64)
6263 
6264 #ifdef __cplusplus
6265 }  // extern "C"
6266 }  // namespace libyuv
6267 #endif
6268 
6269 #endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6270