1 // VERSION 2
2 /*
3  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
4  *
5  *  Use of this source code is governed by a BSD-style license
6  *  that can be found in the LICENSE file in the root of the source
7  *  tree. An additional intellectual property rights grant can be found
8  *  in the file PATENTS. All contributing project authors may
9  *  be found in the AUTHORS file in the root of the source tree.
10  */
11 
12 #include "libyuv/row.h"
13 
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18 
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && \
21     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
22 
23 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
24 
25 // Constants for ARGB
26 static vec8 kARGBToY = {
27   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
28 };
29 
30 // JPeg full range.
31 static vec8 kARGBToYJ = {
32   15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
33 };
34 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
35 
36 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
37 
38 static vec8 kARGBToU = {
39   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40 };
41 
42 static vec8 kARGBToUJ = {
43   127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
44 };
45 
46 static vec8 kARGBToV = {
47   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
48 };
49 
50 static vec8 kARGBToVJ = {
51   -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
52 };
53 
54 // Constants for BGRA
55 static vec8 kBGRAToY = {
56   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
57 };
58 
59 static vec8 kBGRAToU = {
60   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
61 };
62 
63 static vec8 kBGRAToV = {
64   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
65 };
66 
67 // Constants for ABGR
68 static vec8 kABGRToY = {
69   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
70 };
71 
72 static vec8 kABGRToU = {
73   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
74 };
75 
76 static vec8 kABGRToV = {
77   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
78 };
79 
80 // Constants for RGBA.
81 static vec8 kRGBAToY = {
82   0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
83 };
84 
85 static vec8 kRGBAToU = {
86   0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
87 };
88 
89 static vec8 kRGBAToV = {
90   0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
91 };
92 
93 static uvec8 kAddY16 = {
94   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
95 };
96 
97 // 7 bit fixed point 0.5.
98 static vec16 kAddYJ64 = {
99   64, 64, 64, 64, 64, 64, 64, 64
100 };
101 
102 static uvec8 kAddUV128 = {
103   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
104   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
105 };
106 
107 static uvec16 kAddUVJ128 = {
108   0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
109 };
110 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
111 
112 #ifdef HAS_RGB24TOARGBROW_SSSE3
113 
114 // Shuffle table for converting RGB24 to ARGB.
115 static uvec8 kShuffleMaskRGB24ToARGB = {
116   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
117 };
118 
119 // Shuffle table for converting RAW to ARGB.
120 static uvec8 kShuffleMaskRAWToARGB = {
121   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
122 };
123 
124 // Shuffle table for converting RAW to RGB24.  First 8.
125 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
126   2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
127   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
128 };
129 
130 // Shuffle table for converting RAW to RGB24.  Middle 8.
131 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
132   2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
133   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
134 };
135 
136 // Shuffle table for converting RAW to RGB24.  Last 8.
137 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
138   8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
139   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
140 };
141 
142 // Shuffle table for converting ARGB to RGB24.
143 static uvec8 kShuffleMaskARGBToRGB24 = {
144   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
145 };
146 
147 // Shuffle table for converting ARGB to RAW.
148 static uvec8 kShuffleMaskARGBToRAW = {
149   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
150 };
151 
152 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
153 static uvec8 kShuffleMaskARGBToRGB24_0 = {
154   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
155 };
156 
157 // YUY2 shuf 16 Y to 32 Y.
158 static const lvec8 kShuffleYUY2Y = {
159   0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
160   0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
161 };
162 
163 // YUY2 shuf 8 UV to 16 UV.
164 static const lvec8 kShuffleYUY2UV = {
165   1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
166   1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
167 };
168 
169 // UYVY shuf 16 Y to 32 Y.
170 static const lvec8 kShuffleUYVYY = {
171   1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
172   1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
173 };
174 
175 // UYVY shuf 8 UV to 16 UV.
176 static const lvec8 kShuffleUYVYUV = {
177   0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
178   0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
179 };
180 
181 // NV21 shuf 8 VU to 16 UV.
182 static const lvec8 kShuffleNV21 = {
183   1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
184   1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
185 };
186 #endif  // HAS_RGB24TOARGBROW_SSSE3
187 
188 #ifdef HAS_J400TOARGBROW_SSE2
J400ToARGBRow_SSE2(const uint8 * src_y,uint8 * dst_argb,int width)189 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
190   asm volatile (
191     "pcmpeqb   %%xmm5,%%xmm5                   \n"
192     "pslld     $0x18,%%xmm5                    \n"
193     LABELALIGN
194   "1:                                          \n"
195     "movq      " MEMACCESS(0) ",%%xmm0         \n"
196     "lea       " MEMLEA(0x8,0) ",%0            \n"
197     "punpcklbw %%xmm0,%%xmm0                   \n"
198     "movdqa    %%xmm0,%%xmm1                   \n"
199     "punpcklwd %%xmm0,%%xmm0                   \n"
200     "punpckhwd %%xmm1,%%xmm1                   \n"
201     "por       %%xmm5,%%xmm0                   \n"
202     "por       %%xmm5,%%xmm1                   \n"
203     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
204     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
205     "lea       " MEMLEA(0x20,1) ",%1           \n"
206     "sub       $0x8,%2                         \n"
207     "jg        1b                              \n"
208   : "+r"(src_y),     // %0
209     "+r"(dst_argb),  // %1
210     "+r"(width)        // %2
211   :: "memory", "cc", "xmm0", "xmm1", "xmm5"
212   );
213 }
214 #endif  // HAS_J400TOARGBROW_SSE2
215 
216 #ifdef HAS_RGB24TOARGBROW_SSSE3
RGB24ToARGBRow_SSSE3(const uint8 * src_rgb24,uint8 * dst_argb,int width)217 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
218   asm volatile (
219     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
220     "pslld     $0x18,%%xmm5                    \n"
221     "movdqa    %3,%%xmm4                       \n"
222     LABELALIGN
223   "1:                                          \n"
224     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
225     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
226     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
227     "lea       " MEMLEA(0x30,0) ",%0           \n"
228     "movdqa    %%xmm3,%%xmm2                   \n"
229     "palignr   $0x8,%%xmm1,%%xmm2              \n"
230     "pshufb    %%xmm4,%%xmm2                   \n"
231     "por       %%xmm5,%%xmm2                   \n"
232     "palignr   $0xc,%%xmm0,%%xmm1              \n"
233     "pshufb    %%xmm4,%%xmm0                   \n"
234     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
235     "por       %%xmm5,%%xmm0                   \n"
236     "pshufb    %%xmm4,%%xmm1                   \n"
237     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
238     "por       %%xmm5,%%xmm1                   \n"
239     "palignr   $0x4,%%xmm3,%%xmm3              \n"
240     "pshufb    %%xmm4,%%xmm3                   \n"
241     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
242     "por       %%xmm5,%%xmm3                   \n"
243     "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
244     "lea       " MEMLEA(0x40,1) ",%1           \n"
245     "sub       $0x10,%2                        \n"
246     "jg        1b                              \n"
247   : "+r"(src_rgb24),  // %0
248     "+r"(dst_argb),  // %1
249     "+r"(width)        // %2
250   : "m"(kShuffleMaskRGB24ToARGB)  // %3
251   : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
252   );
253 }
254 
RAWToARGBRow_SSSE3(const uint8 * src_raw,uint8 * dst_argb,int width)255 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
256   asm volatile (
257     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
258     "pslld     $0x18,%%xmm5                    \n"
259     "movdqa    %3,%%xmm4                       \n"
260     LABELALIGN
261   "1:                                          \n"
262     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
263     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
264     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
265     "lea       " MEMLEA(0x30,0) ",%0           \n"
266     "movdqa    %%xmm3,%%xmm2                   \n"
267     "palignr   $0x8,%%xmm1,%%xmm2              \n"
268     "pshufb    %%xmm4,%%xmm2                   \n"
269     "por       %%xmm5,%%xmm2                   \n"
270     "palignr   $0xc,%%xmm0,%%xmm1              \n"
271     "pshufb    %%xmm4,%%xmm0                   \n"
272     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
273     "por       %%xmm5,%%xmm0                   \n"
274     "pshufb    %%xmm4,%%xmm1                   \n"
275     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
276     "por       %%xmm5,%%xmm1                   \n"
277     "palignr   $0x4,%%xmm3,%%xmm3              \n"
278     "pshufb    %%xmm4,%%xmm3                   \n"
279     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
280     "por       %%xmm5,%%xmm3                   \n"
281     "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
282     "lea       " MEMLEA(0x40,1) ",%1           \n"
283     "sub       $0x10,%2                        \n"
284     "jg        1b                              \n"
285   : "+r"(src_raw),   // %0
286     "+r"(dst_argb),  // %1
287     "+r"(width)        // %2
288   : "m"(kShuffleMaskRAWToARGB)  // %3
289   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
290   );
291 }
292 
RAWToRGB24Row_SSSE3(const uint8 * src_raw,uint8 * dst_rgb24,int width)293 void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
294   asm volatile (
295    "movdqa     %3,%%xmm3                       \n"
296    "movdqa     %4,%%xmm4                       \n"
297    "movdqa     %5,%%xmm5                       \n"
298     LABELALIGN
299   "1:                                          \n"
300     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
301     "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
302     "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
303     "lea       " MEMLEA(0x18,0) ",%0           \n"
304     "pshufb    %%xmm3,%%xmm0                   \n"
305     "pshufb    %%xmm4,%%xmm1                   \n"
306     "pshufb    %%xmm5,%%xmm2                   \n"
307     "movq      %%xmm0," MEMACCESS(1) "         \n"
308     "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
309     "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
310     "lea       " MEMLEA(0x18,1) ",%1           \n"
311     "sub       $0x8,%2                         \n"
312     "jg        1b                              \n"
313   : "+r"(src_raw),    // %0
314     "+r"(dst_rgb24),  // %1
315     "+r"(width)       // %2
316   : "m"(kShuffleMaskRAWToRGB24_0),  // %3
317     "m"(kShuffleMaskRAWToRGB24_1),  // %4
318     "m"(kShuffleMaskRAWToRGB24_2)   // %5
319   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
320   );
321 }
322 
RGB565ToARGBRow_SSE2(const uint8 * src,uint8 * dst,int width)323 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
324   asm volatile (
325     "mov       $0x1080108,%%eax                \n"
326     "movd      %%eax,%%xmm5                    \n"
327     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
328     "mov       $0x20802080,%%eax               \n"
329     "movd      %%eax,%%xmm6                    \n"
330     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
331     "pcmpeqb   %%xmm3,%%xmm3                   \n"
332     "psllw     $0xb,%%xmm3                     \n"
333     "pcmpeqb   %%xmm4,%%xmm4                   \n"
334     "psllw     $0xa,%%xmm4                     \n"
335     "psrlw     $0x5,%%xmm4                     \n"
336     "pcmpeqb   %%xmm7,%%xmm7                   \n"
337     "psllw     $0x8,%%xmm7                     \n"
338     "sub       %0,%1                           \n"
339     "sub       %0,%1                           \n"
340     LABELALIGN
341   "1:                                          \n"
342     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
343     "movdqa    %%xmm0,%%xmm1                   \n"
344     "movdqa    %%xmm0,%%xmm2                   \n"
345     "pand      %%xmm3,%%xmm1                   \n"
346     "psllw     $0xb,%%xmm2                     \n"
347     "pmulhuw   %%xmm5,%%xmm1                   \n"
348     "pmulhuw   %%xmm5,%%xmm2                   \n"
349     "psllw     $0x8,%%xmm1                     \n"
350     "por       %%xmm2,%%xmm1                   \n"
351     "pand      %%xmm4,%%xmm0                   \n"
352     "pmulhuw   %%xmm6,%%xmm0                   \n"
353     "por       %%xmm7,%%xmm0                   \n"
354     "movdqa    %%xmm1,%%xmm2                   \n"
355     "punpcklbw %%xmm0,%%xmm1                   \n"
356     "punpckhbw %%xmm0,%%xmm2                   \n"
357     MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
358     MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
359     "lea       " MEMLEA(0x10,0) ",%0           \n"
360     "sub       $0x8,%2                         \n"
361     "jg        1b                              \n"
362   : "+r"(src),  // %0
363     "+r"(dst),  // %1
364     "+r"(width)   // %2
365   :
366   : "memory", "cc", "eax", NACL_R14
367     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
368   );
369 }
370 
ARGB1555ToARGBRow_SSE2(const uint8 * src,uint8 * dst,int width)371 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
372   asm volatile (
373     "mov       $0x1080108,%%eax                \n"
374     "movd      %%eax,%%xmm5                    \n"
375     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
376     "mov       $0x42004200,%%eax               \n"
377     "movd      %%eax,%%xmm6                    \n"
378     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
379     "pcmpeqb   %%xmm3,%%xmm3                   \n"
380     "psllw     $0xb,%%xmm3                     \n"
381     "movdqa    %%xmm3,%%xmm4                   \n"
382     "psrlw     $0x6,%%xmm4                     \n"
383     "pcmpeqb   %%xmm7,%%xmm7                   \n"
384     "psllw     $0x8,%%xmm7                     \n"
385     "sub       %0,%1                           \n"
386     "sub       %0,%1                           \n"
387     LABELALIGN
388   "1:                                          \n"
389     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
390     "movdqa    %%xmm0,%%xmm1                   \n"
391     "movdqa    %%xmm0,%%xmm2                   \n"
392     "psllw     $0x1,%%xmm1                     \n"
393     "psllw     $0xb,%%xmm2                     \n"
394     "pand      %%xmm3,%%xmm1                   \n"
395     "pmulhuw   %%xmm5,%%xmm2                   \n"
396     "pmulhuw   %%xmm5,%%xmm1                   \n"
397     "psllw     $0x8,%%xmm1                     \n"
398     "por       %%xmm2,%%xmm1                   \n"
399     "movdqa    %%xmm0,%%xmm2                   \n"
400     "pand      %%xmm4,%%xmm0                   \n"
401     "psraw     $0x8,%%xmm2                     \n"
402     "pmulhuw   %%xmm6,%%xmm0                   \n"
403     "pand      %%xmm7,%%xmm2                   \n"
404     "por       %%xmm2,%%xmm0                   \n"
405     "movdqa    %%xmm1,%%xmm2                   \n"
406     "punpcklbw %%xmm0,%%xmm1                   \n"
407     "punpckhbw %%xmm0,%%xmm2                   \n"
408     MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
409     MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
410     "lea       " MEMLEA(0x10,0) ",%0           \n"
411     "sub       $0x8,%2                         \n"
412     "jg        1b                              \n"
413   : "+r"(src),  // %0
414     "+r"(dst),  // %1
415     "+r"(width)   // %2
416   :
417   : "memory", "cc", "eax", NACL_R14
418     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
419   );
420 }
421 
ARGB4444ToARGBRow_SSE2(const uint8 * src,uint8 * dst,int width)422 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
423   asm volatile (
424     "mov       $0xf0f0f0f,%%eax                \n"
425     "movd      %%eax,%%xmm4                    \n"
426     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
427     "movdqa    %%xmm4,%%xmm5                   \n"
428     "pslld     $0x4,%%xmm5                     \n"
429     "sub       %0,%1                           \n"
430     "sub       %0,%1                           \n"
431     LABELALIGN
432   "1:                                          \n"
433     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
434     "movdqa    %%xmm0,%%xmm2                   \n"
435     "pand      %%xmm4,%%xmm0                   \n"
436     "pand      %%xmm5,%%xmm2                   \n"
437     "movdqa    %%xmm0,%%xmm1                   \n"
438     "movdqa    %%xmm2,%%xmm3                   \n"
439     "psllw     $0x4,%%xmm1                     \n"
440     "psrlw     $0x4,%%xmm3                     \n"
441     "por       %%xmm1,%%xmm0                   \n"
442     "por       %%xmm3,%%xmm2                   \n"
443     "movdqa    %%xmm0,%%xmm1                   \n"
444     "punpcklbw %%xmm2,%%xmm0                   \n"
445     "punpckhbw %%xmm2,%%xmm1                   \n"
446     MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
447     MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
448     "lea       " MEMLEA(0x10,0) ",%0           \n"
449     "sub       $0x8,%2                         \n"
450     "jg        1b                              \n"
451   : "+r"(src),  // %0
452     "+r"(dst),  // %1
453     "+r"(width)   // %2
454   :
455   : "memory", "cc", "eax", NACL_R14
456     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
457   );
458 }
459 
ARGBToRGB24Row_SSSE3(const uint8 * src,uint8 * dst,int width)460 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
461   asm volatile (
462     "movdqa    %3,%%xmm6                       \n"
463     LABELALIGN
464   "1:                                          \n"
465     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
466     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
467     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
468     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
469     "lea       " MEMLEA(0x40,0) ",%0           \n"
470     "pshufb    %%xmm6,%%xmm0                   \n"
471     "pshufb    %%xmm6,%%xmm1                   \n"
472     "pshufb    %%xmm6,%%xmm2                   \n"
473     "pshufb    %%xmm6,%%xmm3                   \n"
474     "movdqa    %%xmm1,%%xmm4                   \n"
475     "psrldq    $0x4,%%xmm1                     \n"
476     "pslldq    $0xc,%%xmm4                     \n"
477     "movdqa    %%xmm2,%%xmm5                   \n"
478     "por       %%xmm4,%%xmm0                   \n"
479     "pslldq    $0x8,%%xmm5                     \n"
480     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
481     "por       %%xmm5,%%xmm1                   \n"
482     "psrldq    $0x8,%%xmm2                     \n"
483     "pslldq    $0x4,%%xmm3                     \n"
484     "por       %%xmm3,%%xmm2                   \n"
485     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
486     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
487     "lea       " MEMLEA(0x30,1) ",%1           \n"
488     "sub       $0x10,%2                        \n"
489     "jg        1b                              \n"
490   : "+r"(src),  // %0
491     "+r"(dst),  // %1
492     "+r"(width)   // %2
493   : "m"(kShuffleMaskARGBToRGB24)  // %3
494   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
495   );
496 }
497 
ARGBToRAWRow_SSSE3(const uint8 * src,uint8 * dst,int width)498 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
499   asm volatile (
500     "movdqa    %3,%%xmm6                       \n"
501     LABELALIGN
502   "1:                                          \n"
503     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
504     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
505     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
506     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
507     "lea       " MEMLEA(0x40,0) ",%0           \n"
508     "pshufb    %%xmm6,%%xmm0                   \n"
509     "pshufb    %%xmm6,%%xmm1                   \n"
510     "pshufb    %%xmm6,%%xmm2                   \n"
511     "pshufb    %%xmm6,%%xmm3                   \n"
512     "movdqa    %%xmm1,%%xmm4                   \n"
513     "psrldq    $0x4,%%xmm1                     \n"
514     "pslldq    $0xc,%%xmm4                     \n"
515     "movdqa    %%xmm2,%%xmm5                   \n"
516     "por       %%xmm4,%%xmm0                   \n"
517     "pslldq    $0x8,%%xmm5                     \n"
518     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
519     "por       %%xmm5,%%xmm1                   \n"
520     "psrldq    $0x8,%%xmm2                     \n"
521     "pslldq    $0x4,%%xmm3                     \n"
522     "por       %%xmm3,%%xmm2                   \n"
523     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
524     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
525     "lea       " MEMLEA(0x30,1) ",%1           \n"
526     "sub       $0x10,%2                        \n"
527     "jg        1b                              \n"
528   : "+r"(src),  // %0
529     "+r"(dst),  // %1
530     "+r"(width)   // %2
531   : "m"(kShuffleMaskARGBToRAW)  // %3
532   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
533   );
534 }
535 
ARGBToRGB565Row_SSE2(const uint8 * src,uint8 * dst,int width)536 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
537   asm volatile (
538     "pcmpeqb   %%xmm3,%%xmm3                   \n"
539     "psrld     $0x1b,%%xmm3                    \n"
540     "pcmpeqb   %%xmm4,%%xmm4                   \n"
541     "psrld     $0x1a,%%xmm4                    \n"
542     "pslld     $0x5,%%xmm4                     \n"
543     "pcmpeqb   %%xmm5,%%xmm5                   \n"
544     "pslld     $0xb,%%xmm5                     \n"
545     LABELALIGN
546   "1:                                          \n"
547     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
548     "movdqa    %%xmm0,%%xmm1                   \n"
549     "movdqa    %%xmm0,%%xmm2                   \n"
550     "pslld     $0x8,%%xmm0                     \n"
551     "psrld     $0x3,%%xmm1                     \n"
552     "psrld     $0x5,%%xmm2                     \n"
553     "psrad     $0x10,%%xmm0                    \n"
554     "pand      %%xmm3,%%xmm1                   \n"
555     "pand      %%xmm4,%%xmm2                   \n"
556     "pand      %%xmm5,%%xmm0                   \n"
557     "por       %%xmm2,%%xmm1                   \n"
558     "por       %%xmm1,%%xmm0                   \n"
559     "packssdw  %%xmm0,%%xmm0                   \n"
560     "lea       " MEMLEA(0x10,0) ",%0           \n"
561     "movq      %%xmm0," MEMACCESS(1) "         \n"
562     "lea       " MEMLEA(0x8,1) ",%1            \n"
563     "sub       $0x4,%2                         \n"
564     "jg        1b                              \n"
565   : "+r"(src),  // %0
566     "+r"(dst),  // %1
567     "+r"(width)   // %2
568   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
569   );
570 }
571 
ARGBToRGB565DitherRow_SSE2(const uint8 * src,uint8 * dst,const uint32 dither4,int width)572 void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
573                                 const uint32 dither4, int width) {
574   asm volatile (
575     "movd       %3,%%xmm6                      \n"
576     "punpcklbw  %%xmm6,%%xmm6                  \n"
577     "movdqa     %%xmm6,%%xmm7                  \n"
578     "punpcklwd  %%xmm6,%%xmm6                  \n"
579     "punpckhwd  %%xmm7,%%xmm7                  \n"
580     "pcmpeqb    %%xmm3,%%xmm3                  \n"
581     "psrld      $0x1b,%%xmm3                   \n"
582     "pcmpeqb    %%xmm4,%%xmm4                  \n"
583     "psrld      $0x1a,%%xmm4                   \n"
584     "pslld      $0x5,%%xmm4                    \n"
585     "pcmpeqb    %%xmm5,%%xmm5                  \n"
586     "pslld      $0xb,%%xmm5                    \n"
587 
588     LABELALIGN
589   "1:                                          \n"
590     "movdqu     (%0),%%xmm0                    \n"
591     "paddusb    %%xmm6,%%xmm0                  \n"
592     "movdqa     %%xmm0,%%xmm1                  \n"
593     "movdqa     %%xmm0,%%xmm2                  \n"
594     "pslld      $0x8,%%xmm0                    \n"
595     "psrld      $0x3,%%xmm1                    \n"
596     "psrld      $0x5,%%xmm2                    \n"
597     "psrad      $0x10,%%xmm0                   \n"
598     "pand       %%xmm3,%%xmm1                  \n"
599     "pand       %%xmm4,%%xmm2                  \n"
600     "pand       %%xmm5,%%xmm0                  \n"
601     "por        %%xmm2,%%xmm1                  \n"
602     "por        %%xmm1,%%xmm0                  \n"
603     "packssdw   %%xmm0,%%xmm0                  \n"
604     "lea        0x10(%0),%0                    \n"
605     "movq       %%xmm0,(%1)                    \n"
606     "lea        0x8(%1),%1                     \n"
607     "sub        $0x4,%2                        \n"
608     "jg        1b                              \n"
609   : "+r"(src),  // %0
610     "+r"(dst),  // %1
611     "+r"(width)   // %2
612   : "m"(dither4) // %3
613   : "memory", "cc",
614     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
615   );
616 }
617 
618 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
ARGBToRGB565DitherRow_AVX2(const uint8 * src,uint8 * dst,const uint32 dither4,int width)619 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
620                                 const uint32 dither4, int width) {
621   asm volatile (
622     "vbroadcastss %3,%%xmm6                    \n"
623     "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
624     "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
625     "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
626     "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
627     "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
628     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
629     "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
630     "vpslld     $0x5,%%ymm4,%%ymm4             \n"
631     "vpslld     $0xb,%%ymm3,%%ymm5             \n"
632 
633     LABELALIGN
634   "1:                                          \n"
635     "vmovdqu    (%0),%%ymm0                    \n"
636     "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
637     "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
638     "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
639     "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
640     "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
641     "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
642     "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
643     "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
644     "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
645     "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
646     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
647     "lea        0x20(%0),%0                    \n"
648     "vmovdqu    %%xmm0,(%1)                    \n"
649     "lea        0x10(%1),%1                    \n"
650     "sub        $0x8,%2                        \n"
651     "jg         1b                             \n"
652     "vzeroupper                                \n"
653   : "+r"(src),  // %0
654     "+r"(dst),  // %1
655     "+r"(width)   // %2
656   : "m"(dither4) // %3
657   : "memory", "cc",
658     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
659   );
660 }
661 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
662 
663 
ARGBToARGB1555Row_SSE2(const uint8 * src,uint8 * dst,int width)664 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
665   asm volatile (
666     "pcmpeqb   %%xmm4,%%xmm4                   \n"
667     "psrld     $0x1b,%%xmm4                    \n"
668     "movdqa    %%xmm4,%%xmm5                   \n"
669     "pslld     $0x5,%%xmm5                     \n"
670     "movdqa    %%xmm4,%%xmm6                   \n"
671     "pslld     $0xa,%%xmm6                     \n"
672     "pcmpeqb   %%xmm7,%%xmm7                   \n"
673     "pslld     $0xf,%%xmm7                     \n"
674     LABELALIGN
675   "1:                                          \n"
676     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
677     "movdqa    %%xmm0,%%xmm1                   \n"
678     "movdqa    %%xmm0,%%xmm2                   \n"
679     "movdqa    %%xmm0,%%xmm3                   \n"
680     "psrad     $0x10,%%xmm0                    \n"
681     "psrld     $0x3,%%xmm1                     \n"
682     "psrld     $0x6,%%xmm2                     \n"
683     "psrld     $0x9,%%xmm3                     \n"
684     "pand      %%xmm7,%%xmm0                   \n"
685     "pand      %%xmm4,%%xmm1                   \n"
686     "pand      %%xmm5,%%xmm2                   \n"
687     "pand      %%xmm6,%%xmm3                   \n"
688     "por       %%xmm1,%%xmm0                   \n"
689     "por       %%xmm3,%%xmm2                   \n"
690     "por       %%xmm2,%%xmm0                   \n"
691     "packssdw  %%xmm0,%%xmm0                   \n"
692     "lea       " MEMLEA(0x10,0) ",%0           \n"
693     "movq      %%xmm0," MEMACCESS(1) "         \n"
694     "lea       " MEMLEA(0x8,1) ",%1            \n"
695     "sub       $0x4,%2                         \n"
696     "jg        1b                              \n"
697   : "+r"(src),  // %0
698     "+r"(dst),  // %1
699     "+r"(width)   // %2
700   :: "memory", "cc",
701     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
702   );
703 }
704 
ARGBToARGB4444Row_SSE2(const uint8 * src,uint8 * dst,int width)705 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
706   asm volatile (
707     "pcmpeqb   %%xmm4,%%xmm4                   \n"
708     "psllw     $0xc,%%xmm4                     \n"
709     "movdqa    %%xmm4,%%xmm3                   \n"
710     "psrlw     $0x8,%%xmm3                     \n"
711     LABELALIGN
712   "1:                                          \n"
713     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
714     "movdqa    %%xmm0,%%xmm1                   \n"
715     "pand      %%xmm3,%%xmm0                   \n"
716     "pand      %%xmm4,%%xmm1                   \n"
717     "psrlq     $0x4,%%xmm0                     \n"
718     "psrlq     $0x8,%%xmm1                     \n"
719     "por       %%xmm1,%%xmm0                   \n"
720     "packuswb  %%xmm0,%%xmm0                   \n"
721     "lea       " MEMLEA(0x10,0) ",%0           \n"
722     "movq      %%xmm0," MEMACCESS(1) "         \n"
723     "lea       " MEMLEA(0x8,1) ",%1            \n"
724     "sub       $0x4,%2                         \n"
725     "jg        1b                              \n"
726   : "+r"(src),  // %0
727     "+r"(dst),  // %1
728     "+r"(width)   // %2
729   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
730   );
731 }
732 #endif  // HAS_RGB24TOARGBROW_SSSE3
733 
734 #ifdef HAS_ARGBTOYROW_SSSE3
735 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
ARGBToYRow_SSSE3(const uint8 * src_argb,uint8 * dst_y,int width)736 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
737   asm volatile (
738     "movdqa    %3,%%xmm4                       \n"
739     "movdqa    %4,%%xmm5                       \n"
740     LABELALIGN
741   "1:                                          \n"
742     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
743     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
744     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
745     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
746     "pmaddubsw %%xmm4,%%xmm0                   \n"
747     "pmaddubsw %%xmm4,%%xmm1                   \n"
748     "pmaddubsw %%xmm4,%%xmm2                   \n"
749     "pmaddubsw %%xmm4,%%xmm3                   \n"
750     "lea       " MEMLEA(0x40,0) ",%0           \n"
751     "phaddw    %%xmm1,%%xmm0                   \n"
752     "phaddw    %%xmm3,%%xmm2                   \n"
753     "psrlw     $0x7,%%xmm0                     \n"
754     "psrlw     $0x7,%%xmm2                     \n"
755     "packuswb  %%xmm2,%%xmm0                   \n"
756     "paddb     %%xmm5,%%xmm0                   \n"
757     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
758     "lea       " MEMLEA(0x10,1) ",%1           \n"
759     "sub       $0x10,%2                        \n"
760     "jg        1b                              \n"
761   : "+r"(src_argb),  // %0
762     "+r"(dst_y),     // %1
763     "+r"(width)        // %2
764   : "m"(kARGBToY),   // %3
765     "m"(kAddY16)     // %4
766   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
767   );
768 }
769 #endif  // HAS_ARGBTOYROW_SSSE3
770 
771 #ifdef HAS_ARGBTOYJROW_SSSE3
772 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
773 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
ARGBToYJRow_SSSE3(const uint8 * src_argb,uint8 * dst_y,int width)774 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
775   asm volatile (
776     "movdqa    %3,%%xmm4                       \n"
777     "movdqa    %4,%%xmm5                       \n"
778     LABELALIGN
779   "1:                                          \n"
780     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
781     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
782     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
783     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
784     "pmaddubsw %%xmm4,%%xmm0                   \n"
785     "pmaddubsw %%xmm4,%%xmm1                   \n"
786     "pmaddubsw %%xmm4,%%xmm2                   \n"
787     "pmaddubsw %%xmm4,%%xmm3                   \n"
788     "lea       " MEMLEA(0x40,0) ",%0           \n"
789     "phaddw    %%xmm1,%%xmm0                   \n"
790     "phaddw    %%xmm3,%%xmm2                   \n"
791     "paddw     %%xmm5,%%xmm0                   \n"
792     "paddw     %%xmm5,%%xmm2                   \n"
793     "psrlw     $0x7,%%xmm0                     \n"
794     "psrlw     $0x7,%%xmm2                     \n"
795     "packuswb  %%xmm2,%%xmm0                   \n"
796     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
797     "lea       " MEMLEA(0x10,1) ",%1           \n"
798     "sub       $0x10,%2                        \n"
799     "jg        1b                              \n"
800   : "+r"(src_argb),  // %0
801     "+r"(dst_y),     // %1
802     "+r"(width)        // %2
803   : "m"(kARGBToYJ),  // %3
804     "m"(kAddYJ64)    // %4
805   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
806   );
807 }
808 #endif  // HAS_ARGBTOYJROW_SSSE3
809 
810 #ifdef HAS_ARGBTOYROW_AVX2
811 // vpermd for vphaddw + vpackuswb vpermd.
812 static const lvec32 kPermdARGBToY_AVX = {
813   0, 4, 1, 5, 2, 6, 3, 7
814 };
815 
816 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
ARGBToYRow_AVX2(const uint8 * src_argb,uint8 * dst_y,int width)817 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
818   asm volatile (
819     "vbroadcastf128 %3,%%ymm4                  \n"
820     "vbroadcastf128 %4,%%ymm5                  \n"
821     "vmovdqu    %5,%%ymm6                      \n"
822     LABELALIGN
823   "1:                                          \n"
824     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
825     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
826     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
827     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
828     "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
829     "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
830     "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
831     "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
832     "lea       " MEMLEA(0x80,0) ",%0           \n"
833     "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
834     "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
835     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
836     "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
837     "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
838     "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
839     "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
840     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
841     "lea       " MEMLEA(0x20,1) ",%1           \n"
842     "sub       $0x20,%2                        \n"
843     "jg        1b                              \n"
844     "vzeroupper                                \n"
845   : "+r"(src_argb),  // %0
846     "+r"(dst_y),     // %1
847     "+r"(width)        // %2
848   : "m"(kARGBToY),   // %3
849     "m"(kAddY16),    // %4
850     "m"(kPermdARGBToY_AVX)  // %5
851   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
852   );
853 }
854 #endif  // HAS_ARGBTOYROW_AVX2
855 
856 #ifdef HAS_ARGBTOYJROW_AVX2
857 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
ARGBToYJRow_AVX2(const uint8 * src_argb,uint8 * dst_y,int width)858 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
859   asm volatile (
860     "vbroadcastf128 %3,%%ymm4                  \n"
861     "vbroadcastf128 %4,%%ymm5                  \n"
862     "vmovdqu    %5,%%ymm6                      \n"
863     LABELALIGN
864   "1:                                          \n"
865     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
866     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
867     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
868     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
869     "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
870     "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
871     "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
872     "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
873     "lea       " MEMLEA(0x80,0) ",%0           \n"
874     "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
875     "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
876     "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
877     "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
878     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
879     "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
880     "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
881     "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
882     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
883     "lea       " MEMLEA(0x20,1) ",%1           \n"
884     "sub       $0x20,%2                        \n"
885     "jg        1b                              \n"
886     "vzeroupper                                \n"
887   : "+r"(src_argb),  // %0
888     "+r"(dst_y),     // %1
889     "+r"(width)        // %2
890   : "m"(kARGBToYJ),   // %3
891     "m"(kAddYJ64),    // %4
892     "m"(kPermdARGBToY_AVX)  // %5
893   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
894   );
895 }
896 #endif  // HAS_ARGBTOYJROW_AVX2
897 
898 #ifdef HAS_ARGBTOUVROW_SSSE3
ARGBToUVRow_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)899 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
900                        uint8* dst_u, uint8* dst_v, int width) {
901   asm volatile (
902     "movdqa    %5,%%xmm3                       \n"
903     "movdqa    %6,%%xmm4                       \n"
904     "movdqa    %7,%%xmm5                       \n"
905     "sub       %1,%2                           \n"
906     LABELALIGN
907   "1:                                          \n"
908     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
909     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
910     "pavgb     %%xmm7,%%xmm0                   \n"
911     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
912     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
913     "pavgb     %%xmm7,%%xmm1                   \n"
914     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
915     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
916     "pavgb     %%xmm7,%%xmm2                   \n"
917     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
918     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
919     "pavgb     %%xmm7,%%xmm6                   \n"
920 
921     "lea       " MEMLEA(0x40,0) ",%0           \n"
922     "movdqa    %%xmm0,%%xmm7                   \n"
923     "shufps    $0x88,%%xmm1,%%xmm0             \n"
924     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
925     "pavgb     %%xmm7,%%xmm0                   \n"
926     "movdqa    %%xmm2,%%xmm7                   \n"
927     "shufps    $0x88,%%xmm6,%%xmm2             \n"
928     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
929     "pavgb     %%xmm7,%%xmm2                   \n"
930     "movdqa    %%xmm0,%%xmm1                   \n"
931     "movdqa    %%xmm2,%%xmm6                   \n"
932     "pmaddubsw %%xmm4,%%xmm0                   \n"
933     "pmaddubsw %%xmm4,%%xmm2                   \n"
934     "pmaddubsw %%xmm3,%%xmm1                   \n"
935     "pmaddubsw %%xmm3,%%xmm6                   \n"
936     "phaddw    %%xmm2,%%xmm0                   \n"
937     "phaddw    %%xmm6,%%xmm1                   \n"
938     "psraw     $0x8,%%xmm0                     \n"
939     "psraw     $0x8,%%xmm1                     \n"
940     "packsswb  %%xmm1,%%xmm0                   \n"
941     "paddb     %%xmm5,%%xmm0                   \n"
942     "movlps    %%xmm0," MEMACCESS(1) "         \n"
943     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
944     "lea       " MEMLEA(0x8,1) ",%1            \n"
945     "sub       $0x10,%3                        \n"
946     "jg        1b                              \n"
947   : "+r"(src_argb0),       // %0
948     "+r"(dst_u),           // %1
949     "+r"(dst_v),           // %2
950     "+rm"(width)           // %3
951   : "r"((intptr_t)(src_stride_argb)), // %4
952     "m"(kARGBToV),  // %5
953     "m"(kARGBToU),  // %6
954     "m"(kAddUV128)  // %7
955   : "memory", "cc", NACL_R14
956     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
957   );
958 }
959 #endif  // HAS_ARGBTOUVROW_SSSE3
960 
961 #ifdef HAS_ARGBTOUVROW_AVX2
962 // vpshufb for vphaddw + vpackuswb packed to shorts.
963 static const lvec8 kShufARGBToUV_AVX = {
964   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
965   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
966 };
ARGBToUVRow_AVX2(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)967 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
968                       uint8* dst_u, uint8* dst_v, int width) {
969   asm volatile (
970     "vbroadcastf128 %5,%%ymm5                  \n"
971     "vbroadcastf128 %6,%%ymm6                  \n"
972     "vbroadcastf128 %7,%%ymm7                  \n"
973     "sub       %1,%2                           \n"
974     LABELALIGN
975   "1:                                          \n"
976     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
977     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
978     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
979     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
980     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
981     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
982     VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
983     VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
984     "lea       " MEMLEA(0x80,0) ",%0           \n"
985     "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
986     "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
987     "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
988     "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
989     "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
990     "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
991 
992     "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
993     "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
994     "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
995     "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
996     "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
997     "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
998     "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
999     "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
1000     "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
1001     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
1002     "vpshufb    %8,%%ymm0,%%ymm0               \n"
1003     "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
1004 
1005     "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
1006     VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
1007     "lea       " MEMLEA(0x10,1) ",%1           \n"
1008     "sub       $0x20,%3                        \n"
1009     "jg        1b                              \n"
1010     "vzeroupper                                \n"
1011   : "+r"(src_argb0),       // %0
1012     "+r"(dst_u),           // %1
1013     "+r"(dst_v),           // %2
1014     "+rm"(width)           // %3
1015   : "r"((intptr_t)(src_stride_argb)), // %4
1016     "m"(kAddUV128),  // %5
1017     "m"(kARGBToV),   // %6
1018     "m"(kARGBToU),   // %7
1019     "m"(kShufARGBToUV_AVX)  // %8
1020   : "memory", "cc", NACL_R14
1021     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1022   );
1023 }
1024 #endif  // HAS_ARGBTOUVROW_AVX2
1025 
1026 #ifdef HAS_ARGBTOUVJROW_AVX2
ARGBToUVJRow_AVX2(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1027 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1028                        uint8* dst_u, uint8* dst_v, int width) {
1029   asm volatile (
1030     "vbroadcastf128 %5,%%ymm5                  \n"
1031     "vbroadcastf128 %6,%%ymm6                  \n"
1032     "vbroadcastf128 %7,%%ymm7                  \n"
1033     "sub       %1,%2                           \n"
1034     LABELALIGN
1035   "1:                                          \n"
1036     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
1037     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
1038     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
1039     "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
1040     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
1041     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
1042     VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
1043     VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
1044     "lea       " MEMLEA(0x80,0) ",%0           \n"
1045     "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
1046     "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
1047     "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
1048     "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
1049     "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
1050     "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
1051 
1052     "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
1053     "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
1054     "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
1055     "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
1056     "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
1057     "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
1058     "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
1059     "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
1060     "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
1061     "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
1062     "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
1063     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
1064     "vpshufb    %8,%%ymm0,%%ymm0               \n"
1065 
1066     "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
1067     VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
1068     "lea       " MEMLEA(0x10,1) ",%1           \n"
1069     "sub       $0x20,%3                        \n"
1070     "jg        1b                              \n"
1071     "vzeroupper                                \n"
1072   : "+r"(src_argb0),       // %0
1073     "+r"(dst_u),           // %1
1074     "+r"(dst_v),           // %2
1075     "+rm"(width)           // %3
1076   : "r"((intptr_t)(src_stride_argb)), // %4
1077     "m"(kAddUVJ128),  // %5
1078     "m"(kARGBToVJ),  // %6
1079     "m"(kARGBToUJ),  // %7
1080     "m"(kShufARGBToUV_AVX)  // %8
1081   : "memory", "cc", NACL_R14
1082     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1083   );
1084 }
1085 #endif  // HAS_ARGBTOUVJROW_AVX2
1086 
1087 #ifdef HAS_ARGBTOUVJROW_SSSE3
ARGBToUVJRow_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1088 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1089                         uint8* dst_u, uint8* dst_v, int width) {
1090   asm volatile (
1091     "movdqa    %5,%%xmm3                       \n"
1092     "movdqa    %6,%%xmm4                       \n"
1093     "movdqa    %7,%%xmm5                       \n"
1094     "sub       %1,%2                           \n"
1095     LABELALIGN
1096   "1:                                          \n"
1097     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1098     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1099     "pavgb     %%xmm7,%%xmm0                   \n"
1100     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1101     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1102     "pavgb     %%xmm7,%%xmm1                   \n"
1103     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1104     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1105     "pavgb     %%xmm7,%%xmm2                   \n"
1106     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1107     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1108     "pavgb     %%xmm7,%%xmm6                   \n"
1109 
1110     "lea       " MEMLEA(0x40,0) ",%0           \n"
1111     "movdqa    %%xmm0,%%xmm7                   \n"
1112     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1113     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1114     "pavgb     %%xmm7,%%xmm0                   \n"
1115     "movdqa    %%xmm2,%%xmm7                   \n"
1116     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1117     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1118     "pavgb     %%xmm7,%%xmm2                   \n"
1119     "movdqa    %%xmm0,%%xmm1                   \n"
1120     "movdqa    %%xmm2,%%xmm6                   \n"
1121     "pmaddubsw %%xmm4,%%xmm0                   \n"
1122     "pmaddubsw %%xmm4,%%xmm2                   \n"
1123     "pmaddubsw %%xmm3,%%xmm1                   \n"
1124     "pmaddubsw %%xmm3,%%xmm6                   \n"
1125     "phaddw    %%xmm2,%%xmm0                   \n"
1126     "phaddw    %%xmm6,%%xmm1                   \n"
1127     "paddw     %%xmm5,%%xmm0                   \n"
1128     "paddw     %%xmm5,%%xmm1                   \n"
1129     "psraw     $0x8,%%xmm0                     \n"
1130     "psraw     $0x8,%%xmm1                     \n"
1131     "packsswb  %%xmm1,%%xmm0                   \n"
1132     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1133     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1134     "lea       " MEMLEA(0x8,1) ",%1            \n"
1135     "sub       $0x10,%3                        \n"
1136     "jg        1b                              \n"
1137   : "+r"(src_argb0),       // %0
1138     "+r"(dst_u),           // %1
1139     "+r"(dst_v),           // %2
1140     "+rm"(width)           // %3
1141   : "r"((intptr_t)(src_stride_argb)), // %4
1142     "m"(kARGBToVJ),  // %5
1143     "m"(kARGBToUJ),  // %6
1144     "m"(kAddUVJ128)  // %7
1145   : "memory", "cc", NACL_R14
1146     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1147   );
1148 }
1149 #endif  // HAS_ARGBTOUVJROW_SSSE3
1150 
1151 #ifdef HAS_ARGBTOUV444ROW_SSSE3
ARGBToUV444Row_SSSE3(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int width)1152 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1153                           int width) {
1154   asm volatile (
1155     "movdqa    %4,%%xmm3                       \n"
1156     "movdqa    %5,%%xmm4                       \n"
1157     "movdqa    %6,%%xmm5                       \n"
1158     "sub       %1,%2                           \n"
1159     LABELALIGN
1160   "1:                                          \n"
1161     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1162     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1163     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1164     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1165     "pmaddubsw %%xmm4,%%xmm0                   \n"
1166     "pmaddubsw %%xmm4,%%xmm1                   \n"
1167     "pmaddubsw %%xmm4,%%xmm2                   \n"
1168     "pmaddubsw %%xmm4,%%xmm6                   \n"
1169     "phaddw    %%xmm1,%%xmm0                   \n"
1170     "phaddw    %%xmm6,%%xmm2                   \n"
1171     "psraw     $0x8,%%xmm0                     \n"
1172     "psraw     $0x8,%%xmm2                     \n"
1173     "packsswb  %%xmm2,%%xmm0                   \n"
1174     "paddb     %%xmm5,%%xmm0                   \n"
1175     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1176     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1177     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1178     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1179     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1180     "pmaddubsw %%xmm3,%%xmm0                   \n"
1181     "pmaddubsw %%xmm3,%%xmm1                   \n"
1182     "pmaddubsw %%xmm3,%%xmm2                   \n"
1183     "pmaddubsw %%xmm3,%%xmm6                   \n"
1184     "phaddw    %%xmm1,%%xmm0                   \n"
1185     "phaddw    %%xmm6,%%xmm2                   \n"
1186     "psraw     $0x8,%%xmm0                     \n"
1187     "psraw     $0x8,%%xmm2                     \n"
1188     "packsswb  %%xmm2,%%xmm0                   \n"
1189     "paddb     %%xmm5,%%xmm0                   \n"
1190     "lea       " MEMLEA(0x40,0) ",%0           \n"
1191     MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
1192     "lea       " MEMLEA(0x10,1) ",%1           \n"
1193     "sub       $0x10,%3                        \n"
1194     "jg        1b                              \n"
1195   : "+r"(src_argb),        // %0
1196     "+r"(dst_u),           // %1
1197     "+r"(dst_v),           // %2
1198     "+rm"(width)           // %3
1199   : "m"(kARGBToV),  // %4
1200     "m"(kARGBToU),  // %5
1201     "m"(kAddUV128)  // %6
1202   : "memory", "cc", NACL_R14
1203     "xmm0", "xmm1", "xmm2", "xmm6"
1204   );
1205 }
1206 #endif  // HAS_ARGBTOUV444ROW_SSSE3
1207 
BGRAToYRow_SSSE3(const uint8 * src_bgra,uint8 * dst_y,int width)1208 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
1209   asm volatile (
1210     "movdqa    %4,%%xmm5                       \n"
1211     "movdqa    %3,%%xmm4                       \n"
1212     LABELALIGN
1213   "1:                                          \n"
1214     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1215     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1216     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1217     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1218     "pmaddubsw %%xmm4,%%xmm0                   \n"
1219     "pmaddubsw %%xmm4,%%xmm1                   \n"
1220     "pmaddubsw %%xmm4,%%xmm2                   \n"
1221     "pmaddubsw %%xmm4,%%xmm3                   \n"
1222     "lea       " MEMLEA(0x40,0) ",%0           \n"
1223     "phaddw    %%xmm1,%%xmm0                   \n"
1224     "phaddw    %%xmm3,%%xmm2                   \n"
1225     "psrlw     $0x7,%%xmm0                     \n"
1226     "psrlw     $0x7,%%xmm2                     \n"
1227     "packuswb  %%xmm2,%%xmm0                   \n"
1228     "paddb     %%xmm5,%%xmm0                   \n"
1229     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1230     "lea       " MEMLEA(0x10,1) ",%1           \n"
1231     "sub       $0x10,%2                        \n"
1232     "jg        1b                              \n"
1233   : "+r"(src_bgra),  // %0
1234     "+r"(dst_y),     // %1
1235     "+r"(width)        // %2
1236   : "m"(kBGRAToY),   // %3
1237     "m"(kAddY16)     // %4
1238   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1239   );
1240 }
1241 
BGRAToUVRow_SSSE3(const uint8 * src_bgra0,int src_stride_bgra,uint8 * dst_u,uint8 * dst_v,int width)1242 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1243                        uint8* dst_u, uint8* dst_v, int width) {
1244   asm volatile (
1245     "movdqa    %5,%%xmm3                       \n"
1246     "movdqa    %6,%%xmm4                       \n"
1247     "movdqa    %7,%%xmm5                       \n"
1248     "sub       %1,%2                           \n"
1249     LABELALIGN
1250   "1:                                          \n"
1251     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1252     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1253     "pavgb     %%xmm7,%%xmm0                   \n"
1254     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1255     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1256     "pavgb     %%xmm7,%%xmm1                   \n"
1257     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1258     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1259     "pavgb     %%xmm7,%%xmm2                   \n"
1260     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1261     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1262     "pavgb     %%xmm7,%%xmm6                   \n"
1263 
1264     "lea       " MEMLEA(0x40,0) ",%0           \n"
1265     "movdqa    %%xmm0,%%xmm7                   \n"
1266     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1267     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1268     "pavgb     %%xmm7,%%xmm0                   \n"
1269     "movdqa    %%xmm2,%%xmm7                   \n"
1270     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1271     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1272     "pavgb     %%xmm7,%%xmm2                   \n"
1273     "movdqa    %%xmm0,%%xmm1                   \n"
1274     "movdqa    %%xmm2,%%xmm6                   \n"
1275     "pmaddubsw %%xmm4,%%xmm0                   \n"
1276     "pmaddubsw %%xmm4,%%xmm2                   \n"
1277     "pmaddubsw %%xmm3,%%xmm1                   \n"
1278     "pmaddubsw %%xmm3,%%xmm6                   \n"
1279     "phaddw    %%xmm2,%%xmm0                   \n"
1280     "phaddw    %%xmm6,%%xmm1                   \n"
1281     "psraw     $0x8,%%xmm0                     \n"
1282     "psraw     $0x8,%%xmm1                     \n"
1283     "packsswb  %%xmm1,%%xmm0                   \n"
1284     "paddb     %%xmm5,%%xmm0                   \n"
1285     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1286     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1287     "lea       " MEMLEA(0x8,1) ",%1            \n"
1288     "sub       $0x10,%3                        \n"
1289     "jg        1b                              \n"
1290   : "+r"(src_bgra0),       // %0
1291     "+r"(dst_u),           // %1
1292     "+r"(dst_v),           // %2
1293     "+rm"(width)           // %3
1294   : "r"((intptr_t)(src_stride_bgra)), // %4
1295     "m"(kBGRAToV),  // %5
1296     "m"(kBGRAToU),  // %6
1297     "m"(kAddUV128)  // %7
1298   : "memory", "cc", NACL_R14
1299     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1300   );
1301 }
1302 
ABGRToYRow_SSSE3(const uint8 * src_abgr,uint8 * dst_y,int width)1303 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
1304   asm volatile (
1305     "movdqa    %4,%%xmm5                       \n"
1306     "movdqa    %3,%%xmm4                       \n"
1307     LABELALIGN
1308   "1:                                          \n"
1309     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1310     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1311     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1312     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1313     "pmaddubsw %%xmm4,%%xmm0                   \n"
1314     "pmaddubsw %%xmm4,%%xmm1                   \n"
1315     "pmaddubsw %%xmm4,%%xmm2                   \n"
1316     "pmaddubsw %%xmm4,%%xmm3                   \n"
1317     "lea       " MEMLEA(0x40,0) ",%0           \n"
1318     "phaddw    %%xmm1,%%xmm0                   \n"
1319     "phaddw    %%xmm3,%%xmm2                   \n"
1320     "psrlw     $0x7,%%xmm0                     \n"
1321     "psrlw     $0x7,%%xmm2                     \n"
1322     "packuswb  %%xmm2,%%xmm0                   \n"
1323     "paddb     %%xmm5,%%xmm0                   \n"
1324     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1325     "lea       " MEMLEA(0x10,1) ",%1           \n"
1326     "sub       $0x10,%2                        \n"
1327     "jg        1b                              \n"
1328   : "+r"(src_abgr),  // %0
1329     "+r"(dst_y),     // %1
1330     "+r"(width)        // %2
1331   : "m"(kABGRToY),   // %3
1332     "m"(kAddY16)     // %4
1333   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1334   );
1335 }
1336 
RGBAToYRow_SSSE3(const uint8 * src_rgba,uint8 * dst_y,int width)1337 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
1338   asm volatile (
1339     "movdqa    %4,%%xmm5                       \n"
1340     "movdqa    %3,%%xmm4                       \n"
1341     LABELALIGN
1342   "1:                                          \n"
1343     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1344     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1345     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1346     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1347     "pmaddubsw %%xmm4,%%xmm0                   \n"
1348     "pmaddubsw %%xmm4,%%xmm1                   \n"
1349     "pmaddubsw %%xmm4,%%xmm2                   \n"
1350     "pmaddubsw %%xmm4,%%xmm3                   \n"
1351     "lea       " MEMLEA(0x40,0) ",%0           \n"
1352     "phaddw    %%xmm1,%%xmm0                   \n"
1353     "phaddw    %%xmm3,%%xmm2                   \n"
1354     "psrlw     $0x7,%%xmm0                     \n"
1355     "psrlw     $0x7,%%xmm2                     \n"
1356     "packuswb  %%xmm2,%%xmm0                   \n"
1357     "paddb     %%xmm5,%%xmm0                   \n"
1358     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1359     "lea       " MEMLEA(0x10,1) ",%1           \n"
1360     "sub       $0x10,%2                        \n"
1361     "jg        1b                              \n"
1362   : "+r"(src_rgba),  // %0
1363     "+r"(dst_y),     // %1
1364     "+r"(width)        // %2
1365   : "m"(kRGBAToY),   // %3
1366     "m"(kAddY16)     // %4
1367   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1368   );
1369 }
1370 
ABGRToUVRow_SSSE3(const uint8 * src_abgr0,int src_stride_abgr,uint8 * dst_u,uint8 * dst_v,int width)1371 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1372                        uint8* dst_u, uint8* dst_v, int width) {
1373   asm volatile (
1374     "movdqa    %5,%%xmm3                       \n"
1375     "movdqa    %6,%%xmm4                       \n"
1376     "movdqa    %7,%%xmm5                       \n"
1377     "sub       %1,%2                           \n"
1378     LABELALIGN
1379   "1:                                          \n"
1380     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1381     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1382     "pavgb     %%xmm7,%%xmm0                   \n"
1383     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1384     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1385     "pavgb     %%xmm7,%%xmm1                   \n"
1386     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1387     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1388     "pavgb     %%xmm7,%%xmm2                   \n"
1389     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1390     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1391     "pavgb     %%xmm7,%%xmm6                   \n"
1392 
1393     "lea       " MEMLEA(0x40,0) ",%0           \n"
1394     "movdqa    %%xmm0,%%xmm7                   \n"
1395     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1396     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1397     "pavgb     %%xmm7,%%xmm0                   \n"
1398     "movdqa    %%xmm2,%%xmm7                   \n"
1399     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1400     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1401     "pavgb     %%xmm7,%%xmm2                   \n"
1402     "movdqa    %%xmm0,%%xmm1                   \n"
1403     "movdqa    %%xmm2,%%xmm6                   \n"
1404     "pmaddubsw %%xmm4,%%xmm0                   \n"
1405     "pmaddubsw %%xmm4,%%xmm2                   \n"
1406     "pmaddubsw %%xmm3,%%xmm1                   \n"
1407     "pmaddubsw %%xmm3,%%xmm6                   \n"
1408     "phaddw    %%xmm2,%%xmm0                   \n"
1409     "phaddw    %%xmm6,%%xmm1                   \n"
1410     "psraw     $0x8,%%xmm0                     \n"
1411     "psraw     $0x8,%%xmm1                     \n"
1412     "packsswb  %%xmm1,%%xmm0                   \n"
1413     "paddb     %%xmm5,%%xmm0                   \n"
1414     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1415     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1416     "lea       " MEMLEA(0x8,1) ",%1            \n"
1417     "sub       $0x10,%3                        \n"
1418     "jg        1b                              \n"
1419   : "+r"(src_abgr0),       // %0
1420     "+r"(dst_u),           // %1
1421     "+r"(dst_v),           // %2
1422     "+rm"(width)           // %3
1423   : "r"((intptr_t)(src_stride_abgr)), // %4
1424     "m"(kABGRToV),  // %5
1425     "m"(kABGRToU),  // %6
1426     "m"(kAddUV128)  // %7
1427   : "memory", "cc", NACL_R14
1428     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1429   );
1430 }
1431 
RGBAToUVRow_SSSE3(const uint8 * src_rgba0,int src_stride_rgba,uint8 * dst_u,uint8 * dst_v,int width)1432 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1433                        uint8* dst_u, uint8* dst_v, int width) {
1434   asm volatile (
1435     "movdqa    %5,%%xmm3                       \n"
1436     "movdqa    %6,%%xmm4                       \n"
1437     "movdqa    %7,%%xmm5                       \n"
1438     "sub       %1,%2                           \n"
1439     LABELALIGN
1440   "1:                                          \n"
1441     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1442     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1443     "pavgb     %%xmm7,%%xmm0                   \n"
1444     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1445     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1446     "pavgb     %%xmm7,%%xmm1                   \n"
1447     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1448     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1449     "pavgb     %%xmm7,%%xmm2                   \n"
1450     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1451     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1452     "pavgb     %%xmm7,%%xmm6                   \n"
1453 
1454     "lea       " MEMLEA(0x40,0) ",%0           \n"
1455     "movdqa    %%xmm0,%%xmm7                   \n"
1456     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1457     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1458     "pavgb     %%xmm7,%%xmm0                   \n"
1459     "movdqa    %%xmm2,%%xmm7                   \n"
1460     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1461     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1462     "pavgb     %%xmm7,%%xmm2                   \n"
1463     "movdqa    %%xmm0,%%xmm1                   \n"
1464     "movdqa    %%xmm2,%%xmm6                   \n"
1465     "pmaddubsw %%xmm4,%%xmm0                   \n"
1466     "pmaddubsw %%xmm4,%%xmm2                   \n"
1467     "pmaddubsw %%xmm3,%%xmm1                   \n"
1468     "pmaddubsw %%xmm3,%%xmm6                   \n"
1469     "phaddw    %%xmm2,%%xmm0                   \n"
1470     "phaddw    %%xmm6,%%xmm1                   \n"
1471     "psraw     $0x8,%%xmm0                     \n"
1472     "psraw     $0x8,%%xmm1                     \n"
1473     "packsswb  %%xmm1,%%xmm0                   \n"
1474     "paddb     %%xmm5,%%xmm0                   \n"
1475     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1476     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1477     "lea       " MEMLEA(0x8,1) ",%1            \n"
1478     "sub       $0x10,%3                        \n"
1479     "jg        1b                              \n"
1480   : "+r"(src_rgba0),       // %0
1481     "+r"(dst_u),           // %1
1482     "+r"(dst_v),           // %2
1483     "+rm"(width)           // %3
1484   : "r"((intptr_t)(src_stride_rgba)), // %4
1485     "m"(kRGBAToV),  // %5
1486     "m"(kRGBAToU),  // %6
1487     "m"(kAddUV128)  // %7
1488   : "memory", "cc", NACL_R14
1489     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1490   );
1491 }
1492 
1493 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1494 
1495 // Read 8 UV from 444
1496 #define READYUV444                                                             \
1497     "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
1498     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
1499     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
1500     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
1501     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
1502     "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
1503     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1504 
1505 // Read 4 UV from 422, upsample to 8 UV
1506 #define READYUV422                                                             \
1507     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
1508     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
1509     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
1510     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
1511     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
1512     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
1513     "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
1514     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1515 
1516 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
1517 #define READYUVA422                                                            \
1518     "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
1519     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
1520     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
1521     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
1522     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
1523     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
1524     "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
1525     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
1526     "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
1527     "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
1528 
1529 // Read 2 UV from 411, upsample to 8 UV.
1530 // reading 4 bytes is an msan violation.
1531 //    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
1532 //    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
1533 // pinsrw fails with drmemory
1534 //  __asm pinsrw     xmm0, [esi], 0        /* U */
1535 //  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
1536 #define READYUV411_TEMP                                                        \
1537     "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
1538     "movd       %[temp],%%xmm0                                  \n"            \
1539     MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
1540     "movd       %[temp],%%xmm1                                  \n"            \
1541     "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
1542     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
1543     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
1544     "punpckldq  %%xmm0,%%xmm0                                   \n"            \
1545     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
1546     "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
1547     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1548 
1549 // Read 4 UV from NV12, upsample to 8 UV
1550 #define READNV12                                                               \
1551     "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
1552     "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
1553     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
1554     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
1555     "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
1556     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1557 
1558 // Read 4 VU from NV21, upsample to 8 UV
1559 #define READNV21                                                               \
1560     "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
1561     "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
1562     "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
1563     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
1564     "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
1565     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1566 
1567 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
1568 #define READYUY2                                                               \
1569     "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
1570     "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
1571     "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
1572     "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
1573     "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
1574 
1575 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
1576 #define READUYVY                                                               \
1577     "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
1578     "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
1579     "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
1580     "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
1581     "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
1582 
1583 #if defined(__x86_64__)
1584 #define YUVTORGB_SETUP(yuvconstants)                                           \
1585     "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
1586     "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
1587     "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
1588     "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
1589     "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
1590     "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
1591     "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
1592 // Convert 8 pixels: 8 UV and 8 Y
1593 #define YUVTORGB(yuvconstants)                                                 \
1594     "movdqa     %%xmm0,%%xmm1                                   \n"            \
1595     "movdqa     %%xmm0,%%xmm2                                   \n"            \
1596     "movdqa     %%xmm0,%%xmm3                                   \n"            \
1597     "movdqa     %%xmm11,%%xmm0                                  \n"            \
1598     "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
1599     "psubw      %%xmm1,%%xmm0                                   \n"            \
1600     "movdqa     %%xmm12,%%xmm1                                  \n"            \
1601     "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
1602     "psubw      %%xmm2,%%xmm1                                   \n"            \
1603     "movdqa     %%xmm13,%%xmm2                                  \n"            \
1604     "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
1605     "psubw      %%xmm3,%%xmm2                                   \n"            \
1606     "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
1607     "paddsw     %%xmm4,%%xmm0                                   \n"            \
1608     "paddsw     %%xmm4,%%xmm1                                   \n"            \
1609     "paddsw     %%xmm4,%%xmm2                                   \n"            \
1610     "psraw      $0x6,%%xmm0                                     \n"            \
1611     "psraw      $0x6,%%xmm1                                     \n"            \
1612     "psraw      $0x6,%%xmm2                                     \n"            \
1613     "packuswb   %%xmm0,%%xmm0                                   \n"            \
1614     "packuswb   %%xmm1,%%xmm1                                   \n"            \
1615     "packuswb   %%xmm2,%%xmm2                                   \n"
1616 #define YUVTORGB_REGS \
1617     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
1618 
1619 #else
1620 #define YUVTORGB_SETUP(yuvconstants)
1621 // Convert 8 pixels: 8 UV and 8 Y
1622 #define YUVTORGB(yuvconstants)                                                 \
1623     "movdqa     %%xmm0,%%xmm1                                   \n"            \
1624     "movdqa     %%xmm0,%%xmm2                                   \n"            \
1625     "movdqa     %%xmm0,%%xmm3                                   \n"            \
1626     "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
1627     "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
1628     "psubw      %%xmm1,%%xmm0                                   \n"            \
1629     "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
1630     "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
1631     "psubw      %%xmm2,%%xmm1                                   \n"            \
1632     "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
1633     "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
1634     "psubw      %%xmm3,%%xmm2                                   \n"            \
1635     "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
1636     "paddsw     %%xmm4,%%xmm0                                   \n"            \
1637     "paddsw     %%xmm4,%%xmm1                                   \n"            \
1638     "paddsw     %%xmm4,%%xmm2                                   \n"            \
1639     "psraw      $0x6,%%xmm0                                     \n"            \
1640     "psraw      $0x6,%%xmm1                                     \n"            \
1641     "psraw      $0x6,%%xmm2                                     \n"            \
1642     "packuswb   %%xmm0,%%xmm0                                   \n"            \
1643     "packuswb   %%xmm1,%%xmm1                                   \n"            \
1644     "packuswb   %%xmm2,%%xmm2                                   \n"
1645 #define YUVTORGB_REGS
1646 #endif
1647 
1648 // Store 8 ARGB values.
1649 #define STOREARGB                                                              \
1650     "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
1651     "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
1652     "movdqa     %%xmm0,%%xmm1                                    \n"           \
1653     "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
1654     "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
1655     "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
1656     "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
1657     "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
1658 
1659 // Store 8 RGBA values.
1660 #define STORERGBA                                                              \
1661     "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
1662     "punpcklbw %%xmm2,%%xmm1                                     \n"           \
1663     "punpcklbw %%xmm0,%%xmm5                                     \n"           \
1664     "movdqa    %%xmm5,%%xmm0                                     \n"           \
1665     "punpcklwd %%xmm1,%%xmm5                                     \n"           \
1666     "punpckhwd %%xmm1,%%xmm0                                     \n"           \
1667     "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
1668     "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
1669     "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
1670 
I444ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1671 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1672                                 const uint8* u_buf,
1673                                 const uint8* v_buf,
1674                                 uint8* dst_argb,
1675                                 const struct YuvConstants* yuvconstants,
1676                                 int width) {
1677   asm volatile (
1678     YUVTORGB_SETUP(yuvconstants)
1679     "sub       %[u_buf],%[v_buf]               \n"
1680     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1681     LABELALIGN
1682   "1:                                          \n"
1683     READYUV444
1684     YUVTORGB(yuvconstants)
1685     STOREARGB
1686     "sub       $0x8,%[width]                   \n"
1687     "jg        1b                              \n"
1688   : [y_buf]"+r"(y_buf),    // %[y_buf]
1689     [u_buf]"+r"(u_buf),    // %[u_buf]
1690     [v_buf]"+r"(v_buf),    // %[v_buf]
1691     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1692     [width]"+rm"(width)    // %[width]
1693   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1694   : "memory", "cc", NACL_R14 YUVTORGB_REGS
1695     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1696   );
1697 }
1698 
I422ToRGB24Row_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_rgb24,const struct YuvConstants * yuvconstants,int width)1699 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1700                                  const uint8* u_buf,
1701                                  const uint8* v_buf,
1702                                  uint8* dst_rgb24,
1703                                  const struct YuvConstants* yuvconstants,
1704                                  int width) {
1705   asm volatile (
1706     YUVTORGB_SETUP(yuvconstants)
1707     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1708     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
1709     "sub       %[u_buf],%[v_buf]               \n"
1710     LABELALIGN
1711   "1:                                          \n"
1712     READYUV422
1713     YUVTORGB(yuvconstants)
1714     "punpcklbw %%xmm1,%%xmm0                   \n"
1715     "punpcklbw %%xmm2,%%xmm2                   \n"
1716     "movdqa    %%xmm0,%%xmm1                   \n"
1717     "punpcklwd %%xmm2,%%xmm0                   \n"
1718     "punpckhwd %%xmm2,%%xmm1                   \n"
1719     "pshufb    %%xmm5,%%xmm0                   \n"
1720     "pshufb    %%xmm6,%%xmm1                   \n"
1721     "palignr   $0xc,%%xmm0,%%xmm1              \n"
1722     "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1723     "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1724     "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1725     "subl      $0x8,%[width]                   \n"
1726     "jg        1b                              \n"
1727   : [y_buf]"+r"(y_buf),    // %[y_buf]
1728     [u_buf]"+r"(u_buf),    // %[u_buf]
1729     [v_buf]"+r"(v_buf),    // %[v_buf]
1730     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
1731 #if defined(__i386__) && defined(__pic__)
1732     [width]"+m"(width)     // %[width]
1733 #else
1734     [width]"+rm"(width)    // %[width]
1735 #endif
1736   : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
1737     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1738     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1739   : "memory", "cc", NACL_R14 YUVTORGB_REGS
1740     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1741   );
1742 }
1743 
I422ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1744 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1745                                 const uint8* u_buf,
1746                                 const uint8* v_buf,
1747                                 uint8* dst_argb,
1748                                 const struct YuvConstants* yuvconstants,
1749                                 int width) {
1750   asm volatile (
1751     YUVTORGB_SETUP(yuvconstants)
1752     "sub       %[u_buf],%[v_buf]               \n"
1753     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1754     LABELALIGN
1755   "1:                                          \n"
1756     READYUV422
1757     YUVTORGB(yuvconstants)
1758     STOREARGB
1759     "sub       $0x8,%[width]                   \n"
1760     "jg        1b                              \n"
1761   : [y_buf]"+r"(y_buf),    // %[y_buf]
1762     [u_buf]"+r"(u_buf),    // %[u_buf]
1763     [v_buf]"+r"(v_buf),    // %[v_buf]
1764     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1765     [width]"+rm"(width)    // %[width]
1766   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1767   : "memory", "cc", NACL_R14 YUVTORGB_REGS
1768     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1769   );
1770 }
1771 
1772 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
I422AlphaToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,const uint8 * a_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1773 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
1774                                      const uint8* u_buf,
1775                                      const uint8* v_buf,
1776                                      const uint8* a_buf,
1777                                      uint8* dst_argb,
1778                                      const struct YuvConstants* yuvconstants,
1779                                      int width) {
1780   asm volatile (
1781     YUVTORGB_SETUP(yuvconstants)
1782     "sub       %[u_buf],%[v_buf]               \n"
1783     LABELALIGN
1784   "1:                                          \n"
1785     READYUVA422
1786     YUVTORGB(yuvconstants)
1787     STOREARGB
1788     "subl      $0x8,%[width]                   \n"
1789     "jg        1b                              \n"
1790   : [y_buf]"+r"(y_buf),    // %[y_buf]
1791     [u_buf]"+r"(u_buf),    // %[u_buf]
1792     [v_buf]"+r"(v_buf),    // %[v_buf]
1793     [a_buf]"+r"(a_buf),    // %[a_buf]
1794     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1795 #if defined(__i386__) && defined(__pic__)
1796     [width]"+m"(width)     // %[width]
1797 #else
1798     [width]"+rm"(width)    // %[width]
1799 #endif
1800   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1801   : "memory", "cc", NACL_R14 YUVTORGB_REGS
1802     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1803   );
1804 }
1805 #endif  // HAS_I422ALPHATOARGBROW_SSSE3
1806 
1807 #ifdef HAS_I411TOARGBROW_SSSE3
I411ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1808 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1809                                 const uint8* u_buf,
1810                                 const uint8* v_buf,
1811                                 uint8* dst_argb,
1812                                 const struct YuvConstants* yuvconstants,
1813                                 int width) {
1814   int temp;
1815   asm volatile (
1816     YUVTORGB_SETUP(yuvconstants)
1817     "sub       %[u_buf],%[v_buf]               \n"
1818     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1819     LABELALIGN
1820   "1:                                          \n"
1821     READYUV411_TEMP
1822     YUVTORGB(yuvconstants)
1823     STOREARGB
1824     "subl      $0x8,%[width]                   \n"
1825     "jg        1b                              \n"
1826   : [y_buf]"+r"(y_buf),        // %[y_buf]
1827     [u_buf]"+r"(u_buf),        // %[u_buf]
1828     [v_buf]"+r"(v_buf),        // %[v_buf]
1829     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1830     [temp]"=&r"(temp),         // %[temp]
1831 #if defined(__i386__) && defined(__pic__)
1832     [width]"+m"(width)         // %[width]
1833 #else
1834     [width]"+rm"(width)        // %[width]
1835 #endif
1836   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1837   : "memory", "cc", NACL_R14 YUVTORGB_REGS
1838     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1839   );
1840 }
1841 #endif
1842 
NV12ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * uv_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1843 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1844                                 const uint8* uv_buf,
1845                                 uint8* dst_argb,
1846                                 const struct YuvConstants* yuvconstants,
1847                                 int width) {
1848   asm volatile (
1849     YUVTORGB_SETUP(yuvconstants)
1850     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1851     LABELALIGN
1852   "1:                                          \n"
1853     READNV12
1854     YUVTORGB(yuvconstants)
1855     STOREARGB
1856     "sub       $0x8,%[width]                   \n"
1857     "jg        1b                              \n"
1858   : [y_buf]"+r"(y_buf),    // %[y_buf]
1859     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1860     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1861     [width]"+rm"(width)    // %[width]
1862   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1863     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1864       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1865   );
1866 }
1867 
NV21ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * vu_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1868 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1869                                 const uint8* vu_buf,
1870                                 uint8* dst_argb,
1871                                 const struct YuvConstants* yuvconstants,
1872                                 int width) {
1873   asm volatile (
1874     YUVTORGB_SETUP(yuvconstants)
1875     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1876     LABELALIGN
1877   "1:                                          \n"
1878     READNV21
1879     YUVTORGB(yuvconstants)
1880     STOREARGB
1881     "sub       $0x8,%[width]                   \n"
1882     "jg        1b                              \n"
1883   : [y_buf]"+r"(y_buf),    // %[y_buf]
1884     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
1885     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1886     [width]"+rm"(width)    // %[width]
1887   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1888     [kShuffleNV21]"m"(kShuffleNV21)
1889     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1890       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1891   );
1892 }
1893 
YUY2ToARGBRow_SSSE3(const uint8 * yuy2_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1894 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
1895                                 uint8* dst_argb,
1896                                 const struct YuvConstants* yuvconstants,
1897                                 int width) {
1898   asm volatile (
1899     YUVTORGB_SETUP(yuvconstants)
1900     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1901     LABELALIGN
1902   "1:                                          \n"
1903     READYUY2
1904     YUVTORGB(yuvconstants)
1905     STOREARGB
1906     "sub       $0x8,%[width]                   \n"
1907     "jg        1b                              \n"
1908   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
1909     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1910     [width]"+rm"(width)    // %[width]
1911   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1912     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
1913     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
1914     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1915       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1916   );
1917 }
1918 
UYVYToARGBRow_SSSE3(const uint8 * uyvy_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1919 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
1920                                 uint8* dst_argb,
1921                                 const struct YuvConstants* yuvconstants,
1922                                 int width) {
1923   asm volatile (
1924     YUVTORGB_SETUP(yuvconstants)
1925     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1926     LABELALIGN
1927   "1:                                          \n"
1928     READUYVY
1929     YUVTORGB(yuvconstants)
1930     STOREARGB
1931     "sub       $0x8,%[width]                   \n"
1932     "jg        1b                              \n"
1933   : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
1934     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1935     [width]"+rm"(width)    // %[width]
1936   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1937     [kShuffleUYVYY]"m"(kShuffleUYVYY),
1938     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
1939     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1940       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1941   );
1942 }
1943 
I422ToRGBARow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_rgba,const struct YuvConstants * yuvconstants,int width)1944 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1945                                 const uint8* u_buf,
1946                                 const uint8* v_buf,
1947                                 uint8* dst_rgba,
1948                                 const struct YuvConstants* yuvconstants,
1949                                 int width) {
1950   asm volatile (
1951     YUVTORGB_SETUP(yuvconstants)
1952     "sub       %[u_buf],%[v_buf]               \n"
1953     "pcmpeqb   %%xmm5,%%xmm5                   \n"
1954     LABELALIGN
1955   "1:                                          \n"
1956     READYUV422
1957     YUVTORGB(yuvconstants)
1958     STORERGBA
1959     "sub       $0x8,%[width]                   \n"
1960     "jg        1b                              \n"
1961   : [y_buf]"+r"(y_buf),    // %[y_buf]
1962     [u_buf]"+r"(u_buf),    // %[u_buf]
1963     [v_buf]"+r"(v_buf),    // %[v_buf]
1964     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
1965     [width]"+rm"(width)    // %[width]
1966   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1967   : "memory", "cc", NACL_R14 YUVTORGB_REGS
1968     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1969   );
1970 }
1971 
1972 #endif  // HAS_I422TOARGBROW_SSSE3
1973 
1974 // Read 16 UV from 444
1975 #define READYUV444_AVX2                                                        \
1976     "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
1977     MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
1978     "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
1979     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1980     "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
1981     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1982     "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1983     "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1984     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1985     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1986 
1987 // Read 8 UV from 422, upsample to 16 UV.
1988 #define READYUV422_AVX2                                                        \
1989     "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
1990     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
1991     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
1992     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1993     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1994     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
1995     "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1996     "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1997     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1998     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1999 
2000 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
2001 #define READYUVA422_AVX2                                                       \
2002     "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
2003     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
2004     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
2005     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
2006     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2007     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
2008     "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
2009     "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
2010     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
2011     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
2012     "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
2013     "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
2014     "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
2015 
2016 // Read 4 UV from 411, upsample to 16 UV.
2017 #define READYUV411_AVX2                                                        \
2018     "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
2019     MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
2020     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
2021     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
2022     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
2023     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2024     "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
2025     "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
2026     "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
2027     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
2028     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
2029 
2030 // Read 8 UV from NV12, upsample to 16 UV.
2031 #define READNV12_AVX2                                                          \
2032     "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
2033     "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
2034     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2035     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
2036     "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
2037     "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
2038     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
2039     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
2040 
2041 // Read 8 VU from NV21, upsample to 16 UV.
2042 #define READNV21_AVX2                                                          \
2043     "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
2044     "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
2045     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2046     "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
2047     "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
2048     "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
2049     "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
2050     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
2051 
2052 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2053 #define READYUY2_AVX2                                                          \
2054     "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
2055     "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
2056     "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
2057     "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
2058     "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
2059 
2060 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2061 #define READUYVY_AVX2                                                          \
2062     "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
2063     "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
2064     "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
2065     "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
2066     "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
2067 
2068 #if defined(__x86_64__)
2069 #define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
2070     "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
2071     "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
2072     "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
2073     "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
2074     "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
2075     "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
2076     "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
2077 #define YUVTORGB_AVX2(yuvconstants)                                            \
2078     "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
2079     "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
2080     "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
2081     "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
2082     "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
2083     "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
2084     "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
2085     "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
2086     "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
2087     "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
2088     "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
2089     "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
2090     "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
2091     "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
2092     "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
2093     "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
2094 #define YUVTORGB_REGS_AVX2 \
2095     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2096 #else  // Convert 16 pixels: 16 UV and 16 Y.
2097 #define YUVTORGB_SETUP_AVX2(yuvconstants)
2098 #define YUVTORGB_AVX2(yuvconstants)                                            \
2099     "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
2100     "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
2101     "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
2102     "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
2103     "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
2104     "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
2105     "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
2106     "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
2107     "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
2108     "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
2109     "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
2110     "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
2111     "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
2112     "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
2113     "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
2114     "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
2115     "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
2116     "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
2117     "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
2118 #define YUVTORGB_REGS_AVX2
2119 #endif
2120 
2121 // Store 16 ARGB values.
2122 #define STOREARGB_AVX2                                                         \
2123     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
2124     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2125     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
2126     "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
2127     "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
2128     "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
2129     "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
2130     "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
2131     "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
2132 
2133 #ifdef HAS_I444TOARGBROW_AVX2
2134 // 16 pixels
2135 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
I444ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2136 void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
2137                                const uint8* u_buf,
2138                                const uint8* v_buf,
2139                                uint8* dst_argb,
2140                                const struct YuvConstants* yuvconstants,
2141                                int width) {
2142   asm volatile (
2143     YUVTORGB_SETUP_AVX2(yuvconstants)
2144     "sub       %[u_buf],%[v_buf]               \n"
2145     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2146     LABELALIGN
2147   "1:                                          \n"
2148     READYUV444_AVX2
2149     YUVTORGB_AVX2(yuvconstants)
2150     STOREARGB_AVX2
2151     "sub       $0x10,%[width]                  \n"
2152     "jg        1b                              \n"
2153     "vzeroupper                                \n"
2154   : [y_buf]"+r"(y_buf),    // %[y_buf]
2155     [u_buf]"+r"(u_buf),    // %[u_buf]
2156     [v_buf]"+r"(v_buf),    // %[v_buf]
2157     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2158     [width]"+rm"(width)    // %[width]
2159   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2160   : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2161     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2162   );
2163 }
2164 #endif  // HAS_I444TOARGBROW_AVX2
2165 
2166 #ifdef HAS_I411TOARGBROW_AVX2
2167 // 16 pixels
2168 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
I411ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2169 void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
2170                                const uint8* u_buf,
2171                                const uint8* v_buf,
2172                                uint8* dst_argb,
2173                                const struct YuvConstants* yuvconstants,
2174                                int width) {
2175   asm volatile (
2176     YUVTORGB_SETUP_AVX2(yuvconstants)
2177     "sub       %[u_buf],%[v_buf]               \n"
2178     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2179     LABELALIGN
2180   "1:                                          \n"
2181     READYUV411_AVX2
2182     YUVTORGB_AVX2(yuvconstants)
2183     STOREARGB_AVX2
2184     "sub       $0x10,%[width]                  \n"
2185     "jg        1b                              \n"
2186     "vzeroupper                                \n"
2187   : [y_buf]"+r"(y_buf),    // %[y_buf]
2188     [u_buf]"+r"(u_buf),    // %[u_buf]
2189     [v_buf]"+r"(v_buf),    // %[v_buf]
2190     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2191     [width]"+rm"(width)    // %[width]
2192   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2193   : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2194     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2195   );
2196 }
2197 #endif  // HAS_I411TOARGBROW_AVX2
2198 
2199 #if defined(HAS_I422TOARGBROW_AVX2)
2200 // 16 pixels
2201 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
I422ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2202 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2203                                const uint8* u_buf,
2204                                const uint8* v_buf,
2205                                uint8* dst_argb,
2206                                const struct YuvConstants* yuvconstants,
2207                                int width) {
2208   asm volatile (
2209     YUVTORGB_SETUP_AVX2(yuvconstants)
2210     "sub       %[u_buf],%[v_buf]               \n"
2211     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2212     LABELALIGN
2213   "1:                                          \n"
2214     READYUV422_AVX2
2215     YUVTORGB_AVX2(yuvconstants)
2216     STOREARGB_AVX2
2217     "sub       $0x10,%[width]                  \n"
2218     "jg        1b                              \n"
2219     "vzeroupper                                \n"
2220   : [y_buf]"+r"(y_buf),    // %[y_buf]
2221     [u_buf]"+r"(u_buf),    // %[u_buf]
2222     [v_buf]"+r"(v_buf),    // %[v_buf]
2223     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2224     [width]"+rm"(width)    // %[width]
2225   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2226   : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2227     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2228   );
2229 }
2230 #endif  // HAS_I422TOARGBROW_AVX2
2231 
2232 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2233 // 16 pixels
2234 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
I422AlphaToARGBRow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,const uint8 * a_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2235 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2236                                const uint8* u_buf,
2237                                const uint8* v_buf,
2238                                const uint8* a_buf,
2239                                uint8* dst_argb,
2240                                const struct YuvConstants* yuvconstants,
2241                                int width) {
2242   asm volatile (
2243     YUVTORGB_SETUP_AVX2(yuvconstants)
2244     "sub       %[u_buf],%[v_buf]               \n"
2245     LABELALIGN
2246   "1:                                          \n"
2247     READYUVA422_AVX2
2248     YUVTORGB_AVX2(yuvconstants)
2249     STOREARGB_AVX2
2250     "subl      $0x10,%[width]                  \n"
2251     "jg        1b                              \n"
2252     "vzeroupper                                \n"
2253   : [y_buf]"+r"(y_buf),    // %[y_buf]
2254     [u_buf]"+r"(u_buf),    // %[u_buf]
2255     [v_buf]"+r"(v_buf),    // %[v_buf]
2256     [a_buf]"+r"(a_buf),    // %[a_buf]
2257     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2258 #if defined(__i386__) && defined(__pic__)
2259     [width]"+m"(width)     // %[width]
2260 #else
2261     [width]"+rm"(width)    // %[width]
2262 #endif
2263   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2264   : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2265     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2266   );
2267 }
2268 #endif  // HAS_I422ALPHATOARGBROW_AVX2
2269 
2270 #if defined(HAS_I422TORGBAROW_AVX2)
2271 // 16 pixels
2272 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
I422ToRGBARow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2273 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2274                                const uint8* u_buf,
2275                                const uint8* v_buf,
2276                                uint8* dst_argb,
2277                                const struct YuvConstants* yuvconstants,
2278                                int width) {
2279   asm volatile (
2280     YUVTORGB_SETUP_AVX2(yuvconstants)
2281     "sub       %[u_buf],%[v_buf]               \n"
2282     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2283     LABELALIGN
2284   "1:                                          \n"
2285     READYUV422_AVX2
2286     YUVTORGB_AVX2(yuvconstants)
2287 
2288     // Step 3: Weave into RGBA
2289     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
2290     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2291     "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
2292     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2293     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
2294     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
2295     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2296     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2297     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2298     "sub       $0x10,%[width]                  \n"
2299     "jg        1b                              \n"
2300     "vzeroupper                                \n"
2301   : [y_buf]"+r"(y_buf),    // %[y_buf]
2302     [u_buf]"+r"(u_buf),    // %[u_buf]
2303     [v_buf]"+r"(v_buf),    // %[v_buf]
2304     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2305     [width]"+rm"(width)    // %[width]
2306   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2307   : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2308     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2309   );
2310 }
2311 #endif  // HAS_I422TORGBAROW_AVX2
2312 
2313 #if defined(HAS_NV12TOARGBROW_AVX2)
2314 // 16 pixels.
2315 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV12ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * uv_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2316 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2317                                const uint8* uv_buf,
2318                                uint8* dst_argb,
2319                                const struct YuvConstants* yuvconstants,
2320                                int width) {
2321   asm volatile (
2322     YUVTORGB_SETUP_AVX2(yuvconstants)
2323     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2324     LABELALIGN
2325   "1:                                          \n"
2326     READNV12_AVX2
2327     YUVTORGB_AVX2(yuvconstants)
2328     STOREARGB_AVX2
2329     "sub       $0x10,%[width]                  \n"
2330     "jg        1b                              \n"
2331     "vzeroupper                                \n"
2332   : [y_buf]"+r"(y_buf),    // %[y_buf]
2333     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
2334     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2335     [width]"+rm"(width)    // %[width]
2336   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2337     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2338     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2339   );
2340 }
2341 #endif  // HAS_NV12TOARGBROW_AVX2
2342 
2343 #if defined(HAS_NV21TOARGBROW_AVX2)
2344 // 16 pixels.
2345 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV21ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * vu_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2346 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
2347                                const uint8* vu_buf,
2348                                uint8* dst_argb,
2349                                const struct YuvConstants* yuvconstants,
2350                                int width) {
2351   asm volatile (
2352     YUVTORGB_SETUP_AVX2(yuvconstants)
2353     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2354     LABELALIGN
2355   "1:                                          \n"
2356     READNV21_AVX2
2357     YUVTORGB_AVX2(yuvconstants)
2358     STOREARGB_AVX2
2359     "sub       $0x10,%[width]                  \n"
2360     "jg        1b                              \n"
2361     "vzeroupper                                \n"
2362   : [y_buf]"+r"(y_buf),    // %[y_buf]
2363     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
2364     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2365     [width]"+rm"(width)    // %[width]
2366   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2367     [kShuffleNV21]"m"(kShuffleNV21)
2368     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2369       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2370   );
2371 }
2372 #endif  // HAS_NV21TOARGBROW_AVX2
2373 
2374 #if defined(HAS_YUY2TOARGBROW_AVX2)
2375 // 16 pixels.
2376 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
YUY2ToARGBRow_AVX2(const uint8 * yuy2_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2377 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2378                                uint8* dst_argb,
2379                                const struct YuvConstants* yuvconstants,
2380                                int width) {
2381   asm volatile (
2382     YUVTORGB_SETUP_AVX2(yuvconstants)
2383     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2384     LABELALIGN
2385   "1:                                          \n"
2386     READYUY2_AVX2
2387     YUVTORGB_AVX2(yuvconstants)
2388     STOREARGB_AVX2
2389     "sub       $0x10,%[width]                  \n"
2390     "jg        1b                              \n"
2391     "vzeroupper                                \n"
2392   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
2393     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2394     [width]"+rm"(width)    // %[width]
2395   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2396     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2397     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2398     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2399       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2400   );
2401 }
2402 #endif  // HAS_YUY2TOARGBROW_AVX2
2403 
2404 #if defined(HAS_UYVYTOARGBROW_AVX2)
2405 // 16 pixels.
2406 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
UYVYToARGBRow_AVX2(const uint8 * uyvy_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2407 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
2408                                uint8* dst_argb,
2409                                const struct YuvConstants* yuvconstants,
2410                                int width) {
2411   asm volatile (
2412     YUVTORGB_SETUP_AVX2(yuvconstants)
2413     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2414     LABELALIGN
2415   "1:                                          \n"
2416     READUYVY_AVX2
2417     YUVTORGB_AVX2(yuvconstants)
2418     STOREARGB_AVX2
2419     "sub       $0x10,%[width]                  \n"
2420     "jg        1b                              \n"
2421     "vzeroupper                                \n"
2422   : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
2423     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2424     [width]"+rm"(width)    // %[width]
2425   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2426     [kShuffleUYVYY]"m"(kShuffleUYVYY),
2427     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2428     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2429       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2430   );
2431 }
2432 #endif  // HAS_UYVYTOARGBROW_AVX2
2433 
2434 #ifdef HAS_I400TOARGBROW_SSE2
I400ToARGBRow_SSE2(const uint8 * y_buf,uint8 * dst_argb,int width)2435 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2436   asm volatile (
2437     "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
2438     "movd      %%eax,%%xmm2                    \n"
2439     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
2440     "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
2441     "movd      %%eax,%%xmm3                    \n"
2442     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
2443     "pcmpeqb   %%xmm4,%%xmm4                   \n"
2444     "pslld     $0x18,%%xmm4                    \n"
2445     LABELALIGN
2446   "1:                                          \n"
2447     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2448     "movq      " MEMACCESS(0) ",%%xmm0         \n"
2449     "lea       " MEMLEA(0x8,0) ",%0            \n"
2450     "punpcklbw %%xmm0,%%xmm0                   \n"
2451     "pmulhuw   %%xmm2,%%xmm0                   \n"
2452     "psubusw   %%xmm3,%%xmm0                   \n"
2453     "psrlw     $6, %%xmm0                      \n"
2454     "packuswb  %%xmm0,%%xmm0                   \n"
2455 
2456     // Step 2: Weave into ARGB
2457     "punpcklbw %%xmm0,%%xmm0                   \n"
2458     "movdqa    %%xmm0,%%xmm1                   \n"
2459     "punpcklwd %%xmm0,%%xmm0                   \n"
2460     "punpckhwd %%xmm1,%%xmm1                   \n"
2461     "por       %%xmm4,%%xmm0                   \n"
2462     "por       %%xmm4,%%xmm1                   \n"
2463     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2464     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2465     "lea       " MEMLEA(0x20,1) ",%1           \n"
2466 
2467     "sub       $0x8,%2                         \n"
2468     "jg        1b                              \n"
2469   : "+r"(y_buf),     // %0
2470     "+r"(dst_argb),  // %1
2471     "+rm"(width)     // %2
2472   :
2473   : "memory", "cc", "eax"
2474     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2475   );
2476 }
2477 #endif  // HAS_I400TOARGBROW_SSE2
2478 
2479 #ifdef HAS_I400TOARGBROW_AVX2
2480 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2481 // note: vpunpcklbw mutates and vpackuswb unmutates.
I400ToARGBRow_AVX2(const uint8 * y_buf,uint8 * dst_argb,int width)2482 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2483   asm volatile (
2484     "mov        $0x4a354a35,%%eax              \n" // 0488 = 1160 = 1.164 * 16
2485     "vmovd      %%eax,%%xmm2                   \n"
2486     "vbroadcastss %%xmm2,%%ymm2                \n"
2487     "mov        $0x4880488,%%eax               \n" // 4a35 = 18997 = 1.164
2488     "vmovd      %%eax,%%xmm3                   \n"
2489     "vbroadcastss %%xmm3,%%ymm3                \n"
2490     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
2491     "vpslld     $0x18,%%ymm4,%%ymm4            \n"
2492 
2493     LABELALIGN
2494   "1:                                          \n"
2495     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2496     "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
2497     "lea        " MEMLEA(0x10,0) ",%0          \n"
2498     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2499     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
2500     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
2501     "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
2502     "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
2503     "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
2504     "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
2505     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2506     "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
2507     "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
2508     "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
2509     "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
2510     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2511     "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
2512     "lea       " MEMLEA(0x40,1) ",%1           \n"
2513     "sub        $0x10,%2                       \n"
2514     "jg        1b                              \n"
2515     "vzeroupper                                \n"
2516   : "+r"(y_buf),     // %0
2517     "+r"(dst_argb),  // %1
2518     "+rm"(width)     // %2
2519   :
2520   : "memory", "cc", "eax"
2521     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2522   );
2523 }
2524 #endif  // HAS_I400TOARGBROW_AVX2
2525 
2526 #ifdef HAS_MIRRORROW_SSSE3
2527 // Shuffle table for reversing the bytes.
2528 static uvec8 kShuffleMirror = {
2529   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2530 };
2531 
MirrorRow_SSSE3(const uint8 * src,uint8 * dst,int width)2532 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2533   intptr_t temp_width = (intptr_t)(width);
2534   asm volatile (
2535     "movdqa    %3,%%xmm5                       \n"
2536     LABELALIGN
2537   "1:                                          \n"
2538     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2539     "pshufb    %%xmm5,%%xmm0                   \n"
2540     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2541     "lea       " MEMLEA(0x10,1) ",%1           \n"
2542     "sub       $0x10,%2                        \n"
2543     "jg        1b                              \n"
2544   : "+r"(src),  // %0
2545     "+r"(dst),  // %1
2546     "+r"(temp_width)  // %2
2547   : "m"(kShuffleMirror) // %3
2548   : "memory", "cc", NACL_R14
2549     "xmm0", "xmm5"
2550   );
2551 }
2552 #endif  // HAS_MIRRORROW_SSSE3
2553 
2554 #ifdef HAS_MIRRORROW_AVX2
MirrorRow_AVX2(const uint8 * src,uint8 * dst,int width)2555 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2556   intptr_t temp_width = (intptr_t)(width);
2557   asm volatile (
2558     "vbroadcastf128 %3,%%ymm5                  \n"
2559     LABELALIGN
2560   "1:                                          \n"
2561     MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
2562     "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
2563     "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
2564     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2565     "lea       " MEMLEA(0x20,1) ",%1           \n"
2566     "sub       $0x20,%2                        \n"
2567     "jg        1b                              \n"
2568     "vzeroupper                                \n"
2569   : "+r"(src),  // %0
2570     "+r"(dst),  // %1
2571     "+r"(temp_width)  // %2
2572   : "m"(kShuffleMirror) // %3
2573   : "memory", "cc", NACL_R14
2574     "xmm0", "xmm5"
2575   );
2576 }
2577 #endif  // HAS_MIRRORROW_AVX2
2578 
2579 #ifdef HAS_MIRRORUVROW_SSSE3
2580 // Shuffle table for reversing the bytes of UV channels.
2581 static uvec8 kShuffleMirrorUV = {
2582   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2583 };
MirrorUVRow_SSSE3(const uint8 * src,uint8 * dst_u,uint8 * dst_v,int width)2584 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2585                        int width) {
2586   intptr_t temp_width = (intptr_t)(width);
2587   asm volatile (
2588     "movdqa    %4,%%xmm1                       \n"
2589     "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
2590     "sub       %1,%2                           \n"
2591     LABELALIGN
2592   "1:                                          \n"
2593     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2594     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2595     "pshufb    %%xmm1,%%xmm0                   \n"
2596     "movlpd    %%xmm0," MEMACCESS(1) "         \n"
2597     MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
2598     "lea       " MEMLEA(0x8,1) ",%1            \n"
2599     "sub       $8,%3                           \n"
2600     "jg        1b                              \n"
2601   : "+r"(src),      // %0
2602     "+r"(dst_u),    // %1
2603     "+r"(dst_v),    // %2
2604     "+r"(temp_width)  // %3
2605   : "m"(kShuffleMirrorUV)  // %4
2606   : "memory", "cc", NACL_R14
2607     "xmm0", "xmm1"
2608   );
2609 }
2610 #endif  // HAS_MIRRORUVROW_SSSE3
2611 
2612 #ifdef HAS_ARGBMIRRORROW_SSE2
2613 
ARGBMirrorRow_SSE2(const uint8 * src,uint8 * dst,int width)2614 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2615   intptr_t temp_width = (intptr_t)(width);
2616   asm volatile (
2617     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
2618     LABELALIGN
2619   "1:                                          \n"
2620     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2621     "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
2622     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2623     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2624     "lea       " MEMLEA(0x10,1) ",%1           \n"
2625     "sub       $0x4,%2                         \n"
2626     "jg        1b                              \n"
2627   : "+r"(src),  // %0
2628     "+r"(dst),  // %1
2629     "+r"(temp_width)  // %2
2630   :
2631   : "memory", "cc"
2632     , "xmm0"
2633   );
2634 }
2635 #endif  // HAS_ARGBMIRRORROW_SSE2
2636 
2637 #ifdef HAS_ARGBMIRRORROW_AVX2
2638 // Shuffle table for reversing the bytes.
2639 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2640   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2641 };
ARGBMirrorRow_AVX2(const uint8 * src,uint8 * dst,int width)2642 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2643   intptr_t temp_width = (intptr_t)(width);
2644   asm volatile (
2645     "vmovdqu    %3,%%ymm5                      \n"
2646     LABELALIGN
2647   "1:                                          \n"
2648     VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2649     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2650     "lea        " MEMLEA(0x20,1) ",%1          \n"
2651     "sub        $0x8,%2                        \n"
2652     "jg         1b                             \n"
2653     "vzeroupper                                \n"
2654   : "+r"(src),  // %0
2655     "+r"(dst),  // %1
2656     "+r"(temp_width)  // %2
2657   : "m"(kARGBShuffleMirror_AVX2) // %3
2658   : "memory", "cc", NACL_R14
2659     "xmm0", "xmm5"
2660   );
2661 }
2662 #endif  // HAS_ARGBMIRRORROW_AVX2
2663 
2664 #ifdef HAS_SPLITUVROW_AVX2
SplitUVRow_AVX2(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)2665 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2666                      int width) {
2667   asm volatile (
2668     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
2669     "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
2670     "sub        %1,%2                            \n"
2671     LABELALIGN
2672   "1:                                            \n"
2673     "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
2674     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
2675     "lea        " MEMLEA(0x40,0) ",%0            \n"
2676     "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
2677     "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
2678     "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
2679     "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
2680     "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
2681     "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
2682     "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
2683     "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
2684     "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
2685     MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
2686     "lea        " MEMLEA(0x20,1) ",%1            \n"
2687     "sub        $0x20,%3                         \n"
2688     "jg         1b                               \n"
2689     "vzeroupper                                  \n"
2690   : "+r"(src_uv),     // %0
2691     "+r"(dst_u),      // %1
2692     "+r"(dst_v),      // %2
2693     "+r"(width)         // %3
2694   :
2695   : "memory", "cc", NACL_R14
2696     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2697   );
2698 }
2699 #endif  // HAS_SPLITUVROW_AVX2
2700 
2701 #ifdef HAS_SPLITUVROW_SSE2
SplitUVRow_SSE2(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)2702 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2703                      int width) {
2704   asm volatile (
2705     "pcmpeqb    %%xmm5,%%xmm5                    \n"
2706     "psrlw      $0x8,%%xmm5                      \n"
2707     "sub        %1,%2                            \n"
2708     LABELALIGN
2709   "1:                                            \n"
2710     "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
2711     "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
2712     "lea        " MEMLEA(0x20,0) ",%0            \n"
2713     "movdqa     %%xmm0,%%xmm2                    \n"
2714     "movdqa     %%xmm1,%%xmm3                    \n"
2715     "pand       %%xmm5,%%xmm0                    \n"
2716     "pand       %%xmm5,%%xmm1                    \n"
2717     "packuswb   %%xmm1,%%xmm0                    \n"
2718     "psrlw      $0x8,%%xmm2                      \n"
2719     "psrlw      $0x8,%%xmm3                      \n"
2720     "packuswb   %%xmm3,%%xmm2                    \n"
2721     "movdqu     %%xmm0," MEMACCESS(1) "          \n"
2722     MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
2723     "lea        " MEMLEA(0x10,1) ",%1            \n"
2724     "sub        $0x10,%3                         \n"
2725     "jg         1b                               \n"
2726   : "+r"(src_uv),     // %0
2727     "+r"(dst_u),      // %1
2728     "+r"(dst_v),      // %2
2729     "+r"(width)         // %3
2730   :
2731   : "memory", "cc", NACL_R14
2732     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2733   );
2734 }
2735 #endif  // HAS_SPLITUVROW_SSE2
2736 
2737 #ifdef HAS_MERGEUVROW_AVX2
MergeUVRow_AVX2(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)2738 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2739                      int width) {
2740   asm volatile (
2741     "sub       %0,%1                             \n"
2742     LABELALIGN
2743   "1:                                            \n"
2744     "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
2745     MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
2746     "lea       " MEMLEA(0x20,0) ",%0             \n"
2747     "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
2748     "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
2749     "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
2750     "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2751     "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2752     "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2753     "lea       " MEMLEA(0x40,2) ",%2             \n"
2754     "sub       $0x20,%3                          \n"
2755     "jg        1b                                \n"
2756     "vzeroupper                                  \n"
2757   : "+r"(src_u),     // %0
2758     "+r"(src_v),     // %1
2759     "+r"(dst_uv),    // %2
2760     "+r"(width)      // %3
2761   :
2762   : "memory", "cc", NACL_R14
2763     "xmm0", "xmm1", "xmm2"
2764   );
2765 }
2766 #endif  // HAS_MERGEUVROW_AVX2
2767 
2768 #ifdef HAS_MERGEUVROW_SSE2
MergeUVRow_SSE2(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)2769 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2770                      int width) {
2771   asm volatile (
2772     "sub       %0,%1                             \n"
2773     LABELALIGN
2774   "1:                                            \n"
2775     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
2776     MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
2777     "lea       " MEMLEA(0x10,0) ",%0             \n"
2778     "movdqa    %%xmm0,%%xmm2                     \n"
2779     "punpcklbw %%xmm1,%%xmm0                     \n"
2780     "punpckhbw %%xmm1,%%xmm2                     \n"
2781     "movdqu    %%xmm0," MEMACCESS(2) "           \n"
2782     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
2783     "lea       " MEMLEA(0x20,2) ",%2             \n"
2784     "sub       $0x10,%3                          \n"
2785     "jg        1b                                \n"
2786   : "+r"(src_u),     // %0
2787     "+r"(src_v),     // %1
2788     "+r"(dst_uv),    // %2
2789     "+r"(width)      // %3
2790   :
2791   : "memory", "cc", NACL_R14
2792     "xmm0", "xmm1", "xmm2"
2793   );
2794 }
2795 #endif  // HAS_MERGEUVROW_SSE2
2796 
2797 #ifdef HAS_COPYROW_SSE2
CopyRow_SSE2(const uint8 * src,uint8 * dst,int count)2798 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2799   asm volatile (
2800     "test       $0xf,%0                        \n"
2801     "jne        2f                             \n"
2802     "test       $0xf,%1                        \n"
2803     "jne        2f                             \n"
2804     LABELALIGN
2805   "1:                                          \n"
2806     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
2807     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2808     "lea       " MEMLEA(0x20,0) ",%0           \n"
2809     "movdqa    %%xmm0," MEMACCESS(1) "         \n"
2810     "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2811     "lea       " MEMLEA(0x20,1) ",%1           \n"
2812     "sub       $0x20,%2                        \n"
2813     "jg        1b                              \n"
2814     "jmp       9f                              \n"
2815     LABELALIGN
2816   "2:                                          \n"
2817     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2818     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2819     "lea       " MEMLEA(0x20,0) ",%0           \n"
2820     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2821     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2822     "lea       " MEMLEA(0x20,1) ",%1           \n"
2823     "sub       $0x20,%2                        \n"
2824     "jg        2b                              \n"
2825   "9:                                          \n"
2826   : "+r"(src),   // %0
2827     "+r"(dst),   // %1
2828     "+r"(count)  // %2
2829   :
2830   : "memory", "cc"
2831     , "xmm0", "xmm1"
2832   );
2833 }
2834 #endif  // HAS_COPYROW_SSE2
2835 
2836 #ifdef HAS_COPYROW_AVX
CopyRow_AVX(const uint8 * src,uint8 * dst,int count)2837 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2838   asm volatile (
2839     LABELALIGN
2840   "1:                                          \n"
2841     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2842     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2843     "lea       " MEMLEA(0x40,0) ",%0           \n"
2844     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
2845     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
2846     "lea       " MEMLEA(0x40,1) ",%1           \n"
2847     "sub       $0x40,%2                        \n"
2848     "jg        1b                              \n"
2849   : "+r"(src),   // %0
2850     "+r"(dst),   // %1
2851     "+r"(count)  // %2
2852   :
2853   : "memory", "cc"
2854     , "xmm0", "xmm1"
2855   );
2856 }
2857 #endif  // HAS_COPYROW_AVX
2858 
2859 #ifdef HAS_COPYROW_ERMS
2860 // Multiple of 1.
CopyRow_ERMS(const uint8 * src,uint8 * dst,int width)2861 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2862   size_t width_tmp = (size_t)(width);
2863   asm volatile (
2864     "rep movsb " MEMMOVESTRING(0,1) "          \n"
2865   : "+S"(src),  // %0
2866     "+D"(dst),  // %1
2867     "+c"(width_tmp) // %2
2868   :
2869   : "memory", "cc"
2870   );
2871 }
2872 #endif  // HAS_COPYROW_ERMS
2873 
2874 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2875 // width in pixels
ARGBCopyAlphaRow_SSE2(const uint8 * src,uint8 * dst,int width)2876 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2877   asm volatile (
2878     "pcmpeqb   %%xmm0,%%xmm0                   \n"
2879     "pslld     $0x18,%%xmm0                    \n"
2880     "pcmpeqb   %%xmm1,%%xmm1                   \n"
2881     "psrld     $0x8,%%xmm1                     \n"
2882     LABELALIGN
2883   "1:                                          \n"
2884     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
2885     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
2886     "lea       " MEMLEA(0x20,0) ",%0           \n"
2887     "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
2888     "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
2889     "pand      %%xmm0,%%xmm2                   \n"
2890     "pand      %%xmm0,%%xmm3                   \n"
2891     "pand      %%xmm1,%%xmm4                   \n"
2892     "pand      %%xmm1,%%xmm5                   \n"
2893     "por       %%xmm4,%%xmm2                   \n"
2894     "por       %%xmm5,%%xmm3                   \n"
2895     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2896     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2897     "lea       " MEMLEA(0x20,1) ",%1           \n"
2898     "sub       $0x8,%2                         \n"
2899     "jg        1b                              \n"
2900   : "+r"(src),   // %0
2901     "+r"(dst),   // %1
2902     "+r"(width)  // %2
2903   :
2904   : "memory", "cc"
2905     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2906   );
2907 }
2908 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
2909 
2910 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2911 // width in pixels
ARGBCopyAlphaRow_AVX2(const uint8 * src,uint8 * dst,int width)2912 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2913   asm volatile (
2914     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
2915     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
2916     LABELALIGN
2917   "1:                                          \n"
2918     "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
2919     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
2920     "lea       " MEMLEA(0x40,0) ",%0           \n"
2921     "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
2922     "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
2923     "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
2924     "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
2925     "lea       " MEMLEA(0x40,1) ",%1           \n"
2926     "sub       $0x10,%2                        \n"
2927     "jg        1b                              \n"
2928     "vzeroupper                                \n"
2929   : "+r"(src),   // %0
2930     "+r"(dst),   // %1
2931     "+r"(width)  // %2
2932   :
2933   : "memory", "cc"
2934     , "xmm0", "xmm1", "xmm2"
2935   );
2936 }
2937 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
2938 
2939 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2940 // width in pixels
ARGBExtractAlphaRow_SSE2(const uint8 * src_argb,uint8 * dst_a,int width)2941 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
2942  asm volatile (
2943     LABELALIGN
2944   "1:                                          \n"
2945     "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
2946     "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
2947     "lea       " MEMLEA(0x20, 0) ", %0         \n"
2948     "psrld     $0x18, %%xmm0                   \n"
2949     "psrld     $0x18, %%xmm1                   \n"
2950     "packssdw  %%xmm1, %%xmm0                  \n"
2951     "packuswb  %%xmm0, %%xmm0                  \n"
2952     "movq      %%xmm0," MEMACCESS(1) "         \n"
2953     "lea       " MEMLEA(0x8, 1) ", %1          \n"
2954     "sub       $0x8, %2                        \n"
2955     "jg        1b                              \n"
2956   : "+r"(src_argb),  // %0
2957     "+r"(dst_a),     // %1
2958     "+rm"(width)     // %2
2959   :
2960   : "memory", "cc"
2961     , "xmm0", "xmm1"
2962   );
2963 }
2964 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
2965 
2966 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2967 // width in pixels
ARGBCopyYToAlphaRow_SSE2(const uint8 * src,uint8 * dst,int width)2968 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2969   asm volatile (
2970     "pcmpeqb   %%xmm0,%%xmm0                   \n"
2971     "pslld     $0x18,%%xmm0                    \n"
2972     "pcmpeqb   %%xmm1,%%xmm1                   \n"
2973     "psrld     $0x8,%%xmm1                     \n"
2974     LABELALIGN
2975   "1:                                          \n"
2976     "movq      " MEMACCESS(0) ",%%xmm2         \n"
2977     "lea       " MEMLEA(0x8,0) ",%0            \n"
2978     "punpcklbw %%xmm2,%%xmm2                   \n"
2979     "punpckhwd %%xmm2,%%xmm3                   \n"
2980     "punpcklwd %%xmm2,%%xmm2                   \n"
2981     "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
2982     "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
2983     "pand      %%xmm0,%%xmm2                   \n"
2984     "pand      %%xmm0,%%xmm3                   \n"
2985     "pand      %%xmm1,%%xmm4                   \n"
2986     "pand      %%xmm1,%%xmm5                   \n"
2987     "por       %%xmm4,%%xmm2                   \n"
2988     "por       %%xmm5,%%xmm3                   \n"
2989     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2990     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2991     "lea       " MEMLEA(0x20,1) ",%1           \n"
2992     "sub       $0x8,%2                         \n"
2993     "jg        1b                              \n"
2994   : "+r"(src),   // %0
2995     "+r"(dst),   // %1
2996     "+r"(width)  // %2
2997   :
2998   : "memory", "cc"
2999     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3000   );
3001 }
3002 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3003 
3004 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3005 // width in pixels
ARGBCopyYToAlphaRow_AVX2(const uint8 * src,uint8 * dst,int width)3006 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3007   asm volatile (
3008     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
3009     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
3010     LABELALIGN
3011   "1:                                          \n"
3012     "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
3013     "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
3014     "lea       " MEMLEA(0x10,0) ",%0           \n"
3015     "vpslld    $0x18,%%ymm1,%%ymm1             \n"
3016     "vpslld    $0x18,%%ymm2,%%ymm2             \n"
3017     "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
3018     "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
3019     "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
3020     "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
3021     "lea       " MEMLEA(0x40,1) ",%1           \n"
3022     "sub       $0x10,%2                        \n"
3023     "jg        1b                              \n"
3024     "vzeroupper                                \n"
3025   : "+r"(src),   // %0
3026     "+r"(dst),   // %1
3027     "+r"(width)  // %2
3028   :
3029   : "memory", "cc"
3030     , "xmm0", "xmm1", "xmm2"
3031   );
3032 }
3033 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3034 
3035 #ifdef HAS_SETROW_X86
SetRow_X86(uint8 * dst,uint8 v8,int width)3036 void SetRow_X86(uint8* dst, uint8 v8, int width) {
3037   size_t width_tmp = (size_t)(width >> 2);
3038   const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
3039   asm volatile (
3040     "rep stosl " MEMSTORESTRING(eax,0) "       \n"
3041     : "+D"(dst),       // %0
3042       "+c"(width_tmp)  // %1
3043     : "a"(v32)         // %2
3044     : "memory", "cc");
3045 }
3046 
SetRow_ERMS(uint8 * dst,uint8 v8,int width)3047 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
3048   size_t width_tmp = (size_t)(width);
3049   asm volatile (
3050     "rep stosb " MEMSTORESTRING(al,0) "        \n"
3051     : "+D"(dst),       // %0
3052       "+c"(width_tmp)  // %1
3053     : "a"(v8)          // %2
3054     : "memory", "cc");
3055 }
3056 
ARGBSetRow_X86(uint8 * dst_argb,uint32 v32,int width)3057 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
3058   size_t width_tmp = (size_t)(width);
3059   asm volatile (
3060     "rep stosl " MEMSTORESTRING(eax,0) "       \n"
3061     : "+D"(dst_argb),  // %0
3062       "+c"(width_tmp)  // %1
3063     : "a"(v32)         // %2
3064     : "memory", "cc");
3065 }
3066 #endif  // HAS_SETROW_X86
3067 
3068 #ifdef HAS_YUY2TOYROW_SSE2
YUY2ToYRow_SSE2(const uint8 * src_yuy2,uint8 * dst_y,int width)3069 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
3070   asm volatile (
3071     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3072     "psrlw     $0x8,%%xmm5                     \n"
3073     LABELALIGN
3074   "1:                                          \n"
3075     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3076     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3077     "lea       " MEMLEA(0x20,0) ",%0           \n"
3078     "pand      %%xmm5,%%xmm0                   \n"
3079     "pand      %%xmm5,%%xmm1                   \n"
3080     "packuswb  %%xmm1,%%xmm0                   \n"
3081     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3082     "lea       " MEMLEA(0x10,1) ",%1           \n"
3083     "sub       $0x10,%2                        \n"
3084     "jg        1b                              \n"
3085   : "+r"(src_yuy2),  // %0
3086     "+r"(dst_y),     // %1
3087     "+r"(width)        // %2
3088   :
3089   : "memory", "cc"
3090     , "xmm0", "xmm1", "xmm5"
3091   );
3092 }
3093 
YUY2ToUVRow_SSE2(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)3094 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3095                       uint8* dst_u, uint8* dst_v, int width) {
3096   asm volatile (
3097     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3098     "psrlw     $0x8,%%xmm5                     \n"
3099     "sub       %1,%2                           \n"
3100     LABELALIGN
3101   "1:                                          \n"
3102     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3103     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3104     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
3105     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
3106     "lea       " MEMLEA(0x20,0) ",%0           \n"
3107     "pavgb     %%xmm2,%%xmm0                   \n"
3108     "pavgb     %%xmm3,%%xmm1                   \n"
3109     "psrlw     $0x8,%%xmm0                     \n"
3110     "psrlw     $0x8,%%xmm1                     \n"
3111     "packuswb  %%xmm1,%%xmm0                   \n"
3112     "movdqa    %%xmm0,%%xmm1                   \n"
3113     "pand      %%xmm5,%%xmm0                   \n"
3114     "packuswb  %%xmm0,%%xmm0                   \n"
3115     "psrlw     $0x8,%%xmm1                     \n"
3116     "packuswb  %%xmm1,%%xmm1                   \n"
3117     "movq      %%xmm0," MEMACCESS(1) "         \n"
3118     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3119     "lea       " MEMLEA(0x8,1) ",%1            \n"
3120     "sub       $0x10,%3                        \n"
3121     "jg        1b                              \n"
3122   : "+r"(src_yuy2),    // %0
3123     "+r"(dst_u),       // %1
3124     "+r"(dst_v),       // %2
3125     "+r"(width)          // %3
3126   : "r"((intptr_t)(stride_yuy2))  // %4
3127   : "memory", "cc", NACL_R14
3128     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3129   );
3130 }
3131 
YUY2ToUV422Row_SSE2(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)3132 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3133                          uint8* dst_u, uint8* dst_v, int width) {
3134   asm volatile (
3135     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3136     "psrlw     $0x8,%%xmm5                     \n"
3137     "sub       %1,%2                           \n"
3138     LABELALIGN
3139   "1:                                          \n"
3140     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3141     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3142     "lea       " MEMLEA(0x20,0) ",%0           \n"
3143     "psrlw     $0x8,%%xmm0                     \n"
3144     "psrlw     $0x8,%%xmm1                     \n"
3145     "packuswb  %%xmm1,%%xmm0                   \n"
3146     "movdqa    %%xmm0,%%xmm1                   \n"
3147     "pand      %%xmm5,%%xmm0                   \n"
3148     "packuswb  %%xmm0,%%xmm0                   \n"
3149     "psrlw     $0x8,%%xmm1                     \n"
3150     "packuswb  %%xmm1,%%xmm1                   \n"
3151     "movq      %%xmm0," MEMACCESS(1) "         \n"
3152     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3153     "lea       " MEMLEA(0x8,1) ",%1            \n"
3154     "sub       $0x10,%3                        \n"
3155     "jg        1b                              \n"
3156   : "+r"(src_yuy2),    // %0
3157     "+r"(dst_u),       // %1
3158     "+r"(dst_v),       // %2
3159     "+r"(width)          // %3
3160   :
3161   : "memory", "cc", NACL_R14
3162     "xmm0", "xmm1", "xmm5"
3163   );
3164 }
3165 
UYVYToYRow_SSE2(const uint8 * src_uyvy,uint8 * dst_y,int width)3166 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
3167   asm volatile (
3168     LABELALIGN
3169   "1:                                          \n"
3170     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3171     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3172     "lea       " MEMLEA(0x20,0) ",%0           \n"
3173     "psrlw     $0x8,%%xmm0                     \n"
3174     "psrlw     $0x8,%%xmm1                     \n"
3175     "packuswb  %%xmm1,%%xmm0                   \n"
3176     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3177     "lea       " MEMLEA(0x10,1) ",%1           \n"
3178     "sub       $0x10,%2                        \n"
3179     "jg        1b                              \n"
3180   : "+r"(src_uyvy),  // %0
3181     "+r"(dst_y),     // %1
3182     "+r"(width)        // %2
3183   :
3184   : "memory", "cc"
3185     , "xmm0", "xmm1"
3186   );
3187 }
3188 
UYVYToUVRow_SSE2(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3189 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3190                       uint8* dst_u, uint8* dst_v, int width) {
3191   asm volatile (
3192     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3193     "psrlw     $0x8,%%xmm5                     \n"
3194     "sub       %1,%2                           \n"
3195     LABELALIGN
3196   "1:                                          \n"
3197     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3198     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3199     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
3200     MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
3201     "lea       " MEMLEA(0x20,0) ",%0           \n"
3202     "pavgb     %%xmm2,%%xmm0                   \n"
3203     "pavgb     %%xmm3,%%xmm1                   \n"
3204     "pand      %%xmm5,%%xmm0                   \n"
3205     "pand      %%xmm5,%%xmm1                   \n"
3206     "packuswb  %%xmm1,%%xmm0                   \n"
3207     "movdqa    %%xmm0,%%xmm1                   \n"
3208     "pand      %%xmm5,%%xmm0                   \n"
3209     "packuswb  %%xmm0,%%xmm0                   \n"
3210     "psrlw     $0x8,%%xmm1                     \n"
3211     "packuswb  %%xmm1,%%xmm1                   \n"
3212     "movq      %%xmm0," MEMACCESS(1) "         \n"
3213     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3214     "lea       " MEMLEA(0x8,1) ",%1            \n"
3215     "sub       $0x10,%3                        \n"
3216     "jg        1b                              \n"
3217   : "+r"(src_uyvy),    // %0
3218     "+r"(dst_u),       // %1
3219     "+r"(dst_v),       // %2
3220     "+r"(width)          // %3
3221   : "r"((intptr_t)(stride_uyvy))  // %4
3222   : "memory", "cc", NACL_R14
3223     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3224   );
3225 }
3226 
UYVYToUV422Row_SSE2(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3227 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3228                          uint8* dst_u, uint8* dst_v, int width) {
3229   asm volatile (
3230     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3231     "psrlw     $0x8,%%xmm5                     \n"
3232     "sub       %1,%2                           \n"
3233     LABELALIGN
3234   "1:                                          \n"
3235     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3236     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3237     "lea       " MEMLEA(0x20,0) ",%0           \n"
3238     "pand      %%xmm5,%%xmm0                   \n"
3239     "pand      %%xmm5,%%xmm1                   \n"
3240     "packuswb  %%xmm1,%%xmm0                   \n"
3241     "movdqa    %%xmm0,%%xmm1                   \n"
3242     "pand      %%xmm5,%%xmm0                   \n"
3243     "packuswb  %%xmm0,%%xmm0                   \n"
3244     "psrlw     $0x8,%%xmm1                     \n"
3245     "packuswb  %%xmm1,%%xmm1                   \n"
3246     "movq      %%xmm0," MEMACCESS(1) "         \n"
3247     MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3248     "lea       " MEMLEA(0x8,1) ",%1            \n"
3249     "sub       $0x10,%3                        \n"
3250     "jg        1b                              \n"
3251   : "+r"(src_uyvy),    // %0
3252     "+r"(dst_u),       // %1
3253     "+r"(dst_v),       // %2
3254     "+r"(width)          // %3
3255   :
3256   : "memory", "cc", NACL_R14
3257     "xmm0", "xmm1", "xmm5"
3258   );
3259 }
3260 #endif  // HAS_YUY2TOYROW_SSE2
3261 
3262 #ifdef HAS_YUY2TOYROW_AVX2
YUY2ToYRow_AVX2(const uint8 * src_yuy2,uint8 * dst_y,int width)3263 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3264   asm volatile (
3265     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3266     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
3267     LABELALIGN
3268   "1:                                          \n"
3269     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3270     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3271     "lea       " MEMLEA(0x40,0) ",%0           \n"
3272     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3273     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3274     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3275     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3276     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
3277     "lea      " MEMLEA(0x20,1) ",%1            \n"
3278     "sub       $0x20,%2                        \n"
3279     "jg        1b                              \n"
3280     "vzeroupper                                \n"
3281   : "+r"(src_yuy2),  // %0
3282     "+r"(dst_y),     // %1
3283     "+r"(width)        // %2
3284   :
3285   : "memory", "cc"
3286     , "xmm0", "xmm1", "xmm5"
3287   );
3288 }
3289 
YUY2ToUVRow_AVX2(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)3290 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3291                       uint8* dst_u, uint8* dst_v, int width) {
3292   asm volatile (
3293     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3294     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
3295     "sub       %1,%2                           \n"
3296     LABELALIGN
3297   "1:                                          \n"
3298     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3299     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3300     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3301     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3302     "lea       " MEMLEA(0x40,0) ",%0           \n"
3303     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3304     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3305     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3306     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3307     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3308     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3309     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3310     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3311     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3312     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3313     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3314     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3315     "lea      " MEMLEA(0x10,1) ",%1            \n"
3316     "sub       $0x20,%3                        \n"
3317     "jg        1b                              \n"
3318     "vzeroupper                                \n"
3319   : "+r"(src_yuy2),    // %0
3320     "+r"(dst_u),       // %1
3321     "+r"(dst_v),       // %2
3322     "+r"(width)          // %3
3323   : "r"((intptr_t)(stride_yuy2))  // %4
3324   : "memory", "cc", NACL_R14
3325     "xmm0", "xmm1", "xmm5"
3326   );
3327 }
3328 
YUY2ToUV422Row_AVX2(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)3329 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3330                          uint8* dst_u, uint8* dst_v, int width) {
3331   asm volatile (
3332     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3333     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
3334     "sub       %1,%2                           \n"
3335     LABELALIGN
3336   "1:                                          \n"
3337     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3338     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3339     "lea       " MEMLEA(0x40,0) ",%0           \n"
3340     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3341     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3342     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3343     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3344     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3345     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3346     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3347     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3348     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3349     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3350     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3351     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3352     "lea      " MEMLEA(0x10,1) ",%1            \n"
3353     "sub       $0x20,%3                        \n"
3354     "jg        1b                              \n"
3355     "vzeroupper                                \n"
3356   : "+r"(src_yuy2),    // %0
3357     "+r"(dst_u),       // %1
3358     "+r"(dst_v),       // %2
3359     "+r"(width)          // %3
3360   :
3361   : "memory", "cc", NACL_R14
3362     "xmm0", "xmm1", "xmm5"
3363   );
3364 }
3365 
UYVYToYRow_AVX2(const uint8 * src_uyvy,uint8 * dst_y,int width)3366 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
3367   asm volatile (
3368     LABELALIGN
3369   "1:                                          \n"
3370     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3371     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3372     "lea       " MEMLEA(0x40,0) ",%0           \n"
3373     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3374     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3375     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3376     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3377     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
3378     "lea      " MEMLEA(0x20,1) ",%1            \n"
3379     "sub       $0x20,%2                        \n"
3380     "jg        1b                              \n"
3381     "vzeroupper                                \n"
3382   : "+r"(src_uyvy),  // %0
3383     "+r"(dst_y),     // %1
3384     "+r"(width)        // %2
3385   :
3386   : "memory", "cc"
3387     , "xmm0", "xmm1", "xmm5"
3388   );
3389 }
UYVYToUVRow_AVX2(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3390 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3391                       uint8* dst_u, uint8* dst_v, int width) {
3392   asm volatile (
3393     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3394     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
3395     "sub       %1,%2                           \n"
3396 
3397     LABELALIGN
3398   "1:                                          \n"
3399     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3400     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3401     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3402     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3403     "lea       " MEMLEA(0x40,0) ",%0           \n"
3404     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3405     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3406     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3407     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3408     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3409     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3410     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3411     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3412     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3413     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3414     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3415     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3416     "lea      " MEMLEA(0x10,1) ",%1            \n"
3417     "sub       $0x20,%3                        \n"
3418     "jg        1b                              \n"
3419     "vzeroupper                                \n"
3420   : "+r"(src_uyvy),    // %0
3421     "+r"(dst_u),       // %1
3422     "+r"(dst_v),       // %2
3423     "+r"(width)          // %3
3424   : "r"((intptr_t)(stride_uyvy))  // %4
3425   : "memory", "cc", NACL_R14
3426     "xmm0", "xmm1", "xmm5"
3427   );
3428 }
3429 
UYVYToUV422Row_AVX2(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3430 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3431                          uint8* dst_u, uint8* dst_v, int width) {
3432   asm volatile (
3433     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3434     "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
3435     "sub       %1,%2                           \n"
3436     LABELALIGN
3437   "1:                                          \n"
3438     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3439     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3440     "lea       " MEMLEA(0x40,0) ",%0           \n"
3441     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3442     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3443     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3444     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3445     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3446     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3447     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3448     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3449     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3450     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3451     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3452     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3453     "lea      " MEMLEA(0x10,1) ",%1            \n"
3454     "sub       $0x20,%3                        \n"
3455     "jg        1b                              \n"
3456     "vzeroupper                                \n"
3457   : "+r"(src_uyvy),    // %0
3458     "+r"(dst_u),       // %1
3459     "+r"(dst_v),       // %2
3460     "+r"(width)          // %3
3461   :
3462   : "memory", "cc", NACL_R14
3463     "xmm0", "xmm1", "xmm5"
3464   );
3465 }
3466 #endif  // HAS_YUY2TOYROW_AVX2
3467 
3468 #ifdef HAS_ARGBBLENDROW_SSSE3
3469 // Shuffle table for isolating alpha.
3470 static uvec8 kShuffleAlpha = {
3471   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3472   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3473 };
3474 
3475 // Blend 8 pixels at a time
ARGBBlendRow_SSSE3(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)3476 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3477                         uint8* dst_argb, int width) {
3478   asm volatile (
3479     "pcmpeqb   %%xmm7,%%xmm7                   \n"
3480     "psrlw     $0xf,%%xmm7                     \n"
3481     "pcmpeqb   %%xmm6,%%xmm6                   \n"
3482     "psrlw     $0x8,%%xmm6                     \n"
3483     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3484     "psllw     $0x8,%%xmm5                     \n"
3485     "pcmpeqb   %%xmm4,%%xmm4                   \n"
3486     "pslld     $0x18,%%xmm4                    \n"
3487     "sub       $0x4,%3                         \n"
3488     "jl        49f                             \n"
3489 
3490     // 4 pixel loop.
3491     LABELALIGN
3492   "40:                                         \n"
3493     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
3494     "lea       " MEMLEA(0x10,0) ",%0           \n"
3495     "movdqa    %%xmm3,%%xmm0                   \n"
3496     "pxor      %%xmm4,%%xmm3                   \n"
3497     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3498     "pshufb    %4,%%xmm3                       \n"
3499     "pand      %%xmm6,%%xmm2                   \n"
3500     "paddw     %%xmm7,%%xmm3                   \n"
3501     "pmullw    %%xmm3,%%xmm2                   \n"
3502     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
3503     "lea       " MEMLEA(0x10,1) ",%1           \n"
3504     "psrlw     $0x8,%%xmm1                     \n"
3505     "por       %%xmm4,%%xmm0                   \n"
3506     "pmullw    %%xmm3,%%xmm1                   \n"
3507     "psrlw     $0x8,%%xmm2                     \n"
3508     "paddusb   %%xmm2,%%xmm0                   \n"
3509     "pand      %%xmm5,%%xmm1                   \n"
3510     "paddusb   %%xmm1,%%xmm0                   \n"
3511     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3512     "lea       " MEMLEA(0x10,2) ",%2           \n"
3513     "sub       $0x4,%3                         \n"
3514     "jge       40b                             \n"
3515 
3516   "49:                                         \n"
3517     "add       $0x3,%3                         \n"
3518     "jl        99f                             \n"
3519 
3520     // 1 pixel loop.
3521   "91:                                         \n"
3522     "movd      " MEMACCESS(0) ",%%xmm3         \n"
3523     "lea       " MEMLEA(0x4,0) ",%0            \n"
3524     "movdqa    %%xmm3,%%xmm0                   \n"
3525     "pxor      %%xmm4,%%xmm3                   \n"
3526     "movd      " MEMACCESS(1) ",%%xmm2         \n"
3527     "pshufb    %4,%%xmm3                       \n"
3528     "pand      %%xmm6,%%xmm2                   \n"
3529     "paddw     %%xmm7,%%xmm3                   \n"
3530     "pmullw    %%xmm3,%%xmm2                   \n"
3531     "movd      " MEMACCESS(1) ",%%xmm1         \n"
3532     "lea       " MEMLEA(0x4,1) ",%1            \n"
3533     "psrlw     $0x8,%%xmm1                     \n"
3534     "por       %%xmm4,%%xmm0                   \n"
3535     "pmullw    %%xmm3,%%xmm1                   \n"
3536     "psrlw     $0x8,%%xmm2                     \n"
3537     "paddusb   %%xmm2,%%xmm0                   \n"
3538     "pand      %%xmm5,%%xmm1                   \n"
3539     "paddusb   %%xmm1,%%xmm0                   \n"
3540     "movd      %%xmm0," MEMACCESS(2) "         \n"
3541     "lea       " MEMLEA(0x4,2) ",%2            \n"
3542     "sub       $0x1,%3                         \n"
3543     "jge       91b                             \n"
3544   "99:                                         \n"
3545   : "+r"(src_argb0),    // %0
3546     "+r"(src_argb1),    // %1
3547     "+r"(dst_argb),     // %2
3548     "+r"(width)         // %3
3549   : "m"(kShuffleAlpha)  // %4
3550   : "memory", "cc"
3551     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3552   );
3553 }
3554 #endif  // HAS_ARGBBLENDROW_SSSE3
3555 
3556 #ifdef HAS_BLENDPLANEROW_SSSE3
3557 // Blend 8 pixels at a time.
3558 // unsigned version of math
3559 // =((A2*C2)+(B2*(255-C2))+255)/256
3560 // signed version of math
3561 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
BlendPlaneRow_SSSE3(const uint8 * src0,const uint8 * src1,const uint8 * alpha,uint8 * dst,int width)3562 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
3563                          const uint8* alpha, uint8* dst, int width) {
3564   asm volatile (
3565     "pcmpeqb    %%xmm5,%%xmm5                  \n"
3566     "psllw      $0x8,%%xmm5                    \n"
3567     "mov        $0x80808080,%%eax              \n"
3568     "movd       %%eax,%%xmm6                   \n"
3569     "pshufd     $0x0,%%xmm6,%%xmm6             \n"
3570     "mov        $0x807f807f,%%eax              \n"
3571     "movd       %%eax,%%xmm7                   \n"
3572     "pshufd     $0x0,%%xmm7,%%xmm7             \n"
3573     "sub        %2,%0                          \n"
3574     "sub        %2,%1                          \n"
3575     "sub        %2,%3                          \n"
3576 
3577     // 8 pixel loop.
3578     LABELALIGN
3579   "1:                                          \n"
3580     "movq       (%2),%%xmm0                    \n"
3581     "punpcklbw  %%xmm0,%%xmm0                  \n"
3582     "pxor       %%xmm5,%%xmm0                  \n"
3583     "movq       (%0,%2,1),%%xmm1               \n"
3584     "movq       (%1,%2,1),%%xmm2               \n"
3585     "punpcklbw  %%xmm2,%%xmm1                  \n"
3586     "psubb      %%xmm6,%%xmm1                  \n"
3587     "pmaddubsw  %%xmm1,%%xmm0                  \n"
3588     "paddw      %%xmm7,%%xmm0                  \n"
3589     "psrlw      $0x8,%%xmm0                    \n"
3590     "packuswb   %%xmm0,%%xmm0                  \n"
3591     "movq       %%xmm0,(%3,%2,1)               \n"
3592     "lea        0x8(%2),%2                     \n"
3593     "sub        $0x8,%4                        \n"
3594     "jg        1b                              \n"
3595   : "+r"(src0),       // %0
3596     "+r"(src1),       // %1
3597     "+r"(alpha),      // %2
3598     "+r"(dst),        // %3
3599     "+rm"(width)      // %4
3600   :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
3601   );
3602 }
3603 #endif  // HAS_BLENDPLANEROW_SSSE3
3604 
3605 #ifdef HAS_BLENDPLANEROW_AVX2
3606 // Blend 32 pixels at a time.
3607 // unsigned version of math
3608 // =((A2*C2)+(B2*(255-C2))+255)/256
3609 // signed version of math
3610 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
BlendPlaneRow_AVX2(const uint8 * src0,const uint8 * src1,const uint8 * alpha,uint8 * dst,int width)3611 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
3612                         const uint8* alpha, uint8* dst, int width) {
3613   asm volatile (
3614     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3615     "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
3616     "mov        $0x80808080,%%eax              \n"
3617     "vmovd      %%eax,%%xmm6                   \n"
3618     "vbroadcastss %%xmm6,%%ymm6                \n"
3619     "mov        $0x807f807f,%%eax              \n"
3620     "vmovd      %%eax,%%xmm7                   \n"
3621     "vbroadcastss %%xmm7,%%ymm7                \n"
3622     "sub        %2,%0                          \n"
3623     "sub        %2,%1                          \n"
3624     "sub        %2,%3                          \n"
3625 
3626     // 32 pixel loop.
3627     LABELALIGN
3628   "1:                                          \n"
3629     "vmovdqu    (%2),%%ymm0                    \n"
3630     "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
3631     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
3632     "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
3633     "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
3634     "vmovdqu    (%0,%2,1),%%ymm1               \n"
3635     "vmovdqu    (%1,%2,1),%%ymm2               \n"
3636     "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
3637     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
3638     "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
3639     "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
3640     "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
3641     "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
3642     "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
3643     "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
3644     "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
3645     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
3646     "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
3647     "vmovdqu    %%ymm0,(%3,%2,1)               \n"
3648     "lea        0x20(%2),%2                    \n"
3649     "sub        $0x20,%4                       \n"
3650     "jg        1b                              \n"
3651     "vzeroupper                                \n"
3652   : "+r"(src0),       // %0
3653     "+r"(src1),       // %1
3654     "+r"(alpha),      // %2
3655     "+r"(dst),        // %3
3656     "+rm"(width)      // %4
3657   :: "memory", "cc", "eax",
3658      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3659   );
3660 }
3661 #endif  // HAS_BLENDPLANEROW_AVX2
3662 
3663 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3664 // Shuffle table duplicating alpha
3665 static uvec8 kShuffleAlpha0 = {
3666   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3667 };
3668 static uvec8 kShuffleAlpha1 = {
3669   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3670   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
3671 };
3672 // Attenuate 4 pixels at a time.
ARGBAttenuateRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,int width)3673 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3674   asm volatile (
3675     "pcmpeqb   %%xmm3,%%xmm3                   \n"
3676     "pslld     $0x18,%%xmm3                    \n"
3677     "movdqa    %3,%%xmm4                       \n"
3678     "movdqa    %4,%%xmm5                       \n"
3679 
3680     // 4 pixel loop.
3681     LABELALIGN
3682   "1:                                          \n"
3683     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3684     "pshufb    %%xmm4,%%xmm0                   \n"
3685     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3686     "punpcklbw %%xmm1,%%xmm1                   \n"
3687     "pmulhuw   %%xmm1,%%xmm0                   \n"
3688     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3689     "pshufb    %%xmm5,%%xmm1                   \n"
3690     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3691     "punpckhbw %%xmm2,%%xmm2                   \n"
3692     "pmulhuw   %%xmm2,%%xmm1                   \n"
3693     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3694     "lea       " MEMLEA(0x10,0) ",%0           \n"
3695     "pand      %%xmm3,%%xmm2                   \n"
3696     "psrlw     $0x8,%%xmm0                     \n"
3697     "psrlw     $0x8,%%xmm1                     \n"
3698     "packuswb  %%xmm1,%%xmm0                   \n"
3699     "por       %%xmm2,%%xmm0                   \n"
3700     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3701     "lea       " MEMLEA(0x10,1) ",%1           \n"
3702     "sub       $0x4,%2                         \n"
3703     "jg        1b                              \n"
3704   : "+r"(src_argb),    // %0
3705     "+r"(dst_argb),    // %1
3706     "+r"(width)        // %2
3707   : "m"(kShuffleAlpha0),  // %3
3708     "m"(kShuffleAlpha1)  // %4
3709   : "memory", "cc"
3710     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3711   );
3712 }
3713 #endif  // HAS_ARGBATTENUATEROW_SSSE3
3714 
3715 #ifdef HAS_ARGBATTENUATEROW_AVX2
3716 // Shuffle table duplicating alpha.
3717 static const uvec8 kShuffleAlpha_AVX2 = {
3718   6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
3719 };
3720 // Attenuate 8 pixels at a time.
ARGBAttenuateRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,int width)3721 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3722   asm volatile (
3723     "vbroadcastf128 %3,%%ymm4                  \n"
3724     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3725     "vpslld     $0x18,%%ymm5,%%ymm5            \n"
3726     "sub        %0,%1                          \n"
3727 
3728     // 8 pixel loop.
3729     LABELALIGN
3730   "1:                                          \n"
3731     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3732     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3733     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
3734     "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
3735     "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
3736     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3737     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3738     "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
3739     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
3740     "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
3741     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3742     "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
3743     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3744     "lea       " MEMLEA(0x20,0) ",%0           \n"
3745     "sub        $0x8,%2                        \n"
3746     "jg        1b                              \n"
3747     "vzeroupper                                \n"
3748   : "+r"(src_argb),    // %0
3749     "+r"(dst_argb),    // %1
3750     "+r"(width)        // %2
3751   : "m"(kShuffleAlpha_AVX2)  // %3
3752   : "memory", "cc"
3753     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3754   );
3755 }
3756 #endif  // HAS_ARGBATTENUATEROW_AVX2
3757 
3758 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3759 // Unattenuate 4 pixels at a time.
ARGBUnattenuateRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,int width)3760 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3761                              int width) {
3762   uintptr_t alpha;
3763   asm volatile (
3764     // 4 pixel loop.
3765     LABELALIGN
3766   "1:                                          \n"
3767     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3768     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3769     "punpcklbw %%xmm0,%%xmm0                   \n"
3770     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3771     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3772     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
3773     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3774     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3775     "movlhps   %%xmm3,%%xmm2                   \n"
3776     "pmulhuw   %%xmm2,%%xmm0                   \n"
3777     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3778     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3779     "punpckhbw %%xmm1,%%xmm1                   \n"
3780     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3781     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3782     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
3783     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3784     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3785     "movlhps   %%xmm3,%%xmm2                   \n"
3786     "pmulhuw   %%xmm2,%%xmm1                   \n"
3787     "lea       " MEMLEA(0x10,0) ",%0           \n"
3788     "packuswb  %%xmm1,%%xmm0                   \n"
3789     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3790     "lea       " MEMLEA(0x10,1) ",%1           \n"
3791     "sub       $0x4,%2                         \n"
3792     "jg        1b                              \n"
3793   : "+r"(src_argb),     // %0
3794     "+r"(dst_argb),     // %1
3795     "+r"(width),        // %2
3796     "=&r"(alpha)        // %3
3797   : "r"(fixed_invtbl8)  // %4
3798   : "memory", "cc", NACL_R14
3799     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3800   );
3801 }
3802 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
3803 
3804 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3805 // Shuffle table duplicating alpha.
3806 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3807   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
3808 };
3809 // Unattenuate 8 pixels at a time.
ARGBUnattenuateRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,int width)3810 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3811                              int width) {
3812   uintptr_t alpha;
3813   asm volatile (
3814     "sub        %0,%1                          \n"
3815     "vbroadcastf128 %5,%%ymm5                  \n"
3816 
3817     // 8 pixel loop.
3818     LABELALIGN
3819   "1:                                          \n"
3820     // replace VPGATHER
3821     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3822     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3823     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3824     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3825     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3826     "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
3827     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3828     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3829     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3830     "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
3831     "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
3832     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3833     "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
3834     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3835     "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
3836     "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
3837     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3838     "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
3839     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3840     "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
3841     "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
3842     "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
3843     "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
3844     // end of VPGATHER
3845 
3846     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3847     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3848     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
3849     "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
3850     "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
3851     "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
3852     "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
3853     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3854     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3855     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3856     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3857     "lea       " MEMLEA(0x20,0) ",%0           \n"
3858     "sub        $0x8,%2                        \n"
3859     "jg        1b                              \n"
3860     "vzeroupper                                \n"
3861   : "+r"(src_argb),      // %0
3862     "+r"(dst_argb),      // %1
3863     "+r"(width),         // %2
3864     "=&r"(alpha)         // %3
3865   : "r"(fixed_invtbl8),  // %4
3866     "m"(kUnattenShuffleAlpha_AVX2)  // %5
3867   : "memory", "cc", NACL_R14
3868     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3869   );
3870 }
3871 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
3872 
3873 #ifdef HAS_ARGBGRAYROW_SSSE3
3874 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
ARGBGrayRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,int width)3875 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3876   asm volatile (
3877     "movdqa    %3,%%xmm4                       \n"
3878     "movdqa    %4,%%xmm5                       \n"
3879 
3880     // 8 pixel loop.
3881     LABELALIGN
3882   "1:                                          \n"
3883     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3884     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3885     "pmaddubsw %%xmm4,%%xmm0                   \n"
3886     "pmaddubsw %%xmm4,%%xmm1                   \n"
3887     "phaddw    %%xmm1,%%xmm0                   \n"
3888     "paddw     %%xmm5,%%xmm0                   \n"
3889     "psrlw     $0x7,%%xmm0                     \n"
3890     "packuswb  %%xmm0,%%xmm0                   \n"
3891     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3892     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
3893     "lea       " MEMLEA(0x20,0) ",%0           \n"
3894     "psrld     $0x18,%%xmm2                    \n"
3895     "psrld     $0x18,%%xmm3                    \n"
3896     "packuswb  %%xmm3,%%xmm2                   \n"
3897     "packuswb  %%xmm2,%%xmm2                   \n"
3898     "movdqa    %%xmm0,%%xmm3                   \n"
3899     "punpcklbw %%xmm0,%%xmm0                   \n"
3900     "punpcklbw %%xmm2,%%xmm3                   \n"
3901     "movdqa    %%xmm0,%%xmm1                   \n"
3902     "punpcklwd %%xmm3,%%xmm0                   \n"
3903     "punpckhwd %%xmm3,%%xmm1                   \n"
3904     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3905     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
3906     "lea       " MEMLEA(0x20,1) ",%1           \n"
3907     "sub       $0x8,%2                         \n"
3908     "jg        1b                              \n"
3909   : "+r"(src_argb),   // %0
3910     "+r"(dst_argb),   // %1
3911     "+r"(width)       // %2
3912   : "m"(kARGBToYJ),   // %3
3913     "m"(kAddYJ64)     // %4
3914   : "memory", "cc"
3915     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3916   );
3917 }
3918 #endif  // HAS_ARGBGRAYROW_SSSE3
3919 
3920 #ifdef HAS_ARGBSEPIAROW_SSSE3
3921 //    b = (r * 35 + g * 68 + b * 17) >> 7
3922 //    g = (r * 45 + g * 88 + b * 22) >> 7
3923 //    r = (r * 50 + g * 98 + b * 24) >> 7
3924 // Constant for ARGB color to sepia tone
3925 static vec8 kARGBToSepiaB = {
3926   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3927 };
3928 
3929 static vec8 kARGBToSepiaG = {
3930   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3931 };
3932 
3933 static vec8 kARGBToSepiaR = {
3934   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3935 };
3936 
3937 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
ARGBSepiaRow_SSSE3(uint8 * dst_argb,int width)3938 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3939   asm volatile (
3940     "movdqa    %2,%%xmm2                       \n"
3941     "movdqa    %3,%%xmm3                       \n"
3942     "movdqa    %4,%%xmm4                       \n"
3943 
3944     // 8 pixel loop.
3945     LABELALIGN
3946   "1:                                          \n"
3947     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3948     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
3949     "pmaddubsw %%xmm2,%%xmm0                   \n"
3950     "pmaddubsw %%xmm2,%%xmm6                   \n"
3951     "phaddw    %%xmm6,%%xmm0                   \n"
3952     "psrlw     $0x7,%%xmm0                     \n"
3953     "packuswb  %%xmm0,%%xmm0                   \n"
3954     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3955     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3956     "pmaddubsw %%xmm3,%%xmm5                   \n"
3957     "pmaddubsw %%xmm3,%%xmm1                   \n"
3958     "phaddw    %%xmm1,%%xmm5                   \n"
3959     "psrlw     $0x7,%%xmm5                     \n"
3960     "packuswb  %%xmm5,%%xmm5                   \n"
3961     "punpcklbw %%xmm5,%%xmm0                   \n"
3962     "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
3963     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3964     "pmaddubsw %%xmm4,%%xmm5                   \n"
3965     "pmaddubsw %%xmm4,%%xmm1                   \n"
3966     "phaddw    %%xmm1,%%xmm5                   \n"
3967     "psrlw     $0x7,%%xmm5                     \n"
3968     "packuswb  %%xmm5,%%xmm5                   \n"
3969     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
3970     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3971     "psrld     $0x18,%%xmm6                    \n"
3972     "psrld     $0x18,%%xmm1                    \n"
3973     "packuswb  %%xmm1,%%xmm6                   \n"
3974     "packuswb  %%xmm6,%%xmm6                   \n"
3975     "punpcklbw %%xmm6,%%xmm5                   \n"
3976     "movdqa    %%xmm0,%%xmm1                   \n"
3977     "punpcklwd %%xmm5,%%xmm0                   \n"
3978     "punpckhwd %%xmm5,%%xmm1                   \n"
3979     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
3980     "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
3981     "lea       " MEMLEA(0x20,0) ",%0           \n"
3982     "sub       $0x8,%1                         \n"
3983     "jg        1b                              \n"
3984   : "+r"(dst_argb),      // %0
3985     "+r"(width)          // %1
3986   : "m"(kARGBToSepiaB),  // %2
3987     "m"(kARGBToSepiaG),  // %3
3988     "m"(kARGBToSepiaR)   // %4
3989   : "memory", "cc"
3990     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3991   );
3992 }
3993 #endif  // HAS_ARGBSEPIAROW_SSSE3
3994 
3995 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3996 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
3997 // Same as Sepia except matrix is provided.
ARGBColorMatrixRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,const int8 * matrix_argb,int width)3998 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
3999                               const int8* matrix_argb, int width) {
4000   asm volatile (
4001     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
4002     "pshufd    $0x00,%%xmm5,%%xmm2             \n"
4003     "pshufd    $0x55,%%xmm5,%%xmm3             \n"
4004     "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
4005     "pshufd    $0xff,%%xmm5,%%xmm5             \n"
4006 
4007     // 8 pixel loop.
4008     LABELALIGN
4009   "1:                                          \n"
4010     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4011     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
4012     "pmaddubsw %%xmm2,%%xmm0                   \n"
4013     "pmaddubsw %%xmm2,%%xmm7                   \n"
4014     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
4015     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4016     "pmaddubsw %%xmm3,%%xmm6                   \n"
4017     "pmaddubsw %%xmm3,%%xmm1                   \n"
4018     "phaddsw   %%xmm7,%%xmm0                   \n"
4019     "phaddsw   %%xmm1,%%xmm6                   \n"
4020     "psraw     $0x6,%%xmm0                     \n"
4021     "psraw     $0x6,%%xmm6                     \n"
4022     "packuswb  %%xmm0,%%xmm0                   \n"
4023     "packuswb  %%xmm6,%%xmm6                   \n"
4024     "punpcklbw %%xmm6,%%xmm0                   \n"
4025     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
4026     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
4027     "pmaddubsw %%xmm4,%%xmm1                   \n"
4028     "pmaddubsw %%xmm4,%%xmm7                   \n"
4029     "phaddsw   %%xmm7,%%xmm1                   \n"
4030     "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
4031     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
4032     "pmaddubsw %%xmm5,%%xmm6                   \n"
4033     "pmaddubsw %%xmm5,%%xmm7                   \n"
4034     "phaddsw   %%xmm7,%%xmm6                   \n"
4035     "psraw     $0x6,%%xmm1                     \n"
4036     "psraw     $0x6,%%xmm6                     \n"
4037     "packuswb  %%xmm1,%%xmm1                   \n"
4038     "packuswb  %%xmm6,%%xmm6                   \n"
4039     "punpcklbw %%xmm6,%%xmm1                   \n"
4040     "movdqa    %%xmm0,%%xmm6                   \n"
4041     "punpcklwd %%xmm1,%%xmm0                   \n"
4042     "punpckhwd %%xmm1,%%xmm6                   \n"
4043     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4044     "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
4045     "lea       " MEMLEA(0x20,0) ",%0           \n"
4046     "lea       " MEMLEA(0x20,1) ",%1           \n"
4047     "sub       $0x8,%2                         \n"
4048     "jg        1b                              \n"
4049   : "+r"(src_argb),      // %0
4050     "+r"(dst_argb),      // %1
4051     "+r"(width)          // %2
4052   : "r"(matrix_argb)     // %3
4053   : "memory", "cc"
4054     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4055   );
4056 }
4057 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4058 
4059 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4060 // Quantize 4 ARGB pixels (16 bytes).
ARGBQuantizeRow_SSE2(uint8 * dst_argb,int scale,int interval_size,int interval_offset,int width)4061 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4062                           int interval_offset, int width) {
4063   asm volatile (
4064     "movd      %2,%%xmm2                       \n"
4065     "movd      %3,%%xmm3                       \n"
4066     "movd      %4,%%xmm4                       \n"
4067     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
4068     "pshufd    $0x44,%%xmm2,%%xmm2             \n"
4069     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
4070     "pshufd    $0x44,%%xmm3,%%xmm3             \n"
4071     "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
4072     "pshufd    $0x44,%%xmm4,%%xmm4             \n"
4073     "pxor      %%xmm5,%%xmm5                   \n"
4074     "pcmpeqb   %%xmm6,%%xmm6                   \n"
4075     "pslld     $0x18,%%xmm6                    \n"
4076 
4077     // 4 pixel loop.
4078     LABELALIGN
4079   "1:                                          \n"
4080     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4081     "punpcklbw %%xmm5,%%xmm0                   \n"
4082     "pmulhuw   %%xmm2,%%xmm0                   \n"
4083     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
4084     "punpckhbw %%xmm5,%%xmm1                   \n"
4085     "pmulhuw   %%xmm2,%%xmm1                   \n"
4086     "pmullw    %%xmm3,%%xmm0                   \n"
4087     "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
4088     "pmullw    %%xmm3,%%xmm1                   \n"
4089     "pand      %%xmm6,%%xmm7                   \n"
4090     "paddw     %%xmm4,%%xmm0                   \n"
4091     "paddw     %%xmm4,%%xmm1                   \n"
4092     "packuswb  %%xmm1,%%xmm0                   \n"
4093     "por       %%xmm7,%%xmm0                   \n"
4094     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
4095     "lea       " MEMLEA(0x10,0) ",%0           \n"
4096     "sub       $0x4,%1                         \n"
4097     "jg        1b                              \n"
4098   : "+r"(dst_argb),       // %0
4099     "+r"(width)           // %1
4100   : "r"(scale),           // %2
4101     "r"(interval_size),   // %3
4102     "r"(interval_offset)  // %4
4103   : "memory", "cc"
4104     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4105   );
4106 }
4107 #endif  // HAS_ARGBQUANTIZEROW_SSE2
4108 
4109 #ifdef HAS_ARGBSHADEROW_SSE2
4110 // Shade 4 pixels at a time by specified value.
ARGBShadeRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)4111 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4112                        uint32 value) {
4113   asm volatile (
4114     "movd      %3,%%xmm2                       \n"
4115     "punpcklbw %%xmm2,%%xmm2                   \n"
4116     "punpcklqdq %%xmm2,%%xmm2                  \n"
4117 
4118     // 4 pixel loop.
4119     LABELALIGN
4120   "1:                                          \n"
4121     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4122     "lea       " MEMLEA(0x10,0) ",%0           \n"
4123     "movdqa    %%xmm0,%%xmm1                   \n"
4124     "punpcklbw %%xmm0,%%xmm0                   \n"
4125     "punpckhbw %%xmm1,%%xmm1                   \n"
4126     "pmulhuw   %%xmm2,%%xmm0                   \n"
4127     "pmulhuw   %%xmm2,%%xmm1                   \n"
4128     "psrlw     $0x8,%%xmm0                     \n"
4129     "psrlw     $0x8,%%xmm1                     \n"
4130     "packuswb  %%xmm1,%%xmm0                   \n"
4131     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4132     "lea       " MEMLEA(0x10,1) ",%1           \n"
4133     "sub       $0x4,%2                         \n"
4134     "jg        1b                              \n"
4135   : "+r"(src_argb),  // %0
4136     "+r"(dst_argb),  // %1
4137     "+r"(width)      // %2
4138   : "r"(value)       // %3
4139   : "memory", "cc"
4140     , "xmm0", "xmm1", "xmm2"
4141   );
4142 }
4143 #endif  // HAS_ARGBSHADEROW_SSE2
4144 
4145 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4146 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
ARGBMultiplyRow_SSE2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4147 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4148                           uint8* dst_argb, int width) {
4149   asm volatile (
4150     "pxor      %%xmm5,%%xmm5                  \n"
4151 
4152     // 4 pixel loop.
4153     LABELALIGN
4154   "1:                                          \n"
4155     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4156     "lea       " MEMLEA(0x10,0) ",%0           \n"
4157     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
4158     "lea       " MEMLEA(0x10,1) ",%1           \n"
4159     "movdqu    %%xmm0,%%xmm1                   \n"
4160     "movdqu    %%xmm2,%%xmm3                   \n"
4161     "punpcklbw %%xmm0,%%xmm0                   \n"
4162     "punpckhbw %%xmm1,%%xmm1                   \n"
4163     "punpcklbw %%xmm5,%%xmm2                   \n"
4164     "punpckhbw %%xmm5,%%xmm3                   \n"
4165     "pmulhuw   %%xmm2,%%xmm0                   \n"
4166     "pmulhuw   %%xmm3,%%xmm1                   \n"
4167     "packuswb  %%xmm1,%%xmm0                   \n"
4168     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4169     "lea       " MEMLEA(0x10,2) ",%2           \n"
4170     "sub       $0x4,%3                         \n"
4171     "jg        1b                              \n"
4172   : "+r"(src_argb0),  // %0
4173     "+r"(src_argb1),  // %1
4174     "+r"(dst_argb),   // %2
4175     "+r"(width)       // %3
4176   :
4177   : "memory", "cc"
4178     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4179   );
4180 }
4181 #endif  // HAS_ARGBMULTIPLYROW_SSE2
4182 
4183 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4184 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_AVX2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4185 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4186                           uint8* dst_argb, int width) {
4187   asm volatile (
4188     "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
4189 
4190     // 4 pixel loop.
4191     LABELALIGN
4192   "1:                                          \n"
4193     "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
4194     "lea        " MEMLEA(0x20,0) ",%0          \n"
4195     "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
4196     "lea        " MEMLEA(0x20,1) ",%1          \n"
4197     "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
4198     "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
4199     "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
4200     "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
4201     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
4202     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
4203     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
4204     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4205     "lea       " MEMLEA(0x20,2) ",%2           \n"
4206     "sub        $0x8,%3                        \n"
4207     "jg        1b                              \n"
4208     "vzeroupper                                \n"
4209   : "+r"(src_argb0),  // %0
4210     "+r"(src_argb1),  // %1
4211     "+r"(dst_argb),   // %2
4212     "+r"(width)       // %3
4213   :
4214   : "memory", "cc"
4215 #if defined(__AVX2__)
4216     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4217 #endif
4218   );
4219 }
4220 #endif  // HAS_ARGBMULTIPLYROW_AVX2
4221 
4222 #ifdef HAS_ARGBADDROW_SSE2
4223 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
ARGBAddRow_SSE2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4224 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4225                      uint8* dst_argb, int width) {
4226   asm volatile (
4227     // 4 pixel loop.
4228     LABELALIGN
4229   "1:                                          \n"
4230     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4231     "lea       " MEMLEA(0x10,0) ",%0           \n"
4232     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4233     "lea       " MEMLEA(0x10,1) ",%1           \n"
4234     "paddusb   %%xmm1,%%xmm0                   \n"
4235     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4236     "lea       " MEMLEA(0x10,2) ",%2           \n"
4237     "sub       $0x4,%3                         \n"
4238     "jg        1b                              \n"
4239   : "+r"(src_argb0),  // %0
4240     "+r"(src_argb1),  // %1
4241     "+r"(dst_argb),   // %2
4242     "+r"(width)       // %3
4243   :
4244   : "memory", "cc"
4245     , "xmm0", "xmm1"
4246   );
4247 }
4248 #endif  // HAS_ARGBADDROW_SSE2
4249 
4250 #ifdef HAS_ARGBADDROW_AVX2
4251 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
ARGBAddRow_AVX2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4252 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4253                      uint8* dst_argb, int width) {
4254   asm volatile (
4255     // 4 pixel loop.
4256     LABELALIGN
4257   "1:                                          \n"
4258     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4259     "lea        " MEMLEA(0x20,0) ",%0          \n"
4260     "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4261     "lea        " MEMLEA(0x20,1) ",%1          \n"
4262     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4263     "lea        " MEMLEA(0x20,2) ",%2          \n"
4264     "sub        $0x8,%3                        \n"
4265     "jg        1b                              \n"
4266     "vzeroupper                                \n"
4267   : "+r"(src_argb0),  // %0
4268     "+r"(src_argb1),  // %1
4269     "+r"(dst_argb),   // %2
4270     "+r"(width)       // %3
4271   :
4272   : "memory", "cc"
4273     , "xmm0"
4274   );
4275 }
4276 #endif  // HAS_ARGBADDROW_AVX2
4277 
4278 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4279 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
ARGBSubtractRow_SSE2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4280 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4281                           uint8* dst_argb, int width) {
4282   asm volatile (
4283     // 4 pixel loop.
4284     LABELALIGN
4285   "1:                                          \n"
4286     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4287     "lea       " MEMLEA(0x10,0) ",%0           \n"
4288     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4289     "lea       " MEMLEA(0x10,1) ",%1           \n"
4290     "psubusb   %%xmm1,%%xmm0                   \n"
4291     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4292     "lea       " MEMLEA(0x10,2) ",%2           \n"
4293     "sub       $0x4,%3                         \n"
4294     "jg        1b                              \n"
4295   : "+r"(src_argb0),  // %0
4296     "+r"(src_argb1),  // %1
4297     "+r"(dst_argb),   // %2
4298     "+r"(width)       // %3
4299   :
4300   : "memory", "cc"
4301     , "xmm0", "xmm1"
4302   );
4303 }
4304 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4305 
4306 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4307 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_AVX2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4308 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4309                           uint8* dst_argb, int width) {
4310   asm volatile (
4311     // 4 pixel loop.
4312     LABELALIGN
4313   "1:                                          \n"
4314     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4315     "lea        " MEMLEA(0x20,0) ",%0          \n"
4316     "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4317     "lea        " MEMLEA(0x20,1) ",%1          \n"
4318     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4319     "lea        " MEMLEA(0x20,2) ",%2          \n"
4320     "sub        $0x8,%3                        \n"
4321     "jg        1b                              \n"
4322     "vzeroupper                                \n"
4323   : "+r"(src_argb0),  // %0
4324     "+r"(src_argb1),  // %1
4325     "+r"(dst_argb),   // %2
4326     "+r"(width)       // %3
4327   :
4328   : "memory", "cc"
4329     , "xmm0"
4330   );
4331 }
4332 #endif  // HAS_ARGBSUBTRACTROW_AVX2
4333 
4334 #ifdef HAS_SOBELXROW_SSE2
4335 // SobelX as a matrix is
4336 // -1  0  1
4337 // -2  0  2
4338 // -1  0  1
SobelXRow_SSE2(const uint8 * src_y0,const uint8 * src_y1,const uint8 * src_y2,uint8 * dst_sobelx,int width)4339 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4340                     const uint8* src_y2, uint8* dst_sobelx, int width) {
4341   asm volatile (
4342     "sub       %0,%1                           \n"
4343     "sub       %0,%2                           \n"
4344     "sub       %0,%3                           \n"
4345     "pxor      %%xmm5,%%xmm5                   \n"
4346 
4347     // 8 pixel loop.
4348     LABELALIGN
4349   "1:                                          \n"
4350     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4351     "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
4352     "punpcklbw %%xmm5,%%xmm0                   \n"
4353     "punpcklbw %%xmm5,%%xmm1                   \n"
4354     "psubw     %%xmm1,%%xmm0                   \n"
4355     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4356     MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
4357     "punpcklbw %%xmm5,%%xmm1                   \n"
4358     "punpcklbw %%xmm5,%%xmm2                   \n"
4359     "psubw     %%xmm2,%%xmm1                   \n"
4360     MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
4361     MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
4362     "punpcklbw %%xmm5,%%xmm2                   \n"
4363     "punpcklbw %%xmm5,%%xmm3                   \n"
4364     "psubw     %%xmm3,%%xmm2                   \n"
4365     "paddw     %%xmm2,%%xmm0                   \n"
4366     "paddw     %%xmm1,%%xmm0                   \n"
4367     "paddw     %%xmm1,%%xmm0                   \n"
4368     "pxor      %%xmm1,%%xmm1                   \n"
4369     "psubw     %%xmm0,%%xmm1                   \n"
4370     "pmaxsw    %%xmm1,%%xmm0                   \n"
4371     "packuswb  %%xmm0,%%xmm0                   \n"
4372     MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
4373     "lea       " MEMLEA(0x8,0) ",%0            \n"
4374     "sub       $0x8,%4                         \n"
4375     "jg        1b                              \n"
4376   : "+r"(src_y0),      // %0
4377     "+r"(src_y1),      // %1
4378     "+r"(src_y2),      // %2
4379     "+r"(dst_sobelx),  // %3
4380     "+r"(width)        // %4
4381   :
4382   : "memory", "cc", NACL_R14
4383     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4384   );
4385 }
4386 #endif  // HAS_SOBELXROW_SSE2
4387 
4388 #ifdef HAS_SOBELYROW_SSE2
4389 // SobelY as a matrix is
4390 // -1 -2 -1
4391 //  0  0  0
4392 //  1  2  1
SobelYRow_SSE2(const uint8 * src_y0,const uint8 * src_y1,uint8 * dst_sobely,int width)4393 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4394                     uint8* dst_sobely, int width) {
4395   asm volatile (
4396     "sub       %0,%1                           \n"
4397     "sub       %0,%2                           \n"
4398     "pxor      %%xmm5,%%xmm5                   \n"
4399 
4400     // 8 pixel loop.
4401     LABELALIGN
4402   "1:                                          \n"
4403     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4404     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4405     "punpcklbw %%xmm5,%%xmm0                   \n"
4406     "punpcklbw %%xmm5,%%xmm1                   \n"
4407     "psubw     %%xmm1,%%xmm0                   \n"
4408     "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
4409     MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
4410     "punpcklbw %%xmm5,%%xmm1                   \n"
4411     "punpcklbw %%xmm5,%%xmm2                   \n"
4412     "psubw     %%xmm2,%%xmm1                   \n"
4413     "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
4414     MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
4415     "punpcklbw %%xmm5,%%xmm2                   \n"
4416     "punpcklbw %%xmm5,%%xmm3                   \n"
4417     "psubw     %%xmm3,%%xmm2                   \n"
4418     "paddw     %%xmm2,%%xmm0                   \n"
4419     "paddw     %%xmm1,%%xmm0                   \n"
4420     "paddw     %%xmm1,%%xmm0                   \n"
4421     "pxor      %%xmm1,%%xmm1                   \n"
4422     "psubw     %%xmm0,%%xmm1                   \n"
4423     "pmaxsw    %%xmm1,%%xmm0                   \n"
4424     "packuswb  %%xmm0,%%xmm0                   \n"
4425     MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
4426     "lea       " MEMLEA(0x8,0) ",%0            \n"
4427     "sub       $0x8,%3                         \n"
4428     "jg        1b                              \n"
4429   : "+r"(src_y0),      // %0
4430     "+r"(src_y1),      // %1
4431     "+r"(dst_sobely),  // %2
4432     "+r"(width)        // %3
4433   :
4434   : "memory", "cc", NACL_R14
4435     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4436   );
4437 }
4438 #endif  // HAS_SOBELYROW_SSE2
4439 
4440 #ifdef HAS_SOBELROW_SSE2
4441 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4442 // A = 255
4443 // R = Sobel
4444 // G = Sobel
4445 // B = Sobel
SobelRow_SSE2(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)4446 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4447                    uint8* dst_argb, int width) {
4448   asm volatile (
4449     "sub       %0,%1                           \n"
4450     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4451     "pslld     $0x18,%%xmm5                    \n"
4452 
4453     // 8 pixel loop.
4454     LABELALIGN
4455   "1:                                          \n"
4456     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4457     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4458     "lea       " MEMLEA(0x10,0) ",%0           \n"
4459     "paddusb   %%xmm1,%%xmm0                   \n"
4460     "movdqa    %%xmm0,%%xmm2                   \n"
4461     "punpcklbw %%xmm0,%%xmm2                   \n"
4462     "punpckhbw %%xmm0,%%xmm0                   \n"
4463     "movdqa    %%xmm2,%%xmm1                   \n"
4464     "punpcklwd %%xmm2,%%xmm1                   \n"
4465     "punpckhwd %%xmm2,%%xmm2                   \n"
4466     "por       %%xmm5,%%xmm1                   \n"
4467     "por       %%xmm5,%%xmm2                   \n"
4468     "movdqa    %%xmm0,%%xmm3                   \n"
4469     "punpcklwd %%xmm0,%%xmm3                   \n"
4470     "punpckhwd %%xmm0,%%xmm0                   \n"
4471     "por       %%xmm5,%%xmm3                   \n"
4472     "por       %%xmm5,%%xmm0                   \n"
4473     "movdqu    %%xmm1," MEMACCESS(2) "         \n"
4474     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
4475     "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
4476     "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
4477     "lea       " MEMLEA(0x40,2) ",%2           \n"
4478     "sub       $0x10,%3                        \n"
4479     "jg        1b                              \n"
4480   : "+r"(src_sobelx),  // %0
4481     "+r"(src_sobely),  // %1
4482     "+r"(dst_argb),    // %2
4483     "+r"(width)        // %3
4484   :
4485   : "memory", "cc", NACL_R14
4486     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4487   );
4488 }
4489 #endif  // HAS_SOBELROW_SSE2
4490 
4491 #ifdef HAS_SOBELTOPLANEROW_SSE2
4492 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
SobelToPlaneRow_SSE2(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)4493 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4494                           uint8* dst_y, int width) {
4495   asm volatile (
4496     "sub       %0,%1                           \n"
4497     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4498     "pslld     $0x18,%%xmm5                    \n"
4499 
4500     // 8 pixel loop.
4501     LABELALIGN
4502   "1:                                          \n"
4503     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4504     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4505     "lea       " MEMLEA(0x10,0) ",%0           \n"
4506     "paddusb   %%xmm1,%%xmm0                   \n"
4507     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4508     "lea       " MEMLEA(0x10,2) ",%2           \n"
4509     "sub       $0x10,%3                        \n"
4510     "jg        1b                              \n"
4511   : "+r"(src_sobelx),  // %0
4512     "+r"(src_sobely),  // %1
4513     "+r"(dst_y),       // %2
4514     "+r"(width)        // %3
4515   :
4516   : "memory", "cc", NACL_R14
4517     "xmm0", "xmm1"
4518   );
4519 }
4520 #endif  // HAS_SOBELTOPLANEROW_SSE2
4521 
4522 #ifdef HAS_SOBELXYROW_SSE2
4523 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4524 // A = 255
4525 // R = Sobel X
4526 // G = Sobel
4527 // B = Sobel Y
SobelXYRow_SSE2(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)4528 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4529                      uint8* dst_argb, int width) {
4530   asm volatile (
4531     "sub       %0,%1                           \n"
4532     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4533 
4534     // 8 pixel loop.
4535     LABELALIGN
4536   "1:                                          \n"
4537     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4538     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4539     "lea       " MEMLEA(0x10,0) ",%0           \n"
4540     "movdqa    %%xmm0,%%xmm2                   \n"
4541     "paddusb   %%xmm1,%%xmm2                   \n"
4542     "movdqa    %%xmm0,%%xmm3                   \n"
4543     "punpcklbw %%xmm5,%%xmm3                   \n"
4544     "punpckhbw %%xmm5,%%xmm0                   \n"
4545     "movdqa    %%xmm1,%%xmm4                   \n"
4546     "punpcklbw %%xmm2,%%xmm4                   \n"
4547     "punpckhbw %%xmm2,%%xmm1                   \n"
4548     "movdqa    %%xmm4,%%xmm6                   \n"
4549     "punpcklwd %%xmm3,%%xmm6                   \n"
4550     "punpckhwd %%xmm3,%%xmm4                   \n"
4551     "movdqa    %%xmm1,%%xmm7                   \n"
4552     "punpcklwd %%xmm0,%%xmm7                   \n"
4553     "punpckhwd %%xmm0,%%xmm1                   \n"
4554     "movdqu    %%xmm6," MEMACCESS(2) "         \n"
4555     "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
4556     "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
4557     "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
4558     "lea       " MEMLEA(0x40,2) ",%2           \n"
4559     "sub       $0x10,%3                        \n"
4560     "jg        1b                              \n"
4561   : "+r"(src_sobelx),  // %0
4562     "+r"(src_sobely),  // %1
4563     "+r"(dst_argb),    // %2
4564     "+r"(width)        // %3
4565   :
4566   : "memory", "cc", NACL_R14
4567     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4568   );
4569 }
4570 #endif  // HAS_SOBELXYROW_SSE2
4571 
4572 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4573 // Creates a table of cumulative sums where each value is a sum of all values
4574 // above and to the left of the value, inclusive of the value.
ComputeCumulativeSumRow_SSE2(const uint8 * row,int32 * cumsum,const int32 * previous_cumsum,int width)4575 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
4576                                   const int32* previous_cumsum, int width) {
4577   asm volatile (
4578     "pxor      %%xmm0,%%xmm0                   \n"
4579     "pxor      %%xmm1,%%xmm1                   \n"
4580     "sub       $0x4,%3                         \n"
4581     "jl        49f                             \n"
4582     "test      $0xf,%1                         \n"
4583     "jne       49f                             \n"
4584 
4585   // 4 pixel loop                              \n"
4586     LABELALIGN
4587   "40:                                         \n"
4588     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
4589     "lea       " MEMLEA(0x10,0) ",%0           \n"
4590     "movdqa    %%xmm2,%%xmm4                   \n"
4591     "punpcklbw %%xmm1,%%xmm2                   \n"
4592     "movdqa    %%xmm2,%%xmm3                   \n"
4593     "punpcklwd %%xmm1,%%xmm2                   \n"
4594     "punpckhwd %%xmm1,%%xmm3                   \n"
4595     "punpckhbw %%xmm1,%%xmm4                   \n"
4596     "movdqa    %%xmm4,%%xmm5                   \n"
4597     "punpcklwd %%xmm1,%%xmm4                   \n"
4598     "punpckhwd %%xmm1,%%xmm5                   \n"
4599     "paddd     %%xmm2,%%xmm0                   \n"
4600     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4601     "paddd     %%xmm0,%%xmm2                   \n"
4602     "paddd     %%xmm3,%%xmm0                   \n"
4603     "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
4604     "paddd     %%xmm0,%%xmm3                   \n"
4605     "paddd     %%xmm4,%%xmm0                   \n"
4606     "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
4607     "paddd     %%xmm0,%%xmm4                   \n"
4608     "paddd     %%xmm5,%%xmm0                   \n"
4609     "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
4610     "lea       " MEMLEA(0x40,2) ",%2           \n"
4611     "paddd     %%xmm0,%%xmm5                   \n"
4612     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4613     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
4614     "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
4615     "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
4616     "lea       " MEMLEA(0x40,1) ",%1           \n"
4617     "sub       $0x4,%3                         \n"
4618     "jge       40b                             \n"
4619 
4620   "49:                                         \n"
4621     "add       $0x3,%3                         \n"
4622     "jl        19f                             \n"
4623 
4624   // 1 pixel loop                              \n"
4625     LABELALIGN
4626   "10:                                         \n"
4627     "movd      " MEMACCESS(0) ",%%xmm2         \n"
4628     "lea       " MEMLEA(0x4,0) ",%0            \n"
4629     "punpcklbw %%xmm1,%%xmm2                   \n"
4630     "punpcklwd %%xmm1,%%xmm2                   \n"
4631     "paddd     %%xmm2,%%xmm0                   \n"
4632     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4633     "lea       " MEMLEA(0x10,2) ",%2           \n"
4634     "paddd     %%xmm0,%%xmm2                   \n"
4635     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4636     "lea       " MEMLEA(0x10,1) ",%1           \n"
4637     "sub       $0x1,%3                         \n"
4638     "jge       10b                             \n"
4639 
4640   "19:                                         \n"
4641   : "+r"(row),  // %0
4642     "+r"(cumsum),  // %1
4643     "+r"(previous_cumsum),  // %2
4644     "+r"(width)  // %3
4645   :
4646   : "memory", "cc"
4647     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4648   );
4649 }
4650 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4651 
4652 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
CumulativeSumToAverageRow_SSE2(const int32 * topleft,const int32 * botleft,int width,int area,uint8 * dst,int count)4653 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4654                                     int width, int area, uint8* dst,
4655                                     int count) {
4656   asm volatile (
4657     "movd      %5,%%xmm5                       \n"
4658     "cvtdq2ps  %%xmm5,%%xmm5                   \n"
4659     "rcpss     %%xmm5,%%xmm4                   \n"
4660     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
4661     "sub       $0x4,%3                         \n"
4662     "jl        49f                             \n"
4663     "cmpl      $0x80,%5                        \n"
4664     "ja        40f                             \n"
4665 
4666     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4667     "pcmpeqb   %%xmm6,%%xmm6                   \n"
4668     "psrld     $0x10,%%xmm6                    \n"
4669     "cvtdq2ps  %%xmm6,%%xmm6                   \n"
4670     "addps     %%xmm6,%%xmm5                   \n"
4671     "mulps     %%xmm4,%%xmm5                   \n"
4672     "cvtps2dq  %%xmm5,%%xmm5                   \n"
4673     "packssdw  %%xmm5,%%xmm5                   \n"
4674 
4675   // 4 pixel small loop                        \n"
4676     LABELALIGN
4677   "4:                                         \n"
4678     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4679     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4680     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4681     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4682     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4683     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4684     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4685     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4686     "lea       " MEMLEA(0x40,0) ",%0           \n"
4687     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4688     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4689     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4690     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4691     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4692     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4693     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4694     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4695     "lea       " MEMLEA(0x40,1) ",%1           \n"
4696     "packssdw  %%xmm1,%%xmm0                   \n"
4697     "packssdw  %%xmm3,%%xmm2                   \n"
4698     "pmulhuw   %%xmm5,%%xmm0                   \n"
4699     "pmulhuw   %%xmm5,%%xmm2                   \n"
4700     "packuswb  %%xmm2,%%xmm0                   \n"
4701     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4702     "lea       " MEMLEA(0x10,2) ",%2           \n"
4703     "sub       $0x4,%3                         \n"
4704     "jge       4b                              \n"
4705     "jmp       49f                             \n"
4706 
4707   // 4 pixel loop                              \n"
4708     LABELALIGN
4709   "40:                                         \n"
4710     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4711     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4712     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4713     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4714     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4715     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4716     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4717     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4718     "lea       " MEMLEA(0x40,0) ",%0           \n"
4719     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4720     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4721     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4722     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4723     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4724     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4725     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4726     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4727     "lea       " MEMLEA(0x40,1) ",%1           \n"
4728     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4729     "cvtdq2ps  %%xmm1,%%xmm1                   \n"
4730     "mulps     %%xmm4,%%xmm0                   \n"
4731     "mulps     %%xmm4,%%xmm1                   \n"
4732     "cvtdq2ps  %%xmm2,%%xmm2                   \n"
4733     "cvtdq2ps  %%xmm3,%%xmm3                   \n"
4734     "mulps     %%xmm4,%%xmm2                   \n"
4735     "mulps     %%xmm4,%%xmm3                   \n"
4736     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4737     "cvtps2dq  %%xmm1,%%xmm1                   \n"
4738     "cvtps2dq  %%xmm2,%%xmm2                   \n"
4739     "cvtps2dq  %%xmm3,%%xmm3                   \n"
4740     "packssdw  %%xmm1,%%xmm0                   \n"
4741     "packssdw  %%xmm3,%%xmm2                   \n"
4742     "packuswb  %%xmm2,%%xmm0                   \n"
4743     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4744     "lea       " MEMLEA(0x10,2) ",%2           \n"
4745     "sub       $0x4,%3                         \n"
4746     "jge       40b                             \n"
4747 
4748   "49:                                         \n"
4749     "add       $0x3,%3                         \n"
4750     "jl        19f                             \n"
4751 
4752   // 1 pixel loop                              \n"
4753     LABELALIGN
4754   "10:                                         \n"
4755     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4756     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4757     "lea       " MEMLEA(0x10,0) ",%0           \n"
4758     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4759     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4760     "lea       " MEMLEA(0x10,1) ",%1           \n"
4761     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4762     "mulps     %%xmm4,%%xmm0                   \n"
4763     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4764     "packssdw  %%xmm0,%%xmm0                   \n"
4765     "packuswb  %%xmm0,%%xmm0                   \n"
4766     "movd      %%xmm0," MEMACCESS(2) "         \n"
4767     "lea       " MEMLEA(0x4,2) ",%2            \n"
4768     "sub       $0x1,%3                         \n"
4769     "jge       10b                             \n"
4770   "19:                                         \n"
4771   : "+r"(topleft),  // %0
4772     "+r"(botleft),  // %1
4773     "+r"(dst),      // %2
4774     "+rm"(count)    // %3
4775   : "r"((intptr_t)(width)),  // %4
4776     "rm"(area)     // %5
4777   : "memory", "cc", NACL_R14
4778     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4779   );
4780 }
4781 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4782 
4783 #ifdef HAS_ARGBAFFINEROW_SSE2
4784 // Copy ARGB pixels from source image with slope to a row of destination.
4785 LIBYUV_API
ARGBAffineRow_SSE2(const uint8 * src_argb,int src_argb_stride,uint8 * dst_argb,const float * src_dudv,int width)4786 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4787                         uint8* dst_argb, const float* src_dudv, int width) {
4788   intptr_t src_argb_stride_temp = src_argb_stride;
4789   intptr_t temp;
4790   asm volatile (
4791     "movq      " MEMACCESS(3) ",%%xmm2         \n"
4792     "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
4793     "shl       $0x10,%1                        \n"
4794     "add       $0x4,%1                         \n"
4795     "movd      %1,%%xmm5                       \n"
4796     "sub       $0x4,%4                         \n"
4797     "jl        49f                             \n"
4798 
4799     "pshufd    $0x44,%%xmm7,%%xmm7             \n"
4800     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4801     "movdqa    %%xmm2,%%xmm0                   \n"
4802     "addps     %%xmm7,%%xmm0                   \n"
4803     "movlhps   %%xmm0,%%xmm2                   \n"
4804     "movdqa    %%xmm7,%%xmm4                   \n"
4805     "addps     %%xmm4,%%xmm4                   \n"
4806     "movdqa    %%xmm2,%%xmm3                   \n"
4807     "addps     %%xmm4,%%xmm3                   \n"
4808     "addps     %%xmm4,%%xmm4                   \n"
4809 
4810   // 4 pixel loop                              \n"
4811     LABELALIGN
4812   "40:                                         \n"
4813     "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
4814     "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
4815     "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
4816     "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
4817     "movd      %%xmm0,%k1                      \n"
4818     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4819     "movd      %%xmm0,%k5                      \n"
4820     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4821     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
4822     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4823     "punpckldq %%xmm6,%%xmm1                   \n"
4824     "addps     %%xmm4,%%xmm2                   \n"
4825     "movq      %%xmm1," MEMACCESS(2) "         \n"
4826     "movd      %%xmm0,%k1                      \n"
4827     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4828     "movd      %%xmm0,%k5                      \n"
4829     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4830     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4831     "punpckldq %%xmm6,%%xmm0                   \n"
4832     "addps     %%xmm4,%%xmm3                   \n"
4833     "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
4834     "lea       " MEMLEA(0x10,2) ",%2           \n"
4835     "sub       $0x4,%4                         \n"
4836     "jge       40b                             \n"
4837 
4838   "49:                                         \n"
4839     "add       $0x3,%4                         \n"
4840     "jl        19f                             \n"
4841 
4842   // 1 pixel loop                              \n"
4843     LABELALIGN
4844   "10:                                         \n"
4845     "cvttps2dq %%xmm2,%%xmm0                   \n"
4846     "packssdw  %%xmm0,%%xmm0                   \n"
4847     "pmaddwd   %%xmm5,%%xmm0                   \n"
4848     "addps     %%xmm7,%%xmm2                   \n"
4849     "movd      %%xmm0,%k1                      \n"
4850     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4851     "movd      %%xmm0," MEMACCESS(2) "         \n"
4852     "lea       " MEMLEA(0x04,2) ",%2           \n"
4853     "sub       $0x1,%4                         \n"
4854     "jge       10b                             \n"
4855   "19:                                         \n"
4856   : "+r"(src_argb),  // %0
4857     "+r"(src_argb_stride_temp),  // %1
4858     "+r"(dst_argb),  // %2
4859     "+r"(src_dudv),  // %3
4860     "+rm"(width),    // %4
4861     "=&r"(temp)      // %5
4862   :
4863   : "memory", "cc", NACL_R14
4864     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4865   );
4866 }
4867 #endif  // HAS_ARGBAFFINEROW_SSE2
4868 
4869 #ifdef HAS_INTERPOLATEROW_SSSE3
4870 // Bilinear filter 16x2 -> 16x1
InterpolateRow_SSSE3(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)4871 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4872                           ptrdiff_t src_stride, int dst_width,
4873                           int source_y_fraction) {
4874   asm volatile (
4875     "sub       %1,%0                           \n"
4876     "cmp       $0x0,%3                         \n"
4877     "je        100f                            \n"
4878     "cmp       $0x80,%3                        \n"
4879     "je        50f                             \n"
4880 
4881     "movd      %3,%%xmm0                       \n"
4882     "neg       %3                              \n"
4883     "add       $0x100,%3                       \n"
4884     "movd      %3,%%xmm5                       \n"
4885     "punpcklbw %%xmm0,%%xmm5                   \n"
4886     "punpcklwd %%xmm5,%%xmm5                   \n"
4887     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4888     "mov       $0x80808080,%%eax               \n"
4889     "movd      %%eax,%%xmm4                    \n"
4890     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
4891 
4892     // General purpose row blend.
4893     LABELALIGN
4894   "1:                                          \n"
4895     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4896     MEMOPREG(movdqu,0x00,1,4,1,xmm2)
4897     "movdqa     %%xmm0,%%xmm1                  \n"
4898     "punpcklbw  %%xmm2,%%xmm0                  \n"
4899     "punpckhbw  %%xmm2,%%xmm1                  \n"
4900     "psubb      %%xmm4,%%xmm0                  \n"
4901     "psubb      %%xmm4,%%xmm1                  \n"
4902     "movdqa     %%xmm5,%%xmm2                  \n"
4903     "movdqa     %%xmm5,%%xmm3                  \n"
4904     "pmaddubsw  %%xmm0,%%xmm2                  \n"
4905     "pmaddubsw  %%xmm1,%%xmm3                  \n"
4906     "paddw      %%xmm4,%%xmm2                  \n"
4907     "paddw      %%xmm4,%%xmm3                  \n"
4908     "psrlw      $0x8,%%xmm2                    \n"
4909     "psrlw      $0x8,%%xmm3                    \n"
4910     "packuswb   %%xmm3,%%xmm2                  \n"
4911     MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
4912     "lea       " MEMLEA(0x10,1) ",%1           \n"
4913     "sub       $0x10,%2                        \n"
4914     "jg        1b                              \n"
4915     "jmp       99f                             \n"
4916 
4917     // Blend 50 / 50.
4918     LABELALIGN
4919   "50:                                         \n"
4920     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4921     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4922     "pavgb     %%xmm1,%%xmm0                   \n"
4923     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4924     "lea       " MEMLEA(0x10,1) ",%1           \n"
4925     "sub       $0x10,%2                        \n"
4926     "jg        50b                             \n"
4927     "jmp       99f                             \n"
4928 
4929     // Blend 100 / 0 - Copy row unchanged.
4930     LABELALIGN
4931   "100:                                        \n"
4932     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4933     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4934     "lea       " MEMLEA(0x10,1) ",%1           \n"
4935     "sub       $0x10,%2                        \n"
4936     "jg        100b                            \n"
4937 
4938   "99:                                         \n"
4939   : "+r"(dst_ptr),     // %0
4940     "+r"(src_ptr),     // %1
4941     "+rm"(dst_width),  // %2
4942     "+r"(source_y_fraction)  // %3
4943   : "r"((intptr_t)(src_stride))  // %4
4944   : "memory", "cc", "eax", NACL_R14
4945     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4946   );
4947 }
4948 #endif  // HAS_INTERPOLATEROW_SSSE3
4949 
4950 #ifdef HAS_INTERPOLATEROW_AVX2
4951 // Bilinear filter 32x2 -> 32x1
InterpolateRow_AVX2(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)4952 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
4953                          ptrdiff_t src_stride, int dst_width,
4954                          int source_y_fraction) {
4955   asm volatile (
4956     "cmp       $0x0,%3                         \n"
4957     "je        100f                            \n"
4958     "sub       %1,%0                           \n"
4959     "cmp       $0x80,%3                        \n"
4960     "je        50f                             \n"
4961 
4962     "vmovd      %3,%%xmm0                      \n"
4963     "neg        %3                             \n"
4964     "add        $0x100,%3                      \n"
4965     "vmovd      %3,%%xmm5                      \n"
4966     "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
4967     "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
4968     "vbroadcastss %%xmm5,%%ymm5                \n"
4969     "mov        $0x80808080,%%eax              \n"
4970     "vmovd      %%eax,%%xmm4                   \n"
4971     "vbroadcastss %%xmm4,%%ymm4                \n"
4972 
4973     // General purpose row blend.
4974     LABELALIGN
4975   "1:                                          \n"
4976     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4977     MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
4978     "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
4979     "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
4980     "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
4981     "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
4982     "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
4983     "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
4984     "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
4985     "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
4986     "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
4987     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
4988     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
4989     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4990     "lea       " MEMLEA(0x20,1) ",%1           \n"
4991     "sub       $0x20,%2                        \n"
4992     "jg        1b                              \n"
4993     "jmp       99f                             \n"
4994 
4995     // Blend 50 / 50.
4996     LABELALIGN
4997   "50:                                         \n"
4998     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4999     VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
5000     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
5001     "lea       " MEMLEA(0x20,1) ",%1           \n"
5002     "sub       $0x20,%2                        \n"
5003     "jg        50b                             \n"
5004     "jmp       99f                             \n"
5005 
5006     // Blend 100 / 0 - Copy row unchanged.
5007     LABELALIGN
5008   "100:                                        \n"
5009     "rep movsb " MEMMOVESTRING(1,0) "          \n"
5010     "jmp       999f                            \n"
5011 
5012   "99:                                         \n"
5013     "vzeroupper                                \n"
5014   "999:                                        \n"
5015   : "+D"(dst_ptr),    // %0
5016     "+S"(src_ptr),    // %1
5017     "+cm"(dst_width),  // %2
5018     "+r"(source_y_fraction)  // %3
5019   : "r"((intptr_t)(src_stride))  // %4
5020   : "memory", "cc", "eax", NACL_R14
5021     "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
5022   );
5023 }
5024 #endif  // HAS_INTERPOLATEROW_AVX2
5025 
5026 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
5027 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)5028 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5029                           const uint8* shuffler, int width) {
5030   asm volatile (
5031     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
5032     LABELALIGN
5033   "1:                                          \n"
5034     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5035     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
5036     "lea       " MEMLEA(0x20,0) ",%0           \n"
5037     "pshufb    %%xmm5,%%xmm0                   \n"
5038     "pshufb    %%xmm5,%%xmm1                   \n"
5039     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5040     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
5041     "lea       " MEMLEA(0x20,1) ",%1           \n"
5042     "sub       $0x8,%2                         \n"
5043     "jg        1b                              \n"
5044   : "+r"(src_argb),  // %0
5045     "+r"(dst_argb),  // %1
5046     "+r"(width)        // %2
5047   : "r"(shuffler)    // %3
5048   : "memory", "cc"
5049     , "xmm0", "xmm1", "xmm5"
5050   );
5051 }
5052 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
5053 
5054 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5055 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)5056 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5057                          const uint8* shuffler, int width) {
5058   asm volatile (
5059     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
5060     LABELALIGN
5061   "1:                                          \n"
5062     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
5063     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
5064     "lea       " MEMLEA(0x40,0) ",%0           \n"
5065     "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
5066     "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
5067     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
5068     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
5069     "lea       " MEMLEA(0x40,1) ",%1           \n"
5070     "sub       $0x10,%2                        \n"
5071     "jg        1b                              \n"
5072     "vzeroupper                                \n"
5073   : "+r"(src_argb),  // %0
5074     "+r"(dst_argb),  // %1
5075     "+r"(width)        // %2
5076   : "r"(shuffler)    // %3
5077   : "memory", "cc"
5078     , "xmm0", "xmm1", "xmm5"
5079   );
5080 }
5081 #endif  // HAS_ARGBSHUFFLEROW_AVX2
5082 
5083 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5084 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)5085 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5086                          const uint8* shuffler, int width) {
5087   uintptr_t pixel_temp;
5088   asm volatile (
5089     "pxor      %%xmm5,%%xmm5                   \n"
5090     "mov       " MEMACCESS(4) ",%k2            \n"
5091     "cmp       $0x3000102,%k2                  \n"
5092     "je        3012f                           \n"
5093     "cmp       $0x10203,%k2                    \n"
5094     "je        123f                            \n"
5095     "cmp       $0x30201,%k2                    \n"
5096     "je        321f                            \n"
5097     "cmp       $0x2010003,%k2                  \n"
5098     "je        2103f                           \n"
5099 
5100     LABELALIGN
5101   "1:                                          \n"
5102     "movzb     " MEMACCESS(4) ",%2             \n"
5103     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5104     "mov       %b2," MEMACCESS(1) "            \n"
5105     "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
5106     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5107     "mov       %b2," MEMACCESS2(0x1,1) "       \n"
5108     "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
5109     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5110     "mov       %b2," MEMACCESS2(0x2,1) "       \n"
5111     "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
5112     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5113     "mov       %b2," MEMACCESS2(0x3,1) "       \n"
5114     "lea       " MEMLEA(0x4,0) ",%0            \n"
5115     "lea       " MEMLEA(0x4,1) ",%1            \n"
5116     "sub       $0x1,%3                         \n"
5117     "jg        1b                              \n"
5118     "jmp       99f                             \n"
5119 
5120     LABELALIGN
5121   "123:                                        \n"
5122     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5123     "lea       " MEMLEA(0x10,0) ",%0           \n"
5124     "movdqa    %%xmm0,%%xmm1                   \n"
5125     "punpcklbw %%xmm5,%%xmm0                   \n"
5126     "punpckhbw %%xmm5,%%xmm1                   \n"
5127     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
5128     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
5129     "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
5130     "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
5131     "packuswb  %%xmm1,%%xmm0                   \n"
5132     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5133     "lea       " MEMLEA(0x10,1) ",%1           \n"
5134     "sub       $0x4,%3                         \n"
5135     "jg        123b                            \n"
5136     "jmp       99f                             \n"
5137 
5138     LABELALIGN
5139   "321:                                        \n"
5140     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5141     "lea       " MEMLEA(0x10,0) ",%0           \n"
5142     "movdqa    %%xmm0,%%xmm1                   \n"
5143     "punpcklbw %%xmm5,%%xmm0                   \n"
5144     "punpckhbw %%xmm5,%%xmm1                   \n"
5145     "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
5146     "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
5147     "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
5148     "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
5149     "packuswb  %%xmm1,%%xmm0                   \n"
5150     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5151     "lea       " MEMLEA(0x10,1) ",%1           \n"
5152     "sub       $0x4,%3                         \n"
5153     "jg        321b                            \n"
5154     "jmp       99f                             \n"
5155 
5156     LABELALIGN
5157   "2103:                                       \n"
5158     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5159     "lea       " MEMLEA(0x10,0) ",%0           \n"
5160     "movdqa    %%xmm0,%%xmm1                   \n"
5161     "punpcklbw %%xmm5,%%xmm0                   \n"
5162     "punpckhbw %%xmm5,%%xmm1                   \n"
5163     "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
5164     "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
5165     "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
5166     "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
5167     "packuswb  %%xmm1,%%xmm0                   \n"
5168     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5169     "lea       " MEMLEA(0x10,1) ",%1           \n"
5170     "sub       $0x4,%3                         \n"
5171     "jg        2103b                           \n"
5172     "jmp       99f                             \n"
5173 
5174     LABELALIGN
5175   "3012:                                       \n"
5176     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5177     "lea       " MEMLEA(0x10,0) ",%0           \n"
5178     "movdqa    %%xmm0,%%xmm1                   \n"
5179     "punpcklbw %%xmm5,%%xmm0                   \n"
5180     "punpckhbw %%xmm5,%%xmm1                   \n"
5181     "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
5182     "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
5183     "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
5184     "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
5185     "packuswb  %%xmm1,%%xmm0                   \n"
5186     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5187     "lea       " MEMLEA(0x10,1) ",%1           \n"
5188     "sub       $0x4,%3                         \n"
5189     "jg        3012b                           \n"
5190 
5191   "99:                                         \n"
5192   : "+r"(src_argb),     // %0
5193     "+r"(dst_argb),     // %1
5194     "=&d"(pixel_temp),  // %2
5195     "+r"(width)         // %3
5196   : "r"(shuffler)       // %4
5197   : "memory", "cc", NACL_R14
5198     "xmm0", "xmm1", "xmm5"
5199   );
5200 }
5201 #endif  // HAS_ARGBSHUFFLEROW_SSE2
5202 
5203 #ifdef HAS_I422TOYUY2ROW_SSE2
I422ToYUY2Row_SSE2(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_frame,int width)5204 void I422ToYUY2Row_SSE2(const uint8* src_y,
5205                         const uint8* src_u,
5206                         const uint8* src_v,
5207                         uint8* dst_frame, int width) {
5208  asm volatile (
5209     "sub       %1,%2                             \n"
5210     LABELALIGN
5211   "1:                                            \n"
5212     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5213     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5214     "lea       " MEMLEA(0x8,1) ",%1              \n"
5215     "punpcklbw %%xmm3,%%xmm2                     \n"
5216     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5217     "lea       " MEMLEA(0x10,0) ",%0             \n"
5218     "movdqa    %%xmm0,%%xmm1                     \n"
5219     "punpcklbw %%xmm2,%%xmm0                     \n"
5220     "punpckhbw %%xmm2,%%xmm1                     \n"
5221     "movdqu    %%xmm0," MEMACCESS(3) "           \n"
5222     "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
5223     "lea       " MEMLEA(0x20,3) ",%3             \n"
5224     "sub       $0x10,%4                          \n"
5225     "jg         1b                               \n"
5226     : "+r"(src_y),  // %0
5227       "+r"(src_u),  // %1
5228       "+r"(src_v),  // %2
5229       "+r"(dst_frame),  // %3
5230       "+rm"(width)  // %4
5231     :
5232     : "memory", "cc", NACL_R14
5233     "xmm0", "xmm1", "xmm2", "xmm3"
5234   );
5235 }
5236 #endif  // HAS_I422TOYUY2ROW_SSE2
5237 
5238 #ifdef HAS_I422TOUYVYROW_SSE2
I422ToUYVYRow_SSE2(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_frame,int width)5239 void I422ToUYVYRow_SSE2(const uint8* src_y,
5240                         const uint8* src_u,
5241                         const uint8* src_v,
5242                         uint8* dst_frame, int width) {
5243  asm volatile (
5244     "sub        %1,%2                            \n"
5245     LABELALIGN
5246   "1:                                            \n"
5247     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5248     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5249     "lea       " MEMLEA(0x8,1) ",%1              \n"
5250     "punpcklbw %%xmm3,%%xmm2                     \n"
5251     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5252     "movdqa    %%xmm2,%%xmm1                     \n"
5253     "lea       " MEMLEA(0x10,0) ",%0             \n"
5254     "punpcklbw %%xmm0,%%xmm1                     \n"
5255     "punpckhbw %%xmm0,%%xmm2                     \n"
5256     "movdqu    %%xmm1," MEMACCESS(3) "           \n"
5257     "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
5258     "lea       " MEMLEA(0x20,3) ",%3             \n"
5259     "sub       $0x10,%4                          \n"
5260     "jg         1b                               \n"
5261     : "+r"(src_y),  // %0
5262       "+r"(src_u),  // %1
5263       "+r"(src_v),  // %2
5264       "+r"(dst_frame),  // %3
5265       "+rm"(width)  // %4
5266     :
5267     : "memory", "cc", NACL_R14
5268     "xmm0", "xmm1", "xmm2", "xmm3"
5269   );
5270 }
5271 #endif  // HAS_I422TOUYVYROW_SSE2
5272 
5273 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
ARGBPolynomialRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,const float * poly,int width)5274 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5275                             uint8* dst_argb, const float* poly,
5276                             int width) {
5277   asm volatile (
5278     "pxor      %%xmm3,%%xmm3                   \n"
5279 
5280     // 2 pixel loop.
5281     LABELALIGN
5282   "1:                                          \n"
5283     "movq      " MEMACCESS(0) ",%%xmm0         \n"
5284     "lea       " MEMLEA(0x8,0) ",%0            \n"
5285     "punpcklbw %%xmm3,%%xmm0                   \n"
5286     "movdqa    %%xmm0,%%xmm4                   \n"
5287     "punpcklwd %%xmm3,%%xmm0                   \n"
5288     "punpckhwd %%xmm3,%%xmm4                   \n"
5289     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
5290     "cvtdq2ps  %%xmm4,%%xmm4                   \n"
5291     "movdqa    %%xmm0,%%xmm1                   \n"
5292     "movdqa    %%xmm4,%%xmm5                   \n"
5293     "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
5294     "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
5295     "addps     " MEMACCESS(3) ",%%xmm0         \n"
5296     "addps     " MEMACCESS(3) ",%%xmm4         \n"
5297     "movdqa    %%xmm1,%%xmm2                   \n"
5298     "movdqa    %%xmm5,%%xmm6                   \n"
5299     "mulps     %%xmm1,%%xmm2                   \n"
5300     "mulps     %%xmm5,%%xmm6                   \n"
5301     "mulps     %%xmm2,%%xmm1                   \n"
5302     "mulps     %%xmm6,%%xmm5                   \n"
5303     "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
5304     "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
5305     "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
5306     "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
5307     "addps     %%xmm2,%%xmm0                   \n"
5308     "addps     %%xmm6,%%xmm4                   \n"
5309     "addps     %%xmm1,%%xmm0                   \n"
5310     "addps     %%xmm5,%%xmm4                   \n"
5311     "cvttps2dq %%xmm0,%%xmm0                   \n"
5312     "cvttps2dq %%xmm4,%%xmm4                   \n"
5313     "packuswb  %%xmm4,%%xmm0                   \n"
5314     "packuswb  %%xmm0,%%xmm0                   \n"
5315     "movq      %%xmm0," MEMACCESS(1) "         \n"
5316     "lea       " MEMLEA(0x8,1) ",%1            \n"
5317     "sub       $0x2,%2                         \n"
5318     "jg        1b                              \n"
5319   : "+r"(src_argb),  // %0
5320     "+r"(dst_argb),  // %1
5321     "+r"(width)      // %2
5322   : "r"(poly)        // %3
5323   : "memory", "cc"
5324     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5325   );
5326 }
5327 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5328 
5329 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
ARGBPolynomialRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,const float * poly,int width)5330 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5331                             uint8* dst_argb, const float* poly,
5332                             int width) {
5333   asm volatile (
5334     "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
5335     "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5336     "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5337     "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
5338 
5339     // 2 pixel loop.
5340     LABELALIGN
5341   "1:                                          \n"
5342     "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
5343     "lea         " MEMLEA(0x8,0) ",%0          \n"
5344     "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
5345     "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
5346     "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
5347     "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
5348     "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
5349     "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
5350     "vcvttps2dq  %%ymm0,%%ymm0                 \n"
5351     "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
5352     "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5353     "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
5354     "vmovq       %%xmm0," MEMACCESS(1) "       \n"
5355     "lea         " MEMLEA(0x8,1) ",%1          \n"
5356     "sub         $0x2,%2                       \n"
5357     "jg          1b                            \n"
5358     "vzeroupper                                \n"
5359   : "+r"(src_argb),  // %0
5360     "+r"(dst_argb),  // %1
5361     "+r"(width)      // %2
5362   : "r"(poly)        // %3
5363   : "memory", "cc",
5364     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5365   );
5366 }
5367 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5368 
5369 #ifdef HAS_ARGBCOLORTABLEROW_X86
5370 // Tranform ARGB pixels with color table.
ARGBColorTableRow_X86(uint8 * dst_argb,const uint8 * table_argb,int width)5371 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
5372                            int width) {
5373   uintptr_t pixel_temp;
5374   asm volatile (
5375     // 1 pixel loop.
5376     LABELALIGN
5377   "1:                                          \n"
5378     "movzb     " MEMACCESS(0) ",%1             \n"
5379     "lea       " MEMLEA(0x4,0) ",%0            \n"
5380     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5381     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
5382     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5383     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5384     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
5385     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5386     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5387     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
5388     "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
5389     MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
5390     "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
5391     "dec       %2                              \n"
5392     "jg        1b                              \n"
5393   : "+r"(dst_argb),     // %0
5394     "=&d"(pixel_temp),  // %1
5395     "+r"(width)         // %2
5396   : "r"(table_argb)     // %3
5397   : "memory", "cc");
5398 }
5399 #endif  // HAS_ARGBCOLORTABLEROW_X86
5400 
5401 #ifdef HAS_RGBCOLORTABLEROW_X86
5402 // Tranform RGB pixels with color table.
RGBColorTableRow_X86(uint8 * dst_argb,const uint8 * table_argb,int width)5403 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
5404   uintptr_t pixel_temp;
5405   asm volatile (
5406     // 1 pixel loop.
5407     LABELALIGN
5408   "1:                                          \n"
5409     "movzb     " MEMACCESS(0) ",%1             \n"
5410     "lea       " MEMLEA(0x4,0) ",%0            \n"
5411     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5412     "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
5413     "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5414     MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5415     "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
5416     "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5417     MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5418     "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
5419     "dec       %2                              \n"
5420     "jg        1b                              \n"
5421   : "+r"(dst_argb),     // %0
5422     "=&d"(pixel_temp),  // %1
5423     "+r"(width)         // %2
5424   : "r"(table_argb)     // %3
5425   : "memory", "cc");
5426 }
5427 #endif  // HAS_RGBCOLORTABLEROW_X86
5428 
5429 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5430 // Tranform RGB pixels with luma table.
ARGBLumaColorTableRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,int width,const uint8 * luma,uint32 lumacoeff)5431 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5432                                  int width,
5433                                  const uint8* luma, uint32 lumacoeff) {
5434   uintptr_t pixel_temp;
5435   uintptr_t table_temp;
5436   asm volatile (
5437     "movd      %6,%%xmm3                       \n"
5438     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
5439     "pcmpeqb   %%xmm4,%%xmm4                   \n"
5440     "psllw     $0x8,%%xmm4                     \n"
5441     "pxor      %%xmm5,%%xmm5                   \n"
5442 
5443     // 4 pixel loop.
5444     LABELALIGN
5445   "1:                                          \n"
5446     "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
5447     "pmaddubsw %%xmm3,%%xmm0                   \n"
5448     "phaddw    %%xmm0,%%xmm0                   \n"
5449     "pand      %%xmm4,%%xmm0                   \n"
5450     "punpcklwd %%xmm5,%%xmm0                   \n"
5451     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5452     "add       %5,%1                           \n"
5453     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5454 
5455     "movzb     " MEMACCESS(2) ",%0             \n"
5456     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5457     "mov       %b0," MEMACCESS(3) "            \n"
5458     "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
5459     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5460     "mov       %b0," MEMACCESS2(0x1,3) "       \n"
5461     "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
5462     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5463     "mov       %b0," MEMACCESS2(0x2,3) "       \n"
5464     "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
5465     "mov       %b0," MEMACCESS2(0x3,3) "       \n"
5466 
5467     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5468     "add       %5,%1                           \n"
5469     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5470 
5471     "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
5472     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5473     "mov       %b0," MEMACCESS2(0x4,3) "       \n"
5474     "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
5475     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5476     "mov       %b0," MEMACCESS2(0x5,3) "       \n"
5477     "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
5478     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5479     "mov       %b0," MEMACCESS2(0x6,3) "       \n"
5480     "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
5481     "mov       %b0," MEMACCESS2(0x7,3) "       \n"
5482 
5483     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5484     "add       %5,%1                           \n"
5485     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5486 
5487     "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
5488     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5489     "mov       %b0," MEMACCESS2(0x8,3) "       \n"
5490     "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
5491     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5492     "mov       %b0," MEMACCESS2(0x9,3) "       \n"
5493     "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
5494     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5495     "mov       %b0," MEMACCESS2(0xa,3) "       \n"
5496     "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
5497     "mov       %b0," MEMACCESS2(0xb,3) "       \n"
5498 
5499     "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5500     "add       %5,%1                           \n"
5501 
5502     "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
5503     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5504     "mov       %b0," MEMACCESS2(0xc,3) "       \n"
5505     "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
5506     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5507     "mov       %b0," MEMACCESS2(0xd,3) "       \n"
5508     "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
5509     MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5510     "mov       %b0," MEMACCESS2(0xe,3) "       \n"
5511     "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
5512     "mov       %b0," MEMACCESS2(0xf,3) "       \n"
5513     "lea       " MEMLEA(0x10,2) ",%2           \n"
5514     "lea       " MEMLEA(0x10,3) ",%3           \n"
5515     "sub       $0x4,%4                         \n"
5516     "jg        1b                              \n"
5517   : "=&d"(pixel_temp),  // %0
5518     "=&a"(table_temp),  // %1
5519     "+r"(src_argb),     // %2
5520     "+r"(dst_argb),     // %3
5521     "+rm"(width)        // %4
5522   : "r"(luma),          // %5
5523     "rm"(lumacoeff)     // %6
5524   : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5525   );
5526 }
5527 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5528 
5529 #endif  // defined(__x86_64__) || defined(__i386__)
5530 
5531 #ifdef __cplusplus
5532 }  // extern "C"
5533 }  // namespace libyuv
5534 #endif
5535