1 // VERSION 2
2 /*
3  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
4  *
5  *  Use of this source code is governed by a BSD-style license
6  *  that can be found in the LICENSE file in the root of the source
7  *  tree. An additional intellectual property rights grant can be found
8  *  in the file PATENTS. All contributing project authors may
9  *  be found in the AUTHORS file in the root of the source tree.
10  */
11 
12 #include "libyuv/row.h"
13 
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18 
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21 
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23 
// Constants for ARGB
// Per-pixel Y coefficients in B,G,R,A byte order, 7-bit fixed point
// (13+65+33 = 111; consumed via pmaddubsw then psrlw $7 in ARGBToYRow,
// with a +16 bias added afterwards from kAddY16).
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
// Same B,G,R,A layout but full-range weights (15+75+38 = 128); paired with
// the kAddYJ64 rounding term instead of the +16 bias (see ARGBToYJRow).
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
33 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
34 
35 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
36 
// U coefficients for ARGB (B,G,R,A order), 7-bit fixed point.
static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Full-range (JPEG) U coefficients for ARGB.
static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

// V coefficients for ARGB (B,G,R,A order), 7-bit fixed point.
static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Full-range (JPEG) V coefficients for ARGB.
static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
52 
// Constants for BGRA
// Same weights as the ARGB tables above, permuted for A,R,G,B byte order.
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
65 
// Constants for ABGR
// Same weights permuted for R,G,B,A byte order.
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
78 
// Constants for RGBA.
// Same weights permuted for A,B,G,R byte order.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};
91 
// +16 luma bias, added byte-wise after packing (see ARGBToYRow paddb).
static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
// Word-wise rounding term added before the psrlw $7 in ARGBToYJRow.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// Byte-wise +128 bias; presumably re-centers signed U/V to unsigned range —
// consumers are the UV row functions outside this chunk (verify there).
static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Word-wise 0x8080 bias for the full-range (JPEG) U/V path — consumers are
// outside this chunk (verify there).
static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
109 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
110 
111 #ifdef HAS_RGB24TOARGBROW_SSSE3
112 
// Shuffle table for converting RGB24 to ARGB.
// Indices 12-15 pull bytes that are replaced by the 0xff alpha mask (por)
// in RGB24ToARGBRow_SSSE3.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
// Same as above with R and B swapped within each pixel.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
// 128 (bit 7 set) makes pshufb zero that destination byte.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW for I422ToRAW.  First 8 + next 4
// (mirrors kShuffleMaskARGBToRGB24_0 with R/B swapped).
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
142 #endif  // HAS_RGB24TOARGBROW_SSSE3
143 
144 #if defined(TESTING) && defined(__x86_64__)
// TESTING-only scratch routine: emits self-moves/leas/adds over every GP
// register (to exercise instruction encodings / alignment), then runs a
// simple 8-bytes-per-iteration copy loop from src_y to dst_argb.
// NOTE(review): "add 0x10,%%eax" (and the rest of that group) lacks the '$'
// immediate prefix — in AT&T syntax that is a load from absolute address
// 0x10, and the adds modify registers (rbx, rbp, r12-r15, even rsp) that are
// not in the clobber list. Acceptable only because this is #if TESTING code;
// verify before ever enabling it.
// NOTE(review): the loop writes only 8 bytes per 0x20 dst advance, and xmm1/
// xmm5 are clobber-listed but unused — again, test-only scaffolding.
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align  5                               \n"
    "mov       %%eax,%%eax                     \n"
    "mov       %%ebx,%%ebx                     \n"
    "mov       %%ecx,%%ecx                     \n"
    "mov       %%edx,%%edx                     \n"
    "mov       %%esi,%%esi                     \n"
    "mov       %%edi,%%edi                     \n"
    "mov       %%ebp,%%ebp                     \n"
    "mov       %%esp,%%esp                     \n"
    ".p2align  5                               \n"
    "mov       %%r8d,%%r8d                     \n"
    "mov       %%r9d,%%r9d                     \n"
    "mov       %%r10d,%%r10d                   \n"
    "mov       %%r11d,%%r11d                   \n"
    "mov       %%r12d,%%r12d                   \n"
    "mov       %%r13d,%%r13d                   \n"
    "mov       %%r14d,%%r14d                   \n"
    "mov       %%r15d,%%r15d                   \n"
    ".p2align  5                               \n"
    "lea       (%%rax),%%eax                   \n"
    "lea       (%%rbx),%%ebx                   \n"
    "lea       (%%rcx),%%ecx                   \n"
    "lea       (%%rdx),%%edx                   \n"
    "lea       (%%rsi),%%esi                   \n"
    "lea       (%%rdi),%%edi                   \n"
    "lea       (%%rbp),%%ebp                   \n"
    "lea       (%%rsp),%%esp                   \n"
    ".p2align  5                               \n"
    "lea       (%%r8),%%r8d                    \n"
    "lea       (%%r9),%%r9d                    \n"
    "lea       (%%r10),%%r10d                  \n"
    "lea       (%%r11),%%r11d                  \n"
    "lea       (%%r12),%%r12d                  \n"
    "lea       (%%r13),%%r13d                  \n"
    "lea       (%%r14),%%r14d                  \n"
    "lea       (%%r15),%%r15d                  \n"

    ".p2align  5                               \n"
    "lea       0x10(%%rax),%%eax               \n"
    "lea       0x10(%%rbx),%%ebx               \n"
    "lea       0x10(%%rcx),%%ecx               \n"
    "lea       0x10(%%rdx),%%edx               \n"
    "lea       0x10(%%rsi),%%esi               \n"
    "lea       0x10(%%rdi),%%edi               \n"
    "lea       0x10(%%rbp),%%ebp               \n"
    "lea       0x10(%%rsp),%%esp               \n"
    ".p2align  5                               \n"
    "lea       0x10(%%r8),%%r8d                \n"
    "lea       0x10(%%r9),%%r9d                \n"
    "lea       0x10(%%r10),%%r10d              \n"
    "lea       0x10(%%r11),%%r11d              \n"
    "lea       0x10(%%r12),%%r12d              \n"
    "lea       0x10(%%r13),%%r13d              \n"
    "lea       0x10(%%r14),%%r14d              \n"
    "lea       0x10(%%r15),%%r15d              \n"

    ".p2align  5                               \n"
    "add       0x10,%%eax                      \n"
    "add       0x10,%%ebx                      \n"
    "add       0x10,%%ecx                      \n"
    "add       0x10,%%edx                      \n"
    "add       0x10,%%esi                      \n"
    "add       0x10,%%edi                      \n"
    "add       0x10,%%ebp                      \n"
    "add       0x10,%%esp                      \n"
    ".p2align  5                               \n"
    "add       0x10,%%r8d                      \n"
    "add       0x10,%%r9d                      \n"
    "add       0x10,%%r10d                     \n"
    "add       0x10,%%r11d                     \n"
    "add       0x10,%%r12d                     \n"
    "add       0x10,%%r13d                     \n"
    "add       0x10,%%r14d                     \n"
    "add       0x10,%%r15d                     \n"

    ".p2align  2                               \n"
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // copy 8 bytes per pass
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
237 #endif  // TESTING
238 
239 #ifdef HAS_J400TOARGBROW_SSE2
// Expands 8 gray (J400) bytes per iteration into 8 ARGB pixels: each Y byte
// is replicated into B, G and R, and alpha is forced to 0xff.
// src_y: gray bytes; dst_argb: 4 bytes/pixel out; pix: pixel count (mult of 8).
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff000000 alpha mask
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate each byte: YY
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // first 4 pixels: YYYY
    "punpckhwd %%xmm1,%%xmm1                   \n"  // last 4 pixels: YYYY
    "por       %%xmm5,%%xmm0                   \n"  // force alpha = 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
265 #endif  // HAS_J400TOARGBROW_SSE2
266 
267 #ifdef HAS_RGB24TOARGBROW_SSSE3
// Converts 16 RGB24 pixels (48 bytes) per iteration to 16 ARGB pixels
// (64 bytes), inserting 0xff alpha. Uses palignr to realign the packed
// 3-byte pixels across the three 16-byte loads, then pshufb with
// kShuffleMaskRGB24ToARGB to expand each group of 4 pixels.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 48 src bytes in xmm0/1/3
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
305 
// Converts 16 RAW (BGR byte order) pixels per iteration to 16 ARGB pixels.
// Identical structure to RGB24ToARGBRow_SSSE3; only the shuffle mask
// (kShuffleMaskRAWToARGB) differs, swapping R and B within each pixel.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 48 src bytes in xmm0/1/3
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
343 
// Converts 8 RGB565 pixels (16 bytes) per iteration to 8 ARGB pixels,
// expanding 5/6-bit channels to 8 bits via pmulhuw and forcing alpha 0xff.
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // xmm5 = 0x0108 per word:
    "movd      %%eax,%%xmm5                    \n"  // 5->8 bit expand multiplier
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"  // xmm6 = 0x2080 per word:
    "movd      %%eax,%%xmm6                    \n"  // 6->8 bit expand multiplier
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800 red mask
    "psllw     $0xb,%%xmm3                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x07e0 green mask
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 alpha bytes
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so the
    "sub       %0,%1                           \n"  // (%1,%0,2) stores hit dst
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 RGB565 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"  // isolate red
    "psllw     $0xb,%%xmm2                     \n"  // blue to top of word
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // xmm1 = RB words
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "por       %%xmm7,%%xmm0                   \n"  // xmm0 = AG words
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave -> BGRA
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
391 
// Converts 8 ARGB1555 pixels per iteration to 8 ARGB pixels: 5-bit channels
// expanded to 8 bits, and the 1-bit alpha broadcast to a full 0x00/0xff byte.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // xmm5 = 0x0108 per word:
    "movd      %%eax,%%xmm5                    \n"  // 5->8 bit expand multiplier
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"  // xmm6 = 0x4200 per word:
    "movd      %%eax,%%xmm6                    \n"  // expand multiplier for green
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0xf800
    "psllw     $0xb,%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm4                   \n"  // xmm4 = 0x03e0 green mask
    "psrlw     $0x6,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0xff00 alpha bytes
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so the
    "sub       %0,%1                           \n"  // (%1,%0,2) stores hit dst
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB1555 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"  // red field to top bits
    "psllw     $0xb,%%xmm2                     \n"  // blue field to top bits
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // xmm1 = RB words
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "psraw     $0x8,%%xmm2                     \n"  // replicate alpha bit 15
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "pand      %%xmm7,%%xmm2                   \n"  // alpha -> 0x00 or 0xff byte
    "por       %%xmm2,%%xmm0                   \n"  // xmm0 = AG words
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave -> BGRA
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
442 
// Converts 8 ARGB4444 pixels per iteration to 8 ARGB pixels by duplicating
// each 4-bit nibble into both halves of the output byte.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"  // xmm4 = 0x0f low-nibble mask
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // xmm5 = 0xf0 high-nibble mask
    "pslld     $0x4,%%xmm5                     \n"
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so the
    "sub       %0,%1                           \n"  // (%1,%0,2) stores hit dst
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB4444 pixels
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // xmm0 = low nibbles
    "pand      %%xmm5,%%xmm2                   \n"  // xmm2 = high nibbles
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"  // duplicate nibble into
    "psrlw     $0x4,%%xmm3                     \n"  // the other half-byte
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave -> 32-bit ARGB
    "punpckhbw %%xmm2,%%xmm1                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
480 
// Converts 16 ARGB pixels (64 bytes) per iteration to RGB24 (48 bytes):
// pshufb drops the alpha byte from each register, then shift/or stitches
// the four 12-byte fragments into three 16-byte stores.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = drop-alpha shuffle
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 RGB bytes + 4 zeros each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"  // stitch fragment 1 across
    "pslldq    $0xc,%%xmm4                     \n"  // the 0/16-byte boundary
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"  // stitch fragment 2
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"  // stitch fragment 3
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
518 
// Converts 16 ARGB pixels per iteration to RAW (BGR order, 48 bytes).
// Identical structure to ARGBToRGB24Row_SSSE3; only the shuffle mask
// (kShuffleMaskARGBToRAW) differs, swapping R and B within each pixel.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = drop-alpha shuffle
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 BGR bytes + 4 zeros each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"  // stitch the 12-byte
    "pslldq    $0xc,%%xmm4                     \n"  // fragments together
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
556 
// Converts 4 ARGB pixels (16 bytes) per iteration to RGB565 (8 bytes) by
// masking each channel down to 5/6/5 bits and packing to words.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"  // xmm3 = 0x1f blue mask
    "psrld     $0x1b,%%xmm3                    \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x7e0 green mask
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = red mask (bits 11+)
    "pslld     $0xb,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"
    "psrld     $0x3,%%xmm1                     \n"  // blue -> bits 4:0
    "psrld     $0x5,%%xmm2                     \n"  // green -> bits 10:5
    "psrad     $0x10,%%xmm0                    \n"  // red -> bits 15:11
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "por       %%xmm1,%%xmm0                   \n"  // combined 565 per dword
    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> words
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 RGB565 pixels
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
592 
// Converts 4 ARGB pixels per iteration to ARGB1555 (8 bytes): 5 bits per
// color channel plus a 1-bit alpha taken from the top of the alpha byte.
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0x1f blue mask
    "psrld     $0x1b,%%xmm4                    \n"
    "movdqa    %%xmm4,%%xmm5                   \n"  // xmm5 = green mask (<<5)
    "pslld     $0x5,%%xmm5                     \n"
    "movdqa    %%xmm4,%%xmm6                   \n"  // xmm6 = red mask (<<10)
    "pslld     $0xa,%%xmm6                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = alpha bit mask
    "pslld     $0xf,%%xmm7                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "psrad     $0x10,%%xmm0                    \n"  // alpha -> bit 15
    "psrld     $0x3,%%xmm1                     \n"  // blue -> bits 4:0
    "psrld     $0x6,%%xmm2                     \n"  // green -> bits 9:5
    "psrld     $0x9,%%xmm3                     \n"  // red -> bits 14:10
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // combined 1555 per dword
    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> words
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 ARGB1555 pixels
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
633 
// Converts 4 ARGB pixels per iteration to ARGB4444 (8 bytes), keeping the
// top 4 bits of each channel.
// NOTE(review): clobber list names xmm2 but the body never uses it — harmless.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0xf000 per word
    "psllw     $0xc,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm3                   \n"  // xmm3 = 0x00f0 per word
    "psrlw     $0x8,%%xmm3                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // keep high nibble, low byte
    "pand      %%xmm4,%%xmm1                   \n"  // keep high nibble, high byte
    "psrlq     $0x4,%%xmm0                     \n"  // pack nibbles together
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"  // words -> bytes
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 ARGB4444 pixels
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
661 #endif  // HAS_RGB24TOARGBROW_SSSE3
662 
663 #ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// pmaddubsw with the kARGBToY weights + phaddw sums B,G,R per pixel in
// 7-bit fixed point; psrlw $7 scales back, then the +16 bias is added.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = Y coefficients
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = +16 bias
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted B+G and R+A sums
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // 7-bit fixed point -> int
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 Y bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add +16 bias
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
698 #endif  // HAS_ARGBTOYROW_SSSE3
699 
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// Uses the full-range JPeg kARGBToYJ coefficients; kAddYJ64 is added to the
// intermediate word sums before the >> 7 so the result rounds to nearest.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // kARGBToYJ coefficients.
    "movdqa    %4,%%xmm5                       \n"  // kAddYJ64 rounding bias.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums.
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before shift.
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words to 16 bytes.
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values.
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3
738 
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Lane-crossing dword permute that restores linear pixel order after the
// in-lane "mutation" of vphaddw and vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 counterpart of ARGBToYRow_SSSE3: same kARGBToY coefficients and
// kAddY16 bias, twice the pixels per iteration.
// Assumes pix is a positive multiple of 32.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToY in both lanes.
    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddY16 in both lanes.
    "vmovdqu    %5,%%ymm6                      \n"  // unmutate permute.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 32 ARGB pixels.
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // weighted byte pairs.
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // fixed point >> 7.
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y values.
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYROW_AVX2
784 
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// JPeg full-range variant of ARGBToYRow_AVX2: kARGBToYJ coefficients,
// no +16 bias, and kAddYJ64 added before the shift for round-to-nearest.
// Assumes pix is a positive multiple of 32.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToYJ in both lanes.
    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddYJ64 in both lanes.
    "vmovdqu    %5,%%ymm6                      \n"  // unmutate permute.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 32 ARGB pixels.
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // weighted byte pairs.
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y values.
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYJROW_AVX2
826 
#ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16x2 ARGB pixels to 8 U and 8 V values (2x2 subsample).
// The row at src_argb0 and the row at src_argb0 + src_stride_argb are
// averaged vertically (pavgb), adjacent pixel pairs are averaged
// horizontally, then the kARGBToU/kARGBToV coefficients are applied and
// the kAddUV128 bias converts the signed result to unsigned bytes.
// Assumes width is a positive multiple of 16.
// Fix: xmm3/xmm4/xmm5 are written by the constant loads below but were
// missing from the clobber list, which lets the compiler assume they
// survive the asm statement.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // kARGBToV coefficients.
    "movdqa    %6,%%xmm4                       \n"  // kARGBToU coefficients.
    "movdqa    %7,%%xmm5                       \n"  // kAddUV128 bias.
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average two rows.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels.
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels.
    "pavgb     %%xmm7,%%xmm0                   \n"  // horizontal average.
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U terms.
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V terms.
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToV),  // %5
    "m"(kARGBToU),  // %6
    "m"(kAddUV128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_SSSE3
889 
#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
// In-lane byte shuffle that restores pixel order after the lane-local
// "mutation" of vphaddw followed by vpacksswb + vpermq.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// Convert 32x2 ARGB pixels to 16 U and 16 V values (2x2 subsample).
// AVX2 counterpart of ARGBToUVRow_SSSE3: vertical pavgb of two rows,
// horizontal pair average, kARGBToU/kARGBToV coefficients, kAddUV128 bias.
// Assumes width is a positive multiple of 32.
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // kAddUV128 bias.
    "vbroadcastf128 %6,%%ymm6                  \n"  // kARGBToV coefficients.
    "vbroadcastf128 %7,%%ymm7                  \n"  // kARGBToU coefficients.
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"  // even pixels.
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"  // odd pixels.
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"  // horizontal average.
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"  // U terms.
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"  // V terms.
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // unmutate ordering.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // bias to unsigned.

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // low half = 16 U.
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),   // %6
    "m"(kARGBToU),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_AVX2
954 
#ifdef HAS_ARGBTOUVJROW_SSSE3
// Convert 16x2 ARGB pixels to 8 UJ and 8 VJ values (2x2 subsample).
// JPeg full-range variant of ARGBToUVRow_SSSE3: kARGBToUJ/kARGBToVJ
// coefficients and a paddw of kAddUVJ128 before the arithmetic shift so
// the result is rounded.
// Assumes width is a positive multiple of 16.
// Fix: xmm3/xmm4/xmm5 are written by the constant loads below but were
// missing from the clobber list, which lets the compiler assume they
// survive the asm statement.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // kARGBToVJ coefficients.
    "movdqa    %6,%%xmm4                       \n"  // kARGBToUJ coefficients.
    "movdqa    %7,%%xmm5                       \n"  // kAddUVJ128 bias.
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average two rows.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels.
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels.
    "pavgb     %%xmm7,%%xmm0                   \n"  // horizontal average.
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U terms.
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V terms.
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before shift.
    "paddw     %%xmm5,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToVJ),  // %5
    "m"(kARGBToUJ),  // %6
    "m"(kAddUVJ128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVJROW_SSSE3
1018 
#ifdef HAS_ARGBTOUV444ROW_SSSE3
// Convert 16 ARGB pixels to 16 U and 16 V values (4:4:4, no subsampling).
// The same 64 source bytes are read twice: first through kARGBToU for U,
// then through kARGBToV for V; kAddUV128 biases the signed result to
// unsigned bytes.
// Assumes width is a positive multiple of 16.
// Fix: xmm3/xmm4/xmm5 are written by the constant loads below but were
// missing from the clobber list, which lets the compiler assume they
// survive the asm statement.
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa    %4,%%xmm3                       \n"  // kARGBToV coefficients.
    "movdqa    %5,%%xmm4                       \n"  // kARGBToU coefficients.
    "movdqa    %6,%%xmm5                       \n"  // kAddUV128 bias.
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // U pass.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 U values.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // V pass: reload source.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOUV444ROW_SSSE3
1075 
#ifdef HAS_ARGBTOUV422ROW_SSSE3
// Convert 16 ARGB pixels to 8 U and 8 V values (2x1 horizontal subsample).
// Like ARGBToUVRow_SSSE3 but reads a single row (no stride / vertical
// average): adjacent pixel pairs are averaged horizontally, then the
// kARGBToU/kARGBToV coefficients and the kAddUV128 bias are applied.
// Assumes width is a positive multiple of 16.
// Fix: xmm3/xmm4/xmm5 are written by the constant loads below but were
// missing from the clobber list, which lets the compiler assume they
// survive the asm statement.
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %4,%%xmm3                       \n"  // kARGBToV coefficients.
    "movdqa    %5,%%xmm4                       \n"  // kARGBToU coefficients.
    "movdqa    %6,%%xmm5                       \n"  // kAddUV128 bias.
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels.
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels.
    "pavgb     %%xmm7,%%xmm0                   \n"  // horizontal average.
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U terms.
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V terms.
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUV422ROW_SSSE3
1128 
// Convert 16 BGRA pixels (64 bytes) to 16 Y values.
// Same structure as ARGBToYRow_SSSE3, but with the kBGRAToY coefficients
// arranged for BGRA byte order; adds the kAddY16 bias.
// Assumes pix is a positive multiple of 16.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16 bias.
    "movdqa    %3,%%xmm4                       \n"  // kBGRAToY coefficients.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 BGRA pixels.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums.
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // fixed point >> 7.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words to 16 bytes.
    "paddb     %%xmm5,%%xmm0                   \n"  // add Y bias.
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values.
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1162 
// Convert 16x2 BGRA pixels to 8 U and 8 V values (2x2 subsample).
// Same structure as ARGBToUVRow_SSSE3 with BGRA-ordered kBGRAToU/kBGRAToV
// coefficients and the kAddUV128 bias.
// Assumes width is a positive multiple of 16.
// Fix: xmm3/xmm4/xmm5 are written by the constant loads below but were
// missing from the clobber list, which lets the compiler assume they
// survive the asm statement.
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // kBGRAToV coefficients.
    "movdqa    %6,%%xmm4                       \n"  // kBGRAToU coefficients.
    "movdqa    %7,%%xmm5                       \n"  // kAddUV128 bias.
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average two rows.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels.
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels.
    "pavgb     %%xmm7,%%xmm0                   \n"  // horizontal average.
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U terms.
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V terms.
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)), // %4
    "m"(kBGRAToV),  // %5
    "m"(kBGRAToU),  // %6
    "m"(kAddUV128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1223 
// Convert 16 ABGR pixels (64 bytes) to 16 Y values.
// Same structure as ARGBToYRow_SSSE3, but with the kABGRToY coefficients
// arranged for ABGR byte order; adds the kAddY16 bias.
// Assumes pix is a positive multiple of 16.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16 bias.
    "movdqa    %3,%%xmm4                       \n"  // kABGRToY coefficients.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ABGR pixels.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums.
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // fixed point >> 7.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words to 16 bytes.
    "paddb     %%xmm5,%%xmm0                   \n"  // add Y bias.
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values.
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1257 
// Convert 16 RGBA pixels (64 bytes) to 16 Y values.
// Same structure as ARGBToYRow_SSSE3, but with the kRGBAToY coefficients
// arranged for RGBA byte order; adds the kAddY16 bias.
// Assumes pix is a positive multiple of 16.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // kAddY16 bias.
    "movdqa    %3,%%xmm4                       \n"  // kRGBAToY coefficients.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 RGBA pixels.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted byte pairs.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums.
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // fixed point >> 7.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words to 16 bytes.
    "paddb     %%xmm5,%%xmm0                   \n"  // add Y bias.
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values.
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1291 
// Convert 16x2 ABGR pixels to 8 U and 8 V values (2x2 subsample).
// Same structure as ARGBToUVRow_SSSE3 with ABGR-ordered kABGRToU/kABGRToV
// coefficients and the kAddUV128 bias.
// Assumes width is a positive multiple of 16.
// Fix: xmm3/xmm4/xmm5 are written by the constant loads below but were
// missing from the clobber list, which lets the compiler assume they
// survive the asm statement.
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // kABGRToV coefficients.
    "movdqa    %6,%%xmm4                       \n"  // kABGRToU coefficients.
    "movdqa    %7,%%xmm5                       \n"  // kAddUV128 bias.
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average two rows.
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels.
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels.
    "pavgb     %%xmm7,%%xmm0                   \n"  // horizontal average.
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U terms.
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V terms.
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)), // %4
    "m"(kABGRToV),  // %5
    "m"(kABGRToU),  // %6
    "m"(kAddUV128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1352 
RGBAToUVRow_SSSE3(const uint8 * src_rgba0,int src_stride_rgba,uint8 * dst_u,uint8 * dst_v,int width)1353 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1354                        uint8* dst_u, uint8* dst_v, int width) {
1355   asm volatile (
1356     "movdqa    %5,%%xmm3                       \n"
1357     "movdqa    %6,%%xmm4                       \n"
1358     "movdqa    %7,%%xmm5                       \n"
1359     "sub       %1,%2                           \n"
1360     LABELALIGN
1361   "1:                                          \n"
1362     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1363     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1364     "pavgb     %%xmm7,%%xmm0                   \n"
1365     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1366     MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1367     "pavgb     %%xmm7,%%xmm1                   \n"
1368     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1369     MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1370     "pavgb     %%xmm7,%%xmm2                   \n"
1371     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1372     MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1373     "pavgb     %%xmm7,%%xmm6                   \n"
1374 
1375     "lea       " MEMLEA(0x40,0) ",%0           \n"
1376     "movdqa    %%xmm0,%%xmm7                   \n"
1377     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1378     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1379     "pavgb     %%xmm7,%%xmm0                   \n"
1380     "movdqa    %%xmm2,%%xmm7                   \n"
1381     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1382     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1383     "pavgb     %%xmm7,%%xmm2                   \n"
1384     "movdqa    %%xmm0,%%xmm1                   \n"
1385     "movdqa    %%xmm2,%%xmm6                   \n"
1386     "pmaddubsw %%xmm4,%%xmm0                   \n"
1387     "pmaddubsw %%xmm4,%%xmm2                   \n"
1388     "pmaddubsw %%xmm3,%%xmm1                   \n"
1389     "pmaddubsw %%xmm3,%%xmm6                   \n"
1390     "phaddw    %%xmm2,%%xmm0                   \n"
1391     "phaddw    %%xmm6,%%xmm1                   \n"
1392     "psraw     $0x8,%%xmm0                     \n"
1393     "psraw     $0x8,%%xmm1                     \n"
1394     "packsswb  %%xmm1,%%xmm0                   \n"
1395     "paddb     %%xmm5,%%xmm0                   \n"
1396     "movlps    %%xmm0," MEMACCESS(1) "         \n"
1397     MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1398     "lea       " MEMLEA(0x8,1) ",%1            \n"
1399     "sub       $0x10,%3                        \n"
1400     "jg        1b                              \n"
1401   : "+r"(src_rgba0),       // %0
1402     "+r"(dst_u),           // %1
1403     "+r"(dst_v),           // %2
1404     "+rm"(width)           // %3
1405   : "r"((intptr_t)(src_stride_rgba)), // %4
1406     "m"(kRGBAToV),  // %5
1407     "m"(kRGBAToU),  // %6
1408     "m"(kAddUV128)  // %7
1409   : "memory", "cc", NACL_R14
1410     "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1411   );
1412 }
1413 
1414 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1415 
// Coefficient and bias tables consumed by the YUVTORGB macros below.
// The trailing numbers are the byte offsets used by MEMACCESS2() in the
// inline assembly, so the field order, types and alignment must not change.
struct YuvConstants {
  lvec8 kUVToB;     // 0
  lvec8 kUVToG;     // 32
  lvec8 kUVToR;     // 64
  lvec16 kUVBiasB;  // 96
  lvec16 kUVBiasG;  // 128
  lvec16 kUVBiasR;  // 160
  lvec16 kYToRgb;   // 192
};
1425 
// BT.601 YUV to RGB reference
//  R = (Y - 16) * 1.164              - V * -1.596
//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
//  B = (Y - 16) * 1.164 - U * -2.018

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B, scaled to 6-bit fixed point.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)

// BT601 constants for YUV to RGB.
// Coefficients are interleaved {U, V} byte pairs to pair with the
// interleaved UV bytes produced by the READ* macros (for pmaddubsw);
// biases and the Y scale are per-lane 16 bit values.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
1460 
// BT601 constants for NV21 where chroma plane is VU instead of UV.
// Same values as kYuvConstants but with the U and V byte positions
// swapped in the coefficient vectors; biases are unchanged.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

// The BT601 helper macros are local to the table definitions above.
#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
1484 
// JPEG YUV to RGB reference
// *  R = Y                - V * -1.40200
// *  G = Y - U *  0.34414 - V *  0.71414
// *  B = Y - U * -1.77200

// Y contribution to R,G,B.  Scale and bias.
// JPEG Y is full range, so there is no -16 offset; YGBJ is rounding only.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32  /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414  * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */

// Bias values to round, and to subtract 128 from U and V.
#define BBJ (UBJ * 128             + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ             (VRJ * 128 + YGBJ)

// JPEG constants for YUV to RGB.  Same layout as kYuvConstants.
// NOTE(review): this table is non-static (exported), unlike the BT601
// tables above — presumably referenced from another translation unit.
YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};

#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
1535 
// The READ* macros below load chroma into xmm0 as interleaved U,V byte
// pairs, ready for pmaddubsw against the interleaved coefficient vectors.
// [v_buf] has been pre-biased by subtracting [u_buf] (callers do
// "sub %[u_buf],%[v_buf]"), so a single index register addresses both
// planes via MEMOPREG.  Comments inside the macro bodies must use /* */
// style only; a // comment would swallow the rest of the spliced macro.

// Read 8 UV from 444 (one U and one V byte per pixel).
#define READYUV444                                                             \
    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"

// Read 4 UV from 422, upsample to 8 UV (each UV pair covers 2 pixels).
#define READYUV422                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"

// Read 2 UV from 411, upsample to 8 UV (each UV pair covers 4 pixels).
#define READYUV411                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "punpckldq  %%xmm0,%%xmm0                                   \n"

// Read 4 UV from NV12 (already interleaved), upsample to 8 UV.
#define READNV12                                                               \
    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"
1565 
// Convert 8 pixels: 8 UV and 8 Y.
// Input: xmm0 holds 8 interleaved U,V pairs (from a READ* macro).
// Uses the YuvConstants table at the given base register: pmaddubsw the UV
// pairs against the B/G/R coefficient rows (offsets 0/32/64), subtract from
// the bias rows (96/128/160), then add the Y contribution scaled by
// kYToRgb (192) via pmulhuw.  Results are shifted down 6 bits and packed.
// Output: xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes
// (duplicated into both halves by packuswb).  Also advances [y_buf] by 8
// and clobbers xmm3.
#define YUVTORGB(YuvConstants)                                                 \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "movdqa     %%xmm0,%%xmm3                                   \n"            \
    "movdqa     " MEMACCESS2(96, [YuvConstants]) ",%%xmm0       \n"            \
    "pmaddubsw  " MEMACCESS([YuvConstants]) ",%%xmm1            \n"            \
    "psubw      %%xmm1,%%xmm0                                   \n"            \
    "movdqa     " MEMACCESS2(128, [YuvConstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%xmm2       \n"            \
    "psubw      %%xmm2,%%xmm1                                   \n"            \
    "movdqa     " MEMACCESS2(160, [YuvConstants]) ",%%xmm2      \n"            \
    "pmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%xmm3       \n"            \
    "psubw      %%xmm3,%%xmm2                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "punpcklbw  %%xmm3,%%xmm3                                   \n"            \
    "pmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%xmm3      \n"            \
    "paddsw     %%xmm3,%%xmm0                                   \n"            \
    "paddsw     %%xmm3,%%xmm1                                   \n"            \
    "paddsw     %%xmm3,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
1593 
// The STORE* macros interleave the B (xmm0), G (xmm1), R (xmm2) bytes
// produced by YUVTORGB into 32 bytes of packed output and advance the
// destination pointer by 0x20.

// Store 8 ARGB values. Assumes XMM5 is zero (callers pcmpeqb it to 0xff
// for the alpha channel before the loop).
#define STOREARGB                                                              \
    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
    "movdqa     %%xmm0,%%xmm1                                    \n"           \
    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"

// Store 8 BGRA values. Assumes XMM5 is zero (re-sets it to 0xff itself).
#define STOREBGRA                                                              \
    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
    "punpcklbw %%xmm0,%%xmm1                                     \n"           \
    "punpcklbw %%xmm2,%%xmm5                                     \n"           \
    "movdqa    %%xmm5,%%xmm0                                     \n"           \
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \
    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_bgra]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra]         \n"

// Store 8 ABGR values. Assumes XMM5 is zero.
#define STOREABGR                                                              \
    "punpcklbw %%xmm1,%%xmm2                                     \n"           \
    "punpcklbw %%xmm5,%%xmm0                                     \n"           \
    "movdqa    %%xmm2,%%xmm1                                     \n"           \
    "punpcklwd %%xmm0,%%xmm2                                     \n"           \
    "punpckhwd %%xmm0,%%xmm1                                     \n"           \
    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \
    "movdqu    %%xmm1," MEMACCESS2(0x10, [dst_abgr]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr]         \n"

// Store 8 RGBA values. Assumes XMM5 is zero (re-sets it to 0xff itself).
#define STORERGBA                                                              \
    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
    "movdqa    %%xmm5,%%xmm0                                     \n"           \
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
1639 
// Convert one row of I444 (full-resolution Y, U and V planes) to ARGB
// using BT601 constants.  Alpha is set to 0xff.  The loop handles 8
// pixels per pass, so width is expected to be a multiple of 8.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff alpha bytes
    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1665 
// TODO(fbarchard): Consider putting masks into constants.
// Convert one row of I422 to 24-bit RGB24 using BT601 constants.  The
// ARGB pixels from YUVTORGB are re-packed to 3 bytes/pixel with pshufb
// masks, emitting 24 bytes per 8 pixels.  width should be a multiple of 8.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 int width) {
  asm volatile (
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    // Interleave B/G/R into two ARGB-ordered registers, then shuffle the
    // alpha bytes out and splice the two 12-byte runs together.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
// TODO(fbarchard): Make width a register for 32 bit.
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
  );
}
1710 
// Convert one row of I422 to 24-bit RAW (RGB24 with R and B swapped)
// using BT601 constants.  Identical structure to I422ToRGB24Row_SSSE3 but
// with the RAW shuffle masks.  width should be a multiple of 8.
void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_raw,
                               int width) {
  asm volatile (
    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    // Pack 8 ARGB pixels down to 24 bytes of RAW.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
// TODO(fbarchard): Make width a register for 32 bit.
#if defined(__i386__) && defined(__pic__)
    [width]"+m"(width)    // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
    [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
  );
}
1754 
// Convert one row of I422 (half-width U/V planes) to ARGB using BT601
// constants.  Alpha is set to 0xff.  width should be a multiple of 8.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff alpha bytes
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1780 
// Convert one row of J422 (full-range JPEG 4:2:2) to ARGB.  Same loop as
// I422ToARGBRow_SSSE3 but uses the kYuvJConstants table.  Alpha is 0xff;
// width should be a multiple of 8.
void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff alpha bytes
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1806 
// Convert one row of I411 (quarter-width U/V planes) to ARGB using BT601
// constants.  Alpha is set to 0xff.  width should be a multiple of 8.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff alpha bytes
    LABELALIGN
  "1:                                          \n"
    READYUV411
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1832 
// Convert one row of NV12 (Y plane plus interleaved UV plane) to ARGB
// using BT601 constants.  Alpha is 0xff; width should be a multiple of 8.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff alpha bytes
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  // Does not use r14.
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1855 
// Convert one row of NV21 (Y plane plus interleaved VU plane) to ARGB.
// Identical to NV12ToARGBRow_SSSE3 except it uses kYvuConstants, whose
// coefficients are V,U-swapped to match the VU byte order.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff alpha bytes
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB(kYuvConstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
  // Does not use r14.
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1878 
// Convert one row of I422 to BGRA using BT601 constants.  Alpha is 0xff
// (set inside STOREBGRA); width should be a multiple of 8.
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREBGRA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1904 
// Convert one row of I422 to ABGR using BT601 constants.  Alpha is 0xff;
// width should be a multiple of 8.
void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff alpha bytes
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STOREABGR
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1930 
// Convert one row of I422 to RGBA using BT601 constants.  Alpha is 0xff
// (set inside STORERGBA); width should be a multiple of 8.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"  // v_buf as offset from u_buf
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(kYuvConstants)
    STORERGBA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
1956 
1957 #endif  // HAS_I422TOARGBROW_SSSE3
1958 
// Read 8 UV from 422, upsample to 16 UV.
// AVX2 counterpart of READYUV422: loads 8 U and 8 V bytes, interleaves
// them into ymm0 as U,V pairs (vpermq fixes the 128-bit lane split), and
// duplicates each pair across two pixels.  [v_buf] is pre-biased by
// [u_buf] as in the SSSE3 path.
#define READYUV422_AVX2                                                        \
    "vmovq       " MEMACCESS([u_buf]) ",%%xmm0                      \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"
1967 
1968 // Convert 16 pixels: 16 UV and 16 Y.
// Input: ymm0 holds 16 interleaved UV pairs (from READYUV422_AVX2); 16 Y
// bytes are loaded from [y_buf] (which this macro also advances by 16).
// YuvConstants is addressed as consecutive 32-byte tables: offsets 0/32/64
// are UV multipliers (vpmaddubsw), 96/128/160 are bias values the UV
// contribution is subtracted from (vpsubw), 192 is the Y scale (vpmulhuw).
// Output: ymm0 = B, ymm1 = G, ymm2 = R as unsigned bytes (ordering inferred
// from the kUVToB base operand and the weave steps in the callers below).
// Clobbers ymm3.
#define YUVTORGB_AVX2(YuvConstants)                                            \
1970     "vpmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2   \n"        \
1971     "vpmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1   \n"        \
1972     "vpmaddubsw  " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0        \n"        \
1973     "vmovdqu     " MEMACCESS2(160, [YuvConstants]) ",%%ymm3         \n"        \
1974     "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
1975     "vmovdqu     " MEMACCESS2(128, [YuvConstants]) ",%%ymm3         \n"        \
1976     "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
1977     "vmovdqu     " MEMACCESS2(96, [YuvConstants]) ",%%ymm3          \n"        \
1978     "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
1979     "vmovdqu     " MEMACCESS([y_buf]) ",%%xmm3                      \n"        \
1980     "lea         " MEMLEA(0x10, [y_buf]) ",%[y_buf]                 \n"        \
1981     "vpermq      $0xd8,%%ymm3,%%ymm3                                \n"        \
1982     "vpunpcklbw  %%ymm3,%%ymm3,%%ymm3                               \n"        \
1983     "vpmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3  \n"        \
1984     "vpaddsw     %%ymm3,%%ymm0,%%ymm0           \n"                            \
1985     "vpaddsw     %%ymm3,%%ymm1,%%ymm1           \n"                            \
1986     "vpaddsw     %%ymm3,%%ymm2,%%ymm2           \n"                            \
1987     "vpsraw      $0x6,%%ymm0,%%ymm0             \n"                            \
1988     "vpsraw      $0x6,%%ymm1,%%ymm1             \n"                            \
1989     "vpsraw      $0x6,%%ymm2,%%ymm2             \n"                            \
1990     "vpackuswb   %%ymm0,%%ymm0,%%ymm0           \n"                            \
1991     "vpackuswb   %%ymm1,%%ymm1,%%ymm1           \n"                            \
1992     "vpackuswb   %%ymm2,%%ymm2,%%ymm2           \n"
1993 
1994 #if defined(HAS_I422TOBGRAROW_AVX2)
1995 // 16 pixels
1996 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
// ymm5 is all ones (0xFF alpha bytes). Width decremented by 16 per loop;
// dst_bgra advances 64 bytes. "vzeroupper" avoids AVX->SSE transition
// penalties in any following SSE code.
I422ToBGRARow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_bgra,int width)1997 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
1998                                const uint8* u_buf,
1999                                const uint8* v_buf,
2000                                uint8* dst_bgra,
2001                                int width) {
2002   asm volatile (
2003     "sub       %[u_buf],%[v_buf]               \n"
2004     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2005     LABELALIGN
2006   "1:                                          \n"
2007     READYUV422_AVX2
2008     YUVTORGB_AVX2(kYuvConstants)
2009 
2010     // Step 3: Weave into BGRA
2011     "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB
2012     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2013     "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR
2014     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2015     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels
2016     "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels
2017 
2018     "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"
2019     "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
2020     "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
2021     "sub       $0x10,%[width]                  \n"
2022     "jg        1b                              \n"
2023     "vzeroupper                                \n"
2024   : [y_buf]"+r"(y_buf),    // %[y_buf]
2025     [u_buf]"+r"(u_buf),    // %[u_buf]
2026     [v_buf]"+r"(v_buf),    // %[v_buf]
2027     [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
2028     [width]"+rm"(width)    // %[width]
2029   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2030   : "memory", "cc", NACL_R14
2031     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2032   );
2033 }
2034 #endif  // HAS_I422TOBGRAROW_AVX2
2035 
2036 #if defined(HAS_I422TOARGBROW_AVX2)
2037 // 16 pixels
2038 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// ymm5 is all ones (0xFF alpha bytes). Processes 16 pixels per iteration
// (width -= 16, dst_argb += 64 bytes).
I422ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,int width)2039 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2040                                const uint8* u_buf,
2041                                const uint8* v_buf,
2042                                uint8* dst_argb,
2043                                int width) {
2044   asm volatile (
2045     "sub       %[u_buf],%[v_buf]               \n"
2046     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2047     LABELALIGN
2048   "1:                                          \n"
2049     READYUV422_AVX2
2050     YUVTORGB_AVX2(kYuvConstants)
2051 
2052     // Step 3: Weave into ARGB
2053     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
2054     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2055     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
2056     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2057     "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
2058     "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
2059 
2060     "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
2061     "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2062     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2063     "sub       $0x10,%[width]                  \n"
2064     "jg        1b                              \n"
2065     "vzeroupper                                \n"
2066   : [y_buf]"+r"(y_buf),    // %[y_buf]
2067     [u_buf]"+r"(u_buf),    // %[u_buf]
2068     [v_buf]"+r"(v_buf),    // %[v_buf]
2069     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2070     [width]"+rm"(width)    // %[width]
2071   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2072   : "memory", "cc", NACL_R14
2073     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2074   );
2075 }
2076 #endif  // HAS_I422TOARGBROW_AVX2
2077 
2078 #if defined(HAS_J422TOARGBROW_AVX2)
2079 // 16 pixels
2080 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Identical to I422ToARGBRow_AVX2 except that it passes kYuvJConstants
// (JPEG full-range coefficients — see kARGBToYJ etc. above) to the
// conversion macro instead of kYuvConstants.
J422ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,int width)2081 void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
2082                                const uint8* u_buf,
2083                                const uint8* v_buf,
2084                                uint8* dst_argb,
2085                                int width) {
2086   asm volatile (
2087     "sub       %[u_buf],%[v_buf]               \n"
2088     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2089     LABELALIGN
2090   "1:                                          \n"
2091     READYUV422_AVX2
2092     YUVTORGB_AVX2(kYuvConstants)
2093 
2094     // Step 3: Weave into ARGB
2095     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
2096     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2097     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
2098     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2099     "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
2100     "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
2101 
2102     "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
2103     "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2104     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2105     "sub       $0x10,%[width]                  \n"
2106     "jg        1b                              \n"
2107     "vzeroupper                                \n"
2108   : [y_buf]"+r"(y_buf),    // %[y_buf]
2109     [u_buf]"+r"(u_buf),    // %[u_buf]
2110     [v_buf]"+r"(v_buf),    // %[v_buf]
2111     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2112     [width]"+rm"(width)    // %[width]
2113   : [kYuvConstants]"r"(&kYuvJConstants.kUVToB)  // %[kYuvConstants]
2114   : "memory", "cc", NACL_R14
2115     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2116   );
2117 }
2118 #endif  // HAS_J422TOARGBROW_AVX2
2119 
2120 #if defined(HAS_I422TOABGRROW_AVX2)
2121 // 16 pixels
2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
// NOTE: the destination parameter is named dst_argb to match the signature
// family, but the weave below stores bytes in R,G,B,A memory order (ABGR).
I422ToABGRRow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,int width)2123 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
2124                                const uint8* u_buf,
2125                                const uint8* v_buf,
2126                                uint8* dst_argb,
2127                                int width) {
2128   asm volatile (
2129     "sub       %[u_buf],%[v_buf]               \n"
2130     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2131     LABELALIGN
2132   "1:                                          \n"
2133     READYUV422_AVX2
2134     YUVTORGB_AVX2(kYuvConstants)
2135 
2136     // Step 3: Weave into ABGR
2137     "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG
2138     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2139     "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA
2140     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2141     "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels
2142     "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels
2143     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2144     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2145     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2146     "sub       $0x10,%[width]                  \n"
2147     "jg        1b                              \n"
2148     "vzeroupper                                \n"
2149   : [y_buf]"+r"(y_buf),    // %[y_buf]
2150     [u_buf]"+r"(u_buf),    // %[u_buf]
2151     [v_buf]"+r"(v_buf),    // %[v_buf]
2152     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2153     [width]"+rm"(width)    // %[width]
2154   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2155   : "memory", "cc", NACL_R14
2156     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2157   );
2158 }
2159 #endif  // HAS_I422TOABGRROW_AVX2
2160 
2161 #if defined(HAS_I422TORGBAROW_AVX2)
2162 // 16 pixels
2163 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// AVX2 counterpart of I422ToRGBARow_SSSE3 above: 16 pixels per iteration
// instead of 8. ymm5 supplies the 0xFF alpha bytes.
I422ToRGBARow_AVX2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,int width)2164 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2165                                const uint8* u_buf,
2166                                const uint8* v_buf,
2167                                uint8* dst_argb,
2168                                int width) {
2169   asm volatile (
2170     "sub       %[u_buf],%[v_buf]               \n"
2171     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2172     LABELALIGN
2173   "1:                                          \n"
2174     READYUV422_AVX2
2175     YUVTORGB_AVX2(kYuvConstants)
2176 
2177     // Step 3: Weave into RGBA
2178     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"  // GR
2179     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2180     "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"  // AB
2181     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2182     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ABGR first 8 pixels
2183     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"  // ABGR next 8 pixels
2184     "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2185     "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2186     "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2187     "sub       $0x10,%[width]                  \n"
2188     "jg        1b                              \n"
2189     "vzeroupper                                \n"
2190   : [y_buf]"+r"(y_buf),    // %[y_buf]
2191     [u_buf]"+r"(u_buf),    // %[u_buf]
2192     [v_buf]"+r"(v_buf),    // %[v_buf]
2193     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2194     [width]"+rm"(width)    // %[width]
2195   : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
2196   : "memory", "cc", NACL_R14
2197     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2198   );
2199 }
2200 #endif  // HAS_I422TORGBAROW_AVX2
2201 
2202 #ifdef HAS_I400TOARGBROW_SSE2
// Converts a row of 8-bit luma (I400/grayscale) to ARGB, 8 pixels per loop:
// each output pixel has B=G=R=(y-16)*1.164 and A=0xFF.
// xmm2 = broadcast 0x4a35 (18997 ~= 1.164 in 2.14 fixed point scale used by
// pmulhuw on the byte-duplicated Y), xmm3 = broadcast 0x0488 (1160 = the
// 16*1.164 offset), xmm4 = 0xFF000000 alpha mask (pcmpeqb + pslld 24).
// psubusw saturates at zero, so y < 16 clamps to black.
I400ToARGBRow_SSE2(const uint8 * y_buf,uint8 * dst_argb,int width)2203 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2204   asm volatile (
2205     "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
2206     "movd      %%eax,%%xmm2                    \n"
2207     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
2208     "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
2209     "movd      %%eax,%%xmm3                    \n"
2210     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
2211     "pcmpeqb   %%xmm4,%%xmm4                   \n"
2212     "pslld     $0x18,%%xmm4                    \n"
2213     LABELALIGN
2214   "1:                                          \n"
2215     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2216     "movq      " MEMACCESS(0) ",%%xmm0         \n"
2217     "lea       " MEMLEA(0x8,0) ",%0            \n"
2218     "punpcklbw %%xmm0,%%xmm0                   \n"
2219     "pmulhuw   %%xmm2,%%xmm0                   \n"
2220     "psubusw   %%xmm3,%%xmm0                   \n"
2221     "psrlw     $6, %%xmm0                      \n"
2222     "packuswb  %%xmm0,%%xmm0                   \n"
2223 
2224     // Step 2: Weave into ARGB
2225     "punpcklbw %%xmm0,%%xmm0                   \n"
2226     "movdqa    %%xmm0,%%xmm1                   \n"
2227     "punpcklwd %%xmm0,%%xmm0                   \n"
2228     "punpckhwd %%xmm1,%%xmm1                   \n"
2229     "por       %%xmm4,%%xmm0                   \n"
2230     "por       %%xmm4,%%xmm1                   \n"
2231     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2232     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2233     "lea       " MEMLEA(0x20,1) ",%1           \n"
2234 
2235     "sub       $0x8,%2                         \n"
2236     "jg        1b                              \n"
2237   : "+r"(y_buf),     // %0
2238     "+r"(dst_argb),  // %1
2239     "+rm"(width)     // %2
2240   :
2241   : "memory", "cc", "eax"
2242     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2243   );
2244 }
2245 #endif  // HAS_I400TOARGBROW_SSE2
2246 
2247 #ifdef HAS_I400TOARGBROW_AVX2
2248 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2249 // note: vpunpcklbw mutates and vpackuswb unmutates.
// AVX2 counterpart of I400ToARGBRow_SSE2 above (same constants, 16 pixels
// per loop). The two constant annotations below were previously swapped;
// corrected here to match the SSE2 version: 0x4a35 is the 1.164 scale and
// 0x0488 is the 1160 (= 16 * 1.164 scaled) offset.
I400ToARGBRow_AVX2(const uint8 * y_buf,uint8 * dst_argb,int width)2250 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2251   asm volatile (
2252     "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
2253     "vmovd      %%eax,%%xmm2                   \n"
2254     "vbroadcastss %%xmm2,%%ymm2                \n"
2255     "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
2256     "vmovd      %%eax,%%xmm3                   \n"
2257     "vbroadcastss %%xmm3,%%ymm3                \n"
2258     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
2259     "vpslld     $0x18,%%ymm4,%%ymm4            \n"
2260 
2261     LABELALIGN
2262   "1:                                          \n"
2263     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2264     "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
2265     "lea        " MEMLEA(0x10,0) ",%0          \n"
2266     "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2267     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
2268     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
2269     "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
2270     "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
2271     "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
2272     "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
2273     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2274     "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
2275     "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
2276     "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
2277     "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
2278     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2279     "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
2280     "lea       " MEMLEA(0x40,1) ",%1           \n"
2281     "sub        $0x10,%2                       \n"
2282     "jg        1b                              \n"
2283     "vzeroupper                                \n"
2284   : "+r"(y_buf),     // %0
2285     "+r"(dst_argb),  // %1
2286     "+rm"(width)     // %2
2287   :
2288   : "memory", "cc", "eax"
2289     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2290   );
2291 }
2292 #endif  // HAS_I400TOARGBROW_AVX2
2293 
2294 #ifdef HAS_MIRRORROW_SSSE3
2295 // Shuffle table for reversing the bytes.
2296 static uvec8 kShuffleMirror = {
2297   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2298 };
2299 
// Reverses a row of bytes: 16 bytes per loop, read from the tail of src
// (address src + width - 16, then width counts down) and byte-reversed with
// pshufb before being stored at the advancing dst pointer.
MirrorRow_SSSE3(const uint8 * src,uint8 * dst,int width)2300 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2301   intptr_t temp_width = (intptr_t)(width);
2302   asm volatile (
2303     "movdqa    %3,%%xmm5                       \n"
2304     LABELALIGN
2305   "1:                                          \n"
2306     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2307     "pshufb    %%xmm5,%%xmm0                   \n"
2308     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2309     "lea       " MEMLEA(0x10,1) ",%1           \n"
2310     "sub       $0x10,%2                        \n"
2311     "jg        1b                              \n"
2312   : "+r"(src),  // %0
2313     "+r"(dst),  // %1
2314     "+r"(temp_width)  // %2
2315   : "m"(kShuffleMirror) // %3
2316   : "memory", "cc", NACL_R14
2317     "xmm0", "xmm5"
2318   );
2319 }
2320 #endif  // HAS_MIRRORROW_SSSE3
2321 
2322 #ifdef HAS_MIRRORROW_AVX2
// AVX2 byte-reversal of a row, 32 bytes per loop. Reuses kShuffleMirror
// broadcast into both 128-bit lanes; vpshufb reverses within each lane and
// vpermq $0x4e swaps the two lanes to complete the full 32-byte reversal.
MirrorRow_AVX2(const uint8 * src,uint8 * dst,int width)2323 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2324   intptr_t temp_width = (intptr_t)(width);
2325   asm volatile (
2326     "vbroadcastf128 %3,%%ymm5                  \n"
2327     LABELALIGN
2328   "1:                                          \n"
2329     MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
2330     "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
2331     "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
2332     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2333     "lea       " MEMLEA(0x20,1) ",%1           \n"
2334     "sub       $0x20,%2                        \n"
2335     "jg        1b                              \n"
2336     "vzeroupper                                \n"
2337   : "+r"(src),  // %0
2338     "+r"(dst),  // %1
2339     "+r"(temp_width)  // %2
2340   : "m"(kShuffleMirror) // %3
2341   : "memory", "cc", NACL_R14
2342     "xmm0", "xmm5"
2343   );
2344 }
2345 #endif  // HAS_MIRRORROW_AVX2
2346 
2347 #ifdef HAS_MIRRORROW_SSE2
// SSE2 byte-reversal of a row (no pshufb available): swaps bytes within
// each 16-bit word (psllw/psrlw/por), reverses words within each 64-bit
// half (pshuflw/pshufhw $0x1b) and swaps the two halves (pshufd $0x4e).
// 16 bytes per loop, reading backwards from the end of src.
MirrorRow_SSE2(const uint8 * src,uint8 * dst,int width)2348 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2349   intptr_t temp_width = (intptr_t)(width);
2350   asm volatile (
2351     LABELALIGN
2352   "1:                                          \n"
2353     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
2354     "movdqa    %%xmm0,%%xmm1                   \n"
2355     "psllw     $0x8,%%xmm0                     \n"
2356     "psrlw     $0x8,%%xmm1                     \n"
2357     "por       %%xmm1,%%xmm0                   \n"
2358     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
2359     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
2360     "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
2361     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2362     "lea       " MEMLEA(0x10,1)",%1            \n"
2363     "sub       $0x10,%2                        \n"
2364     "jg        1b                              \n"
2365   : "+r"(src),  // %0
2366     "+r"(dst),  // %1
2367     "+r"(temp_width)  // %2
2368   :
2369   : "memory", "cc", NACL_R14
2370     "xmm0", "xmm1"
2371   );
2372 }
2373 #endif  // HAS_MIRRORROW_SSE2
2374 
2375 #ifdef HAS_MIRRORROW_UV_SSSE3
2376 // Shuffle table for reversing the bytes of UV channels.
// Even source bytes (U) land reversed in the low 8 bytes, odd bytes (V)
// reversed in the high 8 bytes, so one pshufb both mirrors and
// de-interleaves.
2377 static uvec8 kShuffleMirrorUV = {
2378   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2379 };
// Mirrors an interleaved UV row while splitting it into separate U and V
// planes, 8 UV pairs per loop. src is pointed at the last 16-byte chunk
// (src + width*2 - 16) and walks backwards; dst_v is re-expressed as an
// offset from dst_u ("sub %1,%2") so movhpd can store V relative to the
// U pointer.
MirrorUVRow_SSSE3(const uint8 * src,uint8 * dst_u,uint8 * dst_v,int width)2380 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2381                        int width) {
2382   intptr_t temp_width = (intptr_t)(width);
2383   asm volatile (
2384     "movdqa    %4,%%xmm1                       \n"
2385     "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
2386     "sub       %1,%2                           \n"
2387     LABELALIGN
2388   "1:                                          \n"
2389     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2390     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2391     "pshufb    %%xmm1,%%xmm0                   \n"
2392     "movlpd    %%xmm0," MEMACCESS(1) "         \n"
2393     MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
2394     "lea       " MEMLEA(0x8,1) ",%1            \n"
2395     "sub       $8,%3                           \n"
2396     "jg        1b                              \n"
2397   : "+r"(src),      // %0
2398     "+r"(dst_u),    // %1
2399     "+r"(dst_v),    // %2
2400     "+r"(temp_width)  // %3
2401   : "m"(kShuffleMirrorUV)  // %4
2402   : "memory", "cc", NACL_R14
2403     "xmm0", "xmm1"
2404   );
2405 }
2406 #endif  // HAS_MIRRORROW_UV_SSSE3
2407 
2408 #ifdef HAS_ARGBMIRRORROW_SSE2
2409 
// Mirrors a row of 32-bit ARGB pixels, 4 pixels per loop: src is pointed
// at the last 4 pixels (src + width*4 - 16) and walks backwards while
// pshufd $0x1b reverses the 4 dwords within the register.
ARGBMirrorRow_SSE2(const uint8 * src,uint8 * dst,int width)2410 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2411   intptr_t temp_width = (intptr_t)(width);
2412   asm volatile (
2413     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
2414     LABELALIGN
2415   "1:                                          \n"
2416     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2417     "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
2418     "lea       " MEMLEA(-0x10,0) ",%0          \n"
2419     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2420     "lea       " MEMLEA(0x10,1) ",%1           \n"
2421     "sub       $0x4,%2                         \n"
2422     "jg        1b                              \n"
2423   : "+r"(src),  // %0
2424     "+r"(dst),  // %1
2425     "+r"(temp_width)  // %2
2426   :
2427   : "memory", "cc"
2428     , "xmm0"
2429   );
2430 }
2431 #endif  // HAS_ARGBMIRRORROW_SSE2
2432 
2433 #ifdef HAS_ARGBMIRRORROW_AVX2
2434 // Shuffle table for reversing the bytes.
// Dword permutation indices for vpermd: reverses the 8 pixels in a ymm.
2435 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2436   7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2437 };
// Mirrors a row of 32-bit ARGB pixels with AVX2, 8 pixels per loop: loads
// from the tail of src (width counts down) and reverses pixel order with a
// single cross-lane vpermd.
ARGBMirrorRow_AVX2(const uint8 * src,uint8 * dst,int width)2438 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2439   intptr_t temp_width = (intptr_t)(width);
2440   asm volatile (
2441     "vmovdqu    %3,%%ymm5                      \n"
2442     LABELALIGN
2443   "1:                                          \n"
2444     VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2445     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2446     "lea        " MEMLEA(0x20,1) ",%1          \n"
2447     "sub        $0x8,%2                        \n"
2448     "jg         1b                             \n"
2449     "vzeroupper                                \n"
2450   : "+r"(src),  // %0
2451     "+r"(dst),  // %1
2452     "+r"(temp_width)  // %2
2453   : "m"(kARGBShuffleMirror_AVX2) // %3
2454   : "memory", "cc", NACL_R14
2455     "xmm0", "xmm5"
2456   );
2457 }
2458 #endif  // HAS_ARGBMIRRORROW_AVX2
2459 
2460 #ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a UV plane into separate U and V rows, 32 pairs per loop.
// ymm5 = 0x00FF word mask isolates the U (even) bytes; vpsrlw extracts the
// V (odd) bytes; vpermq fixes lane order after the in-lane vpackuswb.
// dst_v is re-expressed as an offset from dst_u ("sub %1,%2") so one base
// register serves both stores.
SplitUVRow_AVX2(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int pix)2461 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2462   asm volatile (
2463     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
2464     "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
2465     "sub        %1,%2                            \n"
2466     LABELALIGN
2467   "1:                                            \n"
2468     "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
2469     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
2470     "lea        " MEMLEA(0x40,0) ",%0            \n"
2471     "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
2472     "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
2473     "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
2474     "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
2475     "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
2476     "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
2477     "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
2478     "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
2479     "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
2480     MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
2481     "lea        " MEMLEA(0x20,1) ",%1            \n"
2482     "sub        $0x20,%3                         \n"
2483     "jg         1b                               \n"
2484     "vzeroupper                                  \n"
2485   : "+r"(src_uv),     // %0
2486     "+r"(dst_u),      // %1
2487     "+r"(dst_v),      // %2
2488     "+r"(pix)         // %3
2489   :
2490   : "memory", "cc", NACL_R14
2491     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2492   );
2493 }
2494 #endif  // HAS_SPLITUVROW_AVX2
2495 
2496 #ifdef HAS_SPLITUVROW_SSE2
// SSE2 version of SplitUVRow: de-interleaves a UV plane into separate U
// and V rows, 16 pairs per loop. xmm5 = 0x00FF word mask keeps U (even)
// bytes; psrlw extracts V (odd) bytes; packuswb narrows words back to
// bytes. dst_v is stored relative to dst_u after "sub %1,%2".
SplitUVRow_SSE2(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int pix)2497 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2498   asm volatile (
2499     "pcmpeqb    %%xmm5,%%xmm5                    \n"
2500     "psrlw      $0x8,%%xmm5                      \n"
2501     "sub        %1,%2                            \n"
2502     LABELALIGN
2503   "1:                                            \n"
2504     "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
2505     "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
2506     "lea        " MEMLEA(0x20,0) ",%0            \n"
2507     "movdqa     %%xmm0,%%xmm2                    \n"
2508     "movdqa     %%xmm1,%%xmm3                    \n"
2509     "pand       %%xmm5,%%xmm0                    \n"
2510     "pand       %%xmm5,%%xmm1                    \n"
2511     "packuswb   %%xmm1,%%xmm0                    \n"
2512     "psrlw      $0x8,%%xmm2                      \n"
2513     "psrlw      $0x8,%%xmm3                      \n"
2514     "packuswb   %%xmm3,%%xmm2                    \n"
2515     "movdqu     %%xmm0," MEMACCESS(1) "          \n"
2516     MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
2517     "lea        " MEMLEA(0x10,1) ",%1            \n"
2518     "sub        $0x10,%3                         \n"
2519     "jg         1b                               \n"
2520   : "+r"(src_uv),     // %0
2521     "+r"(dst_u),      // %1
2522     "+r"(dst_v),      // %2
2523     "+r"(pix)         // %3
2524   :
2525   : "memory", "cc", NACL_R14
2526     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2527   );
2528 }
2529 #endif  // HAS_SPLITUVROW_SSE2
2530 
2531 #ifdef HAS_MERGEUVROW_AVX2
// Interleaves separate U and V rows into a UV plane, 32 pairs per loop.
// src_v is read relative to src_u after "sub %0,%1". vpunpcklbw/hbw work
// per 128-bit lane, so the four vextractf128 stores write the lanes back
// in correct memory order.
MergeUVRow_AVX2(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)2532 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2533                      int width) {
2534   asm volatile (
2535     "sub       %0,%1                             \n"
2536     LABELALIGN
2537   "1:                                            \n"
2538     "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
2539     MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
2540     "lea       " MEMLEA(0x20,0) ",%0             \n"
2541     "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
2542     "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
2543     "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
2544     "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2545     "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2546     "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2547     "lea       " MEMLEA(0x40,2) ",%2             \n"
2548     "sub       $0x20,%3                          \n"
2549     "jg        1b                                \n"
2550     "vzeroupper                                  \n"
2551   : "+r"(src_u),     // %0
2552     "+r"(src_v),     // %1
2553     "+r"(dst_uv),    // %2
2554     "+r"(width)      // %3
2555   :
2556   : "memory", "cc", NACL_R14
2557     "xmm0", "xmm1", "xmm2"
2558   );
2559 }
2560 #endif  // HAS_MERGEUVROW_AVX2
2561 
2562 #ifdef HAS_MERGEUVROW_SSE2
// SSE2 version of MergeUVRow: interleaves separate U and V rows into a UV
// plane, 16 pairs per loop, via punpcklbw/punpckhbw. src_v is read
// relative to src_u after "sub %0,%1".
MergeUVRow_SSE2(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)2563 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2564                      int width) {
2565   asm volatile (
2566     "sub       %0,%1                             \n"
2567     LABELALIGN
2568   "1:                                            \n"
2569     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
2570     MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
2571     "lea       " MEMLEA(0x10,0) ",%0             \n"
2572     "movdqa    %%xmm0,%%xmm2                     \n"
2573     "punpcklbw %%xmm1,%%xmm0                     \n"
2574     "punpckhbw %%xmm1,%%xmm2                     \n"
2575     "movdqu    %%xmm0," MEMACCESS(2) "           \n"
2576     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
2577     "lea       " MEMLEA(0x20,2) ",%2             \n"
2578     "sub       $0x10,%3                          \n"
2579     "jg        1b                                \n"
2580   : "+r"(src_u),     // %0
2581     "+r"(src_v),     // %1
2582     "+r"(dst_uv),    // %2
2583     "+r"(width)      // %3
2584   :
2585   : "memory", "cc", NACL_R14
2586     "xmm0", "xmm1", "xmm2"
2587   );
2588 }
2589 #endif  // HAS_MERGEUVROW_SSE2
2590 
2591 #ifdef HAS_COPYROW_SSE2
// Copies count bytes, 32 per loop, with unaligned SSE2 loads/stores.
// No tail handling: callers are expected to pass counts in multiples of
// 32 (standard libyuv row convention — TODO(review) confirm at call sites).
CopyRow_SSE2(const uint8 * src,uint8 * dst,int count)2592 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2593   asm volatile (
2594     LABELALIGN
2595   "1:                                          \n"
2596     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2597     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2598     "lea       " MEMLEA(0x20,0) ",%0           \n"
2599     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2600     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2601     "lea       " MEMLEA(0x20,1) ",%1           \n"
2602     "sub       $0x20,%2                        \n"
2603     "jg        1b                              \n"
2604   : "+r"(src),   // %0
2605     "+r"(dst),   // %1
2606     "+r"(count)  // %2
2607   :
2608   : "memory", "cc"
2609     , "xmm0", "xmm1"
2610   );
2611 }
2612 #endif  // HAS_COPYROW_SSE2
2613 
2614 #ifdef HAS_COPYROW_AVX
// Copies count bytes, 64 per loop, with unaligned AVX loads/stores.
// Same no-tail convention as CopyRow_SSE2 above (counts in multiples
// of 64 expected).
CopyRow_AVX(const uint8 * src,uint8 * dst,int count)2615 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2616   asm volatile (
2617     LABELALIGN
2618   "1:                                          \n"
2619     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2620     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2621     "lea       " MEMLEA(0x40,0) ",%0           \n"
2622     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
2623     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
2624     "lea       " MEMLEA(0x40,1) ",%1           \n"
2625     "sub       $0x40,%2                        \n"
2626     "jg        1b                              \n"
2627   : "+r"(src),   // %0
2628     "+r"(dst),   // %1
2629     "+r"(count)  // %2
2630   :
2631   : "memory", "cc"
2632     , "xmm0", "xmm1"
2633   );
2634 }
2635 #endif  // HAS_COPYROW_AVX
2636 
2637 #ifdef HAS_COPYROW_ERMS
2638 // Multiple of 1.
// Byte copy via "rep movsb" — fast on CPUs with Enhanced REP MOVSB (ERMS).
// Handles any width (no alignment or multiple requirement). Uses the
// fixed rsi/rdi/rcx string-instruction registers via +S/+D/+c constraints.
CopyRow_ERMS(const uint8 * src,uint8 * dst,int width)2639 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2640   size_t width_tmp = (size_t)(width);
2641   asm volatile (
2642     "rep movsb " MEMMOVESTRING(0,1) "          \n"
2643   : "+S"(src),  // %0
2644     "+D"(dst),  // %1
2645     "+c"(width_tmp) // %2
2646   :
2647   : "memory", "cc"
2648   );
2649 }
2650 #endif  // HAS_COPYROW_ERMS
2651 
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copy only the alpha channel of |src| ARGB pixels into |dst| ARGB pixels,
// preserving the destination's RGB bytes.  8 pixels (32 bytes) per loop.
// xmm0 = 0xFF000000 per pixel (alpha mask), xmm1 = 0x00FFFFFF (RGB mask);
// the result is (src & alpha) | (dst & rgb).
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2
2687 
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// AVX2 version of ARGBCopyAlphaRow: copy alpha from |src| into |dst| while
// keeping dst's RGB.  16 pixels (64 bytes) per loop.  ymm0 = 0x00FFFFFF per
// pixel; vpblendvb selects the dst byte where the mask bit is set (RGB) and
// the src byte elsewhere (alpha).
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2
2716 
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Copy a row of Y (luma) bytes into the alpha channel of |dst| ARGB pixels,
// preserving dst's RGB.  8 pixels per loop: 8 Y bytes are widened so each Y
// ends up in byte 3 (alpha) of one dword.
// Note on xmm3: punpckhwd mixes uninitialized xmm3 words with xmm2's high
// words, but after punpcklbw each xmm2 word is (Y<<8)|Y, so the high byte of
// every dword comes from xmm2; the pand with 0xFF000000 (xmm0) then discards
// all garbage lanes.  xmm1 = 0x00FFFFFF keeps dst's RGB.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpckhwd %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm2,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
2754 
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// AVX2 version of ARGBCopyYToAlphaRow: 16 pixels per loop.  vpmovzxbd widens
// 8 Y bytes to dwords, vpslld $0x18 moves each Y into the alpha byte, and
// vpblendvb (mask ymm0 = 0x00FFFFFF per pixel) keeps dst's RGB bytes while
// taking the new alpha.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
2785 
#ifdef HAS_SETROW_X86
// Fill |width| bytes of |dst| with byte value |v8| using "rep stosl".
// The byte is replicated into a 32-bit pattern and stored a dword at a
// time; width >> 2 means only (width & ~3) bytes are written -- the
// remainder (width % 4 bytes) is intentionally left for other code paths.
void SetRow_X86(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32 v32 = v8 * 0x01010101;  // Duplicate byte to all bytes.
  asm volatile (
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}
2797 
// Fill |width| bytes of |dst| with byte value |v8| using "rep stosb"
// (fast on CPUs with Enhanced Rep MOVSB/STOSB).  Handles any byte count.
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosb " MEMSTORESTRING(al,0) "        \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v8)          // %2
    : "memory", "cc");
}
2807 
// Fill |width| ARGB pixels of |dst_argb| with the 32-bit value |v32| using
// "rep stosl".  Here |width| counts pixels (dwords), not bytes.
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
    : "+D"(dst_argb),  // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}
#endif  // HAS_SETROW_X86
2818 
#ifdef HAS_YUY2TOYROW_SSE2
// Extract the Y (luma) plane from packed YUY2 (Y0 U Y1 V ...).
// 16 pixels (32 bytes of YUY2) per loop; xmm5 = 0x00FF word mask keeps the
// even bytes (Y), and packuswb narrows the words back to 16 Y bytes.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
2844 
// Extract U and V planes from two rows of YUY2, averaging the rows
// vertically (pavgb) for 4:2:0 chroma subsampling.  16 pixels per loop
// produce 8 U and 8 V bytes.  After the Y bytes are shifted out (psrlw),
// the remaining UVUV words are split: even bytes -> U, odd bytes -> V.
// "sub %1,%2" turns dst_v into an offset from dst_u so one index register
// serves both output writes.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
2882 
// Extract U and V planes from a single row of YUY2 (4:2:2 -- no vertical
// averaging).  Same deinterleave scheme as YUY2ToUVRow_SSE2: shift out Y,
// then split even bytes to U and odd bytes to V; 16 pixels per loop.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
2916 
// Extract the Y plane from packed UYVY (U Y0 V Y1 ...).  In UYVY the luma
// occupies the odd bytes, so psrlw $0x8 moves it into place and packuswb
// narrows back to bytes.  16 pixels per loop.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
2939 
// Extract U and V planes from two rows of UYVY, averaging vertically for
// 4:2:0 chroma.  In UYVY the chroma occupies the even bytes, so the word
// mask (pand xmm5) keeps UV and discards Y; the UVUV stream is then split
// into U (even bytes) and V (odd bytes).  16 pixels per loop -> 8 U + 8 V.
// dst_v is addressed as an offset from dst_u ("sub %1,%2").
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
2977 
// Extract U and V planes from a single row of UYVY (4:2:2 -- no vertical
// averaging).  Even bytes (chroma) are kept via the word mask, then split
// into U and V planes; 16 pixels per loop.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_YUY2TOYROW_SSE2
3012 
#ifdef HAS_YUY2TOYROW_AVX2
// AVX2 version of YUY2ToYRow: extract Y from YUY2, 32 pixels (64 bytes)
// per loop.  vpermq $0xd8 reorders the 64-bit lanes after the in-lane
// vpackuswb so the output bytes are in memory order.
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
3040 
// AVX2 version of YUY2ToUVRow: extract U and V from two rows of YUY2 with
// vertical averaging (4:2:0).  32 pixels per loop produce 16 U + 16 V
// bytes.  vpermq $0xd8 fixes the cross-lane byte order after each in-lane
// vpackuswb; vextractf128 writes the low 128 bits of each plane.
// dst_v is addressed relative to dst_u ("sub %1,%2").
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3079 
// AVX2 version of YUY2ToUV422Row: extract U and V from a single row of
// YUY2 (no vertical averaging).  32 pixels per loop; same shift/mask/pack
// deinterleave as YUY2ToUVRow_AVX2.
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3116 
// AVX2 version of UYVYToYRow: extract Y (odd bytes) from UYVY via word
// shift + pack; 32 pixels per loop.  vpermq restores memory order after
// the in-lane pack.
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
// AVX2 version of UYVYToUVRow: extract U and V from two rows of UYVY with
// vertical averaging (4:2:0).  Chroma is in the even bytes, so vpand with
// the word mask keeps UV; the UVUV stream is then split into U and V
// planes.  32 pixels per loop -> 16 U + 16 V bytes; dst_v addressed
// relative to dst_u.
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    "sub       %1,%2                           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3180 
// AVX2 version of UYVYToUV422Row: extract U and V from a single row of
// UYVY (no vertical averaging).  32 pixels per loop; same mask/pack
// deinterleave as UYVYToUVRow_AVX2.
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_YUY2TOYROW_AVX2
3218 
#ifdef HAS_ARGBBLENDROW_SSE2
// Alpha-blend src_argb0 over src_argb1 into dst_argb, 4 pixels at a time
// with a 1-pixel tail loop.  Per pixel: dst = src0 + src1 * (256 - a0) / 256
// (unsigned-saturated add), and the result alpha is forced to 255
// ("por %%xmm4" sets the alpha byte of src0 before the add).
// Masks: xmm7 = 0x0001 words (rounds 255-a up to 256-a), xmm6 = 0x00FF
// words (even/low bytes), xmm5 = 0xFF00 words (odd/high bytes),
// xmm4 = 0xFF000000 dwords (alpha byte).
// "pxor %%xmm4,%%xmm3" flips the alpha byte, yielding 255-a; the
// psrlw/pshufhw/pshuflw sequence replicates that alpha into all 4 word
// lanes of each pixel for the pmullw multiplies of src1's low and high
// color bytes.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "41:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBBLENDROW_SSE2
3304 
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
// pshufb control that replicates the alpha byte of each ARGB pixel
// (source bytes 3, 7, 11, 15) into the low byte of both words of that
// pixel, zeroing the high bytes (0x80), yielding 8 alpha words for pmullw.
static uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
3311 
// Blend 4 pixels at a time: dst = src0 + src1 * (256 - a0) >> 8, where a0 is
// src0's alpha.  src0 is added at full value, i.e. it is presumably expected
// to be premultiplied (attenuated) -- NOTE(review): confirm against callers.
// Destination alpha is forced opaque (por %%xmm4).

// Same as SSE2, but replaces
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
//    pshuflw    xmm3, xmm3,0F5h
// with..
//    pshufb     xmm3, kShuffleAlpha // alpha

void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    // Constants:
    //   xmm7 = 0x0001 per word (bias so inverted alpha becomes 256 - a)
    //   xmm6 = 0x00ff per word (even-byte mask: B and R channels)
    //   xmm5 = 0xff00 per word (odd-byte mask: G and A channels)
    //   xmm4 = 0xff000000 per pixel (alpha byte)
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // Flip alpha bytes (a -> 255-a), broadcast them to words, then +1.
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    // Scale src1 even bytes (B,R) and odd bytes (G,A) by (256 - a).
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Restore remainder count (width % 4) and finish one pixel at a time.
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3400 #endif  // HAS_ARGBBLENDROW_SSSE3
3401 
3402 #ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time: multiply B, G and R by the pixel's own alpha
// (approximately v * a / 255), leaving the alpha channel unchanged.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm4 = 0xff000000 per pixel (alpha mask); xmm5 = 0x00ffffff (color mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Low 2 pixels: unpack each byte with itself (v -> word v*257), then
    // pshufhw/pshuflw $0xff broadcast the alpha word across each pixel.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High 2 pixels, same treatment.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Re-read the source to recover the unmodified alpha bytes.
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // Merge attenuated colors with the original alpha.
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    // xmm3 is listed but not used by this routine.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3444 #endif  // HAS_ARGBATTENUATEROW_SSE2
3445 
3446 #ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha.
// pshufb masks that replicate each pixel's alpha byte into all three color
// words (a,a pairs -> word a*257) and zero the alpha word (128u entries).
// kShuffleAlpha0 handles pixels 0-1 (alpha bytes 3 and 7) ...
static uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
};
// ... and kShuffleAlpha1 handles pixels 2-3 (alpha bytes 11 and 15).
static uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
};
// Attenuate 4 pixels at a time (SSSE3): multiply B, G and R by the pixel's
// alpha (approximately v * a / 255), alpha channel preserved.  Same result as
// the SSE2 version but uses pshufb tables to broadcast alpha.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm3 = 0xff000000 per pixel (alpha mask); xmm4/xmm5 = alpha-broadcast
    // shuffle masks for the low/high pixel pairs.
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Low 2 pixels: xmm0 = alpha words (a*257, alpha lane zero),
    // xmm1 = pixel bytes unpacked with self (v*257); pmulhuw multiplies.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    // High 2 pixels, same treatment.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Recover original alpha bytes and merge.
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3496 #endif  // HAS_ARGBATTENUATEROW_SSSE3
3497 
3498 #ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Applied AFTER vpunpck[lh]bw has widened bytes to words: byte offsets 6,7
// (and 14,15) select the alpha word of each pixel, broadcasting it over the
// three color words; 128u entries zero the alpha word itself.
static const uvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
// Attenuate 8 pixels at a time (AVX2): multiply B, G and R by the pixel's
// alpha (approximately v * a / 255), alpha channel preserved.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // ymm4 = alpha-broadcast shuffle mask; ymm5 = 0xff000000 per pixel.
    "vbroadcastf128 %3,%%ymm4                  \n"
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
    // Keep dst as an offset from src so one pointer increment serves both.
    "sub        %0,%1                          \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
    // Unpack bytes with self (v -> word v*257), broadcast alpha words,
    // multiply, then take the high byte of each product.
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    // Keep the original alpha bytes and merge them back in.
    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub        $0x8,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha_AVX2)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
3539 #endif  // HAS_ARGBATTENUATEROW_AVX2
3540 
3541 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time: scale B, G and R back up by a fixed-point
// reciprocal of the pixel's alpha, looked up in fixed_invtbl8 (inverse of
// ARGBAttenuateRow).  Each table entry is 4 bytes indexed by alpha;
// pshuflw $0x40 spreads entry word 0 over the three color lanes and entry
// word 1 over the alpha lane -- NOTE(review): presumably word 1 is a
// pass-through scale for alpha; confirm fixed_invtbl8's layout.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    // Pixel 0/1 alphas select reciprocal entries; build a multiplier vector.
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Same for pixels 2/3.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3585 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
3586 
3587 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Applied to the word-interleaved reciprocal vector: spreads each pixel's
// reciprocal word (bytes 0,1 / 8,9) across the three color words while
// bytes 6,7 / 14,15 keep the alpha-lane word in place.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// Unattenuate 8 pixels at a time (AVX2): scale B, G and R back up by
// fixed-point reciprocals of each pixel's alpha from fixed_invtbl8.
// The 8 table lookups are done with scalar movzb/vmovd loads instead of
// VPGATHERDD (see "replace VPGATHER" below) -- presumably because the
// gather instruction is slower on the targeted CPUs; TODO confirm.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;
  asm volatile (
    // Keep dst as an offset from src so one pointer increment serves both.
    "sub        %0,%1                          \n"
    "vbroadcastf128 %5,%%ymm5                  \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // replace VPGATHER: gather the 8 per-alpha reciprocal entries
    // (4 bytes each, indexed by the alpha bytes at offsets 3,7,...,0x1f)
    // and pack them into ymm3 as 8 dwords.
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
    // end of VPGATHER

    // Widen pixels to words (v*257), spread reciprocals per channel via
    // kUnattenShuffleAlpha_AVX2, multiply and repack.
    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub        $0x8,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8),  // %4
    "m"(kUnattenShuffleAlpha_AVX2)  // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3654 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
3655 
3656 #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Luma uses the JPeg full-range coefficients kARGBToYJ (B*15+G*75+R*38,
// rounded by kAddYJ64, >>7); the output pixel is (Y,Y,Y,A) with the
// original alpha preserved.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Compute 8 luma bytes into xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Extract the 8 alpha bytes into xmm2.
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave to (Y,Y) and (Y,A) pairs, then to (Y,Y,Y,A) pixels.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3701 #endif  // HAS_ARGBGRAYROW_SSSE3
3702 
3703 #ifdef HAS_ARGBSEPIAROW_SSSE3
// Sepia tone transform (7-bit fixed-point coefficients):
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.  Coefficients are stored in
// B,G,R,A byte order to match pmaddubsw on ARGB input; the alpha weight is 0.
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
3719 
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
// Each output channel is a weighted sum of B,G,R (see kARGBToSepia* above),
// computed with pmaddubsw/phaddw and >>7; alpha is preserved.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // New blue channel -> xmm0 (8 bytes).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New green channel, interleaved with blue into xmm0 (B,G pairs).
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // New red channel -> xmm5.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha -> xmm6, interleave to (R,A) pairs then full pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
3776 #endif  // HAS_ARGBSEPIAROW_SSSE3
3777 
3778 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with a caller-supplied 4x4 color matrix.
// Same structure as Sepia except the matrix is provided: matrix_argb holds
// 16 signed bytes, one row per output channel (B,G,R,A), applied with
// pmaddubsw/phaddsw and an arithmetic >>6 -- so coefficients are presumably
// 6-bit fixed point; TODO confirm against the public ARGBColorMatrix API.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    // Broadcast each 4-byte matrix row into its own register.
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Rows 0 and 1 (new B and G channels).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"
    // Rows 2 and 3 (new R and A channels).
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"
    // Interleave (B,G) with (R,A) pairs into full pixels and store.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3840 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3841 
3842 #ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes) in place:
//   v = (v * scale >> 16) * interval_size + interval_offset
// for B, G and R; alpha is preserved.  scale/interval values are broadcast
// as 16-bit words, so only their low 16 bits are used.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    // Broadcast scale (xmm2), interval_size (xmm3), interval_offset (xmm4)
    // into all 8 word lanes; xmm5 = 0; xmm6 = 0xff000000 alpha mask.
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Widen to words, (v * scale) >> 16 via pmulhuw, then * size + offset.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    // Keep the original alpha bytes in xmm7.
    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x4,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3890 #endif  // HAS_ARGBQUANTIZEROW_SSE2
3891 
3892 #ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time: multiply each channel of each pixel by the
// corresponding channel of the packed ARGB 'value' (result ~ p * v / 255,
// via pmulhuw on byte-duplicated words followed by >>8).
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    // Duplicate value's bytes into words (v -> v*257) and replicate to
    // both pixel halves of xmm2.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
3926 #endif  // HAS_ARGBSHADEROW_SSE2
3927 
3928 #ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time:
// dst = src0 * src1 / 255 (approximately), per channel.  src0 is widened
// by duplicating bytes (v*257) and src1 by zero-extension, so pmulhuw
// yields (v0*257*v1) >> 16 ~ v0*v1/255, already byte-ranged.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                  \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    // Register copies (movdqu reg,reg behaves like movdqa here).
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
3964 #endif  // HAS_ARGBMULTIPLYROW_SSE2
3965 
3966 #ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time (AVX2).
// Same math as the SSE2 version: dst = src0 * src1 / 255 approximately,
// via byte-duplicated src0 words, zero-extended src1 words, and vpmulhuw.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
// NOTE(review): the xmm clobbers are declared only when the compiler itself
// targets AVX2; if __AVX2__ is not defined the registers used by this asm
// are not reported to the compiler -- verify this is intentional.
#if defined(__AVX2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
4003 #endif  // HAS_ARGBMULTIPLYROW_AVX2
4004 
4005 #ifdef HAS_ARGBADDROW_SSE2
4006 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
ARGBAddRow_SSE2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4007 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4008                      uint8* dst_argb, int width) {
  // Per-byte saturating add (paddusb): dst = min(src0 + src1, 255).
  // Caller must supply width as a multiple of 4; there is no tail loop.
4009   asm volatile (
4010     // 4 pixel loop.
4011     LABELALIGN
4012   "1:                                          \n"
4013     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4014     "lea       " MEMLEA(0x10,0) ",%0           \n"
4015     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4016     "lea       " MEMLEA(0x10,1) ",%1           \n"
4017     "paddusb   %%xmm1,%%xmm0                   \n"
4018     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4019     "lea       " MEMLEA(0x10,2) ",%2           \n"
4020     "sub       $0x4,%3                         \n"
4021     "jg        1b                              \n"
4022   : "+r"(src_argb0),  // %0
4023     "+r"(src_argb1),  // %1
4024     "+r"(dst_argb),   // %2
4025     "+r"(width)       // %3
4026   :
4027   : "memory", "cc"
4028     , "xmm0", "xmm1"
4029   );
4030 }
4031 #endif  // HAS_ARGBADDROW_SSE2
4032 
4033 #ifdef HAS_ARGBADDROW_AVX2
4034 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBAddRow_AVX2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4035 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4036                      uint8* dst_argb, int width) {
  // Per-byte saturating add (vpaddusb) with a memory operand for src1.
  // Caller must supply width as a multiple of 8; there is no tail loop.
4037   asm volatile (
4038     // 8 pixel loop.
4039     LABELALIGN
4040   "1:                                          \n"
4041     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4042     "lea        " MEMLEA(0x20,0) ",%0          \n"
4043     "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4044     "lea        " MEMLEA(0x20,1) ",%1          \n"
4045     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4046     "lea        " MEMLEA(0x20,2) ",%2          \n"
4047     "sub        $0x8,%3                        \n"
4048     "jg        1b                              \n"
4049     "vzeroupper                                \n"  // avoid AVX->SSE transition penalty
4050   : "+r"(src_argb0),  // %0
4051     "+r"(src_argb1),  // %1
4052     "+r"(dst_argb),   // %2
4053     "+r"(width)       // %3
4054   :
4055   : "memory", "cc"
4056     , "xmm0"
4057   );
4058 }
4059 #endif  // HAS_ARGBADDROW_AVX2
4060 
4061 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4062 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
ARGBSubtractRow_SSE2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4063 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4064                           uint8* dst_argb, int width) {
  // Per-byte saturating subtract (psubusb): dst = max(src0 - src1, 0).
  // Caller must supply width as a multiple of 4; there is no tail loop.
4065   asm volatile (
4066     // 4 pixel loop.
4067     LABELALIGN
4068   "1:                                          \n"
4069     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4070     "lea       " MEMLEA(0x10,0) ",%0           \n"
4071     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4072     "lea       " MEMLEA(0x10,1) ",%1           \n"
4073     "psubusb   %%xmm1,%%xmm0                   \n"
4074     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4075     "lea       " MEMLEA(0x10,2) ",%2           \n"
4076     "sub       $0x4,%3                         \n"
4077     "jg        1b                              \n"
4078   : "+r"(src_argb0),  // %0
4079     "+r"(src_argb1),  // %1
4080     "+r"(dst_argb),   // %2
4081     "+r"(width)       // %3
4082   :
4083   : "memory", "cc"
4084     , "xmm0", "xmm1"
4085   );
4086 }
4087 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4088 
4089 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4090 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_AVX2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4091 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4092                           uint8* dst_argb, int width) {
  // Per-byte saturating subtract (vpsubusb): dst = max(src0 - src1, 0).
  // Caller must supply width as a multiple of 8; there is no tail loop.
4093   asm volatile (
4094     // 8 pixel loop.
4095     LABELALIGN
4096   "1:                                          \n"
4097     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
4098     "lea        " MEMLEA(0x20,0) ",%0          \n"
4099     "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4100     "lea        " MEMLEA(0x20,1) ",%1          \n"
4101     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4102     "lea        " MEMLEA(0x20,2) ",%2          \n"
4103     "sub        $0x8,%3                        \n"
4104     "jg        1b                              \n"
4105     "vzeroupper                                \n"  // avoid AVX->SSE transition penalty
4106   : "+r"(src_argb0),  // %0
4107     "+r"(src_argb1),  // %1
4108     "+r"(dst_argb),   // %2
4109     "+r"(width)       // %3
4110   :
4111   : "memory", "cc"
4112     , "xmm0"
4113   );
4114 }
4115 #endif  // HAS_ARGBSUBTRACTROW_AVX2
4116 
4117 #ifdef HAS_SOBELXROW_SSE2
4118 // SobelX as a matrix is
4119 // -1  0  1
4120 // -2  0  2
4121 // -1  0  1
SobelXRow_SSE2(const uint8 * src_y0,const uint8 * src_y1,const uint8 * src_y2,uint8 * dst_sobelx,int width)4122 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4123                     const uint8* src_y2, uint8* dst_sobelx, int width) {
  // For each pixel:
  //   sobelx = clamp255(|(r0[i]-r0[i+2]) + 2*(r1[i]-r1[i+2]) + (r2[i]-r2[i+2])|)
  // where r0/r1/r2 are the three input rows.  The subs below convert %1, %2
  // and %3 into offsets relative to %0 so one advancing pointer addresses
  // all four rows via base+index operands (also keeps NaCl addressing legal).
4124   asm volatile (
4125     "sub       %0,%1                           \n"
4126     "sub       %0,%2                           \n"
4127     "sub       %0,%3                           \n"
4128     "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for byte->word unpack
4129 
4130     // 8 pixel loop.
4131     LABELALIGN
4132   "1:                                          \n"
4133     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4134     "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
4135     "punpcklbw %%xmm5,%%xmm0                   \n"
4136     "punpcklbw %%xmm5,%%xmm1                   \n"
4137     "psubw     %%xmm1,%%xmm0                   \n"
4138     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4139     MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
4140     "punpcklbw %%xmm5,%%xmm1                   \n"
4141     "punpcklbw %%xmm5,%%xmm2                   \n"
4142     "psubw     %%xmm2,%%xmm1                   \n"
4143     MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
4144     MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
4145     "punpcklbw %%xmm5,%%xmm2                   \n"
4146     "punpcklbw %%xmm5,%%xmm3                   \n"
4147     "psubw     %%xmm3,%%xmm2                   \n"
4148     "paddw     %%xmm2,%%xmm0                   \n"
4149     "paddw     %%xmm1,%%xmm0                   \n"
4150     "paddw     %%xmm1,%%xmm0                   \n"  // middle row added twice = weight 2
4151     "pxor      %%xmm1,%%xmm1                   \n"
4152     "psubw     %%xmm0,%%xmm1                   \n"
4153     "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs(x) = max(x, -x)
4154     "packuswb  %%xmm0,%%xmm0                   \n"  // clamp words to [0,255]
4155     MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
4156     "lea       " MEMLEA(0x8,0) ",%0            \n"
4157     "sub       $0x8,%4                         \n"
4158     "jg        1b                              \n"
4159   : "+r"(src_y0),      // %0
4160     "+r"(src_y1),      // %1
4161     "+r"(src_y2),      // %2
4162     "+r"(dst_sobelx),  // %3
4163     "+r"(width)        // %4
4164   :
4165   : "memory", "cc", NACL_R14
4166     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4167   );
4168 }
4169 #endif  // HAS_SOBELXROW_SSE2
4170 
4171 #ifdef HAS_SOBELYROW_SSE2
4172 // SobelY as a matrix is
4173 // -1 -2 -1
4174 //  0  0  0
4175 //  1  2  1
SobelYRow_SSE2(const uint8 * src_y0,const uint8 * src_y1,uint8 * dst_sobely,int width)4176 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4177                     uint8* dst_sobely, int width) {
  // For each pixel (r0 = src_y0, r1 = src_y1):
  //   sobely = clamp255(|(r0[i]-r1[i]) + 2*(r0[i+1]-r1[i+1]) + (r0[i+2]-r1[i+2])|)
  // The subs below convert %1 and %2 into offsets relative to %0 so one
  // advancing pointer addresses both rows and the destination (NaCl-safe).
4178   asm volatile (
4179     "sub       %0,%1                           \n"
4180     "sub       %0,%2                           \n"
4181     "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for byte->word unpack
4182 
4183     // 8 pixel loop.
4184     LABELALIGN
4185   "1:                                          \n"
4186     "movq      " MEMACCESS(0) ",%%xmm0         \n"
4187     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
4188     "punpcklbw %%xmm5,%%xmm0                   \n"
4189     "punpcklbw %%xmm5,%%xmm1                   \n"
4190     "psubw     %%xmm1,%%xmm0                   \n"
4191     "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
4192     MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
4193     "punpcklbw %%xmm5,%%xmm1                   \n"
4194     "punpcklbw %%xmm5,%%xmm2                   \n"
4195     "psubw     %%xmm2,%%xmm1                   \n"
4196     "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
4197     MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
4198     "punpcklbw %%xmm5,%%xmm2                   \n"
4199     "punpcklbw %%xmm5,%%xmm3                   \n"
4200     "psubw     %%xmm3,%%xmm2                   \n"
4201     "paddw     %%xmm2,%%xmm0                   \n"
4202     "paddw     %%xmm1,%%xmm0                   \n"
4203     "paddw     %%xmm1,%%xmm0                   \n"  // center column added twice = weight 2
4204     "pxor      %%xmm1,%%xmm1                   \n"
4205     "psubw     %%xmm0,%%xmm1                   \n"
4206     "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs(x) = max(x, -x)
4207     "packuswb  %%xmm0,%%xmm0                   \n"  // clamp words to [0,255]
4208     MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
4209     "lea       " MEMLEA(0x8,0) ",%0            \n"
4210     "sub       $0x8,%3                         \n"
4211     "jg        1b                              \n"
4212   : "+r"(src_y0),      // %0
4213     "+r"(src_y1),      // %1
4214     "+r"(dst_sobely),  // %2
4215     "+r"(width)        // %3
4216   :
4217   : "memory", "cc", NACL_R14
4218     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4219   );
4220 }
4221 #endif  // HAS_SOBELYROW_SSE2
4222 
4223 #ifdef HAS_SOBELROW_SSE2
4224 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4225 // A = 255
4226 // R = Sobel
4227 // G = Sobel
4228 // B = Sobel
SobelRow_SSE2(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)4229 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4230                    uint8* dst_argb, int width) {
  // s = min(sobelx + sobely, 255) per pixel; each s byte is replicated into
  // B, G and R via two rounds of unpacks, then the alpha mask is ORed in.
  // Processes 16 grey pixels (64 output ARGB bytes) per iteration.
4231   asm volatile (
4232     "sub       %0,%1                           \n"  // %1 = sobely offset from sobelx
4233     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4234     "pslld     $0x18,%%xmm5                    \n"  // xmm5 = 0xff000000 per pixel (alpha)
4235 
4236     // 16 pixel loop.
4237     LABELALIGN
4238   "1:                                          \n"
4239     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4240     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4241     "lea       " MEMLEA(0x10,0) ",%0           \n"
4242     "paddusb   %%xmm1,%%xmm0                   \n"  // saturating sobelx + sobely
4243     "movdqa    %%xmm0,%%xmm2                   \n"
4244     "punpcklbw %%xmm0,%%xmm2                   \n"
4245     "punpckhbw %%xmm0,%%xmm0                   \n"
4246     "movdqa    %%xmm2,%%xmm1                   \n"
4247     "punpcklwd %%xmm2,%%xmm1                   \n"
4248     "punpckhwd %%xmm2,%%xmm2                   \n"
4249     "por       %%xmm5,%%xmm1                   \n"
4250     "por       %%xmm5,%%xmm2                   \n"
4251     "movdqa    %%xmm0,%%xmm3                   \n"
4252     "punpcklwd %%xmm0,%%xmm3                   \n"
4253     "punpckhwd %%xmm0,%%xmm0                   \n"
4254     "por       %%xmm5,%%xmm3                   \n"
4255     "por       %%xmm5,%%xmm0                   \n"
4256     "movdqu    %%xmm1," MEMACCESS(2) "         \n"
4257     "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
4258     "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
4259     "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
4260     "lea       " MEMLEA(0x40,2) ",%2           \n"
4261     "sub       $0x10,%3                        \n"
4262     "jg        1b                              \n"
4263   : "+r"(src_sobelx),  // %0
4264     "+r"(src_sobely),  // %1
4265     "+r"(dst_argb),    // %2
4266     "+r"(width)        // %3
4267   :
4268   : "memory", "cc", NACL_R14
4269     "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4270   );
4271 }
4272 #endif  // HAS_SOBELROW_SSE2
4273 
4274 #ifdef HAS_SOBELTOPLANEROW_SSE2
4275 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
SobelToPlaneRow_SSE2(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)4276 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4277                           uint8* dst_y, int width) {
  // dst = min(sobelx + sobely, 255) per byte, 16 pixels per iteration.
  // The sub turns %1 into an offset relative to %0 so one advancing pointer
  // addresses both source rows (NaCl-safe base+index addressing).
  // Fix: removed a dead pcmpeqb/pslld pair that set up an alpha mask in
  // xmm5 (copied from SobelRow_SSE2) — xmm5 was never read here and was
  // also missing from the clobber list, an undeclared register clobber.
4278   asm volatile (
4279     "sub       %0,%1                           \n"
4280 
4281     // 16 pixel loop.
4282     LABELALIGN
4283   "1:                                          \n"
4284     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4285     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4286     "lea       " MEMLEA(0x10,0) ",%0           \n"
4287     "paddusb   %%xmm1,%%xmm0                   \n"  // saturating sobelx + sobely
4288     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4289     "lea       " MEMLEA(0x10,2) ",%2           \n"
4290     "sub       $0x10,%3                        \n"
4291     "jg        1b                              \n"
4292   : "+r"(src_sobelx),  // %0
4293     "+r"(src_sobely),  // %1
4294     "+r"(dst_y),       // %2
4295     "+r"(width)        // %3
4296   :
4297   : "memory", "cc", NACL_R14
4298     "xmm0", "xmm1"
4299   );
4300 }
4303 #endif  // HAS_SOBELTOPLANEROW_SSE2
4304 
4305 #ifdef HAS_SOBELXYROW_SSE2
4306 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4307 // A = 255
4308 // R = Sobel X
4309 // G = Sobel
4310 // B = Sobel Y
SobelXYRow_SSE2(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)4311 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4312                      uint8* dst_argb, int width) {
  // xmm2 = min(sobelx + sobely, 255); xmm5 = all-ones supplies the 0xff
  // alpha bytes.  Two rounds of byte/word unpacks interleave the four
  // planes into BGRA order.  Processes 16 pixels (64 output bytes) per
  // iteration.
4313   asm volatile (
4314     "sub       %0,%1                           \n"  // %1 = sobely offset from sobelx
4315     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff bytes (alpha plane)
4316 
4317     // 16 pixel loop.
4318     LABELALIGN
4319   "1:                                          \n"
4320     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4321     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4322     "lea       " MEMLEA(0x10,0) ",%0           \n"
4323     "movdqa    %%xmm0,%%xmm2                   \n"
4324     "paddusb   %%xmm1,%%xmm2                   \n"  // xmm2 = saturated sobelx + sobely
4325     "movdqa    %%xmm0,%%xmm3                   \n"
4326     "punpcklbw %%xmm5,%%xmm3                   \n"  // sobelx with 0xff -> R,A pairs
4327     "punpckhbw %%xmm5,%%xmm0                   \n"
4328     "movdqa    %%xmm1,%%xmm4                   \n"
4329     "punpcklbw %%xmm2,%%xmm4                   \n"  // sobely with sobel -> B,G pairs
4330     "punpckhbw %%xmm2,%%xmm1                   \n"
4331     "movdqa    %%xmm4,%%xmm6                   \n"
4332     "punpcklwd %%xmm3,%%xmm6                   \n"  // combine into B,G,R,A dwords
4333     "punpckhwd %%xmm3,%%xmm4                   \n"
4334     "movdqa    %%xmm1,%%xmm7                   \n"
4335     "punpcklwd %%xmm0,%%xmm7                   \n"
4336     "punpckhwd %%xmm0,%%xmm1                   \n"
4337     "movdqu    %%xmm6," MEMACCESS(2) "         \n"
4338     "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
4339     "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
4340     "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
4341     "lea       " MEMLEA(0x40,2) ",%2           \n"
4342     "sub       $0x10,%3                        \n"
4343     "jg        1b                              \n"
4344   : "+r"(src_sobelx),  // %0
4345     "+r"(src_sobely),  // %1
4346     "+r"(dst_argb),    // %2
4347     "+r"(width)        // %3
4348   :
4349   : "memory", "cc", NACL_R14
4350     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4351   );
4352 }
4353 #endif  // HAS_SOBELXYROW_SSE2
4354 
4355 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4356 // Creates a table of cumulative sums where each value is a sum of all values
4357 // above and to the left of the value, inclusive of the value.
ComputeCumulativeSumRow_SSE2(const uint8 * row,int32 * cumsum,const int32 * previous_cumsum,int width)4358 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
4359                                   const int32* previous_cumsum, int width) {
  // cumsum[x] = previous_cumsum[x] + sum(row[0..x]), with each 4-byte pixel
  // expanded to 4 int32 lanes.  xmm0 carries the running horizontal sum
  // across iterations; xmm1 stays zero for the byte/word unpacks.
4360   asm volatile (
4361     "pxor      %%xmm0,%%xmm0                   \n"  // running sum = 0
4362     "pxor      %%xmm1,%%xmm1                   \n"  // zero for unpack
4363     "sub       $0x4,%3                         \n"
4364     "jl        49f                             \n"
4365     "test      $0xf,%1                         \n"  // fall back to 1-pixel loop if
4366     "jne       49f                             \n"  // cumsum is not 16-byte aligned
4367 
4368   // 4 pixel loop                              \n"
4369     LABELALIGN
4370   "40:                                         \n"
4371     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // 4 pixels -> xmm2..xmm5 as int32
4372     "lea       " MEMLEA(0x10,0) ",%0           \n"
4373     "movdqa    %%xmm2,%%xmm4                   \n"
4374     "punpcklbw %%xmm1,%%xmm2                   \n"
4375     "movdqa    %%xmm2,%%xmm3                   \n"
4376     "punpcklwd %%xmm1,%%xmm2                   \n"
4377     "punpckhwd %%xmm1,%%xmm3                   \n"
4378     "punpckhbw %%xmm1,%%xmm4                   \n"
4379     "movdqa    %%xmm4,%%xmm5                   \n"
4380     "punpcklwd %%xmm1,%%xmm4                   \n"
4381     "punpckhwd %%xmm1,%%xmm5                   \n"
4382     "paddd     %%xmm2,%%xmm0                   \n"  // accumulate pixel 0 into running sum
4383     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4384     "paddd     %%xmm0,%%xmm2                   \n"  // + previous row's cumsum
4385     "paddd     %%xmm3,%%xmm0                   \n"
4386     "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
4387     "paddd     %%xmm0,%%xmm3                   \n"
4388     "paddd     %%xmm4,%%xmm0                   \n"
4389     "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
4390     "paddd     %%xmm0,%%xmm4                   \n"
4391     "paddd     %%xmm5,%%xmm0                   \n"
4392     "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
4393     "lea       " MEMLEA(0x40,2) ",%2           \n"
4394     "paddd     %%xmm0,%%xmm5                   \n"
4395     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4396     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
4397     "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
4398     "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
4399     "lea       " MEMLEA(0x40,1) ",%1           \n"
4400     "sub       $0x4,%3                         \n"
4401     "jge       40b                             \n"
4402 
4403   "49:                                         \n"
4404     "add       $0x3,%3                         \n"  // restore remainder count (width % 4)
4405     "jl        19f                             \n"
4406 
4407   // 1 pixel loop                              \n"
4408     LABELALIGN
4409   "10:                                         \n"
4410     "movd      " MEMACCESS(0) ",%%xmm2         \n"
4411     "lea       " MEMLEA(0x4,0) ",%0            \n"
4412     "punpcklbw %%xmm1,%%xmm2                   \n"
4413     "punpcklwd %%xmm1,%%xmm2                   \n"
4414     "paddd     %%xmm2,%%xmm0                   \n"
4415     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4416     "lea       " MEMLEA(0x10,2) ",%2           \n"
4417     "paddd     %%xmm0,%%xmm2                   \n"
4418     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4419     "lea       " MEMLEA(0x10,1) ",%1           \n"
4420     "sub       $0x1,%3                         \n"
4421     "jge       10b                             \n"
4422 
4423   "19:                                         \n"
4424   : "+r"(row),  // %0
4425     "+r"(cumsum),  // %1
4426     "+r"(previous_cumsum),  // %2
4427     "+r"(width)  // %3
4428   :
4429   : "memory", "cc"
4430     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4431   );
4432 }
4433 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4434 
4435 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
CumulativeSumToAverageRow_SSE2(const int32 * topleft,const int32 * botleft,int width,int area,uint8 * dst,int count)4436 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4437                                     int width, int area, uint8* dst,
4438                                     int count) {
  // Emits `count` averaged pixels from an integral image (see
  // ComputeCumulativeSumRow).  Per pixel, the box sum is
  //   sum = topleft[i] - topleft[i+width] - botleft[i] + botleft[i+width]
  // and dst = sum / area.  xmm4 holds an approximate reciprocal of area
  // (rcpss).  For small areas (<= 0x80) a 16-bit fixed-point multiplier
  // (~65536/area, rounded via the +65535 bias) is built in xmm5 so the
  // cheap pmulhuw path can divide; larger areas use the float path.
4439   asm volatile (
4440     "movd      %5,%%xmm5                       \n"
4441     "cvtdq2ps  %%xmm5,%%xmm5                   \n"
4442     "rcpss     %%xmm5,%%xmm4                   \n"  // xmm4 ~= 1.0f / area
4443     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
4444     "sub       $0x4,%3                         \n"
4445     "jl        49f                             \n"
4446     "cmpl      $0x80,%5                        \n"  // choose fixed-point vs float path
4447     "ja        40f                             \n"
4448 
4449     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4450     "pcmpeqb   %%xmm6,%%xmm6                   \n"
4451     "psrld     $0x10,%%xmm6                    \n"  // xmm6 = 0x0000ffff per lane
4452     "cvtdq2ps  %%xmm6,%%xmm6                   \n"  // = 65535.0f
4453     "addps     %%xmm6,%%xmm5                   \n"
4454     "mulps     %%xmm4,%%xmm5                   \n"  // (area + 65535) / area
4455     "cvtps2dq  %%xmm5,%%xmm5                   \n"
4456     "packssdw  %%xmm5,%%xmm5                   \n"  // 16-bit multiplier for pmulhuw
4457 
4458   // 4 pixel small loop                        \n"
4459     LABELALIGN
4460   "4:                                         \n"
4461     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4462     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4463     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4464     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4465     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4466     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4467     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4468     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4469     "lea       " MEMLEA(0x40,0) ",%0           \n"
4470     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4471     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4472     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4473     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4474     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4475     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4476     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4477     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4478     "lea       " MEMLEA(0x40,1) ",%1           \n"
4479     "packssdw  %%xmm1,%%xmm0                   \n"
4480     "packssdw  %%xmm3,%%xmm2                   \n"
4481     "pmulhuw   %%xmm5,%%xmm0                   \n"  // sum * (65536/area) >> 16
4482     "pmulhuw   %%xmm5,%%xmm2                   \n"
4483     "packuswb  %%xmm2,%%xmm0                   \n"
4484     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4485     "lea       " MEMLEA(0x10,2) ",%2           \n"
4486     "sub       $0x4,%3                         \n"
4487     "jge       4b                              \n"
4488     "jmp       49f                             \n"
4489 
4490   // 4 pixel loop                              \n"
4491     LABELALIGN
4492   "40:                                         \n"
4493     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4494     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4495     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4496     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4497     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4498     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4499     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4500     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4501     "lea       " MEMLEA(0x40,0) ",%0           \n"
4502     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4503     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4504     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4505     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4506     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4507     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4508     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4509     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4510     "lea       " MEMLEA(0x40,1) ",%1           \n"
4511     "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // float path: sum * (1/area)
4512     "cvtdq2ps  %%xmm1,%%xmm1                   \n"
4513     "mulps     %%xmm4,%%xmm0                   \n"
4514     "mulps     %%xmm4,%%xmm1                   \n"
4515     "cvtdq2ps  %%xmm2,%%xmm2                   \n"
4516     "cvtdq2ps  %%xmm3,%%xmm3                   \n"
4517     "mulps     %%xmm4,%%xmm2                   \n"
4518     "mulps     %%xmm4,%%xmm3                   \n"
4519     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4520     "cvtps2dq  %%xmm1,%%xmm1                   \n"
4521     "cvtps2dq  %%xmm2,%%xmm2                   \n"
4522     "cvtps2dq  %%xmm3,%%xmm3                   \n"
4523     "packssdw  %%xmm1,%%xmm0                   \n"
4524     "packssdw  %%xmm3,%%xmm2                   \n"
4525     "packuswb  %%xmm2,%%xmm0                   \n"
4526     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4527     "lea       " MEMLEA(0x10,2) ",%2           \n"
4528     "sub       $0x4,%3                         \n"
4529     "jge       40b                             \n"
4530 
4531   "49:                                         \n"
4532     "add       $0x3,%3                         \n"  // restore remainder count (count % 4)
4533     "jl        19f                             \n"
4534 
4535   // 1 pixel loop                              \n"
4536     LABELALIGN
4537   "10:                                         \n"
4538     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4539     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4540     "lea       " MEMLEA(0x10,0) ",%0           \n"
4541     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4542     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4543     "lea       " MEMLEA(0x10,1) ",%1           \n"
4544     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4545     "mulps     %%xmm4,%%xmm0                   \n"
4546     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4547     "packssdw  %%xmm0,%%xmm0                   \n"
4548     "packuswb  %%xmm0,%%xmm0                   \n"
4549     "movd      %%xmm0," MEMACCESS(2) "         \n"
4550     "lea       " MEMLEA(0x4,2) ",%2            \n"
4551     "sub       $0x1,%3                         \n"
4552     "jge       10b                             \n"
4553   "19:                                         \n"
4554   : "+r"(topleft),  // %0
4555     "+r"(botleft),  // %1
4556     "+r"(dst),      // %2
4557     "+rm"(count)    // %3
4558   : "r"((intptr_t)(width)),  // %4
4559     "rm"(area)     // %5
4560   : "memory", "cc", NACL_R14
4561     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4562   );
4563 }
4564 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4565 
4566 #ifdef HAS_ARGBAFFINEROW_SSE2
4567 // Copy ARGB pixels from source image with slope to a row of destination.
4568 LIBYUV_API
ARGBAffineRow_SSE2(const uint8 * src_argb,int src_argb_stride,uint8 * dst_argb,const float * src_dudv,int width)4569 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4570                         uint8* dst_argb, const float* src_dudv, int width) {
  // src_dudv[0..1] is the starting (u,v) coordinate, src_dudv[2..3] the
  // per-pixel (du,dv) step.  %1 is repacked as (stride << 16) | 4 and
  // broadcast into xmm5 so that pmaddwd on packed (x,y) int16 pairs yields
  // the byte offset x * 4 + y * stride directly.  xmm2/xmm3 carry float
  // (u,v) for 4 pixels; pixels are fetched one movd at a time (a software
  // gather), using %1 and the `temp` register (%5) as offset scratch.
4571   intptr_t src_argb_stride_temp = src_argb_stride;
4572   intptr_t temp = 0;
4573   asm volatile (
4574     "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = start (u,v)
4575     "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = (du,dv)
4576     "shl       $0x10,%1                        \n"
4577     "add       $0x4,%1                         \n"  // %1 = (stride << 16) | 4
4578     "movd      %1,%%xmm5                       \n"
4579     "sub       $0x4,%4                         \n"
4580     "jl        49f                             \n"
4581 
4582     "pshufd    $0x44,%%xmm7,%%xmm7             \n"
4583     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4584     "movdqa    %%xmm2,%%xmm0                   \n"
4585     "addps     %%xmm7,%%xmm0                   \n"
4586     "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = uv for pixels 0,1
4587     "movdqa    %%xmm7,%%xmm4                   \n"
4588     "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = 2 * dudv
4589     "movdqa    %%xmm2,%%xmm3                   \n"
4590     "addps     %%xmm4,%%xmm3                   \n"  // xmm3 = uv for pixels 2,3
4591     "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = 4 * dudv (loop step)
4592 
4593   // 4 pixel loop                              \n"
4594     LABELALIGN
4595   "40:                                         \n"
4596     "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
4597     "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
4598     "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
4599     "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
4600     "movd      %%xmm0,%k1                      \n"
4601     "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate next offset into lane 0
4602     "movd      %%xmm0,%k5                      \n"
4603     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4604     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
4605     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4606     "punpckldq %%xmm6,%%xmm1                   \n"
4607     "addps     %%xmm4,%%xmm2                   \n"  // advance uv for pixels 0,1
4608     "movq      %%xmm1," MEMACCESS(2) "         \n"
4609     "movd      %%xmm0,%k1                      \n"
4610     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4611     "movd      %%xmm0,%k5                      \n"
4612     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4613     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4614     "punpckldq %%xmm6,%%xmm0                   \n"
4615     "addps     %%xmm4,%%xmm3                   \n"  // advance uv for pixels 2,3
4616     "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
4617     "lea       " MEMLEA(0x10,2) ",%2           \n"
4618     "sub       $0x4,%4                         \n"
4619     "jge       40b                             \n"
4620 
4621   "49:                                         \n"
4622     "add       $0x3,%4                         \n"  // restore remainder count (width % 4)
4623     "jl        19f                             \n"
4624 
4625   // 1 pixel loop                              \n"
4626     LABELALIGN
4627   "10:                                         \n"
4628     "cvttps2dq %%xmm2,%%xmm0                   \n"
4629     "packssdw  %%xmm0,%%xmm0                   \n"
4630     "pmaddwd   %%xmm5,%%xmm0                   \n"
4631     "addps     %%xmm7,%%xmm2                   \n"
4632     "movd      %%xmm0,%k1                      \n"
4633     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4634     "movd      %%xmm0," MEMACCESS(2) "         \n"
4635     "lea       " MEMLEA(0x04,2) ",%2           \n"
4636     "sub       $0x1,%4                         \n"
4637     "jge       10b                             \n"
4638   "19:                                         \n"
4639   : "+r"(src_argb),  // %0
4640     "+r"(src_argb_stride_temp),  // %1
4641     "+r"(dst_argb),  // %2
4642     "+r"(src_dudv),  // %3
4643     "+rm"(width),    // %4
4644     "+r"(temp)   // %5
4645   :
4646   : "memory", "cc", NACL_R14
4647     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4648   );
4649 }
4650 #endif  // HAS_ARGBAFFINEROW_SSE2
4651 
4652 #ifdef HAS_INTERPOLATEROW_SSSE3
4653 // Bilinear filter 16x2 -> 16x1
// Vertically blends two adjacent rows (src_ptr and src_ptr + src_stride)
// into one output row at dst_ptr, 16 bytes per loop iteration.
// source_y_fraction is halved ("shr %3") and then dispatched: 0 -> plain
// copy, 0x20/0x40/0x60 -> pavgb-based 75/25, 50/50 and 25/75 blends,
// anything else -> general pmaddubsw weighted blend.
// NOTE(review): the loops step 16 bytes with no tail handling, so
// dst_width appears to be assumed a multiple of 16 — confirm with callers.
InterpolateRow_SSSE3(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)4654 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4655                           ptrdiff_t src_stride, int dst_width,
4656                           int source_y_fraction) {
4657   asm volatile (
    // %0 = dst - src; all stores below address the destination as (%1,%0,1).
4658     "sub       %1,%0                           \n"
    // Halve the fraction and branch to the specialized blend loops.
4659     "shr       %3                              \n"
4660     "cmp       $0x0,%3                         \n"
4661     "je        100f                            \n"
4662     "cmp       $0x20,%3                        \n"
4663     "je        75f                             \n"
4664     "cmp       $0x40,%3                        \n"
4665     "je        50f                             \n"
4666     "cmp       $0x60,%3                        \n"
4667     "je        25f                             \n"
4668 
    // Broadcast xmm5 = byte pairs {128 - f, f} so that pmaddubsw computes
    // row0*(128-f) + row1*f per pixel channel (result later >> 7).
4669     "movd      %3,%%xmm0                       \n"
4670     "neg       %3                              \n"
4671     "add       $0x80,%3                        \n"
4672     "movd      %3,%%xmm5                       \n"
4673     "punpcklbw %%xmm0,%%xmm5                   \n"
4674     "punpcklwd %%xmm5,%%xmm5                   \n"
4675     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4676 
4677     // General purpose row blend.
4678     LABELALIGN
4679   "1:                                          \n"
4680     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4681     MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    // Interleave row0/row1 bytes so each word holds one {row0,row1} pair.
4682     "movdqa    %%xmm0,%%xmm1                   \n"
4683     "punpcklbw %%xmm2,%%xmm0                   \n"
4684     "punpckhbw %%xmm2,%%xmm1                   \n"
4685     "pmaddubsw %%xmm5,%%xmm0                   \n"
4686     "pmaddubsw %%xmm5,%%xmm1                   \n"
4687     "psrlw     $0x7,%%xmm0                     \n"
4688     "psrlw     $0x7,%%xmm1                     \n"
4689     "packuswb  %%xmm1,%%xmm0                   \n"
4690     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4691     "lea       " MEMLEA(0x10,1) ",%1           \n"
4692     "sub       $0x10,%2                        \n"
4693     "jg        1b                              \n"
4694     "jmp       99f                             \n"
4695 
    // Double pavgb toward row1 approximates the 25/75 weighting.
4696     // Blend 25 / 75.
4697     LABELALIGN
4698   "25:                                         \n"
4699     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4700     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4701     "pavgb     %%xmm1,%%xmm0                   \n"
4702     "pavgb     %%xmm1,%%xmm0                   \n"
4703     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4704     "lea       " MEMLEA(0x10,1) ",%1           \n"
4705     "sub       $0x10,%2                        \n"
4706     "jg        25b                             \n"
4707     "jmp       99f                             \n"
4708 
4709     // Blend 50 / 50.
4710     LABELALIGN
4711   "50:                                         \n"
4712     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4713     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4714     "pavgb     %%xmm1,%%xmm0                   \n"
4715     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4716     "lea       " MEMLEA(0x10,1) ",%1           \n"
4717     "sub       $0x10,%2                        \n"
4718     "jg        50b                             \n"
4719     "jmp       99f                             \n"
4720 
    // Same as the 25 case but with the row roles swapped (loads row0 into
    // xmm1, row1 into xmm0), weighting toward row0.
4721     // Blend 75 / 25.
4722     LABELALIGN
4723   "75:                                         \n"
4724     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4725     MEMOPREG(movdqu,0x00,1,4,1,xmm0)
4726     "pavgb     %%xmm1,%%xmm0                   \n"
4727     "pavgb     %%xmm1,%%xmm0                   \n"
4728     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4729     "lea       " MEMLEA(0x10,1) ",%1           \n"
4730     "sub       $0x10,%2                        \n"
4731     "jg        75b                             \n"
4732     "jmp       99f                             \n"
4733 
4734     // Blend 100 / 0 - Copy row unchanged.
4735     LABELALIGN
4736   "100:                                        \n"
4737     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4738     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4739     "lea       " MEMLEA(0x10,1) ",%1           \n"
4740     "sub       $0x10,%2                        \n"
4741     "jg        100b                            \n"
4742 
4743   "99:                                         \n"
4744   : "+r"(dst_ptr),    // %0
4745     "+r"(src_ptr),    // %1
4746     "+r"(dst_width),  // %2
4747     "+r"(source_y_fraction)  // %3
4748   : "r"((intptr_t)(src_stride))  // %4
4749   : "memory", "cc", NACL_R14
4750     "xmm0", "xmm1", "xmm2", "xmm5"
4751   );
4752 }
4753 #endif  // HAS_INTERPOLATEROW_SSSE3
4754 
4755 #ifdef HAS_INTERPOLATEROW_AVX2
4756 // Bilinear filter 32x2 -> 32x1
// AVX2 variant of InterpolateRow: blends two adjacent rows into one,
// 32 bytes per iteration. Same fraction dispatch as the SSSE3 version
// (halved fraction; 0 = copy, 0x20/0x40/0x60 = pavgb shortcuts, else a
// general vpmaddubsw blend). The 100%% copy path uses "rep movsb", which
// is why the operands are pinned to rdi/rsi/rcx ("+D"/"+S"/"+c").
// NOTE(review): loops step 32 bytes with no tail handling — dst_width is
// presumably a multiple of 32; confirm with callers.
InterpolateRow_AVX2(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)4757 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
4758                          ptrdiff_t src_stride, int dst_width,
4759                          int source_y_fraction) {
4760   asm volatile (
    // Dispatch on the halved fraction. The copy path needs the original
    // dst pointer, so the "sub %1,%0" adjustment happens only after the
    // 100%% test.
4761     "shr       %3                              \n"
4762     "cmp       $0x0,%3                         \n"
4763     "je        100f                            \n"
4764     "sub       %1,%0                           \n"
4765     "cmp       $0x20,%3                        \n"
4766     "je        75f                             \n"
4767     "cmp       $0x40,%3                        \n"
4768     "je        50f                             \n"
4769     "cmp       $0x60,%3                        \n"
4770     "je        25f                             \n"
4771 
    // Build ymm5 = broadcast byte pairs {128 - f, f} for vpmaddubsw.
4772     "vmovd      %3,%%xmm0                      \n"
4773     "neg        %3                             \n"
4774     "add        $0x80,%3                       \n"
4775     "vmovd      %3,%%xmm5                      \n"
4776     "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
4777     "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
4778     "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
4779     "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"
4780 
4781     // General purpose row blend.
4782     LABELALIGN
4783   "1:                                          \n"
4784     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4785     MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
4786     "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
4787     "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
4788     "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"
4789     "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
4790     "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
4791     "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
4792     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
4793     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4794     "lea       " MEMLEA(0x20,1) ",%1           \n"
4795     "sub       $0x20,%2                        \n"
4796     "jg        1b                              \n"
4797     "jmp       99f                             \n"
4798 
4799     // Blend 25 / 75.
4800     LABELALIGN
4801   "25:                                         \n"
4802     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4803     MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
4804     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4805     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4806     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4807     "lea       " MEMLEA(0x20,1) ",%1           \n"
4808     "sub       $0x20,%2                        \n"
4809     "jg        25b                             \n"
4810     "jmp       99f                             \n"
4811 
4812     // Blend 50 / 50.
4813     LABELALIGN
4814   "50:                                         \n"
4815     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
4816     VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
4817     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4818     "lea       " MEMLEA(0x20,1) ",%1           \n"
4819     "sub       $0x20,%2                        \n"
4820     "jg        50b                             \n"
4821     "jmp       99f                             \n"
4822 
4823     // Blend 75 / 25.
4824     LABELALIGN
4825   "75:                                         \n"
4826     "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"
4827     MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
4828     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4829     "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
4830     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4831     "lea       " MEMLEA(0x20,1) ",%1           \n"
4832     "sub       $0x20,%2                        \n"
4833     "jg        75b                             \n"
4834     "jmp       99f                             \n"
4835 
    // Copy path: rep movsb copies %2 bytes from rsi to rdi directly and
    // skips vzeroupper (no ymm registers were touched on this path).
4836     // Blend 100 / 0 - Copy row unchanged.
4837     LABELALIGN
4838   "100:                                        \n"
4839     "rep movsb " MEMMOVESTRING(1,0) "          \n"
4840     "jmp       999f                            \n"
4841 
4842   "99:                                         \n"
4843     "vzeroupper                                \n"
4844   "999:                                        \n"
4845   : "+D"(dst_ptr),    // %0
4846     "+S"(src_ptr),    // %1
4847     "+c"(dst_width),  // %2
4848     "+r"(source_y_fraction)  // %3
4849   : "r"((intptr_t)(src_stride))  // %4
4850   : "memory", "cc", NACL_R14
4851     "xmm0", "xmm1", "xmm2", "xmm5"
4852   );
4853 }
4854 #endif  // HAS_INTERPOLATEROW_AVX2
4855 
4856 #ifdef HAS_INTERPOLATEROW_SSE2
4857 // Bilinear filter 16x2 -> 16x1
// SSE2 fallback for InterpolateRow (no pmaddubsw available): same row
// blend and fraction dispatch as the SSSE3 version, but the general path
// widens bytes to words and computes row0 + ((row1 - row0) * 2f >> 16)
// via psubw/paddw/pmulhw.
// NOTE(review): loops step 16 bytes with no tail handling — dst_width is
// presumably a multiple of 16; confirm with callers.
InterpolateRow_SSE2(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)4858 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
4859                          ptrdiff_t src_stride, int dst_width,
4860                          int source_y_fraction) {
4861   asm volatile (
    // %0 = dst - src; stores address the destination as (%1,%0,1).
4862     "sub       %1,%0                           \n"
4863     "shr       %3                              \n"
4864     "cmp       $0x0,%3                         \n"
4865     "je        100f                            \n"
4866     "cmp       $0x20,%3                        \n"
4867     "je        75f                             \n"
4868     "cmp       $0x40,%3                        \n"
4869     "je        50f                             \n"
4870     "cmp       $0x60,%3                        \n"
4871     "je        25f                             \n"
4872 
    // xmm5 = replicated {128 - f, f} pairs (as in the SSSE3 version);
    // xmm4 = zero, used to unpack bytes to words below.
4873     "movd      %3,%%xmm0                       \n"
4874     "neg       %3                              \n"
4875     "add       $0x80,%3                        \n"
4876     "movd      %3,%%xmm5                       \n"
4877     "punpcklbw %%xmm0,%%xmm5                   \n"
4878     "punpcklwd %%xmm5,%%xmm5                   \n"
4879     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4880     "pxor      %%xmm4,%%xmm4                   \n"
4881 
4882     // General purpose row blend.
4883     LABELALIGN
4884   "1:                                          \n"
4885     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4886     MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
4887     "movdqa    %%xmm0,%%xmm1                   \n"
4888     "movdqa    %%xmm2,%%xmm3                   \n"
    // Widen both rows to 16-bit lanes.
4889     "punpcklbw %%xmm4,%%xmm2                   \n"
4890     "punpckhbw %%xmm4,%%xmm3                   \n"
4891     "punpcklbw %%xmm4,%%xmm0                   \n"
4892     "punpckhbw %%xmm4,%%xmm1                   \n"
    // diff = row1 - row0; doubled so pmulhw's >>16 yields diff * f >> 8.
4893     "psubw     %%xmm0,%%xmm2                   \n"
4894     "psubw     %%xmm1,%%xmm3                   \n"
4895     "paddw     %%xmm2,%%xmm2                   \n"
4896     "paddw     %%xmm3,%%xmm3                   \n"
4897     "pmulhw    %%xmm5,%%xmm2                   \n"
4898     "pmulhw    %%xmm5,%%xmm3                   \n"
4899     "paddw     %%xmm2,%%xmm0                   \n"
4900     "paddw     %%xmm3,%%xmm1                   \n"
4901     "packuswb  %%xmm1,%%xmm0                   \n"
4902     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4903     "lea       " MEMLEA(0x10,1) ",%1           \n"
4904     "sub       $0x10,%2                        \n"
4905     "jg        1b                              \n"
4906     "jmp       99f                             \n"
4907 
4908     // Blend 25 / 75.
4909     LABELALIGN
4910   "25:                                         \n"
4911     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4912     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
4913     "pavgb     %%xmm1,%%xmm0                   \n"
4914     "pavgb     %%xmm1,%%xmm0                   \n"
4915     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4916     "lea       " MEMLEA(0x10,1) ",%1           \n"
4917     "sub       $0x10,%2                        \n"
4918     "jg        25b                             \n"
4919     "jmp       99f                             \n"
4920 
4921     // Blend 50 / 50.
4922     LABELALIGN
4923   "50:                                         \n"
4924     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4925     MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
4926     "pavgb     %%xmm1,%%xmm0                   \n"
4927     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4928     "lea       " MEMLEA(0x10,1) ",%1           \n"
4929     "sub       $0x10,%2                        \n"
4930     "jg        50b                             \n"
4931     "jmp       99f                             \n"
4932 
4933     // Blend 75 / 25.
4934     LABELALIGN
4935   "75:                                         \n"
4936     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
4937     MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
4938     "pavgb     %%xmm1,%%xmm0                   \n"
4939     "pavgb     %%xmm1,%%xmm0                   \n"
4940     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4941     "lea       " MEMLEA(0x10,1) ",%1           \n"
4942     "sub       $0x10,%2                        \n"
4943     "jg        75b                             \n"
4944     "jmp       99f                             \n"
4945 
4946     // Blend 100 / 0 - Copy row unchanged.
4947     LABELALIGN
4948   "100:                                        \n"
4949     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4950     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
4951     "lea       " MEMLEA(0x10,1) ",%1           \n"
4952     "sub       $0x10,%2                        \n"
4953     "jg        100b                            \n"
4954 
4955   "99:                                         \n"
4956   : "+r"(dst_ptr),    // %0
4957     "+r"(src_ptr),    // %1
4958     "+r"(dst_width),  // %2
4959     "+r"(source_y_fraction)  // %3
4960   : "r"((intptr_t)(src_stride))  // %4
4961   : "memory", "cc", NACL_R14
4962     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4963   );
4964 }
4965 #endif  // HAS_INTERPOLATEROW_SSE2
4966 
4967 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each ARGB pixel using the 16-byte pshufb mask
// loaded from `shuffler`; processes 32 bytes (8 pixels) per iteration.
// NOTE(review): no tail handling — pix is presumably a multiple of 8;
// confirm with callers.
ARGBShuffleRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int pix)4969 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4970                           const uint8* shuffler, int pix) {
4971   asm volatile (
    // Load the shuffle control mask once, outside the loop.
4972     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
4973     LABELALIGN
4974   "1:                                          \n"
4975     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4976     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4977     "lea       " MEMLEA(0x20,0) ",%0           \n"
4978     "pshufb    %%xmm5,%%xmm0                   \n"
4979     "pshufb    %%xmm5,%%xmm1                   \n"
4980     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4981     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
4982     "lea       " MEMLEA(0x20,1) ",%1           \n"
4983     "sub       $0x8,%2                         \n"
4984     "jg        1b                              \n"
4985   : "+r"(src_argb),  // %0
4986     "+r"(dst_argb),  // %1
4987     "+r"(pix)        // %2
4988   : "r"(shuffler)    // %3
4989   : "memory", "cc"
4990     , "xmm0", "xmm1", "xmm5"
4991   );
4992 }
4993 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
4994 
4995 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4996 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 variant of ARGBShuffleRow: the 16-byte shuffler mask is broadcast
// to both 128-bit lanes of ymm5, then vpshufb reorders 64 bytes
// (16 pixels) per iteration.
// NOTE(review): no tail handling — pix is presumably a multiple of 16;
// confirm with callers.
ARGBShuffleRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int pix)4997 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4998                          const uint8* shuffler, int pix) {
4999   asm volatile (
    // Replicate the mask into both lanes (vpshufb works per 128-bit lane).
5000     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
5001     LABELALIGN
5002   "1:                                          \n"
5003     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
5004     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
5005     "lea       " MEMLEA(0x40,0) ",%0           \n"
5006     "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
5007     "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
5008     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
5009     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
5010     "lea       " MEMLEA(0x40,1) ",%1           \n"
5011     "sub       $0x10,%2                        \n"
5012     "jg        1b                              \n"
5013     "vzeroupper                                \n"
5014   : "+r"(src_argb),  // %0
5015     "+r"(dst_argb),  // %1
5016     "+r"(pix)        // %2
5017   : "r"(shuffler)    // %3
5018   : "memory", "cc"
5019     , "xmm0", "xmm1", "xmm5"
5020   );
5021 }
5022 #endif  // HAS_ARGBSHUFFLEROW_AVX2
5023 
5024 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5025 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback for ARGBShuffleRow (no pshufb). It reads the first four
// shuffler bytes as a little-endian dword and dispatches to a specialized
// 4-pixel loop for four recognized orders (the immediates 0x1b/0x39/0x93/
// 0xc6 are pshuflw/pshufhw controls for those orders); any other mask
// falls through to a scalar byte-by-byte lookup loop.
// pixel_temp is a scratch register pinned to rdx so %b2 (its low byte)
// can be used for the scalar byte moves.
ARGBShuffleRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int pix)5026 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5027                          const uint8* shuffler, int pix) {
5028   uintptr_t pixel_temp = 0u;
5029   asm volatile (
    // xmm5 = zero (used to widen bytes to words in the vector paths).
5030     "pxor      %%xmm5,%%xmm5                   \n"
    // Load shuffler[0..3] as a dword and test for the known patterns.
5031     "mov       " MEMACCESS(4) ",%k2            \n"
5032     "cmp       $0x3000102,%k2                  \n"
5033     "je        3012f                           \n"
5034     "cmp       $0x10203,%k2                    \n"
5035     "je        123f                            \n"
5036     "cmp       $0x30201,%k2                    \n"
5037     "je        321f                            \n"
5038     "cmp       $0x2010003,%k2                  \n"
5039     "je        2103f                           \n"
5040 
    // Generic path: 1 pixel per iteration, each output byte fetched via
    // src_argb[shuffler[i]].
5041     LABELALIGN
5042   "1:                                          \n"
5043     "movzb     " MEMACCESS(4) ",%2             \n"
5044     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5045     "mov       %b2," MEMACCESS(1) "            \n"
5046     "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
5047     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5048     "mov       %b2," MEMACCESS2(0x1,1) "       \n"
5049     "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
5050     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5051     "mov       %b2," MEMACCESS2(0x2,1) "       \n"
5052     "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
5053     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5054     "mov       %b2," MEMACCESS2(0x3,1) "       \n"
5055     "lea       " MEMLEA(0x4,0) ",%0            \n"
5056     "lea       " MEMLEA(0x4,1) ",%1            \n"
5057     "sub       $0x1,%3                         \n"
5058     "jg        1b                              \n"
5059     "jmp       99f                             \n"
5060 
    // Vector paths: widen to words, shuffle 4-word groups with
    // pshuflw/pshufhw, then pack back to bytes. 4 pixels per iteration.
5061     LABELALIGN
5062   "123:                                        \n"
5063     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5064     "lea       " MEMLEA(0x10,0) ",%0           \n"
5065     "movdqa    %%xmm0,%%xmm1                   \n"
5066     "punpcklbw %%xmm5,%%xmm0                   \n"
5067     "punpckhbw %%xmm5,%%xmm1                   \n"
5068     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
5069     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
5070     "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
5071     "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
5072     "packuswb  %%xmm1,%%xmm0                   \n"
5073     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5074     "lea       " MEMLEA(0x10,1) ",%1           \n"
5075     "sub       $0x4,%3                         \n"
5076     "jg        123b                            \n"
5077     "jmp       99f                             \n"
5078 
5079     LABELALIGN
5080   "321:                                        \n"
5081     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5082     "lea       " MEMLEA(0x10,0) ",%0           \n"
5083     "movdqa    %%xmm0,%%xmm1                   \n"
5084     "punpcklbw %%xmm5,%%xmm0                   \n"
5085     "punpckhbw %%xmm5,%%xmm1                   \n"
5086     "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
5087     "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
5088     "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
5089     "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
5090     "packuswb  %%xmm1,%%xmm0                   \n"
5091     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5092     "lea       " MEMLEA(0x10,1) ",%1           \n"
5093     "sub       $0x4,%3                         \n"
5094     "jg        321b                            \n"
5095     "jmp       99f                             \n"
5096 
5097     LABELALIGN
5098   "2103:                                       \n"
5099     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5100     "lea       " MEMLEA(0x10,0) ",%0           \n"
5101     "movdqa    %%xmm0,%%xmm1                   \n"
5102     "punpcklbw %%xmm5,%%xmm0                   \n"
5103     "punpckhbw %%xmm5,%%xmm1                   \n"
5104     "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
5105     "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
5106     "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
5107     "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
5108     "packuswb  %%xmm1,%%xmm0                   \n"
5109     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5110     "lea       " MEMLEA(0x10,1) ",%1           \n"
5111     "sub       $0x4,%3                         \n"
5112     "jg        2103b                           \n"
5113     "jmp       99f                             \n"
5114 
5115     LABELALIGN
5116   "3012:                                       \n"
5117     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5118     "lea       " MEMLEA(0x10,0) ",%0           \n"
5119     "movdqa    %%xmm0,%%xmm1                   \n"
5120     "punpcklbw %%xmm5,%%xmm0                   \n"
5121     "punpckhbw %%xmm5,%%xmm1                   \n"
5122     "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
5123     "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
5124     "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
5125     "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
5126     "packuswb  %%xmm1,%%xmm0                   \n"
5127     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5128     "lea       " MEMLEA(0x10,1) ",%1           \n"
5129     "sub       $0x4,%3                         \n"
5130     "jg        3012b                           \n"
5131 
5132   "99:                                         \n"
5133   : "+r"(src_argb),    // %0
5134     "+r"(dst_argb),    // %1
5135     "+d"(pixel_temp),  // %2
5136     "+r"(pix)         // %3
5137   : "r"(shuffler)      // %4
5138   : "memory", "cc", NACL_R14
5139     "xmm0", "xmm1", "xmm5"
5140   );
5141 }
5142 #endif  // HAS_ARGBSHUFFLEROW_SSE2
5143 
5144 #ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (separate Y, U, V planes) into interleaved YUY2
// (Y0 U0 Y1 V0 ...). Each iteration consumes 16 Y bytes and 8 U/8 V
// bytes and writes 32 output bytes.
I422ToYUY2Row_SSE2(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_frame,int width)5145 void I422ToYUY2Row_SSE2(const uint8* src_y,
5146                         const uint8* src_u,
5147                         const uint8* src_v,
5148                         uint8* dst_frame, int width) {
5149  asm volatile (
    // %2 = src_v - src_u so V is loaded as (%1,%2,1) below.
5150     "sub       %1,%2                             \n"
5151     LABELALIGN
5152   "1:                                            \n"
5153     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5154     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5155     "lea       " MEMLEA(0x8,1) ",%1              \n"
    // xmm2 = interleaved U0 V0 U1 V1 ...
5156     "punpcklbw %%xmm3,%%xmm2                     \n"
5157     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5158     "lea       " MEMLEA(0x10,0) ",%0             \n"
5159     "movdqa    %%xmm0,%%xmm1                     \n"
    // Interleave Y with UV: low half then high half -> Y U Y V ...
5160     "punpcklbw %%xmm2,%%xmm0                     \n"
5161     "punpckhbw %%xmm2,%%xmm1                     \n"
5162     "movdqu    %%xmm0," MEMACCESS(3) "           \n"
5163     "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
5164     "lea       " MEMLEA(0x20,3) ",%3             \n"
5165     "sub       $0x10,%4                          \n"
5166     "jg         1b                               \n"
5167     : "+r"(src_y),  // %0
5168       "+r"(src_u),  // %1
5169       "+r"(src_v),  // %2
5170       "+r"(dst_frame),  // %3
5171       "+rm"(width)  // %4
5172     :
5173     : "memory", "cc", NACL_R14
5174     "xmm0", "xmm1", "xmm2", "xmm3"
5175   );
5176 }
5177 #endif  // HAS_I422TOYUY2ROW_SSE2
5178 
5179 #ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...). Mirror of
// I422ToYUY2Row_SSE2 with the UV bytes placed before the Y bytes: each
// iteration consumes 16 Y and 8 U/8 V bytes and writes 32 output bytes.
I422ToUYVYRow_SSE2(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_frame,int width)5180 void I422ToUYVYRow_SSE2(const uint8* src_y,
5181                         const uint8* src_u,
5182                         const uint8* src_v,
5183                         uint8* dst_frame, int width) {
5184  asm volatile (
    // %2 = src_v - src_u so V is loaded as (%1,%2,1) below.
5185     "sub        %1,%2                            \n"
5186     LABELALIGN
5187   "1:                                            \n"
5188     "movq      " MEMACCESS(1) ",%%xmm2           \n"
5189     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5190     "lea       " MEMLEA(0x8,1) ",%1              \n"
    // xmm2 = interleaved U0 V0 U1 V1 ...
5191     "punpcklbw %%xmm3,%%xmm2                     \n"
5192     "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5193     "movdqa    %%xmm2,%%xmm1                     \n"
5194     "lea       " MEMLEA(0x10,0) ",%0             \n"
    // UV interleaved first -> U Y V Y ...
5195     "punpcklbw %%xmm0,%%xmm1                     \n"
5196     "punpckhbw %%xmm0,%%xmm2                     \n"
5197     "movdqu    %%xmm1," MEMACCESS(3) "           \n"
5198     "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
5199     "lea       " MEMLEA(0x20,3) ",%3             \n"
5200     "sub       $0x10,%4                          \n"
5201     "jg         1b                               \n"
5202     : "+r"(src_y),  // %0
5203       "+r"(src_u),  // %1
5204       "+r"(src_v),  // %2
5205       "+r"(dst_frame),  // %3
5206       "+rm"(width)  // %4
5207     :
5208     : "memory", "cc", NACL_R14
5209     "xmm0", "xmm1", "xmm2", "xmm3"
5210   );
5211 }
5212 #endif  // HAS_I422TOUYVYROW_SSE2
5213 
5214 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial C0 + C1*x + C2*x^2 + C3*x^3 to every ARGB
// channel value x, 2 pixels (8 channels) per iteration. `poly` points to
// four coefficient vectors at byte offsets 0x00 (C0), 0x10 (C1),
// 0x20 (C2) and 0x30 (C3); results are truncated and saturated back to
// bytes.
// NOTE(review): no tail handling — width is presumably even; confirm
// with callers.
ARGBPolynomialRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,const float * poly,int width)5215 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5216                             uint8* dst_argb, const float* poly,
5217                             int width) {
5218   asm volatile (
    // xmm3 = zero, used to widen bytes -> words -> dwords.
5219     "pxor      %%xmm3,%%xmm3                   \n"
5220 
5221     // 2 pixel loop.
5222     LABELALIGN
5223   "1:                                          \n"
5224     "movq      " MEMACCESS(0) ",%%xmm0         \n"
5225     "lea       " MEMLEA(0x8,0) ",%0            \n"
    // Widen the 8 channel bytes to two float vectors (xmm0 low, xmm4 high).
5226     "punpcklbw %%xmm3,%%xmm0                   \n"
5227     "movdqa    %%xmm0,%%xmm4                   \n"
5228     "punpcklwd %%xmm3,%%xmm0                   \n"
5229     "punpckhwd %%xmm3,%%xmm4                   \n"
5230     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
5231     "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep raw x in xmm1/xmm5; accumulate C0 + C1*x in xmm0/xmm4.
5232     "movdqa    %%xmm0,%%xmm1                   \n"
5233     "movdqa    %%xmm4,%%xmm5                   \n"
5234     "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
5235     "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
5236     "addps     " MEMACCESS(3) ",%%xmm0         \n"
5237     "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // xmm2/xmm6 = x*x; xmm1/xmm5 = x*x*x.
5238     "movdqa    %%xmm1,%%xmm2                   \n"
5239     "movdqa    %%xmm5,%%xmm6                   \n"
5240     "mulps     %%xmm1,%%xmm2                   \n"
5241     "mulps     %%xmm5,%%xmm6                   \n"
5242     "mulps     %%xmm2,%%xmm1                   \n"
5243     "mulps     %%xmm6,%%xmm5                   \n"
5244     "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
5245     "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
5246     "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
5247     "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
5248     "addps     %%xmm2,%%xmm0                   \n"
5249     "addps     %%xmm6,%%xmm4                   \n"
5250     "addps     %%xmm1,%%xmm0                   \n"
5251     "addps     %%xmm5,%%xmm4                   \n"
    // Truncate to int and pack with unsigned saturation down to bytes.
5252     "cvttps2dq %%xmm0,%%xmm0                   \n"
5253     "cvttps2dq %%xmm4,%%xmm4                   \n"
5254     "packuswb  %%xmm4,%%xmm0                   \n"
5255     "packuswb  %%xmm0,%%xmm0                   \n"
5256     "movq      %%xmm0," MEMACCESS(1) "         \n"
5257     "lea       " MEMLEA(0x8,1) ",%1            \n"
5258     "sub       $0x2,%2                         \n"
5259     "jg        1b                              \n"
5260   : "+r"(src_argb),  // %0
5261     "+r"(dst_argb),  // %1
5262     "+r"(width)      // %2
5263   : "r"(poly)        // %3
5264   : "memory", "cc"
5265     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5266   );
5267 }
5268 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5269 
5270 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 variant of ARGBPolynomialRow: evaluates
// C0 + C1*x + C2*x^2 + C3*x^3 per channel using vfmadd, 2 pixels
// (8 channels, one ymm of floats) per iteration. `poly` holds the four
// coefficient vectors at offsets 0x00/0x10/0x20/0x30, each broadcast to
// both lanes up front.
// NOTE(review): no tail handling — width is presumably even; confirm
// with callers.
ARGBPolynomialRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,const float * poly,int width)5271 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5272                             uint8* dst_argb, const float* poly,
5273                             int width) {
5274   asm volatile (
5275     "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
5276     "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5277     "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5278     "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
5279 
5280     // 2 pixel loop.
5281     LABELALIGN
5282   "1:                                          \n"
5283     "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
5284     "lea         " MEMLEA(0x8,0) ",%0          \n"
5285     "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
5286     "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
5287     "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
5288     "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
5289     "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
5290     "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    // Truncate to int, then narrow dwords->words->bytes with saturation.
5291     "vcvttps2dq  %%ymm0,%%ymm0                 \n"
5292     "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
5293     "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5294     "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
5295     "vmovq       %%xmm0," MEMACCESS(1) "       \n"
5296     "lea         " MEMLEA(0x8,1) ",%1          \n"
5297     "sub         $0x2,%2                       \n"
5298     "jg          1b                            \n"
5299     "vzeroupper                                \n"
5300   : "+r"(src_argb),  // %0
5301     "+r"(dst_argb),  // %1
5302     "+r"(width)      // %2
5303   : "r"(poly)        // %3
5304   : "memory", "cc",
5305     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5306   );
5307 }
5308 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5309 
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels in place with a color table.
// 'table_argb' holds 256 four-byte entries.  Each channel byte of a pixel
// selects an entry (scaled by 4) and is replaced by the byte at its own
// offset within that entry: byte 0 uses entry offset 0, byte 1 offset 1,
// byte 2 offset 2, byte 3 offset 3.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  // Scratch for the byte being remapped; "+d" pins it to edx/rdx so the
  // byte form %b1 (%dl) used in the stores is encodable.
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"   // advance dst one pixel early
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
5341 
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels in place with a color table.
// Same table layout as ARGBColorTableRow_X86 (256 entries of 4 bytes), but
// only the first 3 channel bytes of each pixel are remapped; the 4th
// (alpha) byte is left untouched.  Pixels are still 4 bytes apart.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  // Scratch for the byte being remapped; "+d" pins it to edx/rdx so the
  // byte form %b1 (%dl) used in the stores is encodable.
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"   // advance dst one pixel early
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
5369 
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with a luma-dependent color table.
// For each pixel a weighted sum of its channel bytes is computed with
// pmaddubsw/phaddw using the 4 weight bytes in 'lumacoeff' (broadcast to
// all pixels).  The sum is masked to its high byte (value & 0xFF00 -- see
// the 0xFF00 word mask built in xmm4 below) and used as a byte offset
// selecting a 256-byte row of the 'luma' table; B, G and R of that pixel
// are each remapped through the selected row, while alpha is copied
// through unchanged.
// NOTE(review): processes 4 pixels per iteration with post-decrement test;
// assumes width is a multiple of 4 -- confirm with callers.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  // Scratch registers: "+d"/"+a" pin them to edx/eax so the byte form %b0
  // (%dl) and the 32-bit form %k1 (%eax) are encodable.
  uintptr_t pixel_temp = 0u;
  uintptr_t table_temp = 0u;
  asm volatile (
    "movd      %6,%%xmm3                       \n"  // broadcast lumacoeff
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // build 0xFF00 word mask
    "psllw     $0x8,%%xmm4                     \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for word->dword unpack

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"  // 4 src ARGB pixels
    "pmaddubsw %%xmm3,%%xmm0                   \n"  // weighted channel sums
    "phaddw    %%xmm0,%%xmm0                   \n"  // one word per pixel
    "pand      %%xmm4,%%xmm0                   \n"  // keep high byte (row offset)
    "punpcklwd %%xmm5,%%xmm0                   \n"  // widen to 32-bit offsets
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // table row for pixel 0
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate next offset into lane 0

    // Pixel 0: remap B, G, R through the row; copy A as-is.
    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // table row for pixel 1
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 1.
    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // table row for pixel 2
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 2.
    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // table row for pixel 3

    // Pixel 3.
    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),    // %2
    "+r"(dst_argb),    // %3
    "+rm"(width)       // %4
  : "r"(luma),         // %5
    "rm"(lumacoeff)    // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5469 
5470 #endif  // defined(__x86_64__) || defined(__i386__)
5471 
5472 #ifdef __cplusplus
5473 }  // extern "C"
5474 }  // namespace libyuv
5475 #endif
5476