/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant (+2 before >>2) for the 3/4 box filters.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                               6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                               6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0,         0};

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

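// Unfiltered 1/2 horizontal downsample: each psrlw/packuswb pair keeps the
// high (odd-indexed) byte of every 16-bit pair, so 32 source bytes produce
// 16 destination bytes per loop iteration. src_stride is unused.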
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}

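// Filtered (linear) 1/2 downsample of a single row: pmaddubsw with all-ones
// byte coefficients sums each horizontal pixel pair, and pavgw against zero
// adds the rounding bit, giving (a + b + 1) / 2 per destination pixel.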
void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

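// 2x2 box filter: horizontal pair sums from this row and the next row
// (src_ptr + src_stride) are added, halved, then averaged against zero with
// rounding, yielding approximately (a + b + c + d + 2) / 4 per output pixel.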
void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

#ifdef HAS_SCALEROWDOWN2_AVX2
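// AVX2 variants of the 1/2 downsamplers: same arithmetic as the SSSE3
// versions above but on 64 source bytes per iteration; vpermq 0xd8 restores
// linear byte order after the lane-interleaved vpackuswb.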
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1");
}

void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN2_AVX2

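// Unfiltered 1/4 downsample: the 0x00FF0000 dword mask keeps byte 2 of every
// 4-byte group, and two packuswb passes compact those bytes, so 32 source
// bytes yield 8 destination bytes per loop iteration.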
void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrld       $0x18,%%xmm5                  \n"
      "pslld       $0x10,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

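// 4x4 box filter: pmaddubsw pair sums from four consecutive rows are
// accumulated, phaddw folds adjacent pairs into 4x4 block sums, then +8 and
// >>4 produce the rounded average. stridex3 holds src_stride * 3.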
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "psllw       $0x3,%%xmm5                   \n"
      "lea         0x00(%4,%4,2),%3              \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%4,2),%%xmm2          \n"
      "movdqu      0x10(%0,%4,2),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "psrlw       $0x4,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width),             // %2
        "=&r"(stridex3)              // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrld      $0x18,%%ymm5,%%ymm5           \n"
      "vpslld      $0x10,%%ymm5,%%ymm5           \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpsllw      $0x3,%%ymm4,%%ymm5            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%3,2),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,2),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x4,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                   // %0
        "+r"(dst_ptr),                   // %1
        "+r"(dst_width)                  // %2
      : "r"((intptr_t)(src_stride)),     // %3
        "r"((intptr_t)(src_stride * 3))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN4_AVX2

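// Unfiltered 3/4 downsample: kShuf0/kShuf1/kShuf2 select 24 of every 32
// source bytes (dropping one pixel from each group of four), writing three
// 8-byte chunks per iteration.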
void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %0,%%xmm3                     \n"
      "movdqa      %1,%%xmm4                     \n"
      "movdqa      %2,%%xmm5                     \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm2               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "palignr     $0x8,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

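// Filtered 3/4 downsample, blending two source rows with equal weight: a
// single pavgb averages src and src+stride, then kShuf01/11/21 with the
// kMadd01/11/21 weights and kRound34 apply the horizontal 3:1 / 2:2 / 1:3
// filter and divide by 4 with rounding.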
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}

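// Filtered 3/4 downsample, weighting the current row 3:1 over the next: two
// chained pavgb operations produce the 3/4 + 1/4 vertical blend before the
// same horizontal filter as ScaleRowDown34_1_Box_SSSE3.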
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );

  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}

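// Unfiltered 3/8 downsample: kShuf38a/kShuf38b pick 6 bytes from each 16-byte
// half and paddusb merges them, producing 12 destination bytes from 32 source
// bytes per iteration.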
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movd        %%xmm1,0x8(%1)                \n"
      "lea         0xc(%1),%1                    \n"
      "sub         $0xc,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kShuf38a),   // %3
        "m"(kShuf38b)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

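// Filtered 3/8 downsample over 2 rows: the rows are averaged with pavgb, the
// kShufAb0/1/2 shuffles gather the three horizontal contributors of each
// output pixel, and pmulhuw by kScaleAb2 (65536/3 or 65536/2) performs the
// divide. Emits 6 destination bytes per iteration.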
void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "movdqa      %3,%%xmm5                     \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "paddusw     %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "paddusw     %%xmm0,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movd        %%xmm1,(%1)                   \n"
      "psrlq       $0x10,%%xmm1                  \n"
      "movd        %%xmm1,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6");
}

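// Filtered 3/8 downsample over 3 rows: pixels are widened to 16 bits, summed
// vertically across three rows and horizontally in groups of three, then
// scaled by kScaleAc33 (65536/9, or 65536/6 for the partial box). Emits 6
// destination bytes per iteration.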
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm6          \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,2),%%xmm6          \n"
      "lea         0x10(%0),%0                   \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "pshufb      %%xmm3,%%xmm7                 \n"
      "paddusw     %%xmm7,%%xmm6                 \n"
      "pmulhuw     %%xmm4,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movd        %%xmm6,(%1)                   \n"
      "psrlq       $0x10,%%xmm6                  \n"
      "movd        %%xmm6,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}

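// Constants for the 2x upsamplers below: kLinearShuffleFar swaps each pair of
// adjacent 16-bit lanes so the "far" sample sits next to its "near"
// neighbour; kLinearMadd31 provides 3,1 byte weights, presumably consumed by
// the pmaddubsw-based upsample variants elsewhere in this file.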
static const uvec8 kLinearShuffleFar = {2,  3,  0, 1, 6,  7,  4,  5,
                                        10, 11, 8, 9, 14, 15, 12, 13};

static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
                                    3, 1, 1, 3, 3, 1, 1, 3};

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
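// 2x horizontal upsample (linear filter) for 8-bit rows: each output pixel is
// (3 * near + far + 2) >> 2, computed in 16-bit lanes; 8 source pixels expand
// to 16 destination pixels per iteration.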
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
      "psrlw       $15,%%xmm6                    \n"
      "psllw       $1,%%xmm6                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm1                   \n"  // 01234567
      "movq        1(%0),%%xmm2                  \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
      "paddw       %%xmm5,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "paddw       %%xmm6,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
      "paddw       %%xmm5,%%xmm5                 \n"
      "paddw       %%xmm4,%%xmm5                 \n"  // 3*near+far+2 (lo)
      "psrlw       $2,%%xmm5                     \n"  // 3/4*near+1/4*far (lo)

      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm2,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm6,%%xmm1                 \n"
      "paddw       %%xmm3,%%xmm3                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
      "psrlw       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)

      "packuswb    %%xmm1,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
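// 2x upsample in both directions (bilinear filter) for 8-bit rows: the
// 3*near+far terms from the current and the next source row are combined into
// the 9:3:3:1 kernel with +8 rounding and >>4, writing one output row at
// dst_ptr and one at dst_ptr + dst_stride per pass.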
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      LABELALIGN
      "1:                                        \n"
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
      // above line
      "movq        (%0),%%xmm1                   \n"  // 01234567
      "movq        1(%0),%%xmm2                  \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788

      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
      "paddw       %%xmm5,%%xmm4                 \n"  // near+far
      "movdqa      %%xmm3,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
      "paddw       %%xmm5,%%xmm5                 \n"  // 2*near
      "paddw       %%xmm5,%%xmm4                 \n"  // 3*near+far (1, lo)

      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm2,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)

      // below line
      "movq        (%0,%3),%%xmm6                \n"  // 01234567
      "movq        1(%0,%3),%%xmm2               \n"  // 12345678
      "movdqa      %%xmm6,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm6,%%xmm6                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788

      "movdqa      %%xmm6,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm7                 \n"
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 11223344 (16)
      "paddw       %%xmm7,%%xmm5                 \n"  // near+far
      "movdqa      %%xmm3,%%xmm7                 \n"
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 01122334 (16)
      "paddw       %%xmm7,%%xmm7                 \n"  // 2*near
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far (2, lo)

      "punpckhbw   %%xmm0,%%xmm6                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm6,%%xmm2                 \n"  // near+far
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (2, hi)

      // xmm4 xmm1
      // xmm5 xmm2
      "pcmpeqw     %%xmm0,%%xmm0                 \n"
      "psrlw       $15,%%xmm0                    \n"
      "psllw       $3,%%xmm0                     \n"  // all 8

      "movdqa      %%xmm4,%%xmm3                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (1, lo)
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, lo)
      "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (1, lo)
      "paddw       %%xmm6,%%xmm3                 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw       $4,%%xmm3                     \n"  // ^ div by 16

      "movdqa      %%xmm1,%%xmm7                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "paddw       %%xmm7,%%xmm7                 \n"  // 6*near+2*far (1, hi)
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, hi)
      "paddw       %%xmm1,%%xmm7                 \n"  // 9*near+3*far (1, hi)
      "paddw       %%xmm6,%%xmm7                 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw       $4,%%xmm7                     \n"  // ^ div by 16

      "packuswb    %%xmm7,%%xmm3                 \n"
      "movdqu      %%xmm3,(%1)                   \n"  // save above line

      "movdqa      %%xmm5,%%xmm3                 \n"
      "paddw       %%xmm0,%%xmm4                 \n"  // 3*near+far+8 (1, lo)
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, lo)
      "paddw       %%xmm3,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddw       %%xmm4,%%xmm5                 \n"  // 9 3 3 1 + 8 (lo)
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16

      "movdqa      %%xmm2,%%xmm3                 \n"
      "paddw       %%xmm0,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, hi)
      "paddw       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
      "paddw       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (hi)
      "psrlw       $4,%%xmm2                     \n"  // ^ div by 16

      "packuswb    %%xmm2,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4)                \n"  // save below line

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3
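// 2x horizontal upsample for 12-bit pixels held in uint16_t: the same
// (3 * near + far + 2) >> 2 filter as the 8-bit path, kept entirely in 16-bit
// lanes (12-bit inputs cannot overflow), with kLinearShuffleFar pairing each
// near sample with its far neighbour.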
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
  asm volatile(
      "movdqa      %3,%%xmm5                     \n"
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
      "psrlw       $15,%%xmm4                    \n"
      "psllw       $1,%%xmm4                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
      "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)

      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
      "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)

      "movdqa      %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm3                 \n"  // 54657687 (far)
      "pshufb      %%xmm5,%%xmm1                 \n"  // 10213243 (far)

      "paddw       %%xmm4,%%xmm1                 \n"  // far+2
      "paddw       %%xmm4,%%xmm3                 \n"  // far+2
      "paddw       %%xmm0,%%xmm1                 \n"  // near+far+2
      "paddw       %%xmm2,%%xmm3                 \n"  // near+far+2
      "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
      "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
      "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far+2 (hi)

      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm2,16(%1)                 \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),          // %0
        "+r"(dst_ptr),          // %1
        "+r"(dst_width)         // %2
      : "m"(kLinearShuffleFar)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3
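// 2x bilinear upsample for 12-bit pixels in uint16_t: 3*near+far terms from
// two source rows feed the 9:3:3:1 kernel with +8 rounding and >>4, again in
// 16-bit arithmetic. src_stride and dst_stride are in elements, hence the
// ",2" scaling in the addressing.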
ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst_ptr,ptrdiff_t dst_stride,int dst_width)1003 void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
1004                                    ptrdiff_t src_stride,
1005                                    uint16_t* dst_ptr,
1006                                    ptrdiff_t dst_stride,
1007                                    int dst_width) {
1008   asm volatile(
1009       "pcmpeqw     %%xmm7,%%xmm7                 \n"
1010       "psrlw       $15,%%xmm7                    \n"
1011       "psllw       $3,%%xmm7                     \n"  // all 8
1012       "movdqa      %5,%%xmm6                     \n"
1013 
1014       LABELALIGN
1015       "1:                                        \n"
1016       // above line
1017       "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
1018       "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
1019       "movdqa      %%xmm0,%%xmm2                 \n"
1020       "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
1021       "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
1022       "movdqa      %%xmm2,%%xmm3                 \n"
1023       "movdqa      %%xmm0,%%xmm1                 \n"
1024       "pshufb      %%xmm6,%%xmm3                 \n"  // 54657687 (far)
1025       "pshufb      %%xmm6,%%xmm1                 \n"  // 10213243 (far)
1026       "paddw       %%xmm0,%%xmm1                 \n"  // near+far
1027       "paddw       %%xmm2,%%xmm3                 \n"  // near+far
1028       "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
1029       "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
1030       "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far (1, lo)
1031       "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (1, hi)
1032 
1033       // below line
1034       "movdqu      (%0,%3,2),%%xmm1              \n"  // 01234567 (16)
1035       "movdqu      2(%0,%3,2),%%xmm4             \n"  // 12345678 (16)
1036       "movdqa      %%xmm1,%%xmm3                 \n"
1037       "punpckhwd   %%xmm4,%%xmm3                 \n"  // 45566778 (16)
1038       "punpcklwd   %%xmm4,%%xmm1                 \n"  // 01122334 (16)
1039       "movdqa      %%xmm3,%%xmm5                 \n"
1040       "movdqa      %%xmm1,%%xmm4                 \n"
1041       "pshufb      %%xmm6,%%xmm5                 \n"  // 54657687 (far)
1042       "pshufb      %%xmm6,%%xmm4                 \n"  // 10213243 (far)
1043       "paddw       %%xmm1,%%xmm4                 \n"  // near+far
1044       "paddw       %%xmm3,%%xmm5                 \n"  // near+far
1045       "paddw       %%xmm1,%%xmm1                 \n"  // 2*near
1046       "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
1047       "paddw       %%xmm4,%%xmm1                 \n"  // 3*near+far (2, lo)
1048       "paddw       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1049 
1050       // xmm0 xmm2
1051       // xmm1 xmm3
1052 
1053       "movdqa      %%xmm0,%%xmm4                 \n"
1054       "movdqa      %%xmm1,%%xmm5                 \n"
1055       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1056       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1057       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1058       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1059       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1060       "movdqu      %%xmm4,(%1)                   \n"
1061 
1062       "movdqa      %%xmm2,%%xmm4                 \n"
1063       "movdqa      %%xmm3,%%xmm5                 \n"
1064       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, hi)
1065       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, hi)
1066       "paddw       %%xmm2,%%xmm4                 \n"  // 9*near+3*far (1, hi)
1067       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, hi)
1068       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1069       "movdqu      %%xmm4,0x10(%1)               \n"
1070 
1071       "movdqa      %%xmm1,%%xmm4                 \n"
1072       "paddw       %%xmm7,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1073       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, lo)
1074       "paddw       %%xmm4,%%xmm1                 \n"  // 9*near+3*far (2, lo)
1075       "paddw       %%xmm0,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, lo)
1076       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16
1077       "movdqu      %%xmm1,(%1,%4,2)              \n"
1078 
1079       "movdqa      %%xmm3,%%xmm4                 \n"
1080       "paddw       %%xmm7,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1081       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, hi)
1082       "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (2, hi)
1083       "paddw       %%xmm2,%%xmm3                 \n"  // 9 3 3 1 + 8 (2, hi)
1084       "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
1085       "movdqu      %%xmm3,0x10(%1,%4,2)          \n"
1086 
1087       "lea         0x10(%0),%0                   \n"
1088       "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1089       "sub         $0x10,%2                      \n"
1090       "jg          1b                            \n"
1091       : "+r"(src_ptr),                // %0
1092         "+r"(dst_ptr),                // %1
1093         "+r"(dst_width)               // %2
1094       : "r"((intptr_t)(src_stride)),  // %3
1095         "r"((intptr_t)(dst_stride)),  // %4
1096         "m"(kLinearShuffleFar)        // %5
1097       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
1098 }
1099 #endif
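// The bilinear kernels in this file build their 2-D weights from two 1-D
// passes: each source row is first filtered to h = 3*near + far, and the
// vertical blend for the row stored "above" is then
//   (3*h_row1 + h_row2 + 8) >> 4
//     = (9*near1 + 3*far1 + 3*near2 + 1*far2 + 8) >> 4,
// which is the (9,3,3,1)/16 filter that the "9 3 3 1 + 8" comments refer to.
// The row stored "below" simply swaps the roles of row1 and row2.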
1100 
1101 #ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
1102 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
1103                                 uint16_t* dst_ptr,
1104                                 int dst_width) {
1105   asm volatile(
1106       "pxor        %%xmm5,%%xmm5                 \n"
1107       "pcmpeqd     %%xmm4,%%xmm4                 \n"
1108       "psrld       $31,%%xmm4                    \n"
1109       "pslld       $1,%%xmm4                     \n"  // all 2
1110 
1111       LABELALIGN
1112       "1:                                        \n"
1113       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1114       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1115 
1116       "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0123 (32b)
1117       "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1234 (32b)
1118 
1119       "movdqa      %%xmm0,%%xmm2                 \n"
1120       "movdqa      %%xmm1,%%xmm3                 \n"
1121 
1122       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1123       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1124 
1125       "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
1126       "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
1127       "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
1128       "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
1129       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1130       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1131       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
1132       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
1133 
1134       "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1135       "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
1136       "packssdw    %%xmm1,%%xmm0                 \n"
1137       "pshufd      $0b11011000,%%xmm0,%%xmm0     \n"
1138       "movdqu      %%xmm0,(%1)                   \n"
1139 
1140       "lea         0x8(%0),%0                    \n"
1141       "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1142       "sub         $0x8,%2                       \n"
1143       "jg          1b                            \n"
1144       : "+r"(src_ptr),   // %0
1145         "+r"(dst_ptr),   // %1
1146         "+r"(dst_width)  // %2
1147       :
1148       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1149 }
1150 #endif
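// A scalar sketch (illustrative only, not compiled; the helper name is made
// up) of the horizontal 2x "linear" kernel that the ScaleRowUp2_Linear rows
// in this file implement: every output pixel is (3*near + far + 2) >> 2.
// Like the SIMD rows, it assumes one extra source sample is readable past
// the right edge.
#if 0
static void ScaleRowUp2_Linear_16_Sketch(const uint16_t* src_ptr,
                                         uint16_t* dst_ptr,
                                         int dst_width) {
  int x;
  int src_width = dst_width >> 1;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[2 * x + 0] =
        (uint16_t)((3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2);
    dst_ptr[2 * x + 1] =
        (uint16_t)((src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2);
  }
}
#endif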
1151 
1152 #ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
1153 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1154                                   ptrdiff_t src_stride,
1155                                   uint16_t* dst_ptr,
1156                                   ptrdiff_t dst_stride,
1157                                   int dst_width) {
1158   asm volatile(
1159       "pxor        %%xmm7,%%xmm7                 \n"
1160       "pcmpeqd     %%xmm6,%%xmm6                 \n"
1161       "psrld       $31,%%xmm6                    \n"
1162       "pslld       $3,%%xmm6                     \n"  // all 8
1163 
1164       LABELALIGN
1165       "1:                                        \n"
1166       "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
1167       "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
1168       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
1169       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
1170       "movdqa      %%xmm0,%%xmm2                 \n"
1171       "movdqa      %%xmm1,%%xmm3                 \n"
1172       "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
1173       "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
1174       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
1175       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
1176       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
1177       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
1178       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1179       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1180 
1181       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1182       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1183       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0123 (32b)
1184       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1234 (32b)
1185       "movdqa      %%xmm0,%%xmm2                 \n"
1186       "movdqa      %%xmm1,%%xmm3                 \n"
1187       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1188       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1189       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (lo)
1190       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (hi)
1191       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1192       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1193       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1194       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1195 
1196       "movq        (%0,%3,2),%%xmm2              \n"
1197       "movq        2(%0,%3,2),%%xmm3             \n"
1198       "punpcklwd   %%xmm7,%%xmm2                 \n"  // 0123 (32b)
1199       "punpcklwd   %%xmm7,%%xmm3                 \n"  // 1234 (32b)
1200       "movdqa      %%xmm2,%%xmm4                 \n"
1201       "movdqa      %%xmm3,%%xmm5                 \n"
1202       "pshufd      $0b10110001,%%xmm4,%%xmm4     \n"  // 1032 (even, far)
1203       "pshufd      $0b10110001,%%xmm5,%%xmm5     \n"  // 2143 (odd, far)
1204       "paddd       %%xmm2,%%xmm4                 \n"  // near+far (lo)
1205       "paddd       %%xmm3,%%xmm5                 \n"  // near+far (hi)
1206       "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (lo)
1207       "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (hi)
1208       "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
1209       "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1210 
1211       "movdqa      %%xmm0,%%xmm4                 \n"
1212       "movdqa      %%xmm2,%%xmm5                 \n"
1213       "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1214       "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1215       "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1216       "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1217       "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1218 
1219       "movdqa      %%xmm2,%%xmm5                 \n"
1220       "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1221       "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1222       "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1223       "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1224       "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1225 
1226       "movdqa      %%xmm1,%%xmm0                 \n"
1227       "movdqa      %%xmm3,%%xmm2                 \n"
1228       "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1229       "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
1230       "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1231       "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1232       "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1233 
1234       "movdqa      %%xmm3,%%xmm2                 \n"
1235       "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
1236       "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
1237       "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
1238       "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
1239       "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
1240 
1241       "packssdw    %%xmm0,%%xmm4                 \n"
1242       "pshufd      $0b11011000,%%xmm4,%%xmm4     \n"
1243       "movdqu      %%xmm4,(%1)                   \n"  // store above
1244       "packssdw    %%xmm2,%%xmm5                 \n"
1245       "pshufd      $0b11011000,%%xmm5,%%xmm5     \n"
1246       "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
1247 
1248       "lea         0x8(%0),%0                    \n"
1249       "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1250       "sub         $0x8,%2                       \n"
1251       "jg          1b                            \n"
1252       : "+r"(src_ptr),                // %0
1253         "+r"(dst_ptr),                // %1
1254         "+r"(dst_width)               // %2
1255       : "r"((intptr_t)(src_stride)),  // %3
1256         "r"((intptr_t)(dst_stride))   // %4
1257       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
1258 }
1259 #endif
1260 
1261 #ifdef HAS_SCALEROWUP2LINEAR_SSSE3
1262 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
1263                               uint8_t* dst_ptr,
1264                               int dst_width) {
1265   asm volatile(
1266       "pcmpeqw     %%xmm4,%%xmm4                 \n"
1267       "psrlw       $15,%%xmm4                    \n"
1268       "psllw       $1,%%xmm4                     \n"  // all 2
1269       "movdqa      %3,%%xmm3                     \n"
1270 
1271       LABELALIGN
1272       "1:                                        \n"
1273       "movq        (%0),%%xmm0                   \n"  // 01234567
1274       "movq        1(%0),%%xmm1                  \n"  // 12345678
1275       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1276       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1277       "movdqa      %%xmm0,%%xmm2                 \n"
1278       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1279       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1280       "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (hi)
1281       "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (lo)
1282       "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
1283       "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
1284       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1285       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
1286       "packuswb    %%xmm2,%%xmm0                 \n"
1287       "movdqu      %%xmm0,(%1)                   \n"
1288 
1289       "lea         0x8(%0),%0                    \n"
1290       "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1291       "sub         $0x10,%2                      \n"
1292       "jg          1b                            \n"
1293       : "+r"(src_ptr),      // %0
1294         "+r"(dst_ptr),      // %1
1295         "+r"(dst_width)     // %2
1296       : "m"(kLinearMadd31)  // %3
1297       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1298 }
1299 #endif
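// Note on the 8-bit SSSE3/AVX2 linear kernels: the two taps are fused into a
// single pmaddubsw.  Source bytes are interleaved into (near, far) pairs and
// multiplied by the kLinearMadd31 weights, so each result word is already
// 3*near + far (the 3/1 weights alternate so the sample nearer each output
// pixel always gets the 3); only the rounding add and shift remain.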
1300 
1301 #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
1302 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
1303                                 ptrdiff_t src_stride,
1304                                 uint8_t* dst_ptr,
1305                                 ptrdiff_t dst_stride,
1306                                 int dst_width) {
1307   asm volatile(
1308       "pcmpeqw     %%xmm6,%%xmm6                 \n"
1309       "psrlw       $15,%%xmm6                    \n"
1310       "psllw       $3,%%xmm6                     \n"  // all 8
1311       "movdqa      %5,%%xmm7                     \n"
1312 
1313       LABELALIGN
1314       "1:                                        \n"
1315       "movq        (%0),%%xmm0                   \n"  // 01234567
1316       "movq        1(%0),%%xmm1                  \n"  // 12345678
1317       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1318       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1319       "movdqa      %%xmm0,%%xmm2                 \n"
1320       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1321       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1322       "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1, hi)
1323       "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1, lo)
1324 
1325       "movq        (%0,%3),%%xmm1                \n"
1326       "movq        1(%0,%3),%%xmm4               \n"
1327       "punpcklwd   %%xmm1,%%xmm1                 \n"
1328       "punpcklwd   %%xmm4,%%xmm4                 \n"
1329       "movdqa      %%xmm1,%%xmm3                 \n"
1330       "punpckhdq   %%xmm4,%%xmm3                 \n"
1331       "punpckldq   %%xmm4,%%xmm1                 \n"
1332       "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
1333       "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
1334 
1335       // xmm0 xmm2
1336       // xmm1 xmm3
1337 
1338       "movdqa      %%xmm0,%%xmm4                 \n"
1339       "movdqa      %%xmm1,%%xmm5                 \n"
1340       "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1341       "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1342       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1343       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1344       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1345 
1346       "movdqa      %%xmm1,%%xmm5                 \n"
1347       "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1348       "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1349       "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1350       "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1351       "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1352 
1353       "movdqa      %%xmm2,%%xmm0                 \n"
1354       "movdqa      %%xmm3,%%xmm1                 \n"
1355       "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1356       "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
1357       "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1358       "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1359       "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1360 
1361       "movdqa      %%xmm3,%%xmm1                 \n"
1362       "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
1363       "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1364       "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
1365       "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
1366       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
1367 
1368       "packuswb    %%xmm0,%%xmm4                 \n"
1369       "movdqu      %%xmm4,(%1)                   \n"  // store above
1370       "packuswb    %%xmm1,%%xmm5                 \n"
1371       "movdqu      %%xmm5,(%1,%4)                \n"  // store below
1372 
1373       "lea         0x8(%0),%0                    \n"
1374       "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1375       "sub         $0x10,%2                      \n"
1376       "jg          1b                            \n"
1377       : "+r"(src_ptr),                // %0
1378         "+r"(dst_ptr),                // %1
1379         "+r"(dst_width)               // %2
1380       : "r"((intptr_t)(src_stride)),  // %3
1381         "r"((intptr_t)(dst_stride)),  // %4
1382         "m"(kLinearMadd31)            // %5
1383       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1384         "xmm7");
1385 }
1386 #endif
1387 
1388 #ifdef HAS_SCALEROWUP2LINEAR_AVX2
1389 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
1390                              uint8_t* dst_ptr,
1391                              int dst_width) {
1392   asm volatile(
1393       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1394       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1395       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1396       "vbroadcastf128 %3,%%ymm3                  \n"
1397 
1398       LABELALIGN
1399       "1:                                        \n"
1400       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1401       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1402       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1403       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1404       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1405       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1406       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1407       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1408       "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
1409       "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
1410       "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
1411       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
1412       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1413       "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1414       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
1415       "vmovdqu     %%ymm0,(%1)                   \n"
1416 
1417       "lea         0x10(%0),%0                   \n"
1418       "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1419       "sub         $0x20,%2                      \n"
1420       "jg          1b                            \n"
1421       "vzeroupper                                \n"
1422       : "+r"(src_ptr),      // %0
1423         "+r"(dst_ptr),      // %1
1424         "+r"(dst_width)     // %2
1425       : "m"(kLinearMadd31)  // %3
1426       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1427 }
1428 #endif
1429 
1430 #ifdef HAS_SCALEROWUP2BILINEAR_AVX2
1431 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
1432                                ptrdiff_t src_stride,
1433                                uint8_t* dst_ptr,
1434                                ptrdiff_t dst_stride,
1435                                int dst_width) {
1436   asm volatile(
1437       "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
1438       "vpsrlw      $15,%%ymm6,%%ymm6             \n"
1439       "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
1440       "vbroadcastf128 %5,%%ymm7                  \n"
1441 
1442       LABELALIGN
1443       "1:                                        \n"
1444       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1445       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1446       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1447       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1448       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1449       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1450       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1451       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1452       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
1453       "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
1454 
1455       "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
1456       "vmovdqu     1(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
1457       "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
1458       "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
1459       "vpunpcklwd  %%ymm2,%%ymm2,%%ymm2          \n"
1460       "vpunpcklwd  %%ymm3,%%ymm3,%%ymm3          \n"
1461       "vpunpckhdq  %%ymm3,%%ymm2,%%ymm4          \n"
1462       "vpunpckldq  %%ymm3,%%ymm2,%%ymm2          \n"
1463       "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
1464       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
1465 
1466       // ymm0 ymm1
1467       // ymm2 ymm3
1468 
1469       "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1470       "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1471       "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1472       "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1473       "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1474 
1475       "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1476       "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1477       "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1478       "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1479       "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1480 
1481       "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1482       "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1483       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1484       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1485       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1486 
1487       "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1488       "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1489       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1490       "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1491       "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1492 
1493       "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
1494       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1495       "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
1496       "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
1497 
1498       "lea         0x10(%0),%0                   \n"
1499       "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1500       "sub         $0x20,%2                      \n"
1501       "jg          1b                            \n"
1502       "vzeroupper                                \n"
1503       : "+r"(src_ptr),                // %0
1504         "+r"(dst_ptr),                // %1
1505         "+r"(dst_width)               // %2
1506       : "r"((intptr_t)(src_stride)),  // %3
1507         "r"((intptr_t)(dst_stride)),  // %4
1508         "m"(kLinearMadd31)            // %5
1509       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1510         "xmm7");
1511 }
1512 #endif
1513 
1514 #ifdef HAS_SCALEROWUP2LINEAR_12_AVX2
1515 void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
1516                                 uint16_t* dst_ptr,
1517                                 int dst_width) {
1518   asm volatile(
1519       "vbroadcastf128 %3,%%ymm5                  \n"
1520       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1521       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1522       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1523 
1524       LABELALIGN
1525       "1:                                        \n"
1526       "vmovdqu     (%0),%%ymm0                   \n"  // 0123456789ABCDEF (16b)
1527       "vmovdqu     2(%0),%%ymm1                  \n"  // 123456789ABCDEF0 (16b)
1528 
1529       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 012389AB4567CDEF
1530       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 12349ABC5678DEF0
1531 
1532       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"  // 899AABBCCDDEEFF0 (near)
1533       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1534       "vpshufb     %%ymm5,%%ymm2,%%ymm3          \n"  // 98A9BACBDCEDFE0F (far)
1535       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1536 
1537       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // far+2
1538       "vpaddw      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2
1539       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far+2
1540       "vpaddw      %%ymm2,%%ymm3,%%ymm3          \n"  // near+far+2
1541       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1542       "vpaddw      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near
1543       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 3*near+far+2
1544       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 3*near+far+2
1545 
1546       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far
1547       "vpsrlw      $2,%%ymm2,%%ymm2              \n"  // 3/4*near+1/4*far
1548       "vmovdqu     %%ymm0,(%1)                   \n"
1549       "vmovdqu     %%ymm2,32(%1)                 \n"
1550 
1551       "lea         0x20(%0),%0                   \n"
1552       "lea         0x40(%1),%1                   \n"  // 16 sample to 32 sample
1553       "sub         $0x20,%2                      \n"
1554       "jg          1b                            \n"
1555       "vzeroupper                                \n"
1556       : "+r"(src_ptr),          // %0
1557         "+r"(dst_ptr),          // %1
1558         "+r"(dst_width)         // %2
1559       : "m"(kLinearShuffleFar)  // %3
1560       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1561 }
1562 #endif
1563 
1564 #ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2
1565 void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
1566                                   ptrdiff_t src_stride,
1567                                   uint16_t* dst_ptr,
1568                                   ptrdiff_t dst_stride,
1569                                   int dst_width) {
1570   asm volatile(
1571       "vbroadcastf128 %5,%%ymm5                  \n"
1572       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1573       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1574       "vpsllw      $3,%%ymm4,%%ymm4              \n"  // all 8
1575 
1576       LABELALIGN
1577       "1:                                        \n"
1578 
1579       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
1580       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
1581       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1582       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1583       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1584       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1585       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1586       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1587       "vpaddw      %%ymm0,%%ymm1,%%ymm2          \n"  // 3*near+far (1)
1588 
1589       "vmovdqu     (%0,%3,2),%%xmm0              \n"  // 01234567 (16b)
1590       "vmovdqu     2(%0,%3,2),%%xmm1             \n"  // 12345678 (16b)
1591       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1592       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1593       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1594       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1595       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1596       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1597       "vpaddw      %%ymm0,%%ymm1,%%ymm3          \n"  // 3*near+far (2)
1598 
1599       "vpaddw      %%ymm2,%%ymm2,%%ymm0          \n"  // 6*near+2*far (1)
1600       "vpaddw      %%ymm4,%%ymm3,%%ymm1          \n"  // 3*near+far+8 (2)
1601       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9*near+3*far (1)
1602       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (1)
1603       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1604       "vmovdqu     %%ymm0,(%1)                   \n"  // store above
1605 
1606       "vpaddw      %%ymm3,%%ymm3,%%ymm0          \n"  // 6*near+2*far (2)
1607       "vpaddw      %%ymm4,%%ymm2,%%ymm1          \n"  // 3*near+far+8 (1)
1608       "vpaddw      %%ymm0,%%ymm3,%%ymm0          \n"  // 9*near+3*far (2)
1609       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (2)
1610       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1611       "vmovdqu     %%ymm0,(%1,%4,2)              \n"  // store below
1612 
1613       "lea         0x10(%0),%0                   \n"
1614       "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1615       "sub         $0x10,%2                      \n"
1616       "jg          1b                            \n"
1617       "vzeroupper                                \n"
1618       : "+r"(src_ptr),                // %0
1619         "+r"(dst_ptr),                // %1
1620         "+r"(dst_width)               // %2
1621       : "r"((intptr_t)(src_stride)),  // %3
1622         "r"((intptr_t)(dst_stride)),  // %4
1623         "m"(kLinearShuffleFar)        // %5
1624       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1625 }
1626 #endif
1627 
1628 #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
1629 void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
1630                                 uint16_t* dst_ptr,
1631                                 int dst_width) {
1632   asm volatile(
1633       "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
1634       "vpsrld      $31,%%ymm4,%%ymm4             \n"
1635       "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
1636 
1637       LABELALIGN
1638       "1:                                        \n"
1639       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1640       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1641 
1642       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1643       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1644 
1645       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1646       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1647 
1648       "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
1649       "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
1650       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
1651       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
1652       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1653       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1654       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
1655       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
1656 
1657       "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1658       "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1659       "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
1660       "vpshufd     $0b11011000,%%ymm0,%%ymm0     \n"
1661       "vmovdqu     %%ymm0,(%1)                   \n"
1662 
1663       "lea         0x10(%0),%0                   \n"
1664       "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1665       "sub         $0x10,%2                      \n"
1666       "jg          1b                            \n"
1667       "vzeroupper                                \n"
1668       : "+r"(src_ptr),   // %0
1669         "+r"(dst_ptr),   // %1
1670         "+r"(dst_width)  // %2
1671       :
1672       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1673 }
1674 #endif
1675 
1676 #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
1677 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
1678                                   ptrdiff_t src_stride,
1679                                   uint16_t* dst_ptr,
1680                                   ptrdiff_t dst_stride,
1681                                   int dst_width) {
1682   asm volatile(
1683       "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
1684       "vpsrld      $31,%%ymm6,%%ymm6             \n"
1685       "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
1686 
1687       LABELALIGN
1688       "1:                                        \n"
1689 
1690       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1691       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1692       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1693       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1694       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1695       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1696       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
1697       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
1698       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1699       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1700       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (1, lo)
1701       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (1, hi)
1702 
1703       "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 01234567 (16b, 1u1v)
1704       "vmovdqu     2(%0,%3,2),%%xmm3             \n"  // 12345678 (16b, 1u1v)
1705       "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
1706       "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
1707       "vpshufd     $0b10110001,%%ymm2,%%ymm4     \n"  // 10325476 (lo, far)
1708       "vpshufd     $0b10110001,%%ymm3,%%ymm5     \n"  // 21436587 (hi, far)
1709       "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
1710       "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
1711       "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
1712       "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
1713       "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (2, lo)
1714       "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (2, hi)
1715 
1716       "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1717       "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1718       "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1719       "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1720       "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1721 
1722       "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1723       "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1724       "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1725       "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1726       "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1727 
1728       "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1729       "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1730       "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1731       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1732       "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1733 
1734       "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1735       "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1736       "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1737       "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1738       "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1739 
1740       "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
1741       "vpshufd     $0b11011000,%%ymm4,%%ymm4     \n"
1742       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1743       "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
1744       "vpshufd     $0b11011000,%%ymm5,%%ymm5     \n"
1745       "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
1746 
1747       "lea         0x10(%0),%0                   \n"
1748       "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1749       "sub         $0x10,%2                      \n"
1750       "jg          1b                            \n"
1751       "vzeroupper                                \n"
1752       : "+r"(src_ptr),                // %0
1753         "+r"(dst_ptr),                // %1
1754         "+r"(dst_width)               // %2
1755       : "r"((intptr_t)(src_stride)),  // %3
1756         "r"((intptr_t)(dst_stride))   // %4
1757       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1758 }
1759 #endif
1760 
1761 // Reads 16xN bytes and produces 16 shorts at a time.
1762 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
1763                       uint16_t* dst_ptr,
1764                       int src_width) {
1765       asm volatile("pxor        %%xmm5,%%xmm5                 \n"
1766 
1767                // 16 pixel loop.
1768                LABELALIGN
1769       "1:                                        \n"
1770       "movdqu      (%0),%%xmm3                   \n"
1771       "lea         0x10(%0),%0                   \n"  // src_ptr += 16
1772       "movdqu      (%1),%%xmm0                   \n"
1773       "movdqu      0x10(%1),%%xmm1               \n"
1774       "movdqa      %%xmm3,%%xmm2                 \n"
1775       "punpcklbw   %%xmm5,%%xmm2                 \n"
1776       "punpckhbw   %%xmm5,%%xmm3                 \n"
1777       "paddusw     %%xmm2,%%xmm0                 \n"
1778       "paddusw     %%xmm3,%%xmm1                 \n"
1779       "movdqu      %%xmm0,(%1)                   \n"
1780       "movdqu      %%xmm1,0x10(%1)               \n"
1781       "lea         0x20(%1),%1                   \n"
1782       "sub         $0x10,%2                      \n"
1783       "jg          1b                            \n"
1784                : "+r"(src_ptr),   // %0
1785                  "+r"(dst_ptr),   // %1
1786                  "+r"(src_width)  // %2
1787                :
1788                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1789 }
1790 
1791 #ifdef HAS_SCALEADDROW_AVX2
1792 // Reads 32 bytes and accumulates to 32 shorts at a time.
1793 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
1794                       uint16_t* dst_ptr,
1795                       int src_width) {
1796       asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
1797 
1798                LABELALIGN
1799       "1:                                        \n"
1800       "vmovdqu     (%0),%%ymm3                   \n"
1801       "lea         0x20(%0),%0                   \n"  // src_ptr += 32
1802       "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
1803       "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
1804       "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
1805       "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
1806       "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
1807       "vmovdqu     %%ymm0,(%1)                   \n"
1808       "vmovdqu     %%ymm1,0x20(%1)               \n"
1809       "lea         0x40(%1),%1                   \n"
1810       "sub         $0x20,%2                      \n"
1811       "jg          1b                            \n"
1812       "vzeroupper                                \n"
1813                : "+r"(src_ptr),   // %0
1814                  "+r"(dst_ptr),   // %1
1815                  "+r"(src_width)  // %2
1816                :
1817                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1818 }
1819 #endif  // HAS_SCALEADDROW_AVX2
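// A scalar sketch (illustrative only, not compiled; the helper name is made
// up) of what the ScaleAddRow kernels above do: widen each source byte and
// accumulate it into the 16-bit sums used by the box filter, assuming the
// accumulator row starts at zero.  The SIMD versions saturate with paddusw;
// this sketch ignores overflow.
#if 0
static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                               uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}
#endif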
1820 
1821 // Constant for making pixels signed to avoid pmaddubsw
1822 // saturation.
1823 static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1824                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1825 
1826 // Constant for making pixels unsigned and adding .5 for rounding.
1827 static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
1828                                0x4040, 0x4040, 0x4040, 0x4040};
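// Worked example of how these constants are used below: with a 7-bit
// fraction f and weights (128 - f, f) that sum to 128, pmaddubsw on the
// biased (signed) pixels gives
//   (128 - f) * (a - 128) + f * (b - 128) = (128 - f) * a + f * b - 128 * 128.
// Adding kFadd40 (0x4040 = 128 * 128 + 64) removes the bias and adds the
// rounding half, so the final psrlw $7 produces
//   ((128 - f) * a + f * b + 64) >> 7.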
1829 
1830 // Bilinear column filtering. SSSE3 version.
1831 void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
1832                            const uint8_t* src_ptr,
1833                            int dst_width,
1834                            int x,
1835                            int dx) {
1836   intptr_t x0, x1, temp_pixel;
1837   asm volatile(
1838       "movd        %6,%%xmm2                     \n"
1839       "movd        %7,%%xmm3                     \n"
1840       "movl        $0x04040000,%k2               \n"
1841       "movd        %k2,%%xmm5                    \n"
1842       "pcmpeqb     %%xmm6,%%xmm6                 \n"
1843       "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
1844       "pcmpeqb     %%xmm7,%%xmm7                 \n"
1845       "psrlw       $15,%%xmm7                    \n"  // 0x00010001
1846 
1847       "pextrw      $0x1,%%xmm2,%k3               \n"
1848       "subl        $0x2,%5                       \n"
1849       "jl          29f                           \n"
1850       "movdqa      %%xmm2,%%xmm0                 \n"
1851       "paddd       %%xmm3,%%xmm0                 \n"
1852       "punpckldq   %%xmm0,%%xmm2                 \n"
1853       "punpckldq   %%xmm3,%%xmm3                 \n"
1854       "paddd       %%xmm3,%%xmm3                 \n"
1855       "pextrw      $0x3,%%xmm2,%k4               \n"
1856 
1857       LABELALIGN
1858       "2:                                        \n"
1859       "movdqa      %%xmm2,%%xmm1                 \n"
1860       "paddd       %%xmm3,%%xmm2                 \n"
1861       "movzwl      0x00(%1,%3,1),%k2             \n"
1862       "movd        %k2,%%xmm0                    \n"
1863       "psrlw       $0x9,%%xmm1                   \n"
1864       "movzwl      0x00(%1,%4,1),%k2             \n"
1865       "movd        %k2,%%xmm4                    \n"
1866       "pshufb      %%xmm5,%%xmm1                 \n"
1867       "punpcklwd   %%xmm4,%%xmm0                 \n"
1868       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1869       "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127 ) +
1870                                                       // 1
1871       "paddusb     %%xmm7,%%xmm1                 \n"
1872       "pmaddubsw   %%xmm0,%%xmm1                 \n"
1873       "pextrw      $0x1,%%xmm2,%k3               \n"
1874       "pextrw      $0x3,%%xmm2,%k4               \n"
1875       "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
1876       "psrlw       $0x7,%%xmm1                   \n"
1877       "packuswb    %%xmm1,%%xmm1                 \n"
1878       "movd        %%xmm1,%k2                    \n"
1879       "mov         %w2,(%0)                      \n"
1880       "lea         0x2(%0),%0                    \n"
1881       "subl        $0x2,%5                       \n"
1882       "jge         2b                            \n"
1883 
1884       LABELALIGN
1885       "29:                                       \n"
1886       "addl        $0x1,%5                       \n"
1887       "jl          99f                           \n"
1888       "movzwl      0x00(%1,%3,1),%k2             \n"
1889       "movd        %k2,%%xmm0                    \n"
1890       "psrlw       $0x9,%%xmm2                   \n"
1891       "pshufb      %%xmm5,%%xmm2                 \n"
1892       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1893       "pxor        %%xmm6,%%xmm2                 \n"
1894       "paddusb     %%xmm7,%%xmm2                 \n"
1895       "pmaddubsw   %%xmm0,%%xmm2                 \n"
1896       "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
1897       "psrlw       $0x7,%%xmm2                   \n"
1898       "packuswb    %%xmm2,%%xmm2                 \n"
1899       "movd        %%xmm2,%k2                    \n"
1900       "mov         %b2,(%0)                      \n"
1901       "99:                                       \n"
1902       : "+r"(dst_ptr),      // %0
1903         "+r"(src_ptr),      // %1
1904         "=&a"(temp_pixel),  // %2
1905         "=&r"(x0),          // %3
1906         "=&r"(x1),          // %4
1907 #if defined(__x86_64__)
1908         "+rm"(dst_width)  // %5
1909 #else
1910         "+m"(dst_width)  // %5
1911 #endif
1912       : "rm"(x),   // %6
1913         "rm"(dx),  // %7
1914 #if defined(__x86_64__)
1915         "x"(kFsub80),  // %8
1916         "x"(kFadd40)   // %9
1917 #else
1918         "m"(kFsub80),    // %8
1919         "m"(kFadd40)     // %9
1920 #endif
1921       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1922         "xmm7");
1923 }
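// A scalar sketch (illustrative only, not compiled; the helper name is made
// up) of the column filter above.  x and dx are 16.16 fixed point; the SSSE3
// code keeps only a 7-bit fraction, so each output byte is
// ((128 - f) * a + f * b + 64) >> 7.
#if 0
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source position
    int f = (x >> 9) & 0x7f;   // top 7 bits of the fraction, as in the asm
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)(((128 - f) * a + f * b + 64) >> 7);
    x += dx;
  }
}
#endif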
1924 
1925 // Reads 16 pixels, duplicates them and writes 32 pixels.
1926 // Alignment is not required; the loads and stores use movdqu.
1927 void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
1928                        const uint8_t* src_ptr,
1929                        int dst_width,
1930                        int x,
1931                        int dx) {
1932   (void)x;
1933   (void)dx;
1934   asm volatile(LABELALIGN
1935       "1:                                        \n"
1936       "movdqu      (%1),%%xmm0                   \n"
1937       "lea         0x10(%1),%1                   \n"
1938       "movdqa      %%xmm0,%%xmm1                 \n"
1939       "punpcklbw   %%xmm0,%%xmm0                 \n"
1940       "punpckhbw   %%xmm1,%%xmm1                 \n"
1941       "movdqu      %%xmm0,(%0)                   \n"
1942       "movdqu      %%xmm1,0x10(%0)               \n"
1943       "lea         0x20(%0),%0                   \n"
1944       "sub         $0x20,%2                      \n"
1945       "jg          1b                            \n"
1946 
1947                : "+r"(dst_ptr),   // %0
1948                  "+r"(src_ptr),   // %1
1949                  "+r"(dst_width)  // %2
1950                  ::"memory",
1951                  "cc", "xmm0", "xmm1");
1952 }
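// Scalar sketch (illustrative only, not compiled; the helper name is made
// up) of the 2x column duplication above: each source pixel is written twice.
#if 0
static void ScaleColsUp2_Sketch(uint8_t* dst_ptr,
                                const uint8_t* src_ptr,
                                int dst_width) {
  int j;
  for (j = 0; j < dst_width - 1; j += 2) {
    dst_ptr[j] = dst_ptr[j + 1] = src_ptr[j / 2];
  }
}
#endif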
1953 
1954 void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1955                             ptrdiff_t src_stride,
1956                             uint8_t* dst_argb,
1957                             int dst_width) {
1958   (void)src_stride;
1959   asm volatile(LABELALIGN
1960       "1:                                        \n"
1961       "movdqu      (%0),%%xmm0                   \n"
1962       "movdqu      0x10(%0),%%xmm1               \n"
1963       "lea         0x20(%0),%0                   \n"
1964       "shufps      $0xdd,%%xmm1,%%xmm0           \n"
1965       "movdqu      %%xmm0,(%1)                   \n"
1966       "lea         0x10(%1),%1                   \n"
1967       "sub         $0x4,%2                       \n"
1968       "jg          1b                            \n"
1969                : "+r"(src_argb),  // %0
1970                  "+r"(dst_argb),  // %1
1971                  "+r"(dst_width)  // %2
1972                  ::"memory",
1973                  "cc", "xmm0", "xmm1");
1974 }
1975 
1976 void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1977                                   ptrdiff_t src_stride,
1978                                   uint8_t* dst_argb,
1979                                   int dst_width) {
1980   (void)src_stride;
1981   asm volatile(LABELALIGN
1982       "1:                                        \n"
1983       "movdqu      (%0),%%xmm0                   \n"
1984       "movdqu      0x10(%0),%%xmm1               \n"
1985       "lea         0x20(%0),%0                   \n"
1986       "movdqa      %%xmm0,%%xmm2                 \n"
1987       "shufps      $0x88,%%xmm1,%%xmm0           \n"
1988       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
1989       "pavgb       %%xmm2,%%xmm0                 \n"
1990       "movdqu      %%xmm0,(%1)                   \n"
1991       "lea         0x10(%1),%1                   \n"
1992       "sub         $0x4,%2                       \n"
1993       "jg          1b                            \n"
1994                : "+r"(src_argb),  // %0
1995                  "+r"(dst_argb),  // %1
1996                  "+r"(dst_width)  // %2
1997                  ::"memory",
1998                  "cc", "xmm0", "xmm1");
1999 }
2000 
2001 void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
2002                                ptrdiff_t src_stride,
2003                                uint8_t* dst_argb,
2004                                int dst_width) {
2005   asm volatile(LABELALIGN
2006       "1:                                        \n"
2007       "movdqu      (%0),%%xmm0                   \n"
2008       "movdqu      0x10(%0),%%xmm1               \n"
2009       "movdqu      0x00(%0,%3,1),%%xmm2          \n"
2010       "movdqu      0x10(%0,%3,1),%%xmm3          \n"
2011       "lea         0x20(%0),%0                   \n"
2012       "pavgb       %%xmm2,%%xmm0                 \n"
2013       "pavgb       %%xmm3,%%xmm1                 \n"
2014       "movdqa      %%xmm0,%%xmm2                 \n"
2015       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2016       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2017       "pavgb       %%xmm2,%%xmm0                 \n"
2018       "movdqu      %%xmm0,(%1)                   \n"
2019       "lea         0x10(%1),%1                   \n"
2020       "sub         $0x4,%2                       \n"
2021       "jg          1b                            \n"
2022                : "+r"(src_argb),              // %0
2023                  "+r"(dst_argb),              // %1
2024                  "+r"(dst_width)              // %2
2025                : "r"((intptr_t)(src_stride))  // %3
2026                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2027 }
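// Scalar sketch (illustrative only, not compiled; the helper name is made
// up) of the ARGB /2 box filter above: each output pixel is the per-channel
// average of a 2x2 block, rounding like pavgb ((a + b + 1) >> 1), applied
// vertically first and then horizontally as in the asm.
#if 0
static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {
      int v0 =
          (src_argb[8 * x + c] + src_argb[src_stride + 8 * x + c] + 1) >> 1;
      int v1 = (src_argb[8 * x + 4 + c] +
                src_argb[src_stride + 8 * x + 4 + c] + 1) >> 1;
      dst_argb[4 * x + c] = (uint8_t)((v0 + v1 + 1) >> 1);
    }
  }
}
#endif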
2028 
2029 // Reads 4 pixels at a time.
2030 // Alignment requirement: dst_argb 16 byte aligned.
2031 void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
2032                                ptrdiff_t src_stride,
2033                                int src_stepx,
2034                                uint8_t* dst_argb,
2035                                int dst_width) {
2036   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2037   intptr_t src_stepx_x12;
2038   (void)src_stride;
2039   asm volatile(
2040       "lea         0x00(,%1,4),%1                \n"
2041       "lea         0x00(%1,%1,2),%4              \n"
2042 
2043       LABELALIGN
2044       "1:                                        \n"
2045       "movd        (%0),%%xmm0                   \n"
2046       "movd        0x00(%0,%1,1),%%xmm1          \n"
2047       "punpckldq   %%xmm1,%%xmm0                 \n"
2048       "movd        0x00(%0,%1,2),%%xmm2          \n"
2049       "movd        0x00(%0,%4,1),%%xmm3          \n"
2050       "lea         0x00(%0,%1,4),%0              \n"
2051       "punpckldq   %%xmm3,%%xmm2                 \n"
2052       "punpcklqdq  %%xmm2,%%xmm0                 \n"
2053       "movdqu      %%xmm0,(%2)                   \n"
2054       "lea         0x10(%2),%2                   \n"
2055       "sub         $0x4,%3                       \n"
2056       "jg          1b                            \n"
2057       : "+r"(src_argb),       // %0
2058         "+r"(src_stepx_x4),   // %1
2059         "+r"(dst_argb),       // %2
2060         "+r"(dst_width),      // %3
2061         "=&r"(src_stepx_x12)  // %4
2062         ::"memory",
2063         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2064 }
2065 
2066 // Blends four 2x2 to 4x1.
2067 // Alignment requirement: dst_argb 16 byte aligned.
2068 void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
2069                                   ptrdiff_t src_stride,
2070                                   int src_stepx,
2071                                   uint8_t* dst_argb,
2072                                   int dst_width) {
2073   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2074   intptr_t src_stepx_x12;
2075   intptr_t row1 = (intptr_t)(src_stride);
2076   asm volatile(
2077       "lea         0x00(,%1,4),%1                \n"
2078       "lea         0x00(%1,%1,2),%4              \n"
2079       "lea         0x00(%0,%5,1),%5              \n"
2080 
2081       LABELALIGN
2082       "1:                                        \n"
2083       "movq        (%0),%%xmm0                   \n"
2084       "movhps      0x00(%0,%1,1),%%xmm0          \n"
2085       "movq        0x00(%0,%1,2),%%xmm1          \n"
2086       "movhps      0x00(%0,%4,1),%%xmm1          \n"
2087       "lea         0x00(%0,%1,4),%0              \n"
2088       "movq        (%5),%%xmm2                   \n"
2089       "movhps      0x00(%5,%1,1),%%xmm2          \n"
2090       "movq        0x00(%5,%1,2),%%xmm3          \n"
2091       "movhps      0x00(%5,%4,1),%%xmm3          \n"
2092       "lea         0x00(%5,%1,4),%5              \n"
2093       "pavgb       %%xmm2,%%xmm0                 \n"
2094       "pavgb       %%xmm3,%%xmm1                 \n"
2095       "movdqa      %%xmm0,%%xmm2                 \n"
2096       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2097       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2098       "pavgb       %%xmm2,%%xmm0                 \n"
2099       "movdqu      %%xmm0,(%2)                   \n"
2100       "lea         0x10(%2),%2                   \n"
2101       "sub         $0x4,%3                       \n"
2102       "jg          1b                            \n"
2103       : "+r"(src_argb),        // %0
2104         "+r"(src_stepx_x4),    // %1
2105         "+r"(dst_argb),        // %2
2106         "+rm"(dst_width),      // %3
2107         "=&r"(src_stepx_x12),  // %4
2108         "+r"(row1)             // %5
2109         ::"memory",
2110         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2111 }
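
// Reference-only scalar sketch of the 2x2 box average above: each output
// pixel averages a 2x2 block whose left column is every src_stepx'th source
// pixel. The pavgb-of-pavgb rounding in the SSE2 loop can differ from this
// (sum + 2) >> 2 form by at most 1. Illustrative only (hypothetical helper);
// kept out of the build.
#if 0
static void ScaleARGBRowDownEvenBoxRef(const uint8_t* src_argb,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
                                       uint8_t* dst_argb,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_argb + (intptr_t)(x) * src_stepx * 4;  // row 0
    const uint8_t* t = s + src_stride;                            // row 1
    int i;
    for (i = 0; i < 4; ++i) {  // B, G, R, A channels.
      dst_argb[x * 4 + i] =
          (uint8_t)((s[i] + s[i + 4] + t[i] + t[i + 4] + 2) >> 2);
    }
  }
}
#endif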

void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movd        %5,%%xmm2                     \n"
      "movd        %6,%%xmm3                     \n"
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"
      "pshufd      $0x11,%%xmm3,%%xmm0           \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm3                 \n"
      "pshufd      $0x5,%%xmm3,%%xmm0            \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm3                 \n"
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pextrw      $0x1,%%xmm2,%k0               \n"
      "pextrw      $0x3,%%xmm2,%k1               \n"
      "cmp         $0x0,%4                       \n"
      "jl          99f                           \n"
      "sub         $0x4,%4                       \n"
      "jl          49f                           \n"

      LABELALIGN
      "40:                                       \n"
      "movd        0x00(%3,%0,4),%%xmm0          \n"
      "movd        0x00(%3,%1,4),%%xmm1          \n"
      "pextrw      $0x5,%%xmm2,%k0               \n"
      "pextrw      $0x7,%%xmm2,%k1               \n"
      "paddd       %%xmm3,%%xmm2                 \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"
      "movd        0x00(%3,%0,4),%%xmm1          \n"
      "movd        0x00(%3,%1,4),%%xmm4          \n"
      "pextrw      $0x1,%%xmm2,%k0               \n"
      "pextrw      $0x3,%%xmm2,%k1               \n"
      "punpckldq   %%xmm4,%%xmm1                 \n"
      "punpcklqdq  %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%4                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "test        $0x2,%4                       \n"
      "je          29f                           \n"
      "movd        0x00(%3,%0,4),%%xmm0          \n"
      "movd        0x00(%3,%1,4),%%xmm1          \n"
      "pextrw      $0x5,%%xmm2,%k0               \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%2)                   \n"
      "lea         0x8(%2),%2                    \n"
      "29:                                       \n"
      "test        $0x1,%4                       \n"
      "je          99f                           \n"
      "movd        0x00(%3,%0,4),%%xmm0          \n"
      "movd        %%xmm0,(%2)                   \n"
      "99:                                       \n"
      : "=&a"(x0),       // %0
        "=&d"(x1),       // %1
        "+r"(dst_argb),  // %2
        "+r"(src_argb),  // %3
        "+r"(dst_width)  // %4
      : "rm"(x),         // %5
        "rm"(dx)         // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
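
// Reference-only scalar sketch of the fixed point column stepping above:
// x and dx are 16.16 fixed point, and each destination pixel copies the
// source pixel indexed by the integer part of x. Illustrative only
// (hypothetical helper); kept out of the build.
#if 0
static void ScaleARGBColsRef(uint8_t* dst_argb,
                             const uint8_t* src_argb,
                             int dst_width,
                             int x,
                             int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // point sample; no filtering.
    x += dx;
  }
}
#endif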

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                           const uint8_t* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%1),%%xmm0                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpckldq   %%xmm0,%%xmm0                 \n"
      "punpckhdq   %%xmm1,%%xmm1                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "movdqu      %%xmm1,0x10(%0)               \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"

               : "+r"(dst_argb),  // %0
                 "+r"(src_argb),  // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1");
}
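
// Reference-only scalar sketch of the 2x column duplication above: each
// source ARGB pixel is written to two adjacent destination pixels. Assumes
// an even dst_width, as the SSE2 loop does. Illustrative only (hypothetical
// helper); kept out of the build.
#if 0
static void ScaleARGBColsUp2Ref(uint8_t* dst_argb,
                                const uint8_t* src_argb,
                                int dst_width) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; j += 2) {
    dst[j] = dst[j + 1] = src[j >> 1];  // duplicate one ARGB pixel.
  }
}
#endif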

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                               const uint8_t* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movdqa      %0,%%xmm4                     \n"
      "movdqa      %1,%%xmm5                     \n"
      :
      : "m"(kShuffleColARGB),   // %0
        "m"(kShuffleFractions)  // %1
  );

  asm volatile(
      "movd        %5,%%xmm2                     \n"
      "movd        %6,%%xmm3                     \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrlw       $0x9,%%xmm6                   \n"
      "pextrw      $0x1,%%xmm2,%k3               \n"
      "sub         $0x2,%2                       \n"
      "jl          29f                           \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "paddd       %%xmm3,%%xmm0                 \n"
      "punpckldq   %%xmm0,%%xmm2                 \n"
      "punpckldq   %%xmm3,%%xmm3                 \n"
      "paddd       %%xmm3,%%xmm3                 \n"
      "pextrw      $0x3,%%xmm2,%k4               \n"

      LABELALIGN
      "2:                                        \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "paddd       %%xmm3,%%xmm2                 \n"
      "movq        0x00(%1,%3,4),%%xmm0          \n"
      "psrlw       $0x9,%%xmm1                   \n"
      "movhps      0x00(%1,%4,4),%%xmm0          \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pxor        %%xmm6,%%xmm1                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "pextrw      $0x1,%%xmm2,%k3               \n"
      "pextrw      $0x3,%%xmm2,%k4               \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%0)                   \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x2,%2                       \n"
      "jge         2b                            \n"

      LABELALIGN
      "29:                                       \n"
      "add         $0x1,%2                       \n"
      "jl          99f                           \n"
      "psrlw       $0x9,%%xmm2                   \n"
      "movq        0x00(%1,%3,4),%%xmm0          \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pxor        %%xmm6,%%xmm2                 \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movd        %%xmm0,(%0)                   \n"

      LABELALIGN
      "99:                                       \n"  // clang-format error.

      : "+r"(dst_argb),    // %0
        "+r"(src_argb),    // %1
        "+rm"(dst_width),  // %2
        "=&r"(x0),         // %3
        "=&r"(x1)          // %4
      : "rm"(x),           // %5
        "rm"(dx)           // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
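
// Reference-only scalar sketch of the bilinear column filter above: x and dx
// are 16.16 fixed point; the top 7 bits of the fraction give the blend
// weight f, and each channel is blended as (near * (127 - f) + far * f) >> 7.
// Illustrative only (hypothetical helper); kept out of the build.
#if 0
static void ScaleARGBFilterColsRef(uint8_t* dst_argb,
                                   const uint8_t* src_argb,
                                   int dst_width,
                                   int x,
                                   int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;                      // integer source column.
    int f = (x >> 9) & 0x7f;               // 7 bit fraction.
    const uint8_t* a = src_argb + xi * 4;  // left (near) pixel.
    const uint8_t* b = a + 4;              // right (far) pixel.
    int i;
    for (i = 0; i < 4; ++i) {  // B, G, R, A channels.
      dst_argb[j * 4 + i] = (uint8_t)((a[i] * (127 - f) + b[i] * f) >> 7);
    }
    x += dx;
  }
}
#endif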

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld        $0x10,%%eax,%%edx             \n"
      "shl         $0x10,%%eax                   \n"
      "idiv        %1                            \n"
      "mov         %0, %%eax                     \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
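
// Reference-only scalar sketch of the 16.16 fixed point divide above,
// equivalent to ((int64_t)num << 16) / div. Illustrative only (hypothetical
// helper); kept out of the build.
#if 0
static int FixedDivRef(int num, int div) {
  return (int)(((int64_t)(num) << 16) / div);
}
#endif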

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld        $0x10,%%eax,%%edx             \n"
      "shl         $0x10,%%eax                   \n"
      "sub         $0x10001,%%eax                \n"
      "sbb         $0x0,%%edx                    \n"
      "sub         $0x1,%1                       \n"
      "idiv        %1                            \n"
      "mov         %0, %%eax                     \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
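
// Reference-only scalar sketch of the divide above: the assembly computes
// (((int64_t)num << 16) - 0x10001) / (div - 1), i.e. (num - 1) / (div - 1)
// in 16.16 fixed point. Illustrative only (hypothetical helper); kept out of
// the build.
#if 0
static int FixedDiv1Ref(int num, int div) {
  return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
}
#endif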

#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
// Shuffle table for splitting UV into upper and lower part of register.
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
                                      1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
                                      6u,   14u,  0x80, 0x80, 0x80, 0x80,
                                      0x80, 0x80, 0x80, 0x80};

void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5, %%xmm5                \n"  // zero
      "movdqa      %4,%%xmm1                     \n"  // split shuffler
      "movdqa      %5,%%xmm3                     \n"  // merge shuffler

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
      "lea         0x10(%0),%0                   \n"
      "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
      "pshufb      %%xmm1,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
      "psrlw       $0x1,%%xmm0                   \n"  // round
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"  // 4 UV
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kShuffleSplitUV),         // %4
        "m"(kShuffleMergeUV)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
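
// Reference-only scalar sketch of the 2x2 UV box filter above: each output
// UV pair is the rounded average of a 2x2 block of UV pairs; dst_width is in
// UV pairs. The SSSE3 rounding (pmaddubsw, psrlw, pavgw) is effectively
// (sum + 2) >> 2. Illustrative only (hypothetical helper); kept out of the
// build.
#if 0
static void ScaleUVRowDown2BoxRef(const uint8_t* src_uv,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_uv,
                                  int dst_width) {
  const uint8_t* s = src_uv;               // row 0
  const uint8_t* t = src_uv + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_uv[x * 2 + 0] = (uint8_t)(
        (s[x * 4 + 0] + s[x * 4 + 2] + t[x * 4 + 0] + t[x * 4 + 2] + 2) >> 2);
    dst_uv[x * 2 + 1] = (uint8_t)(
        (s[x * 4 + 1] + s[x * 4 + 3] + t[x * 4 + 1] + t[x * 4 + 3] + 2) >> 2);
  }
}
#endif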

#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 01010101
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero
      "vbroadcastf128 %4,%%ymm1                  \n"  // split shuffler
      "vbroadcastf128 %5,%%ymm3                  \n"  // merge shuffler

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // 16 UV row 0
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"  // 16 UV row 1
      "lea         0x20(%0),%0                   \n"
      "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv
      "vpshufb     %%ymm1,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // horizontal add
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // vertical add
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // round
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm3,%%ymm0,%%ymm0          \n"  // merge uv
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // combine qwords
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"  // 8 UV
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kShuffleSplitUV),         // %4
        "m"(kShuffleMergeUV)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEUVROWDOWN2BOX_AVX2

static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
                                      3, 1, 3, 1, 1, 3, 1, 3};

#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
      "psrlw       $15,%%xmm4                    \n"
      "psllw       $1,%%xmm4                     \n"  // all 2
      "movdqa      %3,%%xmm3                     \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
      "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
      "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
      "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
      "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
      "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),        // %0
        "+r"(dst_ptr),        // %1
        "+r"(dst_width)       // %2
      : "m"(kUVLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
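
// Reference-only scalar sketch of the 2x horizontal (linear) UV upsample
// above: output samples 2x and 2x+1 are (3 * near + far + 2) >> 2 blends of
// source UV pairs x and x+1; dst_width is in UV pairs. Illustrative only
// (hypothetical helper); kept out of the build.
#if 0
static void ScaleUVRowUp2LinearRef(const uint8_t* src_uv,
                                   uint8_t* dst_uv,
                                   int dst_width) {
  int src_width = dst_width / 2;
  int x, i;
  for (x = 0; x < src_width; ++x) {
    for (i = 0; i < 2; ++i) {  // U then V.
      int near0 = src_uv[x * 2 + i];
      int far0 = src_uv[x * 2 + 2 + i];
      dst_uv[x * 4 + i] = (uint8_t)((3 * near0 + far0 + 2) >> 2);
      dst_uv[x * 4 + 2 + i] = (uint8_t)((near0 + 3 * far0 + 2) >> 2);
    }
  }
}
#endif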

#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
      "psrlw       $15,%%xmm6                    \n"
      "psllw       $3,%%xmm6                     \n"  // all 8
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
      "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
      "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
      "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
      "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)

      "movq        (%0,%3),%%xmm1                \n"
      "movq        2(%0,%3),%%xmm4               \n"
      "punpcklbw   %%xmm4,%%xmm1                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpckhdq   %%xmm1,%%xmm3                 \n"
      "punpckldq   %%xmm1,%%xmm1                 \n"
      "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
      "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)

      // xmm0 xmm2
      // xmm1 xmm3

      "movdqa      %%xmm0,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm5                 \n"
      "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
      "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)

      "movdqa      %%xmm1,%%xmm5                 \n"
      "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
      "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
      "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)

      "movdqa      %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
      "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
      "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
      "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)

      "movdqa      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
      "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
      "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
      "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)

      "packuswb    %%xmm0,%%xmm4                 \n"
      "movdqu      %%xmm4,(%1)                   \n"  // store above
      "packuswb    %%xmm1,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4)                \n"  // store below

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
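
// Reference-only scalar sketch of the 2x bilinear UV upsample above: the two
// output rows blend the 2x2 source neighborhood with weights 9, 3, 3, 1
// (sum 16), i.e. (9 * nearest + 3 * horizontal + 3 * vertical + diagonal +
// 8) >> 4; dst_width is in UV pairs. Illustrative only (hypothetical
// helper); kept out of the build.
#if 0
static void ScaleUVRowUp2BilinearRef(const uint8_t* src_uv,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_uv,
                                     ptrdiff_t dst_stride,
                                     int dst_width) {
  const uint8_t* r0 = src_uv;               // source row 0
  const uint8_t* r1 = src_uv + src_stride;  // source row 1
  uint8_t* d0 = dst_uv;                     // output row above
  uint8_t* d1 = dst_uv + dst_stride;        // output row below
  int src_width = dst_width / 2;
  int x, i;
  for (x = 0; x < src_width; ++x) {
    for (i = 0; i < 2; ++i) {  // U then V.
      int a = r0[x * 2 + i], b = r0[x * 2 + 2 + i];  // row 0: near, far
      int c = r1[x * 2 + i], d = r1[x * 2 + 2 + i];  // row 1: near, far
      d0[x * 4 + i] = (uint8_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
      d0[x * 4 + 2 + i] = (uint8_t)((3 * a + 9 * b + c + 3 * d + 8) >> 4);
      d1[x * 4 + i] = (uint8_t)((3 * a + b + 9 * c + 3 * d + 8) >> 4);
      d1[x * 4 + 2 + i] = (uint8_t)((a + 3 * b + 3 * c + 9 * d + 8) >> 4);
    }
  }
}
#endif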

#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2

void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
      "vbroadcastf128 %3,%%ymm3                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"
      "vmovdqu     2(%0),%%xmm1                  \n"
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
      "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),        // %0
        "+r"(dst_ptr),        // %1
        "+r"(dst_width)       // %2
      : "m"(kUVLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  asm volatile(
      "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrlw      $15,%%ymm6,%%ymm6             \n"
      "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
      "vbroadcastf128 %5,%%ymm7                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"
      "vmovdqu     2(%0),%%xmm1                  \n"
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)

      "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
      "vmovdqu     2(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
      "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
      "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
      "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
      "vpunpckhdq  %%ymm2,%%ymm2,%%ymm4          \n"
      "vpunpckldq  %%ymm2,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)

      // ymm0 ymm1
      // ymm2 ymm3

      "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
      "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
      "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
      "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)

      "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
      "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
      "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
      "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)

      "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
      "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)

      "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
      "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
      "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)

      "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
      "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
      "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  asm volatile(
      "pxor        %%xmm5,%%xmm5                 \n"
      "pcmpeqd     %%xmm4,%%xmm4                 \n"
      "psrld       $31,%%xmm4                    \n"
      "pslld       $1,%%xmm4                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
      "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)

      "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0011 (32b, 1u1v)
      "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1122 (32b, 1u1v)

      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"

      "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (lo, far)
      "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (hi, far)

      "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
      "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)

      "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
      "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
      "packusdw    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
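
// Reference-only scalar sketch of the 16 bit linear upsample above: the same
// (3 * near + far + 2) >> 2 kernel as the 8 bit path, carried out in 32 bit
// intermediates so high bit depth samples cannot overflow; dst_width is in
// UV pairs. Illustrative only (hypothetical helper); kept out of the build.
#if 0
static void ScaleUVRowUp2Linear16Ref(const uint16_t* src_uv,
                                     uint16_t* dst_uv,
                                     int dst_width) {
  int src_width = dst_width / 2;
  int x, i;
  for (x = 0; x < src_width; ++x) {
    for (i = 0; i < 2; ++i) {  // U then V.
      uint32_t near0 = src_uv[x * 2 + i];
      uint32_t far0 = src_uv[x * 2 + 2 + i];
      dst_uv[x * 4 + i] = (uint16_t)((3 * near0 + far0 + 2) >> 2);
      dst_uv[x * 4 + 2 + i] = (uint16_t)((near0 + 3 * far0 + 2) >> 2);
    }
  }
}
#endif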

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  asm volatile(
      "pxor        %%xmm7,%%xmm7                 \n"
      "pcmpeqd     %%xmm6,%%xmm6                 \n"
      "psrld       $31,%%xmm6                    \n"
      "pslld       $3,%%xmm6                     \n"  // all 8

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
      "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
      "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
      "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
      "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)

      "movq        (%0,%3,2),%%xmm2              \n"
      "movq        4(%0,%3,2),%%xmm3             \n"
      "punpcklwd   %%xmm7,%%xmm2                 \n"
      "punpcklwd   %%xmm7,%%xmm3                 \n"
      "movdqa      %%xmm2,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "pshufd      $0b01001110,%%xmm4,%%xmm4     \n"  // 1100 (far) (2, lo)
      "pshufd      $0b01001110,%%xmm5,%%xmm5     \n"  // 2211 (far) (2, hi)
      "paddd       %%xmm2,%%xmm4                 \n"  // near+far (2, lo)
      "paddd       %%xmm3,%%xmm5                 \n"  // near+far (2, hi)
      "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (2, lo)
      "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (2, hi)
      "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
      "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)

      "movdqa      %%xmm0,%%xmm4                 \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
      "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
      "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
      "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)

      "movdqa      %%xmm2,%%xmm5                 \n"
      "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
      "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
      "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)

      "movdqa      %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
      "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
      "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)

      "movdqa      %%xmm3,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
      "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
      "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
      "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)

      "packusdw    %%xmm0,%%xmm4                 \n"
      "movdqu      %%xmm4,(%1)                   \n"  // store above
      "packusdw    %%xmm2,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  asm volatile(
      "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrld      $31,%%ymm4,%%ymm4             \n"
      "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)

      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)

      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)

      "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
      "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)

      "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
      "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
      "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  asm volatile(
      "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrld      $31,%%ymm6,%%ymm6             \n"
      "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (hi)

      "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0,%3,2),%%xmm3             \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
      "vpshufd     $0b01001110,%%ymm2,%%ymm4     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm3,%%ymm5     \n"  // 22114433 (hi, far)
      "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
      "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
      "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
      "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (hi)

      "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
      "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
      "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
      "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)

      "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
      "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
      "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
      "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)

      "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
      "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
      "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)

      "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
      "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
      "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
      "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)

      "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
      "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
      "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif