1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/scale_row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21
// Shuffle-control and multiply-accumulate constants for the SSSE3/AVX2
// scalers below.  In all pshufb masks, 128 (0x80, high bit set) writes a
// zero byte to that destination lane.

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// pmaddubsw weights (3:1, 2:2, 1:3 taps) for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// pmaddubsw weights for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// pmaddubsw weights for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 filters: added before the >> 2.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

// Pick bytes 0,3,6,8,11,14 into the low 6 lanes (3/8 point sample, part a).
static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

// Same selection from the second 16 bytes, placed in lanes 6..11 (part b).
static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                               6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                               6, 7, 12, 13, 128, 128, 128, 128};

// pmulhuw scale factors (fixed point 0.16) for boxes of 3x3 and 2x3.
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// pmulhuw scale factors for boxes of 3x2 and 2x2.
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0, 0};

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
94
// 1/2 horizontal point-sample: psrlw $8 keeps the high (odd-index) byte of
// each 16-bit pair, so every second source pixel survives the packuswb.
// Reads 32 source bytes and writes 16 dest bytes per iteration; src_stride
// is unused (single-row variant).  Assumes dst_width is a multiple of 16 --
// TODO(review): confirm the libyuv dispatcher guarantees this.
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"  // keep odd bytes (pixels 1,3,5,...)
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}
120
// 1/2 horizontal linear filter: xmm4 is built as sixteen 0x01 bytes, so
// pmaddubsw sums each adjacent byte pair into a word; pavgw against zero
// (xmm5) then computes (a + b + 1) >> 1.  32 source bytes -> 16 dest bytes
// per iteration; src_stride unused (single-row variant).
void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"  // xmm4 = all ones
      "psrlw $0xf,%%xmm4 \n"      // words of 1
      "packuswb %%xmm4,%%xmm4 \n" // bytes of 1 (pair weights 1,1)
      "pxor %%xmm5,%%xmm5 \n"     // zero, for rounding via pavgw

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"  // horizontal pair sums
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"  // (sum + 1) >> 1
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
152
ScaleRowDown2Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)153 void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
154 ptrdiff_t src_stride,
155 uint8_t* dst_ptr,
156 int dst_width) {
157 asm volatile(
158 "pcmpeqb %%xmm4,%%xmm4 \n"
159 "psrlw $0xf,%%xmm4 \n"
160 "packuswb %%xmm4,%%xmm4 \n"
161 "pxor %%xmm5,%%xmm5 \n"
162
163 LABELALIGN
164 "1: \n"
165 "movdqu (%0),%%xmm0 \n"
166 "movdqu 0x10(%0),%%xmm1 \n"
167 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
168 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
169 "lea 0x20(%0),%0 \n"
170 "pmaddubsw %%xmm4,%%xmm0 \n"
171 "pmaddubsw %%xmm4,%%xmm1 \n"
172 "pmaddubsw %%xmm4,%%xmm2 \n"
173 "pmaddubsw %%xmm4,%%xmm3 \n"
174 "paddw %%xmm2,%%xmm0 \n"
175 "paddw %%xmm3,%%xmm1 \n"
176 "psrlw $0x1,%%xmm0 \n"
177 "psrlw $0x1,%%xmm1 \n"
178 "pavgw %%xmm5,%%xmm0 \n"
179 "pavgw %%xmm5,%%xmm1 \n"
180 "packuswb %%xmm1,%%xmm0 \n"
181 "movdqu %%xmm0,(%1) \n"
182 "lea 0x10(%1),%1 \n"
183 "sub $0x10,%2 \n"
184 "jg 1b \n"
185 : "+r"(src_ptr), // %0
186 "+r"(dst_ptr), // %1
187 "+r"(dst_width) // %2
188 : "r"((intptr_t)(src_stride)) // %3
189 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
190 }
191
192 #ifdef HAS_SCALEROWDOWN2_AVX2
// AVX2 1/2 horizontal point-sample: same scheme as the SSSE3 version but
// 64 source bytes -> 32 dest bytes per iteration.  vpermq $0xd8 undoes the
// lane interleave left by the cross-lane vpackuswb.  src_stride unused.
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // keep odd bytes
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // fix 128-bit lane order
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}
218
// AVX2 1/2 horizontal linear filter: ymm4 = bytes of 1 so vpmaddubsw sums
// adjacent byte pairs; vpavgw with zero (ymm5) rounds the halving:
// (a + b + 1) >> 1.  64 source bytes -> 32 dest bytes per iteration.
// src_stride unused (single-row variant).
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"  // bytes of 1
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"      // zero for rounding

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // horizontal pair sums
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // fix lane order after pack
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
252
ScaleRowDown2Box_AVX2(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)253 void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
254 ptrdiff_t src_stride,
255 uint8_t* dst_ptr,
256 int dst_width) {
257 asm volatile(
258 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
259 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
260 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
261 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
262
263 LABELALIGN
264 "1: \n"
265 "vmovdqu (%0),%%ymm0 \n"
266 "vmovdqu 0x20(%0),%%ymm1 \n"
267 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
268 "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
269 "lea 0x40(%0),%0 \n"
270 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
271 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
272 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
273 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
274 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
275 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
276 "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
277 "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
278 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
279 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
280 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
281 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
282 "vmovdqu %%ymm0,(%1) \n"
283 "lea 0x20(%1),%1 \n"
284 "sub $0x20,%2 \n"
285 "jg 1b \n"
286 "vzeroupper \n"
287 : "+r"(src_ptr), // %0
288 "+r"(dst_ptr), // %1
289 "+r"(dst_width) // %2
290 : "r"((intptr_t)(src_stride)) // %3
291 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
292 }
293 #endif // HAS_SCALEROWDOWN2_AVX2
294
// 1/4 horizontal point-sample: xmm5 becomes mask 0x00FF0000 per dword
// (psrld 24 then pslld 16), keeping byte 2 of every 4 -- i.e. pixels
// 2, 6, 10, ...  Two packs then compress 32 source bytes to 8 dest bytes
// per iteration.  src_stride unused (single-row variant).
void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrld $0x18,%%xmm5 \n"  // 0x000000FF per dword
      "pslld $0x10,%%xmm5 \n"  // 0x00FF0000 per dword

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"  // isolate 1 byte of each 4
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
325
// 4x4 box filter: sums 4 rows of horizontal byte-pair sums (pmaddubsw with
// weights 1), folds adjacent word pairs with phaddw so each result word is
// a 16-pixel sum, adds 8 (xmm5) and shifts right 4 to average with rounding.
// Rows are addressed at src_ptr, +stride, +2*stride and +3*stride (stridex3,
// computed in %3 by the lea).  32 source bytes/row -> 8 dest bytes per loop.
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"       // words of 1
      "movdqa %%xmm4,%%xmm5 \n"
      "packuswb %%xmm4,%%xmm4 \n"  // xmm4 = bytes of 1 (pair weights)
      "psllw $0x3,%%xmm5 \n"       // xmm5 = words of 8 (rounding)
      "lea 0x00(%4,%4,2),%3 \n"    // stridex3 = 3 * src_stride

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // row 0
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"  // row 1
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "movdqu 0x00(%0,%4,2),%%xmm2 \n"  // row 2
      "movdqu 0x10(%0,%4,2),%%xmm3 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"  // row 3
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "phaddw %%xmm1,%%xmm0 \n"  // fold to 16-pixel sums
      "paddw %%xmm5,%%xmm0 \n"   // + 8
      "psrlw $0x4,%%xmm0 \n"     // / 16
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width), // %2
        "=&r"(stridex3)  // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
379
380 #ifdef HAS_SCALEROWDOWN4_AVX2
// AVX2 1/4 horizontal point-sample: mask ymm5 = 0x00FF0000 per dword keeps
// byte 2 of each 4 source bytes; two pack/permq rounds compress 64 source
// bytes to 16 dest bytes per iteration.  src_stride unused.
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrld $0x18,%%ymm5,%%ymm5 \n"
      "vpslld $0x10,%%ymm5,%%ymm5 \n"  // 0x00FF0000 per dword

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // fix lanes after each pack
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%xmm0,(%1) \n"  // low 16 bytes hold the result
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
414
// AVX2 4x4 box filter: same scheme as the SSSE3 version -- four rows of
// horizontal pair sums (vpmaddubsw with byte weights 1), vphaddw fold,
// +8 (ymm5) and >> 4 for a rounded /16 average.  Rows 0..3 are addressed
// via src_stride (%3) and 3*src_stride (%4).  64 source bytes per row ->
// 16 dest bytes per iteration.
void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"       // words of 1
      "vpsllw $0x3,%%ymm4,%%ymm5 \n"       // ymm5 = words of 8 (rounding)
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"  // ymm4 = bytes of 1

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // row 0
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"  // row 1
      "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"  // row 2
      "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"  // row 3 (3 * stride)
      "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // fold to 16-pixel sums
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"  // + 8
      "vpsrlw $0x4,%%ymm0,%%ymm0 \n"    // / 16
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),    // %3
        "r"((intptr_t)(src_stride * 3)) // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
468 #endif // HAS_SCALEROWDOWN4_AVX2
469
ScaleRowDown34_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)470 void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
471 ptrdiff_t src_stride,
472 uint8_t* dst_ptr,
473 int dst_width) {
474 (void)src_stride;
475 asm volatile(
476 "movdqa %0,%%xmm3 \n"
477 "movdqa %1,%%xmm4 \n"
478 "movdqa %2,%%xmm5 \n"
479 :
480 : "m"(kShuf0), // %0
481 "m"(kShuf1), // %1
482 "m"(kShuf2) // %2
483 );
484 asm volatile(LABELALIGN
485 "1: \n"
486 "movdqu (%0),%%xmm0 \n"
487 "movdqu 0x10(%0),%%xmm2 \n"
488 "lea 0x20(%0),%0 \n"
489 "movdqa %%xmm2,%%xmm1 \n"
490 "palignr $0x8,%%xmm0,%%xmm1 \n"
491 "pshufb %%xmm3,%%xmm0 \n"
492 "pshufb %%xmm4,%%xmm1 \n"
493 "pshufb %%xmm5,%%xmm2 \n"
494 "movq %%xmm0,(%1) \n"
495 "movq %%xmm1,0x8(%1) \n"
496 "movq %%xmm2,0x10(%1) \n"
497 "lea 0x18(%1),%1 \n"
498 "sub $0x18,%2 \n"
499 "jg 1b \n"
500 : "+r"(src_ptr), // %0
501 "+r"(dst_ptr), // %1
502 "+r"(dst_width) // %2
503 ::"memory",
504 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
505 }
506
// 3/4 filtered downscale, variant where the second row gets weight 1/2:
// pavgb blends this row with the row at src_stride, then each 8-pixel group
// is shuffled (kShuf01/11/21) into weighted pairs, combined by pmaddubsw
// (kMadd01/kMadd11/kMadd21 taps 3:1 / 2:2 / 1:3), rounded with kRound34 and
// shifted by 2.  32 source bytes -> 24 dest bytes per iteration.
// NOTE(review): the two constant-loading asm statements write xmm0-xmm5 but
// declare no clobbers; they rely on nothing being live between the
// statements -- worth declaring clobbers, as done elsewhere.
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"  // kShuf01
      "movdqa %1,%%xmm3 \n"  // kShuf11
      "movdqa %2,%%xmm4 \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa %0,%%xmm5 \n"  // kMadd01
      "movdqa %1,%%xmm0 \n"  // kMadd11
      "movdqa %2,%%xmm1 \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x00(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"  // blend the two rows 1:1
      "pshufb %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm5,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"  // + kRound34
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,(%1) \n"
      "movdqu 0x8(%0),%%xmm6 \n"
      "movdqu 0x8(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "pmaddubsw %%xmm0,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x8(%1) \n"
      "movdqu 0x10(%0),%%xmm6 \n"
      "movdqu 0x10(%0,%3,1),%%xmm7 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "pmaddubsw %4,%%xmm6 \n"  // kMadd21 straight from memory
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kMadd21)                  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
570
// 3/4 filtered downscale, variant where the second row gets weight 1/4:
// the double pavgb (first into xmm7, then back into xmm6) yields a 3:1 row
// blend before the same shuffle / pmaddubsw / round / shift pipeline as the
// _1_Box variant.  32 source bytes -> 24 dest bytes per iteration.
// NOTE(review): constant-loading asm statements here also declare no
// clobbers for the xmm registers they write (see _1_Box note).
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"  // kShuf01
      "movdqa %1,%%xmm3 \n"  // kShuf11
      "movdqa %2,%%xmm4 \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa %0,%%xmm5 \n"  // kMadd01
      "movdqa %1,%%xmm0 \n"  // kMadd11
      "movdqa %2,%%xmm1 \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );

  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x00(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm6,%%xmm7 \n"  // two averages give the rows a
      "pavgb %%xmm7,%%xmm6 \n"  // 3:1 weighting
      "pshufb %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm5,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"  // + kRound34
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,(%1) \n"
      "movdqu 0x8(%0),%%xmm6 \n"
      "movdqu 0x8(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "pmaddubsw %%xmm0,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x8(%1) \n"
      "movdqu 0x10(%0),%%xmm6 \n"
      "movdqu 0x10(%0,%3,1),%%xmm7 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "pmaddubsw %4,%%xmm6 \n"  // kMadd21 straight from memory
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kMadd21)                  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
638
// 3/8 horizontal point-sample: kShuf38a/kShuf38b each pick 6 bytes from one
// 16-byte half into disjoint lanes; paddusb merges them (the 128-masked
// lanes are zero) giving 12 dest bytes from 32 source bytes per iteration.
// src_stride unused (single-row variant).
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa %3,%%xmm4 \n"  // kShuf38a
      "movdqa %4,%%xmm5 \n"  // kShuf38b

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"  // merge disjoint lanes
      "movq %%xmm0,(%1) \n"       // write 8 + 4 = 12 bytes
      "movhlps %%xmm0,%%xmm1 \n"
      "movd %%xmm1,0x8(%1) \n"
      "lea 0xc(%1),%1 \n"
      "sub $0xc,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kShuf38a),   // %3
        "m"(kShuf38b)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
669
// 3/8 downscale over 2 rows: pavgb blends the two rows, then kShufAb0/1/2
// spread the three contributing bytes of each output pixel into word lanes,
// the paddusw sums them, and pmulhuw by kScaleAb2 (65536/3 or 65536/2)
// divides by the box size.  16 source bytes/row -> 6 dest bytes per loop
// (writes 4 + 4 bytes with a 2-byte overlap at offset 2).
void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"
      "movdqa %1,%%xmm3 \n"
      "movdqa %2,%%xmm4 \n"
      "movdqa %3,%%xmm5 \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
  );
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%3,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "pavgb %%xmm1,%%xmm0 \n"  // vertical average of the 2 rows
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"  // first byte of each triple
      "movdqa %%xmm0,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"  // second byte
      "paddusw %%xmm6,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"  // third byte (zero for 2-wide boxes)
      "paddusw %%xmm0,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"  // divide by box size
      "packuswb %%xmm1,%%xmm1 \n"
      "movd %%xmm1,(%1) \n"
      "psrlq $0x10,%%xmm1 \n"
      "movd %%xmm1,0x2(%1) \n"
      "lea 0x6(%1),%1 \n"
      "sub $0x6,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
713
// 3/8 downscale over 3 rows: widens bytes to words (punpcklbw with zero in
// xmm5), accumulates the three rows with saturating adds, folds each
// horizontal triple by two shift-and-add steps, rearranges the three sums
// with kShufAc/kShufAc3, and pmulhuw by kScaleAc33 (65536/9 or 65536/6)
// divides by the box size.  16 source bytes/row -> 6 dest bytes per loop
// (writes 4 + 4 bytes with a 2-byte overlap at offset 2).
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"      // kShufAc
      "movdqa %1,%%xmm3 \n"      // kShufAc3
      "movdqa %2,%%xmm4 \n"      // kScaleAc33
      "pxor %%xmm5,%%xmm5 \n"    // zero for byte->word widening
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
  );
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"           // row 0
      "movdqu 0x00(%0,%3,1),%%xmm6 \n"  // row 1
      "movhlps %%xmm0,%%xmm1 \n"
      "movhlps %%xmm6,%%xmm7 \n"
      "punpcklbw %%xmm5,%%xmm0 \n"  // widen to 16 bit
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm6 \n"
      "punpcklbw %%xmm5,%%xmm7 \n"
      "paddusw %%xmm6,%%xmm0 \n"
      "paddusw %%xmm7,%%xmm1 \n"
      "movdqu 0x00(%0,%3,2),%%xmm6 \n"  // row 2
      "lea 0x10(%0),%0 \n"
      "movhlps %%xmm6,%%xmm7 \n"
      "punpcklbw %%xmm5,%%xmm6 \n"
      "punpcklbw %%xmm5,%%xmm7 \n"
      "paddusw %%xmm6,%%xmm0 \n"  // column sums of 3 rows
      "paddusw %%xmm7,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm6 \n"
      "psrldq $0x2,%%xmm0 \n"     // fold horizontal triples
      "paddusw %%xmm0,%%xmm6 \n"
      "psrldq $0x2,%%xmm0 \n"
      "paddusw %%xmm0,%%xmm6 \n"
      "pshufb %%xmm2,%%xmm6 \n"   // sums 0,3,6 -> lanes 0,1,2
      "movdqa %%xmm1,%%xmm7 \n"
      "psrldq $0x2,%%xmm1 \n"
      "paddusw %%xmm1,%%xmm7 \n"
      "psrldq $0x2,%%xmm1 \n"
      "paddusw %%xmm1,%%xmm7 \n"
      "pshufb %%xmm3,%%xmm7 \n"   // sums 0,3,6 -> lanes 3,4,5
      "paddusw %%xmm7,%%xmm6 \n"
      "pmulhuw %%xmm4,%%xmm6 \n"  // divide by box size
      "packuswb %%xmm6,%%xmm6 \n"
      "movd %%xmm6,(%1) \n"
      "psrlq $0x10,%%xmm6 \n"
      "movd %%xmm6,0x2(%1) \n"
      "lea 0x6(%1),%1 \n"
      "sub $0x6,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
775
// Swaps each pair of 16-bit lanes (byte pattern 2,3,0,1,...): turns the
// interleaved "near" words into their "far" neighbours for the 2x upsamplers.
static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
                                        10, 11, 8, 9, 14, 15, 12, 13};

// pmaddubsw taps (3,1 / 1,3) for the 2x linear upsample kernel.
static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
                                    3, 1, 1, 3, 3, 1, 1, 3};
781
782 #ifdef HAS_SCALEROWUP2LINEAR_SSE2
// 2x horizontal linear upsample (SSE2, no pshufb): each output sample is
// (3*near + far + 2) >> 2.  The punpcklbw interleaves build the near/far
// byte pairs from src[i] and src[i+1]; xmm6 holds words of 2 for rounding.
// Reads 8 (+1) source bytes and writes 16 dest bytes per iteration.
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "pxor %%xmm0,%%xmm0 \n"  // 0
      "pcmpeqw %%xmm6,%%xmm6 \n"
      "psrlw $15,%%xmm6 \n"
      "psllw $1,%%xmm6 \n"  // all 2 (rounding term)

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm1 \n"  // 01234567
      "movq 1(%0),%%xmm2 \n" // 12345678
      "movdqa %%xmm1,%%xmm3 \n"
      "punpcklbw %%xmm2,%%xmm3 \n"  // 0112233445566778
      "punpcklbw %%xmm1,%%xmm1 \n"  // 0011223344556677
      "punpcklbw %%xmm2,%%xmm2 \n"  // 1122334455667788
      "movdqa %%xmm1,%%xmm4 \n"
      "punpcklbw %%xmm0,%%xmm4 \n"  // 00112233 (16)
      "movdqa %%xmm2,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n"  // 11223344 (16)
      "paddw %%xmm5,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "paddw %%xmm6,%%xmm4 \n"
      "punpcklbw %%xmm0,%%xmm5 \n"  // 01122334 (16)
      "paddw %%xmm5,%%xmm5 \n"
      "paddw %%xmm4,%%xmm5 \n"  // 3*near+far+2 (lo)
      "psrlw $2,%%xmm5 \n"      // 3/4*near+1/4*far (lo)

      "punpckhbw %%xmm0,%%xmm1 \n"  // 44556677 (16)
      "punpckhbw %%xmm0,%%xmm2 \n"  // 55667788 (16)
      "paddw %%xmm2,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm3 \n"  // 45566778 (16)
      "paddw %%xmm6,%%xmm1 \n"
      "paddw %%xmm3,%%xmm3 \n"
      "paddw %%xmm3,%%xmm1 \n"  // 3*near+far+2 (hi)
      "psrlw $2,%%xmm1 \n"      // 3/4*near+1/4*far (hi)

      "packuswb %%xmm1,%%xmm5 \n"
      "movdqu %%xmm5,(%1) \n"

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 8 sample to 16 sample
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
834 #endif
835
836 #ifdef HAS_SCALEROWUP2BILINEAR_SSE2
// 2x bilinear upsample (SSE2): computes the horizontal 3*near+far sums for
// the current row (xmm4/xmm1) and the row at src_stride (xmm5/xmm2), then
// combines them vertically with 9:3:3:1 weights plus 8 and >> 4, writing one
// output row at dst_ptr and one at dst_ptr + dst_stride.  Reads 8 (+1)
// source bytes per row and writes 16 dest bytes per output row per loop.
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      LABELALIGN
      "1: \n"
      "pxor %%xmm0,%%xmm0 \n"  // 0 (re-zeroed each pass; xmm0 is
                               // reused as the rounding const below)
      // above line
      "movq (%0),%%xmm1 \n"  // 01234567
      "movq 1(%0),%%xmm2 \n" // 12345678
      "movdqa %%xmm1,%%xmm3 \n"
      "punpcklbw %%xmm2,%%xmm3 \n"  // 0112233445566778
      "punpcklbw %%xmm1,%%xmm1 \n"  // 0011223344556677
      "punpcklbw %%xmm2,%%xmm2 \n"  // 1122334455667788

      "movdqa %%xmm1,%%xmm4 \n"
      "punpcklbw %%xmm0,%%xmm4 \n"  // 00112233 (16)
      "movdqa %%xmm2,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n"  // 11223344 (16)
      "paddw %%xmm5,%%xmm4 \n"  // near+far
      "movdqa %%xmm3,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n"  // 01122334 (16)
      "paddw %%xmm5,%%xmm5 \n"  // 2*near
      "paddw %%xmm5,%%xmm4 \n"  // 3*near+far (1, lo)

      "punpckhbw %%xmm0,%%xmm1 \n"  // 44556677 (16)
      "punpckhbw %%xmm0,%%xmm2 \n"  // 55667788 (16)
      "paddw %%xmm2,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm3 \n"  // 45566778 (16)
      "paddw %%xmm3,%%xmm3 \n"  // 2*near
      "paddw %%xmm3,%%xmm1 \n"  // 3*near+far (1, hi)

      // below line
      "movq (%0,%3),%%xmm6 \n"  // 01234567
      "movq 1(%0,%3),%%xmm2 \n" // 12345678
      "movdqa %%xmm6,%%xmm3 \n"
      "punpcklbw %%xmm2,%%xmm3 \n"  // 0112233445566778
      "punpcklbw %%xmm6,%%xmm6 \n"  // 0011223344556677
      "punpcklbw %%xmm2,%%xmm2 \n"  // 1122334455667788

      "movdqa %%xmm6,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n"  // 00112233 (16)
      "movdqa %%xmm2,%%xmm7 \n"
      "punpcklbw %%xmm0,%%xmm7 \n"  // 11223344 (16)
      "paddw %%xmm7,%%xmm5 \n"  // near+far
      "movdqa %%xmm3,%%xmm7 \n"
      "punpcklbw %%xmm0,%%xmm7 \n"  // 01122334 (16)
      "paddw %%xmm7,%%xmm7 \n"  // 2*near
      "paddw %%xmm7,%%xmm5 \n"  // 3*near+far (2, lo)

      "punpckhbw %%xmm0,%%xmm6 \n"  // 44556677 (16)
      "punpckhbw %%xmm0,%%xmm2 \n"  // 55667788 (16)
      "paddw %%xmm6,%%xmm2 \n"  // near+far
      "punpckhbw %%xmm0,%%xmm3 \n"  // 45566778 (16)
      "paddw %%xmm3,%%xmm3 \n"  // 2*near
      "paddw %%xmm3,%%xmm2 \n"  // 3*near+far (2, hi)

      // xmm4 xmm1
      // xmm5 xmm2
      "pcmpeqw %%xmm0,%%xmm0 \n"
      "psrlw $15,%%xmm0 \n"
      "psllw $3,%%xmm0 \n"  // all 8 (rounding for the >> 4)

      "movdqa %%xmm4,%%xmm3 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "paddw %%xmm3,%%xmm3 \n"  // 6*near+2*far (1, lo)
      "paddw %%xmm0,%%xmm6 \n"  // 3*near+far+8 (2, lo)
      "paddw %%xmm4,%%xmm3 \n"  // 9*near+3*far (1, lo)
      "paddw %%xmm6,%%xmm3 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw $4,%%xmm3 \n"      // ^ div by 16

      "movdqa %%xmm1,%%xmm7 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "paddw %%xmm7,%%xmm7 \n"  // 6*near+2*far (1, hi)
      "paddw %%xmm0,%%xmm6 \n"  // 3*near+far+8 (2, hi)
      "paddw %%xmm1,%%xmm7 \n"  // 9*near+3*far (1, hi)
      "paddw %%xmm6,%%xmm7 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw $4,%%xmm7 \n"      // ^ div by 16

      "packuswb %%xmm7,%%xmm3 \n"
      "movdqu %%xmm3,(%1) \n"  // save above line

      "movdqa %%xmm5,%%xmm3 \n"
      "paddw %%xmm0,%%xmm4 \n"  // 3*near+far+8 (1, lo)
      "paddw %%xmm3,%%xmm3 \n"  // 6*near+2*far (2, lo)
      "paddw %%xmm3,%%xmm5 \n"  // 9*near+3*far (2, lo)
      "paddw %%xmm4,%%xmm5 \n"  // 9 3 3 1 + 8 (lo)
      "psrlw $4,%%xmm5 \n"      // ^ div by 16

      "movdqa %%xmm2,%%xmm3 \n"
      "paddw %%xmm0,%%xmm1 \n"  // 3*near+far+8 (1, hi)
      "paddw %%xmm3,%%xmm3 \n"  // 6*near+2*far (2, hi)
      "paddw %%xmm3,%%xmm2 \n"  // 9*near+3*far (2, hi)
      "paddw %%xmm1,%%xmm2 \n"  // 9 3 3 1 + 8 (hi)
      "psrlw $4,%%xmm2 \n"      // ^ div by 16

      "packuswb %%xmm2,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4) \n"  // save below line

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 8 sample to 16 sample
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
950 #endif
951
952 #ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3
// 2x horizontal linear upsample for 12-bit samples stored in uint16_t:
// interleaves near/far words with punpck, derives the "far" operand by
// swapping word pairs via kLinearShuffleFar, and computes
// (3*near + far + 2) >> 2 in 16-bit arithmetic (safe because samples are
// <= 12 bits).  Reads 8 (+1) source words, writes 16 dest words per loop.
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
  asm volatile(
      "movdqa %3,%%xmm5 \n"  // kLinearShuffleFar
      "pcmpeqw %%xmm4,%%xmm4 \n"
      "psrlw $15,%%xmm4 \n"
      "psllw $1,%%xmm4 \n"  // all 2 (rounding term)

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // 01234567 (16)
      "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)

      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhwd %%xmm1,%%xmm2 \n"  // 45566778 (16)
      "punpcklwd %%xmm1,%%xmm0 \n"  // 01122334 (16)

      "movdqa %%xmm2,%%xmm3 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm3 \n"  // 54657687 (far)
      "pshufb %%xmm5,%%xmm1 \n"  // 10213243 (far)

      "paddw %%xmm4,%%xmm1 \n"  // far+2
      "paddw %%xmm4,%%xmm3 \n"  // far+2
      "paddw %%xmm0,%%xmm1 \n"  // near+far+2
      "paddw %%xmm2,%%xmm3 \n"  // near+far+2
      "paddw %%xmm0,%%xmm0 \n"  // 2*near
      "paddw %%xmm2,%%xmm2 \n"  // 2*near
      "paddw %%xmm1,%%xmm0 \n"  // 3*near+far+2 (lo)
      "paddw %%xmm3,%%xmm2 \n"  // 3*near+far+2 (hi)

      "psrlw $2,%%xmm0 \n"  // 3/4*near+1/4*far
      "psrlw $2,%%xmm2 \n"  // 3/4*near+1/4*far
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm2,16(%1) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"  // 8 sample to 16 sample
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kLinearShuffleFar)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
1000 #endif
1001
1002 #ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3
// 2x bilinear (both axes) upsample of a row of 12-bit samples held in
// uint16_t.  Reads two source rows (src_ptr and src_ptr + src_stride)
// and writes two destination rows (dst_ptr and dst_ptr + dst_stride)
// using the 9/16, 3/16, 3/16, 1/16 kernel with a +8 rounding bias before
// the >>4.  12-bit samples keep all intermediate sums within 16 bits.
// Consumes 8 source samples and produces 16 destination samples per row
// per iteration; dst_width must be a multiple of 16.
//
// Fix: %xmm7 is used (the "all 8" bias) but was missing from the clobber
// list, so the compiler could assume it survived the asm block.
void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint16_t* dst_ptr,
                                   ptrdiff_t dst_stride,
                                   int dst_width) {
  asm volatile(
      "pcmpeqw     %%xmm7,%%xmm7                 \n"
      "psrlw       $15,%%xmm7                    \n"
      "psllw       $3,%%xmm7                     \n"  // all 8 (rounding bias)
      "movdqa      %5,%%xmm6                     \n"  // word-swap shuffle for "far" taps

      LABELALIGN
      "1:                                        \n"
      // above line: horizontal 3*near+far
      "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
      "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
      "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
      "movdqa      %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"  // 54657687 (far)
      "pshufb      %%xmm6,%%xmm1                 \n"  // 10213243 (far)
      "paddw       %%xmm0,%%xmm1                 \n"  // near+far
      "paddw       %%xmm2,%%xmm3                 \n"  // near+far
      "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
      "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
      "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far (1, lo)
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (1, hi)

      // below line: horizontal 3*near+far
      "movdqu      (%0,%3,2),%%xmm1              \n"  // 01234567 (16)
      "movdqu      2(%0,%3,2),%%xmm4             \n"  // 12345678 (16)
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpckhwd   %%xmm4,%%xmm3                 \n"  // 45566778 (16)
      "punpcklwd   %%xmm4,%%xmm1                 \n"  // 01122334 (16)
      "movdqa      %%xmm3,%%xmm5                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "pshufb      %%xmm6,%%xmm5                 \n"  // 54657687 (far)
      "pshufb      %%xmm6,%%xmm4                 \n"  // 10213243 (far)
      "paddw       %%xmm1,%%xmm4                 \n"  // near+far
      "paddw       %%xmm3,%%xmm5                 \n"  // near+far
      "paddw       %%xmm1,%%xmm1                 \n"  // 2*near
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
      "paddw       %%xmm4,%%xmm1                 \n"  // 3*near+far (2, lo)
      "paddw       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)

      // Vertical blend:   above row = 3*(1) + (2);  below row = (1) + 3*(2)
      // xmm0 xmm2  (row 1 lo/hi)
      // xmm1 xmm3  (row 2 lo/hi)

      "movdqa      %%xmm0,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm5                 \n"
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, lo)
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8  (1, lo)
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
      "movdqu      %%xmm4,(%1)                   \n"

      "movdqa      %%xmm2,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, hi)
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, hi)
      "paddw       %%xmm2,%%xmm4                 \n"  // 9*near+3*far (1, hi)
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8  (1, hi)
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
      "movdqu      %%xmm4,0x10(%1)               \n"

      "movdqa      %%xmm1,%%xmm4                 \n"
      "paddw       %%xmm7,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, lo)
      "paddw       %%xmm4,%%xmm1                 \n"  // 9*near+3*far (2, lo)
      "paddw       %%xmm0,%%xmm1                 \n"  // 9 3 3 1 + 8  (2, lo)
      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16
      "movdqu      %%xmm1,(%1,%4,2)              \n"

      "movdqa      %%xmm3,%%xmm4                 \n"
      "paddw       %%xmm7,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
      "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, hi)
      "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (2, hi)
      "paddw       %%xmm2,%%xmm3                 \n"  // 9 3 3 1 + 8  (2, hi)
      "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
      "movdqu      %%xmm3,0x10(%1,%4,2)          \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kLinearShuffleFar)        // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
1099 #endif
1100
1101 #ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
// 2x horizontal upsample of a row of 16-bit samples.  Each output sample
// is (3*near + far + 2) >> 2.  Words are widened to dwords (punpcklwd
// with zero) so the weighted sum cannot overflow before the shift.
// Consumes 4 source samples and produces 8 destination samples per
// iteration; dst_width must be a multiple of 8.
// NOTE(review): packssdw saturates to *signed* 16-bit, so results above
// 0x7FFF clamp -- this path appears to assume samples fit in 15 bits;
// confirm against callers.
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "pxor        %%xmm5,%%xmm5                 \n"  // zero, for widening
      "pcmpeqd     %%xmm4,%%xmm4                 \n"
      "psrld       $31,%%xmm4                    \n"
      "pslld       $1,%%xmm4                     \n"  // all 2 (rounding bias)

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
      "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)

      "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0123 (32b)
      "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1234 (32b)

      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"

      // Swap within each dword pair to get the "far" neighbor.
      "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
      "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)

      "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
      "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)

      "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
      "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
      "packssdw    %%xmm1,%%xmm0                 \n"
      // Undo the even/odd split introduced by packing lo/hi halves.
      "pshufd      $0b11011000,%%xmm0,%%xmm0     \n"
      "movdqu      %%xmm0,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
1150 #endif
1151
1152 #ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
// 2x bilinear (both axes) upsample of a row of 16-bit samples.  Reads two
// source rows and writes two destination rows using the 9/16, 3/16, 3/16,
// 1/16 kernel with a +8 rounding bias before the >>4.  Arithmetic is
// widened to 32 bits so the weighted sums cannot overflow.  Consumes 4
// source samples and produces 8 destination samples per row per
// iteration; dst_width must be a multiple of 8.
// NOTE(review): packssdw saturates to signed 16-bit; assumes samples fit
// in 15 bits -- confirm against callers.
//
// Fixes:
// - The "below" row was packed into %xmm5 but the deinterleaving pshufd
//   was applied to %xmm4 (already stored), so the below row was written
//   with its even/odd words still split.  Shuffle %xmm5 instead.
// - Removed a dead leading block that computed into %xmm0-%xmm3 and was
//   then fully overwritten before any use.
// - Added missing "xmm7" to the clobber list (%xmm7 holds the zero used
//   for widening).
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "pxor        %%xmm7,%%xmm7                 \n"  // zero, for widening
      "pcmpeqd     %%xmm6,%%xmm6                 \n"
      "psrld       $31,%%xmm6                    \n"
      "pslld       $3,%%xmm6                     \n"  // all 8 (rounding bias)

      LABELALIGN
      "1:                                        \n"
      // above line: horizontal 3*near+far
      "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
      "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
      "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0123 (32b)
      "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1234 (32b)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
      "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far (lo)
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far (hi)
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)

      // below line: horizontal 3*near+far
      "movq        (%0,%3,2),%%xmm2              \n"
      "movq        2(%0,%3,2),%%xmm3             \n"
      "punpcklwd   %%xmm7,%%xmm2                 \n"  // 0123 (32b)
      "punpcklwd   %%xmm7,%%xmm3                 \n"  // 1234 (32b)
      "movdqa      %%xmm2,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "pshufd      $0b10110001,%%xmm4,%%xmm4     \n"  // 1032 (even, far)
      "pshufd      $0b10110001,%%xmm5,%%xmm5     \n"  // 2143 (odd, far)
      "paddd       %%xmm2,%%xmm4                 \n"  // near+far (lo)
      "paddd       %%xmm3,%%xmm5                 \n"  // near+far (hi)
      "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (lo)
      "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (hi)
      "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
      "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)

      // Vertical blend: above = 3*(1)+(2)+8, below = (1)+3*(2)+8, each >>4.
      "movdqa      %%xmm0,%%xmm4                 \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
      "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
      "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
      "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8  (1, lo)
      "psrld       $4,%%xmm4                     \n"  // ^ div by 16  (1, lo)

      "movdqa      %%xmm2,%%xmm5                 \n"
      "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
      "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
      "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8  (2, lo)
      "psrld       $4,%%xmm5                     \n"  // ^ div by 16  (2, lo)

      "movdqa      %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
      "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
      "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8  (1, hi)
      "psrld       $4,%%xmm0                     \n"  // ^ div by 16  (1, hi)

      "movdqa      %%xmm3,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
      "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
      "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
      "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8  (2, hi)
      "psrld       $4,%%xmm2                     \n"  // ^ div by 16  (2, hi)

      "packssdw    %%xmm0,%%xmm4                 \n"
      "pshufd      $0b11011000,%%xmm4,%%xmm4     \n"
      "movdqu      %%xmm4,(%1)                   \n"  // store above
      "packssdw    %%xmm2,%%xmm5                 \n"
      "pshufd      $0b11011000,%%xmm5,%%xmm5     \n"  // was xmm4: wrong reg
      "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
1259 #endif
1260
1261 #ifdef HAS_SCALEROWUP2LINEAR_SSSE3
// 2x horizontal upsample of a row of 8-bit samples.  pmaddubsw with the
// (3,1)/(1,3) weights in kLinearMadd31 computes 3*near+far per output in
// one step (max 3*255+255 = 1020, well within 16 bits); +2 then >>2
// rounds to nearest.  Consumes 8 source bytes and produces 16 destination
// bytes per iteration; dst_width must be a multiple of 16.
//
// Fix: the pack and the final store used AVX encodings (vpackuswb /
// vmovdqu) inside this SSSE3-only path, which raises #UD (illegal
// instruction) on CPUs without AVX.  Use the SSE2 forms.
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
  asm volatile(
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
      "psrlw       $15,%%xmm4                    \n"
      "psllw       $1,%%xmm4                     \n"  // all 2 (rounding bias)
      "movdqa      %3,%%xmm3                     \n"  // 3,1 / 1,3 madd weights

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 01234567
      "movq        1(%0),%%xmm1                  \n"  // 12345678
      "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
      "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
      "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (hi)
      "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (lo)
      "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
      "packuswb    %%xmm2,%%xmm0                 \n"  // was vpackuswb (AVX)
      "movdqu      %%xmm0,(%1)                   \n"  // was vmovdqu (AVX)

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),        // %0
        "+r"(dst_ptr),        // %1
        "+r"(dst_width)       // %2
      : "m"(kLinearMadd31)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
1299 #endif
1300
1301 #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
// 2x bilinear (both axes) upsample of a row of 8-bit samples.  Reads two
// source rows and writes two destination rows using the 9/16, 3/16,
// 3/16, 1/16 kernel with a +8 rounding bias before the >>4.  pmaddubsw
// with kLinearMadd31 produces the horizontal 3*near+far terms in 16-bit
// lanes; vertical weighting is then done with word adds (max value
// 9*255+3*255+... stays within 16 bits).  Consumes 8 source bytes and
// produces 16 destination bytes per row per iteration; dst_width must be
// a multiple of 16.
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                ptrdiff_t dst_stride,
                                int dst_width) {
  asm volatile(
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
      "psrlw       $15,%%xmm6                    \n"
      "psllw       $3,%%xmm6                     \n"  // all 8 (rounding bias)
      "movdqa      %5,%%xmm7                     \n"  // 3,1 / 1,3 madd weights

      LABELALIGN
      "1:                                        \n"
      // above line: horizontal 3*near+far via pmaddubsw
      "movq        (%0),%%xmm0                   \n"  // 01234567
      "movq        1(%0),%%xmm1                  \n"  // 12345678
      "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
      "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
      "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1, hi)
      "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1, lo)

      // below line: same computation on src_ptr + src_stride
      "movq        (%0,%3),%%xmm1                \n"
      "movq        1(%0,%3),%%xmm4               \n"
      "punpcklwd   %%xmm1,%%xmm1                 \n"
      "punpcklwd   %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpckhdq   %%xmm4,%%xmm3                 \n"
      "punpckldq   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
      "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)

      // Vertical blend: above = 3*(1)+(2)+8, below = (1)+3*(2)+8, each >>4.
      // xmm0 xmm2  (row 1 lo/hi)
      // xmm1 xmm3  (row 2 lo/hi)

      "movdqa      %%xmm0,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm5                 \n"
      "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
      "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8  (1, lo)
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16  (1, lo)

      "movdqa      %%xmm1,%%xmm5                 \n"
      "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
      "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
      "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8  (2, lo)
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16  (2, lo)

      "movdqa      %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
      "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
      "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
      "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8  (1, hi)
      "psrlw       $4,%%xmm0                     \n"  // ^ div by 16  (1, hi)

      "movdqa      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
      "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
      "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
      "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8  (2, hi)
      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16  (2, hi)

      "packuswb    %%xmm0,%%xmm4                 \n"
      "movdqu      %%xmm4,(%1)                   \n"  // store above
      "packuswb    %%xmm1,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4)                \n"  // store below

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kLinearMadd31)            // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
1386 #endif
1387
1388 #ifdef HAS_SCALEROWUP2LINEAR_AVX2
// 2x horizontal upsample of a row of 8-bit samples, AVX2 version.
// Same math as ScaleRowUp2_Linear_SSSE3 ((3*near+far+2)>>2 via
// vpmaddubsw with kLinearMadd31) but on 16 source bytes at a time.
// The vpermq pre-shuffle compensates for vpunpck* operating per
// 128-bit lane.  Consumes 16 source bytes and produces 32 destination
// bytes per iteration; dst_width must be a multiple of 32.
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2 (rounding bias)
      "vbroadcastf128 %3,%%ymm3                  \n"  // 3,1 / 1,3 madd weights

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
      "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
      // Distribute 8 bytes to each 128-bit lane before in-lane unpacks.
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
      "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
      "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),        // %0
        "+r"(dst_ptr),        // %1
        "+r"(dst_width)       // %2
      : "m"(kLinearMadd31)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
1428 #endif
1429
1430 #ifdef HAS_SCALEROWUP2BILINEAR_AVX2
// 2x bilinear (both axes) upsample of a row of 8-bit samples, AVX2
// version.  Reads two source rows and writes two destination rows using
// the 9/16, 3/16, 3/16, 1/16 kernel with a +8 rounding bias before the
// >>4.  Horizontal 3*near+far terms come from vpmaddubsw with
// kLinearMadd31; vertical weighting uses word adds.  Consumes 16 source
// bytes and produces 32 destination bytes per row per iteration;
// dst_width must be a multiple of 32.
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrlw      $15,%%ymm6,%%ymm6             \n"
      "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8 (rounding bias)
      "vbroadcastf128 %5,%%ymm7                  \n"  // 3,1 / 1,3 madd weights

      LABELALIGN
      "1:                                        \n"
      // above line: horizontal 3*near+far
      "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
      "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // lane fixup for unpacks
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
      "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)

      // below line: same computation on src_ptr + src_stride
      "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
      "vmovdqu     1(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
      "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
      "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
      "vpunpcklwd  %%ymm2,%%ymm2,%%ymm2          \n"
      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm3          \n"
      "vpunpckhdq  %%ymm3,%%ymm2,%%ymm4          \n"
      "vpunpckldq  %%ymm3,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)

      // Vertical blend: above = 3*(1)+(2)+8, below = (1)+3*(2)+8, each >>4.
      // ymm0 ymm1  (row 1 lo/hi)
      // ymm2 ymm3  (row 2 lo/hi)

      "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
      "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
      "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
      "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8  (1, lo)
      "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16  (1, lo)

      "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
      "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
      "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
      "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8  (2, lo)
      "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16  (2, lo)

      "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
      "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8  (1, hi)
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16  (1, hi)

      "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
      "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
      "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8  (2, hi)
      "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16  (2, hi)

      "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
      "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
      "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kLinearMadd31)            // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
1512 #endif
1513
1514 #ifdef HAS_SCALEROWUP2LINEAR_12_AVX2
// 2x horizontal upsample of a row of 12-bit samples (uint16_t), AVX2
// version.  Each output is (3*near + far + 2) >> 2; 12-bit inputs keep
// the sum within 16 bits.  The vpermq pre-shuffle compensates for
// vpunpck*/vpshufb working per 128-bit lane.  Consumes 16 source samples
// and produces 32 destination samples per iteration; dst_width must be a
// multiple of 32.
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm5                  \n"  // word-swap shuffle (far)
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2 (rounding bias)

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // 0123456789ABCDEF (16b)
      "vmovdqu     2(%0),%%ymm1                  \n"  // 123456789ABCDEF0 (16b)

      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 012389AB4567CDEF
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 12349ABC5678DEF0

      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"  // 899AABBCCDDEEFF0 (near)
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
      "vpshufb     %%ymm5,%%ymm2,%%ymm3          \n"  // 98A9BACBDCEDFE0F (far)
      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)

      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // far+2
      "vpaddw      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2
      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far+2
      "vpaddw      %%ymm2,%%ymm3,%%ymm3          \n"  // near+far+2
      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
      "vpaddw      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 3*near+far+2
      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 3*near+far+2

      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far
      "vpsrlw      $2,%%ymm2,%%ymm2              \n"  // 3/4*near+1/4*far
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm2,32(%1)                 \n"

      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%1),%1                   \n"  // 16 sample to 32 sample
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),            // %0
        "+r"(dst_ptr),            // %1
        "+r"(dst_width)           // %2
      : "m"(kLinearShuffleFar)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
1562 #endif
1563
1564 #ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2
// 2x bilinear (both axes) upsample of a row of 12-bit samples (uint16_t),
// AVX2 version.  Reads two source rows and writes two destination rows
// using the 9/16, 3/16, 3/16, 1/16 kernel with a +8 rounding bias before
// the >>4; 12-bit inputs keep all sums within 16 bits.  Consumes 8
// source samples and produces 16 destination samples per row per
// iteration; dst_width must be a multiple of 16.
void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"  // word-swap shuffle (far)
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
      "vpsllw      $3,%%ymm4,%%ymm4              \n"  // all 8 (rounding bias)

      LABELALIGN
      "1:                                        \n"

      // above line: horizontal 3*near+far
      "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
      "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
      "vpaddw      %%ymm0,%%ymm1,%%ymm2          \n"  // 3*near+far (1)

      // below line: same computation on src_ptr + src_stride
      "vmovdqu     (%0,%3,2),%%xmm0              \n"  // 01234567 (16b)
      "vmovdqu     2(%0,%3,2),%%xmm1             \n"  // 12345678 (16b)
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
      "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
      "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
      "vpaddw      %%ymm0,%%ymm1,%%ymm3          \n"  // 3*near+far (2)

      // Vertical blend: above = 3*(1)+(2)+8, below = (1)+3*(2)+8, each >>4.
      "vpaddw      %%ymm2,%%ymm2,%%ymm0          \n"  // 6*near+2*far (1)
      "vpaddw      %%ymm4,%%ymm3,%%ymm1          \n"  // 3*near+far+8 (2)
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9*near+3*far (1)
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8  (1)
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
      "vmovdqu     %%ymm0,(%1)                   \n"  // store above

      "vpaddw      %%ymm3,%%ymm3,%%ymm0          \n"  // 6*near+2*far (2)
      "vpaddw      %%ymm4,%%ymm2,%%ymm1          \n"  // 3*near+far+8 (1)
      "vpaddw      %%ymm0,%%ymm3,%%ymm0          \n"  // 9*near+3*far (2)
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8  (2)
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
      "vmovdqu     %%ymm0,(%1,%4,2)              \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kLinearShuffleFar)        // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
1626 #endif
1627
1628 #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
// 2x horizontal upsample of a row of 16-bit samples, AVX2 version.
// Words are widened to dwords (vpmovzxwd) so (3*near+far+2)>>2 cannot
// overflow; vpackusdw saturates back to unsigned 16-bit, so the full
// 16-bit range is preserved (unlike the SSE2 packssdw path).  Consumes 8
// source samples and produces 16 destination samples per iteration;
// dst_width must be a multiple of 16.
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrld      $31,%%ymm4,%%ymm4             \n"
      "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2 (rounding bias)

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
      "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)

      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b)

      // Swap within each dword pair to get the "far" neighbor.
      "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
      "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)

      "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
      "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)

      "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
      "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
      "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
      // Undo the even/odd split introduced by packing lo/hi halves.
      "vpshufd     $0b11011000,%%ymm0,%%ymm0     \n"
      "vmovdqu     %%ymm0,(%1)                   \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
1674 #endif
1675
1676 #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
// 2x bilinear (both axes) upsample of a row of 16-bit samples, AVX2
// version.  Reads two source rows and writes two destination rows using
// the 9/16, 3/16, 3/16, 1/16 kernel with a +8 rounding bias before the
// >>4.  Arithmetic is widened to 32 bits (vpmovzxwd) and repacked with
// unsigned saturation (vpackusdw), preserving the full 16-bit range.
// Consumes 8 source samples and produces 16 destination samples per row
// per iteration; dst_width must be a multiple of 16.
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrld      $31,%%ymm6,%%ymm6             \n"
      "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8 (rounding bias)

      LABELALIGN
      "1:                                        \n"

      // above line: horizontal 3*near+far
      "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
      "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b)
      "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
      "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (1, lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (1, hi)

      // below line: same computation on src_ptr + src_stride
      "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 01234567 (16b)
      "vmovdqu     2(%0,%3,2),%%xmm3             \n"  // 12345678 (16b)
      "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b)
      "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b)
      "vpshufd     $0b10110001,%%ymm2,%%ymm4     \n"  // 10325476 (lo, far)
      "vpshufd     $0b10110001,%%ymm3,%%ymm5     \n"  // 21436587 (hi, far)
      "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
      "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
      "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
      "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (2, lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (2, hi)

      // Vertical blend: above = 3*(1)+(2)+8, below = (1)+3*(2)+8, each >>4.
      "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
      "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
      "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
      "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8  (1, lo)
      "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16  (1, lo)

      "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
      "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
      "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
      "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8  (2, lo)
      "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16  (2, lo)

      "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
      "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
      "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8  (1, hi)
      "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16  (1, hi)

      "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
      "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
      "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
      "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8  (2, hi)
      "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16  (2, hi)

      "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
      "vpshufd     $0b11011000,%%ymm4,%%ymm4     \n"  // undo even/odd split
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
      "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
      "vpshufd     $0b11011000,%%ymm5,%%ymm5     \n"  // undo even/odd split
      "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
1759 #endif
1760
1761 // Reads 16xN bytes and produces 16 shorts at a time.
// Accumulate a row of 8-bit samples into a row of 16-bit sums:
// dst[i] += src[i], using saturating adds (paddusw) so sums clamp at
// 65535 instead of wrapping.  dst is read-modify-write.  Processes 16
// pixels per iteration; src_width must be a multiple of 16.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile("pxor        %%xmm5,%%xmm5                 \n"  // zero, for widening

               // 16 pixel loop.
               LABELALIGN
               "1:                                        \n"
               "movdqu      (%0),%%xmm3                   \n"
               "lea         0x10(%0),%0                   \n"  // src_ptr += 16
               "movdqu      (%1),%%xmm0                   \n"  // existing sums
               "movdqu      0x10(%1),%%xmm1               \n"
               "movdqa      %%xmm3,%%xmm2                 \n"
               "punpcklbw   %%xmm5,%%xmm2                 \n"  // low 8 bytes -> words
               "punpckhbw   %%xmm5,%%xmm3                 \n"  // high 8 bytes -> words
               "paddusw     %%xmm2,%%xmm0                 \n"  // saturating accumulate
               "paddusw     %%xmm3,%%xmm1                 \n"
               "movdqu      %%xmm0,(%1)                   \n"
               "movdqu      %%xmm1,0x10(%1)               \n"
               "lea         0x20(%1),%1                   \n"
               "sub         $0x10,%2                      \n"
               "jg          1b                            \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(src_width)  // %2
               :
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
1790
1791 #ifdef HAS_SCALEADDROW_AVX2
1792 // Reads 32 bytes and accumulates to 32 shorts at a time.
// Accumulate a row of 8-bit samples into a row of 16-bit sums, AVX2
// version: dst[i] += src[i] with saturating adds (vpaddusw).  The vpermq
// pre-shuffle keeps byte order correct across the per-lane unpacks.
// dst is read-modify-write.  Processes 32 pixels per iteration;
// src_width must be a multiple of 32.
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero, for widening

               LABELALIGN
               "1:                                        \n"
               "vmovdqu     (%0),%%ymm3                   \n"
               "lea         0x20(%0),%0                   \n"  // src_ptr += 32
               "vpermq      $0xd8,%%ymm3,%%ymm3           \n"  // lane fixup for unpacks
               "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"  // low bytes -> words
               "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"  // high bytes -> words
               "vpaddusw    (%1),%%ymm2,%%ymm0            \n"  // saturating accumulate
               "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
               "vmovdqu     %%ymm0,(%1)                   \n"
               "vmovdqu     %%ymm1,0x20(%1)               \n"
               "lea         0x40(%1),%1                   \n"
               "sub         $0x20,%2                      \n"
               "jg          1b                            \n"
               "vzeroupper                                \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(src_width)  // %2
               :
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
1819 #endif // HAS_SCALEADDROW_AVX2
1820
// Bias subtracted from pixels (psubb) so bytes become signed before
// pmaddubsw, avoiding its signed-saturation on large products.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Word bias added back after pmaddubsw: restores the unsigned range
// (0x80 * total weight 0x80 = 0x4000, times the >>7 scale -> 0x4040
// includes the +0.5 rounding term).
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};
1829
// Bilinear column filtering. SSSE3 version.
// Horizontally scales one row: for each destination pixel, loads the two
// adjacent source pixels at x (16.16 fixed point), blends them with the
// 7-bit fraction via pmaddubsw, then advances x by dx. Processes two
// output pixels per loop iteration, with a trailing single-pixel case.
// x0/x1 hold the integer source offsets of the current output pair;
// temp_pixel stages two-byte loads/stores through eax.
void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                           const uint8_t* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile(
      "movd %6,%%xmm2 \n"  // xmm2 = x accumulator
      "movd %7,%%xmm3 \n"  // xmm3 = dx step
      "movl $0x04040000,%k2 \n"
      "movd %k2,%%xmm5 \n"  // pshufb mask to splat fractions
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x9,%%xmm6 \n"  // 0x007f007f
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $15,%%xmm7 \n"  // 0x00010001

      "pextrw $0x1,%%xmm2,%k3 \n"  // x0 = int part of x
      "subl $0x2,%5 \n"
      "jl 29f \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "paddd %%xmm3,%%xmm0 \n"
      "punpckldq %%xmm0,%%xmm2 \n"  // xmm2 = {x, x+dx}
      "punpckldq %%xmm3,%%xmm3 \n"
      "paddd %%xmm3,%%xmm3 \n"  // xmm3 = {2*dx, 2*dx}
      "pextrw $0x3,%%xmm2,%k4 \n"  // x1 = int part of x+dx

      LABELALIGN
      "2: \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "movzwl 0x00(%1,%3,1),%k2 \n"  // two src pixels at x0
      "movd %k2,%%xmm0 \n"
      "psrlw $0x9,%%xmm1 \n"  // 7-bit fractions
      "movzwl 0x00(%1,%4,1),%k2 \n"  // two src pixels at x1
      "movd %k2,%%xmm4 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "punpcklwd %%xmm4,%%xmm0 \n"
      "psubb %8,%%xmm0 \n"  // make pixels signed.
      "pxor %%xmm6,%%xmm1 \n"  // 128 - f = (f ^ 127 ) +
                               // 1
      "paddusb %%xmm7,%%xmm1 \n"
      "pmaddubsw %%xmm0,%%xmm1 \n"  // blend: (128-f)*a + f*b
      "pextrw $0x1,%%xmm2,%k3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"
      "paddw %9,%%xmm1 \n"  // make pixels unsigned.
      "psrlw $0x7,%%xmm1 \n"  // /128
      "packuswb %%xmm1,%%xmm1 \n"
      "movd %%xmm1,%k2 \n"
      "mov %w2,(%0) \n"  // store two dst pixels
      "lea 0x2(%0),%0 \n"
      "subl $0x2,%5 \n"
      "jge 2b \n"

      LABELALIGN
      "29: \n"  // handle last odd pixel, if any
      "addl $0x1,%5 \n"
      "jl 99f \n"
      "movzwl 0x00(%1,%3,1),%k2 \n"
      "movd %k2,%%xmm0 \n"
      "psrlw $0x9,%%xmm2 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "psubb %8,%%xmm0 \n"  // make pixels signed.
      "pxor %%xmm6,%%xmm2 \n"
      "paddusb %%xmm7,%%xmm2 \n"
      "pmaddubsw %%xmm0,%%xmm2 \n"
      "paddw %9,%%xmm2 \n"  // make pixels unsigned.
      "psrlw $0x7,%%xmm2 \n"
      "packuswb %%xmm2,%%xmm2 \n"
      "movd %%xmm2,%k2 \n"
      "mov %b2,(%0) \n"  // store one dst pixel
      "99: \n"
      : "+r"(dst_ptr),      // %0
        "+r"(src_ptr),      // %1
        "=&a"(temp_pixel),  // %2
        "=&r"(x0),          // %3
        "=&r"(x1),          // %4
#if defined(__x86_64__)
        "+rm"(dst_width)  // %5
#else
        "+m"(dst_width)  // %5
#endif
      : "rm"(x),   // %6
        "rm"(dx),  // %7
#if defined(__x86_64__)
        "x"(kFsub80),  // %8
        "x"(kFadd40)   // %9
#else
        "m"(kFsub80),  // %8
        "m"(kFadd40)   // %9
#endif
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1924
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// 2x horizontal upsample by pixel replication: each of 16 source bytes is
// written twice (punpcklbw/punpckhbw against itself), producing 32 dst
// bytes per iteration. x and dx are unused (fixed 2x ratio).
void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                       const uint8_t* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile(LABELALIGN
               "1: \n"
               "movdqu (%1),%%xmm0 \n"
               "lea 0x10(%1),%1 \n"
               "movdqa %%xmm0,%%xmm1 \n"
               "punpcklbw %%xmm0,%%xmm0 \n"  // duplicate low 8 bytes
               "punpckhbw %%xmm1,%%xmm1 \n"  // duplicate high 8 bytes
               "movdqu %%xmm0,(%0) \n"
               "movdqu %%xmm1,0x10(%0) \n"
               "lea 0x20(%0),%0 \n"
               "sub $0x20,%2 \n"
               "jg 1b \n"

               : "+r"(dst_ptr),   // %0
                 "+r"(src_ptr),   // %1
                 "+r"(dst_width)  // %2
               ::"memory",
                 "cc", "xmm0", "xmm1");
}
1953
ScaleARGBRowDown2_SSE2(const uint8_t * src_argb,ptrdiff_t src_stride,uint8_t * dst_argb,int dst_width)1954 void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1955 ptrdiff_t src_stride,
1956 uint8_t* dst_argb,
1957 int dst_width) {
1958 (void)src_stride;
1959 asm volatile(LABELALIGN
1960 "1: \n"
1961 "movdqu (%0),%%xmm0 \n"
1962 "movdqu 0x10(%0),%%xmm1 \n"
1963 "lea 0x20(%0),%0 \n"
1964 "shufps $0xdd,%%xmm1,%%xmm0 \n"
1965 "movdqu %%xmm0,(%1) \n"
1966 "lea 0x10(%1),%1 \n"
1967 "sub $0x4,%2 \n"
1968 "jg 1b \n"
1969 : "+r"(src_argb), // %0
1970 "+r"(dst_argb), // %1
1971 "+r"(dst_width) // %2
1972 ::"memory",
1973 "cc", "xmm0", "xmm1");
1974 }
1975
ScaleARGBRowDown2Linear_SSE2(const uint8_t * src_argb,ptrdiff_t src_stride,uint8_t * dst_argb,int dst_width)1976 void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1977 ptrdiff_t src_stride,
1978 uint8_t* dst_argb,
1979 int dst_width) {
1980 (void)src_stride;
1981 asm volatile(LABELALIGN
1982 "1: \n"
1983 "movdqu (%0),%%xmm0 \n"
1984 "movdqu 0x10(%0),%%xmm1 \n"
1985 "lea 0x20(%0),%0 \n"
1986 "movdqa %%xmm0,%%xmm2 \n"
1987 "shufps $0x88,%%xmm1,%%xmm0 \n"
1988 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1989 "pavgb %%xmm2,%%xmm0 \n"
1990 "movdqu %%xmm0,(%1) \n"
1991 "lea 0x10(%1),%1 \n"
1992 "sub $0x4,%2 \n"
1993 "jg 1b \n"
1994 : "+r"(src_argb), // %0
1995 "+r"(dst_argb), // %1
1996 "+r"(dst_width) // %2
1997 ::"memory",
1998 "cc", "xmm0", "xmm1");
1999 }
2000
ScaleARGBRowDown2Box_SSE2(const uint8_t * src_argb,ptrdiff_t src_stride,uint8_t * dst_argb,int dst_width)2001 void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
2002 ptrdiff_t src_stride,
2003 uint8_t* dst_argb,
2004 int dst_width) {
2005 asm volatile(LABELALIGN
2006 "1: \n"
2007 "movdqu (%0),%%xmm0 \n"
2008 "movdqu 0x10(%0),%%xmm1 \n"
2009 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
2010 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
2011 "lea 0x20(%0),%0 \n"
2012 "pavgb %%xmm2,%%xmm0 \n"
2013 "pavgb %%xmm3,%%xmm1 \n"
2014 "movdqa %%xmm0,%%xmm2 \n"
2015 "shufps $0x88,%%xmm1,%%xmm0 \n"
2016 "shufps $0xdd,%%xmm1,%%xmm2 \n"
2017 "pavgb %%xmm2,%%xmm0 \n"
2018 "movdqu %%xmm0,(%1) \n"
2019 "lea 0x10(%1),%1 \n"
2020 "sub $0x4,%2 \n"
2021 "jg 1b \n"
2022 : "+r"(src_argb), // %0
2023 "+r"(dst_argb), // %1
2024 "+r"(dst_width) // %2
2025 : "r"((intptr_t)(src_stride)) // %3
2026 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2027 }
2028
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
// Point-samples every src_stepx-th ARGB pixel: gathers 4 dword pixels at
// strides of src_stepx*4 bytes and packs them into one 16-byte store.
// src_stride is unused (single-row variant).
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile(
      "lea 0x00(,%1,4),%1 \n"  // step in bytes = src_stepx * 4
      "lea 0x00(%1,%1,2),%4 \n"  // 3 * step, for the 4th pixel

      LABELALIGN
      "1: \n"
      "movd (%0),%%xmm0 \n"
      "movd 0x00(%0,%1,1),%%xmm1 \n"
      "punpckldq %%xmm1,%%xmm0 \n"
      "movd 0x00(%0,%1,2),%%xmm2 \n"
      "movd 0x00(%0,%4,1),%%xmm3 \n"
      "lea 0x00(%0,%1,4),%0 \n"  // advance src by 4 steps
      "punpckldq %%xmm3,%%xmm2 \n"
      "punpcklqdq %%xmm2,%%xmm0 \n"  // combine 4 pixels
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(src_stepx_x4),  // %1
        "+r"(dst_argb),      // %2
        "+r"(dst_width),     // %3
        "=&r"(src_stepx_x12)  // %4
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
2065
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
// Like ScaleARGBRowDownEven_SSE2 but box-filters: each output pixel is
// the rounded average of a 2x2 block taken from rows src_argb and
// src_argb + src_stride at horizontal steps of src_stepx pixels.
void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile(
      "lea 0x00(,%1,4),%1 \n"  // step in bytes = src_stepx * 4
      "lea 0x00(%1,%1,2),%4 \n"  // 3 * step
      "lea 0x00(%0,%5,1),%5 \n"  // row1 = src_argb + src_stride

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // row 0: pixels 0 and 1 of pair 0
      "movhps 0x00(%0,%1,1),%%xmm0 \n"
      "movq 0x00(%0,%1,2),%%xmm1 \n"
      "movhps 0x00(%0,%4,1),%%xmm1 \n"
      "lea 0x00(%0,%1,4),%0 \n"
      "movq (%5),%%xmm2 \n"  // row 1, same columns
      "movhps 0x00(%5,%1,1),%%xmm2 \n"
      "movq 0x00(%5,%1,2),%%xmm3 \n"
      "movhps 0x00(%5,%4,1),%%xmm3 \n"
      "lea 0x00(%5,%1,4),%5 \n"
      "pavgb %%xmm2,%%xmm0 \n"  // vertical average
      "pavgb %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"  // even pixels
      "shufps $0xdd,%%xmm1,%%xmm2 \n"  // odd pixels
      "pavgb %%xmm2,%%xmm0 \n"  // horizontal average
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),       // %0
        "+r"(src_stepx_x4),   // %1
        "+r"(dst_argb),       // %2
        "+rm"(dst_width),     // %3
        "=&r"(src_stepx_x12),  // %4
        "+r"(row1)            // %5
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
2112
ScaleARGBCols_SSE2(uint8_t * dst_argb,const uint8_t * src_argb,int dst_width,int x,int dx)2113 void ScaleARGBCols_SSE2(uint8_t* dst_argb,
2114 const uint8_t* src_argb,
2115 int dst_width,
2116 int x,
2117 int dx) {
2118 intptr_t x0, x1;
2119 asm volatile(
2120 "movd %5,%%xmm2 \n"
2121 "movd %6,%%xmm3 \n"
2122 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2123 "pshufd $0x11,%%xmm3,%%xmm0 \n"
2124 "paddd %%xmm0,%%xmm2 \n"
2125 "paddd %%xmm3,%%xmm3 \n"
2126 "pshufd $0x5,%%xmm3,%%xmm0 \n"
2127 "paddd %%xmm0,%%xmm2 \n"
2128 "paddd %%xmm3,%%xmm3 \n"
2129 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2130 "pextrw $0x1,%%xmm2,%k0 \n"
2131 "pextrw $0x3,%%xmm2,%k1 \n"
2132 "cmp $0x0,%4 \n"
2133 "jl 99f \n"
2134 "sub $0x4,%4 \n"
2135 "jl 49f \n"
2136
2137 LABELALIGN
2138 "40: \n"
2139 "movd 0x00(%3,%0,4),%%xmm0 \n"
2140 "movd 0x00(%3,%1,4),%%xmm1 \n"
2141 "pextrw $0x5,%%xmm2,%k0 \n"
2142 "pextrw $0x7,%%xmm2,%k1 \n"
2143 "paddd %%xmm3,%%xmm2 \n"
2144 "punpckldq %%xmm1,%%xmm0 \n"
2145 "movd 0x00(%3,%0,4),%%xmm1 \n"
2146 "movd 0x00(%3,%1,4),%%xmm4 \n"
2147 "pextrw $0x1,%%xmm2,%k0 \n"
2148 "pextrw $0x3,%%xmm2,%k1 \n"
2149 "punpckldq %%xmm4,%%xmm1 \n"
2150 "punpcklqdq %%xmm1,%%xmm0 \n"
2151 "movdqu %%xmm0,(%2) \n"
2152 "lea 0x10(%2),%2 \n"
2153 "sub $0x4,%4 \n"
2154 "jge 40b \n"
2155
2156 "49: \n"
2157 "test $0x2,%4 \n"
2158 "je 29f \n"
2159 "movd 0x00(%3,%0,4),%%xmm0 \n"
2160 "movd 0x00(%3,%1,4),%%xmm1 \n"
2161 "pextrw $0x5,%%xmm2,%k0 \n"
2162 "punpckldq %%xmm1,%%xmm0 \n"
2163 "movq %%xmm0,(%2) \n"
2164 "lea 0x8(%2),%2 \n"
2165 "29: \n"
2166 "test $0x1,%4 \n"
2167 "je 99f \n"
2168 "movd 0x00(%3,%0,4),%%xmm0 \n"
2169 "movd %%xmm0,(%2) \n"
2170 "99: \n"
2171 : "=&a"(x0), // %0
2172 "=&d"(x1), // %1
2173 "+r"(dst_argb), // %2
2174 "+r"(src_argb), // %3
2175 "+r"(dst_width) // %4
2176 : "rm"(x), // %5
2177 "rm"(dx) // %6
2178 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2179 }
2180
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// 2x horizontal ARGB upsample by pixel replication: duplicates each of 4
// source dword pixels (punpckldq/punpckhdq against self). x and dx are
// unused (fixed 2x ratio).
void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                           const uint8_t* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile(LABELALIGN
               "1: \n"
               "movdqu (%1),%%xmm0 \n"
               "lea 0x10(%1),%1 \n"
               "movdqa %%xmm0,%%xmm1 \n"
               "punpckldq %%xmm0,%%xmm0 \n"  // duplicate pixels 0,1
               "punpckhdq %%xmm1,%%xmm1 \n"  // duplicate pixels 2,3
               "movdqu %%xmm0,(%0) \n"
               "movdqu %%xmm1,0x10(%0) \n"
               "lea 0x20(%0),%0 \n"
               "sub $0x8,%2 \n"
               "jg 1b \n"

               : "+r"(dst_argb),  // %0
                 "+r"(src_argb),  // %1
                 "+r"(dst_width)  // %2
               ::"memory",
                 "cc", "xmm0", "xmm1");
}
2209
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw:
// interleaves corresponding channels of two ARGB pixels so each
// pmaddubsw lane blends one channel pair.
static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,      // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
// (one fraction per output pixel, replicated across its 4 channels x 2).
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
2220
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// Horizontally scales one ARGB row: for each dst pixel, loads the two
// source pixels straddling x (16.16 fixed point), blends channels with
// the 7-bit fraction via pmaddubsw, then advances x by dx. Processes two
// output pixels per loop plus one trailing pixel.
// NOTE(review): the first asm statement loads xmm4/xmm5 and the second
// assumes they survive in between, but neither declares them as outputs
// or clobbers — fragile per GCC extended-asm rules; confirm upstream
// intent before touching.
void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                               const uint8_t* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movdqa %0,%%xmm4 \n"  // channel-interleave shuffler
      "movdqa %1,%%xmm5 \n"  // fraction splat shuffler
      :
      : "m"(kShuffleColARGB),  // %0
        "m"(kShuffleFractions)  // %1
      );

  asm volatile(
      "movd %5,%%xmm2 \n"  // xmm2 = x accumulator
      "movd %6,%%xmm3 \n"  // xmm3 = dx
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x9,%%xmm6 \n"  // 0x007f007f for inverting fraction
      "pextrw $0x1,%%xmm2,%k3 \n"  // x0 = int(x)
      "sub $0x2,%2 \n"
      "jl 29f \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "paddd %%xmm3,%%xmm0 \n"
      "punpckldq %%xmm0,%%xmm2 \n"  // xmm2 = {x, x+dx}
      "punpckldq %%xmm3,%%xmm3 \n"
      "paddd %%xmm3,%%xmm3 \n"  // xmm3 = {2*dx, 2*dx}
      "pextrw $0x3,%%xmm2,%k4 \n"  // x1 = int(x+dx)

      LABELALIGN
      "2: \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "movq 0x00(%1,%3,4),%%xmm0 \n"  // 2 src pixels at x0
      "psrlw $0x9,%%xmm1 \n"  // 7-bit fractions
      "movhps 0x00(%1,%4,4),%%xmm0 \n"  // 2 src pixels at x1
      "pshufb %%xmm5,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pxor %%xmm6,%%xmm1 \n"  // pair fraction with 128-fraction
      "pmaddubsw %%xmm1,%%xmm0 \n"
      "psrlw $0x7,%%xmm0 \n"  // /128
      "pextrw $0x1,%%xmm2,%k3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%0) \n"  // store 2 dst pixels
      "lea 0x8(%0),%0 \n"
      "sub $0x2,%2 \n"
      "jge 2b \n"

      LABELALIGN
      "29: \n"  // handle last odd pixel, if any
      "add $0x1,%2 \n"
      "jl 99f \n"
      "psrlw $0x9,%%xmm2 \n"
      "movq 0x00(%1,%3,4),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pxor %%xmm6,%%xmm2 \n"
      "pmaddubsw %%xmm2,%%xmm0 \n"
      "psrlw $0x7,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movd %%xmm0,(%0) \n"

      LABELALIGN
      "99: \n"  // clang-format error.

      : "+r"(dst_argb),    // %0
        "+r"(src_argb),    // %1
        "+rm"(dst_width),  // %2
        "=&r"(x0),         // %3
        "=&r"(x1)          // %4
      : "rm"(x),   // %5
        "rm"(dx)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
2297
// Divide num by div and return as 16.16 fixed point result.
// cdq + shld/shl build the 64-bit value (num << 16) in edx:eax, which
// idiv divides by div (in ecx). Quotient lands in eax, i.e. %0.
// The trailing "mov %0, %%eax" moves eax to itself — appears to be a
// harmless no-op remnant; confirm before removing.
// NOTE(review): behavior is undefined if div == 0 or the quotient
// overflows 32 bits — callers presumably guarantee otherwise.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq \n"
      "shld $0x10,%%eax,%%edx \n"
      "shl $0x10,%%eax \n"
      "idiv %1 \n"
      "mov %0, %%eax \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
2311
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// Computes ((num << 16) - 0x10001) / (div - 1): the 64-bit subtraction
// (sub/sbb across eax:edx) effectively forms (num - 1) << 16 biased so
// the endpoints map exactly. Quotient returned via eax (%0).
// NOTE(review): undefined if div == 1 (divide by zero) — callers
// presumably guarantee div > 1.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq \n"
      "shld $0x10,%%eax,%%edx \n"
      "shl $0x10,%%eax \n"
      "sub $0x10001,%%eax \n"
      "sbb $0x0,%%edx \n"
      "sub $0x1,%1 \n"
      "idiv %1 \n"
      "mov %0, %%eax \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
2328
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
// Shuffle table for splitting UV into upper and lower part of register:
// all U bytes into the low 8 lanes, all V bytes into the high 8 lanes.
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
                                      1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
// Re-interleaves 4 averaged U/V word results back into 8 UV bytes in the
// low half; high half is zeroed (0x80), so only a movq store is needed.
static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
                                      6u, 14u, 0x80, 0x80, 0x80, 0x80,
                                      0x80, 0x80, 0x80, 0x80};
2336
ScaleUVRowDown2Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)2337 void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
2338 ptrdiff_t src_stride,
2339 uint8_t* dst_ptr,
2340 int dst_width) {
2341 asm volatile(
2342 "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
2343 "psrlw $0xf,%%xmm4 \n"
2344 "packuswb %%xmm4,%%xmm4 \n"
2345 "pxor %%xmm5, %%xmm5 \n" // zero
2346 "movdqa %4,%%xmm1 \n" // split shuffler
2347 "movdqa %5,%%xmm3 \n" // merge shuffler
2348
2349 LABELALIGN
2350 "1: \n"
2351 "movdqu (%0),%%xmm0 \n" // 8 UV row 0
2352 "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
2353 "lea 0x10(%0),%0 \n"
2354 "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
2355 "pshufb %%xmm1,%%xmm2 \n"
2356 "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
2357 "pmaddubsw %%xmm4,%%xmm2 \n"
2358 "paddw %%xmm2,%%xmm0 \n" // vertical add
2359 "psrlw $0x1,%%xmm0 \n" // round
2360 "pavgw %%xmm5,%%xmm0 \n"
2361 "pshufb %%xmm3,%%xmm0 \n" // merge uv
2362 "movq %%xmm0,(%1) \n"
2363 "lea 0x8(%1),%1 \n" // 4 UV
2364 "sub $0x4,%2 \n"
2365 "jg 1b \n"
2366 : "+r"(src_ptr), // %0
2367 "+r"(dst_ptr), // %1
2368 "+r"(dst_width) // %2
2369 : "r"((intptr_t)(src_stride)), // %3
2370 "m"(kShuffleSplitUV), // %4
2371 "m"(kShuffleMergeUV) // %5
2372 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2373 }
2374 #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
2375
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
// AVX2 variant of ScaleUVRowDown2Box: same 2x2 box filter on interleaved
// UV, but on 16 input UV pairs per iteration. vpermq reorders the two
// lanes after the in-lane merge shuffle so a single 16-byte store works.
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // 01010101
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"  // zero
      "vbroadcastf128 %4,%%ymm1 \n"  // split shuffler
      "vbroadcastf128 %5,%%ymm3 \n"  // merge shuffler

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // 16 UV row 0
      "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"  // 16 UV row 1
      "lea 0x20(%0),%0 \n"
      "vpshufb %%ymm1,%%ymm0,%%ymm0 \n"  // uuuuvvvv
      "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // horizontal add
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"  // vertical add
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"  // round
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm3,%%ymm0,%%ymm0 \n"  // merge uv
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // combine qwords
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"  // 8 UV
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kShuffleSplitUV),        // %4
        "m"(kShuffleMergeUV)         // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
2417
// pmaddubsw weights (3,1)/(1,3) for 2x linear upsampling of interleaved
// UV: each output is 3*near + 1*far, normalized later by the callers.
static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
                                      3, 1, 3, 1, 1, 3, 1, 3};
2420
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
// 2x horizontal linear upsample of interleaved UV: each output sample is
// (3*near + far + 2) >> 2. Reads 4 UV pairs (plus one lookahead pair)
// and writes 8 UV pairs per iteration.
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "pcmpeqw %%xmm4,%%xmm4 \n"
      "psrlw $15,%%xmm4 \n"
      "psllw $1,%%xmm4 \n"  // all 2
      "movdqa %3,%%xmm3 \n"  // kUVLinearMadd31 weights

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // 00112233 (1u1v)
      "movq 2(%0),%%xmm1 \n"  // 11223344 (1u1v)
      "punpcklbw %%xmm1,%%xmm0 \n"  // 0101121223233434 (2u2v)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhdq %%xmm0,%%xmm2 \n"  // 2323232334343434 (2u2v)
      "punpckldq %%xmm0,%%xmm0 \n"  // 0101010112121212 (2u2v)
      "pmaddubsw %%xmm3,%%xmm2 \n"  // 3*near+far (1u1v16, hi)
      "pmaddubsw %%xmm3,%%xmm0 \n"  // 3*near+far (1u1v16, lo)
      "paddw %%xmm4,%%xmm0 \n"  // 3*near+far+2 (lo)
      "paddw %%xmm4,%%xmm2 \n"  // 3*near+far+2 (hi)
      "psrlw $2,%%xmm0 \n"  // 3/4*near+1/4*far (lo)
      "psrlw $2,%%xmm2 \n"  // 3/4*near+1/4*far (hi)
      "packuswb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 4 uv to 8 uv
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),        // %0
        "+r"(dst_ptr),        // %1
        "+r"(dst_width)       // %2
      : "m"(kUVLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
2459
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
// 2x bilinear upsample of interleaved UV in both directions: produces two
// output rows (dst_ptr and dst_ptr + dst_stride) from two input rows
// (src_ptr and src_ptr + src_stride). Each output is the 9/3/3/1-weighted
// average of the surrounding 2x2 inputs, +8 for rounding, >> 4.
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "pcmpeqw %%xmm6,%%xmm6 \n"
      "psrlw $15,%%xmm6 \n"
      "psllw $3,%%xmm6 \n"  // all 8
      "movdqa %5,%%xmm7 \n"  // kUVLinearMadd31 weights

      LABELALIGN
      "1: \n"
      // Horizontal pass, row 1.
      "movq (%0),%%xmm0 \n"  // 00112233 (1u1v)
      "movq 2(%0),%%xmm1 \n"  // 11223344 (1u1v)
      "punpcklbw %%xmm1,%%xmm0 \n"  // 0101121223233434 (2u2v)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckhdq %%xmm0,%%xmm2 \n"  // 2323232334343434 (2u2v)
      "punpckldq %%xmm0,%%xmm0 \n"  // 0101010112121212 (2u2v)
      "pmaddubsw %%xmm7,%%xmm2 \n"  // 3*near+far (1u1v16, hi)
      "pmaddubsw %%xmm7,%%xmm0 \n"  // 3*near+far (1u1v16, lo)

      // Horizontal pass, row 2 (src_ptr + src_stride).
      "movq (%0,%3),%%xmm1 \n"
      "movq 2(%0,%3),%%xmm4 \n"
      "punpcklbw %%xmm4,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "punpckhdq %%xmm1,%%xmm3 \n"
      "punpckldq %%xmm1,%%xmm1 \n"
      "pmaddubsw %%xmm7,%%xmm3 \n"  // 3*near+far (2, hi)
      "pmaddubsw %%xmm7,%%xmm1 \n"  // 3*near+far (2, lo)

      // Vertical pass: combine the two horizontally-filtered rows.
      // xmm0 xmm2
      // xmm1 xmm3

      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "paddw %%xmm0,%%xmm4 \n"  // 6*near+2*far (1, lo)
      "paddw %%xmm6,%%xmm5 \n"  // 3*near+far+8 (2, lo)
      "paddw %%xmm0,%%xmm4 \n"  // 9*near+3*far (1, lo)
      "paddw %%xmm5,%%xmm4 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw $4,%%xmm4 \n"  // ^ div by 16 (1, lo)

      "movdqa %%xmm1,%%xmm5 \n"
      "paddw %%xmm1,%%xmm5 \n"  // 6*near+2*far (2, lo)
      "paddw %%xmm6,%%xmm0 \n"  // 3*near+far+8 (1, lo)
      "paddw %%xmm1,%%xmm5 \n"  // 9*near+3*far (2, lo)
      "paddw %%xmm0,%%xmm5 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrlw $4,%%xmm5 \n"  // ^ div by 16 (2, lo)

      "movdqa %%xmm2,%%xmm0 \n"
      "movdqa %%xmm3,%%xmm1 \n"
      "paddw %%xmm2,%%xmm0 \n"  // 6*near+2*far (1, hi)
      "paddw %%xmm6,%%xmm1 \n"  // 3*near+far+8 (2, hi)
      "paddw %%xmm2,%%xmm0 \n"  // 9*near+3*far (1, hi)
      "paddw %%xmm1,%%xmm0 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw $4,%%xmm0 \n"  // ^ div by 16 (1, hi)

      "movdqa %%xmm3,%%xmm1 \n"
      "paddw %%xmm3,%%xmm1 \n"  // 6*near+2*far (2, hi)
      "paddw %%xmm6,%%xmm2 \n"  // 3*near+far+8 (1, hi)
      "paddw %%xmm3,%%xmm1 \n"  // 9*near+3*far (2, hi)
      "paddw %%xmm2,%%xmm1 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrlw $4,%%xmm1 \n"  // ^ div by 16 (2, hi)

      "packuswb %%xmm0,%%xmm4 \n"
      "movdqu %%xmm4,(%1) \n"  // store above
      "packuswb %%xmm1,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4) \n"  // store below

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 4 uv to 8 uv
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
2544
#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2

// AVX2 variant of ScaleUVRowUp2_Linear: (3*near + far + 2) >> 2 per
// output sample, 8 input UV pairs -> 16 output UV pairs per iteration.
// The vpermq pre-shuffles keep the unpack results in source order
// across the two 128-bit lanes.
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $15,%%ymm4,%%ymm4 \n"
      "vpsllw $1,%%ymm4,%%ymm4 \n"  // all 2
      "vbroadcastf128 %3,%%ymm3 \n"  // kUVLinearMadd31 weights

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n"
      "vmovdqu 2(%0),%%xmm1 \n"  // lookahead by one UV pair
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
      "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n"  // 3*near+far (hi)
      "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n"  // 3*near+far (lo)
      "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"  // 3*near+far+2 (lo)
      "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"  // 3*near+far+2 (hi)
      "vpsrlw $2,%%ymm0,%%ymm0 \n"  // 3/4*near+1/4*far (lo)
      "vpsrlw $2,%%ymm1,%%ymm1 \n"  // 3/4*near+1/4*far (hi)
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"  // 8 uv to 16 uv
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),        // %0
        "+r"(dst_ptr),        // %1
        "+r"(dst_width)       // %2
      : "m"(kUVLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
2586
#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
// AVX2 variant of ScaleUVRowUp2_Bilinear: 2x upsample of interleaved UV
// in both directions using 9/3/3/1 weights (+8 rounding, >> 4). Reads
// 8 UV pairs from two input rows, writes 16 UV pairs to two output rows
// per iteration.
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  asm volatile(
      "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
      "vpsrlw $15,%%ymm6,%%ymm6 \n"
      "vpsllw $3,%%ymm6,%%ymm6 \n"  // all 8
      "vbroadcastf128 %5,%%ymm7 \n"  // kUVLinearMadd31 weights

      LABELALIGN
      "1: \n"
      // Horizontal pass, row 1.
      "vmovdqu (%0),%%xmm0 \n"
      "vmovdqu 2(%0),%%xmm1 \n"
      "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
      "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
      "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n"  // 3*near+far (1, hi)
      "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n"  // 3*near+far (1, lo)

      // Horizontal pass, row 2 (src_ptr + src_stride).
      "vmovdqu (%0,%3),%%xmm2 \n"  // 0123456789ABCDEF
      "vmovdqu 2(%0,%3),%%xmm3 \n"  // 123456789ABCDEF0
      "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
      "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
      "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
      "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
      "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n"  // 3*near+far (2, hi)
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n"  // 3*near+far (2, lo)

      // Vertical pass: combine the two horizontally-filtered rows.
      // ymm0 ymm1
      // ymm2 ymm3

      "vpaddw %%ymm0,%%ymm0,%%ymm4 \n"  // 6*near+2*far (1, lo)
      "vpaddw %%ymm6,%%ymm2,%%ymm5 \n"  // 3*near+far+8 (2, lo)
      "vpaddw %%ymm4,%%ymm0,%%ymm4 \n"  // 9*near+3*far (1, lo)
      "vpaddw %%ymm4,%%ymm5,%%ymm4 \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrlw $4,%%ymm4,%%ymm4 \n"  // ^ div by 16 (1, lo)

      "vpaddw %%ymm2,%%ymm2,%%ymm5 \n"  // 6*near+2*far (2, lo)
      "vpaddw %%ymm6,%%ymm0,%%ymm0 \n"  // 3*near+far+8 (1, lo)
      "vpaddw %%ymm5,%%ymm2,%%ymm5 \n"  // 9*near+3*far (2, lo)
      "vpaddw %%ymm5,%%ymm0,%%ymm5 \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrlw $4,%%ymm5,%%ymm5 \n"  // ^ div by 16 (2, lo)

      "vpaddw %%ymm1,%%ymm1,%%ymm0 \n"  // 6*near+2*far (1, hi)
      "vpaddw %%ymm6,%%ymm3,%%ymm2 \n"  // 3*near+far+8 (2, hi)
      "vpaddw %%ymm0,%%ymm1,%%ymm0 \n"  // 9*near+3*far (1, hi)
      "vpaddw %%ymm0,%%ymm2,%%ymm0 \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrlw $4,%%ymm0,%%ymm0 \n"  // ^ div by 16 (1, hi)

      "vpaddw %%ymm3,%%ymm3,%%ymm2 \n"  // 6*near+2*far (2, hi)
      "vpaddw %%ymm6,%%ymm1,%%ymm1 \n"  // 3*near+far+8 (1, hi)
      "vpaddw %%ymm2,%%ymm3,%%ymm2 \n"  // 9*near+3*far (2, hi)
      "vpaddw %%ymm2,%%ymm1,%%ymm2 \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrlw $4,%%ymm2,%%ymm2 \n"  // ^ div by 16 (2, hi)

      "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
      "vmovdqu %%ymm4,(%1) \n"  // store above
      "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
      "vmovdqu %%ymm5,(%1,%4) \n"  // store below

      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"  // 8 uv to 16 uv
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
2668
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
// 16-bit variant: 2x horizontal linear upsample of interleaved UV with
// uint16_t samples, widened to 32 bits for the (3*near + far + 2) >> 2
// arithmetic. 2 input UV pairs -> 4 output UV pairs per iteration.
// NOTE(review): packusdw is an SSE4.1 instruction, not SSE2 — the _SSE2
// suffix appears historical; the HAS_ guard presumably requires SSE4.1.
void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  asm volatile(
      "pxor %%xmm5,%%xmm5 \n"  // zero for widening
      "pcmpeqd %%xmm4,%%xmm4 \n"
      "psrld $31,%%xmm4 \n"
      "pslld $1,%%xmm4 \n"  // all 2

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // 0011 (16b, 1u1v)
      "movq 4(%0),%%xmm1 \n"  // 1122 (16b, 1u1v)

      "punpcklwd %%xmm5,%%xmm0 \n"  // 0011 (32b, 1u1v)
      "punpcklwd %%xmm5,%%xmm1 \n"  // 1122 (32b, 1u1v)

      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"

      "pshufd $0b01001110,%%xmm2,%%xmm2 \n"  // 1100 (lo, far)
      "pshufd $0b01001110,%%xmm3,%%xmm3 \n"  // 2211 (hi, far)

      "paddd %%xmm4,%%xmm2 \n"  // far+2 (lo)
      "paddd %%xmm4,%%xmm3 \n"  // far+2 (hi)
      "paddd %%xmm0,%%xmm2 \n"  // near+far+2 (lo)
      "paddd %%xmm1,%%xmm3 \n"  // near+far+2 (hi)
      "paddd %%xmm0,%%xmm0 \n"  // 2*near (lo)
      "paddd %%xmm1,%%xmm1 \n"  // 2*near (hi)
      "paddd %%xmm2,%%xmm0 \n"  // 3*near+far+2 (lo)
      "paddd %%xmm3,%%xmm1 \n"  // 3*near+far+2 (hi)

      "psrld $2,%%xmm0 \n"  // 3/4*near+1/4*far (lo)
      "psrld $2,%%xmm1 \n"  // 3/4*near+1/4*far (hi)
      "packusdw %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 2 uv to 4 uv
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
2718
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
// 16-bit variant: 2x bilinear upsample of interleaved uint16_t UV in
// both directions, widened to 32 bits for the 9/3/3/1-weighted average
// (+8 rounding, >> 4). Reads 2 UV pairs from two input rows and writes
// 4 UV pairs to two output rows per iteration. Strides are scaled by 2
// in the addressing (",2") because samples are 16-bit.
// NOTE(review): packusdw is SSE4.1 — the _SSE2 suffix appears
// historical; the HAS_ guard presumably requires SSE4.1.
void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  asm volatile(
      "pxor %%xmm7,%%xmm7 \n"  // zero for widening
      "pcmpeqd %%xmm6,%%xmm6 \n"
      "psrld $31,%%xmm6 \n"
      "pslld $3,%%xmm6 \n"  // all 8

      LABELALIGN
      "1: \n"
      // Horizontal pass, row 1.
      "movq (%0),%%xmm0 \n"  // 0011 (16b, 1u1v)
      "movq 4(%0),%%xmm1 \n"  // 1122 (16b, 1u1v)
      "punpcklwd %%xmm7,%%xmm0 \n"  // 0011 (near) (32b, 1u1v)
      "punpcklwd %%xmm7,%%xmm1 \n"  // 1122 (near) (32b, 1u1v)
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "pshufd $0b01001110,%%xmm2,%%xmm2 \n"  // 1100 (far) (1, lo)
      "pshufd $0b01001110,%%xmm3,%%xmm3 \n"  // 2211 (far) (1, hi)
      "paddd %%xmm0,%%xmm2 \n"  // near+far (1, lo)
      "paddd %%xmm1,%%xmm3 \n"  // near+far (1, hi)
      "paddd %%xmm0,%%xmm0 \n"  // 2*near (1, lo)
      "paddd %%xmm1,%%xmm1 \n"  // 2*near (1, hi)
      "paddd %%xmm2,%%xmm0 \n"  // 3*near+far (1, lo)
      "paddd %%xmm3,%%xmm1 \n"  // 3*near+far (1, hi)

      // Horizontal pass, row 2 (src_ptr + src_stride).
      "movq (%0,%3,2),%%xmm2 \n"
      "movq 4(%0,%3,2),%%xmm3 \n"
      "punpcklwd %%xmm7,%%xmm2 \n"
      "punpcklwd %%xmm7,%%xmm3 \n"
      "movdqa %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "pshufd $0b01001110,%%xmm4,%%xmm4 \n"  // 1100 (far) (2, lo)
      "pshufd $0b01001110,%%xmm5,%%xmm5 \n"  // 2211 (far) (2, hi)
      "paddd %%xmm2,%%xmm4 \n"  // near+far (2, lo)
      "paddd %%xmm3,%%xmm5 \n"  // near+far (2, hi)
      "paddd %%xmm2,%%xmm2 \n"  // 2*near (2, lo)
      "paddd %%xmm3,%%xmm3 \n"  // 2*near (2, hi)
      "paddd %%xmm4,%%xmm2 \n"  // 3*near+far (2, lo)
      "paddd %%xmm5,%%xmm3 \n"  // 3*near+far (2, hi)

      // Vertical pass: combine the two horizontally-filtered rows.
      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "paddd %%xmm0,%%xmm4 \n"  // 6*near+2*far (1, lo)
      "paddd %%xmm6,%%xmm5 \n"  // 3*near+far+8 (2, lo)
      "paddd %%xmm0,%%xmm4 \n"  // 9*near+3*far (1, lo)
      "paddd %%xmm5,%%xmm4 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrld $4,%%xmm4 \n"  // ^ div by 16 (1, lo)

      "movdqa %%xmm2,%%xmm5 \n"
      "paddd %%xmm2,%%xmm5 \n"  // 6*near+2*far (2, lo)
      "paddd %%xmm6,%%xmm0 \n"  // 3*near+far+8 (1, lo)
      "paddd %%xmm2,%%xmm5 \n"  // 9*near+3*far (2, lo)
      "paddd %%xmm0,%%xmm5 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrld $4,%%xmm5 \n"  // ^ div by 16 (2, lo)

      "movdqa %%xmm1,%%xmm0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "paddd %%xmm1,%%xmm0 \n"  // 6*near+2*far (1, hi)
      "paddd %%xmm6,%%xmm2 \n"  // 3*near+far+8 (2, hi)
      "paddd %%xmm1,%%xmm0 \n"  // 9*near+3*far (1, hi)
      "paddd %%xmm2,%%xmm0 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrld $4,%%xmm0 \n"  // ^ div by 16 (1, hi)

      "movdqa %%xmm3,%%xmm2 \n"
      "paddd %%xmm3,%%xmm2 \n"  // 6*near+2*far (2, hi)
      "paddd %%xmm6,%%xmm1 \n"  // 3*near+far+8 (1, hi)
      "paddd %%xmm3,%%xmm2 \n"  // 9*near+3*far (2, hi)
      "paddd %%xmm1,%%xmm2 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrld $4,%%xmm2 \n"  // ^ div by 16 (2, hi)

      "packusdw %%xmm0,%%xmm4 \n"
      "movdqu %%xmm4,(%1) \n"  // store above
      "packusdw %%xmm2,%%xmm5 \n"
      "movdqu %%xmm5,(%1,%4,2) \n"  // store below

      "lea 0x8(%0),%0 \n"
      "lea 0x10(%1),%1 \n"  // 2 uv to 4 uv
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
2811
2812 #ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
// Horizontally upsamples one row of 16-bit interleaved UV pairs by 2x with a
// linear filter: each output sample is (3*near + far + 2) >> 2, i.e. weights
// 3/4 and 1/4 with rounding.  Per loop iteration this reads 4 source UV pairs
// (plus one pair of look-ahead at offset +4 bytes) and writes 8 destination
// UV pairs.  NOTE(review): dst_width appears to be counted in UV pairs — the
// loop subtracts 8 after storing 8 pairs (32 bytes); confirm against callers.
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  asm volatile(
      // Materialize the rounding constant without a memory load:
      // all-ones -> 1 -> 2 in every 32-bit lane of ymm4.
      "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrld      $31,%%ymm4,%%ymm4             \n"
      "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)

      // Widen to 32 bits so 3*near + far + 2 cannot overflow a 16-bit lane.
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)

      // Swap the two UV pairs (qwords) within each 128-bit lane to obtain
      // the "far" neighbor for every "near" sample.
      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)

      "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
      "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)

      "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
      "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
      // Narrow back to 16 bits with unsigned saturation and store 32 bytes.
      "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"  // avoid AVX->SSE penalty
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
2857 #endif
2858
2859 #ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
// 2x2 bilinear upsample of 16-bit interleaved UV pairs.  Reads two source
// rows (src_ptr and src_ptr + src_stride elements) and writes two
// destination rows (dst_ptr and dst_ptr + dst_stride elements; the
// (%1,%4,2) addressing scales the element stride to bytes).  Each output is
// (9*near + 3*far_h + 3*far_v + 1*far_diag + 8) >> 4 — bilinear weights
// 9/16, 3/16, 3/16, 1/16 with rounding.  Per iteration: 4 source UV pairs
// per row in, 8 destination UV pairs per row out.
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  asm volatile(
      // Materialize the rounding constant 8 in every 32-bit lane of ymm6
      // (all-ones -> 1 -> 8) without a memory load.
      "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrld      $31,%%ymm6,%%ymm6             \n"
      "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8

      LABELALIGN
      "1:                                        \n"

      // Row 1: horizontal pass, ymm0/ymm1 = 3*near+far (lo/hi), 32-bit lanes.
      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (hi)

      // Row 2 (src_ptr + src_stride elements): same horizontal pass into
      // ymm2/ymm3.
      "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0,%3,2),%%xmm3             \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
      "vpshufd     $0b01001110,%%ymm2,%%ymm4     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm3,%%ymm5     \n"  // 22114433 (hi, far)
      "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
      "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
      "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
      "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (hi)

      // Vertical pass, low halves: above = (3*row1 + row2 + 8) >> 4,
      // below = (row1 + 3*row2 + 8) >> 4, where rowN = 3*near+far.
      "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
      "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
      "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
      "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)

      "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
      "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
      "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
      "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)

      // Vertical pass, high halves (same formulas).
      "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
      "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
      "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)

      "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
      "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
      "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
      "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)

      // Narrow to 16 bits with unsigned saturation and store both rows.
      "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
      "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
      "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"  // avoid AVX->SSE penalty
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
2940 #endif
2941
2942 #endif // defined(__x86_64__) || defined(__i386__)
2943
2944 #ifdef __cplusplus
2945 } // extern "C"
2946 } // namespace libyuv
2947 #endif
2948