1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
20
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// Per-channel luma weight bytes in memory (B,G,R,A) order, replicated 4x to
// fill a 16-byte vector so one multiply-add can cover 4 pixels.
// NOTE(review): presumably consumed by pmaddubsw in the *ToYRow kernels,
// which are outside this chunk -- confirm in the rest of the file.
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPeg full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

// Same full-range weights as kARGBToYJ, rotated for RGBA byte order (A,B,G,R).
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
34
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Signed per-channel U/V weight vectors for each supported pixel byte order,
// replicated 4x per 16-byte vector.  The bias vectors at the bottom re-center
// results.  NOTE(review): the consuming kernels are outside this chunk;
// weight meanings are inferred from the constant names -- confirm in row.h.
static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

// JPeg (full range) variant.
static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

// JPeg (full range) variant.
static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

// 16-bit-lane bias 0x7e80; used by Y conversion kernels outside this chunk.
static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

// Per-byte 128 bias (U/V re-centering).
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// 0x80 in every byte, expressed as 16-bit lanes.
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
89
#ifdef HAS_RGB24TOARGBROW_SSSE3

// pshufb shuffle tables.  A mask byte of 128 has its high bit set, which
// makes pshufb write 0 to that output byte (used to blank alpha/tail bytes).

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
157
158 #ifdef HAS_J400TOARGBROW_SSE2
// Converts 8 gray (J400) pixels per loop pass to ARGB: each Y byte is
// replicated into the B, G and R channels and alpha is forced to 0xFF.
// NOTE(review): there is no tail handling -- width is presumably a positive
// multiple of 8; confirm callers guarantee this.
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      // xmm5 = 0xff000000 in every dword (all-ones shifted left 24).
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"  // load 8 Y bytes
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"  // duplicate: YY per 16-bit lane
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"  // YYYY for pixels 0-3
      "punpckhwd %%xmm1,%%xmm1 \n"  // YYYY for pixels 4-7
      "por %%xmm5,%%xmm0 \n"  // set alpha bytes to 0xff
      "por %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
185 #endif // HAS_J400TOARGBROW_SSE2
186
187 #ifdef HAS_RGB24TOARGBROW_SSSE3
// Converts 16 RGB24 pixels (48 bytes) per loop pass to 16 ARGB pixels
// (64 bytes).  palignr realigns the packed 3-byte pixels onto 12-byte
// boundaries so the shuffle mask can expand each group of 4 pixels to 16
// bytes; alpha is then forced to 0xFF.
// NOTE(review): no tail handling -- width presumably a positive multiple of
// 16; confirm callers guarantee this.
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"  // RGB24 -> ARGB shuffle mask

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"  // source bytes for pixels 8-11
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"  // source bytes for pixels 4-7
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"  // source bytes for pixels 12-15
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
227
// Converts 16 RAW (R,G,B byte order) pixels per loop pass to ARGB.
// Identical structure to RGB24ToARGBRow_SSSE3; only the shuffle mask differs
// (kShuffleMaskRAWToARGB swaps R and B while expanding to 4 bytes/pixel).
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"  // RAW -> ARGB shuffle mask

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"  // source bytes for pixels 8-11
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"  // source bytes for pixels 4-7
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"  // source bytes for pixels 12-15
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
265
// Same code as RAWToARGB with different shuffler and A in low bits.
// Note xmm5 here is 0x000000ff (alpha in the LOW byte of each dword, matching
// RGBA memory layout) instead of 0xff000000.
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0x000000ff
      "psrld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"  // RAW -> RGBA shuffle mask

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"  // source bytes for pixels 8-11
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"  // source bytes for pixels 4-7
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"  // source bytes for pixels 12-15
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
304
// Converts 8 RAW pixels (24 bytes) per loop pass to RGB24, swapping the R
// and B bytes of each pixel.  Three overlapping 16-byte loads (offsets 0, 4
// and 8) cover the 24 input bytes; each shuffle mask produces 8 output bytes
// (the upper 8 mask entries are 128, i.e. zeroed) stored with movq.
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm3 \n"
      "movdqa %4,%%xmm4 \n"
      "movdqa %5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x4(%0),%%xmm1 \n"
      "movdqu 0x8(%0),%%xmm2 \n"
      "lea 0x18(%0),%0 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
336
// Converts 8 RGB565 pixels (16 bytes) per loop pass to ARGB (32 bytes).
// The 5- and 6-bit fields are widened to 8 bits with pmulhuw: for a 5-bit
// value v left-aligned in a 16-bit lane, (v<<11)*0x0108 >> 16 == (v<<3)|(v>>2)
// (exact 5->8 replication); 0x2080 does the same for the 6-bit field.
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"  // 0x0108: 5->8 bit expand factor
      "mov $0x20802080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"  // 0x2080: 6->8 bit expand factor
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"  // 0xf800: top 5-bit field mask
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xa,%%xmm4 \n"
      "psrlw $0x5,%%xmm4 \n"  // 0x07e0: middle 6-bit field mask
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"  // 0xff00: alpha bytes
      // Bias dst by -2*src so (%1,%0,2) addresses dst while only the src
      // pointer is advanced (2 input bytes become 4 output bytes).
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"  // isolate top field
      "psllw $0xb,%%xmm2 \n"   // left-align bottom field
      "pmulhuw %%xmm5,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"  // per lane: two expanded color bytes
      "pand %%xmm4,%%xmm0 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"  // expanded green in low byte
      "por %%xmm7,%%xmm0 \n"      // alpha 0xff in high byte
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"  // interleave to B,G,R,A pixels 0-3
      "punpckhbw %%xmm0,%%xmm2 \n"  // pixels 4-7
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
384
// Converts 8 ARGB1555 pixels (16 bytes) per loop pass to ARGB (32 bytes).
// The three 5-bit color fields are widened to 8 bits with pmulhuw multipliers
// (0x0108 for left-aligned fields, 0x4200 for the in-place green field); the
// single alpha bit is replicated to a full byte via arithmetic shift.
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"  // 0x0108: 5->8 bit expand factor
      "mov $0x42004200,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"  // 0x4200: expand for bits 5-9 field
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"  // 0xf800
      "movdqa %%xmm3,%%xmm4 \n"
      "psrlw $0x6,%%xmm4 \n"  // 0x03e0: 5-bit green field mask
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"  // 0xff00: alpha byte mask
      // Bias dst by -2*src so (%1,%0,2) addresses dst (2 bytes in, 4 out).
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "psllw $0x1,%%xmm1 \n"   // move bits 10-14 field up to 11-15
      "psllw $0xb,%%xmm2 \n"   // left-align bits 0-4 field
      "pand %%xmm3,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"  // two expanded color bytes per lane
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"  // replicate alpha bit 15 downward
      "pmulhuw %%xmm6,%%xmm0 \n"  // expanded green in low byte
      "pand %%xmm7,%%xmm2 \n"     // alpha byte: 0xff00 or 0
      "por %%xmm2,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"  // interleave to 4-byte pixels 0-3
      "punpckhbw %%xmm0,%%xmm2 \n"  // pixels 4-7
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
435
// Converts 8 ARGB4444 pixels (16 bytes) per loop pass to ARGB (32 bytes).
// Each 4-bit field is widened to 8 bits by replicating the nibble
// (x -> x | x<<4, or x | x>>4 for the high nibble).
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0xf0f0f0f,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"  // 0x0f per byte: low nibbles
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x4,%%xmm5 \n"  // 0xf0 per byte: high nibbles
      // Bias dst by -2*src so (%1,%0,2) addresses dst (2 bytes in, 4 out).
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"  // low nibbles
      "pand %%xmm5,%%xmm2 \n"  // high nibbles
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "psllw $0x4,%%xmm1 \n"
      "psrlw $0x4,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"  // low nibble replicated to full byte
      "por %%xmm3,%%xmm2 \n"  // high nibble replicated to full byte
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"  // interleave to 4-byte pixels 0-3
      "punpckhbw %%xmm2,%%xmm1 \n"  // pixels 4-7
      "movdqu %%xmm0,0x00(%1,%0,2) \n"
      "movdqu %%xmm1,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
472
// Converts 16 ARGB pixels (64 bytes) per loop pass to RGB24 (48 bytes).
// Each 16-byte chunk is shuffled down to 12 valid bytes (mask zeroes the
// top 4), then the 4x12 bytes are packed contiguously with byte shifts/ors.
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa %3,%%xmm6 \n"  // ARGB -> RGB24 shuffle mask

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"  // 12 bytes + 4 zero bytes each
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"  // 12 + 4 bytes
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"  // 8 + 8 bytes
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"  // 4 + 12 bytes
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
511
// Converts 16 ARGB pixels per loop pass to RAW (B and R swapped RGB24).
// Identical structure to ARGBToRGB24Row_SSSE3; only the shuffle mask differs.
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa %3,%%xmm6 \n"  // ARGB -> RAW shuffle mask

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"  // 12 bytes + 4 zero bytes each
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"  // 12 + 4 bytes
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"  // 8 + 8 bytes
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"  // 4 + 12 bytes
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
550
#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd for 12+12 to 24: moves the two 12-byte halves of a shuffled ymm
// register together (dwords 0-2 and 4-6), parking the junk dwords (3,7) last.
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

// Converts 32 ARGB pixels (128 bytes) per loop pass to RGB24 (96 bytes).
// Per-lane pshufb drops alpha (12 valid bytes per 16-byte lane), vpermd
// compacts each ymm to 24 contiguous bytes, then vpermq/vpor splice the
// 4x24 bytes into three 32-byte stores.
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
599
#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24.
// vpermt2b index tables: indices 0-31 select from the destination register,
// 32-63 from the source operand, so each table splices 32 RGB bytes out of
// two adjacent 32-byte ARGB registers (alpha indices are simply skipped).
static const ulvec8 kPermARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

// Converts 32 ARGB pixels (128 bytes) per loop pass to RGB24 (96 bytes)
// using three two-source byte permutes -- no shift/or packing needed.
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa %3,%%ymm5 \n"
      "vmovdqa %4,%%ymm6 \n"
      "vmovdqa %5,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
      "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif
647
#ifdef HAS_ARGBTORAWROW_AVX2
// Converts 32 ARGB pixels per loop pass to RAW (B/R swapped RGB24).
// Identical structure to ARGBToRGB24Row_AVX2; only the shuffle mask differs.
// Reuses kPermdRGB24_AVX (declared under HAS_ARGBTORGB24ROW_AVX2 above).
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
693
// Converts 4 ARGB pixels (16 bytes) per loop pass to RGB565 (8 bytes) by
// truncating each channel and packing 5/6/5 fields into 16 bits.
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"  // 0x0000001f: low 5-bit field mask
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"   // 0x000007e0: 6-bit field mask
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"   // 0xfffff800: top field mask

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"   // drop 3 low bits of first channel
      "psrld $0x5,%%xmm2 \n"   // align second channel's top 6 bits
      "psrad $0x10,%%xmm0 \n"  // align third channel's top 5 bits
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"   // one 565 value per dword
      "packssdw %%xmm0,%%xmm0 \n"  // narrow 4 dwords to 4 words
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
730
// Converts 4 ARGB pixels per loop pass to RGB565, adding a dither value
// (with unsigned saturation) to every channel byte before truncation.
// dither4 packs 4 dither bytes, one per pixel in the group of 4; the
// unpack sequence broadcasts each byte across its pixel's 4 channel bytes.
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"  // d0d0 d1d1 d2d2 d3d3
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"  // d0 x4, d1 x4, d2 x4, d3 x4
      "punpckhwd %%xmm7,%%xmm7 \n"  // NOTE(review): xmm7 unused in the loop
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"  // 0x0000001f
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"   // 0x000007e0
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"   // 0xfffff800

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "paddusb %%xmm6,%%xmm0 \n"  // saturating add of dither bytes
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"   // one 565 value per dword
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
777
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 variant of ARGBToRGB565DitherRow: 8 ARGB pixels per loop pass.
// The 4 dither bytes are broadcast so each repeats across one pixel's 4
// channel bytes (the same 4-byte pattern covers both groups of 4 pixels).
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"  // 0x0000001f
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld $0x5,%%ymm4,%%ymm4 \n"   // 0x000007e0
      "vpslld $0xb,%%ymm3,%%ymm5 \n"   // 0x0000f800

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"  // saturating dither add
      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"   // one 565 value per dword
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // fix 128-bit lane interleave
      "lea 0x20(%0),%0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
823
// Converts 4 ARGB pixels (16 bytes) per loop pass to ARGB1555 (8 bytes):
// three 5-bit color fields plus a 1-bit alpha in the top bit.
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1b,%%xmm4 \n"  // 0x0000001f
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x5,%%xmm5 \n"   // 0x000003e0
      "movdqa %%xmm4,%%xmm6 \n"
      "pslld $0xa,%%xmm6 \n"   // 0x00007c00
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "pslld $0xf,%%xmm7 \n"   // 0xffff8000: alpha bit mask

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "psrad $0x10,%%xmm0 \n"  // alpha's top bit toward bit 15
      "psrld $0x3,%%xmm1 \n"   // drop 3 low bits per channel
      "psrld $0x6,%%xmm2 \n"
      "psrld $0x9,%%xmm3 \n"
      "pand %%xmm7,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "pand %%xmm5,%%xmm2 \n"
      "pand %%xmm6,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"   // one 1555 value per dword
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
864
// Converts 4 ARGB pixels (16 bytes) per loop pass to ARGB4444 (8 bytes):
// the top nibble of each channel byte is kept and the nibbles are packed.
// NOTE(review): xmm2 is listed as clobbered but not used by the code.
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xc,%%xmm4 \n"  // 0xf000 per word
      "movdqa %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm3 \n"  // 0x00f0 per word

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm3,%%xmm0 \n"  // even channels' top nibbles
      "pand %%xmm4,%%xmm1 \n"  // odd channels' top nibbles
      "psrlq $0x4,%%xmm0 \n"
      "psrlq $0x8,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"   // packed nibble pairs in low bytes
      "packuswb %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
893 #endif // HAS_RGB24TOARGBROW_SSSE3
894
/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply
by (1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just 2 upper bits of alpha and 8 bits of green. The same
multiplier could be used for Green - (1024+4) - putting the 10 bit green in
the lsb. Alpha needs a simple multiplier to shift it into position. It wants a
gap of 10 above the green. Green is 10 bits, so there are 6 bits in the low
short. 4 more are needed, so a multiplier of 4 gets the 2 bits into the upper
16 bits, and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift
the result left 10 to position the A and G channels.
*/
915
// Shuffle tables for the AR30 conversions below: place the R and B (or, for
// ABGR input, B and R) bytes of each ARGB pixel into the HIGH byte of 16-bit
// lanes (low byte zeroed via index 128) so pmulhuw can scale them to 10 bits.
// (The original comment "RAW to RGB24. Last 8." was a stale copy-paste.)
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

// pmulhuw factors and field masks (see the ARGBToAR30Row design note above).
static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;  // R<<4 | B expand
static const uint32_t kMaskRB10 = 0x3ff003ff;  // keep 10-bit R and B fields
static const uint32_t kMaskAG10 = 0xc000ff00;  // keep 2-bit A, 8-bit G
static const uint32_t kMulAG10 = 64 * 65536 + 1028;
927
// Converts 4 ARGB pixels per loop pass to AR30 (2:10:10:10).  Channel bytes
// are positioned into 16-bit lanes, scaled to 10 bits with pmulhuw, masked
// and recombined -- see the design comment above for the multiplier math.
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"    // multiplier for RB
      "movd %5,%%xmm4 \n"    // mask for R10 B10
      "movd %6,%%xmm5 \n"    // mask for AG
      "movd %7,%%xmm6 \n"    // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      // Bias dst by -src so (%1,%0) addresses dst with a single counter.
      "sub %0,%1 \n"

      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // fetch 4 ARGB pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"  // R0B0
      "pand %%xmm5,%%xmm0 \n"    // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"     // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"       // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"      // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"

      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
966
// Converts 4 ABGR pixels per loop pass to AR30.  Identical to
// ARGBToAR30Row_SSSE3 except kShuffleBR30 swaps R and B while positioning.
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"    // multiplier for RB
      "movd %5,%%xmm4 \n"    // mask for R10 B10
      "movd %6,%%xmm5 \n"    // mask for AG
      "movd %7,%%xmm6 \n"    // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      // Bias dst by -src so (%1,%0) addresses dst with a single counter.
      "sub %0,%1 \n"

      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // fetch 4 ABGR pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"  // R0B0
      "pand %%xmm5,%%xmm0 \n"    // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"     // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"       // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"      // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"

      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(kShuffleBR30),  // %3 reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1005
1006 #ifdef HAS_ARGBTOAR30ROW_AVX2
// Convert 8 ARGB pixels (32 bytes) per loop to 8 AR30 pixels
// (2-bit alpha, 10-bit R/G/B packed into one dword each).
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"  // multipler for RB
      "vbroadcastss %5,%%ymm4 \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"  // mask for AG
      "vbroadcastss %7,%%ymm6 \n"  // multipler for AG
      "sub %0,%1 \n"  // dst addressed as (%1,%0): offset from src

      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ARGB pixels
      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
      "vpslld $10,%%ymm0,%%ymm0 \n"  // A2 x10 G10 x10
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
      "vmovdqu %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
      "add $0x20,%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"

      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),  // %4
        "m"(kMaskRB10),  // %5
        "m"(kMaskAG10),  // %6
        "m"(kMulAG10)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1041 #endif
1042
1043 #ifdef HAS_ABGRTOAR30ROW_AVX2
// Convert 8 ABGR pixels (32 bytes) per loop to 8 AR30 pixels.
// Identical to ARGBToAR30Row_AVX2 except it uses the reversed
// R/B shuffler (kShuffleBR30) to undo the ABGR channel order.
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"  // multipler for RB
      "vbroadcastss %5,%%ymm4 \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"  // mask for AG
      "vbroadcastss %7,%%ymm6 \n"  // multipler for AG
      "sub %0,%1 \n"  // dst addressed as (%1,%0): offset from src

      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ABGR pixels
      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
      "vpslld $10,%%ymm0,%%ymm0 \n"  // A2 x10 G10 x10
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
      "vmovdqu %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
      "add $0x20,%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"

      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleBR30),  // %3 reversed shuffler
        "m"(kMulRB10),  // %4
        "m"(kMaskRB10),  // %5
        "m"(kMaskAG10),  // %6
        "m"(kMulAG10)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1078 #endif
1079
// pshufb table that swaps the R and B bytes of each 4-byte pixel
// (ARGB <-> ABGR in memory order).
static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

// pshufb tables that duplicate each byte of the low/high 8 source bytes
// while swapping R and B: widens 8-bit ARGB channels to 16-bit AB64
// channels (value replicated into both halves of each 16-bit lane).
static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
                                           6, 6, 5, 5, 4, 4, 7, 7};
static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9,  9,  8,  8,  11, 11,
                                           14, 14, 13, 13, 12, 12, 15, 15};
1087
// Convert 4 ARGB pixels (16 bytes) per loop to 4 AR64 pixels
// (16 bits per channel; each 8-bit value widened by byte duplication,
// i.e. v -> v * 257).
void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"  // duplicate low 8 bytes into words
      "punpckhbw %%xmm1,%%xmm1 \n"  // duplicate high 8 bytes into words
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1111
// Convert 4 ARGB pixels (16 bytes) per loop to 4 AB64 pixels:
// widen each channel to 16 bits by byte duplication while swapping
// R and B, via the kShuffleARGBToAB64Lo/Hi pshufb tables.
void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
  asm volatile(

      "movdqa %3,%%xmm2 \n"  // low-half widen+swap shuffler
      "movdqa %4,%%xmm3 \n"  // high-half widen+swap shuffler
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm0 \n"
      "pshufb %%xmm3,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ab64),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
1137
// Convert 4 AR64 pixels (32 bytes) per loop to 4 ARGB pixels by
// keeping the high byte of each 16-bit channel (psrlw 8 + packuswb).
void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "psrlw $8,%%xmm0 \n"  // keep high byte of each channel
      "psrlw $8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1161
// Convert 4 AB64 pixels (32 bytes) per loop to 4 ARGB pixels: keep the
// high byte of each 16-bit channel, then swap R and B with
// kShuffleARGBToABGR to restore ARGB order.
void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      "movdqa %3,%%xmm2 \n"  // R/B swap shuffler
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "psrlw $8,%%xmm0 \n"  // keep high byte of each channel
      "psrlw $8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "pshufb %%xmm2,%%xmm0 \n"  // swap R and B
      "movdqu %%xmm0,(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ab64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
1186
1187 #ifdef HAS_ARGBTOAR64ROW_AVX2
// Convert 8 ARGB pixels (32 bytes) per loop to 8 AR64 pixels.
// vpermq pre-swizzles the lanes so the in-lane vpunpck results come
// out in linear memory order.
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // reorder lanes for unpack
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"  // duplicate bytes into words
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1211 #endif
1212
1213 #ifdef HAS_ARGBTOAB64ROW_AVX2
// Convert 8 ARGB pixels (32 bytes) per loop to 8 AB64 pixels: widen
// channels to 16 bits by byte duplication while swapping R and B.
void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2 \n"  // low-half widen+swap shuffler
      "vbroadcastf128 %4,%%ymm3 \n"  // high-half widen+swap shuffler
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // reorder lanes for shuffle
      "vpshufb %%ymm3,%%ymm0,%%ymm1 \n"
      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ab64),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
1239 #endif
1240
1241 #ifdef HAS_AR64TOARGBROW_AVX2
// Convert 8 AR64 pixels (64 bytes) per loop to 8 ARGB pixels by
// keeping the high byte of each 16-bit channel; vpermq undoes the
// lane interleave introduced by vpackuswb.
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpsrlw $8,%%ymm0,%%ymm0 \n"  // keep high byte of each channel
      "vpsrlw $8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // unmutate pack lane order
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x40(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1266 #endif
1267
1268 #ifdef HAS_AB64TOARGBROW_AVX2
// Convert 8 AB64 pixels (64 bytes) per loop to 8 ARGB pixels: keep the
// high byte of each 16-bit channel, then swap R and B with
// kShuffleARGBToABGR to restore ARGB order.
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2 \n"  // R/B swap shuffler
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpsrlw $8,%%ymm0,%%ymm0 \n"  // keep high byte of each channel
      "vpsrlw $8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // unmutate pack lane order
      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"  // swap R and B
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x40(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ab64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
1294 #endif
1295
1296 // clang-format off
1297
// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// round parameter is register containing value to add before shift.
// Shared SSSE3 loop body: converts 16 packed 4-byte pixels (64 bytes)
// per iteration to 16 Y bytes. Expects xmm4 = pmaddubsw coefficients,
// xmm5 = 128 bias (subtracted from each byte first), and "round" names
// the register added before the final >> 8. Mutates xmm0-xmm3, xmm6.
#define RGBTOY(round) \
  "1: \n" \
  "movdqu (%0),%%xmm0 \n" \
  "movdqu 0x10(%0),%%xmm1 \n" \
  "movdqu 0x20(%0),%%xmm2 \n" \
  "movdqu 0x30(%0),%%xmm3 \n" \
  "psubb %%xmm5,%%xmm0 \n" \
  "psubb %%xmm5,%%xmm1 \n" \
  "psubb %%xmm5,%%xmm2 \n" \
  "psubb %%xmm5,%%xmm3 \n" \
  "movdqu %%xmm4,%%xmm6 \n" \
  "pmaddubsw %%xmm0,%%xmm6 \n" \
  "movdqu %%xmm4,%%xmm0 \n" \
  "pmaddubsw %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm4,%%xmm1 \n" \
  "pmaddubsw %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm4,%%xmm2 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "lea 0x40(%0),%0 \n" \
  "phaddw %%xmm0,%%xmm6 \n" \
  "phaddw %%xmm2,%%xmm1 \n" \
  "prefetcht0 1280(%0) \n" \
  "paddw %%" #round ",%%xmm6 \n" \
  "paddw %%" #round ",%%xmm1 \n" \
  "psrlw $0x8,%%xmm6 \n" \
  "psrlw $0x8,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm6 \n" \
  "movdqu %%xmm6,(%1) \n" \
  "lea 0x10(%1),%1 \n" \
  "sub $0x10,%2 \n" \
  "jg 1b \n"
1331
// AVX2 variant of RGBTOY: 32 packed 4-byte pixels (128 bytes) per
// iteration to 32 Y bytes. Expects ymm4 = coefficients, ymm5 = 128 bias,
// ymm6 = vpermd lane order undoing vphaddw/vpackuswb mutation, and
// "round" names the register added before the final >> 8.
// Ends with vzeroupper, so callers need not emit one themselves.
#define RGBTOY_AVX2(round) \
  "1: \n" \
  "vmovdqu (%0),%%ymm0 \n" \
  "vmovdqu 0x20(%0),%%ymm1 \n" \
  "vmovdqu 0x40(%0),%%ymm2 \n" \
  "vmovdqu 0x60(%0),%%ymm3 \n" \
  "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
  "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
  "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
  "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
  "lea 0x80(%0),%0 \n" \
  "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
  "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
  "prefetcht0 1280(%0) \n" \
  "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
  "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
  "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
  "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
  "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
  "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
  "vmovdqu %%ymm0,(%1) \n" \
  "lea 0x20(%1),%1 \n" \
  "sub $0x20,%2 \n" \
  "jg 1b \n" \
  "vzeroupper \n"
1361
1362 // clang-format on
1363
1364 #ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Uses kARGBToY coefficients and adds 16 (kAddY16) for limited range.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"  // Y coefficients
      "movdqa %4,%%xmm5 \n"  // 128 bias
      "movdqa %5,%%xmm7 \n"  // rounding + 16 offset

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kARGBToY),  // %3
        "m"(kSub128),  // %4
        "m"(kAddY16)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1382 #endif // HAS_ARGBTOYROW_SSSE3
1383
1384 #ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"  // YJ coefficients
      "movdqa %4,%%xmm5 \n"  // 128 bias, doubles as rounding register

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1400 #endif // HAS_ARGBTOYJROW_SSSE3
1401
1402 #ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYJRow but with kRGBAToYJ (alpha byte first), no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"  // YJ coefficients (RGBA order)
      "movdqa %4,%%xmm5 \n"  // 128 bias, doubles as rounding register

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1418 #endif // HAS_RGBATOYJROW_SSSE3
1419
1420 #ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd: lane order that restores
// linear output order after the two lane-mutating instructions.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values (limited range,
// adds 16 via kAddY16).
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"  // Y coefficients
      "vbroadcastf128 %4,%%ymm5 \n"  // 128 bias
      "vbroadcastf128 %5,%%ymm7 \n"  // rounding + 16 offset
      "vmovdqu %6,%%ymm6 \n"  // vpermd unmutate order

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kARGBToY),  // %3
        "m"(kSub128),  // %4
        "m"(kAddY16),  // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1443 #endif // HAS_ARGBTOYROW_AVX2
1444
1445 #ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values (limited range).
// Identical to ARGBToYRow_AVX2 except for the kABGRToY coefficients.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"  // Y coefficients (ABGR order)
      "vbroadcastf128 %4,%%ymm5 \n"  // 128 bias
      "vbroadcastf128 %5,%%ymm7 \n"  // rounding + 16 offset
      "vmovdqu %6,%%ymm6 \n"  // vpermd unmutate order

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kABGRToY),  // %3
        "m"(kSub128),  // %4
        "m"(kAddY16),  // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1465 #endif // HAS_ABGRTOYROW_AVX2
1466
1467 #ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ (full range) values.
// Uses kARGBToYJ coefficients; ymm5 (128 bias) doubles as rounding.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"  // YJ coefficients
      "vbroadcastf128 %4,%%ymm5 \n"  // 128 bias / rounding
      "vmovdqu %5,%%ymm6 \n"  // vpermd unmutate order

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128),  // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1485 #endif // HAS_ARGBTOYJROW_AVX2
1486
1487 #ifdef HAS_RGBATOYJROW_AVX2
1488 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
RGBAToYJRow_AVX2(const uint8_t * src_rgba,uint8_t * dst_y,int width)1489 void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1490 asm volatile(
1491 "vbroadcastf128 %3,%%ymm4 \n"
1492 "vbroadcastf128 %4,%%ymm5 \n"
1493 "vmovdqu %5,%%ymm6 \n"
1494
1495 LABELALIGN RGBTOY_AVX2(
1496 ymm5) "vzeroupper \n"
1497 : "+r"(src_rgba), // %0
1498 "+r"(dst_y), // %1
1499 "+r"(width) // %2
1500 : "m"(kRGBAToYJ), // %3
1501 "m"(kSub128), // %4
1502 "m"(kPermdARGBToY_AVX) // %5
1503 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1504 }
1505 #endif // HAS_RGBATOYJROW_AVX2
1506
1507 #ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16 ARGB pixels from two rows (src and src + stride) per loop
// to 8 U and 8 V values, 2x2 box-subsampled, using the kARGBToU /
// kARGBToV coefficients with +128 bias.
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"  // V coefficients
      "movdqa %6,%%xmm4 \n"  // U coefficients
      "movdqa %7,%%xmm5 \n"  // 128 bias
      "sub %1,%2 \n"  // dst_v addressed relative to dst_u

      LABELALIGN
      "1: \n"
      // Average each pixel with the one directly below it.
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      // Average horizontally adjacent pixel pairs.
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      // Weighted sums for U (xmm4) and V (xmm3), scale, pack, bias.
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"  // 8 U bytes
      "movhps %%xmm0,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+rm"(width)  // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),  // %5
        "m"(kARGBToU),  // %6
        "m"(kAddUV128)  // %7
      // NOTE(review): xmm3-xmm5 are written above but absent from the
      // clobber list — confirm this is intentional.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
1570 #endif // HAS_ARGBTOUVROW_SSSE3
1571
1572 #ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts: restores in-lane
// byte order after the two mutating instructions.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};

// Convert 32 ARGB pixels from two rows per loop to 16 U and 16 V
// values, 2x2 box-subsampled, using kARGBToU / kARGBToV with +128 bias.
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"  // 128 bias
      "vbroadcastf128 %6,%%ymm6 \n"  // V coefficients
      "vbroadcastf128 %7,%%ymm7 \n"  // U coefficients
      "sub %1,%2 \n"  // dst_v addressed relative to dst_u

      LABELALIGN
      "1: \n"
      // Average each pixel with the one directly below it.
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      // Average horizontally adjacent pixel pairs.
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      // Weighted sums for U (ymm7) and V (ymm6), scale, pack, bias.
      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"  // unmutate pack order
      "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"  // 16 U bytes
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"  // 16 V bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+rm"(width)  // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),  // %5
        "m"(kARGBToV),  // %6
        "m"(kARGBToU),  // %7
        "m"(kShufARGBToUV_AVX)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1637 #endif // HAS_ARGBTOUVROW_AVX2
1638
1639 #ifdef HAS_ABGRTOUVROW_AVX2
// Convert 32 ABGR pixels from two rows per loop to 16 U and 16 V
// values, 2x2 box-subsampled. Identical to ARGBToUVRow_AVX2 except
// for the kABGRToU / kABGRToV coefficients.
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"  // 128 bias
      "vbroadcastf128 %6,%%ymm6 \n"  // V coefficients
      "vbroadcastf128 %7,%%ymm7 \n"  // U coefficients
      "sub %1,%2 \n"  // dst_v addressed relative to dst_u

      LABELALIGN
      "1: \n"
      // Average each pixel with the one directly below it.
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      // Average horizontally adjacent pixel pairs.
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      // Weighted sums for U (ymm7) and V (ymm6), scale, pack, bias.
      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"  // unmutate pack order
      "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"  // 16 U bytes
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"  // 16 V bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+rm"(width)  // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kAddUV128),  // %5
        "m"(kABGRToV),  // %6
        "m"(kABGRToU),  // %7
        "m"(kShufARGBToUV_AVX)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1700 #endif // HAS_ABGRTOUVROW_AVX2
1701
1702 #ifdef HAS_ARGBTOUVJROW_AVX2
// Convert 32 ARGB pixels from two rows per loop to 16 U and 16 V
// J (full-range) values, 2x2 box-subsampled. Unlike ARGBToUVRow_AVX2
// this adds kSub128 as a word-level rounding term before the shift
// instead of a byte-level +128 bias after packing.
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"  // rounding term
      "vbroadcastf128 %6,%%ymm6 \n"  // VJ coefficients
      "vbroadcastf128 %7,%%ymm7 \n"  // UJ coefficients
      "sub %1,%2 \n"  // dst_v addressed relative to dst_u

      LABELALIGN
      "1: \n"
      // Average each pixel with the one directly below it.
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      // Average horizontally adjacent pixel pairs.
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      // Weighted sums for U (ymm7) and V (ymm6), round, scale, pack.
      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"  // unmutate pack order

      "vextractf128 $0x0,%%ymm0,(%1) \n"  // 16 U bytes
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"  // 16 V bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+rm"(width)  // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kSub128),  // %5
        "m"(kARGBToVJ),  // %6
        "m"(kARGBToUJ),  // %7
        "m"(kShufARGBToUV_AVX)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1764 #endif // HAS_ARGBTOUVJROW_AVX2
1765
1766 #ifdef HAS_ARGBTOUVJROW_SSSE3
// Convert 16 ARGB pixels from two rows per loop to 8 U and 8 V
// J (full-range) values, 2x2 box-subsampled, using kARGBToUJ /
// kARGBToVJ with a word-level rounding add before the shift.
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"  // VJ coefficients
      "movdqa %6,%%xmm4 \n"  // UJ coefficients
      "movdqa %7,%%xmm5 \n"  // rounding term
      "sub %1,%2 \n"  // dst_v addressed relative to dst_u

      LABELALIGN
      "1: \n"
      // Average each pixel with the one directly below it.
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      // Average horizontally adjacent pixel pairs.
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      // Weighted sums for U (xmm4) and V (xmm3), round, scale, pack.
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "paddw %%xmm5,%%xmm0 \n"
      "paddw %%xmm5,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"  // 8 U bytes
      "movhps %%xmm0,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+rm"(width)  // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToVJ),  // %5
        "m"(kARGBToUJ),  // %6
        "m"(kSub128)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
1830 #endif // HAS_ARGBTOUVJROW_SSSE3
1831
1832 #ifdef HAS_ARGBTOUV444ROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) per loop to 16 U and 16 V values
// with no subsampling (4:4:4): the source block is read twice, once
// through the U coefficients (xmm4) and once through V (xmm3).
void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "movdqa %4,%%xmm3 \n"  // V coefficients
      "movdqa %5,%%xmm4 \n"  // U coefficients
      "movdqa %6,%%xmm5 \n"  // 128 bias
      "sub %1,%2 \n"  // dst_v addressed relative to dst_u

      LABELALIGN
      "1: \n"
      // U pass.
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm6 \n"
      "phaddw %%xmm1,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm2 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "packsswb %%xmm2,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      // V pass over the same 16 pixels.
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm1,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm2 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "packsswb %%xmm2,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "lea 0x40(%0),%0 \n"
      "movdqu %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+rm"(width)  // %3
      : "m"(kARGBToV),  // %4
        "m"(kARGBToU),  // %5
        "m"(kAddUV128)  // %6
      // NOTE(review): xmm3-xmm5 are written above but absent from the
      // clobber list — confirm this is intentional.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
}
1888 #endif // HAS_ARGBTOUV444ROW_SSSE3
1889
// Convert 16 BGRA pixels (64 bytes) to 16 Y values (limited range,
// adds 16 via kAddY16), using kBGRAToY coefficients.
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"  // Y coefficients (BGRA order)
      "movdqa %4,%%xmm5 \n"  // 128 bias
      "movdqa %5,%%xmm7 \n"  // rounding + 16 offset

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kBGRAToY),  // %3
        "m"(kSub128),  // %4
        "m"(kAddY16)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1906
// Convert 16 BGRA pixels from two rows per loop to 8 U and 8 V values,
// 2x2 box-subsampled. Identical to ARGBToUVRow_SSSE3 except for the
// kBGRAToU / kBGRAToV coefficients.
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"  // V coefficients
      "movdqa %6,%%xmm4 \n"  // U coefficients
      "movdqa %7,%%xmm5 \n"  // 128 bias
      "sub %1,%2 \n"  // dst_v addressed relative to dst_u

      LABELALIGN
      "1: \n"
      // Average each pixel with the one directly below it.
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      // Average horizontally adjacent pixel pairs.
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      // Weighted sums for U (xmm4) and V (xmm3), scale, pack, bias.
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"  // 8 U bytes
      "movhps %%xmm0,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_bgra),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+rm"(width)  // %3
      : "r"((intptr_t)(src_stride_bgra)),  // %4
        "m"(kBGRAToV),  // %5
        "m"(kBGRAToU),  // %6
        "m"(kAddUV128)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
1969
// Convert 16 ABGR pixels (64 bytes) to 16 Y values (limited range,
// adds 16 via kAddY16), using kABGRToY coefficients.
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"  // Y coefficients (ABGR order)
      "movdqa %4,%%xmm5 \n"  // 128 bias
      "movdqa %5,%%xmm7 \n"  // rounding + 16 offset

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)  // %2
      : "m"(kABGRToY),  // %3
        "m"(kSub128),  // %4
        "m"(kAddY16)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
1986
// Convert a row of RGBA pixels to Y (luma) bytes using SSSE3.
// Identical structure to ABGRToYRow_SSSE3, differing only in the
// channel-order coefficient vector kRGBAToY.
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      // xmm4 = kRGBAToY coefficients, xmm5 = kSub128 bias, xmm7 = kAddY16.
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      "movdqa %5,%%xmm7 \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
2003
ABGRToUVRow_SSSE3(const uint8_t * src_abgr,int src_stride_abgr,uint8_t * dst_u,uint8_t * dst_v,int width)2004 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
2005 int src_stride_abgr,
2006 uint8_t* dst_u,
2007 uint8_t* dst_v,
2008 int width) {
2009 asm volatile(
2010 "movdqa %5,%%xmm3 \n"
2011 "movdqa %6,%%xmm4 \n"
2012 "movdqa %7,%%xmm5 \n"
2013 "sub %1,%2 \n"
2014
2015 LABELALIGN
2016 "1: \n"
2017 "movdqu (%0),%%xmm0 \n"
2018 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
2019 "pavgb %%xmm7,%%xmm0 \n"
2020 "movdqu 0x10(%0),%%xmm1 \n"
2021 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
2022 "pavgb %%xmm7,%%xmm1 \n"
2023 "movdqu 0x20(%0),%%xmm2 \n"
2024 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
2025 "pavgb %%xmm7,%%xmm2 \n"
2026 "movdqu 0x30(%0),%%xmm6 \n"
2027 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
2028 "pavgb %%xmm7,%%xmm6 \n"
2029
2030 "lea 0x40(%0),%0 \n"
2031 "movdqa %%xmm0,%%xmm7 \n"
2032 "shufps $0x88,%%xmm1,%%xmm0 \n"
2033 "shufps $0xdd,%%xmm1,%%xmm7 \n"
2034 "pavgb %%xmm7,%%xmm0 \n"
2035 "movdqa %%xmm2,%%xmm7 \n"
2036 "shufps $0x88,%%xmm6,%%xmm2 \n"
2037 "shufps $0xdd,%%xmm6,%%xmm7 \n"
2038 "pavgb %%xmm7,%%xmm2 \n"
2039 "movdqa %%xmm0,%%xmm1 \n"
2040 "movdqa %%xmm2,%%xmm6 \n"
2041 "pmaddubsw %%xmm4,%%xmm0 \n"
2042 "pmaddubsw %%xmm4,%%xmm2 \n"
2043 "pmaddubsw %%xmm3,%%xmm1 \n"
2044 "pmaddubsw %%xmm3,%%xmm6 \n"
2045 "phaddw %%xmm2,%%xmm0 \n"
2046 "phaddw %%xmm6,%%xmm1 \n"
2047 "psraw $0x8,%%xmm0 \n"
2048 "psraw $0x8,%%xmm1 \n"
2049 "packsswb %%xmm1,%%xmm0 \n"
2050 "paddb %%xmm5,%%xmm0 \n"
2051 "movlps %%xmm0,(%1) \n"
2052 "movhps %%xmm0,0x00(%1,%2,1) \n"
2053 "lea 0x8(%1),%1 \n"
2054 "sub $0x10,%3 \n"
2055 "jg 1b \n"
2056 : "+r"(src_abgr), // %0
2057 "+r"(dst_u), // %1
2058 "+r"(dst_v), // %2
2059 "+rm"(width) // %3
2060 : "r"((intptr_t)(src_stride_abgr)), // %4
2061 "m"(kABGRToV), // %5
2062 "m"(kABGRToU), // %6
2063 "m"(kAddUV128) // %7
2064 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2065 }
2066
RGBAToUVRow_SSSE3(const uint8_t * src_rgba,int src_stride_rgba,uint8_t * dst_u,uint8_t * dst_v,int width)2067 void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
2068 int src_stride_rgba,
2069 uint8_t* dst_u,
2070 uint8_t* dst_v,
2071 int width) {
2072 asm volatile(
2073 "movdqa %5,%%xmm3 \n"
2074 "movdqa %6,%%xmm4 \n"
2075 "movdqa %7,%%xmm5 \n"
2076 "sub %1,%2 \n"
2077
2078 LABELALIGN
2079 "1: \n"
2080 "movdqu (%0),%%xmm0 \n"
2081 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
2082 "pavgb %%xmm7,%%xmm0 \n"
2083 "movdqu 0x10(%0),%%xmm1 \n"
2084 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
2085 "pavgb %%xmm7,%%xmm1 \n"
2086 "movdqu 0x20(%0),%%xmm2 \n"
2087 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
2088 "pavgb %%xmm7,%%xmm2 \n"
2089 "movdqu 0x30(%0),%%xmm6 \n"
2090 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
2091 "pavgb %%xmm7,%%xmm6 \n"
2092
2093 "lea 0x40(%0),%0 \n"
2094 "movdqa %%xmm0,%%xmm7 \n"
2095 "shufps $0x88,%%xmm1,%%xmm0 \n"
2096 "shufps $0xdd,%%xmm1,%%xmm7 \n"
2097 "pavgb %%xmm7,%%xmm0 \n"
2098 "movdqa %%xmm2,%%xmm7 \n"
2099 "shufps $0x88,%%xmm6,%%xmm2 \n"
2100 "shufps $0xdd,%%xmm6,%%xmm7 \n"
2101 "pavgb %%xmm7,%%xmm2 \n"
2102 "movdqa %%xmm0,%%xmm1 \n"
2103 "movdqa %%xmm2,%%xmm6 \n"
2104 "pmaddubsw %%xmm4,%%xmm0 \n"
2105 "pmaddubsw %%xmm4,%%xmm2 \n"
2106 "pmaddubsw %%xmm3,%%xmm1 \n"
2107 "pmaddubsw %%xmm3,%%xmm6 \n"
2108 "phaddw %%xmm2,%%xmm0 \n"
2109 "phaddw %%xmm6,%%xmm1 \n"
2110 "psraw $0x8,%%xmm0 \n"
2111 "psraw $0x8,%%xmm1 \n"
2112 "packsswb %%xmm1,%%xmm0 \n"
2113 "paddb %%xmm5,%%xmm0 \n"
2114 "movlps %%xmm0,(%1) \n"
2115 "movhps %%xmm0,0x00(%1,%2,1) \n"
2116 "lea 0x8(%1),%1 \n"
2117 "sub $0x10,%3 \n"
2118 "jg 1b \n"
2119 : "+r"(src_rgba), // %0
2120 "+r"(dst_u), // %1
2121 "+r"(dst_v), // %2
2122 "+rm"(width) // %3
2123 : "r"((intptr_t)(src_stride_rgba)), // %4
2124 "m"(kRGBAToV), // %5
2125 "m"(kRGBAToU), // %6
2126 "m"(kAddUV128) // %7
2127 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2128 }
2129
2130 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
2131
// Pixel-fetch macro fragments shared by the YUV-to-RGB row converters below.
// Common register contract (established by the instructions in each macro):
//   xmm3 = 8 interleaved U/V byte pairs, xmm4 = 8 Y values (either bytes
//   duplicated into 16-bit lanes via punpcklbw, or 16-bit Y shifted into
//   position), and for the "A" variants xmm5 = 8 alpha bytes.
// Callers subtract u_buf from v_buf beforehand, so v_buf is used as a
// displacement off u_buf in the indexed loads.

// Read 8 UV from 444
#define READYUV444 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
  "movd (%[u_buf]),%%xmm3 \n" \
  "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x4(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422 10 bit, upsample to 8 UV
// TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV210 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $2,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Like READYUV210, but additionally reads 8 16-bit alpha values, shifts
// them from 10 bit to 8 bit and packs them into xmm5.
#define READYUVA210 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $2,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "movdqu (%[a_buf]),%%xmm5 \n" \
  "psraw $2,%%xmm5 \n" \
  "packuswb %%xmm5,%%xmm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 444 10 bit
#define READYUV410 \
  "movdqu (%[u_buf]),%%xmm3 \n" \
  "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "psraw $2,%%xmm3 \n" \
  "psraw $2,%%xmm2 \n" \
  "movdqa %%xmm3,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm3 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 444 10 bit. With 8 Alpha.
#define READYUVA410 \
  "movdqu (%[u_buf]),%%xmm3 \n" \
  "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "psraw $2,%%xmm3 \n" \
  "psraw $2,%%xmm2 \n" \
  "movdqa %%xmm3,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm3 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $0x6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "movdqu (%[a_buf]),%%xmm5 \n" \
  "psraw $2,%%xmm5 \n" \
  "packuswb %%xmm5,%%xmm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 4 UV from 422 12 bit, upsample to 8 UV
#define READYUV212 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $0x4,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $0x4,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  "movd (%[u_buf]),%%xmm3 \n" \
  "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x4(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n" \
  "movq (%[a_buf]),%%xmm5 \n" \
  "lea 0x8(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 444. With 8 Alpha.
#define READYUVA444 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n" \
  "movq (%[a_buf]),%%xmm5 \n" \
  "lea 0x8(%[a_buf]),%[a_buf] \n"

// Read 4 UV from NV12 (already interleaved), upsample to 8 UV
#define READNV12 \
  "movq (%[uv_buf]),%%xmm3 \n" \
  "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 VU from NV21, upsample to 8 UV (kShuffleNV21 swaps V/U order)
#define READNV21 \
  "movq (%[vu_buf]),%%xmm3 \n" \
  "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
  "pshufb %[kShuffleNV21], %%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2 \
  "movdqu (%[yuy2_buf]),%%xmm4 \n" \
  "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  "movdqu (%[yuy2_buf]),%%xmm3 \n" \
  "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
  "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY \
  "movdqu (%[uyvy_buf]),%%xmm4 \n" \
  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  "movdqu (%[uyvy_buf]),%%xmm3 \n" \
  "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
  "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"

// Read 4 UV from P210 (16-bit, data in high bits), upsample to 8 UV
#define READP210 \
  "movdqu (%[uv_buf]),%%xmm3 \n" \
  "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
  "psrlw $0x8,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from P410 (16-bit, data in high bits)
#define READP410 \
  "movdqu (%[uv_buf]),%%xmm3 \n" \
  "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \
  "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
  "psrlw $0x8,%%xmm3 \n" \
  "psrlw $0x8,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"
2311
#if defined(__x86_64__)
// 64 bit: preload the YuvConstants table into xmm8-xmm12 once, outside the
// pixel loop, and build the 0x80 bias in xmm13.  pcmpeqb/psllw $7 produces
// 0xFF80 per 16-bit lane; pshufb with a zeroed xmm12 broadcasts byte 0,
// leaving 0x80 in every byte of xmm13.  xmm12 is then reloaded with the
// bias row at yuvconstants+128.
#define YUVTORGB_SETUP(yuvconstants) \
  "pcmpeqb %%xmm13,%%xmm13 \n" \
  "movdqa (%[yuvconstants]),%%xmm8 \n" \
  "pxor %%xmm12,%%xmm12 \n" \
  "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
  "psllw $7,%%xmm13 \n" \
  "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
  "pshufb %%xmm12,%%xmm13 \n" \
  "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
  "movdqa 128(%[yuvconstants]),%%xmm12 \n"

// Convert 8 pixels: 8 UV and 8 Y
// In: xmm3 = interleaved UV, xmm4 = Y.  Out: xmm0/xmm1/xmm2 hold 16-bit
// B/G/R channel values (G negated path handled via psubsw).
#define YUVTORGB16(yuvconstants) \
  "psubb %%xmm13,%%xmm3 \n" \
  "pmulhuw %%xmm11,%%xmm4 \n" \
  "movdqa %%xmm8,%%xmm0 \n" \
  "movdqa %%xmm9,%%xmm1 \n" \
  "movdqa %%xmm10,%%xmm2 \n" \
  "paddw %%xmm12,%%xmm4 \n" \
  "pmaddubsw %%xmm3,%%xmm0 \n" \
  "pmaddubsw %%xmm3,%%xmm1 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psubsw %%xmm1,%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm1 \n"

// Trailing comma is intentional: this expands in the middle of clobber lists.
#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else
// 32 bit: only xmm0-xmm7 exist, so the constants cannot stay resident; the
// setup is empty and YUVTORGB16 reloads them from memory every iteration.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
  "pcmpeqb %%xmm0,%%xmm0 \n" \
  "pxor %%xmm1,%%xmm1 \n" \
  "psllw $7,%%xmm0 \n" \
  "pshufb %%xmm1,%%xmm0 \n" \
  "psubb %%xmm0,%%xmm3 \n" \
  "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
  "movdqa (%[yuvconstants]),%%xmm0 \n" \
  "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
  "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
  "pmaddubsw %%xmm3,%%xmm0 \n" \
  "pmaddubsw %%xmm3,%%xmm1 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
  "paddw %%xmm3,%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psubsw %%xmm1,%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm1 \n"

#define YUVTORGB_REGS
#endif
2367
// Full conversion: 16-bit intermediate channels from YUVTORGB16, shifted
// down by 6 fixed-point bits and saturated to 8-bit in xmm0/xmm1/xmm2.
#define YUVTORGB(yuvconstants) \
  YUVTORGB16(yuvconstants) \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"

// Store 8 ARGB values.  Expects B/G/R in xmm0/xmm1/xmm2 and alpha in xmm5;
// interleaves them into two 16-byte stores.
#define STOREARGB \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklbw %%xmm5,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm0,(%[dst_argb]) \n" \
  "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
  "lea 0x20(%[dst_argb]), %[dst_argb] \n"

// Store 8 RGBA values.  Alpha (all-ones via pcmpeqb) goes in the low byte.
#define STORERGBA \
  "pcmpeqb %%xmm5,%%xmm5 \n" \
  "punpcklbw %%xmm2,%%xmm1 \n" \
  "punpcklbw %%xmm0,%%xmm5 \n" \
  "movdqa %%xmm5,%%xmm0 \n" \
  "punpcklwd %%xmm1,%%xmm5 \n" \
  "punpckhwd %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm5,(%[dst_rgba]) \n" \
  "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
  "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"

// Store 8 AR30 values.  Takes the 16-bit channels from YUVTORGB16, clamps
// them to [0, 1023] using xmm6 (zero) and xmm7 (1023), then packs
// 2-bit alpha (xmm5) + 10-bit R/G/B into 32-bit words.
#define STOREAR30 \
  "psraw $0x4,%%xmm0 \n" \
  "psraw $0x4,%%xmm1 \n" \
  "psraw $0x4,%%xmm2 \n" \
  "pminsw %%xmm7,%%xmm0 \n" \
  "pminsw %%xmm7,%%xmm1 \n" \
  "pminsw %%xmm7,%%xmm2 \n" \
  "pmaxsw %%xmm6,%%xmm0 \n" \
  "pmaxsw %%xmm6,%%xmm1 \n" \
  "pmaxsw %%xmm6,%%xmm2 \n" \
  "psllw $0x4,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm3 \n" \
  "movdqa %%xmm1,%%xmm2 \n" \
  "punpcklwd %%xmm5,%%xmm1 \n" \
  "punpckhwd %%xmm5,%%xmm2 \n" \
  "pslld $0xa,%%xmm1 \n" \
  "pslld $0xa,%%xmm2 \n" \
  "por %%xmm1,%%xmm0 \n" \
  "por %%xmm2,%%xmm3 \n" \
  "movdqu %%xmm0,(%[dst_ar30]) \n" \
  "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
  "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
2425
// Convert a row of planar 4:4:4 YUV to ARGB using SSSE3.
// 8 pixels per loop iteration; alpha forced to 0xff (xmm5 all-ones).
void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes a displacement off u_buf for READYUV444's indexed load.
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff

  LABELALIGN
    "1: \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2454
2455 #ifdef HAS_I444ALPHATOARGBROW_SSSE3
// Convert a row of planar 4:4:4 YUV plus a separate alpha plane to ARGB.
// 8 pixels per loop; alpha bytes come from a_buf via READYUVA444 (xmm5).
// On i386 the "+m" width constraint (and subl) avoids running out of
// registers given the extra a_buf operand.
void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"

  LABELALIGN
    "1: \n"
    READYUVA444
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2491 #endif // HAS_I444ALPHATOARGBROW_SSSE3
2492
// Convert a row of planar 4:2:2 YUV to packed 24-bit RGB using SSSE3.
// 8 pixels per loop; the two shuffle masks compress the 32-bit ARGB
// intermediate down to 24 bytes per 8 pixels (8 + 16 byte stores).
void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
    "sub %[u_buf],%[v_buf] \n"

  LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    // Interleave B/G/R into ARGB layout, then shuffle away the alpha lane.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0,(%[dst_rgb24]) \n"
    "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
    "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
2538
// Convert a row of planar 4:2:2 YUV to ARGB using SSSE3.
// 8 pixels per loop; alpha forced to 0xff (xmm5 all-ones).
void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff

  LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2567
// Convert a row of planar 4:2:2 YUV to packed 2:10:10:10 AR30 using SSSE3.
// Uses the 16-bit YUVTORGB16 path so 10 bits of precision survive, then
// STOREAR30 clamps with xmm6/xmm7 and packs.  8 pixels per loop.
void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // AR30 constants
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

  LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2601
2602 // 10 bit YUV to ARGB
// 10 bit YUV to ARGB
// Convert a row of 10-bit 4:2:2 YUV (16-bit samples) to 8-bit ARGB.
// READYUV210 shifts the samples into 8-bit range; alpha forced to 0xff.
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff

  LABELALIGN
    "1: \n"
    READYUV210
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2631
2632 // 12 bit YUV to ARGB
// 12 bit YUV to ARGB
// Same structure as I210ToARGBRow_SSSE3 but READYUV212 uses 4-bit shifts
// for the 12-bit samples.
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff

  LABELALIGN
    "1: \n"
    READYUV212
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2661
2662 // 10 bit YUV to AR30
// 10 bit YUV to AR30
// Convert a row of 10-bit 4:2:2 YUV to 2:10:10:10 AR30, keeping 10-bit
// precision via the YUVTORGB16 path.  8 pixels per loop.
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

  LABELALIGN
    "1: \n"
    READYUV210
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2696
2697 // 12 bit YUV to AR30
// 12 bit YUV to AR30
// Same structure as I210ToAR30Row_SSSE3 but sources 12-bit samples via
// READYUV212.
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

  LABELALIGN
    "1: \n"
    READYUV212
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2731
2732 // 10 bit YUV to ARGB
// 10 bit YUV to ARGB
// Convert a row of 10-bit 4:4:4 YUV (16-bit samples) to 8-bit ARGB.
// READYUV410 narrows the samples; alpha forced to 0xff.
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff

  LABELALIGN
    "1: \n"
    READYUV410
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2760
2761 #ifdef HAS_I210ALPHATOARGBROW_SSSE3
2762 // 10 bit YUVA to ARGB
// 10 bit YUVA to ARGB
// Convert a row of 10-bit 4:2:2 YUV plus a 10-bit alpha plane to 8-bit
// ARGB.  READYUVA210 narrows all four planes; 8 pixels per loop.
void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  asm volatile(
      YUVTORGB_SETUP(
          yuvconstants) "sub %[u_buf],%[v_buf] \n"

      LABELALIGN "1: \n" READYUVA210
          YUVTORGB(yuvconstants) STOREARGB
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf] "+r"(y_buf),  // %[y_buf]
        [u_buf] "+r"(u_buf),  // %[u_buf]
        [v_buf] "+r"(v_buf),  // %[v_buf]
        [a_buf] "+r"(a_buf),  // %[a_buf]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
        [width] "+m"(width)  // %[width]
#else
        [width] "+rm"(width)  // %[width]
#endif
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5");
}
2792 #endif
2793
2794 #ifdef HAS_I410ALPHATOARGBROW_SSSE3
2795 // 10 bit YUVA to ARGB
// 10 bit YUVA to ARGB
// Convert a row of 10-bit 4:4:4 YUV plus a 10-bit alpha plane to 8-bit
// ARGB.  READYUVA410 narrows all four planes; 8 pixels per loop.
void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"

  LABELALIGN
    "1: \n"
    READYUVA410
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf] "+r"(y_buf),  // %[y_buf]
    [u_buf] "+r"(u_buf),  // %[u_buf]
    [v_buf] "+r"(v_buf),  // %[v_buf]
    [a_buf] "+r"(a_buf),  // %[a_buf]
    [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width] "+m"(width)  // %[width]
#else
    [width] "+rm"(width)  // %[width]
#endif
  : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
    "xmm5");
  // clang-format on
}
2830 #endif
2831
2832 // 10 bit YUV to AR30
// 10 bit YUV to AR30
// Convert a row of 10-bit 4:4:4 YUV to 2:10:10:10 AR30, keeping 10-bit
// precision via the YUVTORGB16 path.  8 pixels per loop.
void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

  LABELALIGN
    "1: \n"
    READYUV410
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2866
2867 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Convert a row of planar 4:2:2 YUV plus a separate alpha plane to ARGB.
// 8 pixels per loop; alpha bytes come from a_buf via READYUVA422 (xmm5).
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"

  LABELALIGN
    "1: \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2903 #endif // HAS_I422ALPHATOARGBROW_SSSE3
2904
// Convert a row of NV12 (Y plane + interleaved UV plane) to ARGB.
// 8 pixels per loop; alpha forced to 0xff (xmm5 all-ones).
void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff

  LABELALIGN
    "1: \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),  // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2932
// Convert a row of NV21 (Y plane + interleaved VU plane) to ARGB.
// READNV21 uses kShuffleNV21 to swap V/U into UV order; alpha = 0xff.
void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"  // alpha = 0xff

  LABELALIGN
    "1: \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),  // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2961
YUY2ToARGBRow_SSSE3(const uint8_t * yuy2_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)2962 void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
2963 uint8_t* dst_argb,
2964 const struct YuvConstants* yuvconstants,
2965 int width) {
2966 // clang-format off
2967 asm volatile (
2968 YUVTORGB_SETUP(yuvconstants)
2969 "pcmpeqb %%xmm5,%%xmm5 \n"
2970
2971 LABELALIGN
2972 "1: \n"
2973 READYUY2
2974 YUVTORGB(yuvconstants)
2975 STOREARGB
2976 "sub $0x8,%[width] \n"
2977 "jg 1b \n"
2978 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2979 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2980 [width]"+rm"(width) // %[width]
2981 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2982 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2983 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2984 : "memory", "cc", YUVTORGB_REGS
2985 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2986 );
2987 // clang-format on
2988 }
2989
UYVYToARGBRow_SSSE3(const uint8_t * uyvy_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)2990 void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
2991 uint8_t* dst_argb,
2992 const struct YuvConstants* yuvconstants,
2993 int width) {
2994 // clang-format off
2995 asm volatile (
2996 YUVTORGB_SETUP(yuvconstants)
2997 "pcmpeqb %%xmm5,%%xmm5 \n"
2998
2999 LABELALIGN
3000 "1: \n"
3001 READUYVY
3002 YUVTORGB(yuvconstants)
3003 STOREARGB
3004 "sub $0x8,%[width] \n"
3005 "jg 1b \n"
3006 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
3007 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3008 [width]"+rm"(width) // %[width]
3009 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
3010 [kShuffleUYVYY]"m"(kShuffleUYVYY),
3011 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
3012 : "memory", "cc", YUVTORGB_REGS
3013 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3014 );
3015 // clang-format on
3016 }
3017
P210ToARGBRow_SSSE3(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3018 void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
3019 const uint16_t* uv_buf,
3020 uint8_t* dst_argb,
3021 const struct YuvConstants* yuvconstants,
3022 int width) {
3023 asm volatile(
3024 YUVTORGB_SETUP(
3025 yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
3026
3027 LABELALIGN "1: \n" READP210
3028 YUVTORGB(yuvconstants) STOREARGB
3029 "sub $0x8,%[width] \n"
3030 "jg 1b \n"
3031 : [y_buf] "+r"(y_buf), // %[y_buf]
3032 [uv_buf] "+r"(uv_buf), // %[u_buf]
3033 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3034 [width] "+rm"(width) // %[width]
3035 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3036 : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
3037 "xmm5");
3038 }
3039
P410ToARGBRow_SSSE3(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3040 void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
3041 const uint16_t* uv_buf,
3042 uint8_t* dst_argb,
3043 const struct YuvConstants* yuvconstants,
3044 int width) {
3045 asm volatile(
3046 YUVTORGB_SETUP(
3047 yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
3048
3049 LABELALIGN "1: \n" READP410
3050 YUVTORGB(yuvconstants) STOREARGB
3051 "sub $0x8,%[width] \n"
3052 "jg 1b \n"
3053 : [y_buf] "+r"(y_buf), // %[y_buf]
3054 [uv_buf] "+r"(uv_buf), // %[u_buf]
3055 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3056 [width] "+rm"(width) // %[width]
3057 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3058 : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
3059 "xmm5");
3060 }
3061
P210ToAR30Row_SSSE3(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)3062 void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
3063 const uint16_t* uv_buf,
3064 uint8_t* dst_ar30,
3065 const struct YuvConstants* yuvconstants,
3066 int width) {
3067 asm volatile (
3068 YUVTORGB_SETUP(yuvconstants)
3069 "pcmpeqb %%xmm5,%%xmm5 \n"
3070 "psrlw $14,%%xmm5 \n"
3071 "psllw $4,%%xmm5 \n" // 2 alpha bits
3072 "pxor %%xmm6,%%xmm6 \n" // 0 for min
3073 "pcmpeqb %%xmm7,%%xmm7 \n"
3074 "psrlw $6,%%xmm7 \n" // 1023 for max
3075
3076 LABELALIGN
3077 "1: \n"
3078 READP210
3079 YUVTORGB16(yuvconstants)
3080 STOREAR30
3081 "sub $0x8,%[width] \n"
3082 "jg 1b \n"
3083 : [y_buf]"+r"(y_buf), // %[y_buf]
3084 [uv_buf]"+r"(uv_buf), // %[uv_buf]
3085 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3086 [width]"+rm"(width) // %[width]
3087 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3088 : "memory", "cc", YUVTORGB_REGS
3089 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3090 );
3091 }
3092
P410ToAR30Row_SSSE3(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)3093 void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
3094 const uint16_t* uv_buf,
3095 uint8_t* dst_ar30,
3096 const struct YuvConstants* yuvconstants,
3097 int width) {
3098 asm volatile (
3099 YUVTORGB_SETUP(yuvconstants)
3100 "pcmpeqb %%xmm5,%%xmm5 \n"
3101 "psrlw $14,%%xmm5 \n"
3102 "psllw $4,%%xmm5 \n" // 2 alpha bits
3103 "pxor %%xmm6,%%xmm6 \n" // 0 for min
3104 "pcmpeqb %%xmm7,%%xmm7 \n"
3105 "psrlw $6,%%xmm7 \n" // 1023 for max
3106
3107 LABELALIGN
3108 "1: \n"
3109 READP410
3110 YUVTORGB16(yuvconstants)
3111 STOREAR30
3112 "sub $0x8,%[width] \n"
3113 "jg 1b \n"
3114 : [y_buf]"+r"(y_buf), // %[y_buf]
3115 [uv_buf]"+r"(uv_buf), // %[uv_buf]
3116 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3117 [width]"+rm"(width) // %[width]
3118 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3119 : "memory", "cc", YUVTORGB_REGS
3120 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3121 );
3122 }
3123
I422ToRGBARow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_rgba,const struct YuvConstants * yuvconstants,int width)3124 void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
3125 const uint8_t* u_buf,
3126 const uint8_t* v_buf,
3127 uint8_t* dst_rgba,
3128 const struct YuvConstants* yuvconstants,
3129 int width) {
3130 asm volatile (
3131 YUVTORGB_SETUP(yuvconstants)
3132 "sub %[u_buf],%[v_buf] \n"
3133 "pcmpeqb %%xmm5,%%xmm5 \n"
3134
3135 LABELALIGN
3136 "1: \n"
3137 READYUV422
3138 YUVTORGB(yuvconstants)
3139 STORERGBA
3140 "sub $0x8,%[width] \n"
3141 "jg 1b \n"
3142 : [y_buf]"+r"(y_buf), // %[y_buf]
3143 [u_buf]"+r"(u_buf), // %[u_buf]
3144 [v_buf]"+r"(v_buf), // %[v_buf]
3145 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
3146 [width]"+rm"(width) // %[width]
3147 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3148 : "memory", "cc", YUVTORGB_REGS
3149 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3150 );
3151 }
3152
3153 #endif // HAS_I422TOARGBROW_SSSE3
3154
// Read 16 UV from 444.  Leaves interleaved U,V bytes in ymm3 and each Y
// byte widened (duplicated into both bytes of a 16-bit lane) in ymm4.
// v_buf is expected to hold (v - u) so U and V share one base register.
#define READYUV444_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 422, upsample to 16 UV (each UV pair duplicated for two
// horizontally adjacent pixels).  Results: UV in ymm3, widened Y in ymm4.
#define READYUV422_AVX2 \
  "vmovq (%[u_buf]),%%xmm3 \n" \
  "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 210 (10-bit, 16-bit samples), upsample to 16 UV.
// UV is narrowed to 8 bits (vpsraw $2 + vpackuswb); Y is shifted left 6
// so the 10-bit value fills the high bits of a 16-bit lane (ymm4).
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
#define READYUV210_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"
3197
// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
// Same as READYUV210_AVX2 plus alpha: 16-bit alpha is narrowed to 8 bits
// (vpsraw $2 + vpackuswb) into ymm5.
#define READYUVA210_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%ymm5 \n" \
  "vpsraw $2,%%ymm5,%%ymm5 \n" \
  "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
  "lea 0x20(%[a_buf]),%[a_buf] \n"

// Read 16 UV from 410 (10-bit 4:4:4, 16-bit samples).
// UV narrowed to interleaved bytes in ymm3; Y shifted left 6 into ymm4.
#define READYUV410_AVX2 \
  "vmovdqu (%[u_buf]),%%ymm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
  "lea 0x20(%[u_buf]),%[u_buf] \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm2,%%ymm2 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 212 12 bit, upsample to 16 UV.
// Shift amounts differ from 210: UV >> 4 to narrow, Y << 4 to fill 16 bits.
#define READYUV212_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpsraw $0x4,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"

// Read 16 UV from 410. With 16 Alpha.
// Same as READYUV410_AVX2 plus alpha narrowed to 8 bits in ymm5.
#define READYUVA410_AVX2 \
  "vmovdqu (%[u_buf]),%%ymm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
  "lea 0x20(%[u_buf]),%[u_buf] \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm2,%%ymm2 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%ymm5 \n" \
  "vpsraw $2,%%ymm5,%%ymm5 \n" \
  "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
  "lea 0x20(%[a_buf]),%[a_buf] \n"
3263
// Read 16 UV from 444. With 16 Alpha.
// Same as READYUV444_AVX2 plus 16 alpha bytes loaded into ymm5.
#define READYUVA444_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%xmm5 \n" \
  "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
// Same as READYUV422_AVX2 plus 16 alpha bytes loaded into ymm5.
#define READYUVA422_AVX2 \
  "vmovq (%[u_buf]),%%xmm3 \n" \
  "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%xmm5 \n" \
  "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from NV12, upsample to 16 UV.
// UV bytes are already interleaved in memory; each pair is duplicated
// (vpunpcklwd) to cover two pixels.  UV in ymm3, widened Y in ymm4.
#define READNV12_AVX2 \
  "vmovdqu (%[uv_buf]),%%xmm3 \n" \
  "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 VU from NV21, upsample to 16 UV.
// kShuffleNV21 reorders the V,U bytes into U,V while duplicating pairs.
#define READNV21_AVX2 \
  "vmovdqu (%[vu_buf]),%%xmm3 \n" \
  "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"
3317
// Read UV from P210 (16-bit samples, value in the high byte), upsample.
// vpsrlw $8 keeps the high byte, pack narrows to bytes, unpack duplicates
// each UV pair.  UV in ymm3; 16-bit Y left as-is in ymm4.
#define READP210_AVX2 \
  "vmovdqu (%[uv_buf]),%%ymm3 \n" \
  "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
  "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"

// Read UV from P410 (4:4:4, 16-bit samples, value in the high byte).
// Two 32-byte UV loads narrowed to one register of interleaved UV bytes.
#define READP410_AVX2 \
  "vmovdqu (%[uv_buf]),%%ymm3 \n" \
  "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \
  "lea 0x40(%[uv_buf]),%[uv_buf] \n" \
  "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
  "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \
  "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// The same 32 packed bytes are loaded twice and shuffled into Y (ymm4)
// and UV (ymm3) with the kShuffleYUY2* masks.
#define READYUY2_AVX2 \
  "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
  "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
  "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
  "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
// Same scheme as READYUY2_AVX2 with the UYVY byte-order shuffle masks.
#define READUYVY_AVX2 \
  "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
  "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
  "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
  "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
3355
#if defined(__x86_64__)
// 64 bit: preload the conversion constants once into ymm8-ymm12 and build
// a vector of 0x80 bytes in ymm13 (all-ones -> psllw 7 -> broadcast byte 0)
// used to re-bias UV from unsigned to signed.
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
  "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
  "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
  "vpsllw $7,%%xmm13,%%xmm13 \n" \
  "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
  "vpbroadcastb %%xmm13,%%ymm13 \n" \
  "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
  "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
  "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"

// Convert UV (ymm3) and Y (ymm4, 16 bit) to 16-bit B/G/R in ymm0/ymm1/ymm2
// using the preloaded constants; saturating adds/subs combine Y with the
// UV contributions (G is subtracted: vpsubsw).
#define YUVTORGB16_AVX2(yuvconstants) \
  "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
  "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
  "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
  "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
  "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
  "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"

// Trailing comma is intentional: this macro prefixes a clobber list.
#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else  // 32 bit: not enough registers to cache the constants, so the bias
       // and the constant tables are (re)loaded from memory every iteration.

#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB16_AVX2(yuvconstants) \
  "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
  "vpsllw $7,%%xmm0,%%xmm0 \n" \
  "vpbroadcastb %%xmm0,%%ymm0 \n" \
  "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
  "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
  "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
  "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
  "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
  "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
  "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
  "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
  "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
  "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"

#define YUVTORGB_REGS_AVX2
#endif

// Full conversion to 8-bit B/G/R: 16-bit result shifted right 6 and
// saturate-packed to bytes in ymm0/ymm1/ymm2.
#define YUVTORGB_AVX2(yuvconstants) \
  YUVTORGB16_AVX2(yuvconstants) \
  "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
3412
// Store 16 ARGB values (64 bytes).  Interleaves B (ymm0), G (ymm1),
// R (ymm2) and alpha (ymm5) into B,G,R,A byte order and writes two
// 32-byte stores to dst_argb, advancing the pointer.
#define STOREARGB_AVX2 \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  "vmovdqu %%ymm1,(%[dst_argb]) \n" \
  "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
  "lea 0x40(%[dst_argb]), %[dst_argb] \n"

// Store 16 AR30 values (64 bytes).  16-bit channels are shifted down,
// clamped to [0,1023] via ymm6 (min) / ymm7 (max), then packed into
// 2.10.10.10 dwords with the 2 alpha bits supplied in ymm5.
#define STOREAR30_AVX2 \
  "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
  "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
  "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
  "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
  "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
  "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
  "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
  "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
  "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
  "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
  "vpslld $0xa,%%ymm1,%%ymm1 \n" \
  "vpslld $0xa,%%ymm2,%%ymm2 \n" \
  "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
  "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
  "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
  "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
3451
3452 #ifdef HAS_I444TOARGBROW_AVX2
3453 // 16 pixels
3454 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
I444ToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3455 void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
3456 const uint8_t* u_buf,
3457 const uint8_t* v_buf,
3458 uint8_t* dst_argb,
3459 const struct YuvConstants* yuvconstants,
3460 int width) {
3461 asm volatile (
3462 YUVTORGB_SETUP_AVX2(yuvconstants)
3463 "sub %[u_buf],%[v_buf] \n"
3464 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3465
3466 LABELALIGN
3467 "1: \n"
3468 READYUV444_AVX2
3469 YUVTORGB_AVX2(yuvconstants)
3470 STOREARGB_AVX2
3471 "sub $0x10,%[width] \n"
3472 "jg 1b \n"
3473 "vzeroupper \n"
3474 : [y_buf]"+r"(y_buf), // %[y_buf]
3475 [u_buf]"+r"(u_buf), // %[u_buf]
3476 [v_buf]"+r"(v_buf), // %[v_buf]
3477 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3478 [width]"+rm"(width) // %[width]
3479 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3480 : "memory", "cc", YUVTORGB_REGS_AVX2
3481 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3482 );
3483 }
3484 #endif // HAS_I444TOARGBROW_AVX2
3485
3486 #if defined(HAS_I422TOARGBROW_AVX2)
3487 // 16 pixels
3488 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
I422ToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3489 void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
3490 const uint8_t* u_buf,
3491 const uint8_t* v_buf,
3492 uint8_t* dst_argb,
3493 const struct YuvConstants* yuvconstants,
3494 int width) {
3495 asm volatile (
3496 YUVTORGB_SETUP_AVX2(yuvconstants)
3497 "sub %[u_buf],%[v_buf] \n"
3498 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3499
3500 LABELALIGN
3501 "1: \n"
3502 READYUV422_AVX2
3503 YUVTORGB_AVX2(yuvconstants)
3504 STOREARGB_AVX2
3505 "sub $0x10,%[width] \n"
3506 "jg 1b \n"
3507
3508 "vzeroupper \n"
3509 : [y_buf]"+r"(y_buf), // %[y_buf]
3510 [u_buf]"+r"(u_buf), // %[u_buf]
3511 [v_buf]"+r"(v_buf), // %[v_buf]
3512 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3513 [width]"+rm"(width) // %[width]
3514 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3515 : "memory", "cc", YUVTORGB_REGS_AVX2
3516 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3517 );
3518 }
3519 #endif // HAS_I422TOARGBROW_AVX2
3520
3521 #if defined(HAS_I422TOAR30ROW_AVX2)
3522 // 16 pixels
3523 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
I422ToAR30Row_AVX2(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)3524 void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
3525 const uint8_t* u_buf,
3526 const uint8_t* v_buf,
3527 uint8_t* dst_ar30,
3528 const struct YuvConstants* yuvconstants,
3529 int width) {
3530 asm volatile (
3531 YUVTORGB_SETUP_AVX2(yuvconstants)
3532 "sub %[u_buf],%[v_buf] \n"
3533 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3534 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3535 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3536 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3537 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3538 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3539
3540 LABELALIGN
3541 "1: \n"
3542 READYUV422_AVX2
3543 YUVTORGB16_AVX2(yuvconstants)
3544 STOREAR30_AVX2
3545 "sub $0x10,%[width] \n"
3546 "jg 1b \n"
3547
3548 "vzeroupper \n"
3549 : [y_buf]"+r"(y_buf), // %[y_buf]
3550 [u_buf]"+r"(u_buf), // %[u_buf]
3551 [v_buf]"+r"(v_buf), // %[v_buf]
3552 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3553 [width]"+rm"(width) // %[width]
3554 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3555 : "memory", "cc", YUVTORGB_REGS_AVX2
3556 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3557 );
3558 }
3559 #endif // HAS_I422TOAR30ROW_AVX2
3560
3561 #if defined(HAS_I210TOARGBROW_AVX2)
3562 // 16 pixels
3563 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
I210ToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3564 void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
3565 const uint16_t* u_buf,
3566 const uint16_t* v_buf,
3567 uint8_t* dst_argb,
3568 const struct YuvConstants* yuvconstants,
3569 int width) {
3570 asm volatile (
3571 YUVTORGB_SETUP_AVX2(yuvconstants)
3572 "sub %[u_buf],%[v_buf] \n"
3573 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3574
3575 LABELALIGN
3576 "1: \n"
3577 READYUV210_AVX2
3578 YUVTORGB_AVX2(yuvconstants)
3579 STOREARGB_AVX2
3580 "sub $0x10,%[width] \n"
3581 "jg 1b \n"
3582
3583 "vzeroupper \n"
3584 : [y_buf]"+r"(y_buf), // %[y_buf]
3585 [u_buf]"+r"(u_buf), // %[u_buf]
3586 [v_buf]"+r"(v_buf), // %[v_buf]
3587 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3588 [width]"+rm"(width) // %[width]
3589 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3590 : "memory", "cc", YUVTORGB_REGS_AVX2
3591 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3592 );
3593 }
3594 #endif // HAS_I210TOARGBROW_AVX2
3595
3596 #if defined(HAS_I212TOARGBROW_AVX2)
3597 // 16 pixels
3598 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
I212ToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3599 void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
3600 const uint16_t* u_buf,
3601 const uint16_t* v_buf,
3602 uint8_t* dst_argb,
3603 const struct YuvConstants* yuvconstants,
3604 int width) {
3605 asm volatile (
3606 YUVTORGB_SETUP_AVX2(yuvconstants)
3607 "sub %[u_buf],%[v_buf] \n"
3608 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3609
3610 LABELALIGN
3611 "1: \n"
3612 READYUV212_AVX2
3613 YUVTORGB_AVX2(yuvconstants)
3614 STOREARGB_AVX2
3615 "sub $0x10,%[width] \n"
3616 "jg 1b \n"
3617
3618 "vzeroupper \n"
3619 : [y_buf]"+r"(y_buf), // %[y_buf]
3620 [u_buf]"+r"(u_buf), // %[u_buf]
3621 [v_buf]"+r"(v_buf), // %[v_buf]
3622 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3623 [width]"+rm"(width) // %[width]
3624 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3625 : "memory", "cc", YUVTORGB_REGS_AVX2
3626 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3627 );
3628 }
3629 #endif // HAS_I212TOARGBROW_AVX2
3630
3631 #if defined(HAS_I210TOAR30ROW_AVX2)
3632 // 16 pixels
3633 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
I210ToAR30Row_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)3634 void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
3635 const uint16_t* u_buf,
3636 const uint16_t* v_buf,
3637 uint8_t* dst_ar30,
3638 const struct YuvConstants* yuvconstants,
3639 int width) {
3640 asm volatile (
3641 YUVTORGB_SETUP_AVX2(yuvconstants)
3642 "sub %[u_buf],%[v_buf] \n"
3643 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3644 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3645 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3646 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3647 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3648 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3649
3650 LABELALIGN
3651 "1: \n"
3652 READYUV210_AVX2
3653 YUVTORGB16_AVX2(yuvconstants)
3654 STOREAR30_AVX2
3655 "sub $0x10,%[width] \n"
3656 "jg 1b \n"
3657
3658 "vzeroupper \n"
3659 : [y_buf]"+r"(y_buf), // %[y_buf]
3660 [u_buf]"+r"(u_buf), // %[u_buf]
3661 [v_buf]"+r"(v_buf), // %[v_buf]
3662 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3663 [width]"+rm"(width) // %[width]
3664 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3665 : "memory", "cc", YUVTORGB_REGS_AVX2
3666 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3667 );
3668 }
3669 #endif // HAS_I210TOAR30ROW_AVX2
3670
3671 #if defined(HAS_I212TOAR30ROW_AVX2)
3672 // 16 pixels
3673 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
I212ToAR30Row_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)3674 void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
3675 const uint16_t* u_buf,
3676 const uint16_t* v_buf,
3677 uint8_t* dst_ar30,
3678 const struct YuvConstants* yuvconstants,
3679 int width) {
3680 asm volatile (
3681 YUVTORGB_SETUP_AVX2(yuvconstants)
3682 "sub %[u_buf],%[v_buf] \n"
3683 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3684 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3685 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3686 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3687 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3688 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3689
3690 LABELALIGN
3691 "1: \n"
3692 READYUV212_AVX2
3693 YUVTORGB16_AVX2(yuvconstants)
3694 STOREAR30_AVX2
3695 "sub $0x10,%[width] \n"
3696 "jg 1b \n"
3697
3698 "vzeroupper \n"
3699 : [y_buf]"+r"(y_buf), // %[y_buf]
3700 [u_buf]"+r"(u_buf), // %[u_buf]
3701 [v_buf]"+r"(v_buf), // %[v_buf]
3702 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3703 [width]"+rm"(width) // %[width]
3704 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3705 : "memory", "cc", YUVTORGB_REGS_AVX2
3706 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3707 );
3708 }
3709 #endif // HAS_I212TOAR30ROW_AVX2
3710
3711 #if defined(HAS_I410TOARGBROW_AVX2)
3712 // 16 pixels
3713 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
I410ToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3714 void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
3715 const uint16_t* u_buf,
3716 const uint16_t* v_buf,
3717 uint8_t* dst_argb,
3718 const struct YuvConstants* yuvconstants,
3719 int width) {
3720 asm volatile (
3721 YUVTORGB_SETUP_AVX2(yuvconstants)
3722 "sub %[u_buf],%[v_buf] \n"
3723 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3724
3725 LABELALIGN
3726 "1: \n"
3727 READYUV410_AVX2
3728 YUVTORGB_AVX2(yuvconstants)
3729 STOREARGB_AVX2
3730 "sub $0x10,%[width] \n"
3731 "jg 1b \n"
3732 "vzeroupper \n"
3733
3734 : [y_buf]"+r"(y_buf), // %[y_buf]
3735 [u_buf]"+r"(u_buf), // %[u_buf]
3736 [v_buf]"+r"(v_buf), // %[v_buf]
3737 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3738 [width]"+rm"(width) // %[width]
3739 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3740 : "memory", "cc", YUVTORGB_REGS_AVX2
3741 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3742 );
3743 }
3744 #endif // HAS_I410TOARGBROW_AVX2
3745
3746 #if defined(HAS_I210ALPHATOARGBROW_AVX2)
3747 // 16 pixels
3748 // 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
I210AlphaToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,const uint16_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3749 void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3750 const uint16_t* u_buf,
3751 const uint16_t* v_buf,
3752 const uint16_t* a_buf,
3753 uint8_t* dst_argb,
3754 const struct YuvConstants* yuvconstants,
3755 int width) {
3756 asm volatile(
3757 YUVTORGB_SETUP_AVX2(
3758 yuvconstants) "sub %[u_buf],%[v_buf] \n"
3759
3760 LABELALIGN "1: \n" READYUVA210_AVX2
3761 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
3762 "subl $0x10,%[width] \n"
3763 "jg 1b \n"
3764 "vzeroupper \n"
3765
3766 : [y_buf] "+r"(y_buf), // %[y_buf]
3767 [u_buf] "+r"(u_buf), // %[u_buf]
3768 [v_buf] "+r"(v_buf), // %[v_buf]
3769 [a_buf] "+r"(a_buf), // %[a_buf]
3770 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3771 #if defined(__i386__)
3772 [width] "+m"(width) // %[width]
3773 #else
3774 [width] "+rm"(width) // %[width]
3775 #endif
3776 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3777 : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
3778 "xmm4", "xmm5");
3779 }
#endif  // HAS_I210ALPHATOARGBROW_AVX2
3781
3782 #if defined(HAS_I410ALPHATOARGBROW_AVX2)
3783 // 16 pixels
3784 // 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
I410AlphaToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,const uint16_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3785 void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3786 const uint16_t* u_buf,
3787 const uint16_t* v_buf,
3788 const uint16_t* a_buf,
3789 uint8_t* dst_argb,
3790 const struct YuvConstants* yuvconstants,
3791 int width) {
3792 asm volatile(
3793 YUVTORGB_SETUP_AVX2(
3794 yuvconstants) "sub %[u_buf],%[v_buf] \n"
3795
3796 LABELALIGN "1: \n" READYUVA410_AVX2
3797 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
3798 "subl $0x10,%[width] \n"
3799 "jg 1b \n"
3800 "vzeroupper \n"
3801
3802 : [y_buf] "+r"(y_buf), // %[y_buf]
3803 [u_buf] "+r"(u_buf), // %[u_buf]
3804 [v_buf] "+r"(v_buf), // %[v_buf]
3805 [a_buf] "+r"(a_buf), // %[a_buf]
3806 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3807 #if defined(__i386__)
3808 [width] "+m"(width) // %[width]
3809 #else
3810 [width] "+rm"(width) // %[width]
3811 #endif
3812 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3813 : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
3814 "xmm4", "xmm5");
3815 }
#endif  // HAS_I410ALPHATOARGBROW_AVX2
3817
#if defined(HAS_I410TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
// 10-bit 4:4:4 YUV converted to AR30 (2-bit alpha, 10-bit RGB).
void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf is addressed as an offset from u_buf inside READYUV410_AVX2.
    "sub %[u_buf],%[v_buf] \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READYUV410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [u_buf]"+r"(u_buf),  // %[u_buf]
    [v_buf]"+r"(v_buf),  // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I410TOAR30ROW_AVX2
3857
#if defined(HAS_I444ALPHATOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y and 16 A producing 16 ARGB.
// 8-bit 4:4:4 YUV with alpha plane converted to ARGB.
void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf is addressed as an offset from u_buf inside READYUVA444_AVX2.
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "subl $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [u_buf]"+r"(u_buf),  // %[u_buf]
    [v_buf]"+r"(v_buf),  // %[v_buf]
    [a_buf]"+r"(a_buf),  // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // NOTE(review): memory-only constraint on i386 — presumably register
    // pressure; confirm before changing.
    [width]"+m"(width)  // %[width]
#else
    [width]"+rm"(width)  // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_AVX2
3898
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// 8-bit 4:2:2 YUV with alpha plane converted to ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf is addressed as an offset from u_buf inside READYUVA422_AVX2.
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "subl $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [u_buf]"+r"(u_buf),  // %[u_buf]
    [v_buf]"+r"(v_buf),  // %[v_buf]
    [a_buf]"+r"(a_buf),  // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // NOTE(review): memory-only constraint on i386 — presumably register
    // pressure; confirm before changing.
    [width]"+m"(width)  // %[width]
#else
    [width]"+rm"(width)  // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2
3939
#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Unlike the ARGB variants this interleaves bytes in RGBA order inline
// instead of using STOREARGB_AVX2.
void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf is addressed as an offset from u_buf inside READYUV422_AVX2.
    "sub %[u_buf],%[v_buf] \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // all-ones: alpha bytes

    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
    "vpermq $0xd8,%%ymm2,%%ymm2 \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
    "vmovdqu %%ymm0,(%[dst_argb]) \n"
    "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
    "lea 0x40(%[dst_argb]),%[dst_argb] \n"  // 64 output bytes per loop
    "sub $0x10,%[width] \n"
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [u_buf]"+r"(u_buf),  // %[u_buf]
    [v_buf]"+r"(v_buf),  // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TORGBAROW_AVX2
3983
#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV12 (Y plane + interleaved UV plane) to ARGB.
void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // all-ones: alpha bytes

    LABELALIGN
    "1: \n"
    READNV12_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [uv_buf]"+r"(uv_buf),  // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    // Fixed: clobber list previously contained "xmm0" twice; the duplicate
    // entry is removed to match the sibling row functions.
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV12TOARGBROW_AVX2
4016
#if defined(HAS_NV21TOARGBROW_AVX2)
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV21 (Y plane + interleaved VU plane) to ARGB; READNV21_AVX2 consumes the
// kShuffleNV21 table to swap V/U into UV order.
void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* vu_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // all-ones: alpha bytes

    LABELALIGN
    "1: \n"
    READNV21_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [vu_buf]"+r"(vu_buf),  // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    // Fixed: clobber list previously contained "xmm0" twice; the duplicate
    // entry is removed to match the sibling row functions.
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV21TOARGBROW_AVX2
4050
#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Packed YUY2 to ARGB; READYUY2_AVX2 consumes the two shuffle tables to
// separate Y and UV.
void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // all-ones: alpha bytes

    LABELALIGN
    "1: \n"
    READYUY2_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [yuy2_buf]"+r"(yuy2_buf),  // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    // Fixed: clobber list previously contained "xmm0" twice; the duplicate
    // entry is removed to match UYVYToARGBRow_AVX2.
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_YUY2TOARGBROW_AVX2
4083
#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Packed UYVY to ARGB; READUYVY_AVX2 consumes the two shuffle tables to
// separate Y and UV.
void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // all-ones: alpha bytes

    LABELALIGN
    "1: \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [uyvy_buf]"+r"(uyvy_buf),  // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_UYVYTOARGBROW_AVX2
4116
#if defined(HAS_P210TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// P210 (10-bit, msb-aligned, biplanar 4:2:2) to ARGB.
void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // all-ones: alpha bytes

    LABELALIGN
    "1: \n"
    READP210_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [uv_buf]"+r"(uv_buf),  // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    // Fixed: clobber list previously contained "xmm0" twice; the duplicate
    // entry is removed to match the sibling row functions.
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_P210TOARGBROW_AVX2
4149
#if defined(HAS_P410TOARGBROW_AVX2)
// 16 pixels.
// 16 UV values, mixed with 16 Y producing 16 ARGB (64 bytes).
// P410 (10-bit, msb-aligned, biplanar 4:4:4) to ARGB.
void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // all-ones: alpha bytes

    LABELALIGN
    "1: \n"
    READP410_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [uv_buf]"+r"(uv_buf),  // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    // Fixed: clobber list previously contained "xmm0" twice; the duplicate
    // entry is removed to match the sibling row functions.
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_P410TOARGBROW_AVX2
4182
#if defined(HAS_P210TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
// P210 (10-bit, msb-aligned, biplanar 4:2:2) to AR30.
void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READP210_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [uv_buf]"+r"(uv_buf),  // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_P210TOAR30ROW_AVX2
4219
#if defined(HAS_P410TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
// P410 (10-bit, msb-aligned, biplanar 4:4:4) to AR30.
void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READP410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per loop
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),  // %[y_buf]
    [uv_buf]"+r"(uv_buf),  // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)  // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_P410TOAR30ROW_AVX2
4256
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of gray ARGB (alpha forced to 0xff).
// Reads the yg/ygb coefficients at fixed offsets 96/128 into yuvconstants.
void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      // NOTE(review): the AVX2 twin documents ygb as -1160; one of the two
      // sign comments is wrong — the code itself is unaffected.
      "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
      "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
      "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
      "pslld $0x18,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "pmulhuw %%xmm2,%%xmm0 \n"
      "paddsw %%xmm3,%%xmm0 \n"
      "psraw $6, %%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"

      // Step 2: Weave into ARGB
      "punpcklbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"
      "punpckhwd %%xmm1,%%xmm1 \n"
      "por %%xmm4,%%xmm0 \n"  // set alpha = 0xff
      "por %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"

      "sub $0x8,%2 \n"  // 8 pixels per loop
      "jg 1b \n"
      : "+r"(y_buf),  // %0
        "+r"(dst_argb),  // %1
        "+rm"(width)  // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_I400TOARGBROW_SSE2
4299
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// Alpha is forced to 0xff; yg/ygb are read at fixed offsets 96/128 into
// yuvconstants (see the SSE2 twin above).
void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
      "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
      "vpslld $0x18,%%ymm4,%%ymm4 \n"

      LABELALIGN
      "1: \n"
      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
      "vmovdqu (%0),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
      "vpsraw $0x6,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      // Step 2: weave gray value into all four ARGB channels.
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"  // set alpha = 0xff
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"  // 16 pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(y_buf),  // %0
        "+r"(dst_argb),  // %1
        "+rm"(width)  // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_I400TOARGBROW_AVX2
4343
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Horizontally mirror a row of bytes, 16 at a time, reading the source
// back-to-front (src is indexed from the end via width).
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "movdqa %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu -0x10(%0,%2,1),%%xmm0 \n"  // load last 16 unprocessed bytes
      "pshufb %%xmm5,%%xmm0 \n"  // reverse byte order
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(temp_width)  // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_SSSE3
4370
#ifdef HAS_MIRRORROW_AVX2
// Horizontally mirror a row of bytes, 32 at a time.  vpshufb reverses
// within each 128-bit lane; vpermq $0x4e swaps the two lanes.
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vbroadcastf128 %3,%%ymm5 \n"  // reuse the 16-byte SSSE3 table per lane

      LABELALIGN
      "1: \n"
      "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"  // load last 32 unprocessed bytes
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(temp_width)  // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_AVX2
4395
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};

// Mirror a row of interleaved UV pairs, 8 pairs (16 bytes) at a time.
// Pair order is reversed but U/V byte order within each pair is kept.
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "movdqa %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu -0x10(%0,%2,2),%%xmm0 \n"  // width counts pairs; *2 for bytes
      "pshufb %%xmm5,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"  // 8 UV pairs per loop
      "jg 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_uv),  // %1
        "+r"(temp_width)  // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_SSSE3
4422
#ifdef HAS_MIRRORUVROW_AVX2
// Mirror a row of interleaved UV pairs, 16 pairs (32 bytes) at a time.
// vpshufb reverses pairs within each lane; vpermq $0x4e swaps the lanes.
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vbroadcastf128 %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"  // width counts pairs; *2 for bytes
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"  // 16 UV pairs per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_uv),  // %1
        "+r"(temp_width)  // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_AVX2
4447
#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
// Mirror interleaved UV and split into separate U and V planes,
// 8 pairs per loop.  U bytes land in the low qword, V in the high qword.
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(
      "movdqa %4,%%xmm1 \n"
      "lea -0x10(%0,%3,2),%0 \n"  // point src at the last 16 bytes
      "sub %1,%2 \n"  // dst_v becomes offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "lea -0x10(%0),%0 \n"  // walk src backwards
      "pshufb %%xmm1,%%xmm0 \n"  // reverse + deinterleave U/V
      "movlpd %%xmm0,(%1) \n"  // 8 U bytes
      "movhpd %%xmm0,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $8,%3 \n"  // 8 UV pairs per loop
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+r"(temp_width)  // %3
      : "m"(kShuffleMirrorSplitUV)  // %4
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_MIRRORSPLITUVROW_SSSE3
4480
#ifdef HAS_RGB24MIRRORROW_SSSE3

// Shuffle first 5 pixels to last 5 mirrored. first byte zero
static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
                                         7u,   8u,  3u,  4u,  5u, 0u,  1u,  2u};

// Shuffle last 5 pixels to first 5 mirrored. last byte zero
static const uvec8 kShuffleMirrorRGB1 = {
    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};

// Shuffle 5 pixels at a time (15 bytes)
// Mirror a row of RGB24 (3 bytes/pixel), 16 pixels (48 bytes) per loop.
// The source pointer is pre-advanced to the final 48-byte chunk and the
// loop walks it backwards; overlapping loads/stores stitch the 15-byte
// groups together.
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
                          int width) {
  intptr_t temp_width = (intptr_t)(width);
  src_rgb24 += width * 3 - 48;  // start at last 48-byte (16 pixel) chunk
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n" // first 5
      "movdqu 15(%0),%%xmm1 \n" // next 5
      "movdqu 30(%0),%%xmm2 \n" // next 5
      "movdqu 32(%0),%%xmm3 \n" // last 1 special
      "pshufb %%xmm4,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "pshufb %%xmm5,%%xmm3 \n"
      "lea -0x30(%0),%0 \n"
      "movdqu %%xmm0,32(%1) \n" // last 5
      "movdqu %%xmm1,17(%1) \n" // next 5
      "movdqu %%xmm2,2(%1) \n" // next 5
      "movlpd %%xmm3,0(%1) \n" // first 1
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"  // 16 pixels per loop
      "jg 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_rgb24),  // %1
        "+r"(temp_width)  // %2
      : "m"(kShuffleMirrorRGB0),  // %3
        "m"(kShuffleMirrorRGB1)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_RGB24MIRRORROW_SSSE3
4527
#ifdef HAS_ARGBMIRRORROW_SSE2

// Mirror a row of ARGB (4 bytes/pixel), 4 pixels (16 bytes) per loop.
// pshufd $0x1b reverses the four 32-bit pixels within the register.
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "lea -0x10(%0,%2,4),%0 \n"  // point src at the last 16 bytes

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "pshufd $0x1b,%%xmm0,%%xmm0 \n"  // reverse 4 dwords
      "lea -0x10(%0),%0 \n"  // walk src backwards
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"  // 4 pixels per loop
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(temp_width)  // %2
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBMIRRORROW_SSE2
4552
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// Mirror a row of ARGB, 8 pixels (32 bytes) per loop.  vpermd with the
// reversed-index table both loads and reverses the 8 dwords in one step.
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vmovdqu %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"  // load+reverse last 8 pixels
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"  // 8 pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(temp_width)  // %2
      : "m"(kARGBShuffleMirror_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_ARGBMIRRORROW_AVX2
4577
#ifdef HAS_SPLITUVROW_AVX2
// Split interleaved UV into separate U and V planes, 32 pairs per loop.
// ymm5 holds a 0x00ff word mask to isolate the U (even) bytes.
void SplitUVRow_AVX2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // 0x00ff per word
      "sub %1,%2 \n"  // dst_v becomes offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm2 \n"  // V bytes to low half of words
      "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // keep U bytes
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo pack's lane interleave
      "vpermq $0xd8,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%3 \n"  // 32 UV pairs per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+r"(width)  // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SPLITUVROW_AVX2
4615
#ifdef HAS_SPLITUVROW_SSE2
// Split interleaved UV into separate U and V planes, 16 pairs per loop.
// xmm5 holds a 0x00ff word mask to isolate the U (even) bytes.
void SplitUVRow_SSE2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // 0x00ff per word
      "sub %1,%2 \n"  // dst_v becomes offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "pand %%xmm5,%%xmm0 \n"  // keep U bytes
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "psrlw $0x8,%%xmm2 \n"  // V bytes to low half of words
      "psrlw $0x8,%%xmm3 \n"
      "packuswb %%xmm3,%%xmm2 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm2,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%3 \n"  // 16 UV pairs per loop
      "jg 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),  // %1
        "+r"(dst_v),  // %2
        "+r"(width)  // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SPLITUVROW_SSE2
4652
#ifdef HAS_MERGEUVROW_AVX2
// Merge separate U and V planes into interleaved UV, 32 pairs per loop.
// vextractf128 stores are used to write lanes in the correct order,
// compensating for vpunpck's per-lane interleave.
void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(

      "sub %0,%1 \n"  // src_v becomes offset from src_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // 32 U
      "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"  // 32 V
      "lea 0x20(%0),%0 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm2,(%2) \n"
      "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
      "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
      "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
      "lea 0x40(%2),%2 \n"
      "sub $0x20,%3 \n"  // 32 pairs per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_u),  // %0
        "+r"(src_v),  // %1
        "+r"(dst_uv),  // %2
        "+r"(width)  // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUVROW_AVX2
4685
#ifdef HAS_MERGEUVROW_SSE2
// Merge separate U and V planes into interleaved UV, 16 pairs per loop.
void MergeUVRow_SSE2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(

      "sub %0,%1 \n"  // src_v becomes offset from src_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // 16 U
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"  // 16 V
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"  // low 8 pairs
      "punpckhbw %%xmm1,%%xmm2 \n"  // high 8 pairs
      "movdqu %%xmm0,(%2) \n"
      "movdqu %%xmm2,0x10(%2) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x10,%3 \n"  // 16 pairs per loop
      "jg 1b \n"
      : "+r"(src_u),  // %0
        "+r"(src_v),  // %1
        "+r"(dst_uv),  // %2
        "+r"(width)  // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUVROW_SSE2
4716
#ifdef HAS_MERGEUVROW_16_AVX2
// Merge separate 16-bit U and V planes into interleaved UV, shifting
// lsb-aligned |depth|-bit samples up to msb alignment.  16 pairs per loop.
void MergeUVRow_16_AVX2(const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint16_t* dst_uv,
                        int depth,
                        int width) {
  depth = 16 - depth;  // left-shift count to msb-align the samples
  // clang-format off
  asm volatile (
    "vmovd %4,%%xmm3 \n"
    "sub %0,%1 \n"  // src_v becomes offset from src_u

    // 16 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vmovdqu (%0,%1,1),%%ymm1 \n"
    "add $0x20,%0 \n"

    "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"  // msb-align
    "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
    "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
    "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm2,(%2) \n"  // store lanes in order
    "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
    "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
    "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
    "add $0x40,%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_u),  // %0
    "+r"(src_v),  // %1
    "+r"(dst_uv),  // %2
    "+r"(width)  // %3
  : "r"(depth)  // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
  // clang-format on
}
#endif  // HAS_MERGEUVROW_16_AVX2
4757
#ifdef HAS_SPLITUVROW_16_AVX2
// Shuffle table gathering U words into the low half and V words into the
// high half of each 128-bit lane.
const uvec8 kSplitUVShuffle16 = {0, 1, 4,  5,  8,  9,  12, 13,
                                 2, 3, 6,  7,  10, 11, 14, 15};
// Split interleaved 16-bit UV into separate U and V planes, shifting
// msb-aligned |depth|-bit samples down to lsb alignment.  16 pairs per loop.
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
                        uint16_t* dst_u,
                        uint16_t* dst_v,
                        int depth,
                        int width) {
  depth = 16 - depth;  // right-shift count to lsb-align the samples
  // clang-format off
  asm volatile (
    "vmovd %4,%%xmm3 \n"
    "vbroadcastf128 %5,%%ymm4 \n"
    "sub %1,%2 \n"  // dst_v becomes offset from dst_u

    // 16 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vmovdqu 0x20(%0),%%ymm1 \n"
    "add $0x40,%0 \n"

    "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"  // lsb-align
    "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
    "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"  // gather U low / V high per lane
    "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vextractf128 $0x0,%%ymm0,(%1) \n"
    "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
    "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
    "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
    "add $0x20,%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),  // %1
    "+r"(dst_v),  // %2
    "+r"(width)  // %3
  : "r"(depth),  // %4
    "m"(kSplitUVShuffle16)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  // clang-format on
}
#endif  // HAS_SPLITUVROW_16_AVX2
4804
4805 // Use scale to convert lsb formats to msb, depending how many bits there are:
4806 // 128 = 9 bits
4807 // 64 = 10 bits
4808 // 16 = 12 bits
4809 // 1 = 16 bits
4810 #ifdef HAS_MULTIPLYROW_16_AVX2
// Multiplies each 16-bit sample by 'scale', keeping the low 16 bits of the
// product (vpmullw). With the scale table above, this shifts lsb-justified
// samples of fewer bits up to msb position. dst may equal src (in-place),
// and dst is addressed as an offset from src. width assumed multiple of 32.
void MultiplyRow_16_AVX2(const uint16_t* src_y,
                         uint16_t* dst_y,
                         int scale,
                         int width) {
  // clang-format off
  asm volatile (
    "vmovd %3,%%xmm3 \n"
    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"       // duplicate scale into a dword
    "vbroadcastss %%xmm3,%%ymm3 \n"            // broadcast to all 16 words
    "sub %0,%1 \n"                             // dst as offset from src

    // 32 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vmovdqu 0x20(%0),%%ymm1 \n"
    "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
    "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0,(%0,%1) \n"
    "vmovdqu %%ymm1,0x20(%0,%1) \n"
    "add $0x40,%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
4842 #endif // HAS_MULTIPLYROW_16_AVX2
4843
4844 // Use scale to convert msb formats to lsb, depending how many bits there are:
4845 // 512 = 9 bits
4846 // 1024 = 10 bits
4847 // 4096 = 12 bits
4848 // 65536 = 16 bits
4849 #ifdef HAS_DIVIDEROW_16_AVX2
DivideRow_16_AVX2(const uint16_t * src_y,uint16_t * dst_y,int scale,int width)4850 void DivideRow_16_AVX2(const uint16_t* src_y,
4851 uint16_t* dst_y,
4852 int scale,
4853 int width) {
4854 // clang-format off
4855 asm volatile (
4856 "vmovd %3,%%xmm3 \n"
4857 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
4858 "vbroadcastss %%xmm3,%%ymm3 \n"
4859 "sub %0,%1 \n"
4860
4861 // 32 pixels per loop.
4862 LABELALIGN
4863 "1: \n"
4864 "vmovdqu (%0),%%ymm0 \n"
4865 "vmovdqu 0x20(%0),%%ymm1 \n"
4866 "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
4867 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
4868 "vmovdqu %%ymm0,(%0,%1) \n"
4869 "vmovdqu %%ymm1,0x20(%0,%1) \n"
4870 "add $0x40,%0 \n"
4871 "sub $0x20,%2 \n"
4872 "jg 1b \n"
4873 "vzeroupper \n"
4874 : "+r"(src_y), // %0
4875 "+r"(dst_y), // %1
4876 "+r"(width), // %2
4877 "+r"(scale) // %3
4878 :
4879 : "memory", "cc", "xmm0", "xmm1", "xmm3");
4880 // clang-format on
4881 }
4882 #endif // HAS_MULTIPLYROW_16_AVX2
4883
4884 // Use scale to convert lsb formats to msb, depending how many bits there are:
4885 // 32768 = 9 bits
4886 // 16384 = 10 bits
4887 // 4096 = 12 bits
4888 // 256 = 16 bits
// Converts a row of 16-bit samples to 8-bit: dst = clamp((src * scale) >> 16)
// via vpmulhuw then packuswb saturation. See the scale table above for the
// value per source bit depth. width assumed to be a multiple of 16.
void Convert16To8Row_SSSE3(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
  // clang-format off
  asm volatile (
    "movd %3,%%xmm2 \n"
    "punpcklwd %%xmm2,%%xmm2 \n"               // duplicate scale into a dword
    "pshufd $0x0,%%xmm2,%%xmm2 \n"             // broadcast to all 8 words

    // 16 pixels per loop.
    LABELALIGN
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "add $0x20,%0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"                // saturate 16 words to 16 bytes
    "movdqu %%xmm0,(%1) \n"
    "add $0x10,%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
4919
4920 #ifdef HAS_CONVERT16TO8ROW_AVX2
// AVX2 version of Convert16To8Row: dst = clamp((src * scale) >> 16).
// width assumed to be a multiple of 32.
void Convert16To8Row_AVX2(const uint16_t* src_y,
                          uint8_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    "vmovd %3,%%xmm2 \n"
    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"       // duplicate scale into a dword
    "vbroadcastss %%xmm2,%%ymm2 \n"            // broadcast to all 16 words

    // 32 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vmovdqu 0x20(%0),%%ymm1 \n"
    "add $0x40,%0 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"        // mutates (per-lane pack)
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"            // restore byte order
    "vmovdqu %%ymm0,(%1) \n"
    "add $0x20,%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
4953 #endif // HAS_CONVERT16TO8ROW_AVX2
4954
4955 // Use scale to convert to lsb formats depending how many bits there are:
4956 // 512 = 9 bits
4957 // 1024 = 10 bits
4958 // 4096 = 12 bits
4959 // TODO(fbarchard): reduce to SSE2
// Converts a row of 8-bit samples to 16-bit. Each byte is duplicated into
// both halves of a word (punpcklbw with itself) then vpmulhuw by 'scale'
// scales it down to the target bit depth (see table above).
// width assumed to be a multiple of 16.
void Convert8To16Row_SSE2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    "movd %3,%%xmm2 \n"
    "punpcklwd %%xmm2,%%xmm2 \n"               // duplicate scale into a dword
    "pshufd $0x0,%%xmm2,%%xmm2 \n"             // broadcast to all 8 words

    // 16 pixels per loop.
    LABELALIGN
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"               // low 8 bytes -> 8 words (b<<8|b)
    "punpckhbw %%xmm1,%%xmm1 \n"               // high 8 bytes -> 8 words
    "add $0x10,%0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%1) \n"
    "movdqu %%xmm1,0x10(%1) \n"
    "add $0x20,%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
4992
4993 #ifdef HAS_CONVERT8TO16ROW_AVX2
// AVX2 version of Convert8To16Row. vpermq pre-swizzles so the per-lane
// byte unpacks produce words in memory order. width assumed multiple of 32.
void Convert8To16Row_AVX2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    "vmovd %3,%%xmm2 \n"
    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"       // duplicate scale into a dword
    "vbroadcastss %%xmm2,%%ymm2 \n"            // broadcast to all 16 words

    // 32 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"            // counter the lane-wise unpack
    "add $0x20,%0 \n"
    "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"       // bytes -> words (b<<8|b)
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0,(%1) \n"
    "vmovdqu %%ymm1,0x20(%1) \n"
    "add $0x40,%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
5027 #endif // HAS_CONVERT8TO16ROW_AVX2
5028
5029 #ifdef HAS_SPLITRGBROW_SSSE3
5030
// Shuffle tables for SplitRGBRow_SSSE3 (packed RGB24 to planar).
// Each trio of masks extracts one channel's bytes from the three 16-byte
// chunks of a 48-byte RGB24 group; shuffle index 128 produces zero, so the
// three shuffled results can simply be OR'd into one 16-byte plane row.
static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
                                          128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          2u, 5u, 8u, 11u, 14u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u, 128u, 1u,
                                          4u, 7u, 10u, 13u};

static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
                                          128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
                                          3u, 6u, 9u, 12u, 15u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u, 128u, 2u,
                                          5u, 8u, 11u, 14u};

static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
                                          128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
                                          4u, 7u, 10u, 13u, 128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u, 0u, 3u,
                                          6u, 9u, 12u, 15u};
5061
// Splits packed RGB24 into three planes. Per iteration the same 48 source
// bytes are reloaded three times and shuffled through the channel masks
// above; the three partial results are OR'd into one 16-byte plane row.
// width assumed to be a multiple of 16.
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

    LABELALIGN
    "1: \n"
    // First channel plane (masks %5-%7).
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "pshufb %5, %%xmm0 \n"
    "pshufb %6, %%xmm1 \n"
    "pshufb %7, %%xmm2 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"

    // Second channel plane (masks %8-%10).
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "pshufb %8, %%xmm0 \n"
    "pshufb %9, %%xmm1 \n"
    "pshufb %10, %%xmm2 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"

    // Third channel plane (masks %11-%13).
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "pshufb %11, %%xmm0 \n"
    "pshufb %12, %%xmm1 \n"
    "pshufb %13, %%xmm2 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%3) \n"
    "lea 0x10(%3),%3 \n"
    "lea 0x30(%0),%0 \n"                       // advance 48 source bytes
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_rgb),              // %0
    "+r"(dst_r),                // %1
    "+r"(dst_g),                // %2
    "+r"(dst_b),                // %3
    "+r"(width)                 // %4
  : "m"(kShuffleMaskRGBToR0),   // %5
    "m"(kShuffleMaskRGBToR1),   // %6
    "m"(kShuffleMaskRGBToR2),   // %7
    "m"(kShuffleMaskRGBToG0),   // %8
    "m"(kShuffleMaskRGBToG1),   // %9
    "m"(kShuffleMaskRGBToG2),   // %10
    "m"(kShuffleMaskRGBToB0),   // %11
    "m"(kShuffleMaskRGBToB1),   // %12
    "m"(kShuffleMaskRGBToB2)    // %13
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5122 #endif // HAS_SPLITRGBROW_SSSE3
5123
5124 #ifdef HAS_MERGERGBROW_SSSE3
5125
// Shuffle tables for MergeRGBRow_SSSE3 (planar to packed RGB24).
// Each trio of masks spreads one plane's bytes into its channel positions
// for one 16-byte output chunk; index 128 produces zero, so the three
// shuffled planes can be OR'd together.
static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
                                          2u, 128u, 128u, 3u, 128u, 128u,
                                          4u, 128u, 128u, 5u};
static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
                                          128u, 2u, 128u, 128u, 3u, 128u,
                                          128u, 4u, 128u, 128u};
static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
                                          128u, 128u, 2u, 128u, 128u, 3u,
                                          128u, 128u, 4u, 128u};

static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
                                          7u, 128u, 128u, 8u, 128u, 128u,
                                          9u, 128u, 128u, 10u};
static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
                                          128u, 7u, 128u, 128u, 8u, 128u,
                                          128u, 9u, 128u, 128u};
static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
                                          128u, 128u, 8u, 128u, 128u, 9u,
                                          128u, 128u, 10u, 128u};

static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
                                          12u, 128u, 128u, 13u, 128u, 128u,
                                          14u, 128u, 128u, 15u};
static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
                                          128u, 13u, 128u, 128u, 14u, 128u,
                                          128u, 15u, 128u, 128u};
static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
                                          128u, 128u, 13u, 128u, 128u, 14u,
                                          128u, 128u, 15u, 128u};
5156
// Merges three planes into packed RGB24. Per iteration the same 16 bytes of
// each plane are reloaded three times and shuffled through the spreading
// masks above, producing the three 16-byte chunks of a 48-byte RGB24 group.
// width assumed to be a multiple of 16.
void MergeRGBRow_SSSE3(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_rgb,
                       int width) {
  asm volatile(

    LABELALIGN
    "1: \n"
    // Output bytes 0-15 (masks %5-%7).
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%1),%%xmm1 \n"
    "movdqu (%2),%%xmm2 \n"
    "pshufb %5, %%xmm0 \n"
    "pshufb %6, %%xmm1 \n"
    "pshufb %7, %%xmm2 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%3) \n"

    // Output bytes 16-31 (masks %8-%10).
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%1),%%xmm1 \n"
    "movdqu (%2),%%xmm2 \n"
    "pshufb %8, %%xmm0 \n"
    "pshufb %9, %%xmm1 \n"
    "pshufb %10, %%xmm2 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,16(%3) \n"

    // Output bytes 32-47 (masks %11-%13).
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%1),%%xmm1 \n"
    "movdqu (%2),%%xmm2 \n"
    "pshufb %11, %%xmm0 \n"
    "pshufb %12, %%xmm1 \n"
    "pshufb %13, %%xmm2 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,32(%3) \n"

    "lea 0x10(%0),%0 \n"
    "lea 0x10(%1),%1 \n"
    "lea 0x10(%2),%2 \n"
    "lea 0x30(%3),%3 \n"                       // 48 output bytes per iteration
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_r),                // %0
    "+r"(src_g),                // %1
    "+r"(src_b),                // %2
    "+r"(dst_rgb),              // %3
    "+r"(width)                 // %4
  : "m"(kShuffleMaskRToRGB0),   // %5
    "m"(kShuffleMaskGToRGB0),   // %6
    "m"(kShuffleMaskBToRGB0),   // %7
    "m"(kShuffleMaskRToRGB1),   // %8
    "m"(kShuffleMaskGToRGB1),   // %9
    "m"(kShuffleMaskBToRGB1),   // %10
    "m"(kShuffleMaskRToRGB2),   // %11
    "m"(kShuffleMaskGToRGB2),   // %12
    "m"(kShuffleMaskBToRGB2)    // %13
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5218 #endif // HAS_MERGERGBROW_SSSE3
5219
5220 #ifdef HAS_MERGEARGBROW_SSE2
// Merges four 8-bit planes into packed ARGB (memory order B,G,R,A).
// The g/b/a pointers are converted to offsets from src_r so one register
// advance covers all four planes. width assumed to be a multiple of 8.
void MergeARGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

    "sub %0,%1 \n"                             // src_g as offset from src_r
    "sub %0,%2 \n"                             // src_b as offset from src_r
    "sub %0,%3 \n"                             // src_a as offset from src_r

    LABELALIGN
    "1: \n"

    "movq (%0,%2),%%xmm0 \n"                   // B
    "movq (%0),%%xmm1 \n"                      // R
    "movq (%0,%1),%%xmm2 \n"                   // G
    "punpcklbw %%xmm1,%%xmm0 \n"               // BR
    "movq (%0,%3),%%xmm1 \n"                   // A
    "punpcklbw %%xmm1,%%xmm2 \n"               // GA
    "movdqa %%xmm0,%%xmm1 \n"                  // BR
    "punpckhbw %%xmm2,%%xmm1 \n"               // BGRA (hi)
    "punpcklbw %%xmm2,%%xmm0 \n"               // BGRA (lo)
    "movdqu %%xmm0,(%4) \n"
    "movdqu %%xmm1,16(%4) \n"

    "lea 8(%0),%0 \n"
    "lea 32(%4),%4 \n"
    "sub $0x8,%5 \n"
    "jg 1b \n"
  : "+r"(src_r),     // %0
    "+r"(src_g),     // %1
    "+r"(src_b),     // %2
    "+r"(src_a),     // %3
    "+r"(dst_argb),  // %4
    "+r"(width)      // %5
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5261 #endif
5262
5263 #ifdef HAS_MERGEXRGBROW_SSE2
// Merges three 8-bit planes into packed ARGB with alpha forced to 255
// (pcmpeqd produces all-ones bytes). width assumed to be a multiple of 8.
void MergeXRGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

    LABELALIGN
    "1: \n"

    "movq (%2),%%xmm0 \n"                      // B
    "movq (%0),%%xmm1 \n"                      // R
    "movq (%1),%%xmm2 \n"                      // G
    "punpcklbw %%xmm1,%%xmm0 \n"               // BR
    "pcmpeqd %%xmm1,%%xmm1 \n"                 // A(255)
    "punpcklbw %%xmm1,%%xmm2 \n"               // GA
    "movdqa %%xmm0,%%xmm1 \n"                  // BR
    "punpckhbw %%xmm2,%%xmm1 \n"               // BGRA (hi)
    "punpcklbw %%xmm2,%%xmm0 \n"               // BGRA (lo)
    "movdqu %%xmm0,(%3) \n"
    "movdqu %%xmm1,16(%3) \n"

    "lea 8(%0),%0 \n"
    "lea 8(%1),%1 \n"
    "lea 8(%2),%2 \n"
    "lea 32(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src_r),     // %0
    "+r"(src_g),     // %1
    "+r"(src_b),     // %2
    "+r"(dst_argb),  // %3
    "+r"(width)      // %4
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEXRGBROW_SSE2
5301
5302 #ifdef HAS_MERGEARGBROW_AVX2
// AVX2 merge of four 8-bit planes into packed ARGB (memory order B,G,R,A).
// Builds ymm0 = [B | R] and ymm1 = [G | A] (low/high 128-bit lanes), then
// interleaves with byte/word unpacks and lane permutes.
// width assumed to be a multiple of 16.
void MergeARGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

    "sub %0,%1 \n"                             // src_g as offset from src_r
    "sub %0,%2 \n"                             // src_b as offset from src_r
    "sub %0,%3 \n"                             // src_a as offset from src_r

    LABELALIGN
    "1: \n"

    "vmovdqu (%0,%2),%%xmm0 \n"                // B plane (src_b)
    "vmovdqu (%0,%1),%%xmm1 \n"                // G plane (src_g)
    "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n"     // high lane = R plane (src_r)
    "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n"  // high lane = A plane (src_a)
    "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
    "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
    "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
    "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
    "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
    "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
    "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0,(%4) \n"                   // First 8 pixels
    "vmovdqu %%ymm1,32(%4) \n"                 // Next 8 pixels

    "lea 16(%0),%0 \n"
    "lea 64(%4),%4 \n"
    "sub $0x10,%5 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_r),     // %0
    "+r"(src_g),     // %1
    "+r"(src_b),     // %2
    "+r"(src_a),     // %3
    "+r"(dst_argb),  // %4
    "+r"(width)      // %5
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5347 #endif
5348
5349 #ifdef HAS_MERGEXRGBROW_AVX2
// AVX2 merge of three 8-bit planes into packed ARGB with alpha forced to
// 255. Builds ymm0 = [B | R] and ymm1 = [G | A(0xff)] (low/high lanes),
// then interleaves as in MergeARGBRow_AVX2. width assumed multiple of 16.
void MergeXRGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

    LABELALIGN
    "1: \n"

    "vmovdqu (%2),%%xmm0 \n"                   // B plane (src_b)
    "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n"         // A(255) in both lanes
    "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n"     // low lane = G plane (src_g)
    "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n"     // high lane = R plane (src_r)
    "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
    "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
    "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
    "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
    "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
    "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
    "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0,(%3) \n"                   // First 8 pixels
    "vmovdqu %%ymm1,32(%3) \n"                 // Next 8 pixels

    "lea 16(%0),%0 \n"
    "lea 16(%1),%1 \n"
    "lea 16(%2),%2 \n"
    "lea 64(%3),%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_r),     // %0
    "+r"(src_g),     // %1
    "+r"(src_b),     // %2
    "+r"(dst_argb),  // %3
    "+rm"(width)     // %4
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEXRGBROW_AVX2
5391
5392 #ifdef HAS_SPLITARGBROW_SSE2
// Splits packed ARGB into four planes using a log2 transpose of the byte
// matrix (three rounds of qword/byte unpacks). Plane pointers g/b/a are
// converted to offsets from dst_r. width assumed to be a multiple of 8.
void SplitARGBRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(

    "sub %1,%2 \n"                             // dst_g as offset from dst_r
    "sub %1,%3 \n"                             // dst_b as offset from dst_r
    "sub %1,%4 \n"                             // dst_a as offset from dst_r

    LABELALIGN
    "1: \n"

    "movdqu (%0),%%xmm0 \n"                    // 00-0F
    "movdqu 16(%0),%%xmm1 \n"                  // 10-1F
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"              // 00-07 10-17
    "punpckhqdq %%xmm1,%%xmm2 \n"              // 08-0F 18-1F
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"               // 08192A3B4C5D6E7F (lo)
    "punpckhbw %%xmm2,%%xmm1 \n"               // 08192A3B4C5D6E7F (hi)
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"              // 08192A3B08192A3B
    "punpckhqdq %%xmm1,%%xmm2 \n"              // 4C5D6E7F4C5D6E7F
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"               // 048C159D26AE37BF (lo)
    "punpckhbw %%xmm2,%%xmm1 \n"               // 048C159D26AE37BF (hi)
    "movdqa %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"               // 048C048C159D159D (BG)
    "punpckhdq %%xmm1,%%xmm2 \n"               // 26AE26AE37BF37BF (RA)
    "movlps %%xmm0,(%1,%3) \n"                 // B
    "movhps %%xmm0,(%1,%2) \n"                 // G
    "movlps %%xmm2,(%1) \n"                    // R
    "movhps %%xmm2,(%1,%4) \n"                 // A

    "lea 32(%0),%0 \n"
    "lea 8(%1),%1 \n"
    "sub $0x8,%5 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_r),     // %1
    "+r"(dst_g),     // %2
    "+r"(dst_b),     // %3
    "+r"(dst_a),     // %4
    "+rm"(width)     // %5
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5443 #endif
5444
5445 #ifdef HAS_SPLITXRGBROW_SSE2
// As SplitARGBRow_SSE2, but the alpha plane is discarded (only B, G and R
// are stored). width assumed to be a multiple of 8.
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

    LABELALIGN
    "1: \n"

    "movdqu (%0),%%xmm0 \n"                    // 00-0F
    "movdqu 16(%0),%%xmm1 \n"                  // 10-1F
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"              // 00-07 10-17
    "punpckhqdq %%xmm1,%%xmm2 \n"              // 08-0F 18-1F
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"               // 08192A3B4C5D6E7F (lo)
    "punpckhbw %%xmm2,%%xmm1 \n"               // 08192A3B4C5D6E7F (hi)
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"              // 08192A3B08192A3B
    "punpckhqdq %%xmm1,%%xmm2 \n"              // 4C5D6E7F4C5D6E7F
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"               // 048C159D26AE37BF (lo)
    "punpckhbw %%xmm2,%%xmm1 \n"               // 048C159D26AE37BF (hi)
    "movdqa %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"               // 048C048C159D159D (BG)
    "punpckhdq %%xmm1,%%xmm2 \n"               // 26AE26AE37BF37BF (RA)
    "movlps %%xmm0,(%3) \n"                    // B
    "movhps %%xmm0,(%2) \n"                    // G
    "movlps %%xmm2,(%1) \n"                    // R (alpha half dropped)

    "lea 32(%0),%0 \n"
    "lea 8(%1),%1 \n"
    "lea 8(%2),%2 \n"
    "lea 8(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_r),     // %1
    "+r"(dst_g),     // %2
    "+r"(dst_b),     // %3
    "+rm"(width)     // %4
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5491 #endif
5492
// Shuffle that groups each channel's 4 bytes together within a 4-pixel
// chunk (pixel-major -> channel-major); used by the SSSE3/AVX2 ARGB splits.
static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
                                            2, 6, 10, 14, 3, 7, 11, 15};
5495 #ifdef HAS_SPLITARGBROW_SSSE3
// SSSE3 ARGB split: pshufb replaces the unpack cascade of the SSE2 version.
// Plane pointers g/b/a are offsets from dst_r. width assumed multiple of 8.
// Note: subl + the i386 "+m" constraint keep the 32-bit width decrement
// valid when registers are scarce on 32-bit builds.
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        uint8_t* dst_a,
                        int width) {
  asm volatile(

    "movdqa %6,%%xmm3 \n"                      // channel-grouping shuffle
    "sub %1,%2 \n"                             // dst_g as offset from dst_r
    "sub %1,%3 \n"                             // dst_b as offset from dst_r
    "sub %1,%4 \n"                             // dst_a as offset from dst_r

    LABELALIGN
    "1: \n"

    "movdqu (%0),%%xmm0 \n"                    // 00-0F
    "movdqu 16(%0),%%xmm1 \n"                  // 10-1F
    "pshufb %%xmm3,%%xmm0 \n"                  // 048C159D26AE37BF (lo)
    "pshufb %%xmm3,%%xmm1 \n"                  // 048C159D26AE37BF (hi)
    "movdqa %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"               // 048C048C159D159D (BG)
    "punpckhdq %%xmm1,%%xmm2 \n"               // 26AE26AE37BF37BF (RA)
    "movlps %%xmm0,(%1,%3) \n"                 // B
    "movhps %%xmm0,(%1,%2) \n"                 // G
    "movlps %%xmm2,(%1) \n"                    // R
    "movhps %%xmm2,(%1,%4) \n"                 // A

    "lea 32(%0),%0 \n"
    "lea 8(%1),%1 \n"
    "subl $0x8,%5 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_r),     // %1
    "+r"(dst_g),     // %2
    "+r"(dst_b),     // %3
    "+r"(dst_a),     // %4
#if defined(__i386__)
    "+m"(width)      // %5
#else
    "+rm"(width)     // %5
#endif
  : "m"(kShuffleMaskARGBSplit)  // %6
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
5541 #endif
5542
5543 #ifdef HAS_SPLITXRGBROW_SSSE3
// As SplitARGBRow_SSSE3, but the alpha plane is discarded.
// width assumed to be a multiple of 8.
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        int width) {
  asm volatile(

    "movdqa %5,%%xmm3 \n"                      // channel-grouping shuffle

    LABELALIGN
    "1: \n"

    "movdqu (%0),%%xmm0 \n"                    // 00-0F
    "movdqu 16(%0),%%xmm1 \n"                  // 10-1F
    "pshufb %%xmm3,%%xmm0 \n"                  // 048C159D26AE37BF (lo)
    "pshufb %%xmm3,%%xmm1 \n"                  // 048C159D26AE37BF (hi)
    "movdqa %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"               // 048C048C159D159D (BG)
    "punpckhdq %%xmm1,%%xmm2 \n"               // 26AE26AE37BF37BF (RA)
    "movlps %%xmm0,(%3) \n"                    // B
    "movhps %%xmm0,(%2) \n"                    // G
    "movlps %%xmm2,(%1) \n"                    // R (alpha half dropped)

    "lea 32(%0),%0 \n"
    "lea 8(%1),%1 \n"
    "lea 8(%2),%2 \n"
    "lea 8(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_r),     // %1
    "+r"(dst_g),     // %2
    "+r"(dst_b),     // %3
    "+r"(width)      // %4
  : "m"(kShuffleMaskARGBSplit)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
5581 #endif
5582
5583 #ifdef HAS_SPLITARGBROW_AVX2
// vpermd pattern that gathers matching dwords from both 128-bit lanes
// after the per-lane vpshufb channel grouping.
static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
// AVX2 ARGB split: 16 pixels per iteration. Plane pointers g/b/a are
// offsets from dst_r. width assumed to be a multiple of 16.
void SplitARGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(

    "sub %1,%2 \n"                             // dst_g as offset from dst_r
    "sub %1,%3 \n"                             // dst_b as offset from dst_r
    "sub %1,%4 \n"                             // dst_a as offset from dst_r
    "vmovdqa %7,%%ymm3 \n"                     // cross-lane dword permute
    "vbroadcastf128 %6,%%ymm4 \n"              // channel-grouping shuffle

    LABELALIGN
    "1: \n"

    "vmovdqu (%0),%%xmm0 \n"                   // 00-0F
    "vmovdqu 16(%0),%%xmm1 \n"                 // 10-1F
    "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n"   // 00-0F 20-2F
    "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n"   // 10-1F 30-3F
    "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
    "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
    "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"       // GA
    "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"       // BR
    "vmovdqu %%xmm0,(%1,%3) \n"                // B
    "vextracti128 $1,%%ymm0,(%1) \n"           // R
    "vmovdqu %%xmm2,(%1,%2) \n"                // G
    "vextracti128 $1,%%ymm2,(%1,%4) \n"        // A
    "lea 64(%0),%0 \n"
    "lea 16(%1),%1 \n"
    "subl $0x10,%5 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_r),     // %1
    "+r"(dst_g),     // %2
    "+r"(dst_b),     // %3
    "+r"(dst_a),     // %4
#if defined(__i386__)
    "+m"(width)      // %5
#else
    "+rm"(width)     // %5
#endif
  : "m"(kShuffleMaskARGBSplit),   // %6
    "m"(kShuffleMaskARGBPermute)  // %7
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
5635 #endif
5636
5637 #ifdef HAS_SPLITXRGBROW_AVX2
// As SplitARGBRow_AVX2, but the alpha plane is discarded.
// width assumed to be a multiple of 16.
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

    "vmovdqa %6,%%ymm3 \n"                     // cross-lane dword permute
    "vbroadcastf128 %5,%%ymm4 \n"              // channel-grouping shuffle

    LABELALIGN
    "1: \n"

    "vmovdqu (%0),%%xmm0 \n"                   // 00-0F
    "vmovdqu 16(%0),%%xmm1 \n"                 // 10-1F
    "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n"   // 00-0F 20-2F
    "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n"   // 10-1F 30-3F
    "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
    "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
    "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"       // GA
    "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"       // BR
    "vmovdqu %%xmm0,(%3) \n"                   // B
    "vextracti128 $1,%%ymm0,(%1) \n"           // R
    "vmovdqu %%xmm2,(%2) \n"                   // G (alpha half dropped)

    "lea 64(%0),%0 \n"
    "lea 16(%1),%1 \n"
    "lea 16(%2),%2 \n"
    "lea 16(%3),%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_r),     // %1
    "+r"(dst_g),     // %2
    "+r"(dst_b),     // %3
    "+r"(width)      // %4
  : "m"(kShuffleMaskARGBSplit),   // %5
    "m"(kShuffleMaskARGBPermute)  // %6
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
5681 #endif
5682
5683 #ifdef HAS_MERGEXR30ROW_AVX2
MergeXR30Row_AVX2(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,uint8_t * dst_ar30,int depth,int width)5684 void MergeXR30Row_AVX2(const uint16_t* src_r,
5685 const uint16_t* src_g,
5686 const uint16_t* src_b,
5687 uint8_t* dst_ar30,
5688 int depth,
5689 int width) {
5690 int shift = depth - 10;
5691 asm volatile(
5692
5693 "sub %0,%1 \n"
5694 "sub %0,%2 \n"
5695 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
5696 "vpsrlw $14,%%ymm5,%%ymm5 \n"
5697 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
5698 "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
5699 "vpsrlw $6,%%ymm6,%%ymm6 \n"
5700 "vmovd %5,%%xmm4 \n"
5701
5702 LABELALIGN
5703 "1: \n"
5704 "vmovdqu (%0),%%ymm0 \n"
5705 "vmovdqu (%0,%1),%%ymm1 \n"
5706 "vmovdqu (%0,%2),%%ymm2 \n"
5707 "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n"
5708 "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
5709 "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
5710 "vpminuw %%ymm0,%%ymm6,%%ymm0 \n"
5711 "vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
5712 "vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
5713 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
5714 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
5715 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
5716 "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit
5717 "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB
5718 "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
5719 "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG
5720 "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
5721 "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit
5722 "vpslld $0xa,%%ymm2,%%ymm2 \n"
5723 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine
5724 "vpor %%ymm2,%%ymm3,%%ymm3 \n"
5725 "vmovdqu %%ymm0,(%3) \n"
5726 "vmovdqu %%ymm3,0x20(%3) \n"
5727 "lea 0x20(%0),%0 \n"
5728 "lea 0x40(%3),%3 \n"
5729 "sub $0x10,%4 \n"
5730 "jg 1b \n"
5731 "vzeroupper \n"
5732 : "+r"(src_r), // %0
5733 "+r"(src_g), // %1
5734 "+r"(src_b), // %2
5735 "+r"(dst_ar30), // %3
5736 "+r"(width) // %4
5737 #if defined(__i386__)
5738 : "m"(shift) // %5
5739 #else
5740 : "rm"(shift) // %5
5741 #endif
5742 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
5743 }
5744 #endif
5745
5746 #ifdef HAS_MERGEAR64ROW_AVX2
// vpermd pattern that pre-swizzles dwords so the lane-wise word/dword
// unpacks below emit pixels in memory order. Also referenced by
// MergeXR64Row_AVX2 under a different #ifdef (both are expected to be
// defined together).
static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
// Merges four 16-bit planes into packed AR64 (memory order B,G,R,A words).
// Samples of 'depth' bits are clamped to (1<<depth)-1 via vpminuw then
// shifted left by (16 - depth) to msb position. g/b/a pointers are offsets
// from src_r. width assumed to be a multiple of 16.
void MergeAR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;                  // dword with clamp in both words
  asm volatile(

    "sub %0,%1 \n"                             // src_g as offset from src_r
    "sub %0,%2 \n"                             // src_b as offset from src_r
    "sub %0,%3 \n"                             // src_a as offset from src_r
    "vmovdqa %8,%%ymm5 \n"                     // dword permute
    "vmovd %6,%%xmm6 \n"                       // shift count
    "vbroadcastss %7,%%ymm7 \n"                // per-word clamp value

    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"                   // R
    "vmovdqu (%0,%1),%%ymm1 \n"                // G
    "vmovdqu (%0,%2),%%ymm2 \n"                // B
    "vmovdqu (%0,%3),%%ymm3 \n"                // A
    "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
    "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
    "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
    "vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
    "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
    "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
    "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
    "vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
    "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
    "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
    "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
    "vpermd %%ymm3,%%ymm5,%%ymm3 \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n"       // BG(low)
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"       // BG(hi)
    "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n"       // RA(low)
    "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n"       // RA(hi)
    "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n"       // BGRA(1)
    "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n"       // BGRA(3)
    "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n"       // BGRA(2)
    "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n"       // BGRA(4)
    "vmovdqu %%ymm3,(%4) \n"
    "vmovdqu %%ymm2,0x20(%4) \n"
    "vmovdqu %%ymm4,0x40(%4) \n"
    "vmovdqu %%ymm1,0x60(%4) \n"
    "lea 0x20(%0),%0 \n"
    "lea 0x80(%4),%4 \n"
    "subl $0x10,%5 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_r),     // %0
    "+r"(src_g),     // %1
    "+r"(src_b),     // %2
    "+r"(src_a),     // %3
    "+r"(dst_ar64),  // %4
#if defined(__i386__)
    "+m"(width)      // %5
#else
    "+rm"(width)     // %5
#endif
  : "m"(shift),            // %6
    "m"(mask),             // %7
    "m"(MergeAR64Permute)  // %8
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
    "xmm7");
}
5818 #endif
5819
5820 #ifdef HAS_MERGEXR64ROW_AVX2
// As MergeAR64Row_AVX2 but with alpha forced to 0xffff (vpcmpeqb).
// NOTE(review): uses MergeAR64Permute, which is declared under
// HAS_MERGEAR64ROW_AVX2 — both macros are expected to be defined together;
// verify if the guards can diverge.
void MergeXR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;                  // dword with clamp in both words
  asm volatile(

    "sub %0,%1 \n"                             // src_g as offset from src_r
    "sub %0,%2 \n"                             // src_b as offset from src_r
    "vmovdqa %7,%%ymm5 \n"                     // dword permute
    "vmovd %5,%%xmm6 \n"                       // shift count
    "vbroadcastss %6,%%ymm7 \n"                // per-word clamp value

    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"                   // R
    "vmovdqu (%0,%1),%%ymm1 \n"                // G
    "vmovdqu (%0,%2),%%ymm2 \n"                // B
    "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
    "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
    "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
    "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
    "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
    "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
    "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
    "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
    "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
    "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"         // A (0xffff)
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n"       // BG(low)
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"       // BG(hi)
    "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n"       // RA(low)
    "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n"       // RA(hi)
    "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n"       // BGRA(1)
    "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n"       // BGRA(3)
    "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n"       // BGRA(2)
    "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n"       // BGRA(4)
    "vmovdqu %%ymm3,(%3) \n"
    "vmovdqu %%ymm2,0x20(%3) \n"
    "vmovdqu %%ymm4,0x40(%3) \n"
    "vmovdqu %%ymm1,0x60(%3) \n"
    "lea 0x20(%0),%0 \n"
    "lea 0x80(%3),%3 \n"
    "subl $0x10,%4 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_r),     // %0
    "+r"(src_g),     // %1
    "+r"(src_b),     // %2
    "+r"(dst_ar64),  // %3
    "+r"(width)      // %4
  : "m"(shift),            // %5
    "m"(mask),             // %6
    "m"(MergeAR64Permute)  // %7
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
    "xmm7");
}
5881 #endif
5882
5883 #ifdef HAS_MERGEARGB16TO8ROW_AVX2
// Byte shuffle that interleaves the two 8-byte planar halves of an xmm
// lane: {B0..B7, G0..G7} -> {B0,G0, B1,G1, ...}.
static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11,
                                            4, 12, 5, 13, 6, 14, 7, 15};
// Merges 4 planes of 16-bit R, G, B, A samples (depth significant bits
// each) into interleaved 8-bit ARGB. Each sample is right-shifted by
// (depth - 8) to reduce it to 8 bits, then the planes are interleaved
// to B,G,R,A byte order. Processes 16 pixels per loop iteration.
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;  // right-shift that keeps the top 8 bits
  asm volatile(

      // Turn src_g/src_b/src_a into offsets from src_r.
      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"
      "vbroadcastf128 %7,%%ymm5 \n"  // interleave shuffle in both lanes
      "vmovd %6,%%xmm6 \n"           // shift count

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vmovdqu (%0,%3),%%ymm3 \n"  // A
      "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"  // reduce to 8 significant bits
      "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
      "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n"  // BG (planar)
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"  // RA (planar)
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"  // BG (interleave)
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // RA (interleave)
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"  // undo vpackuswb lane mutation
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA (low)
      "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n"  // BGRA (hi)
      "vmovdqu %%ymm2,(%4) \n"
      "vmovdqu %%ymm0,0x20(%4) \n"
      "lea 0x20(%0),%0 \n"  // 16 samples = 32 bytes of source
      "lea 0x40(%4),%4 \n"  // 16 pixels = 64 bytes of ARGB output
      "subl $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),    // %0
        "+r"(src_g),    // %1
        "+r"(src_b),    // %2
        "+r"(src_a),    // %3
        "+r"(dst_argb),  // %4
#if defined(__i386__)
        "+m"(width)  // %5  (too few registers on 32-bit x86)
#else
        "+rm"(width)  // %5
#endif
      : "m"(shift),                 // %6
        "m"(MergeARGB16To8Shuffle)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
5941 #endif
5942
5943 #ifdef HAS_MERGEXRGB16TO8ROW_AVX2
// Merges 3 planes of 16-bit R, G, B samples (depth significant bits
// each) into interleaved 8-bit ARGB with alpha forced to 0xff.
// Same interleave scheme as MergeARGB16To8Row_AVX2, with a constant
// alpha plane generated in ymm3. Processes 16 pixels per iteration.
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;  // right-shift that keeps the top 8 bits
  asm volatile(

      // Turn src_g/src_b into offsets from src_r.
      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "vbroadcastf128 %6,%%ymm5 \n"  // interleave shuffle in both lanes
      "vmovd %5,%%xmm6 \n"           // shift count
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrlw $8,%%ymm3,%%ymm3 \n"  // A (0xff per 16-bit element)

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"  // reduce to 8 significant bits
      "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n"  // BG (planar)
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"  // RA (planar)
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"  // BG (interleave)
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // RA (interleave)
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"  // undo vpackuswb lane mutation
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA (low)
      "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n"  // BGRA (hi)
      "vmovdqu %%ymm2,(%3) \n"
      "vmovdqu %%ymm0,0x20(%3) \n"
      "lea 0x20(%0),%0 \n"  // 16 samples = 32 bytes of source
      "lea 0x40(%3),%3 \n"  // 16 pixels = 64 bytes of ARGB output
      "subl $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),    // %0
        "+r"(src_g),    // %1
        "+r"(src_b),    // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "m"(shift),                 // %5
        "m"(MergeARGB16To8Shuffle)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
5992 #endif
5993
5994 #ifdef HAS_COPYROW_SSE2
// Copies width bytes, 32 bytes per iteration. Uses aligned moves when
// both src and dst are 16-byte aligned (label 1), otherwise unaligned
// moves (label 2). Overcopies to a multiple of 32 bytes if width is
// not one (libyuv rows are normally padded -- NOTE(review): confirm).
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Fall through to the unaligned loop if either pointer is
      // not 16-byte aligned.
      "test $0xf,%0 \n"
      "jne 2f \n"
      "test $0xf,%1 \n"
      "jne 2f \n"

      LABELALIGN
      "1: \n"  // aligned copy loop
      "movdqa (%0),%%xmm0 \n"
      "movdqa 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm0,(%1) \n"
      "movdqa %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "jmp 9f \n"

      LABELALIGN
      "2: \n"  // unaligned copy loop
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 2b \n"

      LABELALIGN "9: \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
6032 #endif // HAS_COPYROW_SSE2
6033
6034 #ifdef HAS_COPYROW_AVX
CopyRow_AVX(const uint8_t * src,uint8_t * dst,int width)6035 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
6036 asm volatile(
6037
6038 LABELALIGN
6039 "1: \n"
6040 "vmovdqu (%0),%%ymm0 \n"
6041 "vmovdqu 0x20(%0),%%ymm1 \n"
6042 "lea 0x40(%0),%0 \n"
6043 "vmovdqu %%ymm0,(%1) \n"
6044 "vmovdqu %%ymm1,0x20(%1) \n"
6045 "lea 0x40(%1),%1 \n"
6046 "sub $0x40,%2 \n"
6047 "jg 1b \n"
6048 : "+r"(src), // %0
6049 "+r"(dst), // %1
6050 "+r"(width) // %2
6051 :
6052 : "memory", "cc", "xmm0", "xmm1");
6053 }
6054 #endif // HAS_COPYROW_AVX
6055
6056 #ifdef HAS_COPYROW_ERMS
6057 // Multiple of 1.
// Copies exactly width bytes with "rep movsb" (fast on CPUs with
// Enhanced Rep MovSB). Handles any width, multiple of 1.
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  size_t width_tmp = (size_t)(width);  // rcx/ecx must hold the count
  asm volatile(

      "rep movsb \n"
      : "+S"(src),       // %0  rsi = source
        "+D"(dst),       // %1  rdi = destination
        "+c"(width_tmp)  // %2  rcx = byte count
      :
      : "memory", "cc");
}
6069 #endif // HAS_COPYROW_ERMS
6070
6071 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
6072 // width in pixels
// Copies only the alpha channel (byte 3 of every 4) from src into dst,
// preserving dst's B, G and R bytes. width is in pixels; processes
// 8 pixels (32 bytes) per iteration.
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm0,%%xmm0 \n"
      "pslld $0x18,%%xmm0 \n"  // xmm0 = 0xff000000 per pixel (alpha mask)
      "pcmpeqb %%xmm1,%%xmm1 \n"
      "psrld $0x8,%%xmm1 \n"  // xmm1 = 0x00ffffff per pixel (color mask)

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // source pixels
      "movdqu 0x10(%0),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "movdqu (%1),%%xmm4 \n"  // destination pixels
      "movdqu 0x10(%1),%%xmm5 \n"
      "pand %%xmm0,%%xmm2 \n"  // keep src alpha
      "pand %%xmm0,%%xmm3 \n"
      "pand %%xmm1,%%xmm4 \n"  // keep dst color
      "pand %%xmm1,%%xmm5 \n"
      "por %%xmm4,%%xmm2 \n"  // combine
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm2,(%1) \n"
      "movdqu %%xmm3,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
6104 #endif // HAS_ARGBCOPYALPHAROW_SSE2
6105
6106 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
6107 // width in pixels
// Copies only the alpha channel from src into dst, preserving dst's
// color bytes, using a byte blend. width is in pixels; processes
// 16 pixels (64 bytes) per iteration.
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"  // 0x00ffffff mask: selects dst color

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm1 \n"  // source pixels (alpha used)
      "vmovdqu 0x20(%0),%%ymm2 \n"
      "lea 0x40(%0),%0 \n"
      // Blend: take dst bytes where mask bit set (BGR), src alpha elsewhere.
      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm1,(%1) \n"
      "vmovdqu %%ymm2,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
6132 #endif // HAS_ARGBCOPYALPHAROW_AVX2
6133
6134 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
6135 // width in pixels
// Extracts the alpha channel (byte 3 of every 4) of ARGB pixels into
// a planar byte array. width is in pixels; processes 8 pixels per
// iteration.
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0), %%xmm0 \n"
      "movdqu 0x10(%0), %%xmm1 \n"
      "lea 0x20(%0), %0 \n"
      "psrld $0x18, %%xmm0 \n"  // alpha down to low byte of each dword
      "psrld $0x18, %%xmm1 \n"
      "packssdw %%xmm1, %%xmm0 \n"  // dwords -> words (values <= 255, safe)
      "packuswb %%xmm0, %%xmm0 \n"  // words -> bytes
      "movq %%xmm0,(%1) \n"  // store 8 alpha bytes
      "lea 0x8(%1), %1 \n"
      "sub $0x8, %2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
6160 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
6161
6162 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// Shuffle that moves the alpha byte of each ARGB pixel to the low byte
// of its dword and zeroes the rest (128u = zero the lane); equivalent
// to a psrld $0x18 on packed ARGB.
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

// Extracts the alpha channel of ARGB pixels into a planar byte array.
// width is in pixels; processes 32 pixels (128 source bytes) per
// iteration.
void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "vmovdqa %3,%%ymm4 \n"  // kPermdARGBToY_AVX: undoes pack mutation
      "vbroadcastf128 %4,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0), %%ymm0 \n"
      "vmovdqu 0x20(%0), %%ymm1 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // vpsrld $0x18, %%ymm0
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x40(%0), %%ymm2 \n"
      "vmovdqu 0x60(%0), %%ymm3 \n"
      "lea 0x80(%0), %0 \n"
      "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n"  // mutates
      "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // mutates
      "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
      "vpermd %%ymm0,%%ymm4,%%ymm0 \n"  // unmutate.
      "vmovdqu %%ymm0,(%1) \n"  // store 32 alpha bytes
      "lea 0x20(%1),%1 \n"
      "sub $0x20, %2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      : "m"(kPermdARGBToY_AVX),      // %3
        "m"(kShuffleAlphaShort_AVX2)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
6201 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
6202
6203 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
6204 // width in pixels
// Copies a row of 8-bit Y values into the alpha channel of 8 ARGB dst
// pixels per iteration, preserving dst's B, G and R bytes. width is in
// pixels.
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm0,%%xmm0 \n"
      "pslld $0x18,%%xmm0 \n"  // 0xff000000 alpha mask per pixel
      "pcmpeqb %%xmm1,%%xmm1 \n"
      "psrld $0x8,%%xmm1 \n"  // 0x00ffffff color mask per pixel

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm2 \n"  // 8 Y bytes
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm2,%%xmm2 \n"  // duplicate each Y into a word
      // NOTE(review): xmm3 is read uninitialized here; its stale words
      // land in the low half of each dword, which the 0xff000000 mask
      // below discards, so only the Y byte from xmm2 survives.
      "punpckhwd %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm2,%%xmm2 \n"
      "movdqu (%1),%%xmm4 \n"  // destination pixels
      "movdqu 0x10(%1),%%xmm5 \n"
      "pand %%xmm0,%%xmm2 \n"  // keep Y in alpha position
      "pand %%xmm0,%%xmm3 \n"
      "pand %%xmm1,%%xmm4 \n"  // keep dst color
      "pand %%xmm1,%%xmm5 \n"
      "por %%xmm4,%%xmm2 \n"  // combine
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm2,(%1) \n"
      "movdqu %%xmm3,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
6238 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
6239
6240 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
6241 // width in pixels
// Copies a row of 8-bit Y values into the alpha channel of 16 ARGB dst
// pixels per iteration, preserving dst's B, G and R bytes. width is in
// pixels.
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"  // 0x00ffffff: selects dst color

      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm1 \n"  // 8 Y bytes -> one dword each
      "vpmovzxbd 0x8(%0),%%ymm2 \n"
      "lea 0x10(%0),%0 \n"
      "vpslld $0x18,%%ymm1,%%ymm1 \n"  // move Y into alpha position
      "vpslld $0x18,%%ymm2,%%ymm2 \n"
      // Blend: dst bytes where mask set (BGR), Y in alpha elsewhere.
      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm1,(%1) \n"
      "vmovdqu %%ymm2,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
6268 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
6269
6270 #ifdef HAS_SETROW_X86
// Fills width bytes of dst with the byte v8 using "rep stosl".
// Writes width/4 dwords, so width is treated as a multiple of 4
// (any remainder bytes are not written).
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);         // dword count for stosl
  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile(

      "rep stosl \n"
      : "+D"(dst),       // %0  rdi = destination
        "+c"(width_tmp)  // %1  rcx = dword count
      : "a"(v32)         // %2  eax = fill value
      : "memory", "cc");
}
6282
// Fills exactly width bytes of dst with the byte v8 using "rep stosb"
// (fast on CPUs with Enhanced Rep MovSB/StoSB). Handles any width.
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width);  // rcx must hold the count
  asm volatile(

      "rep stosb \n"
      : "+D"(dst),       // %0  rdi = destination
        "+c"(width_tmp)  // %1  rcx = byte count
      : "a"(v8)          // %2  al = fill byte
      : "memory", "cc");
}
6293
// Fills width ARGB pixels (width dwords) of dst_argb with the 32-bit
// value v32 using "rep stosl". width is in pixels.
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  size_t width_tmp = (size_t)(width);  // rcx must hold the pixel count
  asm volatile(

      "rep stosl \n"
      : "+D"(dst_argb),  // %0  rdi = destination
        "+c"(width_tmp)  // %1  rcx = dword count
      : "a"(v32)         // %2  eax = pixel value
      : "memory", "cc");
}
6304 #endif // HAS_SETROW_X86
6305
6306 #ifdef HAS_YUY2TOYROW_SSE2
// Extracts luma from packed YUY2 (byte order Y0,U,Y1,V) into a planar
// Y row by masking the even bytes. width is in pixels; processes
// 16 pixels (32 source bytes) per iteration.
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // 0x00ff per word: selects even (Y) bytes

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"  // isolate Y bytes
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"  // pack 16 Y bytes
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6330
// Extracts U and V from packed YUY2 (Y0,U,Y1,V) into planar U and V
// rows, vertically averaging this row with the next (stride_yuy2
// bytes away) for 4:2:0 subsampling. width is in pixels; processes
// 16 pixels -> 8 U and 8 V bytes per iteration.
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"  // next row
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"  // average two rows
      "pavgb %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"  // keep odd bytes (U,V)
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"  // U,V,U,V,...
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"  // even bytes = U
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"  // odd bytes = V
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"  // 8 U bytes
      "movq %%xmm1,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
6370
// Extracts U and V from a single row of packed YUY2 (Y0,U,Y1,V) into
// planar U and V rows (4:2:2 -- no vertical averaging). width is in
// pixels; processes 16 pixels -> 8 U and 8 V bytes per iteration.
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"  // keep odd bytes (U,V)
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"  // U,V,U,V,...
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"  // even bytes = U
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"  // odd bytes = V
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"  // 8 U bytes
      "movq %%xmm1,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6405
// Extracts luma from packed UYVY (byte order U,Y0,V,Y1) into a planar
// Y row by taking the odd bytes. width is in pixels; processes
// 16 pixels (32 source bytes) per iteration.
void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"  // keep odd bytes (Y)
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"  // pack 16 Y bytes
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
6427
// Extracts U and V from packed UYVY (U,Y0,V,Y1) into planar U and V
// rows, vertically averaging this row with the next (stride_uyvy
// bytes away) for 4:2:0 subsampling. width is in pixels; processes
// 16 pixels -> 8 U and 8 V bytes per iteration.
void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"  // next row
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"  // average two rows
      "pavgb %%xmm3,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"  // keep even bytes (U,V)
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"  // U,V,U,V,...
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"  // even bytes = U
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"  // odd bytes = V
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"  // 8 U bytes
      "movq %%xmm1,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
6467
// Extracts U and V from a single row of packed UYVY (U,Y0,V,Y1) into
// planar U and V rows (4:2:2 -- no vertical averaging). width is in
// pixels; processes 16 pixels -> 8 U and 8 V bytes per iteration.
void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"  // keep even bytes (U,V)
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"  // U,V,U,V,...
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"  // even bytes = U
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"  // odd bytes = V
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"  // 8 U bytes
      "movq %%xmm1,0x00(%1,%2,1) \n"  // 8 V bytes
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6502 #endif // HAS_YUY2TOYROW_SSE2
6503
6504 #ifdef HAS_YUY2TOYROW_AVX2
// Extracts luma from packed YUY2 (Y0,U,Y1,V) into a planar Y row.
// width is in pixels; processes 32 pixels (64 source bytes) per
// iteration.
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // 0x00ff per word: selects Y bytes

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // isolate Y bytes
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // pack (lane-mutating)
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo lane mutation
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6530
// Extracts U and V from packed YUY2 (Y0,U,Y1,V) into planar U and V
// rows, vertically averaging this row with the next (stride_yuy2
// bytes away) for 4:2:0 subsampling. width is in pixels; processes
// 32 pixels -> 16 U and 16 V bytes per iteration.
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"  // average with next row
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // keep odd bytes (U,V)
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // pack (lane-mutating)
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo lane mutation
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"  // even bytes = U
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // odd bytes = V
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"  // 16 U bytes
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"  // 16 V bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6571
// Extracts U and V from a single row of packed YUY2 (Y0,U,Y1,V) into
// planar U and V rows (4:2:2 -- no vertical averaging). width is in
// pixels; processes 32 pixels -> 16 U and 16 V bytes per iteration.
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // keep odd bytes (U,V)
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // pack (lane-mutating)
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo lane mutation
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"  // even bytes = U
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // odd bytes = V
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"  // 16 U bytes
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"  // 16 V bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6609
// Extracts luma from packed UYVY (U,Y0,V,Y1) into a planar Y row by
// taking the odd bytes. width is in pixels; processes 32 pixels
// (64 source bytes) per iteration.
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // keep odd bytes (Y)
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // pack (lane-mutating)
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo lane mutation
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      // NOTE(review): xmm5 is listed but never written here; the extra
      // clobber is harmless.
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
// Extracts U and V from packed UYVY (U,Y0,V,Y1) into planar U and V
// rows, vertically averaging this row with the next (stride_uyvy
// bytes away) for 4:2:0 subsampling. width is in pixels; processes
// 32 pixels -> 16 U and 16 V bytes per iteration.
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"  // average with next row
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // keep even bytes (U,V)
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // pack (lane-mutating)
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo lane mutation
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"  // even bytes = U
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // odd bytes = V
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"  // 16 U bytes
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"  // 16 V bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6673
// Extracts U and V from a single row of packed UYVY (U,Y0,V,Y1) into
// planar U and V rows (4:2:2 -- no vertical averaging). width is in
// pixels; processes 32 pixels -> 16 U and 16 V bytes per iteration.
void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // 0x00ff per word mask
      "sub %1,%2 \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // keep even bytes (U,V)
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // pack (lane-mutating)
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo lane mutation
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"  // even bytes = U
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // odd bytes = V
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"  // 16 U bytes
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"  // 16 V bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6711 #endif // HAS_YUY2TOYROW_AVX2
6712
6713 #ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha: duplicates the alpha byte of each
// pixel into both bytes of two words (0x80 zeroes the high bytes).
static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Alpha-blends src_argb over src_argb1 into dst_argb using src's
// per-pixel alpha; the result alpha is forced to 0xff. Blend 8 pixels
// at a time in the main loop (actually 4 per iteration), then a 1-pixel
// tail loop for the remainder.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $0xf,%%xmm7 \n"  // xmm7 = 0x0001 per word (rounding +1)
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x8,%%xmm6 \n"  // xmm6 = 0x00ff per word (even-byte mask)
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psllw $0x8,%%xmm5 \n"  // xmm5 = 0xff00 per word (odd-byte mask)
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "pslld $0x18,%%xmm4 \n"  // xmm4 = 0xff000000 per pixel (alpha)
      "sub $0x4,%3 \n"
      "jl 49f \n"  // fewer than 4 pixels: go to tail

      // 4 pixel loop.
      LABELALIGN
      "40: \n"
      "movdqu (%0),%%xmm3 \n"
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm3,%%xmm0 \n"
      "pxor %%xmm4,%%xmm3 \n"  // invert alpha: 255 - a
      "movdqu (%1),%%xmm2 \n"
      "pshufb %4,%%xmm3 \n"  // spread (255-a) to words
      "pand %%xmm6,%%xmm2 \n"  // background even bytes
      "paddw %%xmm7,%%xmm3 \n"  // 256 - a
      "pmullw %%xmm3,%%xmm2 \n"
      "movdqu (%1),%%xmm1 \n"
      "lea 0x10(%1),%1 \n"
      "psrlw $0x8,%%xmm1 \n"  // background odd bytes
      "por %%xmm4,%%xmm0 \n"  // force result alpha to 0xff
      "pmullw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm2 \n"
      "paddusb %%xmm2,%%xmm0 \n"  // add attenuated background
      "pand %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jge 40b \n"

      "49: \n"
      "add $0x3,%3 \n"  // restore count for 1-pixel tail
      "jl 99f \n"

      // 1 pixel loop.
      "91: \n"
      "movd (%0),%%xmm3 \n"
      "lea 0x4(%0),%0 \n"
      "movdqa %%xmm3,%%xmm0 \n"
      "pxor %%xmm4,%%xmm3 \n"
      "movd (%1),%%xmm2 \n"
      "pshufb %4,%%xmm3 \n"
      "pand %%xmm6,%%xmm2 \n"
      "paddw %%xmm7,%%xmm3 \n"
      "pmullw %%xmm3,%%xmm2 \n"
      "movd (%1),%%xmm1 \n"
      "lea 0x4(%1),%1 \n"
      "psrlw $0x8,%%xmm1 \n"
      "por %%xmm4,%%xmm0 \n"
      "pmullw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm2 \n"
      "paddusb %%xmm2,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "lea 0x4(%2),%2 \n"
      "sub $0x1,%3 \n"
      "jge 91b \n"
      "99: \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      : "m"(kShuffleAlpha)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
6798 #endif // HAS_ARGBBLENDROW_SSSE3
6799
6800 #ifdef HAS_BLENDPLANEROW_SSSE3
6801 // Blend 8 pixels at a time.
6802 // unsigned version of math
6803 // =((A2*C2)+(B2*(255-C2))+255)/256
6804 // signed version of math
6805 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Blends two 8-bit planes by a per-pixel alpha plane:
// dst = (src0 * a + src1 * (255 - a) + 255) / 256, computed via the
// signed pmaddubsw form documented above. Blend 8 pixels at a time;
// width must be a multiple of 8 (no tail loop).
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psllw $0x8,%%xmm5 \n"  // 0xff00 per word: inverts alt. alpha bytes
      "mov $0x80808080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"  // xmm6 = 0x80 bias per byte
      "mov $0x807f807f,%%eax \n"
      "movd %%eax,%%xmm7 \n"
      "pshufd $0x0,%%xmm7,%%xmm7 \n"  // xmm7 = 32768+127 rounding per word
      // Index everything off the alpha pointer.
      "sub %2,%0 \n"
      "sub %2,%1 \n"
      "sub %2,%3 \n"

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%2),%%xmm0 \n"  // 8 alpha bytes
      "punpcklbw %%xmm0,%%xmm0 \n"  // a,a pairs per word
      "pxor %%xmm5,%%xmm0 \n"  // a, 255-a pairs
      "movq (%0,%2,1),%%xmm1 \n"  // 8 src0 bytes
      "movq (%1,%2,1),%%xmm2 \n"  // 8 src1 bytes
      "punpcklbw %%xmm2,%%xmm1 \n"  // interleave src0,src1
      "psubb %%xmm6,%%xmm1 \n"  // bias to signed range
      "pmaddubsw %%xmm1,%%xmm0 \n"  // a*(s0-128) + (255-a)*(s1-128)
      "paddw %%xmm7,%%xmm0 \n"  // un-bias and round
      "psrlw $0x8,%%xmm0 \n"  // /256
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%3,%2,1) \n"
      "lea 0x8(%2),%2 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
      ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
6850 #endif // HAS_BLENDPLANEROW_SSSE3
6851
6852 #ifdef HAS_BLENDPLANEROW_AVX2
6853 // Blend 32 pixels at a time.
6854 // unsigned version of math
6855 // =((A2*C2)+(B2*(255-C2))+255)/256
6856 // signed version of math
6857 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Blends two 8-bit planes by a per-pixel alpha plane:
// dst = (src0 * a + src1 * (255 - a) + 255) / 256, via the signed
// vpmaddubsw form documented above. Blend 32 pixels at a time;
// width must be a multiple of 32 (no tail loop).
void BlendPlaneRow_AVX2(const uint8_t* src0,
                        const uint8_t* src1,
                        const uint8_t* alpha,
                        uint8_t* dst,
                        int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsllw $0x8,%%ymm5,%%ymm5 \n"  // 0xff00: inverts alt. alpha bytes
      "mov $0x80808080,%%eax \n"
      "vmovd %%eax,%%xmm6 \n"
      "vbroadcastss %%xmm6,%%ymm6 \n"  // 0x80 bias per byte
      "mov $0x807f807f,%%eax \n"
      "vmovd %%eax,%%xmm7 \n"
      "vbroadcastss %%xmm7,%%ymm7 \n"  // 32768+127 rounding per word
      // Index everything off the alpha pointer.
      "sub %2,%0 \n"
      "sub %2,%1 \n"
      "sub %2,%3 \n"

      // 32 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%2),%%ymm0 \n"  // 32 alpha bytes
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"  // a,a pairs per word
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpxor %%ymm5,%%ymm3,%%ymm3 \n"  // a, 255-a pairs
      "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
      "vmovdqu (%0,%2,1),%%ymm1 \n"  // 32 src0 bytes
      "vmovdqu (%1,%2,1),%%ymm2 \n"  // 32 src1 bytes
      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"  // interleave src0,src1
      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
      "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"  // bias to signed range
      "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"  // a*(s0-128)+(255-a)*(s1-128)
      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"  // un-bias and round
      "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm3,%%ymm3 \n"  // /256
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"  // unpack/pack mutations cancel
      "vmovdqu %%ymm0,(%3,%2,1) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
      ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
6911 #endif // HAS_BLENDPLANEROW_AVX2
6912
6913 #ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha for ARGBAttenuateRow_SSSE3.
// kShuffleAlpha0 broadcasts the alpha byte of pixels 0 and 1 (source bytes 3
// and 7) across the six color-channel byte positions; pshufb index 128 yields
// zero, leaving the alpha word lane 0 (alpha is re-merged from a mask later).
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
// Same for pixels 2 and 3 (alpha bytes 11 and 15).
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time: each color channel is multiplied by the
// pixel's alpha (approximately c * a / 255 via pmulhuw on duplicated bytes);
// the alpha channel itself is passed through unchanged.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"  // xmm3 = 0xff000000 per pixel:
      "pslld $0x18,%%xmm3 \n"     //   mask to preserve alpha
      "movdqa %3,%%xmm4 \n"  // alpha shuffle for pixels 0-1
      "movdqa %4,%%xmm5 \n"  // alpha shuffle for pixels 2-3

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "pshufb %%xmm4,%%xmm0 \n"  // broadcast alpha of pixels 0-1
      "movdqu (%0),%%xmm1 \n"
      "punpcklbw %%xmm1,%%xmm1 \n"  // duplicate low pixel bytes to words
      "pmulhuw %%xmm1,%%xmm0 \n"  // high 16 bits of c*257 * a-pattern
      "movdqu (%0),%%xmm1 \n"
      "pshufb %%xmm5,%%xmm1 \n"  // broadcast alpha of pixels 2-3
      "movdqu (%0),%%xmm2 \n"
      "punpckhbw %%xmm2,%%xmm2 \n"  // duplicate high pixel bytes to words
      "pmulhuw %%xmm2,%%xmm1 \n"
      "movdqu (%0),%%xmm2 \n"
      "lea 0x10(%0),%0 \n"
      "pand %%xmm3,%%xmm2 \n"  // keep original alpha bytes
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"  // merge alpha back into result
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),       // %0
        "+r"(dst_argb),       // %1
        "+r"(width)           // %2
      : "m"(kShuffleAlpha0),  // %3
        "m"(kShuffleAlpha1)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
6960 #endif // HAS_ARGBATTENUATEROW_SSSE3
6961
6962 #ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha for ARGBAttenuateRow_AVX2.  Applied after
// punpck[lh]bw has doubled each byte, so indices 6,7 / 14,15 pick the
// duplicated alpha words; index 128 yields zero (alpha result lanes).
static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
                                         128u, 128u, 14u, 15u, 14u, 15u,
                                         14u, 15u, 128u, 128u};
// Attenuate 8 pixels at a time (32 bytes per ymm load): multiplies each color
// channel by its pixel's alpha; alpha channel is preserved via the 0xff000000
// mask in ymm5.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"  // alpha duplication shuffle
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpslld $0x18,%%ymm5,%%ymm5 \n"  // ymm5 = 0xff000000 alpha mask
      "sub %0,%1 \n"  // dst indexed relative to src

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm6 \n"
      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"  // duplicate bytes to words
      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
      "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"  // broadcast alpha words
      "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"  // c * a, high 16 bits
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm6,%%ymm6 \n"  // keep original alpha
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpor %%ymm6,%%ymm0,%%ymm0 \n"  // merge alpha back in
      "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),           // %0
        "+r"(dst_argb),           // %1
        "+r"(width)               // %2
      : "m"(kShuffleAlpha_AVX2)   // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
7003 #endif // HAS_ARGBATTENUATEROW_AVX2
7004
7005 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time: undoes premultiplied alpha by multiplying
// each color channel with a per-alpha reciprocal looked up in fixed_invtbl8
// (one 32-bit entry per alpha value; indexed by the alpha byte of each pixel).
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register for the alpha byte / table index
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movzb 0x03(%0),%3 \n"  // alpha of pixel 0
      "punpcklbw %%xmm0,%%xmm0 \n"  // pixels 0-1 bytes doubled to words
      "movd 0x00(%4,%3,4),%%xmm2 \n"  // reciprocal for pixel 0
      "movzb 0x07(%0),%3 \n"  // alpha of pixel 1
      "movd 0x00(%4,%3,4),%%xmm3 \n"
      // pshuflw $0x40 = word pattern (0,0,0,1): replicate the entry's low
      // word into the three color lanes; lane 3 gets the entry's high word.
      "pshuflw $0x40,%%xmm2,%%xmm2 \n"
      "pshuflw $0x40,%%xmm3,%%xmm3 \n"
      "movlhps %%xmm3,%%xmm2 \n"
      "pmulhuw %%xmm2,%%xmm0 \n"  // c * inv(a), high 16 bits
      "movdqu (%0),%%xmm1 \n"
      "movzb 0x0b(%0),%3 \n"  // alpha of pixel 2
      "punpckhbw %%xmm1,%%xmm1 \n"
      "movd 0x00(%4,%3,4),%%xmm2 \n"
      "movzb 0x0f(%0),%3 \n"  // alpha of pixel 3
      "movd 0x00(%4,%3,4),%%xmm3 \n"
      "pshuflw $0x40,%%xmm2,%%xmm2 \n"
      "pshuflw $0x40,%%xmm3,%%xmm3 \n"
      "movlhps %%xmm3,%%xmm2 \n"
      "pmulhuw %%xmm2,%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_argb),     // %1
        "+r"(width),        // %2
        "=&r"(alpha)        // %3
      : "r"(fixed_invtbl8)  // %4
      // NOTE(review): xmm4/xmm5 are listed but not used here -- harmless.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
7048 #endif // HAS_ARGBUNATTENUATEROW_SSE2
7049
7050 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating the per-pixel reciprocal words for
// ARGBUnattenuateRow_AVX2 (applied after punpck[lh]wd has doubled the words).
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time: like the SSE2 version, but gathers the
// eight fixed_invtbl8 reciprocals with scalar loads (deliberately avoiding
// VPGATHER) and applies them with 16-bit multiplies.
void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register for the alpha byte / table index
  asm volatile(
      "sub %0,%1 \n"  // dst indexed relative to src
      "vbroadcastf128 %5,%%ymm5 \n"  // reciprocal duplication shuffle

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      // replace VPGATHER: load the 8 table entries (indexed by the alpha
      // byte of each of the 8 pixels) and pack them into ymm3.
      "movzb 0x03(%0),%3 \n"
      "vmovd 0x00(%4,%3,4),%%xmm0 \n"
      "movzb 0x07(%0),%3 \n"
      "vmovd 0x00(%4,%3,4),%%xmm1 \n"
      "movzb 0x0b(%0),%3 \n"
      "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
      "vmovd 0x00(%4,%3,4),%%xmm2 \n"
      "movzb 0x0f(%0),%3 \n"
      "vmovd 0x00(%4,%3,4),%%xmm3 \n"
      "movzb 0x13(%0),%3 \n"
      "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
      "vmovd 0x00(%4,%3,4),%%xmm0 \n"
      "movzb 0x17(%0),%3 \n"
      "vmovd 0x00(%4,%3,4),%%xmm1 \n"
      "movzb 0x1b(%0),%3 \n"
      "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
      "vmovd 0x00(%4,%3,4),%%xmm2 \n"
      "movzb 0x1f(%0),%3 \n"
      "vmovd 0x00(%4,%3,4),%%xmm3 \n"
      "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
      // end of VPGATHER

      "vmovdqu (%0),%%ymm6 \n"  // 8 source pixels
      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"  // double pixel bytes to words
      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
      "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"  // double reciprocal words
      "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
      "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"  // spread across color lanes
      "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"  // c * inv(a)
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),                 // %0
        "+r"(dst_argb),                 // %1
        "+r"(width),                    // %2
        "=&r"(alpha)                    // %3
      : "r"(fixed_invtbl8),             // %4
        "m"(kUnattenShuffleAlpha_AVX2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7116 #endif // HAS_ARGBUNATTENUATEROW_AVX2
7117
7118 #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Gray is the full-range (JPeg) luma from kARGBToYJ, written to B, G and R;
// the original alpha channel is preserved.
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"  // kARGBToYJ coefficients
      "movdqa %4,%%xmm5 \n"  // kSub128 bias

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "psubb %%xmm5,%%xmm0 \n"  // bias pixels to signed range
      "psubb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm4,%%xmm6 \n"
      "pmaddubsw %%xmm0,%%xmm6 \n"  // weighted channel sums
      "movdqu %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm1,%%xmm0 \n"
      "phaddw %%xmm0,%%xmm6 \n"  // finish per-pixel dot products
      "paddw %%xmm5,%%xmm6 \n"  // re-add bias (and round)
      "psrlw $0x8,%%xmm6 \n"  // /256 -> 8 gray values
      "packuswb %%xmm6,%%xmm6 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqu 0x10(%0),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "psrld $0x18,%%xmm2 \n"  // extract the 8 alpha bytes
      "psrld $0x18,%%xmm3 \n"
      "packuswb %%xmm3,%%xmm2 \n"
      "packuswb %%xmm2,%%xmm2 \n"
      // Interleave gray,gray,gray,alpha back into ARGB pixels.
      "movdqa %%xmm6,%%xmm3 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "punpcklbw %%xmm2,%%xmm3 \n"
      "movdqa %%xmm6,%%xmm1 \n"
      "punpcklwd %%xmm3,%%xmm6 \n"
      "punpckhwd %%xmm3,%%xmm1 \n"
      "movdqu %%xmm6,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
7165 #endif // HAS_ARGBGRAYROW_SSSE3
7166
7167 #ifdef HAS_ARGBSEPIAROW_SSSE3
// Sepia tone weights (7-bit fixed point, applied with pmaddubsw + psrlw 7):
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.  Byte order within each group is
// B,G,R,A to match memory-order ARGB pixels; the alpha weight is 0.
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};
7180
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
// Each output channel is a weighted sum of the input B,G,R (tables above);
// alpha is preserved.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa %2,%%xmm2 \n"  // kARGBToSepiaB
      "movdqa %3,%%xmm3 \n"  // kARGBToSepiaG
      "movdqa %4,%%xmm4 \n"  // kARGBToSepiaR

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      // New blue channel for all 8 pixels.
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm6 \n"
      "pmaddubsw %%xmm2,%%xmm0 \n"
      "pmaddubsw %%xmm2,%%xmm6 \n"
      "phaddw %%xmm6,%%xmm0 \n"
      "psrlw $0x7,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      // New green channel.
      "movdqu (%0),%%xmm5 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm5 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "phaddw %%xmm1,%%xmm5 \n"
      "psrlw $0x7,%%xmm5 \n"
      "packuswb %%xmm5,%%xmm5 \n"
      "punpcklbw %%xmm5,%%xmm0 \n"  // interleave B,G
      // New red channel.
      "movdqu (%0),%%xmm5 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm5 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "phaddw %%xmm1,%%xmm5 \n"
      "psrlw $0x7,%%xmm5 \n"
      "packuswb %%xmm5,%%xmm5 \n"
      // Original alpha channel.
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "psrld $0x18,%%xmm6 \n"
      "psrld $0x18,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm5 \n"  // interleave R,A
      // Interleave BG with RA into full pixels and store back.
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"
      "punpckhwd %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%0) \n"
      "movdqu %%xmm1,0x10(%0) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x8,%1 \n"
      "jg 1b \n"
      : "+r"(dst_argb),      // %0
        "+r"(width)          // %1
      : "m"(kARGBToSepiaB),  // %2
        "m"(kARGBToSepiaG),  // %3
        "m"(kARGBToSepiaR)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
7235 #endif // HAS_ARGBSEPIAROW_SSSE3
7236
7237 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with a 4x4 signed-byte color matrix.
// Same structure as Sepia except the four coefficient rows come from
// matrix_argb (16 signed bytes, one row per output channel); results are
// scaled by >> 6 (psraw $6).
void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const int8_t* matrix_argb,
                              int width) {
  asm volatile(
      // Broadcast each 4-byte matrix row across a register.
      "movdqu (%3),%%xmm5 \n"
      "pshufd $0x00,%%xmm5,%%xmm2 \n"  // row 0 (output B)
      "pshufd $0x55,%%xmm5,%%xmm3 \n"  // row 1 (output G)
      "pshufd $0xaa,%%xmm5,%%xmm4 \n"  // row 2 (output R)
      "pshufd $0xff,%%xmm5,%%xmm5 \n"  // row 3 (output A)

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm7 \n"
      "pmaddubsw %%xmm2,%%xmm0 \n"  // B dot products
      "pmaddubsw %%xmm2,%%xmm7 \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"  // G dot products
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "phaddsw %%xmm7,%%xmm0 \n"  // saturating horizontal add
      "phaddsw %%xmm1,%%xmm6 \n"
      "psraw $0x6,%%xmm0 \n"  // scale fixed-point result
      "psraw $0x6,%%xmm6 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm0 \n"  // interleave B,G
      "movdqu (%0),%%xmm1 \n"
      "movdqu 0x10(%0),%%xmm7 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"  // R dot products
      "pmaddubsw %%xmm4,%%xmm7 \n"
      "phaddsw %%xmm7,%%xmm1 \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x10(%0),%%xmm7 \n"
      "pmaddubsw %%xmm5,%%xmm6 \n"  // A dot products
      "pmaddubsw %%xmm5,%%xmm7 \n"
      "phaddsw %%xmm7,%%xmm6 \n"
      "psraw $0x6,%%xmm1 \n"
      "psraw $0x6,%%xmm6 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm1 \n"  // interleave R,A
      "movdqa %%xmm0,%%xmm6 \n"
      "punpcklwd %%xmm1,%%xmm0 \n"  // BG + RA -> BGRA pixels
      "punpckhwd %%xmm1,%%xmm6 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm6,0x10(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7300 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
7301
7302 #ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes) in place:
//   c = ((c * scale) >> 16) * interval_size + interval_offset
// for each color channel; alpha is preserved via the 0xff000000 mask.
void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      // Broadcast the three 16-bit parameters across word lanes
      // (pshuflw $0x40 + pshufd $0x44 replicate the low word).
      "movd %2,%%xmm2 \n"
      "movd %3,%%xmm3 \n"
      "movd %4,%%xmm4 \n"
      "pshuflw $0x40,%%xmm2,%%xmm2 \n"
      "pshufd $0x44,%%xmm2,%%xmm2 \n"
      "pshuflw $0x40,%%xmm3,%%xmm3 \n"
      "pshufd $0x44,%%xmm3,%%xmm3 \n"
      "pshuflw $0x40,%%xmm4,%%xmm4 \n"
      "pshufd $0x44,%%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"  // zero for byte->word widening
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "pslld $0x18,%%xmm6 \n"  // 0xff000000 alpha mask

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "punpcklbw %%xmm5,%%xmm0 \n"  // low 2 pixels to words
      "pmulhuw %%xmm2,%%xmm0 \n"  // (c * scale) >> 16
      "movdqu (%0),%%xmm1 \n"
      "punpckhbw %%xmm5,%%xmm1 \n"  // high 2 pixels to words
      "pmulhuw %%xmm2,%%xmm1 \n"
      "pmullw %%xmm3,%%xmm0 \n"  // * interval_size
      "movdqu (%0),%%xmm7 \n"
      "pmullw %%xmm3,%%xmm1 \n"
      "pand %%xmm6,%%xmm7 \n"  // keep original alpha
      "paddw %%xmm4,%%xmm0 \n"  // + interval_offset
      "paddw %%xmm4,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "por %%xmm7,%%xmm0 \n"
      "movdqu %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x4,%1 \n"
      "jg 1b \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7352 #endif // HAS_ARGBQUANTIZEROW_SSE2
7353
7354 #ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by the specified packed value: each channel is
// multiplied by the corresponding channel of 'value' (approximately
// c * v / 255 via duplicated-byte pmulhuw).
void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "movd %3,%%xmm2 \n"
      "punpcklbw %%xmm2,%%xmm2 \n"  // duplicate value bytes to words
      "punpcklqdq %%xmm2,%%xmm2 \n"  // replicate for both pixels per half

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"  // duplicate pixel bytes to words
      "punpckhbw %%xmm1,%%xmm1 \n"
      "pmulhuw %%xmm2,%%xmm0 \n"  // c * v, high 16 bits
      "pmulhuw %%xmm2,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
7388 #endif // HAS_ARGBSHADEROW_SSE2
7389
7390 #ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.  Per channel:
// approximately c0 * c1 / 255, computed as pmulhuw of a duplicated byte
// (c0*257) against a zero-extended byte (c1).
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "pxor %%xmm5,%%xmm5 \n"  // zero for byte->word widening

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movdqu (%1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "movdqu %%xmm0,%%xmm1 \n"  // register copy (movdqu reg,reg)
      "movdqu %%xmm2,%%xmm3 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"  // src0 bytes duplicated
      "punpckhbw %%xmm1,%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm2 \n"  // src1 bytes zero-extended
      "punpckhbw %%xmm5,%%xmm3 \n"
      "pmulhuw %%xmm2,%%xmm0 \n"  // high 16 bits of product
      "pmulhuw %%xmm3,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7427 #endif // HAS_ARGBMULTIPLYROW_SSE2
7428
7429 #ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Same math as the SSE2 version, widened to ymm registers.
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"  // zero for byte->word widening

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm1 \n"
      "lea 0x20(%0),%0 \n"
      "vmovdqu (%1),%%ymm3 \n"
      "lea 0x20(%1),%1 \n"
      "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"  // src0 bytes duplicated
      "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"  // src1 bytes zero-extended
      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"  // high 16 bits of product
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%2) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x8,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7465 #endif // HAS_ARGBMULTIPLYROW_AVX2
7466
7467 #ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together with unsigned saturation (paddusb),
// 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movdqu (%1),%%xmm1 \n"
      "lea 0x10(%1),%1 \n"
      "paddusb %%xmm1,%%xmm0 \n"  // saturating per-byte add
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
7493 #endif // HAS_ARGBADDROW_SSE2
7494
7495 #ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together with unsigned saturation,
// 8 pixels at a time (loop decrements width by 8, 0x20 bytes per iteration).
void ARGBAddRow_AVX2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vpaddusb (%1),%%ymm0,%%ymm0 \n"  // saturating per-byte add
      "lea 0x20(%1),%1 \n"
      "vmovdqu %%ymm0,(%2) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x8,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
7521 #endif // HAS_ARGBADDROW_AVX2
7522
7523 #ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels (dst = src - src1) with unsigned saturation
// (psubusb clamps at 0), 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movdqu (%1),%%xmm1 \n"
      "lea 0x10(%1),%1 \n"
      "psubusb %%xmm1,%%xmm0 \n"  // saturating per-byte subtract
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
7549 #endif // HAS_ARGBSUBTRACTROW_SSE2
7550
7551 #ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels with unsigned saturation, 8 pixels at a
// time (loop decrements width by 8, 0x20 bytes per iteration).
void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vpsubusb (%1),%%ymm0,%%ymm0 \n"  // saturating per-byte subtract
      "lea 0x20(%1),%1 \n"
      "vmovdqu %%ymm0,(%2) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x8,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
7577 #endif // HAS_ARGBSUBTRACTROW_AVX2
7578
7579 #ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
// Computes |(r0[x]-r0[x+2]) + 2*(r1[x]-r1[x+2]) + (r2[x]-r2[x+2])| for
// 8 pixels per iteration; result is saturated to bytes.
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      // Index all rows and dst relative to src_y0.
      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"
      "pxor %%xmm5,%%xmm5 \n"  // zero for byte->word widening

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      // Row 0: difference of columns x and x+2.
      "movq (%0),%%xmm0 \n"
      "movq 0x2(%0),%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm0 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "psubw %%xmm1,%%xmm0 \n"
      // Row 1 difference (added twice below for weight 2).
      "movq 0x00(%0,%1,1),%%xmm1 \n"
      "movq 0x02(%0,%1,1),%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm2 \n"
      "psubw %%xmm2,%%xmm1 \n"
      // Row 2 difference.
      "movq 0x00(%0,%2,1),%%xmm2 \n"
      "movq 0x02(%0,%2,1),%%xmm3 \n"
      "punpcklbw %%xmm5,%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm3 \n"
      "psubw %%xmm3,%%xmm2 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm1,%%xmm0 \n"
      "paddw %%xmm1,%%xmm0 \n"  // row 1 weighted x2
      // Absolute value: max(x, -x).
      "pxor %%xmm1,%%xmm1 \n"
      "psubw %%xmm0,%%xmm1 \n"
      "pmaxsw %%xmm1,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,0x00(%0,%3,1) \n"
      "lea 0x8(%0),%0 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7632 #endif // HAS_SOBELXROW_SSE2
7633
7634 #ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Computes |(r0[x]-r1[x]) + 2*(r0[x+1]-r1[x+1]) + (r0[x+2]-r1[x+2])| for
// 8 pixels per iteration; result is saturated to bytes.
void SobelYRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      // Index the second row and dst relative to src_y0.
      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "pxor %%xmm5,%%xmm5 \n"  // zero for byte->word widening

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      // Column x: row0 - row1.
      "movq (%0),%%xmm0 \n"
      "movq 0x00(%0,%1,1),%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm0 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "psubw %%xmm1,%%xmm0 \n"
      // Column x+1 (added twice below for weight 2).
      "movq 0x1(%0),%%xmm1 \n"
      "movq 0x01(%0,%1,1),%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm2 \n"
      "psubw %%xmm2,%%xmm1 \n"
      // Column x+2.
      "movq 0x2(%0),%%xmm2 \n"
      "movq 0x02(%0,%1,1),%%xmm3 \n"
      "punpcklbw %%xmm5,%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm3 \n"
      "psubw %%xmm3,%%xmm2 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm1,%%xmm0 \n"
      "paddw %%xmm1,%%xmm0 \n"  // center column weighted x2
      // Absolute value: max(x, -x).
      "pxor %%xmm1,%%xmm1 \n"
      "psubw %%xmm0,%%xmm1 \n"
      "pmaxsw %%xmm1,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,0x00(%0,%2,1) \n"
      "lea 0x8(%0),%0 \n"
      "sub $0x8,%3 \n"
      "jg 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7684 #endif // HAS_SOBELYROW_SSE2
7685
7686 #ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Processes 16 sobel values (16 output ARGB pixels, 64 bytes) per iteration.
void SobelRow_SSE2(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "sub %0,%1 \n"  // sobely indexed relative to sobelx
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"  // 0xff000000: alpha = 255

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "paddusb %%xmm1,%%xmm0 \n"  // sobel = sobelx + sobely (saturated)
      // Expand each sobel byte to s,s,s,255 ARGB pixels.
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm2 \n"
      "punpckhbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "punpcklwd %%xmm2,%%xmm1 \n"
      "punpckhwd %%xmm2,%%xmm2 \n"
      "por %%xmm5,%%xmm1 \n"  // set alpha to 255
      "por %%xmm5,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "punpcklwd %%xmm0,%%xmm3 \n"
      "punpckhwd %%xmm0,%%xmm0 \n"
      "por %%xmm5,%%xmm3 \n"
      "por %%xmm5,%%xmm0 \n"
      "movdqu %%xmm1,(%2) \n"
      "movdqu %%xmm2,0x10(%2) \n"
      "movdqu %%xmm3,0x20(%2) \n"
      "movdqu %%xmm0,0x30(%2) \n"
      "lea 0x40(%2),%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7735 #endif // HAS_SOBELROW_SSE2
7736
7737 #ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y (saturated) and stores Sobel into a single plane,
// 16 pixels per iteration.
void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      "sub %0,%1 \n"  // sobely indexed relative to sobelx
      // NOTE(review): xmm5 is initialized here but the result is unused in
      // this plane-only variant (no alpha channel to set).
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "paddusb %%xmm1,%%xmm0 \n"  // sobel = sobelx + sobely (saturated)
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
7766 #endif // HAS_SOBELTOPLANEROW_SSE2
7767
7768 #ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Processes 16 pixels (64 output bytes) per iteration.
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "sub %0,%1 \n"  // sobely indexed relative to sobelx
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff bytes: alpha channel

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // sobelx
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"  // sobely
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "paddusb %%xmm1,%%xmm2 \n"  // sobel = sobelx + sobely
      // Build byte pairs (sobelx, 255) and (sobely, sobel), then interleave
      // into B,G,R,A = sobely, sobel, sobelx, 255.
      "movdqa %%xmm0,%%xmm3 \n"
      "punpcklbw %%xmm5,%%xmm3 \n"
      "punpckhbw %%xmm5,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "punpcklbw %%xmm2,%%xmm4 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "punpcklwd %%xmm3,%%xmm6 \n"
      "punpckhwd %%xmm3,%%xmm4 \n"
      "movdqa %%xmm1,%%xmm7 \n"
      "punpcklwd %%xmm0,%%xmm7 \n"
      "punpckhwd %%xmm0,%%xmm1 \n"
      "movdqu %%xmm6,(%2) \n"
      "movdqu %%xmm4,0x10(%2) \n"
      "movdqu %%xmm7,0x20(%2) \n"
      "movdqu %%xmm1,0x30(%2) \n"
      "lea 0x40(%2),%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7817 #endif // HAS_SOBELXYROW_SSE2
7818
7819 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// Widens each source byte to 32 bits, keeps a running row sum in xmm0, and
// adds the previous row's cumulative sums.  Uses the fast 4-pixel (16-byte)
// loop only when cumsum is 16-byte aligned; otherwise falls back to the
// 1-pixel loop.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  asm volatile(
      "pxor %%xmm0,%%xmm0 \n"  // running sum accumulator
      "pxor %%xmm1,%%xmm1 \n"  // constant zero for widening
      "sub $0x4,%3 \n"
      "jl 49f \n"
      "test $0xf,%1 \n"  // require 16-byte aligned cumsum
      "jne 49f \n"

      // 4 pixel loop.
      LABELALIGN
      "40: \n"
      "movdqu (%0),%%xmm2 \n"  // 4 pixels (16 bytes)
      "lea 0x10(%0),%0 \n"
      // Widen the 16 bytes into four registers of 4x32-bit values.
      "movdqa %%xmm2,%%xmm4 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm1,%%xmm2 \n"
      "punpckhwd %%xmm1,%%xmm3 \n"
      "punpckhbw %%xmm1,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "punpcklwd %%xmm1,%%xmm4 \n"
      "punpckhwd %%xmm1,%%xmm5 \n"
      // Accumulate running sums and add previous_cumsum per pixel.
      "paddd %%xmm2,%%xmm0 \n"
      "movdqu (%2),%%xmm2 \n"
      "paddd %%xmm0,%%xmm2 \n"
      "paddd %%xmm3,%%xmm0 \n"
      "movdqu 0x10(%2),%%xmm3 \n"
      "paddd %%xmm0,%%xmm3 \n"
      "paddd %%xmm4,%%xmm0 \n"
      "movdqu 0x20(%2),%%xmm4 \n"
      "paddd %%xmm0,%%xmm4 \n"
      "paddd %%xmm5,%%xmm0 \n"
      "movdqu 0x30(%2),%%xmm5 \n"
      "lea 0x40(%2),%2 \n"
      "paddd %%xmm0,%%xmm5 \n"
      "movdqu %%xmm2,(%1) \n"
      "movdqu %%xmm3,0x10(%1) \n"
      "movdqu %%xmm4,0x20(%1) \n"
      "movdqu %%xmm5,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x4,%3 \n"
      "jge 40b \n"

      "49: \n"
      "add $0x3,%3 \n"  // restore remainder count
      "jl 19f \n"

      // 1 pixel loop.
      LABELALIGN
      "10: \n"
      "movd (%0),%%xmm2 \n"  // one pixel (4 bytes)
      "lea 0x4(%0),%0 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "punpcklwd %%xmm1,%%xmm2 \n"
      "paddd %%xmm2,%%xmm0 \n"  // running sum
      "movdqu (%2),%%xmm2 \n"
      "lea 0x10(%2),%2 \n"
      "paddd %%xmm0,%%xmm2 \n"  // + previous row cumulative
      "movdqu %%xmm2,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x1,%3 \n"
      "jge 10b \n"

      "19: \n"
      : "+r"(row),              // %0
        "+r"(cumsum),           // %1
        "+r"(previous_cumsum),  // %2
        "+r"(width)             // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
7897 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
7898
7899 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Compute one row of box-filter averages from a summed-area (cumulative sum)
// table.  Each output pixel is (topleft - topright - botleft + botright) /
// area, where the four corners are read from two cumulative-sum rows.
// topleft: cumulative row above the box. botleft: cumulative row below.
// width:   box width in pixels; used as the scaled column offset (%4).
// area:    pixel count of the box; divisor, approximated via rcpss.
// dst:     output ARGB row. count: number of pixels to produce.
// Small areas (<= 0x80) use a 16-bit pmulhuw reciprocal path; larger areas
// convert to float and multiply by 1/area.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  asm volatile(
      "movd %5,%%xmm5 \n"
      "cvtdq2ps %%xmm5,%%xmm5 \n"
      "rcpss %%xmm5,%%xmm4 \n"  // xmm4 = approx 1/area in all lanes
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "sub $0x4,%3 \n"
      "jl 49f \n"
      "cmpl $0x80,%5 \n"  // choose 16-bit path for small areas
      "ja 40f \n"

      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrld $0x10,%%xmm6 \n"
      "cvtdq2ps %%xmm6,%%xmm6 \n"
      "addps %%xmm6,%%xmm5 \n"
      "mulps %%xmm4,%%xmm5 \n"
      "cvtps2dq %%xmm5,%%xmm5 \n"
      "packssdw %%xmm5,%%xmm5 \n"

      // 4 pixel small loop.
      LABELALIGN
      "4: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "psubd 0x00(%0,%4,4),%%xmm0 \n"
      "psubd 0x10(%0,%4,4),%%xmm1 \n"
      "psubd 0x20(%0,%4,4),%%xmm2 \n"
      "psubd 0x30(%0,%4,4),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "psubd (%1),%%xmm0 \n"
      "psubd 0x10(%1),%%xmm1 \n"
      "psubd 0x20(%1),%%xmm2 \n"
      "psubd 0x30(%1),%%xmm3 \n"
      "paddd 0x00(%1,%4,4),%%xmm0 \n"
      "paddd 0x10(%1,%4,4),%%xmm1 \n"
      "paddd 0x20(%1,%4,4),%%xmm2 \n"
      "paddd 0x30(%1,%4,4),%%xmm3 \n"
      "lea 0x40(%1),%1 \n"
      "packssdw %%xmm1,%%xmm0 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm0 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "packuswb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jge 4b \n"
      "jmp 49f \n"

      // 4 pixel loop (float path for large areas).
      LABELALIGN
      "40: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "psubd 0x00(%0,%4,4),%%xmm0 \n"
      "psubd 0x10(%0,%4,4),%%xmm1 \n"
      "psubd 0x20(%0,%4,4),%%xmm2 \n"
      "psubd 0x30(%0,%4,4),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "psubd (%1),%%xmm0 \n"
      "psubd 0x10(%1),%%xmm1 \n"
      "psubd 0x20(%1),%%xmm2 \n"
      "psubd 0x30(%1),%%xmm3 \n"
      "paddd 0x00(%1,%4,4),%%xmm0 \n"
      "paddd 0x10(%1,%4,4),%%xmm1 \n"
      "paddd 0x20(%1,%4,4),%%xmm2 \n"
      "paddd 0x30(%1,%4,4),%%xmm3 \n"
      "lea 0x40(%1),%1 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm1,%%xmm1 \n"
      "mulps %%xmm4,%%xmm0 \n"
      "mulps %%xmm4,%%xmm1 \n"
      "cvtdq2ps %%xmm2,%%xmm2 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "cvtps2dq %%xmm0,%%xmm0 \n"
      "cvtps2dq %%xmm1,%%xmm1 \n"
      "cvtps2dq %%xmm2,%%xmm2 \n"
      "cvtps2dq %%xmm3,%%xmm3 \n"
      "packssdw %%xmm1,%%xmm0 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "packuswb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jge 40b \n"

      "49: \n"
      "add $0x3,%3 \n"
      "jl 19f \n"

      // 1 pixel loop (remainder).
      LABELALIGN
      "10: \n"
      "movdqu (%0),%%xmm0 \n"
      "psubd 0x00(%0,%4,4),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "psubd (%1),%%xmm0 \n"
      "paddd 0x00(%1,%4,4),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "mulps %%xmm4,%%xmm0 \n"
      "cvtps2dq %%xmm0,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "lea 0x4(%2),%2 \n"
      "sub $0x1,%3 \n"
      "jge 10b \n"
      "19: \n"
      : "+r"(topleft),  // %0
        "+r"(botleft),  // %1
        "+r"(dst),      // %2
        "+rm"(count)    // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
8029 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8030
8031 #ifdef HAS_ARGBAFFINEROW_SSE2
8032 // Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
// Copy ARGB pixels from a source image along an affine-transformed path into
// one destination row.  src_dudv holds 4 floats: start {u, v} then per-pixel
// step {du, dv}.  For each output pixel the (u,v) coordinate is truncated to
// integers and turned into a byte offset via pmaddwd with the packed
// {4, stride} multiplier in xmm5.  No bounds checking is performed; the
// caller must guarantee all sampled coordinates are inside the source image.
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                        int src_argb_stride,
                        uint8_t* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile(
      "movq (%3),%%xmm2 \n"       // xmm2 = start u,v
      "movq 0x08(%3),%%xmm7 \n"   // xmm7 = du,dv
      "shl $0x10,%1 \n"           // pack {4, stride} into one register
      "add $0x4,%1 \n"
      "movd %1,%%xmm5 \n"
      "sub $0x4,%4 \n"
      "jl 49f \n"

      "pshufd $0x44,%%xmm7,%%xmm7 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "addps %%xmm7,%%xmm0 \n"
      "movlhps %%xmm0,%%xmm2 \n"  // xmm2 = coords for pixels 0 and 1
      "movdqa %%xmm7,%%xmm4 \n"
      "addps %%xmm4,%%xmm4 \n"    // xmm4 = 2 * {du,dv}
      "movdqa %%xmm2,%%xmm3 \n"
      "addps %%xmm4,%%xmm3 \n"    // xmm3 = coords for pixels 2 and 3
      "addps %%xmm4,%%xmm4 \n"    // xmm4 = 4 * {du,dv} per-iteration step

      // 4 pixel loop
      LABELALIGN
      "40: \n"
      "cvttps2dq %%xmm2,%%xmm0 \n"  // x,y float->int first 2
      "cvttps2dq %%xmm3,%%xmm1 \n"  // x,y float->int next 2
      "packssdw %%xmm1,%%xmm0 \n"   // x, y as 8 shorts
      "pmaddwd %%xmm5,%%xmm0 \n"    // off = x*4 + y*stride
      "movd %%xmm0,%k1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movd %%xmm0,%k5 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movd 0x00(%0,%1,1),%%xmm1 \n"
      "movd 0x00(%0,%5,1),%%xmm6 \n"
      "punpckldq %%xmm6,%%xmm1 \n"
      "addps %%xmm4,%%xmm2 \n"
      "movq %%xmm1,(%2) \n"
      "movd %%xmm0,%k1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movd %%xmm0,%k5 \n"
      "movd 0x00(%0,%1,1),%%xmm0 \n"
      "movd 0x00(%0,%5,1),%%xmm6 \n"
      "punpckldq %%xmm6,%%xmm0 \n"
      "addps %%xmm4,%%xmm3 \n"
      "movq %%xmm0,0x08(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%4 \n"
      "jge 40b \n"

      "49: \n"
      "add $0x3,%4 \n"
      "jl 19f \n"

      // 1 pixel loop
      LABELALIGN
      "10: \n"
      "cvttps2dq %%xmm2,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "pmaddwd %%xmm5,%%xmm0 \n"
      "addps %%xmm7,%%xmm2 \n"
      "movd %%xmm0,%k1 \n"
      "movd 0x00(%0,%1,1),%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "lea 0x04(%2),%2 \n"
      "sub $0x1,%4 \n"
      "jge 10b \n"
      "19: \n"
      : "+r"(src_argb),              // %0
        "+r"(src_argb_stride_temp),  // %1
        "+r"(dst_argb),              // %2
        "+r"(src_dudv),              // %3
        "+rm"(width),                // %4
        "=&r"(temp)                  // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
8117 #endif // HAS_ARGBAFFINEROW_SSE2
8118
8119 #ifdef HAS_INTERPOLATEROW_SSSE3
8120 // Bilinear filter 16x2 -> 16x1
// Blend two source rows into dst_ptr with weight source_y_fraction/256 on
// the second row (src_ptr + src_stride).  Fast paths: fraction 0 copies the
// first row; fraction 128 uses pavgb.  The general path biases bytes by
// 0x80 so pmaddubsw can blend with signed math, then un-biases.
// width is assumed to be a multiple of 16.
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "sub %1,%0 \n"  // dst addressed as src + (dst - src)
      "cmp $0x0,%3 \n"
      "je 100f \n"
      "cmp $0x80,%3 \n"
      "je 50f \n"

      "movd %3,%%xmm0 \n"
      "neg %3 \n"
      "add $0x100,%3 \n"  // weight for first row = 256 - fraction
      "movd %3,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n"
      "punpcklwd %%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x80808080,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"

      // General purpose row blend.
      LABELALIGN
      "1: \n"
      "movdqu (%1),%%xmm0 \n"
      "movdqu 0x00(%1,%4,1),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "psubb %%xmm4,%%xmm0 \n"
      "psubb %%xmm4,%%xmm1 \n"
      "movdqa %%xmm5,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm3 \n"
      "pmaddubsw %%xmm0,%%xmm2 \n"
      "pmaddubsw %%xmm1,%%xmm3 \n"
      "paddw %%xmm4,%%xmm2 \n"
      "paddw %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm2 \n"
      "psrlw $0x8,%%xmm3 \n"
      "packuswb %%xmm3,%%xmm2 \n"
      "movdqu %%xmm2,0x00(%1,%0,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "jmp 99f \n"

      // Blend 50 / 50.
      LABELALIGN
      "50: \n"
      "movdqu (%1),%%xmm0 \n"
      "movdqu 0x00(%1,%4,1),%%xmm1 \n"
      "pavgb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,0x00(%1,%0,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 50b \n"
      "jmp 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100: \n"
      "movdqu (%1),%%xmm0 \n"
      "movdqu %%xmm0,0x00(%1,%0,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 100b \n"

      "99: \n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+rm"(dst_width),  // %2
        "+r"(source_y_fraction)  // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
8198 #endif // HAS_INTERPOLATEROW_SSSE3
8199
8200 #ifdef HAS_INTERPOLATEROW_AVX2
8201 // Bilinear filter 32x2 -> 32x1
// AVX2 variant of InterpolateRow: blend two rows with weight
// source_y_fraction/256, 32 pixels per iteration.  Fraction 0 copies via
// `rep movsb` (hence the %edi/%esi/%ecx register constraints); fraction 128
// uses vpavgb.  width is assumed to be a multiple of 32.
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile(
      "cmp $0x0,%3 \n"
      "je 100f \n"
      "sub %1,%0 \n"  // dst addressed as src + (dst - src)
      "cmp $0x80,%3 \n"
      "je 50f \n"

      "vmovd %3,%%xmm0 \n"
      "neg %3 \n"
      "add $0x100,%3 \n"  // weight for first row = 256 - fraction
      "vmovd %3,%%xmm5 \n"
      "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
      "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
      "vbroadcastss %%xmm5,%%ymm5 \n"
      "mov $0x80808080,%%eax \n"
      "vmovd %%eax,%%xmm4 \n"
      "vbroadcastss %%xmm4,%%ymm4 \n"

      // General purpose row blend.
      LABELALIGN
      "1: \n"
      "vmovdqu (%1),%%ymm0 \n"
      "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"  // bias to signed for vpmaddubsw
      "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
      "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
      "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "jmp 99f \n"

      // Blend 50 / 50.
      LABELALIGN
      "50: \n"
      "vmovdqu (%1),%%ymm0 \n"
      "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 50b \n"
      "jmp 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100: \n"
      "rep movsb \n"  // uses rdi/rsi/rcx from the constraints below
      "jmp 999f \n"

      "99: \n"
      "vzeroupper \n"
      "999: \n"
      : "+D"(dst_ptr),     // %0
        "+S"(src_ptr),     // %1
        "+cm"(dst_width),  // %2
        "+r"(source_y_fraction)  // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
8274 #endif // HAS_INTERPOLATEROW_AVX2
8275
8276 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
8277 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorder the 4 channels of each ARGB pixel according to the 16-byte
// shuffler control loaded into xmm5.  Processes 8 pixels (32 bytes) per
// iteration; width is assumed to be a multiple of 8.
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
  asm volatile(

      "movdqu (%3),%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
8304 #endif // HAS_ARGBSHUFFLEROW_SSSE3
8305
8306 #ifdef HAS_ARGBSHUFFLEROW_AVX2
8307 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 variant of ARGBShuffleRow: the 16-byte shuffler is broadcast to both
// 128-bit lanes so vpshufb shuffles within each lane identically.
// Processes 16 pixels (64 bytes) per iteration; width is assumed to be a
// multiple of 16.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(

      "vbroadcastf128 (%3),%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
8335 #endif // HAS_ARGBSHUFFLEROW_AVX2
8336
8337 #ifdef HAS_I422TOYUY2ROW_SSE2
// Interleave planar I422 (Y plus half-width U and V) into packed YUY2
// (Y0 U0 Y1 V0 ...).  16 output pixels (32 bytes) per iteration; width is
// assumed to be a multiple of 16.  src_v is addressed relative to src_u
// after the initial subtract so one register advances both chroma planes.
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"  // %2 = src_v - src_u

      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"  // xmm2 = interleaved UV
      "movdqu (%0),%%xmm0 \n"
      "add $0x10,%0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"  // Y first -> YUYV ordering
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,(%3) \n"
      "movdqu %%xmm1,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
8371 #endif // HAS_I422TOYUY2ROW_SSE2
8372
8373 #ifdef HAS_I422TOUYVYROW_SSE2
// Interleave planar I422 into packed UYVY (U0 Y0 V0 Y1 ...).  Same
// structure as I422ToYUY2Row_SSE2 but with chroma bytes first in each pair.
// 16 output pixels per iteration; width assumed to be a multiple of 16.
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"  // %2 = src_v - src_u

      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"  // xmm2 = interleaved UV
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "add $0x10,%0 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"  // UV first -> UYVY ordering
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,(%3) \n"
      "movdqu %%xmm2,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
8407 #endif // HAS_I422TOUYVYROW_SSE2
8408
8409 #ifdef HAS_I422TOYUY2ROW_AVX2
// AVX2 variant of I422ToYUY2Row: 32 output pixels per iteration.  U and V
// are widened to words, V shifted into the high byte and OR'd to form UV
// pairs; vextractf128 stores undo the cross-lane mutation of vpunpck*.
// width assumed to be a multiple of 32.
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"  // %2 = src_v - src_u

      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"  // ymm2 = interleaved UV
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
8446 #endif // HAS_I422TOYUY2ROW_AVX2
8447
8448 #ifdef HAS_I422TOUYVYROW_AVX2
// AVX2 variant of I422ToUYVYRow: 32 output pixels per iteration; same
// scheme as I422ToYUY2Row_AVX2 with chroma placed before luma in each pair.
// width assumed to be a multiple of 32.
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"  // %2 = src_v - src_u

      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"  // ymm2 = interleaved UV
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"  // UV first -> UYVY ordering
      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
8485 #endif // HAS_I422TOUYVYROW_AVX2
8486
8487 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Apply a cubic polynomial to every ARGB channel value:
// out = clamp(C0 + C1*x + C2*x^2 + C3*x^3), with coefficient vectors C0..C3
// stored at poly[0], poly[4], poly[8], poly[12] (one float per channel).
// Processes 2 pixels per iteration; width assumed to be a multiple of 2.
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(

      "pxor %%xmm3,%%xmm3 \n"  // zero for byte->dword widening

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm3,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "punpcklwd %%xmm3,%%xmm0 \n"  // pixel 0 channels as dwords
      "punpckhwd %%xmm3,%%xmm4 \n"  // pixel 1 channels as dwords
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm4,%%xmm4 \n"
      "movdqa %%xmm0,%%xmm1 \n"     // keep x for higher-order terms
      "movdqa %%xmm4,%%xmm5 \n"
      "mulps 0x10(%3),%%xmm0 \n"    // C1 * x
      "mulps 0x10(%3),%%xmm4 \n"
      "addps (%3),%%xmm0 \n"        // + C0
      "addps (%3),%%xmm4 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "mulps %%xmm1,%%xmm2 \n"      // x^2
      "mulps %%xmm5,%%xmm6 \n"
      "mulps %%xmm2,%%xmm1 \n"      // x^3
      "mulps %%xmm6,%%xmm5 \n"
      "mulps 0x20(%3),%%xmm2 \n"    // C2 * x^2
      "mulps 0x20(%3),%%xmm6 \n"
      "mulps 0x30(%3),%%xmm1 \n"    // C3 * x^3
      "mulps 0x30(%3),%%xmm5 \n"
      "addps %%xmm2,%%xmm0 \n"
      "addps %%xmm6,%%xmm4 \n"
      "addps %%xmm1,%%xmm0 \n"
      "addps %%xmm5,%%xmm4 \n"
      "cvttps2dq %%xmm0,%%xmm0 \n"
      "cvttps2dq %%xmm4,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm0 \n"   // saturate back to bytes
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
8541 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
8542
8543 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2+FMA3 variant of ARGBPolynomialRow: evaluates
// out = clamp(C0 + C1*x + C2*x^2 + C3*x^3) for 8 channel values
// (2 ARGB pixels) per iteration using vfmadd.  Coefficient vectors are
// broadcast from poly[0], poly[4], poly[8], poly[12].
// width assumed to be a multiple of 2.
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4 \n"      // C0
      "vbroadcastf128 0x10(%3),%%ymm5 \n"  // C1
      "vbroadcastf128 0x20(%3),%%ymm6 \n"  // C2
      "vbroadcastf128 0x30(%3),%%ymm7 \n"  // C3

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm0 \n"  // 2 ARGB pixels
      "lea 0x8(%0),%0 \n"
      "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
      "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
      "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
      "vcvttps2dq %%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo vpackusdw lane mutation
      "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
      "vmovq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
8582 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
8583
8584 #ifdef HAS_HALFFLOATROW_SSE2
// 1.9259299444e-34f == 2^-112: folding this into the scale lets the float's
// exponent be extracted with a simple 13-bit shift to produce a half float.
static float kScaleBias = 1.9259299444e-34f;
// Convert 16-bit values to IEEE half floats, scaled by `scale`.
// Bit-trick path: multiply by scale * 2^-112, then shift the binary32
// pattern right 13 to land mantissa/exponent in half-float position.
// Processes 8 values per iteration; width assumed to be a multiple of 8.
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd %3,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      "sub %0,%1 \n"  // dst addressed as src + (dst - src)

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // 8 shorts
      "add $0x10,%0 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/xmm3
      "cvtdq2ps %%xmm2,%%xmm2 \n"  // 8 floats
      "punpckhwd %%xmm5,%%xmm3 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "psrld $0xd,%%xmm2 \n"  // shift float bits into half-float layout
      "psrld $0xd,%%xmm3 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "movdqu %%xmm2,-0x10(%0,%1,1) \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
8621 #endif // HAS_HALFFLOATROW_SSE2
8622
8623 #ifdef HAS_HALFFLOATROW_AVX2
// AVX2 variant of HalfFloatRow: same 2^-112 scale-bias bit trick as the
// SSE2 version, 16 values per iteration.  width assumed to be a multiple
// of 16.  On x86_64 `scale` is passed in an xmm register ("x"); on 32-bit
// it comes from memory ("m").
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      "sub %0,%1 \n"  // dst addressed as src + (dst - src)

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // 16 shorts
      "add $0x20,%0 \n"
      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vpsrld $0xd,%%ymm3,%%ymm3 \n"  // float bits -> half-float layout
      "vpsrld $0xd,%%ymm2,%%ymm2 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
      "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"

      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
8663 #endif // HAS_HALFFLOATROW_AVX2
8664
8665 #ifdef HAS_HALFFLOATROW_F16C
// F16C variant of HalfFloatRow: uses the hardware vcvtps2ph conversion
// (rounding mode 3 = truncate) instead of the bit-shift trick, so no scale
// bias is needed.  16 values per iteration; width assumed to be a multiple
// of 16.
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "sub %0,%1 \n"  // dst addressed as src + (dst - src)

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
8701 #endif // HAS_HALFFLOATROW_F16C
8702
8703 #ifdef HAS_HALFFLOATROW_F16C
// F16C conversion with an implicit scale of 1.0: the unused float parameter
// keeps the signature compatible with the other HalfFloatRow variants.
// 16 values per iteration; width assumed to be a multiple of 16.
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub %0,%1 \n"  // dst addressed as src + (dst - src)
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
8728 #endif // HAS_HALFFLOATROW_F16C
8729
8730 #ifdef HAS_ARGBCOLORTABLEROW_X86
8731 // Tranform ARGB pixels with color table.
// Remap each of the 4 channels of every ARGB pixel in place through a
// 256-entry-per-channel lookup table laid out as interleaved BGRA quads
// (table_argb[value * 4 + channel]).  Scalar x86; one pixel per iteration.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"  // channel 0 lookup
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"  // channel 1 lookup
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"  // channel 2 lookup
      "mov %b1,-0x2(%0) \n"
      "movzb -0x1(%0),%1 \n"
      "movzb 0x03(%3,%1,4),%1 \n"  // channel 3 (alpha) lookup
      "mov %b1,-0x1(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
8761 #endif // HAS_ARGBCOLORTABLEROW_X86
8762
8763 #ifdef HAS_RGBCOLORTABLEROW_X86
8764 // Tranform RGB pixels with color table.
// Same as ARGBColorTableRow_X86 but only remaps the first 3 channels,
// leaving the alpha byte of each 4-byte pixel untouched.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"  // channel 0 lookup
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"  // channel 1 lookup
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"  // channel 2 lookup; alpha left as-is
      "mov %b1,-0x2(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
8791 #endif // HAS_RGBCOLORTABLEROW_X86
8792
8793 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
8794 // Tranform RGB pixels with luma table.
// Remap ARGB pixels through a luma-dependent color table.  For each pixel a
// luma value is computed with pmaddubsw against the packed coefficients in
// `lumacoeff`, masked to a multiple of 256, and used to select one
// 256-byte sub-table inside `luma`; B, G and R are looked up there while
// alpha is copied unchanged.  4 pixels per iteration; width assumed to be a
// multiple of 4.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0x8,%%xmm4 \n"  // mask 0xff00: round luma down to table base
      "pxor %%xmm5,%%xmm5 \n"

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%2),%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"  // weighted channel sums
      "phaddw %%xmm0,%%xmm0 \n"     // luma per pixel
      "pand %%xmm4,%%xmm0 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"        // %1 = luma + table offset for pixel 0
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb (%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,(%3) \n"
      "movzb 0x1(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x1(%3) \n"
      "movzb 0x2(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x2(%3) \n"
      "movzb 0x3(%2),%0 \n"  // alpha copied without lookup
      "mov %b0,0x3(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x4(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x4(%3) \n"
      "movzb 0x5(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x5(%3) \n"
      "movzb 0x6(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x6(%3) \n"
      "movzb 0x7(%2),%0 \n"  // alpha copied without lookup
      "mov %b0,0x7(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x8(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x8(%3) \n"
      "movzb 0x9(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x9(%3) \n"
      "movzb 0xa(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xa(%3) \n"
      "movzb 0xb(%2),%0 \n"  // alpha copied without lookup
      "mov %b0,0xb(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"

      "movzb 0xc(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xc(%3) \n"
      "movzb 0xd(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xd(%3) \n"
      "movzb 0xe(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xe(%3) \n"
      "movzb 0xf(%2),%0 \n"  // alpha copied without lookup
      "mov %b0,0xf(%3) \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x10(%3),%3 \n"
      "sub $0x4,%4 \n"
      "jg 1b \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
8892 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
8893
8894 #ifdef HAS_NV21TOYUV24ROW_AVX2
8895
8896 // begin NV21ToYUV24Row_C avx2 constants
// Blend masks (high bit selects source) used by NV21ToYUV24Row_AVX2 to
// merge the three shuffled streams into 24-bit YUV triples.
static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};

static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};

static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};

// vpshufb controls (0x80 writes zero) that spread Y, VU and VU+1 bytes into
// their positions within the interleaved 24-bit output.
static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};

static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};

static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};

static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};

static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};

static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
8941
8942 // NV21ToYUV24Row_AVX2
NV21ToYUV24Row_AVX2(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_yuv24,int width)8943 void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
8944 const uint8_t* src_vu,
8945 uint8_t* dst_yuv24,
8946 int width) {
8947 uint8_t* src_y_ptr;
8948 uint64_t src_offset = 0;
8949 uint64_t width64;
8950
8951 width64 = width;
8952 src_y_ptr = (uint8_t*)src_y;
8953
8954 asm volatile(
8955 "vmovdqu %5, %%ymm0 \n" // init blend value
8956 "vmovdqu %6, %%ymm1 \n" // init blend value
8957 "vmovdqu %7, %%ymm2 \n" // init blend value
8958 // "sub $0x20, %3 \n" //sub 32 from
8959 // width for final loop
8960
8961 LABELALIGN
8962 "1: \n" // label 1
8963 "vmovdqu (%0,%4), %%ymm3 \n" // src_y
8964 "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
8965 "vmovdqu (%1), %%ymm5 \n" // src_uv
8966 "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
8967 "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
8968 // shuf
8969 "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
8970 // shuf
8971 "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
8972 "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
8973 // shuf
8974 "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
8975 "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
8976 "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
8977 "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
8978 "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
8979 "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
8980 "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
8981 "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
8982 "add $0x20, %4 \n" // add to src buffer
8983 // ptr
8984 "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
8985 "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
8986 "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
8987 "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
8988 "add $0x60,%2 \n" // add to dst buffer
8989 // ptr
8990 // "cmp %3, %4 \n" //(width64 -
8991 // 32 bytes) and src_offset
8992 "sub $0x20,%3 \n" // 32 pixels per loop
8993 "jg 1b \n"
8994 "vzeroupper \n" // sse-avx2
8995 // transistions
8996
8997 : "+r"(src_y), //%0
8998 "+r"(src_vu), //%1
8999 "+r"(dst_yuv24), //%2
9000 "+r"(width64), //%3
9001 "+r"(src_offset) //%4
9002 : "m"(kBLEND0), //%5
9003 "m"(kBLEND1), //%6
9004 "m"(kBLEND2), //%7
9005 "m"(kSHUF0), //%8
9006 "m"(kSHUF1), //%9
9007 "m"(kSHUF2), //%10
9008 "m"(kSHUF3), //%11
9009 "m"(kSHUF4), //%12
9010 "m"(kSHUF5) //%13
9011 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
9012 "xmm13", "xmm14", "xmm15");
9013 }
9014 #endif // HAS_NV21TOYUV24ROW_AVX2
9015
9016 #ifdef HAS_SWAPUVROW_SSSE3
9017
// pshufb table that swaps each adjacent byte pair: (U,V) -> (V,U).
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
9021
9022 // Convert UV plane of NV12 to VU of NV21.
// Convert UV plane of NV12 to VU of NV21.
// Processes 16 UV pairs (32 bytes) per iteration; width is the number of
// UV pairs and is expected to be a multiple handled by the sub/jg loop.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      // xmm5 = kShuffleUVToVU: pshufb mask swapping each byte pair.
      "movdqu %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"   // load 8 UV pairs
      "movdqu 0x10(%0),%%xmm1 \n"  // load next 8 UV pairs
      "lea 0x20(%0),%0 \n"      // advance src by 32 bytes
      "pshufb %%xmm5,%%xmm0 \n"  // UV -> VU
      "pshufb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"   // store 16 VU pairs
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"      // advance dst by 32 bytes
      "sub $0x10,%2 \n"         // 16 pairs per loop
      "jg 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_vu),  // %1
        "+r"(width)    // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
9046 #endif // HAS_SWAPUVROW_SSSE3
9047
9048 #ifdef HAS_SWAPUVROW_AVX2
// AVX2 version of SwapUVRow: convert UV plane of NV12 to VU of NV21.
// Processes 32 UV pairs (64 bytes) per iteration.
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      // Broadcast the 16-byte pair-swap mask into both 128-bit lanes of
      // ymm5 (vpshufb operates independently per lane).
      "vbroadcastf128 %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"      // load 16 UV pairs
      "vmovdqu 0x20(%0),%%ymm1 \n"  // load next 16 UV pairs
      "lea 0x40(%0),%0 \n"          // advance src by 64 bytes
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // UV -> VU
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"      // store 32 VU pairs
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"          // advance dst by 64 bytes
      "sub $0x20,%2 \n"             // 32 pairs per loop
      "jg 1b \n"
      "vzeroupper \n"               // avoid SSE-AVX2 transition penalty
      : "+r"(src_uv),  // %0
        "+r"(dst_vu),  // %1
        "+r"(width)    // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
9073 #endif // HAS_SWAPUVROW_AVX2
9074
// Average 2x2 blocks of separate U and V planes (two source rows, selected
// via src_stride_u / src_stride_v) and write the result as one interleaved
// UV row at half width.  width is in source pixels; 16 per iteration, so
// 8 UV output pairs are stored per loop.
void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      // xmm4 = 16 x 0x01: pmaddubsw with this sums adjacent byte pairs.
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"  // xmm5 = 0, used by pavgw for rounding

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // load 16 U values
      "movdqu (%1),%%xmm1 \n"  // load 16 V values
      "movdqu 0(%0,%4,1),%%xmm2 \n"  // 16 from next row
      "movdqu 0(%1,%5,1),%%xmm3 \n"
      "lea 0x10(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"  // horizontal pair sums (half size)
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "lea 0x10(%1),%1 \n"
      "paddw %%xmm2,%%xmm0 \n"  // add vertical neighbor sums
      "paddw %%xmm3,%%xmm1 \n"
      "psrlw $0x1,%%xmm0 \n"    // /2 ...
      "psrlw $0x1,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"  // ... then /2 again with rounding
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm0,%%xmm0 \n"  // words back to bytes
      "packuswb %%xmm1,%%xmm1 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"  // interleave U and V
      "movdqu %%xmm0,(%2) \n"  // store 8 UV pixels
      "lea 0x10(%2),%2 \n"
      "sub $0x10,%3 \n"  // 16 src pixels per loop
      "jg 1b \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
9120
// AVX2 version of HalfMergeUVRow: average 2x2 blocks of separate U and V
// planes and write one interleaved UV row at half width.  width is in
// source pixels; 32 per iteration (16 UV output pairs per loop).
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      // ymm4 = 32 x 0x01: vpmaddubsw with this sums adjacent byte pairs.
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = 0, used by vpavgw rounding

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // load 32 U values
      "vmovdqu (%1),%%ymm1 \n"  // load 32 V values
      "vmovdqu 0(%0,%4,1),%%ymm2 \n"  // 32 from next row
      "vmovdqu 0(%1,%5,1),%%ymm3 \n"
      "lea 0x20(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // horizontal pair sums (half size)
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "lea 0x20(%1),%1 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"  // add vertical neighbor sums
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"    // /2 ...
      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"  // ... then /2 again with rounding
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"  // words back to bytes
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"  // interleave U and V
      "vmovdqu %%ymm0,(%2) \n"  // store 16 UV pixels
      "lea 0x20(%2),%2 \n"
      "sub $0x20,%3 \n"  // 32 src pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
9167
ClampFloatToZero_SSE2(const float * src_x,float * dst_y,int width)9168 void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
9169 asm volatile(
9170 "pxor %%xmm1,%%xmm1 \n"
9171
9172 LABELALIGN
9173 "1: \n"
9174 "movd (%0),%%xmm0 \n" // load float
9175 "maxss %%xmm1, %%xmm0 \n" // clamp to zero
9176 "add 4, %0 \n"
9177 "movd %%xmm0, (%1) \n" // store float
9178 "add 4, %1 \n"
9179 "sub $0x4,%2 \n" // 1 float per loop
9180 "jg 1b \n"
9181 : "+r"(src_x), // %0
9182 "+r"(dst_y), // %1
9183 "+r"(width) // %2
9184 :
9185 : "memory", "cc", "xmm0", "xmm1");
9186 }
9187
9188 #endif // defined(__x86_64__) || defined(__i386__)
9189
9190 #ifdef __cplusplus
9191 } // extern "C"
9192 } // namespace libyuv
9193 #endif
9194