1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 // This module is for Visual C 32/64 bit and clangcl 32 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15 (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16
17 #if defined(_M_X64)
18 #include <emmintrin.h>
19 #include <tmmintrin.h> // For _mm_maddubs_epi16
20 #endif
21
22 #ifdef __cplusplus
23 namespace libyuv {
24 extern "C" {
25 #endif
26
27 // 64 bit
28 #if defined(_M_X64)
29
// Read 4 UV from 422, upsample to 8 UV.
// Reads 4 U bytes from u_buf and 4 V bytes from u_buf + offset (the V plane;
// 'offset' is v_buf - u_buf, computed by the caller), interleaves them into
// U0V0 U1V1..., then duplicates each UV pair so 4 chroma samples cover 8
// pixels. Also loads 8 Y bytes into xmm4, widened to 16 bits by
// self-unpacking (Y in both halves of each word for pmulhuw in YUVTORGB).
// Advances u_buf by 4 and y_buf by 8.
#define READYUV422 \
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
    u_buf += 4; \
    xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
    y_buf += 8;
40
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
// Same as READYUV422 (4 U + 4 V interleaved and doubled to cover 8 pixels,
// 8 Y widened in xmm4), plus loads 8 alpha bytes from a_buf into xmm5 for
// STOREARGB. Advances u_buf by 4, y_buf by 8 and a_buf by 8.
#define READYUVA422 \
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
    u_buf += 4; \
    xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
    y_buf += 8; \
    xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
    a_buf += 8;
53
// Convert 8 pixels: 8 UV and 8 Y.
// Input: xmm0 = interleaved UV pairs (from READYUV422), xmm4 = widened Y.
// Output: packed 8-bit B in xmm0, G in xmm1, R in xmm2 (low 8 bytes each).
// The UV contribution is computed with pmaddubsw against the per-colorspace
// kUVToB/G/R vectors and subtracted from a bias; Y is scaled with an
// unsigned high multiply by kYToRgb; the sums saturate, then a 6-bit
// arithmetic shift drops the fixed-point fraction before packing to bytes.
// NOTE: _mm_loadu_si128(&xmm0) is just a register copy spelled as a load.
#define YUVTORGB(yuvconstants) \
    xmm1 = _mm_loadu_si128(&xmm0); \
    xmm2 = _mm_loadu_si128(&xmm0); \
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
    xmm0 = _mm_adds_epi16(xmm0, xmm4); \
    xmm1 = _mm_adds_epi16(xmm1, xmm4); \
    xmm2 = _mm_adds_epi16(xmm2, xmm4); \
    xmm0 = _mm_srai_epi16(xmm0, 6); \
    xmm1 = _mm_srai_epi16(xmm1, 6); \
    xmm2 = _mm_srai_epi16(xmm2, 6); \
    xmm0 = _mm_packus_epi16(xmm0, xmm0); \
    xmm1 = _mm_packus_epi16(xmm1, xmm1); \
    xmm2 = _mm_packus_epi16(xmm2, xmm2);
74
// Store 8 ARGB values.
// Input: B in xmm0, G in xmm1, R in xmm2 (packed bytes from YUVTORGB),
// A in xmm5 (either constant 0xff or per-pixel alpha from READYUVA422).
// Interleaves to BG and RA byte pairs, then to BGRA dwords, and writes
// 32 bytes (8 pixels) to dst_argb, advancing it.
#define STOREARGB \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
    xmm1 = _mm_loadu_si128(&xmm0); \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
    _mm_storeu_si128((__m128i *)dst_argb, xmm0); \
    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \
    dst_argb += 32;
85
86
87 #if defined(HAS_I422TOARGBROW_SSSE3)
I422ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)88 void I422ToARGBRow_SSSE3(const uint8* y_buf,
89 const uint8* u_buf,
90 const uint8* v_buf,
91 uint8* dst_argb,
92 const struct YuvConstants* yuvconstants,
93 int width) {
94 __m128i xmm0, xmm1, xmm2, xmm4;
95 const __m128i xmm5 = _mm_set1_epi8(-1);
96 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
97 while (width > 0) {
98 READYUV422
99 YUVTORGB(yuvconstants)
100 STOREARGB
101 width -= 8;
102 }
103 }
104 #endif
105
106 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
I422AlphaToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,const uint8 * a_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)107 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
108 const uint8* u_buf,
109 const uint8* v_buf,
110 const uint8* a_buf,
111 uint8* dst_argb,
112 const struct YuvConstants* yuvconstants,
113 int width) {
114 __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
115 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
116 while (width > 0) {
117 READYUVA422
118 YUVTORGB(yuvconstants)
119 STOREARGB
120 width -= 8;
121 }
122 }
123 #endif
124
125 // 32 bit
126 #else // defined(_M_X64)
127 #ifdef HAS_ARGBTOYROW_SSSE3
128
// Constants for ARGB.
// Coefficients are 7-bit fixed point, laid out in memory byte order
// (B, G, R, A for ARGB) and repeated 4x to fill a 16-byte vector.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};

// Constants for BGRA (same coefficients as ARGB, permuted for byte order).
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

// Luma offset for studio-range Y (added after the weighted sum).
static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// Chroma bias: centers U/V around 128.
static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
// Index 128 in shuffle tables below zeroes the destination byte (pshufb).
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
290
// Duplicates gray value 3 times and fills in alpha opaque.
// Converts 8 gray (J400) pixels per iteration to ARGB: B=G=R=Y, A=0xff.
// NOTE(review): loop steps by 8; assumes caller handles any remainder.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24

  convertloop:
    movq xmm0, qword ptr [eax] // 8 gray bytes
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0 // YY per pixel
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0 // YYYY for low 4 pixels
    punpckhwd xmm1, xmm1 // YYYY for high 4 pixels
    por xmm0, xmm5 // set alpha
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
318
319 #ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
// AVX2 variant: 16 gray pixels per iteration; vpermq fixes lane order
// around the cross-lane unpacks.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu xmm0, [eax] // 16 gray bytes
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8
    vpunpcklbw ymm0, ymm0, ymm0 // YY per pixel
    vpermq ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0 // YYYY high 8 pixels
    vpunpcklwd ymm0, ymm0, ymm0 // YYYY low 8 pixels
    vpor ymm0, ymm0, ymm5 // set alpha
    vpor ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
349 #endif // HAS_J400TOARGBROW_AVX2
350
// Converts 16 RGB24 (3 bytes/pixel) pixels per iteration to ARGB,
// reading 48 bytes and writing 64, with alpha forced to 0xff.
__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_rgb24
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}
388
// Converts 16 RAW (BGR order, 3 bytes/pixel) pixels per iteration to ARGB.
// Identical structure to RGB24ToARGBRow_SSSE3 but with a B/R-swapping
// shuffle mask.
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int width) {
  __asm {
    mov eax, [esp + 4] // src_raw
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}
427
// Swaps B and R channels of 8 RAW pixels per iteration, producing RGB24.
// Reads 24 bytes via three overlapping 16-byte loads; each shuffled
// result contributes 8 output bytes.
// NOTE(review): loads read up to 24 bytes past [eax]; assumes the row
// buffer allows the overread — TODO confirm caller guarantees.
__declspec(naked)
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  __asm {
    mov eax, [esp + 4] // src_raw
    mov edx, [esp + 8] // dst_rgb24
    mov ecx, [esp + 12] // width
    movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 4]
    movdqu xmm2, [eax + 8]
    lea eax, [eax + 24]
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop
    ret
  }
}
455
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
// Expands 8 RGB565 pixels per iteration to ARGB with full 8-bit channels.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4] // src_rgb565
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    // dst = dst - 2*src so [eax*2 + edx] addresses dst while eax walks src.
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pmulhuw xmm1, xmm5 // * (256 + 8)
    pmulhuw xmm2, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    pand xmm0, xmm4 // G in middle 6 bits
    pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
    por xmm0, xmm7 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
511
512 #ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// AVX2 variant of RGB565ToARGBRow: 16 pixels per iteration.
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
    vpsllw ymm4, ymm4, 10
    vpsrlw ymm4, ymm4, 5
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4] // src_rgb565
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    // dst = dst - 2*src so [eax*2 + edx] addresses dst while eax walks src.
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
    vpand ymm1, ymm0, ymm3 // R in upper 5 bits
    vpsllw ymm2, ymm0, 11 // B in upper 5 bits
    vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
    vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2 // RB
    vpand ymm0, ymm0, ymm4 // G in middle 6 bits
    vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
    vpor ymm0, ymm0, ymm7 // AG
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
567 #endif // HAS_RGB565TOARGBROW_AVX2
568
569 #ifdef HAS_ARGB1555TOARGBROW_AVX2
// Expands 16 ARGB1555 pixels per iteration to 8-bit ARGB; the 1-bit alpha
// is sign-extended to 0x00 or 0xff.
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4] // src_argb1555
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    // dst = dst - 2*src so [eax*2 + edx] addresses dst while eax walks src.
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
    vpsllw ymm1, ymm0, 1 // R in upper 5 bits
    vpsllw ymm2, ymm0, 11 // B in upper 5 bits
    vpand ymm1, ymm1, ymm3
    vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
    vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2 // RB
    vpsraw ymm2, ymm0, 8 // A
    vpand ymm0, ymm0, ymm4 // G in middle 5 bits
    vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
    vpand ymm2, ymm2, ymm7
    vpor ymm0, ymm0, ymm2 // AG
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
619 #endif // HAS_ARGB1555TOARGBROW_AVX2
620
621 #ifdef HAS_ARGB4444TOARGBROW_AVX2
// Expands 16 ARGB4444 pixels per iteration to 8-bit ARGB by replicating
// each 4-bit nibble into both halves of the output byte.
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4] // src_argb4444
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    // dst = dst - 2*src so [eax*2 + edx] addresses dst while eax walks src.
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
    vpand ymm2, ymm0, ymm5 // mask high nibbles
    vpand ymm0, ymm0, ymm4 // mask low nibbles
    vpsrlw ymm3, ymm2, 4
    vpsllw ymm1, ymm0, 4
    vpor ymm2, ymm2, ymm3 // high nibble replicated
    vpor ymm0, ymm0, ymm1 // low nibble replicated
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
657 #endif // HAS_ARGB4444TOARGBROW_AVX2
658
// 24 instructions
// Expands 8 ARGB1555 pixels per iteration to 8-bit ARGB; 1-bit alpha is
// sign-extended to 0x00 or 0xff.
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4] // src_argb1555
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    // dst = dst - 2*src so [eax*2 + edx] addresses dst while eax walks src.
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5 // * (256 + 8)
    pmulhuw xmm1, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // G in middle 5 bits
    psraw xmm2, 8 // A
    pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
711
// 18 instructions.
// Expands 8 ARGB4444 pixels per iteration to 8-bit ARGB by replicating
// each 4-bit nibble into both halves of the output byte.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4] // src_argb4444
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    // dst = dst - 2*src so [eax*2 + edx] addresses dst while eax walks src.
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // mask low nibbles
    pand xmm2, xmm5 // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1 // low nibble replicated
    por xmm2, xmm3 // high nibble replicated
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
750
// Packs 16 ARGB pixels per iteration to RGB24 (drops alpha): reads 64
// bytes, shuffles each vector to 12 RGB bytes, then stitches the pieces
// with shifts/ors into 48 contiguous output bytes.
__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}
788
// Packs 16 ARGB pixels per iteration to RAW (BGR order, no alpha).
// Identical structure to ARGBToRGB24Row_SSSE3 but with a B/R-swapping
// shuffle mask.
__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}
826
// Packs 4 ARGB pixels per iteration to RGB565 (alpha dropped): each channel
// is shifted into place, masked, ored together and the dwords packed to
// words with signed saturation.
__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm1, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm1, xmm3 // B
    pand xmm2, xmm4 // G
    pand xmm0, xmm5 // R
    por xmm1, xmm2 // BG
    por xmm0, xmm1 // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
863
// Packs 4 ARGB pixels per iteration to RGB565, adding a 2x2 ordered dither
// (dither4 holds one byte per column) to each channel before truncation.
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {

    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    movd xmm6, [esp + 12] // dither4
    mov ecx, [esp + 16] // width
    punpcklbw xmm6, xmm6 // make dither 16 bytes
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7 // NOTE(review): xmm7 is never used below
    pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    paddusb xmm0, xmm6 // add dither
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm1, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm1, xmm3 // B
    pand xmm2, xmm4 // G
    pand xmm0, xmm5 // R
    por xmm1, xmm2 // BG
    por xmm0, xmm1 // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
908
909 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 variant: packs 8 ARGB pixels per iteration to RGB565 with dither
// added before truncation.
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    vbroadcastss xmm6, [esp + 12] // dither4
    mov ecx, [esp + 16] // width
    vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11 // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6 // add dither
    vpsrld ymm2, ymm0, 5 // G
    vpsrld ymm1, ymm0, 3 // B
    vpsrld ymm0, ymm0, 8 // R
    vpand ymm2, ymm2, ymm4 // G
    vpand ymm1, ymm1, ymm3 // B
    vpand ymm0, ymm0, ymm5 // R
    vpor ymm1, ymm1, ymm2 // BG
    vpor ymm0, ymm0, ymm1 // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
950 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
951
// TODO(fbarchard): Improve sign extension/packing.
// Packs 4 ARGB pixels per iteration to ARGB1555 (5 bits per color, 1-bit
// alpha taken from the top bit of the 8-bit alpha channel).
__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4 // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4 // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
    pslld xmm7, 15

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    movdqa xmm3, xmm0 // R
    psrad xmm0, 16 // A
    psrld xmm1, 3 // B
    psrld xmm2, 6 // G
    psrld xmm3, 9 // R
    pand xmm0, xmm7 // A
    pand xmm1, xmm4 // B
    pand xmm2, xmm5 // G
    pand xmm3, xmm6 // R
    por xmm0, xmm1 // BA
    por xmm2, xmm3 // GR
    por xmm0, xmm2 // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
993
// Packs 4 ARGB pixels per iteration to ARGB4444 by keeping the high nibble
// of each channel and compressing two bytes into one.
__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4 // generate mask 0x00f000f0
    psrlw xmm3, 8

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3 // low nibble
    pand xmm1, xmm4 // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
1022
1023 #ifdef HAS_ARGBTORGB565ROW_AVX2
// AVX2 variant of ARGBToRGB565Row: packs 8 ARGB pixels per iteration
// (no dither).
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11 // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpsrld ymm2, ymm0, 5 // G
    vpsrld ymm1, ymm0, 3 // B
    vpsrld ymm0, ymm0, 8 // R
    vpand ymm2, ymm2, ymm4 // G
    vpand ymm1, ymm1, ymm3 // B
    vpand ymm0, ymm0, ymm5 // R
    vpor ymm1, ymm1, ymm2 // BG
    vpor ymm0, ymm0, ymm1 // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
1058 #endif // HAS_ARGBTORGB565ROW_AVX2
1059
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
// Convert 8 ARGB pixels (32 bytes) per iteration to ARGB1555 (16 bytes).
// Output word is A1 R5 G5 B5; alpha keeps only its top bit.
__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    vpcmpeqb ymm4, ymm4, ymm4
    vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
    vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
    vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
    vpslld ymm7, ymm7, 15

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpsrld ymm3, ymm0, 9 // R
    vpsrld ymm2, ymm0, 6 // G
    vpsrld ymm1, ymm0, 3 // B
    vpsrad ymm0, ymm0, 16 // A: arithmetic shift keeps alpha sign-extended
    vpand ymm3, ymm3, ymm6 // R
    vpand ymm2, ymm2, ymm5 // G
    vpand ymm1, ymm1, ymm4 // B
    vpand ymm0, ymm0, ymm7 // A: bit 15 plus sign extension for saturation
    vpor ymm0, ymm0, ymm1 // BA
    vpor ymm2, ymm2, ymm3 // GR
    vpor ymm0, ymm0, ymm2 // BGRA
    vpackssdw ymm0, ymm0, ymm0 // signed saturate preserves the alpha bit
    vpermq ymm0, ymm0, 0xd8 // undo the lane split from vpackssdw
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOARGB1555ROW_AVX2
1099
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
// Convert 8 ARGB pixels (32 bytes) per iteration to ARGB4444 (16 bytes).
// AVX2 version of ARGBToARGB4444Row_SSE2: top 4 bits of each channel.
__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // width
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
    vpsllw ymm4, ymm4, 12
    vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpand ymm1, ymm0, ymm4 // high nibble
    vpand ymm0, ymm0, ymm3 // low nibble
    vpsrld ymm1, ymm1, 8
    vpsrld ymm0, ymm0, 4
    vpor ymm0, ymm0, ymm1 // nibbles merged per 16-bit lane
    vpackuswb ymm0, ymm0, ymm0 // pack words to bytes (per 128-bit lane)
    vpermq ymm0, ymm0, 0xd8 // undo the lane split from vpackuswb
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOARGB4444ROW_AVX2
1130
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Uses BT.601 studio-range coefficients (kARGBToY) and adds 16 (kAddY16),
// giving Y in [16..235].
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4 // weighted B+G, R+A pairs per pixel
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1 // sum pairs -> one 16-bit Y per pixel
    phaddw xmm2, xmm3
    psrlw xmm0, 7 // scale 7-bit fixed point to 8 bits
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5 // add 16 for studio range
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1164
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// "J" is the JPEG/full-range variant: Y in [0..255].
__declspec(naked)
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToYJ
    movdqa xmm5, xmmword ptr kAddYJ64

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5 // Add .5 for rounding.
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1200
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Restores pixel order after the per-lane mutation of those instructions.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3 (BT.601 coefficients, +16 bias).
__declspec(naked)
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToY
    vbroadcastf128 ymm5, xmmword ptr kAddY16
    vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1 // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2 // mutates.
    vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5 // add 16 for Y
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOYROW_AVX2
1244
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYJRow_SSSE3: JPEG/full-range coefficients,
// rounding add (kAddYJ64) instead of the +16 bias.
__declspec(naked)
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
    vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1 // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2 // mutates.
    vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOYJROW_AVX2
1285
// Convert 16 BGRA pixels (64 bytes) to 16 Y values.
// Same structure as ARGBToYRow_SSSE3 but with byte-order-adjusted
// coefficients (kBGRAToY).
__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kBGRAToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1318
// Convert 16 ABGR pixels (64 bytes) to 16 Y values.
// Same structure as ARGBToYRow_SSSE3 but with byte-order-adjusted
// coefficients (kABGRToY).
__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kABGRToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1351
// Convert 16 RGBA pixels (64 bytes) to 16 Y values.
// Same structure as ARGBToYRow_SSSE3 but with byte-order-adjusted
// coefficients (kRGBAToY).
__declspec(naked)
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kRGBAToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1384
// Convert 16x2 ARGB pixels to 8 U and 8 V values (2x2 box subsample),
// BT.601 studio-range coefficients with +128 bias (kAddUV128).
// src_stride_argb is the offset to the second row being averaged.
__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kARGBToV
    movdqa xmm7, xmmword ptr kARGBToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4 // vertical average of the two rows
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88 // even pixels
    shufps xmm4, xmm1, 0xdd // odd pixels
    pavgb xmm0, xmm4 // horizontal average
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8 // arithmetic shift: U/V sums are signed
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1454
// Convert 16x2 ARGB pixels to 8 U and 8 V values (2x2 box subsample),
// JPEG/full-range "J" coefficients (kARGBToUJ/kARGBToVJ) with rounded
// bias (kAddUVJ128 added before the shift, so no paddb afterwards).
__declspec(naked)
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    movdqa xmm5, xmmword ptr kAddUVJ128
    movdqa xmm6, xmmword ptr kARGBToVJ
    movdqa xmm7, xmmword ptr kARGBToUJ
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4 // vertical average of the two rows
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88 // even pixels
    shufps xmm4, xmm1, 0xdd // odd pixels
    pavgb xmm0, xmm4 // horizontal average
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5 // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1525
#ifdef HAS_ARGBTOUVROW_AVX2
// Convert 32x2 ARGB pixels to 16 U and 16 V values (2x2 box subsample).
// AVX2 version of ARGBToUVRow_SSSE3: BT.601 coefficients, +128 bias.
__declspec(naked)
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    vbroadcastf128 ymm5, xmmword ptr kAddUV128
    vbroadcastf128 ymm6, xmmword ptr kARGBToV
    vbroadcastf128 ymm7, xmmword ptr kARGBToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi] // vertical average with second row
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88 // even pixels
    vshufps ymm0, ymm0, ymm1, 0xdd // odd pixels
    vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4 // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7 // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6 // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3 // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0 // mutates
    vpermq ymm0, ymm0, 0xd8 // For vpacksswb
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
    vpaddb ymm0, ymm0, ymm5 // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0 // U
    vextractf128 [edx + edi], ymm0, 1 // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOUVROW_AVX2
1591
#ifdef HAS_ARGBTOUVJROW_AVX2
// Convert 32x2 ARGB pixels to 16 U and 16 V values (2x2 box subsample).
// AVX2 version of ARGBToUVJRow_SSSE3: JPEG/full-range "J" coefficients.
// Fix: this routine previously loaded the BT.601 constants
// (kAddUV128/kARGBToV/kARGBToU); it must use the J constants
// (kAddUVJ128/kARGBToVJ/kARGBToUJ) to match ARGBToUVJRow_SSSE3, which also
// applies the bias as a rounding add before the shift (no vpaddb after).
__declspec(naked)
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi] // vertical average with second row
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88 // even pixels
    vshufps ymm0, ymm0, ymm1, 0xdd // odd pixels
    vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4 // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7 // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6 // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3 // mutates
    vphaddw ymm0, ymm0, ymm2
    vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
    vpaddw ymm0, ymm0, ymm5
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0 // mutates
    vpermq ymm0, ymm0, 0xd8 // For vpacksswb
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0 // U
    vextractf128 [edx + edi], ymm0, 1 // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOUVJROW_AVX2
1658
// Convert 16 ARGB pixels to 16 U and 16 V values with no subsampling (4:4:4).
// One U and one V per input pixel; BT.601 coefficients with +128 bias.
__declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_argb
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kARGBToV
    movdqa xmm7, xmmword ptr kARGBToU
    sub edi, edx // stride from u to v

  convertloop:
    /* convert to U and V */
    movdqu xmm0, [eax] // U
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0

    movdqu xmm0, [eax] // V: reload the same 16 pixels
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    lea eax, [eax + 64]
    movdqu [edx + edi], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
1715
// Convert 16x2 BGRA pixels to 8 U and 8 V values (2x2 box subsample).
// Same structure as ARGBToUVRow_SSSE3 with byte-order-adjusted
// coefficients (kBGRAToU/kBGRAToV).
__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kBGRAToV
    movdqa xmm7, xmmword ptr kBGRAToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1785
// Convert 16x2 ABGR pixels to 8 U and 8 V values (2x2 box subsample).
// Same structure as ARGBToUVRow_SSSE3 with byte-order-adjusted
// coefficients (kABGRToU/kABGRToV).
__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kABGRToV
    movdqa xmm7, xmmword ptr kABGRToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1855
// Convert 16x2 RGBA pixels to 8 U and 8 V values (2x2 box subsample).
// Same structure as ARGBToUVRow_SSSE3 with byte-order-adjusted
// coefficients (kRGBAToU/kRGBAToV).
__declspec(naked)
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kRGBAToV
    movdqa xmm7, xmmword ptr kRGBAToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1925 #endif // HAS_ARGBTOYROW_SSSE3
1926
// The following multi-line __asm macros are the building blocks for the
// AVX2 YUV-to-RGB row functions below. Register contract (established by
// the READ* macros, consumed by YUVTORGB_AVX2 / STORE* macros):
//   ymm0 = interleaved UV bytes, ymm4 = Y duplicated into byte pairs,
//   ymm5 = alpha (READYUVA422_AVX2 only; otherwise caller sets ymm5),
//   eax = y_buf, esi = u_buf, edi = v_buf - u_buf, ebp = a_buf, edx = dst.
// Comments are kept outside the macro bodies; a // comment inside a
// \-continued line would swallow the continuation backslash.

// Read 16 UV from 444
#define READYUV444_AVX2 __asm { \
    __asm vmovdqu xmm0, [esi] /* U */ \
    __asm vmovdqu xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 16] \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpermq ymm1, ymm1, 0xd8 \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16] \
  }

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 __asm { \
    __asm vmovq xmm0, qword ptr [esi] /* U */ \
    __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16] \
  }

// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 __asm { \
    __asm vmovq xmm0, qword ptr [esi] /* U */ \
    __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16] \
    __asm vmovdqu xmm5, [ebp] /* A */ \
    __asm vpermq ymm5, ymm5, 0xd8 \
    __asm lea ebp, [ebp + 16] \
  }

// Read 4 UV from 411, upsample to 16 UV.
#define READYUV411_AVX2 __asm { \
    __asm vmovd xmm0, dword ptr [esi] /* U */ \
    __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 4] \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16] \
  }

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 __asm { \
    __asm vmovdqu xmm0, [esi] /* UV */ \
    __asm lea esi, [esi + 16] \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16] \
  }

// Read 8 UV from NV21, upsample to 16 UV.
#define READNV21_AVX2 __asm { \
    __asm vmovdqu xmm0, [esi] /* UV */ \
    __asm lea esi, [esi + 16] \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16] \
  }

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 __asm { \
    __asm vmovdqu ymm4, [eax] /* YUY2 */ \
    __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
    __asm vmovdqu ymm0, [eax] /* UV */ \
    __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
    __asm lea eax, [eax + 32] \
  }

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 __asm { \
    __asm vmovdqu ymm4, [eax] /* UYVY */ \
    __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
    __asm vmovdqu ymm0, [eax] /* UV */ \
    __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
    __asm lea eax, [eax + 32] \
  }

// Convert 16 pixels: 16 UV and 16 Y.
// Leaves 8-bit B/G/R duplicated in ymm0/ymm1/ymm2; clobbers ymm3.
#define YUVTORGB_AVX2(YuvConstants) __asm { \
    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
    __asm vpsubw ymm2, ymm3, ymm2 \
    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
    __asm vpsubw ymm1, ymm3, ymm1 \
    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
    __asm vpsubw ymm0, ymm3, ymm0 \
    /* Step 2: Find Y contribution to 16 R,G,B values */ \
    __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
    __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
    __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
    __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
    __asm vpsraw ymm0, ymm0, 6 \
    __asm vpsraw ymm1, ymm1, 6 \
    __asm vpsraw ymm2, ymm2, 6 \
    __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
    __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
    __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
  }

// Store 16 ARGB values. Expects alpha in ymm5; advances edx by 64.
#define STOREARGB_AVX2 __asm { \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
    __asm vpermq ymm2, ymm2, 0xd8 \
    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
    __asm vmovdqu 0[edx], ymm1 \
    __asm vmovdqu 32[edx], ymm0 \
    __asm lea edx, [edx + 64] \
  }

// Store 16 RGBA values. Expects alpha in ymm5; advances edx by 64.
#define STORERGBA_AVX2 __asm { \
    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
    __asm vpermq ymm1, ymm1, 0xd8 \
    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
    __asm vpermq ymm2, ymm2, 0xd8 \
    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
    __asm vmovdqu [edx], ymm0 \
    __asm vmovdqu [edx + 32], ymm1 \
    __asm lea edx, [edx + 64] \
  }
2078
#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// yuvconstants selects the colorspace matrix (held in ebx for YUVTORGB_AVX2).
__declspec(naked)
void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4] // Y
    mov esi, [esp + 12 + 8] // U
    mov edi, [esp + 12 + 12] // V
    mov edx, [esp + 12 + 16] // argb
    mov ebx, [esp + 12 + 20] // yuvconstants
    mov ecx, [esp + 12 + 24] // width
    sub edi, esi // V is addressed as [esi + edi] in the READ macros
    vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha

  convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_I422TOARGBROW_AVX2
2118
#ifdef HAS_I422ALPHATOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Unlike I422ToARGBRow_AVX2, alpha comes from a_buf (loaded into ymm5 by
// READYUVA422_AVX2) rather than being set to 0xff.
__declspec(naked)
void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             const uint8* a_buf,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov eax, [esp + 16 + 4] // Y
    mov esi, [esp + 16 + 8] // U
    mov edi, [esp + 16 + 12] // V
    mov ebp, [esp + 16 + 16] // A
    mov edx, [esp + 16 + 20] // argb
    mov ebx, [esp + 16 + 24] // yuvconstants
    mov ecx, [esp + 16 + 28] // width
    sub edi, esi // V is addressed as [esi + edi] in the READ macros

  convertloop:
    READYUVA422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_I422ALPHATOARGBROW_AVX2
2161
2162 #ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// 444: one UV pair per pixel, so no upsampling is required.
__declspec(naked)
void I444ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    vpcmpeqb   ymm5, ymm5, ymm5      // generate 0xffffffffffffffff for alpha
 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
2199 #endif // HAS_I444TOARGBROW_AVX2
2200
2201 #ifdef HAS_I411TOARGBROW_AVX2
// 16 pixels
// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked)
void I411ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV411_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
2239 #endif // HAS_I411TOARGBROW_AVX2
2240
2241 #ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV12: UV is interleaved in a single plane, so only one chroma pointer.
__declspec(naked)
void NV12ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // UV
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
2274 #endif // HAS_NV12TOARGBROW_AVX2
2275
2276 #ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV21: chroma plane is interleaved V then U (swapped by READNV21_AVX2).
__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* vu_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // VU
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV21_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
2309 #endif // HAS_NV21TOARGBROW_AVX2
2310
2311 #ifdef HAS_YUY2TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Single packed source, so only ebx needs saving for yuvconstants.
__declspec(naked)
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // yuy2
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUY2_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    vzeroupper
    ret
  }
}
2340 #endif // HAS_YUY2TOARGBROW_AVX2
2341
2342 #ifdef HAS_UYVYTOARGBROW_AVX2
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Same shape as YUY2ToARGBRow_AVX2; only the byte-order shuffle differs.
__declspec(naked)
void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // uyvy
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READUYVY_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    vzeroupper
    ret
  }
}
2371 #endif // HAS_UYVYTOARGBROW_AVX2
2372
2373 #ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
__declspec(naked)
void I422ToRGBARow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgba
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STORERGBA_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
2411 #endif // HAS_I422TORGBAROW_AVX2
2412
2413 #if defined(HAS_I422TOARGBROW_SSSE3)
2414 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2415 // Allows a conversion with half size scaling.
2416
// Read 8 UV from 444.
// Registers: esi = U ptr, edi = (V - U) offset, eax = Y ptr.
// Outputs: xmm0 = 8 interleaved UV pairs, xmm4 = 8 doubled Y bytes.
#define READYUV444 __asm {                                                     \
    __asm movq       xmm0, qword ptr [esi] /* U */                             \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4           /* YY for pmulhuw scaling */         \
    __asm lea        eax, [eax + 8]                                            \
  }
2427
// Read 4 UV from 422, upsample to 8 UV.
// Registers: esi = U ptr, edi = (V - U) offset, eax = Y ptr.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4           /* YY for pmulhuw scaling */         \
    __asm lea        eax, [eax + 8]                                            \
  }
2439
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
// Same as READYUV422 plus 8 alpha bytes from ebp into xmm5.
#define READYUVA422 __asm {                                                    \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
    __asm lea        ebp, [ebp + 8]                                            \
  }
2453
// Read 2 UV from 411, upsample to 8 UV.
// Uses (and clobbers) ebx as a scratch register for the 16-bit loads, so the
// caller must save ebx and keep yuvconstants elsewhere (see I411ToARGBRow).
// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
//   __asm pinsrw     xmm0, [esi], 0        /* U */
//   __asm pinsrw     xmm1, [esi + edi], 0  /* V */
#define READYUV411_EBX __asm {                                                 \
    __asm movzx      ebx, word ptr [esi]        /* U */                        \
    __asm movd       xmm0, ebx                                                 \
    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
    __asm movd       xmm1, ebx                                                 \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
  }
2471
// Read 4 UV from NV12, upsample to 8 UV.
// esi points to the interleaved UV plane; eax = Y ptr.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
  }
2481
// Read 4 VU from NV21, upsample to 8 UV.
// kShuffleNV21 both swaps V/U order and duplicates pairs (upsample).
#define READNV21 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* VU */                            \
    __asm lea        esi,  [esi + 8]                                           \
    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
  }
2491
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
// Loads the same 16 bytes twice and shuffles out Y and UV separately.
#define READYUY2 __asm {                                                       \
    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
    __asm movdqu     xmm0, [eax]          /* UV */                             \
    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
    __asm lea        eax, [eax + 16]                                           \
  }
2500
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
// Same pattern as READYUY2 with UYVY byte-order shuffle tables.
#define READUYVY __asm {                                                       \
    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
    __asm movdqu     xmm0, [eax]          /* UV */                             \
    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
    __asm lea        eax, [eax + 16]                                           \
  }
2509
// Convert 8 pixels: 8 UV and 8 Y.
// Input:  xmm0 = 8 UV pairs, xmm4 = 8 doubled Y bytes; YuvConstants = register
//         holding a pointer to the coefficient/bias tables.
// Output: xmm0/xmm1/xmm2 = packed 8-bit B/G/R. Clobbers xmm3.
// Computes e.g. B = clamp((Y * YG + UV.kUVToB - kUVBiasB) >> 6) in 16 bits.
#define YUVTORGB(YuvConstants) __asm {                                         \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm movdqa     xmm3, xmm0                                                \
    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
    __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
    __asm psubw      xmm0, xmm1                                                \
    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
    __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
    __asm psubw      xmm1, xmm2                                                \
    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
    __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
    __asm psubw      xmm2, xmm3                                                \
    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }
2535
// Store 8 ARGB values.
// Inputs: xmm0/xmm1/xmm2 = packed B/G/R, xmm5 = alpha (set by caller).
// Writes 32 bytes to [edx], advances edx.
#define STOREARGB __asm {                                                      \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
    __asm movdqu     0[edx], xmm0                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
2547
// Store 8 BGRA values.
// Generates its own 0xff alpha in xmm5, so xmm5 is clobbered here.
#define STOREBGRA __asm {                                                      \
    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
2560
// Store 8 RGBA values.
// Generates its own 0xff alpha in xmm5, so xmm5 is clobbered here.
#define STORERGBA __asm {                                                      \
    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }
2573
// Store 8 RGB24 values (24 bytes).
// Requires xmm5/xmm6 preloaded with kShuffleMaskARGBToRGB24_0/kShuffleMaskARGBToRGB24.
#define STORERGB24 __asm {                                                     \
    /* Weave into RRGB */                                                      \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
    /* RRGB -> RGB24 */                                                        \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
    __asm lea        edx,  [edx + 24]                                          \
  }
2590
// Store 8 RGB565 values (16 bytes).
// Requires caller-preloaded masks: xmm5 = 0x0000001f (B), xmm6 = 0x000007e0 (G),
// xmm7 = 0xfffff800 (R). Converts two groups of 4 ARGB dwords to 565 words.
#define STORERGB565 __asm {                                                    \
    /* Weave into RRGB */                                                      \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
    /* RRGB -> RGB565 */                                                       \
    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
    __asm movdqa     xmm2, xmm0    /* G */                                     \
    __asm pslld      xmm0, 8       /* R */                                     \
    __asm psrld      xmm3, 3       /* B */                                     \
    __asm psrld      xmm2, 5       /* G */                                     \
    __asm psrad      xmm0, 16      /* R */                                     \
    __asm pand       xmm3, xmm5    /* B */                                     \
    __asm pand       xmm2, xmm6    /* G */                                     \
    __asm pand       xmm0, xmm7    /* R */                                     \
    __asm por        xmm3, xmm2    /* BG */                                    \
    __asm por        xmm0, xmm3    /* BGR */                                   \
    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
    __asm movdqa     xmm2, xmm1    /* G */                                     \
    __asm pslld      xmm1, 8       /* R */                                     \
    __asm psrld      xmm3, 3       /* B */                                     \
    __asm psrld      xmm2, 5       /* G */                                     \
    __asm psrad      xmm1, 16      /* R */                                     \
    __asm pand       xmm3, xmm5    /* B */                                     \
    __asm pand       xmm2, xmm6    /* G */                                     \
    __asm pand       xmm1, xmm7    /* R */                                     \
    __asm por        xmm3, xmm2    /* BG */                                    \
    __asm por        xmm1, xmm3    /* BGR */                                   \
    __asm packssdw   xmm0, xmm1                                                \
    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
    __asm lea        edx, [edx + 16]                                           \
  }
2626
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked)
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha

 convertloop:
    READYUV444
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
2663
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked)
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgb24
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0  // for STORERGB24
    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24    // for STORERGB24

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB24

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
2701
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked)
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* rgb565_buf,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgb565
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    // Build channel masks once, used by STORERGB565 each iteration.
    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
    psrld      xmm5, 27
    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
    psrld      xmm6, 26
    pslld      xmm6, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
    pslld      xmm7, 11

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB565

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
2744
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
2781
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
// Alpha is read from a_buf by READYUVA422 into xmm5 (no pcmpeqb needed).
__declspec(naked)
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              const uint8* a_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]   // Y
    mov        esi, [esp + 16 + 8]   // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]

 convertloop:
    READYUVA422
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
2821
// 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
// yuvconstants goes in ebp (not ebx) because READYUV411_EBX uses ebx as a
// scratch register; ebx is saved/restored here for that reason.
__declspec(naked)
void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]   // Y
    mov        esi, [esp + 16 + 8]   // U
    mov        edi, [esp + 16 + 12]  // V
    mov        edx, [esp + 16 + 16]  // argb
    mov        ebp, [esp + 16 + 20]  // yuvconstants
    mov        ecx, [esp + 16 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha

 convertloop:
    READYUV411_EBX
    YUVTORGB(ebp)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
2861
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV12: single interleaved UV plane.
__declspec(naked)
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // UV
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READNV12
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        esi
    ret
  }
}
2893
// 8 pixels.
// 4 VU values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV21: chroma plane is interleaved V then U (swapped by READNV21).
__declspec(naked)
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* vu_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // VU
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READNV21
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        esi
    ret
  }
}
2925
// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked)
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // yuy2
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READYUY2
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    ret
  }
}
2953
// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked)
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // uyvy
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READUYVY
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    ret
  }
}
2981
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGBA (32 bytes).
// No alpha preload: STORERGBA generates 0xff alpha itself.
__declspec(naked)
void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgba
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = V - U; V is read as [esi + edi]

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGBA

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
3015 #endif // HAS_I422TOARGBROW_SSSE3
3016
3017 #ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
// Grey output: B = G = R = (Y - 16) * 1.164, A = 0xff.
__declspec(naked)
void I400ToARGBRow_SSE2(const uint8* y_buf,
                        uint8* rgb_buf,
                        int width) {
  __asm {
    mov        eax, 0x4a354a35       // 4a35 = 18997 = round(1.164 * 64 * 256)
    movd       xmm2, eax
    pshufd     xmm2, xmm2, 0
    mov        eax, 0x04880488       // 0488 = 1160 = round(1.164 * 64 * 16)
    movd       xmm3, eax
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4            // generate mask 0xff000000
    pslld      xmm4, 24

    mov        eax, [esp + 4]        // Y
    mov        edx, [esp + 8]        // rgb
    mov        ecx, [esp + 12]       // width

 convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0            // Y.Y
    pmulhuw    xmm0, xmm2
    psubusw    xmm0, xmm3            // saturating: Y < 16 clamps to 0
    psrlw      xmm0, 6
    packuswb   xmm0, xmm0            // G

    // Step 2: Weave into ARGB
    punpcklbw  xmm0, xmm0            // GG
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0            // GGGG first 4 pixels
    punpckhwd  xmm1, xmm1            // GGGG next 4 pixels
    por        xmm0, xmm4            // set alpha to 0xff
    por        xmm1, xmm4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
3062 #endif // HAS_I400TOARGBROW_SSE2
3063
3064 #ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked)
void I400ToARGBRow_AVX2(const uint8* y_buf,
                        uint8* rgb_buf,
                        int width) {
  __asm {
    mov        eax, 0x4a354a35       // 4a35 = 18997 = round(1.164 * 64 * 256)
    vmovd      xmm2, eax
    vbroadcastss ymm2, xmm2
    mov        eax, 0x04880488       // 0488 = 1160 = round(1.164 * 64 * 16)
    vmovd      xmm3, eax
    vbroadcastss ymm3, xmm3
    vpcmpeqb   ymm4, ymm4, ymm4      // generate mask 0xff000000
    vpslld     ymm4, ymm4, 24

    mov        eax, [esp + 4]        // Y
    mov        edx, [esp + 8]        // rgb
    mov        ecx, [esp + 12]       // width

 convertloop:
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    vmovdqu    xmm0, [eax]
    lea        eax, [eax + 16]
    vpermq     ymm0, ymm0, 0xd8      // vpunpcklbw mutates
    vpunpcklbw ymm0, ymm0, ymm0      // Y.Y
    vpmulhuw   ymm0, ymm0, ymm2
    vpsubusw   ymm0, ymm0, ymm3      // saturating: Y < 16 clamps to 0
    vpsrlw     ymm0, ymm0, 6
    vpackuswb  ymm0, ymm0, ymm0      // G.  still mutated: 3120

    // TODO(fbarchard): Weave alpha with unpack.
    // Step 2: Weave into ARGB
    vpunpcklbw ymm1, ymm0, ymm0      // GG - mutates
    vpermq     ymm1, ymm1, 0xd8
    vpunpcklwd ymm0, ymm1, ymm1      // GGGG first 8 pixels
    vpunpckhwd ymm1, ymm1, ymm1      // GGGG next 8 pixels
    vpor       ymm0, ymm0, ymm4      // set alpha to 0xff
    vpor       ymm1, ymm1, ymm4
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop
    vzeroupper
    ret
  }
}
3113 #endif // HAS_I400TOARGBROW_AVX2
3114
3115 #ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes (pshufb control: index 15 first).
static const uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
3120
// Reverse a row of bytes, 16 at a time: reads from the tail of src
// (src + width - 16, moving down via shrinking ecx) while writing dst forward.
// TODO(fbarchard): Replace lea with -16 offset.
__declspec(naked)
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm5, xmmword ptr kShuffleMirror

 convertloop:
    movdqu    xmm0, [eax - 16 + ecx]  // last 16 unread bytes of src
    pshufb    xmm0, xmm5              // reverse byte order within the 16
    movdqu    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}
3140 #endif // HAS_MIRRORROW_SSSE3
3141
3142 #ifdef HAS_MIRRORROW_AVX2
// Reverse a row of bytes, 32 at a time. pshufb reverses within each 128-bit
// lane; the vpermq then swaps the two lanes to complete the 32-byte reversal.
__declspec(naked)
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror

 convertloop:
    vmovdqu   ymm0, [eax - 32 + ecx]  // last 32 unread bytes of src
    vpshufb   ymm0, ymm0, ymm5        // reverse bytes within each lane
    vpermq    ymm0, ymm0, 0x4e        // swap high and low halfs
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    sub       ecx, 32
    jg        convertloop
    vzeroupper
    ret
  }
}
3163 #endif // HAS_MIRRORROW_AVX2
3164
3165 #ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels:
// gathers reversed U bytes into the low 8, reversed V bytes into the high 8.
static const uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
3170
// Mirror an interleaved UV row into separate, reversed U and V planes,
// 8 UV pairs (16 bytes) per iteration. Reads src backwards from the end.
__declspec(naked)
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
    push      edi
    mov       eax, [esp + 4 + 4]   // src
    mov       edx, [esp + 4 + 8]   // dst_u
    mov       edi, [esp + 4 + 12]  // dst_v
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm1, xmmword ptr kShuffleMirrorUV
    lea       eax, [eax + ecx * 2 - 16]  // last 16 bytes of src (2 bytes/pair)
    sub       edi, edx                   // edi = dst_v - dst_u offset

 convertloop:
    movdqu    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm1           // low 8 = mirrored U, high 8 = mirrored V
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [edx + edi], xmm0
    lea       edx, [edx + 8]
    sub       ecx, 8
    jg        convertloop

    pop       edi
    ret
  }
}
3198 #endif // HAS_MIRRORUVROW_SSSE3
3199
3200 #ifdef HAS_ARGBMIRRORROW_SSE2
// Mirror a row of ARGB pixels, 4 at a time. Pixels (dwords) are reversed
// with pshufd; byte order within each pixel is preserved.
__declspec(naked)
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.

 convertloop:
    movdqu    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufd    xmm0, xmm0, 0x1b  // reverse dword order: 3,2,1,0
    movdqu    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
3220 #endif // HAS_ARGBMIRRORROW_SSE2
3221
3222 #ifdef HAS_ARGBMIRRORROW_AVX2
// Permute table for reversing dword (pixel) order with vpermd.
static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
3227
// Mirror a row of ARGB pixels, 8 at a time, using a single cross-lane vpermd
// with a memory source (reads from the tail of src as ecx shrinks).
__declspec(naked)
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2

 convertloop:
    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    sub       ecx, 8
    jg        convertloop
    vzeroupper
    ret
  }
}
3246 #endif // HAS_ARGBMIRRORROW_AVX2
3247
3248 #ifdef HAS_SPLITUVROW_SSE2
// De-interleave a UV row into separate U and V planes, 16 pairs per iteration.
// Even bytes (U) are masked and packed; odd bytes (V) are shifted down and packed.
__declspec(naked)
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm5   // even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm2, 8      // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [edx], xmm0
    movdqu     [edx + edi], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
3284
3285 #endif // HAS_SPLITUVROW_SSE2
3286
3287 #ifdef HAS_SPLITUVROW_AVX2
// De-interleave a UV row into separate U and V planes, 32 pairs per iteration.
// vpackuswb packs within 128-bit lanes, so vpermq 0xd8 restores linear order.
__declspec(naked)
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm2, ymm0, 8      // odd bytes
    vpsrlw     ymm3, ymm1, 8
    vpand      ymm0, ymm0, ymm5   // even bytes
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpackuswb  ymm2, ymm2, ymm3
    vpermq     ymm0, ymm0, 0xd8   // undo in-lane pack mutation
    vpermq     ymm2, ymm2, 0xd8
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + edi], ymm2
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
3324 #endif // HAS_SPLITUVROW_AVX2
3325
3326 #ifdef HAS_MERGEUVROW_SSE2
// Interleave separate U and V planes into a single UV row,
// 16 pairs (32 output bytes) per iteration.
__declspec(naked)
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_u
    mov        edx, [esp + 4 + 8]    // src_v
    mov        edi, [esp + 4 + 12]   // dst_uv
    mov        ecx, [esp + 4 + 16]   // width
    sub        edx, eax              // edx = src_v - src_u offset

 convertloop:
    movdqu     xmm0, [eax]      // read 16 U's
    movdqu     xmm1, [eax + edx]  // and 16 V's
    lea        eax,  [eax + 16]
    movdqa     xmm2, xmm0
    punpcklbw  xmm0, xmm1       // first 8 UV pairs
    punpckhbw  xmm2, xmm1       // next 8 UV pairs
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
3355 #endif // HAS_MERGEUVROW_SSE2
3356
3357 #ifdef HAS_MERGEUVROW_AVX2
// AVX2 version of MergeUVRow: interleaves 32 U and 32 V bytes into 64 bytes
// of packed UV.  vpunpck*/ymm operate per 128-bit lane, so the four 16-byte
// stores below are issued in lane order to produce sequential output.
__declspec(naked)
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_u
    mov edx, [esp + 4 + 8] // src_v
    mov edi, [esp + 4 + 12] // dst_uv
    mov ecx, [esp + 4 + 16] // width
    sub edx, eax // edx = src_v - src_u

  convertloop:
    vmovdqu ymm0, [eax] // read 32 U's
    vmovdqu ymm1, [eax + edx] // and 32 V's
    lea eax, [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
    vextractf128 [edi], ymm2, 0 // bytes 0..15
    vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
    vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
    vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
    lea edi, [edi + 64]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
3388 #endif // HAS_MERGEUVROW_AVX2
3389
3390 #ifdef HAS_COPYROW_SSE2
3391 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
// Chooses an aligned (movdqa) loop when both src and dst are 16-byte aligned,
// otherwise an unaligned (movdqu) loop.  Assumes count is a multiple of 32
// -- TODO(review): confirm callers round up.
__declspec(naked)
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, [esp + 4] // src
    mov edx, [esp + 8] // dst
    mov ecx, [esp + 12] // count
    test eax, 15 // src aligned to 16?
    jne convertloopu
    test edx, 15 // dst aligned to 16?
    jne convertloopu

  convertloopa: // aligned copy path
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopa
    ret

  convertloopu: // unaligned copy path
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopu
    ret
  }
}
3426 #endif // HAS_COPYROW_SSE2
3427
3428 #ifdef HAS_COPYROW_AVX
3429 // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
// Unaligned accesses only; assumes count is a multiple of 64 -- TODO(review):
// confirm callers round up.
__declspec(naked)
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, [esp + 4] // src
    mov edx, [esp + 8] // dst
    mov ecx, [esp + 12] // count

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 64
    jg convertloop

    vzeroupper
    ret
  }
}
3451 #endif // HAS_COPYROW_AVX
3452
// Multiple of 1.
// Byte copy via "rep movsb" (fast with Enhanced Rep MovSB hardware).
// Callee-saved esi/edi are preserved in caller-saved eax/edx instead of on
// the stack; safe here because the function is naked and makes no calls.
__declspec(naked)
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, esi // save esi (callee-saved) in eax
    mov edx, edi // save edi (callee-saved) in edx
    mov esi, [esp + 4] // src
    mov edi, [esp + 8] // dst
    mov ecx, [esp + 12] // count
    rep movsb
    mov edi, edx // restore edi
    mov esi, eax // restore esi
    ret
  }
}
3468
3469 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copies only the alpha channel from src ARGB pixels into dst ARGB pixels,
// leaving dst's RGB untouched.  Read-modify-writes dst 8 pixels (32 bytes)
// per iteration; assumes width is a multiple of 8 -- TODO(review): confirm.
__declspec(naked)
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4] // src
    mov edx, [esp + 8] // dst
    mov ecx, [esp + 12] // count
    pcmpeqb xmm0, xmm0 // generate mask 0xff000000 (alpha byte)
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff (rgb bytes)
    psrld xmm1, 8

  convertloop:
    movdqu xmm2, [eax]
    movdqu xmm3, [eax + 16]
    lea eax, [eax + 32]
    movdqu xmm4, [edx]
    movdqu xmm5, [edx + 16]
    pand xmm2, xmm0 // keep src alpha
    pand xmm3, xmm0
    pand xmm4, xmm1 // keep dst rgb
    pand xmm5, xmm1
    por xmm2, xmm4 // combine src alpha with dst rgb
    por xmm3, xmm5
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
3503 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3504
3505 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// AVX2 version of ARGBCopyAlphaRow: per-byte blend keeps dst's RGB (mask
// 0x00ffffff selects dst) and src's alpha.  16 pixels per iteration.
__declspec(naked)
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4] // src
    mov edx, [esp + 8] // dst
    mov ecx, [esp + 12] // count
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff

  convertloop:
    vmovdqu ymm1, [eax]
    vmovdqu ymm2, [eax + 32]
    lea eax, [eax + 64]
    vpblendvb ymm1, ymm1, [edx], ymm0 // rgb from dst, alpha from src
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
3532 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3533
3534 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Extracts the alpha byte of each ARGB pixel into a contiguous 8-bit plane.
// 8 pixels in (32 bytes), 8 alpha bytes out, per iteration.
__declspec(naked)
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_a
    mov ecx, [esp + 12] // width

  extractloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm0, 24 // alpha to low byte of each dword
    psrld xmm1, 24
    packssdw xmm0, xmm1 // dwords (0..255) -> words, no saturation occurs
    packuswb xmm0, xmm0 // words -> bytes
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg extractloop

    ret
  }
}
3559 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
3560
3561 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Copies an 8-bit Y plane into the alpha channel of dst ARGB pixels,
// preserving dst's RGB.  8 pixels per iteration.
__declspec(naked)
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4] // src
    mov edx, [esp + 8] // dst
    mov ecx, [esp + 12] // count
    pcmpeqb xmm0, xmm0 // generate mask 0xff000000
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
    psrld xmm1, 8

  convertloop:
    movq xmm2, qword ptr [eax] // 8 Y's
    lea eax, [eax + 8]
    punpcklbw xmm2, xmm2 // duplicate each Y into a word
    // NOTE(review): xmm3 is used uninitialized here; its stale low words
    // land in bits the 0xff000000 mask below discards, so only the Y byte
    // (from xmm2's word) survives in the alpha position.
    punpckhwd xmm3, xmm2
    punpcklwd xmm2, xmm2
    movdqu xmm4, [edx]
    movdqu xmm5, [edx + 16]
    pand xmm2, xmm0 // keep Y in alpha byte
    pand xmm3, xmm0
    pand xmm4, xmm1 // keep dst rgb
    pand xmm5, xmm1
    por xmm2, xmm4
    por xmm3, xmm5
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
3597 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3598
3599 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// AVX2 version of ARGBCopyYToAlphaRow: zero-extends 8 Y bytes to dwords,
// shifts them into the alpha position, then blends with dst keeping dst rgb.
// 16 pixels per iteration.
__declspec(naked)
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4] // src
    mov edx, [esp + 8] // dst
    mov ecx, [esp + 12] // count
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff

  convertloop:
    vpmovzxbd ymm1, qword ptr [eax] // 8 Y's -> 8 dwords
    vpmovzxbd ymm2, qword ptr [eax + 8]
    lea eax, [eax + 16]
    vpslld ymm1, ymm1, 24 // move Y into alpha byte
    vpslld ymm2, ymm2, 24
    vpblendvb ymm1, ymm1, [edx], ymm0 // rgb from dst, alpha = Y
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
3628 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3629
3630 #ifdef HAS_SETROW_X86
// Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4.
// The byte is splatted to all 4 lanes of eax via mul by 0x01010101, then
// stored dword-at-a-time with "rep stosd".
__declspec(naked)
void SetRow_X86(uint8* dst, uint8 v8, int count) {
  __asm {
    movzx eax, byte ptr [esp + 8] // v8
    mov edx, 0x01010101 // Duplicate byte to all bytes.
    mul edx // overwrites edx with upper part of result.
    mov edx, edi // save callee-saved edi in edx (no calls made)
    mov edi, [esp + 4] // dst
    mov ecx, [esp + 12] // count
    shr ecx, 2 // bytes -> dwords
    rep stosd
    mov edi, edx // restore edi
    ret
  }
}
3648
// Write 'count' bytes using an 8 bit value repeated.
// Byte store via "rep stosb" (fast with Enhanced Rep MovSB/StoSB hardware);
// edi is preserved in caller-saved edx rather than on the stack.
__declspec(naked)
void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
  __asm {
    mov edx, edi // save callee-saved edi
    mov edi, [esp + 4] // dst
    mov eax, [esp + 8] // v8
    mov ecx, [esp + 12] // count
    rep stosb
    mov edi, edx // restore edi
    ret
  }
}
3662
// Write 'count' 32 bit values.
// Fills a row of ARGB pixels with a single 32-bit value via "rep stosd".
__declspec(naked)
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
  __asm {
    mov edx, edi // save callee-saved edi
    mov edi, [esp + 4] // dst
    mov eax, [esp + 8] // v32
    mov ecx, [esp + 12] // count (in pixels/dwords)
    rep stosd
    mov edi, edx // restore edi
    ret
  }
}
3676 #endif // HAS_SETROW_X86
3677
3678 #ifdef HAS_YUY2TOYROW_AVX2
// Extracts the Y plane from packed YUY2 (Y U Y V ...).  Y occupies the even
// bytes.  32 Y's (64 source bytes) per iteration; assumes width is a
// multiple of 32 -- TODO(review): confirm.
__declspec(naked)
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] // src_yuy2
    mov edx, [esp + 8] // dst_y
    mov ecx, [esp + 12] // width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5 // even bytes are Y
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1 // mutates.
    vpermq ymm0, ymm0, 0xd8 // undo lane mutation
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
3704
// Extracts subsampled U and V planes from two YUY2 rows: vertically averages
// the current row with the next (stride_yuy2 apart), then splits the odd
// bytes (UVUV) into U and V.  Produces 16 U + 16 V per 32 pixels of width.
__declspec(naked)
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_yuy2
    mov esi, [esp + 8 + 8] // stride_yuy2
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vpavgb ymm0, ymm0, [eax + esi] // average with next row
    vpavgb ymm1, ymm1, [eax + esi + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1 // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5 // U
    vpsrlw ymm0, ymm0, 8 // V
    vpackuswb ymm1, ymm1, ymm1 // mutates.
    vpackuswb ymm0, ymm0, ymm0 // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0 // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
3748
// Like YUY2ToUVRow_AVX2 but without vertical averaging: extracts U and V
// from a single YUY2 row (4:2:2 chroma).  16 U + 16 V per 32 pixels.
__declspec(naked)
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_yuy2
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1 // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5 // U
    vpsrlw ymm0, ymm0, 8 // V
    vpackuswb ymm1, ymm1, ymm1 // mutates.
    vpackuswb ymm0, ymm0, ymm0 // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0 // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
3787
// Extracts the Y plane from packed UYVY (U Y V Y ...).  Y occupies the odd
// bytes, hence the word shift instead of the mask used for YUY2.
__declspec(naked)
void UYVYToYRow_AVX2(const uint8* src_uyvy,
                     uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] // src_uyvy
    mov edx, [esp + 8] // dst_y
    mov ecx, [esp + 12] // width

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8 // odd bytes are Y
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1 // mutates.
    vpermq ymm0, ymm0, 0xd8
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
3812
// Extracts subsampled U and V from two UYVY rows: vertically averages the
// current row with the next, keeps the even bytes (UVUV), then splits into
// U and V planes.  16 U + 16 V per 32 pixels of width.
__declspec(naked)
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_uyvy
    mov esi, [esp + 8 + 8] // stride_uyvy
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vpavgb ymm0, ymm0, [eax + esi] // average with next row
    vpavgb ymm1, ymm1, [eax + esi + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1 // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5 // U
    vpsrlw ymm0, ymm0, 8 // V
    vpackuswb ymm1, ymm1, ymm1 // mutates.
    vpackuswb ymm0, ymm0, ymm0 // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0 // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
3856
// Like UYVYToUVRow_AVX2 but without vertical averaging: extracts U and V
// from a single UYVY row (4:2:2 chroma).  16 U + 16 V per 32 pixels.
__declspec(naked)
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_uyvy
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // width
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1 // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5 // U
    vpsrlw ymm0, ymm0, 8 // V
    vpackuswb ymm1, ymm1, ymm1 // mutates.
    vpackuswb ymm0, ymm0, ymm0 // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0 // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
3895 #endif // HAS_YUY2TOYROW_AVX2
3896
3897 #ifdef HAS_YUY2TOYROW_SSE2
// SSE2 version: extracts the Y plane from packed YUY2.  16 Y's (32 source
// bytes) per iteration; assumes width is a multiple of 16 -- TODO(review):
// confirm.
__declspec(naked)
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                     uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] // src_yuy2
    mov edx, [esp + 8] // dst_y
    mov ecx, [esp + 12] // width
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5 // even bytes are Y
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
3922
// SSE2 version: extracts subsampled U and V from two YUY2 rows, averaging
// vertically then splitting the odd (UVUV) bytes.  8 U + 8 V per 16 pixels.
__declspec(naked)
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_yuy2
    mov esi, [esp + 8 + 8] // stride_yuy2
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi] // next row
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2 // vertical average
    pavgb xmm1, xmm3
    psrlw xmm0, 8 // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
3965
// SSE2 version: extracts U and V from a single YUY2 row (no vertical
// averaging, 4:2:2 chroma).  8 U + 8 V per 16 pixels.
__declspec(naked)
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_yuy2
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8 // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
4001
// SSE2 version: extracts the Y plane from packed UYVY (Y in the odd bytes).
__declspec(naked)
void UYVYToYRow_SSE2(const uint8* src_uyvy,
                     uint8* dst_y, int width) {
  __asm {
    mov eax, [esp + 4] // src_uyvy
    mov edx, [esp + 8] // dst_y
    mov ecx, [esp + 12] // width

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8 // odd bytes are Y
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
4024
// SSE2 version: extracts subsampled U and V from two UYVY rows, averaging
// vertically then splitting the even (UVUV) bytes.  8 U + 8 V per 16 pixels.
__declspec(naked)
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_uyvy
    mov esi, [esp + 8 + 8] // stride_uyvy
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // width
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi] // next row
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2 // vertical average
    pavgb xmm1, xmm3
    pand xmm0, xmm5 // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
4067
// SSE2 version: extracts U and V from a single UYVY row (no vertical
// averaging, 4:2:2 chroma).  8 U + 8 V per 16 pixels.
__declspec(naked)
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_uyvy
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5 // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
4103 #endif // HAS_YUY2TOYROW_SSE2
4104
4105 #ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Blends the src0 and src1 planes per-pixel using the alpha plane as weight.
// Uses pmaddubsw on 128-biased values so one multiply-add computes
// a*src0 + (255-a)*src1 per pixel.  Assumes width is a multiple of 8
// -- TODO(review): confirm.
__declspec(naked)
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  __asm {
    push esi
    push edi
    pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
    psllw xmm5, 8
    mov eax, 0x80808080 // 128 for biasing image to signed.
    movd xmm6, eax
    pshufd xmm6, xmm6, 0x00

    mov eax, 0x807f807f // 32768 + 127 for unbias and round.
    movd xmm7, eax
    pshufd xmm7, xmm7, 0x00
    mov eax, [esp + 8 + 4] // src0
    mov edx, [esp + 8 + 8] // src1
    mov esi, [esp + 8 + 12] // alpha
    mov edi, [esp + 8 + 16] // dst
    mov ecx, [esp + 8 + 20] // width
    sub eax, esi // index src0/src1/dst off the alpha pointer
    sub edx, esi
    sub edi, esi

    // 8 pixel loop.
  convertloop8:
    movq xmm0, qword ptr [esi] // alpha
    punpcklbw xmm0, xmm0 // duplicate alpha into byte pairs
    pxor xmm0, xmm5 // a, 255-a
    movq xmm1, qword ptr [eax + esi] // src0
    movq xmm2, qword ptr [edx + esi] // src1
    punpcklbw xmm1, xmm2 // interleave src0/src1 bytes
    psubb xmm1, xmm6 // bias src0/1 - 128
    pmaddubsw xmm0, xmm1 // a*src0 + (255-a)*src1, signed
    paddw xmm0, xmm7 // unbias result - 32768 and round.
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edi + esi], xmm0
    lea esi, [esi + 8]
    sub ecx, 8
    jg convertloop8

    pop edi
    pop esi
    ret
  }
}
4158 #endif // HAS_BLENDPLANEROW_SSSE3
4159
4160 #ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// AVX2 version of BlendPlaneRow: same biased pmaddubsw trick as the SSSE3
// version, processing 32 pixels per iteration in two 16-pixel halves.
__declspec(naked)
void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                        const uint8* alpha, uint8* dst, int width) {
  __asm {
    push esi
    push edi
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
    vpsllw ymm5, ymm5, 8
    mov eax, 0x80808080 // 128 for biasing image to signed.
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    mov eax, 0x807f807f // 32768 + 127 for unbias and round.
    vmovd xmm7, eax
    vbroadcastss ymm7, xmm7
    mov eax, [esp + 8 + 4] // src0
    mov edx, [esp + 8 + 8] // src1
    mov esi, [esp + 8 + 12] // alpha
    mov edi, [esp + 8 + 16] // dst
    mov ecx, [esp + 8 + 20] // width
    sub eax, esi // index src0/src1/dst off the alpha pointer
    sub edx, esi
    sub edi, esi

    // 32 pixel loop.
  convertloop32:
    vmovdqu ymm0, [esi] // alpha
    vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
    vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
    vpxor ymm3, ymm3, ymm5 // a, 255-a
    vpxor ymm0, ymm0, ymm5 // a, 255-a
    vmovdqu ymm1, [eax + esi] // src0
    vmovdqu ymm2, [edx + esi] // src1
    vpunpckhbw ymm4, ymm1, ymm2
    vpunpcklbw ymm1, ymm1, ymm2
    vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
    vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
    vpmaddubsw ymm3, ymm3, ymm4
    vpmaddubsw ymm0, ymm0, ymm1
    vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
    vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
    vpsrlw ymm3, ymm3, 8
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm3 // lane mutation cancels the unpack mutation
    vmovdqu [edi + esi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg convertloop32

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
4220 #endif // HAS_BLENDPLANEROW_AVX2
4221
4222 #ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
// Replicates each pixel's alpha byte into the low byte of both words of its
// dword (0x80 zeroes the high bytes), giving per-channel 16-bit alphas.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend 8 pixels at a time.
// Alpha-blends src_argb0 over src_argb1 using src0's alpha; the result
// alpha is forced to 255.  Main loop handles 4 pixels; a 1-pixel tail loop
// handles any remainder, so width need not be a multiple of 4.
__declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm7, xmm7 // generate constant 0x0001
    psrlw xmm7, 15
    pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
    psrlw xmm6, 8
    pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
    psllw xmm5, 8
    pcmpeqb xmm4, xmm4 // generate mask 0xff000000
    pslld xmm4, 24
    sub ecx, 4
    jl convertloop4b // less than 4 pixels?

    // 4 pixel loop.
  convertloop4:
    movdqu xmm3, [eax] // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movdqu xmm2, [esi] // _r_b
    pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movdqu xmm1, [esi] // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jge convertloop4

  convertloop4b:
    add ecx, 4 - 1 // undo the -4 bias, then test for >= 1 pixel left
    jl convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd xmm3, [eax] // src argb
    lea eax, [eax + 4]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movd xmm2, [esi] // _r_b
    pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movd xmm1, [esi] // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge convertloop1

  convertloop1b:
    pop esi
    ret
  }
}
4309 #endif // HAS_ARGBBLENDROW_SSSE3
4310
4311 #ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
// Spreads pixel 0's alpha over 3 word lanes (and pixel 1's over the next 3);
// 128u yields zero, leaving the alpha word lanes themselves cleared.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Premultiplies RGB by alpha (attenuate); the original alpha byte is copied
// through unchanged.  4 pixels per iteration; pmulhuw on byte-duplicated
// values gives (rgb * a) in the high byte of each word.
__declspec(naked)
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm3, xmm3 // generate mask 0xff000000
    pslld xmm3, 24
    movdqa xmm4, xmmword ptr kShuffleAlpha0
    movdqa xmm5, xmmword ptr kShuffleAlpha1

  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    pshufb xmm0, xmm4 // isolate first 2 alphas
    movdqu xmm1, [eax] // read 4 pixels
    punpcklbw xmm1, xmm1 // first 2 pixel rgbs
    pmulhuw xmm0, xmm1 // rgb * a
    movdqu xmm1, [eax] // read 4 pixels
    pshufb xmm1, xmm5 // isolate next 2 alphas
    movdqu xmm2, [eax] // read 4 pixels
    punpckhbw xmm2, xmm2 // next 2 pixel rgbs
    pmulhuw xmm1, xmm2 // rgb * a
    movdqu xmm2, [eax] // mask original alpha
    lea eax, [eax + 16]
    pand xmm2, xmm3
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    por xmm0, xmm2 // copy original alpha
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    ret
  }
}
4357 #endif // HAS_ARGBATTENUATEROW_SSSE3
4358
4359 #ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Operates on byte-doubled pixels (after vpunpck*bw): picks the alpha word
// for each of the RGB word lanes; 128u zeroes the alpha word lane.
static const uvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
// AVX2 version of ARGBAttenuateRow: premultiplies RGB by alpha for 8 pixels
// per iteration, preserving the original alpha byte.
__declspec(naked)
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax // edx = dst - src so [eax + edx] addresses dst
    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu ymm6, [eax] // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpshufb ymm2, ymm0, ymm4 // low 4 alphas
    vpshufb ymm3, ymm1, ymm4 // high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * a
    vpmulhuw ymm1, ymm1, ymm3 // rgb * a
    vpand ymm6, ymm6, ymm5 // isolate alpha
    vpsrlw ymm0, ymm0, 8
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    vpor ymm0, ymm0, ymm6 // copy original alpha
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop

    vzeroupper
    ret
  }
}
4397 #endif // HAS_ARGBATTENUATEROW_AVX2
4398
4399 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Inverse of attenuate: divides RGB by alpha using the fixed_invtbl8
// reciprocal table (declared elsewhere), indexed per pixel by its alpha
// byte.  Assumes width is a multiple of 4 -- TODO(review): confirm.
__declspec(naked)
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4] // src_argb
    mov edx, [esp + 12 + 8] // dst_argb
    mov ecx, [esp + 12 + 12] // width
    lea ebx, fixed_invtbl8 // reciprocal-of-alpha lookup table

  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    movzx esi, byte ptr [eax + 3] // first alpha
    movzx edi, byte ptr [eax + 7] // second alpha
    punpcklbw xmm0, xmm0 // first 2
    movd xmm2, dword ptr [ebx + esi * 4]
    movd xmm3, dword ptr [ebx + edi * 4]
    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm0, xmm2 // rgb * inv_a
    movdqu xmm1, [eax] // read 4 pixels
    movzx esi, byte ptr [eax + 11] // third alpha
    movzx edi, byte ptr [eax + 15] // forth alpha
    punpckhbw xmm1, xmm1 // next 2
    movd xmm2, dword ptr [ebx + esi * 4]
    movd xmm3, dword ptr [ebx + edi * 4]
    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm1, xmm2 // rgb * inv_a
    lea eax, [eax + 16]
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}
4448 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4449
4450 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Replicates each pixel's inverse-alpha word across its RGB word lanes while
// leaving the alpha word lane (the "1" entry) in place.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
// AVX2 unattenuate using vpgatherdd to fetch 8 inverse alphas from
// fixed_invtbl8 in one instruction.  8 pixels per iteration.
__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax // edx = dst - src
    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
    vmovdqu ymm6, [eax] // read 8 pixels.
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
    vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
    vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop

    vzeroupper
    ret
  }
}
4491 #else // USE_GATHER
// Unattenuate 8 ARGB pixels per iteration. Same math as the USE_GATHER
// version above, but emulates VPGATHERDD with 8 scalar table loads from
// fixed_invtbl8 (VPGATHER is slow on current hardware; see TODO above).
// Naked function: ebx/esi/edi are callee-saved, so they are pushed manually
// and the argument offsets below include the 12 bytes of pushes.
__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {

    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4] // src_argb
    mov edx, [esp + 12 + 8] // dst_argb
    mov ecx, [esp + 12 + 12] // width
    sub edx, eax // dst addressed as [eax + edx]
    lea ebx, fixed_invtbl8
    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
    // replace VPGATHER
    movzx esi, byte ptr [eax + 3] // alpha0
    movzx edi, byte ptr [eax + 7] // alpha1
    vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
    vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
    movzx esi, byte ptr [eax + 11] // alpha2
    movzx edi, byte ptr [eax + 15] // alpha3
    vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
    vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
    vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
    movzx esi, byte ptr [eax + 19] // alpha4
    movzx edi, byte ptr [eax + 23] // alpha5
    vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
    vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
    vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
    movzx esi, byte ptr [eax + 27] // alpha6
    movzx edi, byte ptr [eax + 31] // alpha7
    vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
    vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
    vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu ymm6, [eax] // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    pop ebx
    vzeroupper
    ret
  }
}
4556 #endif // USE_GATHER
#endif  // HAS_ARGBUNATTENUATEROW_AVX2
4558
4559 #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// G = weighted luma computed with the kARGBToYJ coefficients (rounded via
// kAddYJ64); output pixel is GGGA, preserving the source alpha channel.
__declspec(naked)
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToYJ
    movdqa xmm5, xmmword ptr kAddYJ64

  convertloop:
    movdqu xmm0, [eax] // G
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm0, xmm1
    paddw xmm0, xmm5 // Add .5 for rounding.
    psrlw xmm0, 7
    packuswb xmm0, xmm0 // 8 G bytes
    movdqu xmm2, [eax] // A
    movdqu xmm3, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm2, 24
    psrld xmm3, 24
    packuswb xmm2, xmm3
    packuswb xmm2, xmm2 // 8 A bytes
    movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
    punpcklbw xmm0, xmm0 // 8 GG words
    punpcklbw xmm3, xmm2 // 8 GA words
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm3 // GGGA first 4
    punpckhwd xmm1, xmm3 // GGGA next 4
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
4600 #endif // HAS_ARGBGRAYROW_SSSE3
4601
4602 #ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Each table holds one output channel's weights in memory (BGRA) byte order
// {b, g, r, a}, repeated for 4 pixels, for use with pmaddubsw.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
4618
4619 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place on
// dst_argb. Each channel is a weighted sum of B/G/R (tables above) >> 7;
// the original alpha channel is preserved.
__declspec(naked)
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    mov ecx, [esp + 8] /* width */
    movdqa xmm2, xmmword ptr kARGBToSepiaB
    movdqa xmm3, xmmword ptr kARGBToSepiaG
    movdqa xmm4, xmmword ptr kARGBToSepiaR

  convertloop:
    movdqu xmm0, [eax] // B
    movdqu xmm6, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm6, xmm2
    phaddw xmm0, xmm6
    psrlw xmm0, 7
    packuswb xmm0, xmm0 // 8 B values
    movdqu xmm5, [eax] // G
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm5, xmm3
    pmaddubsw xmm1, xmm3
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5 // 8 G values
    punpcklbw xmm0, xmm5 // 8 BG values
    movdqu xmm5, [eax] // R
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm5, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5 // 8 R values
    movdqu xmm6, [eax] // A
    movdqu xmm1, [eax + 16]
    psrld xmm6, 24
    psrld xmm1, 24
    packuswb xmm6, xmm1
    packuswb xmm6, xmm6 // 8 A values
    punpcklbw xmm5, xmm6 // 8 RA values
    movdqa xmm1, xmm0 // Weave BG, RA together
    punpcklwd xmm0, xmm5 // BGRA first 4
    punpckhwd xmm1, xmm5 // BGRA next 4
    movdqu [eax], xmm0
    movdqu [eax + 16], xmm1
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
4670 #endif // HAS_ARGBSEPIAROW_SSSE3
4671
4672 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
4674 // Same as Sepia except matrix is provided.
4675 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4676 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
// Transform 8 ARGB pixels (32 bytes) with a caller-provided 4x4 color
// matrix. matrix_argb is 16 signed bytes; each row of 4 is broadcast to a
// full register with pshufd, then applied with pmaddubsw and a >> 6 shift
// (coefficients carry 6 fractional bits).
__declspec(naked)
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* matrix_argb */
    movdqu xmm5, [ecx]
    pshufd xmm2, xmm5, 0x00 // matrix row for B
    pshufd xmm3, xmm5, 0x55 // matrix row for G
    pshufd xmm4, xmm5, 0xaa // matrix row for R
    pshufd xmm5, xmm5, 0xff // matrix row for A
    mov ecx, [esp + 16] /* width */

  convertloop:
    movdqu xmm0, [eax] // B
    movdqu xmm7, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm7, xmm2
    movdqu xmm6, [eax] // G
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm6, xmm3
    pmaddubsw xmm1, xmm3
    phaddsw xmm0, xmm7 // B
    phaddsw xmm6, xmm1 // G
    psraw xmm0, 6 // B
    psraw xmm6, 6 // G
    packuswb xmm0, xmm0 // 8 B values
    packuswb xmm6, xmm6 // 8 G values
    punpcklbw xmm0, xmm6 // 8 BG values
    movdqu xmm1, [eax] // R
    movdqu xmm7, [eax + 16]
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm7, xmm4
    phaddsw xmm1, xmm7 // R
    movdqu xmm6, [eax] // A
    movdqu xmm7, [eax + 16]
    pmaddubsw xmm6, xmm5
    pmaddubsw xmm7, xmm5
    phaddsw xmm6, xmm7 // A
    psraw xmm1, 6 // R
    psraw xmm6, 6 // A
    packuswb xmm1, xmm1 // 8 R values
    packuswb xmm6, xmm6 // 8 A values
    punpcklbw xmm1, xmm6 // 8 RA values
    movdqa xmm6, xmm0 // Weave BG, RA together
    punpcklwd xmm0, xmm1 // BGRA first 4
    punpckhwd xmm6, xmm1 // BGRA next 4
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm6
    lea eax, [eax + 32]
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
4734 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
4735
4736 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4737 // Quantize 4 ARGB pixels (16 bytes).
// Quantize 4 ARGB pixels (16 bytes), in place on dst_argb:
// channel = (channel * scale >> 16) * interval_size + interval_offset.
// Alpha is masked off and carried through unchanged. scale/interval_size/
// interval_offset are broadcast to all 8 word lanes before the loop.
__declspec(naked)
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    movd xmm2, [esp + 8] /* scale */
    movd xmm3, [esp + 12] /* interval_size */
    movd xmm4, [esp + 16] /* interval_offset */
    mov ecx, [esp + 20] /* width */
    pshuflw xmm2, xmm2, 040h
    pshufd xmm2, xmm2, 044h
    pshuflw xmm3, xmm3, 040h
    pshufd xmm3, xmm3, 044h
    pshuflw xmm4, xmm4, 040h
    pshufd xmm4, xmm4, 044h
    pxor xmm5, xmm5 // constant 0
    pcmpeqb xmm6, xmm6 // generate mask 0xff000000
    pslld xmm6, 24

  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    punpcklbw xmm0, xmm5 // first 2 pixels
    pmulhuw xmm0, xmm2 // pixel * scale >> 16
    movdqu xmm1, [eax] // read 4 pixels
    punpckhbw xmm1, xmm5 // next 2 pixels
    pmulhuw xmm1, xmm2
    pmullw xmm0, xmm3 // * interval_size
    movdqu xmm7, [eax] // read 4 pixels
    pmullw xmm1, xmm3
    pand xmm7, xmm6 // mask alpha
    paddw xmm0, xmm4 // + interval_size / 2
    paddw xmm1, xmm4
    packuswb xmm0, xmm1
    por xmm0, xmm7 // restore original alpha
    movdqu [eax], xmm0
    lea eax, [eax + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
4779 #endif // HAS_ARGBQUANTIZEROW_SSE2
4780
4781 #ifdef HAS_ARGBSHADEROW_SSE2
4782 // Shade 4 pixels at a time by specified value.
// Shade 4 pixels at a time by a specified ARGB value: each channel is
// multiplied by the matching channel of 'value'. Both pixel and value bytes
// are duplicated into word lanes (x * 257), multiplied with pmulhuw and
// shifted, approximating (pixel * value) / 255 per channel.
__declspec(naked)
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    movd xmm2, [esp + 16] // value
    punpcklbw xmm2, xmm2 // duplicate value bytes into words
    punpcklqdq xmm2, xmm2 // copy to upper half for 2nd pixel pair

  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    pmulhuw xmm0, xmm2 // argb * value
    pmulhuw xmm1, xmm2 // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    ret
  }
}
4813 #endif // HAS_ARGBSHADEROW_SSE2
4814
4815 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4816 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// src_argb0 bytes are duplicated into words (x * 257) while src_argb1 bytes
// are zero-extended, so pmulhuw yields (a * 257 * b) >> 16, a close
// approximation of (a * b) / 255 per channel.
__declspec(naked)
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pxor xmm5, xmm5 // constant 0

  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    movdqu xmm2, [esi] // read 4 pixels from src_argb1
    movdqu xmm1, xmm0 // register copy for high half
    movdqu xmm3, xmm2 // register copy for high half
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    punpcklbw xmm2, xmm5 // first 2
    punpckhbw xmm3, xmm5 // next 2
    pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
    pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
    lea eax, [eax + 16]
    lea esi, [esi + 16]
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    pop esi
    ret
  }
}
4851 #endif // HAS_ARGBMULTIPLYROW_SSE2
4852
4853 #ifdef HAS_ARGBADDROW_SSE2
4854 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4855 // TODO(fbarchard): Port this to posix, neon and other math functions.
// Add 2 rows of ARGB pixels together with unsigned saturation, 4 pixels at
// a time, with a 1-pixel tail loop for widths that are not multiples of 4.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked)
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    sub ecx, 4
    jl convertloop49 // fewer than 4 pixels: go straight to tail

  convertloop4:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jge convertloop4

  convertloop49:
    add ecx, 4 - 1 // restore remainder count (minus 1 for jge test)
    jl convertloop19

  convertloop1:
    movd xmm0, [eax] // read 1 pixel from src_argb0
    lea eax, [eax + 4]
    movd xmm1, [esi] // read 1 pixel from src_argb1
    lea esi, [esi + 4]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge convertloop1

  convertloop19:
    pop esi
    ret
  }
}
4900 #endif // HAS_ARGBADDROW_SSE2
4901
4902 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4903 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
// Subtract 2 rows of ARGB pixels with unsigned saturation (clamped at 0),
// 4 pixels at a time: dst = src_argb0 - src_argb1 per channel.
__declspec(naked)
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    psubusb xmm0, xmm1 // src_argb0 - src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    pop esi
    ret
  }
}
4929 #endif // HAS_ARGBSUBTRACTROW_SSE2
4930
4931 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4932 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 version of ARGBMultiplyRow_SSE2: src_argb0 bytes duplicated into
// words (x * 257), src_argb1 zero-extended, vpmulhuw approximates
// (a * b) / 255 per channel.
__declspec(naked)
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    vpxor ymm5, ymm5, ymm5 // constant 0

  convertloop:
    vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
    lea esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1 // low 4
    vpunpckhbw ymm1, ymm1, ymm1 // high 4
    vpunpcklbw ymm2, ymm3, ymm5 // low 4
    vpunpckhbw ymm3, ymm3, ymm5 // high 4
    vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
    vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
    vpackuswb ymm0, ymm0, ymm1
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
4966 #endif // HAS_ARGBMULTIPLYROW_AVX2
4967
4968 #ifdef HAS_ARGBADDROW_AVX2
4969 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Add 2 rows of ARGB pixels together with unsigned saturation, 8 pixels at
// a time (no scalar tail loop in this AVX2 version).
__declspec(naked)
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
4995 #endif // HAS_ARGBADDROW_AVX2
4996
4997 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4998 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
// Subtract 2 rows of ARGB pixels with unsigned saturation (clamped at 0),
// 8 pixels at a time: dst = src_argb0 - src_argb1 per channel.
__declspec(naked)
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
5024 #endif // HAS_ARGBSUBTRACTROW_AVX2
5025
5026 #ifdef HAS_SOBELXROW_SSE2
5027 // SobelX as a matrix is
5028 // -1 0 1
5029 // -2 0 2
5030 // -1 0 1
// Horizontal Sobel: for 8 pixels at a time computes
// |(y0[0]-y0[2]) + 2*(y1[0]-y1[2]) + (y2[0]-y2[2])| clamped to 255.
// src_y1/src_y2/dst are folded into offsets from src_y0 so one pointer
// (eax) drives all four streams.
__declspec(naked)
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y0
    mov esi, [esp + 8 + 8] // src_y1
    mov edi, [esp + 8 + 12] // src_y2
    mov edx, [esp + 8 + 16] // dst_sobelx
    mov ecx, [esp + 8 + 20] // width
    sub esi, eax
    sub edi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0

  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
    movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1 // middle row added twice = weight 2
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
5080 #endif // HAS_SOBELXROW_SSE2
5081
5082 #ifdef HAS_SOBELYROW_SSE2
5083 // SobelY as a matrix is
5084 // -1 -2 -1
5085 // 0 0 0
5086 // 1 2 1
// Vertical Sobel: for 8 pixels at a time computes
// |(y0[0]-y1[0]) + 2*(y0[1]-y1[1]) + (y0[2]-y1[2])| clamped to 255.
// src_y1 and dst are folded into offsets from src_y0.
__declspec(naked)
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_y0
    mov esi, [esp + 4 + 8] // src_y1
    mov edx, [esp + 4 + 12] // dst_sobely
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0

  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
    movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1 // middle column added twice = weight 2
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}
5132 #endif // HAS_SOBELYROW_SSE2
5133
5134 #ifdef HAS_SOBELROW_SSE2
5135 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5136 // A = 255
5137 // R = Sobel
5138 // G = Sobel
5139 // B = Sobel
// Adds Sobel X and Sobel Y (saturating) and replicates the result into the
// B, G and R channels of 16 ARGB output pixels with A forced to 255.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked)
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax // sobely addressed as [eax + esi]
    pcmpeqb xmm5, xmm5 // alpha 255
    pslld xmm5, 24 // 0xff000000

  convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqa xmm2, xmm0 // GG
    punpcklbw xmm2, xmm0 // First 8
    punpckhbw xmm0, xmm0 // Next 8
    movdqa xmm1, xmm2 // GGGG
    punpcklwd xmm1, xmm2 // First 4
    punpckhwd xmm2, xmm2 // Next 4
    por xmm1, xmm5 // GGGA
    por xmm2, xmm5
    movdqa xmm3, xmm0 // GGGG
    punpcklwd xmm3, xmm0 // Next 4
    punpckhwd xmm0, xmm0 // Last 4
    por xmm3, xmm5 // GGGA
    por xmm0, xmm5
    movdqu [edx], xmm1
    movdqu [edx + 16], xmm2
    movdqu [edx + 32], xmm3
    movdqu [edx + 48], xmm0
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    pop esi
    ret
  }
}
5183 #endif // HAS_SOBELROW_SSE2
5184
5185 #ifdef HAS_SOBELTOPLANEROW_SSE2
5186 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
// Adds Sobel X and Sobel Y (saturating) and stores the result into a single
// 8-bit plane, 16 pixels at a time.
__declspec(naked)
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_y
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax // sobely addressed as [eax + esi]

  convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop esi
    ret
  }
}
5212 #endif // HAS_SOBELTOPLANEROW_SSE2
5213
5214 #ifdef HAS_SOBELXYROW_SSE2
5215 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5216 // A = 255
5217 // R = Sobel X
5218 // G = Sobel
5219 // B = Sobel Y
// Mixes Sobel X, Sobel Y and their saturating sum into 16 ARGB pixels.
// A = 255
// R = Sobel X
// G = Sobel (sobelx + sobely)
// B = Sobel Y
__declspec(naked)
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax // sobely addressed as [eax + esi]
    pcmpeqb xmm5, xmm5 // alpha 255

  convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    paddusb xmm2, xmm1 // sobel = sobelx + sobely
    movdqa xmm3, xmm0 // XA
    punpcklbw xmm3, xmm5
    punpckhbw xmm0, xmm5
    movdqa xmm4, xmm1 // YS
    punpcklbw xmm4, xmm2
    punpckhbw xmm1, xmm2
    movdqa xmm6, xmm4 // YSXA
    punpcklwd xmm6, xmm3 // First 4
    punpckhwd xmm4, xmm3 // Next 4
    movdqa xmm7, xmm1 // YSXA
    punpcklwd xmm7, xmm0 // Next 4
    punpckhwd xmm1, xmm0 // Last 4
    movdqu [edx], xmm6
    movdqu [edx + 16], xmm4
    movdqu [edx + 32], xmm7
    movdqu [edx + 48], xmm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    pop esi
    ret
  }
}
5262 #endif // HAS_SOBELXYROW_SSE2
5263
5264 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5265 // Consider float CumulativeSum.
5266 // Consider calling CumulativeSum one row at time as needed.
5267 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5268 // Convert cumulative sum for an area to an average for 1 pixel.
5269 // topleft is pointer to top left of CumulativeSum buffer for area.
5270 // botleft is pointer to bottom left of CumulativeSum buffer.
5271 // width is offset from left to right of area in CumulativeSum buffer measured
5272 // in number of ints.
5273 // area is the number of pixels in the area being averaged.
5274 // dst points to pixel to store result to.
5275 // count is number of averaged pixels to produce.
5276 // Does 4 pixels at a time.
5277 // This function requires alignment on accumulation buffer pointers.
// Not __declspec(naked): MSVC handles prologue/epilogue and preserves the
// esi/edi used inside the __asm block. Two 4-pixel paths: a fixed-point one
// (s4) for areas <= 128 pixels where sums fit 15 bits, and a float one (l4)
// for larger areas, plus a 1-pixel tail loop (l1).
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  __asm {
    mov eax, topleft // eax topleft
    mov esi, botleft // esi botleft
    mov edx, width
    movd xmm5, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm5, xmm5
    rcpss xmm4, xmm5 // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b

    cmp area, 128 // 128 pixels will not overflow 15 bits.
    ja l4

    pshufd xmm5, xmm5, 0 // area
    pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
    psrld xmm6, 16
    cvtdq2ps xmm6, xmm6
    addps xmm5, xmm6 // (65536.0 + area - 1)
    mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
    cvtps2dq xmm5, xmm5 // 0.16 fixed point
    packssdw xmm5, xmm5 // 16 bit shorts

    // 4 pixel loop small blocks.
  s4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
    packssdw xmm2, xmm3

    pmulhuw xmm0, xmm5 // sum * (0.16 fixed-point 1/area)
    pmulhuw xmm2, xmm5

    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge s4

    jmp l4b

    // 4 pixel loop
  l4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1 // restore remainder count (minus 1 for jge test)
    jl l1b

    // 1 pixel loop
  l1:
    movdqu xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
  l1b:
  }
}
5420 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5421
5422 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5423 // Creates a table of cumulative sums where each value is a sum of all values
5424 // above and to the left of the value.
// Not __declspec(naked): MSVC preserves the esi used in the __asm block.
// xmm0 carries the running per-channel row sum across iterations; each
// output is that running sum plus the cumulative value from the row above.
// The 4-pixel loop requires cumsum to be 16-byte aligned (test edx, 15);
// otherwise everything goes through the 1-pixel loop.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    pxor xmm0, xmm0 // running sum
    pxor xmm1, xmm1 // constant 0 for unpacking

    sub ecx, 4
    jl l4b
    test edx, 15 // unaligned cumsum: use 1 pixel loop
    jne l4b

    // 4 pixel loop
  l4:
    movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2

    punpcklbw xmm2, xmm1 // expand first 2 pixels to words
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1 // pixel 0 as 4 dwords
    punpckhwd xmm3, xmm1 // pixel 1 as 4 dwords

    punpckhbw xmm4, xmm1 // expand next 2 pixels to words
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1 // pixel 2 as 4 dwords
    punpckhwd xmm5, xmm1 // pixel 3 as 4 dwords

    paddd xmm0, xmm2 // accumulate pixel 0
    movdqu xmm2, [esi] // previous row above.
    paddd xmm2, xmm0

    paddd xmm0, xmm3 // accumulate pixel 1
    movdqu xmm3, [esi + 16]
    paddd xmm3, xmm0

    paddd xmm0, xmm4 // accumulate pixel 2
    movdqu xmm4, [esi + 32]
    paddd xmm4, xmm0

    paddd xmm0, xmm5 // accumulate pixel 3
    movdqu xmm5, [esi + 48]
    lea esi, [esi + 64]
    paddd xmm5, xmm0

    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    movdqu [edx + 32], xmm4
    movdqu [edx + 48], xmm5

    lea edx, [edx + 64]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1 // restore remainder count (minus 1 for jge test)
    jl l1b

    // 1 pixel loop
  l1:
    movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi]
    lea esi, [esi + 16]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1

  l1b:
  }
}
5504 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
5505
5506 #ifdef HAS_ARGBAFFINEROW_SSE2
5507 // Copy ARGB pixels from source image with slope to a row of destination.
// Copy ARGB pixels from a source image along an affine-transformed path:
// uv_dudv holds {u, v, du, dv} floats; for each destination pixel the source
// offset is computed as (int)u * 4 + (int)v * stride via pmaddwd against a
// packed {4, stride} constant in xmm5. Naked function: esi/edi pushed by
// hand, so arguments start at [esp + 12] (2 pushes + return address).
__declspec(naked)
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 12] // src_argb
    mov esi, [esp + 16] // stride
    mov edx, [esp + 20] // dst_argb
    mov ecx, [esp + 24] // pointer to uv_dudv
    movq xmm2, qword ptr [ecx] // uv
    movq xmm7, qword ptr [ecx + 8] // dudv
    mov ecx, [esp + 28] // width
    shl esi, 16 // 4, stride
    add esi, 4
    movd xmm5, esi
    sub ecx, 4
    jl l4b

    // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44 // dup dudv
    pshufd xmm5, xmm5, 0 // dup 4, stride
    movdqa xmm0, xmm2 // x0, y0, x1, y1
    addps xmm0, xmm7
    movlhps xmm2, xmm0
    movdqa xmm4, xmm7
    addps xmm4, xmm4 // dudv *= 2
    movdqa xmm3, xmm2 // x2, y2, x3, y3
    addps xmm3, xmm4
    addps xmm4, xmm4 // dudv *= 4

    // 4 pixel loop
  l4:
    cvttps2dq xmm0, xmm2 // x, y float to int first 2
    cvttps2dq xmm1, xmm3 // x, y float to int next 2
    packssdw xmm0, xmm1 // x, y as 8 shorts
    pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd xmm1, [eax + esi] // read pixel 0
    movd xmm6, [eax + edi] // read pixel 1
    punpckldq xmm1, xmm6 // combine pixel 0 and 1
    addps xmm2, xmm4 // x, y += dx, dy first 2
    movq qword ptr [edx], xmm1
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    movd xmm6, [eax + esi] // read pixel 2
    movd xmm0, [eax + edi] // read pixel 3
    punpckldq xmm6, xmm0 // combine pixel 2 and 3
    addps xmm3, xmm4 // x, y += dx, dy next 2
    movq qword ptr 8[edx], xmm6
    lea edx, [edx + 16]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1 // restore remainder count (minus 1 for jge test)
    jl l1b

    // 1 pixel loop
  l1:
    cvttps2dq xmm0, xmm2 // x, y float to int
    packssdw xmm0, xmm0 // x, y as shorts
    pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
    addps xmm2, xmm7 // x, y += dx, dy
    movd esi, xmm0
    movd xmm0, [eax + esi] // copy a pixel
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge l1
  l1b:
    pop edi
    pop esi
    ret
  }
}
5589 #endif // HAS_ARGBAFFINEROW_SSE2
5590
5591 #ifdef HAS_INTERPOLATEROW_AVX2
5592 // Bilinear filter 32x2 -> 32x1
// Bilinear filter 32x2 -> 32x1: blends two source rows with weights
// (256 - f) and f where f = source_y_fraction (0..255). Special cases:
// f == 0 copies the top row with rep movsb; f == 128 uses vpavgb for a
// 50/50 blend. The general path biases pixels by 128 so vpmaddubsw can be
// used (unsigned fractions x signed pixels), then unbiases and rounds.
__declspec(naked)
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 256. Blend 100 / 0.
    sub edi, esi // dst addressed as [esi + edi]
    cmp eax, 128
    je xloop50 // 128 /256 is 0.50. Blend 50 / 50.

    vmovd xmm0, eax // high fraction 0..255
    neg eax
    add eax, 256
    vmovd xmm5, eax // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5

    mov eax, 0x80808080 // 128b for bias and rounding.
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4

  xloop:
    vmovdqu ymm0, [esi]
    vmovdqu ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2 // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb ymm1, ymm1, ymm4 // bias to signed image
    vpsubb ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw ymm1, ymm1, ymm4 // unbias and round
    vpaddw ymm0, ymm0, ymm4
    vpsrlw ymm1, ymm1, 8
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm1 // unmutates
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop
    jmp xloop99

    // Blend 50 / 50.
  xloop50:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop50
    jmp xloop99

    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb // copies ecx bytes from [esi] to [edi]

  xloop99:
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
5665 #endif // HAS_INTERPOLATEROW_AVX2
5666
5667 // Bilinear filter 16x2 -> 16x1
5668 // TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked)
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  // Vertically blends two adjacent rows into one:
  //   dst = src * (256 - f) / 256 + src[stride] * f / 256,
  // with f = source_y_fraction.  Processes 16 bytes per iteration.
  __asm {
    push       esi
    push       edi

    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi  // edi = dst - src; all stores use [esi + edi].
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.

    movd       xmm0, eax  // high fraction 0..255
    neg        eax
    add        eax, 256
    movd       xmm5, eax  // low fraction 256..1
    punpcklbw  xmm5, xmm0  // pair {low, high} fraction bytes
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0  // replicate fraction pair to every lane
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0x00

    // General weighted blend.  pmaddubsw multiplies unsigned bytes
    // (the fractions, xmm5) by signed bytes, so the image is first biased
    // by -128; adding 0x8080 per word afterwards restores the bias
    // (128 * 256) and adds 128 for rounding before the >> 8.
  xloop:
    movdqu     xmm0, [esi]
    movdqu     xmm2, [esi + edx]
    movdqu     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    psubb      xmm0, xmm4  // bias image by -128
    psubb      xmm1, xmm4
    movdqa     xmm2, xmm5
    movdqa     xmm3, xmm5
    pmaddubsw  xmm2, xmm0
    pmaddubsw  xmm3, xmm1
    paddw      xmm2, xmm4  // unbias and round
    paddw      xmm3, xmm4
    psrlw      xmm2, 8
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [esi + edi], xmm2
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop
    jmp        xloop99

    // Blend 50 / 50.
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop50
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu     xmm0, [esi]
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
5748
5749 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
  // Reorders the bytes of each ARGB pixel using the 16-byte pshufb
  // control in 'shuffler'.  Processes 8 pixels (32 bytes) per iteration.
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    movdqu     xmm5, [ecx]      // load the shuffle control once
    mov        ecx, [esp + 16]  // width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop
    ret
  }
}
5774
5775 #ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  // AVX2 version of ARGBShuffleRow: reorders the bytes of each ARGB pixel
  // using the 16-byte shuffle control, broadcast to both 128-bit lanes so
  // vpshufb applies it to each lane independently.
  // Processes 16 pixels (64 bytes) per iteration.
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
    mov        ecx, [esp + 16]  // width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}
5802 #endif // HAS_ARGBSHUFFLEROW_AVX2
5803
__declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  // SSE2 fallback for ARGB channel reordering (no pshufb available).
  // Dispatches on the first 4 shuffler bytes (read as one little-endian
  // dword) to a word-shuffle fast path for each of the 4 common channel
  // orders; any other shuffle uses the scalar per-byte loop.
  __asm {
    push       ebx
    push       esi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        esi, [esp + 8 + 12]  // shuffler
    mov        ecx, [esp + 8 + 16]  // width
    pxor       xmm5, xmm5           // zero, for widening bytes to words

    mov        ebx, [esi]   // first 4 shuffler bytes, little-endian
    cmp        ebx, 0x03000102
    je         shuf_3012
    cmp        ebx, 0x00010203
    je         shuf_0123
    cmp        ebx, 0x00030201
    je         shuf_0321
    cmp        ebx, 0x02010003
    je         shuf_2103

    // Generic scalar path: dst[i] = src[shuffler[i]] for each of the
    // 4 bytes of every pixel.
    // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx      ebx, byte ptr [esi]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx], bl
    movzx      ebx, byte ptr [esi + 1]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 1], bl
    movzx      ebx, byte ptr [esi + 2]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 2], bl
    movzx      ebx, byte ptr [esi + 3]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 3], bl
    lea        eax, [eax + 4]
    lea        edx, [edx + 4]
    sub        ecx, 1
    jg         shuf_any1
    jmp        shuf99

    // Fast paths below widen bytes to words (punpck with zero), reorder
    // words with pshuflw/pshufhw, then repack.  4 pixels per iteration.
  shuf_0123:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw    xmm0, xmm0, 01Bh
    pshufhw    xmm1, xmm1, 01Bh
    pshuflw    xmm1, xmm1, 01Bh
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0123
    jmp        shuf99

  shuf_0321:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw    xmm0, xmm0, 039h
    pshufhw    xmm1, xmm1, 039h
    pshuflw    xmm1, xmm1, 039h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0321
    jmp        shuf99

  shuf_2103:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw    xmm0, xmm0, 093h
    pshufhw    xmm1, xmm1, 093h
    pshuflw    xmm1, xmm1, 093h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_2103
    jmp        shuf99

  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}
5919
5920 // YUY2 - Macro-pixel = 2 image pixels
5921 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5922
5923 // UYVY - Macro-pixel = 2 image pixels
5924 // U0Y0V0Y1
5925
__declspec(naked)
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  // Interleaves planar Y plus half-width U and V into packed YUY2
  // macropixels (Y0 U0 Y1 V0).  Processes 16 Y pixels (8 UV pairs,
  // 32 output bytes) per iteration.
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi  // edx = src_v - src_u; V reads use [esi + edx]

  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2  // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
5962
__declspec(naked)
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  // Interleaves planar Y plus half-width U and V into packed UYVY
  // macropixels (U0 Y0 V0 Y1).  Processes 16 Y pixels (8 UV pairs,
  // 32 output bytes) per iteration.
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi  // edx = src_v - src_u; V reads use [esi + edx]

  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0  // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
5999
6000 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked)
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  // Applies a per-channel cubic polynomial to each ARGB byte:
  //   dst = clamp(C0 + C1*v + C2*v^2 + C3*v^3)
  // 'poly' holds 4 coefficient vectors of 4 floats each (one float per
  // channel) at offsets 0, 16, 32, 48.  Processes 2 pixels per iteration.
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  convertloop:
//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel (SSE4.1 alternative)
//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3  // widen bytes to words
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0, words to dwords
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4  // saturating pack back to bytes
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 2
    jg         convertloop
    pop        esi
    ret
  }
}
6057 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
6058
6059 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked)
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  // AVX2 + FMA3 version of ARGBPolynomialRow: per-channel cubic
  //   dst = clamp(C0 + C1*v + C2*v^2 + C3*v^3).
  // Each 16-byte coefficient vector is broadcast to both 128-bit lanes so
  // two pixels (8 channels) are evaluated per iteration.
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0  // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    sub         ecx, 2
    jg          convertloop
    vzeroupper
    ret
  }
}
6096 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
6097
6098 #ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked)
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  // In-place scalar remap of every ARGB channel through an interleaved
  // 256-entry-per-channel table:
  //   b = table[b*4+0], g = table[g*4+1], r = table[r*4+2], a = table[a*4+3].
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]  // blue
    lea        eax, [eax + 4]       // advance early; reads use [eax - 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]  // green
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]  // red
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]  // alpha
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
6130 #endif // HAS_ARGBCOLORTABLEROW_X86
6131
6132 #ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked)
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  // Same as ARGBColorTableRow_X86 but remaps only B, G and R through the
  // interleaved table; the alpha byte of each pixel is left unchanged.
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]  // blue
    lea        eax, [eax + 4]       // advance early; reads use [eax - 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]  // green
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]  // red
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
6161 #endif // HAS_RGBCOLORTABLEROW_X86
6162
6163 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked)
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  // Remaps each pixel's R, G, B through a 256-entry table selected by the
  // pixel's luma; alpha is copied unchanged.  Per group of 4 pixels:
  //   luma  = pmaddubsw(BGRA bytes, lumacoeff bytes) horizontally summed,
  //   masked to a multiple of 256 (high byte kept),
  //   ptr   = luma table base + masked luma  -> per-pixel table segment.
  // Processes 4 pixels per iteration.
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   /* src_argb */
    mov        edi, [esp + 8 + 8]   /* dst_argb */
    mov        ecx, [esp + 8 + 12]  /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0  // splat table base to all 4 dwords
    pshufd     xmm3, xmm3, 0  // splat coefficients to all 4 dwords
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5  // zero, for widening words to dwords

    // 4 pixel loop.
  convertloop:
    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3  // weighted channel sums
    phaddw     xmm0, xmm0  // one luma word per pixel
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5  // zero extend luma words to dwords
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0   // esi = table segment for pixel 0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0   // esi = table segment for pixel 1
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0   // esi = table segment for pixel 2
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0   // esi = table segment for pixel 3

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
6260 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6261
6262 #endif // defined(_M_X64)
6263
6264 #ifdef __cplusplus
6265 } // extern "C"
6266 } // namespace libyuv
6267 #endif
6268
6269 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6270