1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 // This module is for Visual C 32/64 bit and clangcl 32 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15 (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16
17 #if defined(_M_X64)
18 #include <emmintrin.h>
19 #include <tmmintrin.h> // For _mm_maddubs_epi16
20 #endif
21
22 #ifdef __cplusplus
23 namespace libyuv {
24 extern "C" {
25 #endif
26
27 // 64 bit
28 #if defined(_M_X64)
29
30 // Read 4 UV from 422, upsample to 8 UV.
31 #define READYUV422 \
32 xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
33 xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
34 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
35 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
36 u_buf += 4; \
37 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
38 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
39 y_buf += 8;
40
41 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
42 #define READYUVA422 \
43 xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
44 xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
45 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
46 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
47 u_buf += 4; \
48 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
49 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
50 y_buf += 8; \
51 xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
52 a_buf += 8;
53
54 // Convert 8 pixels: 8 UV and 8 Y.
55 #define YUVTORGB(yuvconstants) \
56 xmm1 = _mm_loadu_si128(&xmm0); \
57 xmm2 = _mm_loadu_si128(&xmm0); \
58 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
59 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
60 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
61 xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
62 xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
63 xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
64 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
65 xmm0 = _mm_adds_epi16(xmm0, xmm4); \
66 xmm1 = _mm_adds_epi16(xmm1, xmm4); \
67 xmm2 = _mm_adds_epi16(xmm2, xmm4); \
68 xmm0 = _mm_srai_epi16(xmm0, 6); \
69 xmm1 = _mm_srai_epi16(xmm1, 6); \
70 xmm2 = _mm_srai_epi16(xmm2, 6); \
71 xmm0 = _mm_packus_epi16(xmm0, xmm0); \
72 xmm1 = _mm_packus_epi16(xmm1, xmm1); \
73 xmm2 = _mm_packus_epi16(xmm2, xmm2);
74
75 // Store 8 ARGB values.
76 #define STOREARGB \
77 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
78 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
79 xmm1 = _mm_loadu_si128(&xmm0); \
80 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
81 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
82 _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
83 _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
84 dst_argb += 32;
85
86 #if defined(HAS_I422TOARGBROW_SSSE3)
I422ToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)87 void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
88 const uint8_t* u_buf,
89 const uint8_t* v_buf,
90 uint8_t* dst_argb,
91 const struct YuvConstants* yuvconstants,
92 int width) {
93 __m128i xmm0, xmm1, xmm2, xmm4;
94 const __m128i xmm5 = _mm_set1_epi8(-1);
95 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
96 while (width > 0) {
97 READYUV422
98 YUVTORGB(yuvconstants)
99 STOREARGB
100 width -= 8;
101 }
102 }
103 #endif
104
105 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
I422AlphaToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,const uint8_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)106 void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
107 const uint8_t* u_buf,
108 const uint8_t* v_buf,
109 const uint8_t* a_buf,
110 uint8_t* dst_argb,
111 const struct YuvConstants* yuvconstants,
112 int width) {
113 __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
114 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
115 while (width > 0) {
116 READYUVA422
117 YUVTORGB(yuvconstants)
118 STOREARGB
119 width -= 8;
120 }
121 }
122 #endif
123
124 // 32 bit
125 #else // defined(_M_X64)
126 #ifdef HAS_ARGBTOYROW_SSSE3
127
128 // Constants for ARGB.
129 static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
130 13, 65, 33, 0, 13, 65, 33, 0};
131
132 // JPeg full range.
133 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
134 15, 75, 38, 0, 15, 75, 38, 0};
135
136 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
137 112, -74, -38, 0, 112, -74, -38, 0};
138
139 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
140 127, -84, -43, 0, 127, -84, -43, 0};
141
142 static const vec8 kARGBToV = {
143 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
144 };
145
146 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
147 -20, -107, 127, 0, -20, -107, 127, 0};
148
149 // vpshufb for vphaddw + vpackuswb packed to shorts.
150 static const lvec8 kShufARGBToUV_AVX = {
151 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
152 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
153
154 // Constants for BGRA.
155 static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
156 0, 33, 65, 13, 0, 33, 65, 13};
157
158 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
159 0, -38, -74, 112, 0, -38, -74, 112};
160
161 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
162 0, 112, -94, -18, 0, 112, -94, -18};
163
164 // Constants for ABGR.
165 static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
166 33, 65, 13, 0, 33, 65, 13, 0};
167
168 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
169 -38, -74, 112, 0, -38, -74, 112, 0};
170
171 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
172 112, -94, -18, 0, 112, -94, -18, 0};
173
174 // Constants for RGBA.
175 static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
176 0, 13, 65, 33, 0, 13, 65, 33};
177
178 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
179 0, 112, -74, -38, 0, 112, -74, -38};
180
181 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
182 0, -18, -94, 112, 0, -18, -94, 112};
183
184 static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
185 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
186
187 // 7 bit fixed point 0.5.
188 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
189
190 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
191 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
192
193 static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
194 0x8080u, 0x8080u, 0x8080u, 0x8080u};
195
196 // Shuffle table for converting RGB24 to ARGB.
197 static const uvec8 kShuffleMaskRGB24ToARGB = {
198 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
199
200 // Shuffle table for converting RAW to ARGB.
201 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
202 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
203
204 // Shuffle table for converting RAW to RGB24. First 8.
205 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
206 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
207 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
208
209 // Shuffle table for converting RAW to RGB24. Middle 8.
210 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
211 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
212 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
213
214 // Shuffle table for converting RAW to RGB24. Last 8.
215 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
216 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
217 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
218
219 // Shuffle table for converting ARGB to RGB24.
220 static const uvec8 kShuffleMaskARGBToRGB24 = {
221 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
222
223 // Shuffle table for converting ARGB to RAW.
224 static const uvec8 kShuffleMaskARGBToRAW = {
225 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
226
227 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
228 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
229 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
230
231 // YUY2 shuf 16 Y to 32 Y.
232 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
233 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
234 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
235
236 // YUY2 shuf 8 UV to 16 UV.
237 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
238 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
239 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
240
241 // UYVY shuf 16 Y to 32 Y.
242 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
243 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
244 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
245
246 // UYVY shuf 8 UV to 16 UV.
247 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
248 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
249 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
250
251 // NV21 shuf 8 VU to 16 UV.
252 static const lvec8 kShuffleNV21 = {
253 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
254 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
255 };
256
257 // Duplicates gray value 3 times and fills in alpha opaque.
// Expands 8 gray (J400) pixels per loop to 8 ARGB pixels: each gray byte is
// replicated into B, G and R; alpha is set to 0xff.
__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_y
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000 (alpha)
    pslld xmm5, 24

  convertloop:
    movq xmm0, qword ptr [eax]  // fetch 8 gray pixels
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0  // duplicate each byte: G -> GG
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0  // GG -> GGGG (low 4 pixels)
    punpckhwd xmm1, xmm1  // GG -> GGGG (high 4 pixels)
    por xmm0, xmm5  // set alpha to 0xff
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
285
286 #ifdef HAS_J400TOARGBROW_AVX2
287 // Duplicates gray value 3 times and fills in alpha opaque.
// AVX2 version: expands 16 gray (J400) pixels per loop to 16 ARGB pixels.
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_y
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0xff000000 (alpha)
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu xmm0, [eax]  // fetch 16 gray pixels
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8  // reorder 64-bit lanes before in-lane unpack
    vpunpcklbw ymm0, ymm0, ymm0  // duplicate each byte: G -> GG
    vpermq ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0  // GG -> GGGG (high 8 pixels)
    vpunpcklwd ymm0, ymm0, ymm0  // GG -> GGGG (low 8 pixels)
    vpor ymm0, ymm0, ymm5  // set alpha to 0xff
    vpor ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
317 #endif // HAS_J400TOARGBROW_AVX2
318
// Converts 16 pixels of packed RGB24 (48 bytes) to 16 ARGB pixels (64 bytes)
// per loop, inserting an opaque alpha byte via kShuffleMaskRGB24ToARGB.
__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_rgb24
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000 (alpha)
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu xmm0, [eax]  // fetch 48 bytes = 16 RGB24 pixels
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}
357
// Converts 16 pixels of packed RAW (BGR order, 48 bytes) to 16 ARGB pixels
// (64 bytes) per loop; kShuffleMaskRAWToARGB swaps R/B and inserts alpha.
__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_raw
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000 (alpha)
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB

  convertloop:
    movdqu xmm0, [eax]  // fetch 48 bytes = 16 RAW pixels
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm1[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}
396
// Converts 8 pixels of RAW (BGR, 24 bytes) to RGB24 per loop by swapping the
// R and B channels with three overlapping 8-byte shuffles.
__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                                           uint8_t* dst_rgb24,
                                           int width) {
  __asm {
    mov eax, [esp + 4]  // src_raw
    mov edx, [esp + 8]  // dst_rgb24
    mov ecx, [esp + 12]  // width
    movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

  convertloop:
    // Three overlapping loads; each shuffle produces 8 output bytes.
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 4]
    movdqu xmm2, [eax + 8]
    lea eax, [eax + 24]
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop
    ret
  }
}
425
426 // pmul method to replicate bits.
427 // Math to replicate bits:
428 // (v << 8) | (v << 3)
429 // v * 256 + v * 8
430 // v * (256 + 8)
431 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
432 // 20 instructions.
// Converts 8 pixels of RGB565 to ARGB per loop, replicating the 5/6 bit
// channels to 8 bits via the pmul trick described above.
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]  // src_rgb565
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    // edx = dst - 2*src, so [eax * 2 + edx] addresses dst while only the
    // source pointer (eax) is advanced each iteration.
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pmulhuw xmm1, xmm5  // * (256 + 8)
    pmulhuw xmm2, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    pand xmm0, xmm4  // G in middle 6 bits
    pmulhuw xmm0, xmm6  // << 5 * (256 + 4)
    por xmm0, xmm7  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0  // interleave to BGRA
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
481
482 #ifdef HAS_RGB565TOARGBROW_AVX2
483 // pmul method to replicate bits.
484 // Math to replicate bits:
485 // (v << 8) | (v << 3)
486 // v * 256 + v * 8
487 // v * (256 + 8)
488 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// AVX2 version: converts 16 pixels of RGB565 to ARGB per loop using the
// same bit-replication multipliers as the SSE2 version.
__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw ymm4, ymm4, 10
    vpsrlw ymm4, ymm4, 5
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4]  // src_rgb565
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    // edx = dst - 2*src, so [eax * 2 + edx] addresses dst while only the
    // source pointer (eax) is advanced each iteration.
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand ymm1, ymm0, ymm3  // R in upper 5 bits
    vpsllw ymm2, ymm0, 11  // B in upper 5 bits
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2  // RB
    vpand ymm0, ymm0, ymm4  // G in middle 6 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor ymm0, ymm0, ymm7  // AG
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
537 #endif // HAS_RGB565TOARGBROW_AVX2
538
539 #ifdef HAS_ARGB1555TOARGBROW_AVX2
// Converts 16 pixels of ARGB1555 to ARGB per loop. 5-bit channels are
// replicated to 8 bits with multiplies; the 1-bit alpha is sign-extended
// (psraw 8) to 0x00 or 0xff.
__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpsrlw ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4]  // src_argb1555
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    // edx = dst - 2*src, so [eax * 2 + edx] addresses dst while only the
    // source pointer (eax) is advanced each iteration.
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw ymm1, ymm0, 1  // R in upper 5 bits
    vpsllw ymm2, ymm0, 11  // B in upper 5 bits
    vpand ymm1, ymm1, ymm3
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2  // RB
    vpsraw ymm2, ymm0, 8  // A: sign-extend top bit across the byte
    vpand ymm0, ymm0, ymm4  // G in middle 5 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand ymm2, ymm2, ymm7
    vpor ymm0, ymm0, ymm2  // AG
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
589 #endif // HAS_ARGB1555TOARGBROW_AVX2
590
591 #ifdef HAS_ARGB4444TOARGBROW_AVX2
// Converts 16 pixels of ARGB4444 to ARGB per loop by duplicating each
// nibble into a full byte (x | x << 4 for low, x | x >> 4 for high).
__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    // edx = dst - 2*src, so [eax * 2 + edx] addresses dst while only the
    // source pointer (eax) is advanced each iteration.
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of bgra4444
    vpand ymm2, ymm0, ymm5  // mask high nibbles
    vpand ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw ymm3, ymm2, 4
    vpsllw ymm1, ymm0, 4
    vpor ymm2, ymm2, ymm3  // high nibble duplicated into both halves
    vpor ymm0, ymm0, ymm1  // low nibble duplicated into both halves
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
627 #endif // HAS_ARGB4444TOARGBROW_AVX2
628
629 // 24 instructions
// Converts 8 pixels of ARGB1555 to ARGB per loop (SSE2 version of the
// AVX2 routine above). 24 instructions
__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]  // src_argb1555
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    // edx = dst - 2*src, so [eax * 2 + edx] addresses dst while only the
    // source pointer (eax) is advanced each iteration.
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5  // * (256 + 8)
    pmulhuw xmm1, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // G in middle 5 bits
    psraw xmm2, 8  // A: sign-extend top bit across the byte
    pmulhuw xmm0, xmm6  // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
681
682 // 18 instructions.
// Converts 8 pixels of ARGB4444 to ARGB per loop by duplicating each
// nibble into a full byte. 18 instructions.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    // edx = dst - 2*src, so [eax * 2 + edx] addresses dst while only the
    // source pointer (eax) is advanced each iteration.
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // mask low nibbles
    pand xmm2, xmm5  // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1  // low nibble duplicated into both halves
    por xmm2, xmm3  // high nibble duplicated into both halves
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
720
// Converts 16 pixels of ARGB (64 bytes) to packed RGB24 (48 bytes) per loop:
// each register is shuffled down to 12 bytes, then the four 12-byte pieces
// are stitched together with shifts and ORs.
__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}
759
// Converts 16 pixels of ARGB (64 bytes) to packed RAW / BGR (48 bytes) per
// loop. Identical to ARGBToRGB24Row_SSSE3 except the shuffle mask also
// swaps R and B.
__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
                                          uint8_t* dst_rgb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}
798
// Converts 4 pixels of ARGB to RGB565 (8 bytes) per loop by shifting each
// channel into place within a 32-bit lane and packing to 16 bits.
__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R (arithmetic shift keeps sign for packssdw)
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0  // 4 x 32-bit lanes -> 4 x 16-bit RGB565
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
836
// Converts 4 pixels of ARGB to RGB565 per loop, adding a 4-byte ordered
// dither pattern (saturating) to each pixel before truncating the channels.
__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {

    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    movd xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    paddusb xmm0, xmm6  // add dither (saturating so channels cannot wrap)
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0  // 4 x 32-bit lanes -> 4 x 16-bit RGB565
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
882
883 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 version: converts 8 pixels of ARGB to RGB565 per loop with an
// ordered dither added (saturating) before channel truncation.
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11  // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6  // add dither (saturating)
    vpsrld ymm2, ymm0, 5  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrld ymm0, ymm0, 8  // R
    vpand ymm2, ymm2, ymm4  // G
    vpand ymm1, ymm1, ymm3  // B
    vpand ymm0, ymm0, ymm5  // R
    vpor ymm1, ymm1, ymm2  // BG
    vpor ymm0, ymm0, ymm1  // BGR
    vpackusdw ymm0, ymm0, ymm0  // 32-bit lanes -> 16-bit RGB565
    vpermq ymm0, ymm0, 0xd8  // undo in-lane pack ordering
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
925 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
926
927 // TODO(fbarchard): Improve sign extension/packing.
// Converts 4 pixels of ARGB to ARGB1555 (8 bytes) per loop: 5 bits per
// color channel plus a 1-bit alpha taken from the top of the A byte.
// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm4, xmm4  // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4  // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4  // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7  // generate mask 0xffff8000
    pslld xmm7, 15

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    movdqa xmm3, xmm0  // R
    psrad xmm0, 16  // A (arithmetic: keeps sign for packssdw)
    psrld xmm1, 3  // B
    psrld xmm2, 6  // G
    psrld xmm3, 9  // R
    pand xmm0, xmm7  // A
    pand xmm1, xmm4  // B
    pand xmm2, xmm5  // G
    pand xmm3, xmm6  // R
    por xmm0, xmm1  // BA
    por xmm2, xmm3  // GR
    por xmm0, xmm2  // BGRA
    packssdw xmm0, xmm0  // 4 x 32-bit lanes -> 4 x 16-bit ARGB1555
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
969
// Converts 4 pixels of ARGB to ARGB4444 (8 bytes) per loop by keeping the
// high nibble of each byte and packing pairs of bytes into one.
__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm4, xmm4  // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4  // generate mask 0x00f000f0
    psrlw xmm3, 8

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3  // low nibble
    pand xmm1, xmm4  // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1  // combine into one nibble-packed byte per channel pair
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
999
1000 #ifdef HAS_ARGBTORGB565ROW_AVX2
// AVX2 version: converts 8 pixels of ARGB to RGB565 (16 bytes) per loop.
__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11  // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm2, ymm0, 5  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrld ymm0, ymm0, 8  // R
    vpand ymm2, ymm2, ymm4  // G
    vpand ymm1, ymm1, ymm3  // B
    vpand ymm0, ymm0, ymm5  // R
    vpor ymm1, ymm1, ymm2  // BG
    vpor ymm0, ymm0, ymm1  // BGR
    vpackusdw ymm0, ymm0, ymm0  // 32-bit lanes -> 16-bit RGB565
    vpermq ymm0, ymm0, 0xd8  // undo in-lane pack ordering
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
1036 #endif // HAS_ARGBTORGB565ROW_AVX2
1037
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
// Convert 8 ARGB8888 pixels (32 bytes) per iteration to ARGB1555 (16 bytes).
// 5 bits each for B/G/R plus a 1-bit alpha taken from the top bit of A.
__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm4, ymm4, ymm4
    vpsrld ymm4, ymm4, 27  // generate mask 0x0000001f
    vpslld ymm5, ymm4, 5  // generate mask 0x000003e0
    vpslld ymm6, ymm4, 10  // generate mask 0x00007c00
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld ymm7, ymm7, 15

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm3, ymm0, 9  // R
    vpsrld ymm2, ymm0, 6  // G
    vpsrld ymm1, ymm0, 3  // B
    // Arithmetic shift copies the alpha sign (top) bit down; the 0xffff8000
    // mask then keeps exactly bit 15 = alpha MSB.
    vpsrad ymm0, ymm0, 16  // A
    vpand ymm3, ymm3, ymm6  // R
    vpand ymm2, ymm2, ymm5  // G
    vpand ymm1, ymm1, ymm4  // B
    vpand ymm0, ymm0, ymm7  // A
    vpor ymm0, ymm0, ymm1  // BA
    vpor ymm2, ymm2, ymm3  // GR
    vpor ymm0, ymm0, ymm2  // BGRA
    // Signed pack is safe here: bit 15 (alpha) survives as the sign bit.
    vpackssdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB1555
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2
1078
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
// Convert 8 ARGB8888 pixels (32 bytes) per iteration to ARGB4444 (16 bytes).
// AVX2 version of ARGBToARGB4444Row_SSE2 above.
__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw ymm4, ymm4, 12
    vpsrlw ymm3, ymm4, 8  // generate mask 0x00f000f0

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpand ymm1, ymm0, ymm4  // high nibble
    vpand ymm0, ymm0, ymm3  // low nibble
    vpsrld ymm1, ymm1, 8
    vpsrld ymm0, ymm0, 4
    vpor ymm0, ymm0, ymm1
    // Pack to bytes; vpermq fixes the per-128-bit-lane pack order.
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB4444
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2
1110
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Weights come from kARGBToY; result is scaled down by 7 bits and biased by
// kAddY16 for limited-range (16..235) Y output.
__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    // Multiply-add pairs of u8 channels by the Y coefficients.
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5  // add 16 for Y
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1145
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// "J" = JPeg full-range Y (0..255), hence kARGBToYJ/kAddYJ64 constants.
__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_y,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToYJ
    movdqa xmm5, xmmword ptr kAddYJ64

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5  // Add .5 for rounding.
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1182
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Dword permutation that undoes the per-lane interleave those two
// instructions introduce, restoring pixel order.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3; limited-range Y (adds 16).
__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToY
    vbroadcastf128 ymm5, xmmword ptr kAddY16
    vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5  // add 16 for Y
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2
1225
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// Full-range (JPeg) variant: kARGBToYJ coefficients, rounding via kAddYJ64,
// and no +16 bias.  Reuses kPermdARGBToY_AVX to undo pack/hadd mutation.
__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
    vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2
1267
// Convert 16 BGRA pixels (64 bytes) to 16 Y values.
// Identical structure to ARGBToYRow_SSSE3 but with kBGRAToY coefficients
// matching the BGRA byte order.
__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kBGRAToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5  // add 16 for Y
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1301
// Convert 16 ABGR pixels (64 bytes) to 16 Y values.
// Same as ARGBToYRow_SSSE3 with kABGRToY coefficients for ABGR byte order.
__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kABGRToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5  // add 16 for Y
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1335
// Convert 16 RGBA pixels (64 bytes) to 16 Y values.
// Same as ARGBToYRow_SSSE3 with kRGBAToY coefficients for RGBA byte order.
__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kRGBAToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5  // add 16 for Y
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
1369
// Convert 16x2 ARGB pixels (two rows of 64 bytes) to 8 U and 8 V values,
// 2x2 box-averaged.  src_stride_argb selects the second row; dst_v is
// addressed relative to dst_u via edi after the initial subtraction.
__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kARGBToV
    movdqa xmm7, xmmword ptr kARGBToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4  // average vertically
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    // Average horizontally: shufps gathers even/odd pixels, pavgb blends.
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1441
// Convert 16x2 ARGB pixels to 8 U and 8 V values, JPeg (full-range) variant.
// Same layout as ARGBToUVRow_SSSE3 but uses kARGBToUJ/kARGBToVJ coefficients
// and kAddUVJ128 as a rounding bias applied before the arithmetic shift.
__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
                                          int src_stride_argb,
                                          uint8_t* dst_u,
                                          uint8_t* dst_v,
                                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kAddUVJ128
    movdqa xmm6, xmmword ptr kARGBToVJ
    movdqa xmm7, xmmword ptr kARGBToUJ
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4  // average vertically
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    // Average horizontally: shufps gathers even/odd pixels, pavgb blends.
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5  // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1514
#ifdef HAS_ARGBTOUVROW_AVX2
// Convert 32x2 ARGB pixels to 16 U and 16 V values, 2x2 box-averaged.
// AVX2 version of ARGBToUVRow_SSSE3; vpermq/vpshufb undo the per-lane
// mutations introduced by vshufps/vphaddw/vpacksswb.
__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
                                        int src_stride_argb,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, xmmword ptr kAddUV128
    vbroadcastf128 ymm6, xmmword ptr kARGBToV
    vbroadcastf128 ymm7, xmmword ptr kARGBToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3  // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0  // mutates
    vpermq ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
    vpaddb ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2
1582
#ifdef HAS_ARGBTOUVJROW_AVX2
// Convert 32x2 ARGB pixels to 16 U and 16 V values, JPeg (full-range)
// variant of ARGBToUVRow_AVX2: kARGBToUJ/kARGBToVJ coefficients and a
// kAddUVJ128 rounding bias added before the arithmetic shift.
__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3  // mutates
    vphaddw ymm0, ymm0, ymm2
    vpaddw ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
    vpaddw ymm0, ymm0, ymm5
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0  // mutates
    vpermq ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVJROW_AVX2
1651
// Convert 16 ARGB pixels to 16 U and 16 V values with no subsampling (444).
// Reads the same 64 source bytes twice: once through the U coefficients,
// once through the V coefficients.
__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
                                            uint8_t* dst_u,
                                            uint8_t* dst_v,
                                            int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_argb
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kARGBToV
    movdqa xmm7, xmmword ptr kARGBToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* convert to U and V */
    movdqu xmm0, [eax]  // U
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5  // -> unsigned
    movdqu [edx], xmm0

    movdqu xmm0, [eax]  // V
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5  // -> unsigned
    lea eax, [eax + 64]
    movdqu [edx + edi], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
1709
// Convert 16x2 BGRA pixels to 8 U and 8 V values, 2x2 box-averaged.
// Same structure as ARGBToUVRow_SSSE3 with kBGRAToU/kBGRAToV coefficients.
__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kBGRAToV
    movdqa xmm7, xmmword ptr kBGRAToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4  // average vertically
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    // Average horizontally: shufps gathers even/odd pixels, pavgb blends.
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1781
// Convert 16x2 ABGR pixels to 8 U and 8 V values, 2x2 box-averaged.
// Same structure as ARGBToUVRow_SSSE3 with kABGRToU/kABGRToV coefficients.
__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kABGRToV
    movdqa xmm7, xmmword ptr kABGRToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4  // average vertically
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    // Average horizontally: shufps gathers even/odd pixels, pavgb blends.
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1853
// Convert 16x2 RGBA pixels to 8 U and 8 V values, 2x2 box-averaged.
// Same structure as ARGBToUVRow_SSSE3 with kRGBAToU/kRGBAToV coefficients.
__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kAddUV128
    movdqa xmm6, xmmword ptr kRGBAToV
    movdqa xmm7, xmmword ptr kRGBAToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4  // average vertically
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    // Average horizontally: shufps gathers even/odd pixels, pavgb blends.
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1925 #endif // HAS_ARGBTOYROW_SSSE3
1926
// AVX2 read/convert/store macro fragments used by the I4xx/NV1x row
// converters below.  Register convention across all of them:
//   eax = Y (or packed YUY2/UYVY) pointer, esi = U (or UV) pointer,
//   edi = V-minus-U offset, ebp = alpha pointer (A422 variant only).
// Outputs: ymm0 = interleaved UV bytes, ymm4 = Y duplicated to 16 bits,
// ymm5 = alpha (A422 variant only).

// Read 16 UV from 444
#define READYUV444_AVX2 \
  __asm { \
    __asm vmovdqu xmm0, [esi] /* U */ \
    __asm vmovdqu xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 16] \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpermq ymm1, ymm1, 0xd8 \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16]}

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
  __asm { \
    __asm vmovq xmm0, qword ptr [esi] /* U */ \
    __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16]}

// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
  __asm { \
    __asm vmovq xmm0, qword ptr [esi] /* U */ \
    __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16] \
    __asm vmovdqu xmm5, [ebp] /* A */ \
    __asm vpermq ymm5, ymm5, 0xd8 \
    __asm lea ebp, [ebp + 16]}

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
  __asm { \
    __asm vmovdqu xmm0, [esi] /* UV */ \
    __asm lea esi, [esi + 16] \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16]}

// Read 8 UV from NV21, upsample to 16 UV.
// kShuffleNV21 swaps VU byte order to UV while duplicating.
#define READNV21_AVX2 \
  __asm { \
    __asm vmovdqu xmm0, [esi] /* UV */ \
    __asm lea esi, [esi + 16] \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16]}

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
  __asm { \
    __asm vmovdqu ymm4, [eax] /* YUY2 */ \
    __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
    __asm vmovdqu ymm0, [eax] /* UV */ \
    __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
    __asm lea eax, [eax + 32]}

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
  __asm { \
    __asm vmovdqu ymm4, [eax] /* UYVY */ \
    __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
    __asm vmovdqu ymm0, [eax] /* UV */ \
    __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
    __asm lea eax, [eax + 32]}

// Convert 16 pixels: 16 UV and 16 Y.
// Applies the UV contribution (bias minus madd result), adds the Y
// contribution scaled via vpmulhuw, then shifts by 6 and saturates to
// unsigned bytes, leaving B/G/R in ymm0/ymm1/ymm2.
#define YUVTORGB_AVX2(YuvConstants) \
  __asm { \
    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
    __asm vpsubw ymm2, ymm3, ymm2 \
    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
    __asm vpsubw ymm1, ymm3, ymm1 \
    __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
    __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
    __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
    __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
    __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
    __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
    __asm vpsraw ymm0, ymm0, 6 \
    __asm vpsraw ymm1, ymm1, 6 \
    __asm vpsraw ymm2, ymm2, 6 \
    __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
    __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
    __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
  }

// Store 16 ARGB values.
// Expects B/G/R in ymm0/ymm1/ymm2 and alpha in ymm5; edx = destination.
#define STOREARGB_AVX2 \
  __asm { \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
    __asm vpermq ymm0, ymm0, 0xd8 \
    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
    __asm vpermq ymm2, ymm2, 0xd8 \
    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
    __asm vmovdqu 0[edx], ymm1 \
    __asm vmovdqu 32[edx], ymm0 \
    __asm lea edx, [edx + 64]}

// Store 16 RGBA values.
// Same inputs as STOREARGB_AVX2, interleaved in RGBA byte order.
#define STORERGBA_AVX2 \
  __asm { \
    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
    __asm vpermq ymm1, ymm1, 0xd8 \
    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
    __asm vpermq ymm2, ymm2, 0xd8 \
    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
    __asm vmovdqu [edx], ymm0 \
    __asm vmovdqu [edx + 32], ymm1 \
    __asm lea edx, [edx + 64]}
2063
#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// ebx carries the YuvConstants pointer into YUVTORGB_AVX2; ymm5 is preset to
// all-ones so STOREARGB_AVX2 writes opaque alpha.
__declspec(naked) void I422ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V addressed as U + edi
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

  convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TOARGBROW_AVX2
2103
#ifdef HAS_I422ALPHATOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Unlike I422ToARGBRow_AVX2, alpha comes from a_buf (loaded into ymm5 by
// READYUVA422_AVX2 via ebp) rather than being forced to 0xff.
__declspec(naked) void I422AlphaToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov eax, [esp + 16 + 4]  // Y
    mov esi, [esp + 16 + 8]  // U
    mov edi, [esp + 16 + 12]  // V
    mov ebp, [esp + 16 + 16]  // A
    mov edx, [esp + 16 + 20]  // argb
    mov ebx, [esp + 16 + 24]  // yuvconstants
    mov ecx, [esp + 16 + 28]  // width
    sub edi, esi  // V addressed as U + edi

  convertloop:
    READYUVA422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2
2146
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// 444 has one UV pair per pixel, so no upsampling is needed.
__declspec(naked) void I444ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I444TOARGBROW_AVX2
2185
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV12 has interleaved UV in a single plane (esi), so no V offset is needed.
__declspec(naked) void NV12ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* uv_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // UV
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2
2220
#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV21 stores V before U; READNV21_AVX2 handles the byte swap.
__declspec(naked) void NV21ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* vu_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // VU
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV21_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2
2255
#ifdef HAS_YUY2TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// YUY2 is a packed format (Y0 U0 Y1 V0 ...) read from a single pointer (eax).
__declspec(naked) void YUY2ToARGBRow_AVX2(
    const uint8_t* src_yuy2,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // yuy2
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUY2_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOARGBROW_AVX2
2286
#ifdef HAS_UYVYTOARGBROW_AVX2
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// UYVY is a packed format (U0 Y0 V0 Y1 ...) read from a single pointer (eax).
__declspec(naked) void UYVYToARGBRow_AVX2(
    const uint8_t* src_uyvy,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // uyvy
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READUYVY_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_UYVYTOARGBROW_AVX2
2317
#ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Same as I422ToARGBRow_AVX2 but the store step weaves channels in RGBA order.
__declspec(naked) void I422ToRGBARow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // abgr
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STORERGBA_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TORGBAROW_AVX2
2357
#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Allows a conversion with half size scaling.

// SSSE3 read macros.  Register convention shared by all of them:
//   eax = Y pointer, esi = U (or packed UV/VU/YUY2/UYVY) pointer,
//   edi = V-minus-U offset (planar formats only), ebp = alpha pointer.
// On exit xmm0 holds 8 interleaved UV bytes, xmm4 holds 8 Y values widened
// to 16 bits by self-unpack, and xmm5 holds 8 alpha bytes (READYUVA422 only).

// Read 8 UV from 444.
#define READYUV444 \
  __asm { \
    __asm movq xmm0, qword ptr [esi] /* U */ \
    __asm movq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
  __asm { \
    __asm movd xmm0, [esi] /* U */ \
    __asm movd xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 4] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  __asm { \
    __asm movd xmm0, [esi] /* U */ \
    __asm movd xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 4] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
    __asm movq xmm4, qword ptr [eax] /* Y */ \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8] \
    __asm movq xmm5, qword ptr [ebp] /* A */ \
    __asm lea ebp, [ebp + 8]}

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
  __asm { \
    __asm movq xmm0, qword ptr [esi] /* UV */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 VU from NV21, upsample to 8 UV.
// kShuffleNV21 swaps the V/U byte order while upsampling.
#define READNV21 \
  __asm { \
    __asm movq xmm0, qword ptr [esi] /* UV */ \
    __asm lea esi, [esi + 8] \
    __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
// The same 16 bytes are loaded twice and shuffled into Y and UV lanes.
#define READYUY2 \
  __asm { \
    __asm movdqu xmm4, [eax] /* YUY2 */ \
    __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
    __asm movdqu xmm0, [eax] /* UV */ \
    __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
    __asm lea eax, [eax + 16]}

// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY \
  __asm { \
    __asm movdqu xmm4, [eax] /* UYVY */ \
    __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
    __asm movdqu xmm0, [eax] /* UV */ \
    __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
    __asm lea eax, [eax + 16]}
2436
// Convert 8 pixels: 8 UV and 8 Y.
// Input: xmm0 = 8 interleaved UV byte pairs, xmm4 = 8 Y values (widened).
// Output: xmm0/xmm1/xmm2 = 8 B/G/R bytes (each value duplicated by packuswb).
// Math (fixed point, 6 fractional bits): each channel is computed as
// bias - dot(UV, coeff) + Y*scale, then arithmetically shifted right 6 and
// saturated to unsigned bytes.  The constants live at fixed byte offsets
// (KUVTOB, KUVBIASB, KYTORGB, ...) inside the YuvConstants struct passed in
// the YuvConstants register argument.
#define YUVTORGB(YuvConstants) \
  __asm { \
    __asm movdqa xmm1, xmm0 \
    __asm movdqa xmm2, xmm0 \
    __asm movdqa xmm3, xmm0 \
    __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
    __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
    __asm psubw xmm0, xmm1 \
    __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
    __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
    __asm psubw xmm1, xmm2 \
    __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
    __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
    __asm psubw xmm2, xmm3 \
    __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
    __asm paddsw xmm0, xmm4 /* B += Y */ \
    __asm paddsw xmm1, xmm4 /* G += Y */ \
    __asm paddsw xmm2, xmm4 /* R += Y */ \
    __asm psraw xmm0, 6 \
    __asm psraw xmm1, 6 \
    __asm psraw xmm2, 6 \
    __asm packuswb xmm0, xmm0 /* B */ \
    __asm packuswb xmm1, xmm1 /* G */ \
    __asm packuswb xmm2, xmm2 /* R */ \
  }
2463
// SSSE3 store macros.  Input: xmm0 = B, xmm1 = G, xmm2 = R (8 bytes each,
// from YUVTORGB); xmm5 = alpha where used.  Output pointer is edx, advanced
// past the bytes written.  STOREARGB expects xmm5 pre-set by the caller;
// STOREBGRA/STORERGBA generate an opaque alpha themselves, clobbering xmm5.

// Store 8 ARGB values.
#define STOREARGB \
  __asm { \
    __asm punpcklbw xmm0, xmm1 /* BG */ \
    __asm punpcklbw xmm2, xmm5 /* RA */ \
    __asm movdqa xmm1, xmm0 \
    __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
    __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
    __asm movdqu 0[edx], xmm0 \
    __asm movdqu 16[edx], xmm1 \
    __asm lea edx, [edx + 32]}

// Store 8 BGRA values.
#define STOREBGRA \
  __asm { \
    __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
    __asm punpcklbw xmm1, xmm0 /* GB */ \
    __asm punpcklbw xmm5, xmm2 /* AR */ \
    __asm movdqa xmm0, xmm5 \
    __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
    __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
    __asm movdqu 0[edx], xmm5 \
    __asm movdqu 16[edx], xmm0 \
    __asm lea edx, [edx + 32]}

// Store 8 RGBA values.
#define STORERGBA \
  __asm { \
    __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
    __asm punpcklbw xmm1, xmm2 /* GR */ \
    __asm punpcklbw xmm5, xmm0 /* AB */ \
    __asm movdqa xmm0, xmm5 \
    __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
    __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
    __asm movdqu 0[edx], xmm5 \
    __asm movdqu 16[edx], xmm0 \
    __asm lea edx, [edx + 32]}

// Store 8 RGB24 values (24 bytes).  xmm5/xmm6 must hold the
// kShuffleMaskARGBToRGB24 tables, loaded by the caller.
#define STORERGB24 \
  __asm {/* Weave into RRGB */ \
    __asm punpcklbw xmm0, xmm1 /* BG */ \
    __asm punpcklbw xmm2, xmm2 /* RR */ \
    __asm movdqa xmm1, xmm0 \
    __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
    __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
    __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
    __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
    __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
    __asm lea edx, [edx + 24]}

// Store 8 RGB565 values (16 bytes).  xmm5/xmm6/xmm7 must hold the
// B/G/R field masks (0x0000001f / 0x000007e0 / 0xfffff800), set by caller.
#define STORERGB565 \
  __asm {/* Weave into RRGB */ \
    __asm punpcklbw xmm0, xmm1 /* BG */ \
    __asm punpcklbw xmm2, xmm2 /* RR */ \
    __asm movdqa xmm1, xmm0 \
    __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
    __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
    __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
    __asm movdqa xmm2, xmm0 /* G */ \
    __asm pslld xmm0, 8 /* R */ \
    __asm psrld xmm3, 3 /* B */ \
    __asm psrld xmm2, 5 /* G */ \
    __asm psrad xmm0, 16 /* R */ \
    __asm pand xmm3, xmm5 /* B */ \
    __asm pand xmm2, xmm6 /* G */ \
    __asm pand xmm0, xmm7 /* R */ \
    __asm por xmm3, xmm2 /* BG */ \
    __asm por xmm0, xmm3 /* BGR */ \
    __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
    __asm movdqa xmm2, xmm1 /* G */ \
    __asm pslld xmm1, 8 /* R */ \
    __asm psrld xmm3, 3 /* B */ \
    __asm psrld xmm2, 5 /* G */ \
    __asm psrad xmm1, 16 /* R */ \
    __asm pand xmm3, xmm5 /* B */ \
    __asm pand xmm2, xmm6 /* G */ \
    __asm pand xmm1, xmm7 /* R */ \
    __asm por xmm3, xmm2 /* BG */ \
    __asm por xmm1, xmm3 /* BGR */ \
    __asm packssdw xmm0, xmm1 \
    __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
    __asm lea edx, [edx + 16]}
2550
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// SSSE3 variant: processes 8 pixels per loop iteration.
__declspec(naked) void I444ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUV444
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}
2587
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
// xmm5/xmm6 hold the ARGB->RGB24 shuffle tables required by STORERGB24.
__declspec(naked) void I422ToRGB24Row_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_rgb24,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.
    movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB24

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}
2625
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
// xmm5/xmm6/xmm7 hold the 5/6/5 bit-field masks consumed by STORERGB565.
__declspec(naked) void I422ToRGB565Row_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* rgb565_buf,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.
    pcmpeqb xmm5, xmm5  // generate mask 0x0000001f
    psrld xmm5, 27
    pcmpeqb xmm6, xmm6  // generate mask 0x000007e0
    psrld xmm6, 26
    pslld xmm6, 5
    pcmpeqb xmm7, xmm7  // generate mask 0xfffff800
    pslld xmm7, 11

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB565

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}
2668
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I422ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}
2705
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
// Alpha comes from a_buf (ebp) via READYUVA422, so xmm5 is not pre-set here.
__declspec(naked) void I422AlphaToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov eax, [esp + 16 + 4]  // Y
    mov esi, [esp + 16 + 8]  // U
    mov edi, [esp + 16 + 12]  // V
    mov ebp, [esp + 16 + 16]  // A
    mov edx, [esp + 16 + 20]  // argb
    mov ebx, [esp + 16 + 24]  // yuvconstants
    mov ecx, [esp + 16 + 28]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.

 convertloop:
    READYUVA422
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}
2745
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV12 has interleaved UV in a single plane (esi); no V offset is needed.
__declspec(naked) void NV12ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* uv_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // UV
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READNV12
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop esi
    ret
  }
}
2777
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV21 stores V before U; READNV21 swaps them via kShuffleNV21.
__declspec(naked) void NV21ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* vu_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // VU
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READNV21
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop esi
    ret
  }
}
2809
// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
// Packed input: a single pointer (eax) supplies both Y and UV.
__declspec(naked) void YUY2ToARGBRow_SSSE3(
    const uint8_t* src_yuy2,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // yuy2
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUY2
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    ret
  }
}
2837
// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
// Packed input: a single pointer (eax) supplies both Y and UV.
__declspec(naked) void UYVYToARGBRow_SSSE3(
    const uint8_t* src_uyvy,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // uyvy
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READUYVY
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    ret
  }
}
2865
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGBA (32 bytes).
// STORERGBA generates its own opaque alpha, so xmm5 is not pre-set here.
__declspec(naked) void I422ToRGBARow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_rgba,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // V is addressed as [esi + edi] in the READ macro.

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGBA

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}
2899 #endif // HAS_I422TOARGBROW_SSSE3
2900
// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
// Grayscale: G = (Y - 16) * 1.164 is computed once and replicated to B, G
// and R by the byte unpacks; alpha is forced to 0xff via the xmm4 mask.
// The yuvconstants parameter is accepted but unused.
__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                                          uint8_t* rgb_buf,
                                          const struct YuvConstants*,
                                          int width) {
  __asm {
    mov eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
    movd xmm2, eax
    pshufd xmm2, xmm2,0
    mov eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
    movd xmm3, eax
    pshufd xmm3, xmm3, 0
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24

    mov eax, [esp + 4]  // Y
    mov edx, [esp + 8]  // rgb
    mov ecx, [esp + 12]  // width

 convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0  // Y.Y
    pmulhuw xmm0, xmm2
    psubusw xmm0, xmm3  // unsigned saturation clamps Y < 16 to 0.
    psrlw xmm0, 6
    packuswb xmm0, xmm0  // G

    // Step 2: Weave into ARGB
    punpcklbw xmm0, xmm0  // GG
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0  // BGRA first 4 pixels
    punpckhwd xmm1, xmm1  // BGRA next 4 pixels
    por xmm0, xmm4
    por xmm1, xmm4
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
#endif  // HAS_I400TOARGBROW_SSE2
2948
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// The vpermq shuffles compensate for AVX2's per-128-bit-lane unpack
// behavior.  The yuvconstants parameter is accepted but unused.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                                          uint8_t* rgb_buf,
                                          const struct YuvConstants*,
                                          int width) {
  __asm {
    mov eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
    vmovd xmm2, eax
    vbroadcastss ymm2, xmm2
    mov eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
    vmovd xmm3, eax
    vbroadcastss ymm3, xmm3
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0xff000000
    vpslld ymm4, ymm4, 24

    mov eax, [esp + 4]  // Y
    mov edx, [esp + 8]  // rgb
    mov ecx, [esp + 12]  // width

 convertloop:
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    vmovdqu xmm0, [eax]
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8  // vpunpcklbw mutates
    vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
    vpmulhuw ymm0, ymm0, ymm2
    vpsubusw ymm0, ymm0, ymm3  // unsigned saturation clamps Y < 16 to 0.
    vpsrlw ymm0, ymm0, 6
    vpackuswb ymm0, ymm0, ymm0  // G. still mutated: 3120

    // TODO(fbarchard): Weave alpha with unpack.
    // Step 2: Weave into ARGB
    vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
    vpermq ymm1, ymm1, 0xd8
    vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
    vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
    vpor ymm0, ymm0, ymm4
    vpor ymm1, ymm1, ymm4
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_I400TOARGBROW_AVX2
2999
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};

// Horizontally mirrors a row of bytes.  Reads backwards from the end of
// src ([eax - 16 + ecx]) while writing forwards to dst, reversing each 16
// bytes with pshufb.
// TODO(fbarchard): Replace lea with -16 offset.
__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
                                       uint8_t* dst,
                                       int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    movdqa xmm5, xmmword ptr kShuffleMirror

 convertloop:
    movdqu xmm0, [eax - 16 + ecx]
    pshufb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSSE3
3026
#ifdef HAS_MIRRORROW_AVX2
// Horizontally mirrors a row of bytes, 32 at a time.  pshufb reverses each
// 128-bit lane; the vpermq then swaps the two lanes to complete the reverse.
__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
                                      uint8_t* dst,
                                      int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror

 convertloop:
    vmovdqu ymm0, [eax - 32 + ecx]
    vpshufb ymm0, ymm0, ymm5
    vpermq ymm0, ymm0, 0x4e  // swap high and low halfs
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_MIRRORROW_AVX2
3050
#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Low 8 results are reversed U bytes; high 8 are reversed V bytes.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};

// Mirrors an interleaved UV row while splitting it into separate U and V
// rows.  src is walked backwards from its end; dst_v is addressed relative
// to dst_u ([edx + edi]).
__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                                              uint8_t* dst_u,
                                              uint8_t* dst_v,
                                              int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    movdqa xmm1, xmmword ptr kShuffleMirrorUV
    lea eax, [eax + ecx * 2 - 16]  // start at last 16 bytes of src.
    sub edi, edx

 convertloop:
    movdqu xmm0, [eax]
    lea eax, [eax - 16]
    pshufb xmm0, xmm1
    movlpd qword ptr [edx], xmm0  // low 8 bytes = mirrored U.
    movhpd qword ptr [edx + edi], xmm0  // high 8 bytes = mirrored V.
    lea edx, [edx + 8]
    sub ecx, 8
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MIRRORSPLITUVROW_SSSE3
3085
#ifdef HAS_ARGBMIRRORROW_SSE2
// Mirrors a row of ARGB pixels.  Pixels (dwords) are reversed with pshufd
// 0x1b (3,2,1,0 order); bytes within each pixel keep their order.
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
                                          uint8_t* dst,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    lea eax, [eax - 16 + ecx * 4]  // last 4 pixels.

 convertloop:
    movdqu xmm0, [eax]
    lea eax, [eax - 16]
    pshufd xmm0, xmm0, 0x1b
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_SSE2
3108
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
// Dword indices in reverse order; used by vpermd to reverse 8 ARGB pixels.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};

// Mirrors a row of ARGB pixels, 8 at a time, loading directly from the
// (reversed) source via vpermd's memory operand.
__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
                                          uint8_t* dst,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2

 convertloop:
    vpermd ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_AVX2
3133
#ifdef HAS_SPLITUVROW_SSE2
// De-interleaves a UV row into separate U and V rows, 16 pixels (32 bytes)
// per iteration.  Even bytes (U) are masked out, odd bytes (V) shifted down;
// dst_v is addressed relative to dst_u ([edx + edi]).
__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
                                       uint8_t* dst_u,
                                       uint8_t* dst_v,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uv
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

 convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    pand xmm0, xmm5  // even bytes
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm2, 8  // odd bytes
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [edx], xmm0
    movdqu [edx + edi], xmm2
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

#endif  // HAS_SPLITUVROW_SSE2
3173
#ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a UV row into separate U and V rows, 32 pixels (64 bytes)
// per iteration.  The vpermq 0xd8 shuffles undo vpackuswb's per-lane
// interleaving; dst_v is addressed relative to dst_u ([edx + edi]).
__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
                                       uint8_t* dst_u,
                                       uint8_t* dst_v,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uv
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx

 convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm2, ymm0, 8  // odd bytes
    vpsrlw ymm3, ymm1, 8
    vpand ymm0, ymm0, ymm5  // even bytes
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpackuswb ymm2, ymm2, ymm3
    vpermq ymm0, ymm0, 0xd8
    vpermq ymm2, ymm2, 0xd8
    vmovdqu [edx], ymm0
    vmovdqu [edx + edi], ymm2
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_SPLITUVROW_AVX2
3213
#ifdef HAS_MERGEUVROW_SSE2
// Interleaves separate U and V rows into a single UV row, 16 pixels per
// iteration.  src_v is addressed relative to src_u ([eax + edx]).
__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
                                       const uint8_t* src_v,
                                       uint8_t* dst_uv,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_u
    mov edx, [esp + 4 + 8]  // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax

 convertloop:
    movdqu xmm0, [eax]  // read 16 U's
    movdqu xmm1, [eax + edx]  // and 16 V's
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    punpcklbw xmm0, xmm1  // first 8 UV pairs
    punpckhbw xmm2, xmm1  // next 8 UV pairs
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MERGEUVROW_SSE2
3245
#ifdef HAS_MERGEUVROW_AVX2
// Interleaves separate U and V rows into a single UV row, 32 pixels per
// iteration.  vpunpck works per 128-bit lane, so the four vextractf128
// stores write the lanes back out in correct sequential order.
__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
                                       const uint8_t* src_v,
                                       uint8_t* dst_uv,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_u
    mov edx, [esp + 4 + 8]  // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax

 convertloop:
    vmovdqu ymm0, [eax]  // read 32 U's
    vmovdqu ymm1, [eax + edx]  // and 32 V's
    lea eax, [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
    vextractf128 [edi], ymm2, 0  // bytes 0..15
    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
    lea edi, [edi + 64]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_MERGEUVROW_AVX2
3279
#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at time.
// Dispatches to an aligned-move loop when both src and dst are 16-byte
// aligned, otherwise uses unaligned moves.
__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
                                    uint8_t* dst,
                                    int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    test eax, 15  // any low 4 bits set => unaligned.
    jne convertloopu
    test edx, 15
    jne convertloopu

 convertloopa:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopa
    ret

 convertloopu:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopu
    ret
  }
}
#endif  // HAS_COPYROW_SSE2
3318
3319 #ifdef HAS_COPYROW_AVX
3320 // CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
// Copy 'width' bytes, 64 per iteration; width must be a multiple of 64.
__declspec(naked) void CopyRow_AVX(const uint8_t* src,
                                   uint8_t* dst,
                                   int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 64
    jg         convertloop

    vzeroupper
    ret
  }
}
3343 #endif // HAS_COPYROW_AVX
3344
3345 // Multiple of 1.
// Copy 'width' bytes using rep movsb (Enhanced REP MOVSB hardware).
// Handles any width (multiple of 1). Caller-saved eax/edx are used to
// preserve esi/edi instead of push/pop, since this is a naked function.
__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
                                    uint8_t* dst,
                                    int width) {
  __asm {
    mov        eax, esi          // save esi in eax
    mov        edx, edi          // save edi in edx
    mov        esi, [esp + 4]    // src
    mov        edi, [esp + 8]    // dst
    mov        ecx, [esp + 12]   // width
    rep movsb
    mov        edi, edx          // restore edi
    mov        esi, eax          // restore esi
    ret
  }
}
3361
3362 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3363 // width in pixels
// Copy the alpha channel of src ARGB pixels into dst ARGB pixels,
// preserving dst's RGB. 8 pixels per iteration; width in pixels,
// must be a multiple of 8.
__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
                                             uint8_t* dst,
                                             int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
    pslld      xmm0, 24
    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
    psrld      xmm1, 8

  convertloop:
    movdqu     xmm2, [eax]       // src: keep only alpha
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    movdqu     xmm4, [edx]       // dst: keep only RGB
    movdqu     xmm5, [edx + 16]
    pand       xmm2, xmm0
    pand       xmm3, xmm0
    pand       xmm4, xmm1
    pand       xmm5, xmm1
    por        xmm2, xmm4        // combine src alpha with dst RGB
    por        xmm3, xmm5
    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
3397 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3398
3399 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3400 // width in pixels
// Copy the alpha channel of src ARGB pixels into dst ARGB pixels,
// preserving dst's RGB. 16 pixels per iteration; width in pixels,
// must be a multiple of 16.
__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
                                             uint8_t* dst,
                                             int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm0, ymm0, ymm0
    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff

  convertloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + 32]
    lea        eax, [eax + 64]
    // Blend dst's low 3 bytes (RGB, selected by mask) over src, keeping
    // src's alpha byte in each pixel.
    vpblendvb  ymm1, ymm1, [edx], ymm0
    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    vzeroupper
    ret
  }
}
3427 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3428
3429 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3430 // width in pixels
// Extract the alpha byte of each ARGB pixel into a planar alpha row.
// 8 pixels per iteration; width in pixels, must be a multiple of 8.
__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                                                uint8_t* dst_a,
                                                int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_a
    mov        ecx, [esp + 12]  // width

  extractloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm0, 24         // alpha down to low byte of each dword
    psrld      xmm1, 24
    packssdw   xmm0, xmm1       // 8 dwords -> 8 words (values <= 255, safe)
    packuswb   xmm0, xmm0       // 8 words -> 8 bytes
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         extractloop

    ret
  }
}
3455 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
3456
3457 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3458 // width in pixels
// Extract the alpha byte of each ARGB pixel into a planar alpha row.
// 32 pixels per iteration; width in pixels, must be a multiple of 32.
// Uses kPermdARGBToY_AVX (defined elsewhere in this file) to undo the
// lane mutation introduced by the cross-lane pack instructions.
__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                                                uint8_t* dst_a,
                                                int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_a
    mov        ecx, [esp + 12]  // width
    vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX

  extractloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpsrld     ymm0, ymm0, 24   // alpha down to low byte of each dword
    vpsrld     ymm1, ymm1, 24
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    lea        eax, [eax + 128]
    vpackssdw  ymm0, ymm0, ymm1  // mutates
    vpsrld     ymm2, ymm2, 24
    vpsrld     ymm3, ymm3, 24
    vpackssdw  ymm2, ymm2, ymm3  // mutates
    vpackuswb  ymm0, ymm0, ymm2  // mutates
    vpermd     ymm0, ymm4, ymm0  // unmutate
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         extractloop

    vzeroupper
    ret
  }
}
3491 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
3492
3493 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3494 // width in pixels
// Copy a row of Y bytes into the alpha channel of dst ARGB pixels,
// preserving dst's RGB. 8 pixels per iteration; width in pixels,
// must be a multiple of 8.
__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
                                                uint8_t* dst,
                                                int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
    pslld      xmm0, 24
    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
    psrld      xmm1, 8

  convertloop:
    movq       xmm2, qword ptr [eax]  // 8 Y's
    lea        eax, [eax + 8]
    punpcklbw  xmm2, xmm2       // duplicate: each word is (Y << 8) | Y
    // xmm3's prior contents are garbage, but punpckhwd puts an xmm2 word
    // (with Y in its high byte) in the high half of each dword, and the
    // 0xff000000 mask below discards everything else.
    punpckhwd  xmm3, xmm2
    punpcklwd  xmm2, xmm2
    movdqu     xmm4, [edx]
    movdqu     xmm5, [edx + 16]
    pand       xmm2, xmm0       // keep Y in the alpha byte position
    pand       xmm3, xmm0
    pand       xmm4, xmm1       // keep dst RGB
    pand       xmm5, xmm1
    por        xmm2, xmm4       // combine Y-as-alpha with dst RGB
    por        xmm3, xmm5
    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
3530 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3531
3532 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3533 // width in pixels
// Copy a row of Y bytes into the alpha channel of dst ARGB pixels,
// preserving dst's RGB. 16 pixels per iteration; width in pixels,
// must be a multiple of 16.
__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
                                                uint8_t* dst,
                                                int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm0, ymm0, ymm0
    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff

  convertloop:
    vpmovzxbd  ymm1, qword ptr [eax]      // 8 Y's -> 8 dwords
    vpmovzxbd  ymm2, qword ptr [eax + 8]
    lea        eax, [eax + 16]
    vpslld     ymm1, ymm1, 24             // move Y into the alpha byte
    vpslld     ymm2, ymm2, 24
    // Blend dst's RGB bytes (selected by mask) over the Y-as-alpha values.
    vpblendvb  ymm1, ymm1, [edx], ymm0
    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    vzeroupper
    ret
  }
}
3562 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3563
3564 #ifdef HAS_SETROW_X86
3565 // Write 'width' bytes using an 8 bit value repeated.
3566 // width should be multiple of 4.
// Write 'width' bytes using an 8 bit value repeated.
// width should be multiple of 4 (remainder bytes are not written).
// edx temporarily holds the caller's edi (naked function: no push/pop).
__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  __asm {
    movzx      eax, byte ptr [esp + 8]  // v8
    mov        edx, 0x01010101   // Duplicate byte to all bytes.
    mul        edx               // eax = v8 repeated in all 4 bytes;
                                 // overwrites edx with upper part of result.
    mov        edx, edi          // save edi (after mul clobbered edx)
    mov        edi, [esp + 4]    // dst
    mov        ecx, [esp + 12]   // width
    shr        ecx, 2            // byte count -> dword count
    rep stosd
    mov        edi, edx          // restore edi
    ret
  }
}
3581
3582 // Write 'width' bytes using an 8 bit value repeated.
// Write 'width' bytes using an 8 bit value repeated (rep stosb).
// Handles any width. edx temporarily holds the caller's edi.
__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  __asm {
    mov        edx, edi          // save edi
    mov        edi, [esp + 4]    // dst
    mov        eax, [esp + 8]    // v8
    mov        ecx, [esp + 12]   // width
    rep stosb
    mov        edi, edx          // restore edi
    ret
  }
}
3594
3595 // Write 'width' 32 bit values.
// Write 'width' 32 bit values (e.g. ARGB pixels) using rep stosd.
// edx temporarily holds the caller's edi.
__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
                                      uint32_t v32,
                                      int width) {
  __asm {
    mov        edx, edi          // save edi
    mov        edi, [esp + 4]    // dst
    mov        eax, [esp + 8]    // v32
    mov        ecx, [esp + 12]   // width (in pixels == dwords)
    rep stosd
    mov        edi, edx          // restore edi
    ret
  }
}
3609 #endif // HAS_SETROW_X86
3610
3611 #ifdef HAS_YUY2TOYROW_AVX2
// Extract Y from YUY2 (Y0 U Y1 V). 32 Y's per iteration;
// width must be a multiple of 32.
__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5   // even bytes are Y
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8   // unmutate lane order
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
3638
// Extract U and V planes from two rows of YUY2, averaging vertically.
// 32 source pixels -> 16 U + 16 V per iteration;
// width must be a multiple of 32.
__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                                        int stride_yuy2,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]       // average with row below
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8        // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1     // mutates.
    vpermq     ymm0, ymm0, 0xd8     // unmutate
    vpand      ymm1, ymm0, ymm5     // U (even bytes)
    vpsrlw     ymm0, ymm0, 8        // V (odd bytes)
    vpackuswb  ymm1, ymm1, ymm1     // mutates.
    vpackuswb  ymm0, ymm0, ymm0     // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0       // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
3684
// Extract U and V planes from a single row of YUY2 (no vertical averaging).
// 32 source pixels -> 16 U + 16 V per iteration;
// width must be a multiple of 32.
__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8        // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1     // mutates.
    vpermq     ymm0, ymm0, 0xd8     // unmutate
    vpand      ymm1, ymm0, ymm5     // U (even bytes)
    vpsrlw     ymm0, ymm0, 8        // V (odd bytes)
    vpackuswb  ymm1, ymm1, ymm1     // mutates.
    vpackuswb  ymm0, ymm0, ymm0     // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0       // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
3724
// Extract Y from UYVY (U Y0 V Y1) — Y is in the odd bytes.
// 32 Y's per iteration; width must be a multiple of 32.
__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // width

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8    // odd bytes are Y
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1 // mutates.
    vpermq     ymm0, ymm0, 0xd8 // unmutate lane order
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
3749
// Extract U and V planes from two rows of UYVY, averaging vertically.
// 32 source pixels -> 16 U + 16 V per iteration;
// width must be a multiple of 32.
__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                                        int stride_uyvy,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]       // average with row below
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5     // UYVY -> UVUV (even bytes)
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1     // mutates.
    vpermq     ymm0, ymm0, 0xd8     // unmutate
    vpand      ymm1, ymm0, ymm5     // U (even bytes)
    vpsrlw     ymm0, ymm0, 8        // V (odd bytes)
    vpackuswb  ymm1, ymm1, ymm1     // mutates.
    vpackuswb  ymm0, ymm0, ymm0     // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0       // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
3795
// Extract U and V planes from a single row of UYVY (no vertical averaging).
// 32 source pixels -> 16 U + 16 V per iteration;
// width must be a multiple of 32.
__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5     // UYVY -> UVUV (even bytes)
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1     // mutates.
    vpermq     ymm0, ymm0, 0xd8     // unmutate
    vpand      ymm1, ymm0, ymm5     // U (even bytes)
    vpsrlw     ymm0, ymm0, 8        // V (odd bytes)
    vpackuswb  ymm1, ymm1, ymm1     // mutates.
    vpackuswb  ymm0, ymm0, ymm0     // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0       // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
3835 #endif // HAS_YUY2TOYROW_AVX2
3836
3837 #ifdef HAS_YUY2TOYROW_SSE2
// Extract Y from YUY2 (Y0 U Y1 V). 16 Y's per iteration;
// width must be a multiple of 16.
__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // width
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5        // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
3862
// Extract U and V planes from two rows of YUY2, averaging vertically.
// 16 source pixels -> 8 U + 8 V per iteration;
// width must be a multiple of 16.
__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                                        int stride_yuy2,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]      // row below
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2             // average the two rows
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8        // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5     // U (even bytes)
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8        // V (odd bytes)
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0        // U
    movq       qword ptr [edx + edi], xmm1  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
3907
// Extract U and V planes from a single row of YUY2 (no vertical averaging).
// 16 source pixels -> 8 U + 8 V per iteration;
// width must be a multiple of 16.
__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8        // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5     // U (even bytes)
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8        // V (odd bytes)
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0        // U
    movq       qword ptr [edx + edi], xmm1  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
3944
// Extract Y from UYVY (U Y0 V Y1) — Y is in the odd bytes.
// 16 Y's per iteration; width must be a multiple of 16.
__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // width

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
3967
// Extract U and V planes from two rows of UYVY, averaging vertically.
// 16 source pixels -> 8 U + 8 V per iteration;
// width must be a multiple of 16.
__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                                        int stride_uyvy,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]      // row below
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2             // average the two rows
    pavgb      xmm1, xmm3
    pand       xmm0, xmm5     // UYVY -> UVUV (even bytes)
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5     // U (even bytes)
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8        // V (odd bytes)
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0        // U
    movq       qword ptr [edx + edi], xmm1  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
4012
// Extract U and V planes from a single row of UYVY (no vertical averaging).
// 16 source pixels -> 8 U + 8 V per iteration;
// width must be a multiple of 16.
__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5     // UYVY -> UVUV (even bytes)
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5     // U (even bytes)
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8        // V (odd bytes)
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0        // U
    movq       qword ptr [edx + edi], xmm1  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
4049 #endif // HAS_YUY2TOYROW_SSE2
4050
4051 #ifdef HAS_BLENDPLANEROW_SSSE3
4052 // Blend 8 pixels at a time.
4053 // unsigned version of math
4054 // =((A2*C2)+(B2*(255-C2))+255)/256
4055 // signed version of math
4056 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Alpha-blend two planar rows: dst = (src0 * a + src1 * (255 - a) + 255) / 256
// computed in biased signed form so pmaddubsw can do both multiplies at once.
// 8 pixels per iteration; width must be a multiple of 8.
__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
                                           const uint8_t* src1,
                                           const uint8_t* alpha,
                                           uint8_t* dst,
                                           int width) {
  __asm {
    push       esi
    push       edi
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0x00

    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
    movd       xmm7, eax
    pshufd     xmm7, xmm7, 0x00
    mov        eax, [esp + 8 + 4]    // src0
    mov        edx, [esp + 8 + 8]    // src1
    mov        esi, [esp + 8 + 12]   // alpha
    mov        edi, [esp + 8 + 16]   // dst
    mov        ecx, [esp + 8 + 20]   // width
    // Index everything off esi (alpha) so one lea advances all pointers.
    sub        eax, esi
    sub        edx, esi
    sub        edi, esi

    // 8 pixel loop.
  convertloop8:
    movq       xmm0, qword ptr [esi]  // alpha
    punpcklbw  xmm0, xmm0              // duplicate each alpha byte
    pxor       xmm0, xmm5              // a, 255-a (xor odd bytes with 0xff)
    movq       xmm1, qword ptr [eax + esi]  // src0
    movq       xmm2, qword ptr [edx + esi]  // src1
    punpcklbw  xmm1, xmm2              // interleave src0/src1 bytes
    psubb      xmm1, xmm6              // bias src0/1 - 128
    pmaddubsw  xmm0, xmm1              // a*src0 + (255-a)*src1, signed
    paddw      xmm0, xmm7              // unbias result - 32768 and round.
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edi + esi], xmm0
    lea        esi, [esi + 8]
    sub        ecx, 8
    jg         convertloop8

    pop        edi
    pop        esi
    ret
  }
}
4106 #endif // HAS_BLENDPLANEROW_SSSE3
4107
4108 #ifdef HAS_BLENDPLANEROW_AVX2
4109 // Blend 32 pixels at a time.
4110 // unsigned version of math
4111 // =((A2*C2)+(B2*(255-C2))+255)/256
4112 // signed version of math
4113 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Alpha-blend two planar rows: dst = (src0 * a + src1 * (255 - a) + 255) / 256
// computed in biased signed form so vpmaddubsw can do both multiplies at once.
// 32 pixels per iteration; width must be a multiple of 32.
__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
                                          const uint8_t* src1,
                                          const uint8_t* alpha,
                                          uint8_t* dst,
                                          int width) {
  __asm {
    push       esi
    push       edi
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff00ff00
    vpsllw     ymm5, ymm5, 8
    mov        eax, 0x80808080   // 128 for biasing image to signed.
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    mov        eax, 0x807f807f   // 32768 + 127 for unbias and round.
    vmovd      xmm7, eax
    vbroadcastss ymm7, xmm7
    mov        eax, [esp + 8 + 4]    // src0
    mov        edx, [esp + 8 + 8]    // src1
    mov        esi, [esp + 8 + 12]   // alpha
    mov        edi, [esp + 8 + 16]   // dst
    mov        ecx, [esp + 8 + 20]   // width
    // Index everything off esi (alpha) so one lea advances all pointers.
    sub        eax, esi
    sub        edx, esi
    sub        edi, esi

    // 32 pixel loop.
  convertloop32:
    vmovdqu    ymm0, [esi]              // alpha
    vpunpckhbw ymm3, ymm0, ymm0         // 8..15, 24..31
    vpunpcklbw ymm0, ymm0, ymm0         // 0..7, 16..23
    vpxor      ymm3, ymm3, ymm5         // a, 255-a
    vpxor      ymm0, ymm0, ymm5         // a, 255-a
    vmovdqu    ymm1, [eax + esi]        // src0
    vmovdqu    ymm2, [edx + esi]        // src1
    vpunpckhbw ymm4, ymm1, ymm2         // interleave src0/src1 bytes
    vpunpcklbw ymm1, ymm1, ymm2
    vpsubb     ymm4, ymm4, ymm6         // bias src0/1 - 128
    vpsubb     ymm1, ymm1, ymm6         // bias src0/1 - 128
    vpmaddubsw ymm3, ymm3, ymm4         // a*src0 + (255-a)*src1, signed
    vpmaddubsw ymm0, ymm0, ymm1
    vpaddw     ymm3, ymm3, ymm7         // unbias result - 32768 and round.
    vpaddw     ymm0, ymm0, ymm7         // unbias result - 32768 and round.
    vpsrlw     ymm3, ymm3, 8
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm3         // per-lane pack restores byte order
    vmovdqu    [edi + esi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         convertloop32

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
4170 #endif // HAS_BLENDPLANEROW_AVX2
4171
4172 #ifdef HAS_ARGBBLENDROW_SSSE3
// pshufb table replicating each pixel's alpha byte into the low byte of each
// word (0x80 selects a zero byte): yields words a,a / a,a per pixel pair.
static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4176
4177 // Blend 8 pixels at a time.
// Blend src_argb0 over src_argb1 using src0's alpha:
// dst = src0 + src1 * (256 - alpha) / 256, with dst alpha forced to 255.
// Main loop does 4 pixels; a trailing 1-pixel loop handles any remainder,
// so any width > 0 is accepted.
__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
                                          const uint8_t* src_argb1,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24
    sub        ecx, 4
    jl         convertloop4b    // less than 4 pixels?

    // 4 pixel loop.
  convertloop4:
    movdqu     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha (invert alpha byte only)
    movdqu     xmm2, [esi]      // _r_b
    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqu     xmm1, [esi]      // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        convertloop4

  convertloop4b:
    add        ecx, 4 - 1       // remainder pixel count - 1
    jl         convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd       xmm3, [eax]      // src argb
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movd       xmm1, [esi]      // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        convertloop1

  convertloop1b:
    pop        esi
    ret
  }
}
4258 #endif // HAS_ARGBBLENDROW_SSSE3
4259
4260 #ifdef HAS_ARGBATTENUATEROW_SSSE3
// pshufb tables replicating each pixel's alpha into 6 bytes (RGB words),
// zeroing the alpha word (0x80 = zero byte). kShuffleAlpha0 covers pixels
// 0-1 of a 4-pixel block; kShuffleAlpha1 covers pixels 2-3.
static const uvec8 kShuffleAlpha0 = {
    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Premultiply ARGB RGB channels by alpha: rgb = rgb * a / 255 (approximated
// via pmulhuw on duplicated bytes); alpha channel is copied unchanged.
// 4 pixels per iteration; width must be a multiple of 4.
__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
    pslld      xmm3, 24
    movdqa     xmm4, xmmword ptr kShuffleAlpha0
    movdqa     xmm5, xmmword ptr kShuffleAlpha1

  convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    pshufb     xmm0, xmm4       // isolate first 2 alphas
    movdqu     xmm1, [eax]      // read 4 pixels
    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs (bytes duplicated to words)
    pmulhuw    xmm0, xmm1       // rgb * a
    movdqu     xmm1, [eax]      // read 4 pixels
    pshufb     xmm1, xmm5       // isolate next 2 alphas
    movdqu     xmm2, [eax]      // read 4 pixels
    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
    pmulhuw    xmm1, xmm2       // rgb * a
    movdqu     xmm2, [eax]      // mask original alpha
    lea        eax, [eax + 16]
    pand       xmm2, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    por        xmm0, xmm2       // copy original alpha
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
4307 #endif // HAS_ARGBATTENUATEROW_SSSE3
4308
4309 #ifdef HAS_ARGBATTENUATEROW_AVX2
// vpshufb table replicating each pixel's alpha word into the 3 RGB word
// positions of already-unpacked (byte-doubled) pixels; 0x80 zeroes the
// alpha word. Applied per 128-bit lane after vpunpck{l,h}bw.
static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
                                         128u, 128u, 14u, 15u, 14u, 15u,
                                         14u, 15u, 128u, 128u};
// Premultiply ARGB RGB channels by alpha; alpha channel copied unchanged.
// 8 pixels per iteration; width must be a multiple of 8.
__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                                             uint8_t* dst_argb,
                                             int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax         // edx = dst - src; dst is [eax + edx]
    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld     ymm5, ymm5, 24

  convertloop:
    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
    vpand      ymm6, ymm6, ymm5  // isolate alpha
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vpor       ymm0, ymm0, ymm6  // copy original alpha
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
4348 #endif // HAS_ARGBATTENUATEROW_AVX2
4349
4350 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4351 // Unattenuate 4 pixels at a time.
// Un-premultiply ARGB: rgb = rgb * (255 / a) using the fixed_invtbl8
// reciprocal table (defined elsewhere in this file), 8.8 fixed point.
// 4 pixels per iteration; width must be a multiple of 4.
__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // src_argb
    mov        edx, [esp + 12 + 8]   // dst_argb
    mov        ecx, [esp + 12 + 12]  // width
    lea        ebx, fixed_invtbl8    // table of [1, 65536/a] per alpha

  convertloop:
    movdqu     xmm0, [eax]           // read 4 pixels
    movzx      esi, byte ptr [eax + 3]  // first alpha
    movzx      edi, byte ptr [eax + 7]  // second alpha
    punpcklbw  xmm0, xmm0            // first 2 pixels, bytes -> words
    movd       xmm2, dword ptr [ebx + esi * 4]
    movd       xmm3, dword ptr [ebx + edi * 4]
    pshuflw    xmm2, xmm2, 040h      // first 4 inv_alpha words. 1, a, a, a
    pshuflw    xmm3, xmm3, 040h      // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm0, xmm2            // rgb * inverse alpha

    movdqu     xmm1, [eax]           // read 4 pixels
    movzx      esi, byte ptr [eax + 11]  // third alpha
    movzx      edi, byte ptr [eax + 15]  // forth alpha
    punpckhbw  xmm1, xmm1            // next 2 pixels
    movd       xmm2, dword ptr [ebx + esi * 4]
    movd       xmm3, dword ptr [ebx + edi * 4]
    pshuflw    xmm2, xmm2, 040h      // first 4 inv_alpha words
    pshuflw    xmm3, xmm3, 040h      // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2            // rgb * inverse alpha
    lea        eax, [eax + 16]
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
4399 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4400
4401 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
// vpshufb table turning [1,1,a,a] word pairs into [1,a,a,a] per pixel
// (keeps the '1' multiplier for the alpha channel itself).
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4405 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4406 // USE_GATHER is not on by default, due to being a slow instruction.
4407 #ifdef USE_GATHER
// Un-premultiply ARGB using vpgatherdd to look up fixed_invtbl8 reciprocals.
// 8 pixels per iteration; width must be a multiple of 8.
__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax         // edx = dst - src; dst is [eax + edx]
    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
    vmovdqu    ymm6, [eax]       // read 8 pixels.
    // Gather mask must be reset every iteration: vpgatherdd clears it.
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared. 1, a
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
4441 #else // USE_GATHER
// Un-premultiply ARGB without vpgatherdd: the 8 fixed_invtbl8 lookups are
// done with scalar loads and assembled into one ymm register.
// 8 pixels per iteration; width must be a multiple of 8.
__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {

    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // src_argb
    mov        edx, [esp + 12 + 8]   // dst_argb
    mov        ecx, [esp + 12 + 12]  // width
    sub        edx, eax              // edx = dst - src; dst is [eax + edx]
    lea        ebx, fixed_invtbl8    // table of [1, 65536/a] per alpha
    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
    // replace VPGATHER: 8 scalar table lookups assembled into ymm3.
    movzx      esi, byte ptr [eax + 3]   // alpha0
    movzx      edi, byte ptr [eax + 7]   // alpha1
    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
    movzx      esi, byte ptr [eax + 11]  // alpha2
    movzx      edi, byte ptr [eax + 15]  // alpha3
    vpunpckldq xmm6, xmm0, xmm1          // [1,a1,1,a0]
    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
    movzx      esi, byte ptr [eax + 19]  // alpha4
    movzx      edi, byte ptr [eax + 23]  // alpha5
    vpunpckldq xmm7, xmm2, xmm3          // [1,a3,1,a2]
    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
    movzx      esi, byte ptr [eax + 27]  // alpha6
    movzx      edi, byte ptr [eax + 31]  // alpha7
    vpunpckldq xmm0, xmm0, xmm1          // [1,a5,1,a4]
    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3          // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7         // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2         // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1      // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    vzeroupper
    ret
  }
}
4506 #endif // USE_GATHER
4507 #endif // HAS_ARGBATTENUATEROW_AVX2
4508
4509 #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width) {
  // Replaces B, G and R of each pixel with the kARGBToYJ-weighted luma
  // (rounded via kAddYJ64, >> 7); the original alpha byte is preserved.
  // width must be a multiple of 8.
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToYJ
    movdqa xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu xmm0, [eax]  // G
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm0, xmm1  // horizontal add pairs -> one 15-bit sum per pixel
    paddw xmm0, xmm5  // Add .5 for rounding.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // 8 G bytes
    movdqu xmm2, [eax]  // A
    movdqu xmm3, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm2, 24  // isolate alpha (top byte of each dword)
    psrld xmm3, 24
    packuswb xmm2, xmm3
    packuswb xmm2, xmm2  // 8 A bytes
    movdqa xmm3, xmm0  // Weave into GG, GA, then GGGA
    punpcklbw xmm0, xmm0  // 8 GG words
    punpcklbw xmm3, xmm2  // 8 GA words
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm3  // GGGA first 4
    punpckhwd xmm1, xmm3  // GGGA next 4
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
4551 #endif // HAS_ARGBGRAYROW_SSSE3
4552
4553 #ifdef HAS_ARGBSEPIAROW_SSSE3
4554 // b = (r * 35 + g * 68 + b * 17) >> 7
4555 // g = (r * 45 + g * 88 + b * 22) >> 7
4556 // r = (r * 50 + g * 98 + b * 24) >> 7
4557 // Constant for ARGB color to sepia tone.
// Each 4-byte group is the {B, G, R, A} weight set for one pixel, matching
// little-endian ARGB memory order so the tables can be fed directly to
// pmaddubsw.  The A weight is 0 because alpha is computed separately.
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};
4566
4567 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  // In-place sepia: each output channel is a weighted sum of the pixel's
  // B/G/R (tables above), >> 7; alpha is passed through unchanged.
  // width must be a multiple of 8.
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    mov ecx, [esp + 8] /* width */
    movdqa xmm2, xmmword ptr kARGBToSepiaB
    movdqa xmm3, xmmword ptr kARGBToSepiaG
    movdqa xmm4, xmmword ptr kARGBToSepiaR

 convertloop:
    movdqu xmm0, [eax]  // B
    movdqu xmm6, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm6, xmm2
    phaddw xmm0, xmm6  // one 16-bit sum per pixel
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // 8 B values
    movdqu xmm5, [eax]  // G
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm5, xmm3
    pmaddubsw xmm1, xmm3
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5  // 8 G values
    punpcklbw xmm0, xmm5  // 8 BG values
    movdqu xmm5, [eax]  // R
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm5, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5  // 8 R values
    movdqu xmm6, [eax]  // A
    movdqu xmm1, [eax + 16]
    psrld xmm6, 24  // keep original alpha
    psrld xmm1, 24
    packuswb xmm6, xmm1
    packuswb xmm6, xmm6  // 8 A values
    punpcklbw xmm5, xmm6  // 8 RA values
    movdqa xmm1, xmm0  // Weave BG, RA together
    punpcklwd xmm0, xmm5  // BGRA first 4
    punpckhwd xmm1, xmm5  // BGRA next 4
    movdqu [eax], xmm0
    movdqu [eax + 16], xmm1
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
4617 #endif // HAS_ARGBSEPIAROW_SSSE3
4618
4619 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
4621 // Same as Sepia except matrix is provided.
4622 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4623 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                                                uint8_t* dst_argb,
                                                const int8_t* matrix_argb,
                                                int width) {
  // Applies a caller-supplied 4x4 signed 6.2 fixed point matrix (one 4-byte
  // row per output channel, broadcast to all 4 pixel lanes below) to 8 ARGB
  // pixels per iteration.  Results are >> 6 and saturated to 0..255.
  // width must be a multiple of 8.
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* matrix_argb */
    movdqu xmm5, [ecx]
    pshufd xmm2, xmm5, 0x00  // broadcast row 0 (B coefficients)
    pshufd xmm3, xmm5, 0x55  // broadcast row 1 (G coefficients)
    pshufd xmm4, xmm5, 0xaa  // broadcast row 2 (R coefficients)
    pshufd xmm5, xmm5, 0xff  // broadcast row 3 (A coefficients)
    mov ecx, [esp + 16] /* width */

 convertloop:
    movdqu xmm0, [eax]  // B
    movdqu xmm7, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm7, xmm2
    movdqu xmm6, [eax]  // G
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm6, xmm3
    pmaddubsw xmm1, xmm3
    phaddsw xmm0, xmm7  // B
    phaddsw xmm6, xmm1  // G
    psraw xmm0, 6  // B
    psraw xmm6, 6  // G
    packuswb xmm0, xmm0  // 8 B values
    packuswb xmm6, xmm6  // 8 G values
    punpcklbw xmm0, xmm6  // 8 BG values
    movdqu xmm1, [eax]  // R
    movdqu xmm7, [eax + 16]
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm7, xmm4
    phaddsw xmm1, xmm7  // R
    movdqu xmm6, [eax]  // A
    movdqu xmm7, [eax + 16]
    pmaddubsw xmm6, xmm5
    pmaddubsw xmm7, xmm5
    phaddsw xmm6, xmm7  // A
    psraw xmm1, 6  // R
    psraw xmm6, 6  // A
    packuswb xmm1, xmm1  // 8 R values
    packuswb xmm6, xmm6  // 8 A values
    punpcklbw xmm1, xmm6  // 8 RA values
    movdqa xmm6, xmm0  // Weave BG, RA together
    punpcklwd xmm0, xmm1  // BGRA first 4
    punpckhwd xmm6, xmm1  // BGRA next 4
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm6
    lea eax, [eax + 32]
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
4682 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
4683
4684 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4685 // Quantize 4 ARGB pixels (16 bytes).
__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                                            int scale,
                                            int interval_size,
                                            int interval_offset,
                                            int width) {
  // In-place posterize: channel = (channel * scale >> 16) * interval_size +
  // interval_offset.  Alpha is masked out of the computation and restored
  // from the source.  width must be a multiple of 4.
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    movd xmm2, [esp + 8] /* scale */
    movd xmm3, [esp + 12] /* interval_size */
    movd xmm4, [esp + 16] /* interval_offset */
    mov ecx, [esp + 20] /* width */
        // Broadcast each parameter's low word to the B/G/R word lanes of all
        // pixels; the alpha lane gets the (normally zero) high word, so the
        // alpha result is garbage and is replaced via the mask below.
    pshuflw xmm2, xmm2, 040h
    pshufd xmm2, xmm2, 044h
    pshuflw xmm3, xmm3, 040h
    pshufd xmm3, xmm3, 044h
    pshuflw xmm4, xmm4, 040h
    pshufd xmm4, xmm4, 044h
    pxor xmm5, xmm5  // constant 0
    pcmpeqb xmm6, xmm6  // generate mask 0xff000000
    pslld xmm6, 24

 convertloop:
    movdqu xmm0, [eax]  // read 4 pixels
    punpcklbw xmm0, xmm5  // first 2 pixels
    pmulhuw xmm0, xmm2  // pixel * scale >> 16
    movdqu xmm1, [eax]  // read 4 pixels
    punpckhbw xmm1, xmm5  // next 2 pixels
    pmulhuw xmm1, xmm2
    pmullw xmm0, xmm3  // * interval_size
    movdqu xmm7, [eax]  // read 4 pixels
    pmullw xmm1, xmm3
    pand xmm7, xmm6  // mask alpha
    paddw xmm0, xmm4  // + interval_offset
    paddw xmm1, xmm4
    packuswb xmm0, xmm1
    por xmm0, xmm7  // restore original alpha
    movdqu [eax], xmm0
    lea eax, [eax + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
4729 #endif // HAS_ARGBQUANTIZEROW_SSE2
4730
4731 #ifdef HAS_ARGBSHADEROW_SSE2
4732 // Shade 4 pixels at a time by specified value.
__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         uint32_t value) {
  // Multiplies each channel by the matching byte of 'value' (an ARGB dword):
  // out = src * value_channel / 256, for 4 pixels per iteration.
  // width must be a multiple of 4.
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    movd xmm2, [esp + 16]  // value
    punpcklbw xmm2, xmm2  // expand each value byte to a word (v*257 form)
    punpcklqdq xmm2, xmm2  // duplicate for 2 pixels per register

 convertloop:
    movdqu xmm0, [eax]  // read 4 pixels
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0  // first 2
    punpckhbw xmm1, xmm1  // next 2
    pmulhuw xmm0, xmm2  // argb * value
    pmulhuw xmm1, xmm2  // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    ret
  }
}
4764 #endif // HAS_ARGBSHADEROW_SSE2
4765
4766 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4767 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  // Per-channel multiply of two ARGB rows: dst = src0 * src1 / 255 (approx:
  // src0 expanded to a*257 words, pmulhuw by src1).  4 pixels per iteration;
  // width must be a multiple of 4.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width
    pxor xmm5, xmm5  // constant 0

 convertloop:
    movdqu xmm0, [eax]  // read 4 pixels from src_argb0
    movdqu xmm2, [esi]  // read 4 pixels from src_argb1
    movdqu xmm1, xmm0  // register copy
    movdqu xmm3, xmm2
    punpcklbw xmm0, xmm0  // first 2
    punpckhbw xmm1, xmm1  // next 2
    punpcklbw xmm2, xmm5  // first 2 (zero-extended)
    punpckhbw xmm3, xmm5  // next 2
    pmulhuw xmm0, xmm2  // src_argb0 * src_argb1 first 2
    pmulhuw xmm1, xmm3  // src_argb0 * src_argb1 next 2
    lea eax, [eax + 16]
    lea esi, [esi + 16]
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    pop esi
    ret
  }
}
4803 #endif // HAS_ARGBMULTIPLYROW_SSE2
4804
4805 #ifdef HAS_ARGBADDROW_SSE2
4806 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4807 // TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  // Saturating per-byte add of two ARGB rows.  Main loop does 4 pixels;
  // a 1-pixel tail loop handles any remainder, so width may be any count.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width

    sub ecx, 4
    jl convertloop49  // fewer than 4 pixels: go straight to tail

 convertloop4:
    movdqu xmm0, [eax]  // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi]  // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    paddusb xmm0, xmm1  // src_argb0 + src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jge convertloop4

 convertloop49:
    add ecx, 4 - 1  // undo the bias; remaining count - 1 for jge loop
    jl convertloop19

 convertloop1:
    movd xmm0, [eax]  // read 1 pixel from src_argb0
    lea eax, [eax + 4]
    movd xmm1, [esi]  // read 1 pixel from src_argb1
    lea esi, [esi + 4]
    paddusb xmm0, xmm1  // src_argb0 + src_argb1
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge convertloop1

 convertloop19:
    pop esi
    ret
  }
}
4853 #endif // HAS_ARGBADDROW_SSE2
4854
4855 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4856 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  // Saturating per-byte subtract: dst = max(src0 - src1, 0), 4 pixels per
  // iteration.  width must be a multiple of 4.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width

 convertloop:
    movdqu xmm0, [eax]  // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi]  // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    psubusb xmm0, xmm1  // src_argb0 - src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    pop esi
    ret
  }
}
4883 #endif // HAS_ARGBSUBTRACTROW_SSE2
4884
4885 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4886 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  // AVX2 version of ARGBMultiplyRow: dst = src0 * src1 / 255 (approx via
  // byte-doubled words and pmulhuw), 8 pixels per iteration.
  // width must be a multiple of 8.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width
    vpxor ymm5, ymm5, ymm5  // constant 0

 convertloop:
    vmovdqu ymm1, [eax]  // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vmovdqu ymm3, [esi]  // read 8 pixels from src_argb1
    lea esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1  // low 4
    vpunpckhbw ymm1, ymm1, ymm1  // high 4
    vpunpcklbw ymm2, ymm3, ymm5  // low 4 (zero-extended)
    vpunpckhbw ymm3, ymm3, ymm5  // high 4
    vpmulhuw ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
    vpmulhuw ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
    vpackuswb ymm0, ymm0, ymm1
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
4921 #endif // HAS_ARGBMULTIPLYROW_AVX2
4922
4923 #ifdef HAS_ARGBADDROW_AVX2
4924 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  // Saturating per-byte add of two ARGB rows, 8 pixels per iteration.
  // width must be a multiple of 8 (no scalar tail, unlike the SSE2 version).
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu ymm0, [eax]  // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpaddusb ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
4951 #endif // HAS_ARGBADDROW_AVX2
4952
4953 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4954 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  // Saturating per-byte subtract: dst = max(src0 - src1, 0), 8 pixels per
  // iteration.  width must be a multiple of 8.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu ymm0, [eax]  // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpsubusb ymm0, ymm0, [esi]  // src_argb0 - src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
4981 #endif // HAS_ARGBSUBTRACTROW_AVX2
4982
4983 #ifdef HAS_SOBELXROW_SSE2
4984 // SobelX as a matrix is
4985 // -1 0 1
4986 // -2 0 2
4987 // -1 0 1
__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      const uint8_t* src_y2,
                                      uint8_t* dst_sobelx,
                                      int width) {
  // Horizontal Sobel over 3 source rows, 8 output pixels per iteration.
  // Per pixel: |(y0[0]-y0[2]) + 2*(y1[0]-y1[2]) + (y2[0]-y2[2])|, saturated
  // to 0..255.  width must be a multiple of 8.
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_y0
    mov esi, [esp + 8 + 8]  // src_y1
    mov edi, [esp + 8 + 12]  // src_y2
    mov edx, [esp + 8 + 16]  // dst_sobelx
    mov ecx, [esp + 8 + 20]  // width
        // Keep only eax advancing: address the other rows and dst relative
        // to it.
    sub esi, eax
    sub edi, eax
    sub edx, eax
    pxor xmm5, xmm5  // constant 0

 convertloop:
    movq xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    movq xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
    movq xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1  // middle row added twice = weight 2
    paddw xmm0, xmm1
    pxor xmm1, xmm1  // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
5039 #endif // HAS_SOBELXROW_SSE2
5040
5041 #ifdef HAS_SOBELYROW_SSE2
5042 // SobelY as a matrix is
5043 // -1 -2 -1
5044 // 0 0 0
5045 // 1 2 1
__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      uint8_t* dst_sobely,
                                      int width) {
  // Vertical Sobel over 2 source rows, 8 output pixels per iteration.
  // Per pixel: |(y0[0]-y1[0]) + 2*(y0[1]-y1[1]) + (y0[2]-y1[2])|, saturated
  // to 0..255.  width must be a multiple of 8.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_y0
    mov esi, [esp + 4 + 8]  // src_y1
    mov edx, [esp + 4 + 12]  // dst_sobely
    mov ecx, [esp + 4 + 16]  // width
    sub esi, eax  // src_y1 and dst addressed relative to eax
    sub edx, eax
    pxor xmm5, xmm5  // constant 0

 convertloop:
    movq xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
    movq xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    movq xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1  // center column added twice = weight 2
    paddw xmm0, xmm1
    pxor xmm1, xmm1  // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}
5092 #endif // HAS_SOBELYROW_SSE2
5093
5094 #ifdef HAS_SOBELROW_SSE2
5095 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5096 // A = 255
5097 // R = Sobel
5098 // G = Sobel
5099 // B = Sobel
__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
                                     const uint8_t* src_sobely,
                                     uint8_t* dst_argb,
                                     int width) {
  // Combines sobelx + sobely (saturating) into a gray ARGB row:
  // B = G = R = sobel, A = 255.  16 pixels per iteration; width must be a
  // multiple of 16.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_sobelx
    mov esi, [esp + 4 + 8]  // src_sobely
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width
    sub esi, eax  // src_sobely addressed as [eax + esi]
    pcmpeqb xmm5, xmm5  // alpha 255
    pslld xmm5, 24  // 0xff000000

 convertloop:
    movdqu xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1  // sobel = sobelx + sobely
    movdqa xmm2, xmm0  // GG
    punpcklbw xmm2, xmm0  // First 8
    punpckhbw xmm0, xmm0  // Next 8
    movdqa xmm1, xmm2  // GGGG
    punpcklwd xmm1, xmm2  // First 4
    punpckhwd xmm2, xmm2  // Next 4
    por xmm1, xmm5  // GGGA: force alpha to 255
    por xmm2, xmm5
    movdqa xmm3, xmm0  // GGGG
    punpcklwd xmm3, xmm0  // Next 4
    punpckhwd xmm0, xmm0  // Last 4
    por xmm3, xmm5  // GGGA
    por xmm0, xmm5
    movdqu [edx], xmm1
    movdqu [edx + 16], xmm2
    movdqu [edx + 32], xmm3
    movdqu [edx + 48], xmm0
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    pop esi
    ret
  }
}
5144 #endif // HAS_SOBELROW_SSE2
5145
5146 #ifdef HAS_SOBELTOPLANEROW_SSE2
5147 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                                            const uint8_t* src_sobely,
                                            uint8_t* dst_y,
                                            int width) {
  // Combines sobelx + sobely (saturating add) into a single-byte-per-pixel
  // plane, 16 pixels per iteration.  width must be a multiple of 16.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_sobelx
    mov esi, [esp + 4 + 8]  // src_sobely
    mov edx, [esp + 4 + 12]  // dst_y
    mov ecx, [esp + 4 + 16]  // width
    sub esi, eax  // src_sobely addressed as [eax + esi]

 convertloop:
    movdqu xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1  // sobel = sobelx + sobely
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop esi
    ret
  }
}
5174 #endif // HAS_SOBELTOPLANEROW_SSE2
5175
5176 #ifdef HAS_SOBELXYROW_SSE2
5177 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5178 // A = 255
5179 // R = Sobel X
5180 // G = Sobel
5181 // B = Sobel Y
__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                                       const uint8_t* src_sobely,
                                       uint8_t* dst_argb,
                                       int width) {
  // Packs the edge components into ARGB: B = sobely, G = sobelx + sobely,
  // R = sobelx, A = 255.  16 pixels per iteration; width must be a multiple
  // of 16.
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_sobelx
    mov esi, [esp + 4 + 8]  // src_sobely
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width
    sub esi, eax  // src_sobely addressed as [eax + esi]
    pcmpeqb xmm5, xmm5  // alpha 255

 convertloop:
    movdqu xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    paddusb xmm2, xmm1  // sobel = sobelx + sobely
    movdqa xmm3, xmm0  // XA: interleave sobelx with alpha
    punpcklbw xmm3, xmm5
    punpckhbw xmm0, xmm5
    movdqa xmm4, xmm1  // YS: interleave sobely with sobel
    punpcklbw xmm4, xmm2
    punpckhbw xmm1, xmm2
    movdqa xmm6, xmm4  // YSXA
    punpcklwd xmm6, xmm3  // First 4
    punpckhwd xmm4, xmm3  // Next 4
    movdqa xmm7, xmm1  // YSXA
    punpcklwd xmm7, xmm0  // Next 4
    punpckhwd xmm1, xmm0  // Last 4
    movdqu [edx], xmm6
    movdqu [edx + 16], xmm4
    movdqu [edx + 32], xmm7
    movdqu [edx + 48], xmm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    pop esi
    ret
  }
}
5225 #endif // HAS_SOBELXYROW_SSE2
5226
5227 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5228 // Consider float CumulativeSum.
5229 // Consider calling CumulativeSum one row at time as needed.
5230 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5231 // Convert cumulative sum for an area to an average for 1 pixel.
5232 // topleft is pointer to top left of CumulativeSum buffer for area.
5233 // botleft is pointer to bottom left of CumulativeSum buffer.
5234 // width is offset from left to right of area in CumulativeSum buffer measured
5235 // in number of ints.
5236 // area is the number of pixels in the area being averaged.
5237 // dst points to pixel to store result to.
5238 // count is number of averaged pixels to produce.
5239 // Does 4 pixels at a time.
5240 // This function requires alignment on accumulation buffer pointers.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  // Integral-image box filter: per pixel, sum = TL - TR - BL + BR over the
  // four corner entries of the cumulative-sum buffer, then average = sum /
  // area.  Small areas (<= 128 px) use a 0.16 fixed-point reciprocal with
  // pmulhuw; larger areas use float multiply by 1/area.
  __asm {
    mov eax, topleft  // eax topleft
    mov esi, botleft  // esi botleft
    mov edx, width
    movd xmm5, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm5, xmm5
    rcpss xmm4, xmm5  // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b

    cmp area, 128  // 128 pixels will not overflow 15 bits.
    ja l4

        // Small-area path: precompute a rounded 0.16 fixed-point 1/area.
    pshufd xmm5, xmm5, 0  // area
    pcmpeqb xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
    psrld xmm6, 16
    cvtdq2ps xmm6, xmm6
    addps xmm5, xmm6  // (65536.0 + area - 1)
    mulps xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
    cvtps2dq xmm5, xmm5  // 0.16 fixed point
    packssdw xmm5, xmm5  // 16 bit shorts

        // 4 pixel loop small blocks.
  s4:
        // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

        // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

        // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

        // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    packssdw xmm0, xmm1  // pack 4 pixels into 2 registers
    packssdw xmm2, xmm3

    pmulhuw xmm0, xmm5  // sum * (1/area) in 0.16 fixed point
    pmulhuw xmm2, xmm5

    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge s4

    jmp l4b

        // 4 pixel loop (large areas, float math)
  l4:
        // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

        // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

        // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

        // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    cvtdq2ps xmm0, xmm0  // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1  // undo bias; remaining count - 1 for jge loop
    jl l1b

        // 1 pixel loop (tail, float math)
  l1:
    movdqu xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
  l1b:
  }
}
5386 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5387
5388 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5389 // Creates a table of cumulative sums where each value is a sum of all values
5390 // above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  // Builds one row of the integral image: cumsum[x] = previous_cumsum[x] +
  // running sum of row[0..x], per ARGB channel (4 int32 per pixel).
  // xmm0 carries the running per-channel sum across the whole row.
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    pxor xmm0, xmm0  // running sum
    pxor xmm1, xmm1  // constant 0 for zero-extension

    sub ecx, 4
    jl l4b
    test edx, 15  // SIMD path requires 16-byte aligned cumsum
    jne l4b

        // 4 pixel loop
  l4:
    movdqu xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2

        // Zero-extend the 4 pixels to 4x4 int32 in xmm2, xmm3, xmm4, xmm5.
    punpcklbw xmm2, xmm1
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1
    punpckhwd xmm3, xmm1

    punpckhbw xmm4, xmm1
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1
    punpckhwd xmm5, xmm1

        // Accumulate each pixel into the running sum, then add the row
        // above (previous_cumsum) to form the output entries.
    paddd xmm0, xmm2
    movdqu xmm2, [esi]  // previous row above.
    paddd xmm2, xmm0

    paddd xmm0, xmm3
    movdqu xmm3, [esi + 16]
    paddd xmm3, xmm0

    paddd xmm0, xmm4
    movdqu xmm4, [esi + 32]
    paddd xmm4, xmm0

    paddd xmm0, xmm5
    movdqu xmm5, [esi + 48]
    lea esi, [esi + 64]
    paddd xmm5, xmm0

    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    movdqu [edx + 32], xmm4
    movdqu [edx + 48], xmm5

    lea edx, [edx + 64]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1  // undo bias; remaining count - 1 for jge loop
    jl l1b

        // 1 pixel loop
  l1:
    movd xmm2, dword ptr [eax]  // 1 argb pixel
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi]
    lea esi, [esi + 16]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1

  l1b:
  }
}
5472 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
5473
5474 #ifdef HAS_ARGBAFFINEROW_SSE2
5475 // Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                                                     int src_argb_stride,
                                                     uint8_t* dst_argb,
                                                     const float* uv_dudv,
                                                     int width) {
  // Samples ARGB pixels along the affine line (u,v) += (du,dv) per output
  // pixel (nearest neighbor).  uv_dudv points at 4 floats: u, v, du, dv.
  // Source byte offset is computed as x * 4 + y * stride via pmaddwd with
  // the packed shorts (4, stride) in xmm5.
  __asm {
    push esi
    push edi
    mov eax, [esp + 12]  // src_argb (+8 for 2 pushes, +4 return address)
    mov esi, [esp + 16]  // stride
    mov edx, [esp + 20]  // dst_argb
    mov ecx, [esp + 24]  // pointer to uv_dudv
    movq xmm2, qword ptr [ecx]  // uv
    movq xmm7, qword ptr [ecx + 8]  // dudv
    mov ecx, [esp + 28]  // width
    shl esi, 16  // 4, stride
    add esi, 4
    movd xmm5, esi  // low word 4, high word stride (for pmaddwd)
    sub ecx, 4
    jl l4b

        // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44  // dup dudv
    pshufd xmm5, xmm5, 0  // dup 4, stride
    movdqa xmm0, xmm2  // x0, y0, x1, y1
    addps xmm0, xmm7
    movlhps xmm2, xmm0
    movdqa xmm4, xmm7
    addps xmm4, xmm4  // dudv *= 2
    movdqa xmm3, xmm2  // x2, y2, x3, y3
    addps xmm3, xmm4
    addps xmm4, xmm4  // dudv *= 4

        // 4 pixel loop
  l4:
    cvttps2dq xmm0, xmm2  // x, y float to int first 2
    cvttps2dq xmm1, xmm3  // x, y float to int next 2
    packssdw xmm0, xmm1  // x, y as 8 shorts
    pmaddwd xmm0, xmm5  // offsets = x * 4 + y * stride.
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39  // shift right
    movd edi, xmm0
    pshufd xmm0, xmm0, 0x39  // shift right
    movd xmm1, [eax + esi]  // read pixel 0
    movd xmm6, [eax + edi]  // read pixel 1
    punpckldq xmm1, xmm6  // combine pixel 0 and 1
    addps xmm2, xmm4  // x, y += dx, dy first 2
    movq qword ptr [edx], xmm1
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39  // shift right
    movd edi, xmm0
    movd xmm6, [eax + esi]  // read pixel 2
    movd xmm0, [eax + edi]  // read pixel 3
    punpckldq xmm6, xmm0  // combine pixel 2 and 3
    addps xmm3, xmm4  // x, y += dx, dy next 2
    movq qword ptr 8[edx], xmm6
    lea edx, [edx + 16]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1  // undo bias; remaining count - 1 for jge loop
    jl l1b

        // 1 pixel loop
  l1:
    cvttps2dq xmm0, xmm2  // x, y float to int
    packssdw xmm0, xmm0  // x, y as shorts
    pmaddwd xmm0, xmm5  // offset = x * 4 + y * stride
    addps xmm2, xmm7  // x, y += dx, dy
    movd esi, xmm0
    movd xmm0, [eax + esi]  // copy a pixel
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge l1
  l1b:
    pop edi
    pop esi
    ret
  }
}
5558 #endif // HAS_ARGBAFFINEROW_SSE2
5559
5560 #ifdef HAS_INTERPOLATEROW_AVX2
5561 // Bilinear filter 32x2 -> 32x1
__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
                                           const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           int dst_width,
                                           int source_y_fraction) {
  // Blends two rows: dst = src * (256 - f) / 256 + src[stride] * f / 256,
  // 32 bytes per iteration, with fast paths for f == 0 (memcpy via rep
  // movsb) and f == 128 (pavgb).  dst_width must be a multiple of 32 except
  // on the f == 0 path.
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_ptr
    mov esi, [esp + 8 + 8]  // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
        // Dispatch to specialized filters if applicable.  Note the copy path
        // branches before the sub below, so edi is still an absolute pointer
        // there (needed by rep movsb).
    cmp eax, 0
    je xloop100  // 0 / 256. Blend 100 / 0.
    sub edi, esi  // dst addressed as [esi + edi] in the blend loops
    cmp eax, 128
    je xloop50  // 128 /256 is 0.50. Blend 50 / 50.

    vmovd xmm0, eax  // high fraction 0..255
    neg eax
    add eax, 256
    vmovd xmm5, eax  // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5

    mov eax, 0x80808080  // 128b for bias and rounding.
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4

 xloop:
    vmovdqu ymm0, [esi]
    vmovdqu ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2  // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb ymm1, ymm1, ymm4  // bias to signed image
    vpsubb ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw ymm1, ymm1, ymm4  // unbias and round
    vpaddw ymm0, ymm0, ymm4
    vpsrlw ymm1, ymm1, 8
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm1  // unmutates
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop
    jmp xloop99

        // Blend 50 / 50.
 xloop50:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop50
    jmp xloop99

        // Blend 100 / 0 - Copy row unchanged.
 xloop100:
    rep movsb  // copies ecx bytes from [esi] to [edi]

 xloop99:
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
5635 #endif // HAS_INTERPOLATEROW_AVX2
5636
// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
// SSSE3 version of InterpolateRow: dst = src blended with src[src_stride]
// by source_y_fraction (0..255).  f == 0 and f == 128 take specialized
// copy / average loops.  Processes 16 bytes per iteration.
__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                                            const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            int dst_width,
                                            int source_y_fraction) {
  __asm {
    push esi
    push edi

    mov edi, [esp + 8 + 4]  // dst_ptr
    mov esi, [esp + 8 + 8]  // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub edi, esi  // edi = dst - src; all paths store via [esi + edi].
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100  // 0 /256. Blend 100 / 0.
    cmp eax, 128
    je xloop50  // 128 / 256 is 0.50. Blend 50 / 50.

    movd xmm0, eax  // high fraction 0..255
    neg eax
    add eax, 256
    movd xmm5, eax  // low fraction 256..1
    punpcklbw xmm5, xmm0  // interleave low/high fraction bytes
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0  // broadcast weight pair to all lanes
    mov eax, 0x80808080  // 128 for biasing image to signed.
    movd xmm4, eax
    pshufd xmm4, xmm4, 0x00

  xloop:
    movdqu xmm0, [esi]  // 16 bytes of row 0
    movdqu xmm2, [esi + edx]  // 16 bytes of row 1
    movdqu xmm1, xmm0
    punpcklbw xmm0, xmm2  // interleave rows for pmaddubsw
    punpckhbw xmm1, xmm2
    psubb xmm0, xmm4  // bias image by -128
    psubb xmm1, xmm4
    movdqa xmm2, xmm5
    movdqa xmm3, xmm5
    pmaddubsw xmm2, xmm0  // unsigned weights * signed pixels
    pmaddubsw xmm3, xmm1
    paddw xmm2, xmm4  // unbias and round
    paddw xmm3, xmm4
    psrlw xmm2, 8  // /256 back to 8 bit
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [esi + edi], xmm2
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop
    jmp xloop99

    // Blend 50 / 50.
  xloop50:
    movdqu xmm0, [esi]
    movdqu xmm1, [esi + edx]
    pavgb xmm0, xmm1
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop50
    jmp xloop99

    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu xmm0, [esi]
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop100

  xloop99:
    pop edi
    pop esi
    ret
  }
}
5719
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each 16-byte group of pixels according to the
// 16-byte pshufb control mask in 'shuffler'.  Processes 8 ARGB pixels
// (32 bytes) per iteration.
__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_argb,
                                            const uint8_t* shuffler,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // shuffler
    movdqu xmm5, [ecx]  // load shuffle control mask once
    mov ecx, [esp + 16]  // width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pshufb xmm0, xmm5
    pshufb xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop
    ret
  }
}
5746
5747 #ifdef HAS_ARGBSHUFFLEROW_AVX2
// AVX2 version of ARGBShuffleRow: reorders bytes within each 16-byte
// lane per the pshufb control mask.  Processes 16 ARGB pixels (64 bytes)
// per iteration.
__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           const uint8_t* shuffler,
                                           int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // shuffler
    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
    mov ecx, [esp + 16]  // width

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpshufb ymm0, ymm0, ymm5  // vpshufb shuffles within each 128-bit lane
    vpshufb ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg wloop

    vzeroupper
    ret
  }
}
5775 #endif // HAS_ARGBSHUFFLEROW_AVX2
5776
5777 // YUY2 - Macro-pixel = 2 image pixels
5778 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5779
5780 // UYVY - Macro-pixel = 2 image pixels
5781 // U0Y0V0Y1
5782
// Interleaves planar I422 (Y plane + half-width U and V planes) into the
// packed YUY2 format (Y0 U0 Y1 V0 ...).  Processes 16 pixels (16 Y bytes,
// 8 U and 8 V bytes -> 32 output bytes) per iteration.
__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_y
    mov esi, [esp + 8 + 8]  // src_u
    mov edx, [esp + 8 + 12]  // src_v
    mov edi, [esp + 8 + 16]  // dst_frame
    mov ecx, [esp + 8 + 20]  // width
    sub edx, esi  // edx = src_v - src_u; V is read as [esi + edx].

  convertloop:
    movq xmm2, qword ptr [esi]  // U
    movq xmm3, qword ptr [esi + edx]  // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3  // UV
    movdqu xmm0, [eax]  // Y
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2  // YUYV
    punpckhbw xmm1, xmm2
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm1
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
5819
// Interleaves planar I422 into the packed UYVY format (U0 Y0 V0 Y1 ...).
// Same structure as I422ToYUY2Row_SSE2 but with the chroma bytes leading
// each macropixel.  Processes 16 pixels per iteration.
__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_y
    mov esi, [esp + 8 + 8]  // src_u
    mov edx, [esp + 8 + 12]  // src_v
    mov edi, [esp + 8 + 16]  // dst_frame
    mov ecx, [esp + 8 + 20]  // width
    sub edx, esi  // edx = src_v - src_u; V is read as [esi + edx].

  convertloop:
    movq xmm2, qword ptr [esi]  // U
    movq xmm3, qword ptr [esi + edx]  // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3  // UV
    movdqu xmm0, [eax]  // Y
    movdqa xmm1, xmm2
    lea eax, [eax + 16]
    punpcklbw xmm1, xmm0  // UYVY
    punpckhbw xmm2, xmm0
    movdqu [edi], xmm1
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
5856
5857 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a per-channel cubic polynomial to ARGB pixels:
//   dst = C0 + C1*x + C2*x^2 + C3*x^3
// 'poly' points at four float4 coefficient vectors (C0 at +0, C1 at +16,
// C2 at +32, C3 at +48), one float per B/G/R/A lane.  Processes 2 pixels
// per iteration; assumes width is a multiple of 2 — TODO confirm caller
// contract.
__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* src_argb */
    mov edx, [esp + 4 + 8] /* dst_argb */
    mov esi, [esp + 4 + 12] /* poly */
    mov ecx, [esp + 4 + 16] /* width */
    pxor xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  convertloop:
    // pmovzxbd xmm0, dword ptr [eax]  // BGRA pixel (SSE4.1 alternative)
    // pmovzxbd xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq xmm0, qword ptr [eax]  // BGRABGRA
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm3  // bytes -> words
    movdqa xmm4, xmm0
    punpcklwd xmm0, xmm3  // pixel 0 as 4 ints
    punpckhwd xmm4, xmm3  // pixel 1 as 4 ints
    cvtdq2ps xmm0, xmm0  // 4 floats
    cvtdq2ps xmm4, xmm4
    movdqa xmm1, xmm0  // X
    movdqa xmm5, xmm4
    mulps xmm0, [esi + 16]  // C1 * X
    mulps xmm4, [esi + 16]
    addps xmm0, [esi]  // result = C0 + C1 * X
    addps xmm4, [esi]
    movdqa xmm2, xmm1
    movdqa xmm6, xmm5
    mulps xmm2, xmm1  // X * X
    mulps xmm6, xmm5
    mulps xmm1, xmm2  // X * X * X
    mulps xmm5, xmm6
    mulps xmm2, [esi + 32]  // C2 * X * X
    mulps xmm6, [esi + 32]
    mulps xmm1, [esi + 48]  // C3 * X * X * X
    mulps xmm5, [esi + 48]
    addps xmm0, xmm2  // result += C2 * X * X
    addps xmm4, xmm6
    addps xmm0, xmm1  // result += C3 * X * X * X
    addps xmm4, xmm5
    cvttps2dq xmm0, xmm0  // truncate back to int
    cvttps2dq xmm4, xmm4
    packuswb xmm0, xmm4  // saturate to bytes
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    pop esi
    ret
  }
}
5914 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
5915
5916 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2 + FMA3 version of ARGBPolynomialRow: evaluates the same cubic
// per-channel polynomial (C0 + C1*x + C2*x^2 + C3*x^3) on 2 pixels per
// iteration using fused multiply-adds.  Requires FMA3 support in
// addition to AVX2.
__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* poly */
    vbroadcastf128 ymm4, [ecx]  // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov ecx, [esp + 16] /* width */

    // 2 pixel loop.
  convertloop:
    vpmovzxbd ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea eax, [eax + 8]
    vcvtdq2ps ymm0, ymm0  // X 8 floats
    vmulps ymm2, ymm0, ymm0  // X * X
    vmulps ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq ymm0, ymm0
    vpackusdw ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    vmovq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    vzeroupper
    ret
  }
}
5953 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
5954
5955 #ifdef HAS_HALFFLOATROW_SSE2
// 1.9259299444e-34f is 2^-112.  Multiplying a float by it subtracts 112
// from the exponent, rebiasing from the float bias (127) to the
// half-float bias (15), so that shifting the raw bits right by 13 yields
// the half-float representation directly.
static float kExpBias = 1.9259299444e-34f;
// Converts 8 uint16 values per iteration to half floats: widen to int,
// convert to float, multiply by scale * 2^-112 (see kExpBias), then shift
// the raw float bits right 13 to land exponent/mantissa in half-float
// position.  Truncates rather than rounds.  Assumes scaled values stay in
// the normal half-float range so the signed saturation in packssdw is
// harmless — TODO confirm caller contract.  dst may alias src.
__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    movd xmm4, dword ptr [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */
    mulss xmm4, kExpBias  // fold exponent rebias into the scale
    pshufd xmm4, xmm4, 0
    pxor xmm5, xmm5  // zero for widening words to dwords
    sub edx, eax  // edx = dst - src; store via [src + edx].

    // 8 pixel loop.
  convertloop:
    movdqu xmm2, xmmword ptr [eax]  // 8 shorts
    add eax, 16
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm5
    cvtdq2ps xmm2, xmm2  // convert 8 ints to floats
    punpckhwd xmm3, xmm5
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    psrld xmm2, 13  // raw float bits >> 13 = half-float bits
    psrld xmm3, 13
    packssdw xmm2, xmm3
    movdqu [eax + edx - 16], xmm2  // eax already advanced; back out 16.
    sub ecx, 8
    jg convertloop
    ret
  }
}
5991 #endif // HAS_HALFFLOATROW_SSE2
5992
5993 #ifdef HAS_HALFFLOATROW_AVX2
// AVX2 version of HalfFloatRow: same bit-shift half-float trick as the
// SSE2 variant (scale folded with kExpBias = 2^-112, then raw float bits
// shifted right 13), processing 16 values per iteration.
__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    movd xmm4, dword ptr [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */

    vmulss xmm4, xmm4, kExpBias  // fold exponent rebias into the scale
    vbroadcastss ymm4, xmm4
    vpxor ymm5, ymm5, ymm5  // zero for widening words to dwords
    sub edx, eax  // edx = dst - src; store via [src + edx].

    // 16 pixel loop.
  convertloop:
    vmovdqu ymm2, [eax]  // 16 shorts
    add eax, 32
    vpunpckhwd ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
    vpunpcklwd ymm2, ymm2, ymm5
    vcvtdq2ps ymm3, ymm3  // convert 16 ints to floats
    vcvtdq2ps ymm2, ymm2
    vmulps ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
    vmulps ymm2, ymm2, ymm4
    vpsrld ymm3, ymm3, 13  // float convert to 8 half floats truncate
    vpsrld ymm2, ymm2, 13
    vpackssdw ymm2, ymm2, ymm3  // lane-wise pack undoes lane-wise unpack
    vmovdqu [eax + edx - 32], ymm2  // eax already advanced; back out 32.
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
6029 #endif // HAS_HALFFLOATROW_AVX2
6030
6031 #ifdef HAS_HALFFLOATROW_F16C
6032 __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
6033 uint16_t* dst,
6034 float scale,
6035 int width) {
6036 __asm {
6037 mov eax, [esp + 4] /* src */
6038 mov edx, [esp + 8] /* dst */
6039 vbroadcastss ymm4, [esp + 12] /* scale */
6040 mov ecx, [esp + 16] /* width */
6041 sub edx, eax
6042
6043 // 16 pixel loop.
6044 convertloop:
6045 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
6046 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
6047 add eax, 32
6048 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
6049 vcvtdq2ps ymm3, ymm3
6050 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
6051 vmulps ymm3, ymm3, ymm4
6052 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
6053 vcvtps2ph xmm3, ymm3, 3
6054 vmovdqu [eax + edx + 32], xmm2
6055 vmovdqu [eax + edx + 32 + 16], xmm3
6056 sub ecx, 16
6057 jg convertloop
6058 vzeroupper
6059 ret
6060 }
6061 }
6062 #endif // HAS_HALFFLOATROW_F16C
6063
6064 #ifdef HAS_ARGBCOLORTABLEROW_X86
6065 // Tranform ARGB pixels with color table.
6066 __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
6067 const uint8_t* table_argb,
6068 int width) {
6069 __asm {
6070 push esi
6071 mov eax, [esp + 4 + 4] /* dst_argb */
6072 mov esi, [esp + 4 + 8] /* table_argb */
6073 mov ecx, [esp + 4 + 12] /* width */
6074
6075 // 1 pixel loop.
6076 convertloop:
6077 movzx edx, byte ptr [eax]
6078 lea eax, [eax + 4]
6079 movzx edx, byte ptr [esi + edx * 4]
6080 mov byte ptr [eax - 4], dl
6081 movzx edx, byte ptr [eax - 4 + 1]
6082 movzx edx, byte ptr [esi + edx * 4 + 1]
6083 mov byte ptr [eax - 4 + 1], dl
6084 movzx edx, byte ptr [eax - 4 + 2]
6085 movzx edx, byte ptr [esi + edx * 4 + 2]
6086 mov byte ptr [eax - 4 + 2], dl
6087 movzx edx, byte ptr [eax - 4 + 3]
6088 movzx edx, byte ptr [esi + edx * 4 + 3]
6089 mov byte ptr [eax - 4 + 3], dl
6090 dec ecx
6091 jg convertloop
6092 pop esi
6093 ret
6094 }
6095 }
6096 #endif // HAS_ARGBCOLORTABLEROW_X86
6097
6098 #ifdef HAS_RGBCOLORTABLEROW_X86
6099 // Tranform RGB pixels with color table.
6100 __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
6101 const uint8_t* table_argb,
6102 int width) {
6103 __asm {
6104 push esi
6105 mov eax, [esp + 4 + 4] /* dst_argb */
6106 mov esi, [esp + 4 + 8] /* table_argb */
6107 mov ecx, [esp + 4 + 12] /* width */
6108
6109 // 1 pixel loop.
6110 convertloop:
6111 movzx edx, byte ptr [eax]
6112 lea eax, [eax + 4]
6113 movzx edx, byte ptr [esi + edx * 4]
6114 mov byte ptr [eax - 4], dl
6115 movzx edx, byte ptr [eax - 4 + 1]
6116 movzx edx, byte ptr [esi + edx * 4 + 1]
6117 mov byte ptr [eax - 4 + 1], dl
6118 movzx edx, byte ptr [eax - 4 + 2]
6119 movzx edx, byte ptr [esi + edx * 4 + 2]
6120 mov byte ptr [eax - 4 + 2], dl
6121 dec ecx
6122 jg convertloop
6123
6124 pop esi
6125 ret
6126 }
6127 }
6128 #endif // HAS_RGBCOLORTABLEROW_X86
6129
6130 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6131 // Tranform RGB pixels with luma table.
6132 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
6133 uint8_t* dst_argb,
6134 int width,
6135 const uint8_t* luma,
6136 uint32_t lumacoeff) {
6137 __asm {
6138 push esi
6139 push edi
6140 mov eax, [esp + 8 + 4] /* src_argb */
6141 mov edi, [esp + 8 + 8] /* dst_argb */
6142 mov ecx, [esp + 8 + 12] /* width */
6143 movd xmm2, dword ptr [esp + 8 + 16] // luma table
6144 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
6145 pshufd xmm2, xmm2, 0
6146 pshufd xmm3, xmm3, 0
6147 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
6148 psllw xmm4, 8
6149 pxor xmm5, xmm5
6150
6151 // 4 pixel loop.
6152 convertloop:
6153 movdqu xmm0, xmmword ptr [eax] // generate luma ptr
6154 pmaddubsw xmm0, xmm3
6155 phaddw xmm0, xmm0
6156 pand xmm0, xmm4 // mask out low bits
6157 punpcklwd xmm0, xmm5
6158 paddd xmm0, xmm2 // add table base
6159 movd esi, xmm0
6160 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6161
6162 movzx edx, byte ptr [eax]
6163 movzx edx, byte ptr [esi + edx]
6164 mov byte ptr [edi], dl
6165 movzx edx, byte ptr [eax + 1]
6166 movzx edx, byte ptr [esi + edx]
6167 mov byte ptr [edi + 1], dl
6168 movzx edx, byte ptr [eax + 2]
6169 movzx edx, byte ptr [esi + edx]
6170 mov byte ptr [edi + 2], dl
6171 movzx edx, byte ptr [eax + 3] // copy alpha.
6172 mov byte ptr [edi + 3], dl
6173
6174 movd esi, xmm0
6175 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6176
6177 movzx edx, byte ptr [eax + 4]
6178 movzx edx, byte ptr [esi + edx]
6179 mov byte ptr [edi + 4], dl
6180 movzx edx, byte ptr [eax + 5]
6181 movzx edx, byte ptr [esi + edx]
6182 mov byte ptr [edi + 5], dl
6183 movzx edx, byte ptr [eax + 6]
6184 movzx edx, byte ptr [esi + edx]
6185 mov byte ptr [edi + 6], dl
6186 movzx edx, byte ptr [eax + 7] // copy alpha.
6187 mov byte ptr [edi + 7], dl
6188
6189 movd esi, xmm0
6190 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6191
6192 movzx edx, byte ptr [eax + 8]
6193 movzx edx, byte ptr [esi + edx]
6194 mov byte ptr [edi + 8], dl
6195 movzx edx, byte ptr [eax + 9]
6196 movzx edx, byte ptr [esi + edx]
6197 mov byte ptr [edi + 9], dl
6198 movzx edx, byte ptr [eax + 10]
6199 movzx edx, byte ptr [esi + edx]
6200 mov byte ptr [edi + 10], dl
6201 movzx edx, byte ptr [eax + 11] // copy alpha.
6202 mov byte ptr [edi + 11], dl
6203
6204 movd esi, xmm0
6205
6206 movzx edx, byte ptr [eax + 12]
6207 movzx edx, byte ptr [esi + edx]
6208 mov byte ptr [edi + 12], dl
6209 movzx edx, byte ptr [eax + 13]
6210 movzx edx, byte ptr [esi + edx]
6211 mov byte ptr [edi + 13], dl
6212 movzx edx, byte ptr [eax + 14]
6213 movzx edx, byte ptr [esi + edx]
6214 mov byte ptr [edi + 14], dl
6215 movzx edx, byte ptr [eax + 15] // copy alpha.
6216 mov byte ptr [edi + 15], dl
6217
6218 lea eax, [eax + 16]
6219 lea edi, [edi + 16]
6220 sub ecx, 4
6221 jg convertloop
6222
6223 pop edi
6224 pop esi
6225 ret
6226 }
6227 }
6228 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6229
6230 #endif // defined(_M_X64)
6231
6232 #ifdef __cplusplus
6233 } // extern "C"
6234 } // namespace libyuv
6235 #endif
6236
6237 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6238