1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 // This module is for Visual C 32/64 bit and clangcl 32 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15 (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16
17 #if defined(_M_X64)
18 #include <emmintrin.h>
19 #include <tmmintrin.h> // For _mm_maddubs_epi16
20 #endif
21
22 #ifdef __cplusplus
23 namespace libyuv {
24 extern "C" {
25 #endif
26
27 // 64 bit
28 #if defined(_M_X64)
29
30 // Read 4 UV from 422, upsample to 8 UV.
31 #define READYUV422 \
32 xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
33 xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
34 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
35 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
36 u_buf += 4; \
37 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
38 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
39 y_buf += 8;
40
41 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
42 #define READYUVA422 \
43 xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
44 xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
45 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
46 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
47 u_buf += 4; \
48 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
49 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
50 y_buf += 8; \
51 xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
52 a_buf += 8;
53
54 // Convert 8 pixels: 8 UV and 8 Y.
55 #define YUVTORGB(yuvconstants) \
56 xmm1 = _mm_loadu_si128(&xmm0); \
57 xmm2 = _mm_loadu_si128(&xmm0); \
58 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
59 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
60 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
61 xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
62 xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
63 xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
64 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
65 xmm0 = _mm_adds_epi16(xmm0, xmm4); \
66 xmm1 = _mm_adds_epi16(xmm1, xmm4); \
67 xmm2 = _mm_adds_epi16(xmm2, xmm4); \
68 xmm0 = _mm_srai_epi16(xmm0, 6); \
69 xmm1 = _mm_srai_epi16(xmm1, 6); \
70 xmm2 = _mm_srai_epi16(xmm2, 6); \
71 xmm0 = _mm_packus_epi16(xmm0, xmm0); \
72 xmm1 = _mm_packus_epi16(xmm1, xmm1); \
73 xmm2 = _mm_packus_epi16(xmm2, xmm2);
74
75 // Store 8 ARGB values.
76 #define STOREARGB \
77 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
78 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
79 xmm1 = _mm_loadu_si128(&xmm0); \
80 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
81 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
82 _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
83 _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
84 dst_argb += 32;
85
86 #if defined(HAS_I422TOARGBROW_SSSE3)
I422ToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)87 void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
88 const uint8_t* u_buf,
89 const uint8_t* v_buf,
90 uint8_t* dst_argb,
91 const struct YuvConstants* yuvconstants,
92 int width) {
93 __m128i xmm0, xmm1, xmm2, xmm4;
94 const __m128i xmm5 = _mm_set1_epi8(-1);
95 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
96 while (width > 0) {
97 READYUV422
98 YUVTORGB(yuvconstants)
99 STOREARGB
100 width -= 8;
101 }
102 }
103 #endif
104
105 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
I422AlphaToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,const uint8_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)106 void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
107 const uint8_t* u_buf,
108 const uint8_t* v_buf,
109 const uint8_t* a_buf,
110 uint8_t* dst_argb,
111 const struct YuvConstants* yuvconstants,
112 int width) {
113 __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
114 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
115 while (width > 0) {
116 READYUVA422
117 YUVTORGB(yuvconstants)
118 STOREARGB
119 width -= 8;
120 }
121 }
122 #endif
123
124 // 32 bit
125 #else // defined(_M_X64)
126 #ifdef HAS_ARGBTOYROW_SSSE3
127
128 // Constants for ARGB.
129 static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
130 13, 65, 33, 0, 13, 65, 33, 0};
131
132 // JPeg full range.
133 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
134 15, 75, 38, 0, 15, 75, 38, 0};
135
136 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
137 112, -74, -38, 0, 112, -74, -38, 0};
138
139 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
140 127, -84, -43, 0, 127, -84, -43, 0};
141
142 static const vec8 kARGBToV = {
143 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
144 };
145
146 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
147 -20, -107, 127, 0, -20, -107, 127, 0};
148
149 // vpshufb for vphaddw + vpackuswb packed to shorts.
150 static const lvec8 kShufARGBToUV_AVX = {
151 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
152 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
153
154 // Constants for BGRA.
155 static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
156 0, 33, 65, 13, 0, 33, 65, 13};
157
158 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
159 0, -38, -74, 112, 0, -38, -74, 112};
160
161 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
162 0, 112, -94, -18, 0, 112, -94, -18};
163
164 // Constants for ABGR.
165 static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
166 33, 65, 13, 0, 33, 65, 13, 0};
167
168 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
169 -38, -74, 112, 0, -38, -74, 112, 0};
170
171 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
172 112, -94, -18, 0, 112, -94, -18, 0};
173
174 // Constants for RGBA.
175 static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
176 0, 13, 65, 33, 0, 13, 65, 33};
177
178 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
179 0, 112, -74, -38, 0, 112, -74, -38};
180
181 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
182 0, -18, -94, 112, 0, -18, -94, 112};
183
184 static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
185 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
186
187 // 7 bit fixed point 0.5.
188 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
189
190 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
191 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
192
193 static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
194 0x8080u, 0x8080u, 0x8080u, 0x8080u};
195
196 // Shuffle table for converting RGB24 to ARGB.
197 static const uvec8 kShuffleMaskRGB24ToARGB = {
198 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
199
200 // Shuffle table for converting RAW to ARGB.
201 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
202 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
203
204 // Shuffle table for converting RAW to RGB24. First 8.
205 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
206 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
207 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
208
209 // Shuffle table for converting RAW to RGB24. Middle 8.
210 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
211 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
212 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
213
214 // Shuffle table for converting RAW to RGB24. Last 8.
215 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
216 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
217 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
218
219 // Shuffle table for converting ARGB to RGB24.
220 static const uvec8 kShuffleMaskARGBToRGB24 = {
221 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
222
223 // Shuffle table for converting ARGB to RAW.
224 static const uvec8 kShuffleMaskARGBToRAW = {
225 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
226
227 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
228 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
229 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
230
231 // YUY2 shuf 16 Y to 32 Y.
232 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
233 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
234 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
235
236 // YUY2 shuf 8 UV to 16 UV.
237 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
238 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
239 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
240
241 // UYVY shuf 16 Y to 32 Y.
242 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
243 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
244 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
245
246 // UYVY shuf 8 UV to 16 UV.
247 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
248 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
249 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
250
251 // NV21 shuf 8 VU to 16 UV.
252 static const lvec8 kShuffleNV21 = {
253 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
254 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
255 };
256
257 // Duplicates gray value 3 times and fills in alpha opaque.
258 __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
259 uint8_t* dst_argb,
260 int width) {
261 __asm {
262 mov eax, [esp + 4] // src_y
263 mov edx, [esp + 8] // dst_argb
264 mov ecx, [esp + 12] // width
265 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
266 pslld xmm5, 24
267
268 convertloop:
269 movq xmm0, qword ptr [eax]
270 lea eax, [eax + 8]
271 punpcklbw xmm0, xmm0
272 movdqa xmm1, xmm0
273 punpcklwd xmm0, xmm0
274 punpckhwd xmm1, xmm1
275 por xmm0, xmm5
276 por xmm1, xmm5
277 movdqu [edx], xmm0
278 movdqu [edx + 16], xmm1
279 lea edx, [edx + 32]
280 sub ecx, 8
281 jg convertloop
282 ret
283 }
284 }
285
286 #ifdef HAS_J400TOARGBROW_AVX2
287 // Duplicates gray value 3 times and fills in alpha opaque.
288 __declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
289 uint8_t* dst_argb,
290 int width) {
291 __asm {
292 mov eax, [esp + 4] // src_y
293 mov edx, [esp + 8] // dst_argb
294 mov ecx, [esp + 12] // width
295 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
296 vpslld ymm5, ymm5, 24
297
298 convertloop:
299 vmovdqu xmm0, [eax]
300 lea eax, [eax + 16]
301 vpermq ymm0, ymm0, 0xd8
302 vpunpcklbw ymm0, ymm0, ymm0
303 vpermq ymm0, ymm0, 0xd8
304 vpunpckhwd ymm1, ymm0, ymm0
305 vpunpcklwd ymm0, ymm0, ymm0
306 vpor ymm0, ymm0, ymm5
307 vpor ymm1, ymm1, ymm5
308 vmovdqu [edx], ymm0
309 vmovdqu [edx + 32], ymm1
310 lea edx, [edx + 64]
311 sub ecx, 16
312 jg convertloop
313 vzeroupper
314 ret
315 }
316 }
317 #endif // HAS_J400TOARGBROW_AVX2
318
319 __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
320 uint8_t* dst_argb,
321 int width) {
322 __asm {
323 mov eax, [esp + 4] // src_rgb24
324 mov edx, [esp + 8] // dst_argb
325 mov ecx, [esp + 12] // width
326 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
327 pslld xmm5, 24
328 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
329
330 convertloop:
331 movdqu xmm0, [eax]
332 movdqu xmm1, [eax + 16]
333 movdqu xmm3, [eax + 32]
334 lea eax, [eax + 48]
335 movdqa xmm2, xmm3
336 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
337 pshufb xmm2, xmm4
338 por xmm2, xmm5
339 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
340 pshufb xmm0, xmm4
341 movdqu [edx + 32], xmm2
342 por xmm0, xmm5
343 pshufb xmm1, xmm4
344 movdqu [edx], xmm0
345 por xmm1, xmm5
346 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
347 pshufb xmm3, xmm4
348 movdqu [edx + 16], xmm1
349 por xmm3, xmm5
350 movdqu [edx + 48], xmm3
351 lea edx, [edx + 64]
352 sub ecx, 16
353 jg convertloop
354 ret
355 }
356 }
357
358 __declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
359 uint8_t* dst_argb,
360 int width) {
361 __asm {
362 mov eax, [esp + 4] // src_raw
363 mov edx, [esp + 8] // dst_argb
364 mov ecx, [esp + 12] // width
365 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
366 pslld xmm5, 24
367 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
368
369 convertloop:
370 movdqu xmm0, [eax]
371 movdqu xmm1, [eax + 16]
372 movdqu xmm3, [eax + 32]
373 lea eax, [eax + 48]
374 movdqa xmm2, xmm3
375 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
376 pshufb xmm2, xmm4
377 por xmm2, xmm5
378 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
379 pshufb xmm0, xmm4
380 movdqu [edx + 32], xmm2
381 por xmm0, xmm5
382 pshufb xmm1, xmm4
383 movdqu [edx], xmm0
384 por xmm1, xmm5
385 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
386 pshufb xmm3, xmm4
387 movdqu [edx + 16], xmm1
388 por xmm3, xmm5
389 movdqu [edx + 48], xmm3
390 lea edx, [edx + 64]
391 sub ecx, 16
392 jg convertloop
393 ret
394 }
395 }
396
397 __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
398 uint8_t* dst_rgb24,
399 int width) {
400 __asm {
401 mov eax, [esp + 4] // src_raw
402 mov edx, [esp + 8] // dst_rgb24
403 mov ecx, [esp + 12] // width
404 movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
405 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
406 movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
407
408 convertloop:
409 movdqu xmm0, [eax]
410 movdqu xmm1, [eax + 4]
411 movdqu xmm2, [eax + 8]
412 lea eax, [eax + 24]
413 pshufb xmm0, xmm3
414 pshufb xmm1, xmm4
415 pshufb xmm2, xmm5
416 movq qword ptr [edx], xmm0
417 movq qword ptr [edx + 8], xmm1
418 movq qword ptr [edx + 16], xmm2
419 lea edx, [edx + 24]
420 sub ecx, 8
421 jg convertloop
422 ret
423 }
424 }
425
426 // pmul method to replicate bits.
427 // Math to replicate bits:
428 // (v << 8) | (v << 3)
429 // v * 256 + v * 8
430 // v * (256 + 8)
431 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
432 // 20 instructions.
433 __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
434 uint8_t* dst_argb,
435 int width) {
436 __asm {
437 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
438 movd xmm5, eax
439 pshufd xmm5, xmm5, 0
440 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
441 movd xmm6, eax
442 pshufd xmm6, xmm6, 0
443 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
444 psllw xmm3, 11
445 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
446 psllw xmm4, 10
447 psrlw xmm4, 5
448 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
449 psllw xmm7, 8
450
451 mov eax, [esp + 4] // src_rgb565
452 mov edx, [esp + 8] // dst_argb
453 mov ecx, [esp + 12] // width
454 sub edx, eax
455 sub edx, eax
456
457 convertloop:
458 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
459 movdqa xmm1, xmm0
460 movdqa xmm2, xmm0
461 pand xmm1, xmm3 // R in upper 5 bits
462 psllw xmm2, 11 // B in upper 5 bits
463 pmulhuw xmm1, xmm5 // * (256 + 8)
464 pmulhuw xmm2, xmm5 // * (256 + 8)
465 psllw xmm1, 8
466 por xmm1, xmm2 // RB
467 pand xmm0, xmm4 // G in middle 6 bits
468 pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
469 por xmm0, xmm7 // AG
470 movdqa xmm2, xmm1
471 punpcklbw xmm1, xmm0
472 punpckhbw xmm2, xmm0
473 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
474 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
475 lea eax, [eax + 16]
476 sub ecx, 8
477 jg convertloop
478 ret
479 }
480 }
481
482 #ifdef HAS_RGB565TOARGBROW_AVX2
483 // pmul method to replicate bits.
484 // Math to replicate bits:
485 // (v << 8) | (v << 3)
486 // v * 256 + v * 8
487 // v * (256 + 8)
488 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
489 __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
490 uint8_t* dst_argb,
491 int width) {
492 __asm {
493 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
494 vmovd xmm5, eax
495 vbroadcastss ymm5, xmm5
496 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
497 vmovd xmm6, eax
498 vbroadcastss ymm6, xmm6
499 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
500 vpsllw ymm3, ymm3, 11
501 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
502 vpsllw ymm4, ymm4, 10
503 vpsrlw ymm4, ymm4, 5
504 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
505 vpsllw ymm7, ymm7, 8
506
507 mov eax, [esp + 4] // src_rgb565
508 mov edx, [esp + 8] // dst_argb
509 mov ecx, [esp + 12] // width
510 sub edx, eax
511 sub edx, eax
512
513 convertloop:
514 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
515 vpand ymm1, ymm0, ymm3 // R in upper 5 bits
516 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
517 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
518 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
519 vpsllw ymm1, ymm1, 8
520 vpor ymm1, ymm1, ymm2 // RB
521 vpand ymm0, ymm0, ymm4 // G in middle 6 bits
522 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
523 vpor ymm0, ymm0, ymm7 // AG
524 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
525 vpermq ymm1, ymm1, 0xd8
526 vpunpckhbw ymm2, ymm1, ymm0
527 vpunpcklbw ymm1, ymm1, ymm0
528 vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB
529 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB
530 lea eax, [eax + 32]
531 sub ecx, 16
532 jg convertloop
533 vzeroupper
534 ret
535 }
536 }
537 #endif // HAS_RGB565TOARGBROW_AVX2
538
539 #ifdef HAS_ARGB1555TOARGBROW_AVX2
540 __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
541 uint8_t* dst_argb,
542 int width) {
543 __asm {
544 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
545 vmovd xmm5, eax
546 vbroadcastss ymm5, xmm5
547 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
548 vmovd xmm6, eax
549 vbroadcastss ymm6, xmm6
550 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
551 vpsllw ymm3, ymm3, 11
552 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
553 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
554 vpsllw ymm7, ymm7, 8
555
556 mov eax, [esp + 4] // src_argb1555
557 mov edx, [esp + 8] // dst_argb
558 mov ecx, [esp + 12] // width
559 sub edx, eax
560 sub edx, eax
561
562 convertloop:
563 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
564 vpsllw ymm1, ymm0, 1 // R in upper 5 bits
565 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
566 vpand ymm1, ymm1, ymm3
567 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
568 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
569 vpsllw ymm1, ymm1, 8
570 vpor ymm1, ymm1, ymm2 // RB
571 vpsraw ymm2, ymm0, 8 // A
572 vpand ymm0, ymm0, ymm4 // G in middle 5 bits
573 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
574 vpand ymm2, ymm2, ymm7
575 vpor ymm0, ymm0, ymm2 // AG
576 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
577 vpermq ymm1, ymm1, 0xd8
578 vpunpckhbw ymm2, ymm1, ymm0
579 vpunpcklbw ymm1, ymm1, ymm0
580 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
581 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
582 lea eax, [eax + 32]
583 sub ecx, 16
584 jg convertloop
585 vzeroupper
586 ret
587 }
588 }
589 #endif // HAS_ARGB1555TOARGBROW_AVX2
590
591 #ifdef HAS_ARGB4444TOARGBROW_AVX2
592 __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
593 uint8_t* dst_argb,
594 int width) {
595 __asm {
596 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
597 vmovd xmm4, eax
598 vbroadcastss ymm4, xmm4
599 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
600 mov eax, [esp + 4] // src_argb4444
601 mov edx, [esp + 8] // dst_argb
602 mov ecx, [esp + 12] // width
603 sub edx, eax
604 sub edx, eax
605
606 convertloop:
607 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
608 vpand ymm2, ymm0, ymm5 // mask high nibbles
609 vpand ymm0, ymm0, ymm4 // mask low nibbles
610 vpsrlw ymm3, ymm2, 4
611 vpsllw ymm1, ymm0, 4
612 vpor ymm2, ymm2, ymm3
613 vpor ymm0, ymm0, ymm1
614 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
615 vpermq ymm2, ymm2, 0xd8
616 vpunpckhbw ymm1, ymm0, ymm2
617 vpunpcklbw ymm0, ymm0, ymm2
618 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
619 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
620 lea eax, [eax + 32]
621 sub ecx, 16
622 jg convertloop
623 vzeroupper
624 ret
625 }
626 }
627 #endif // HAS_ARGB4444TOARGBROW_AVX2
628
629 // 24 instructions
630 __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
631 uint8_t* dst_argb,
632 int width) {
633 __asm {
634 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
635 movd xmm5, eax
636 pshufd xmm5, xmm5, 0
637 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
638 movd xmm6, eax
639 pshufd xmm6, xmm6, 0
640 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
641 psllw xmm3, 11
642 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
643 psrlw xmm4, 6
644 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
645 psllw xmm7, 8
646
647 mov eax, [esp + 4] // src_argb1555
648 mov edx, [esp + 8] // dst_argb
649 mov ecx, [esp + 12] // width
650 sub edx, eax
651 sub edx, eax
652
653 convertloop:
654 movdqu xmm0, [eax] // fetch 8 pixels of 1555
655 movdqa xmm1, xmm0
656 movdqa xmm2, xmm0
657 psllw xmm1, 1 // R in upper 5 bits
658 psllw xmm2, 11 // B in upper 5 bits
659 pand xmm1, xmm3
660 pmulhuw xmm2, xmm5 // * (256 + 8)
661 pmulhuw xmm1, xmm5 // * (256 + 8)
662 psllw xmm1, 8
663 por xmm1, xmm2 // RB
664 movdqa xmm2, xmm0
665 pand xmm0, xmm4 // G in middle 5 bits
666 psraw xmm2, 8 // A
667 pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
668 pand xmm2, xmm7
669 por xmm0, xmm2 // AG
670 movdqa xmm2, xmm1
671 punpcklbw xmm1, xmm0
672 punpckhbw xmm2, xmm0
673 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
674 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
675 lea eax, [eax + 16]
676 sub ecx, 8
677 jg convertloop
678 ret
679 }
680 }
681
682 // 18 instructions.
683 __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
684 uint8_t* dst_argb,
685 int width) {
686 __asm {
687 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
688 movd xmm4, eax
689 pshufd xmm4, xmm4, 0
690 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
691 pslld xmm5, 4
692 mov eax, [esp + 4] // src_argb4444
693 mov edx, [esp + 8] // dst_argb
694 mov ecx, [esp + 12] // width
695 sub edx, eax
696 sub edx, eax
697
698 convertloop:
699 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
700 movdqa xmm2, xmm0
701 pand xmm0, xmm4 // mask low nibbles
702 pand xmm2, xmm5 // mask high nibbles
703 movdqa xmm1, xmm0
704 movdqa xmm3, xmm2
705 psllw xmm1, 4
706 psrlw xmm3, 4
707 por xmm0, xmm1
708 por xmm2, xmm3
709 movdqa xmm1, xmm0
710 punpcklbw xmm0, xmm2
711 punpckhbw xmm1, xmm2
712 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
713 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
714 lea eax, [eax + 16]
715 sub ecx, 8
716 jg convertloop
717 ret
718 }
719 }
720
721 __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
722 uint8_t* dst_rgb,
723 int width) {
724 __asm {
725 mov eax, [esp + 4] // src_argb
726 mov edx, [esp + 8] // dst_rgb
727 mov ecx, [esp + 12] // width
728 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
729
730 convertloop:
731 movdqu xmm0, [eax] // fetch 16 pixels of argb
732 movdqu xmm1, [eax + 16]
733 movdqu xmm2, [eax + 32]
734 movdqu xmm3, [eax + 48]
735 lea eax, [eax + 64]
736 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
737 pshufb xmm1, xmm6
738 pshufb xmm2, xmm6
739 pshufb xmm3, xmm6
740 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
741 psrldq xmm1, 4 // 8 bytes from 1
742 pslldq xmm4, 12 // 4 bytes from 1 for 0
743 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
744 por xmm0, xmm4 // 4 bytes from 1 for 0
745 pslldq xmm5, 8 // 8 bytes from 2 for 1
746 movdqu [edx], xmm0 // store 0
747 por xmm1, xmm5 // 8 bytes from 2 for 1
748 psrldq xmm2, 8 // 4 bytes from 2
749 pslldq xmm3, 4 // 12 bytes from 3 for 2
750 por xmm2, xmm3 // 12 bytes from 3 for 2
751 movdqu [edx + 16], xmm1 // store 1
752 movdqu [edx + 32], xmm2 // store 2
753 lea edx, [edx + 48]
754 sub ecx, 16
755 jg convertloop
756 ret
757 }
758 }
759
760 __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
761 uint8_t* dst_rgb,
762 int width) {
763 __asm {
764 mov eax, [esp + 4] // src_argb
765 mov edx, [esp + 8] // dst_rgb
766 mov ecx, [esp + 12] // width
767 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
768
769 convertloop:
770 movdqu xmm0, [eax] // fetch 16 pixels of argb
771 movdqu xmm1, [eax + 16]
772 movdqu xmm2, [eax + 32]
773 movdqu xmm3, [eax + 48]
774 lea eax, [eax + 64]
775 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
776 pshufb xmm1, xmm6
777 pshufb xmm2, xmm6
778 pshufb xmm3, xmm6
779 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
780 psrldq xmm1, 4 // 8 bytes from 1
781 pslldq xmm4, 12 // 4 bytes from 1 for 0
782 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
783 por xmm0, xmm4 // 4 bytes from 1 for 0
784 pslldq xmm5, 8 // 8 bytes from 2 for 1
785 movdqu [edx], xmm0 // store 0
786 por xmm1, xmm5 // 8 bytes from 2 for 1
787 psrldq xmm2, 8 // 4 bytes from 2
788 pslldq xmm3, 4 // 12 bytes from 3 for 2
789 por xmm2, xmm3 // 12 bytes from 3 for 2
790 movdqu [edx + 16], xmm1 // store 1
791 movdqu [edx + 32], xmm2 // store 2
792 lea edx, [edx + 48]
793 sub ecx, 16
794 jg convertloop
795 ret
796 }
797 }
798
799 __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
800 uint8_t* dst_rgb,
801 int width) {
802 __asm {
803 mov eax, [esp + 4] // src_argb
804 mov edx, [esp + 8] // dst_rgb
805 mov ecx, [esp + 12] // width
806 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
807 psrld xmm3, 27
808 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
809 psrld xmm4, 26
810 pslld xmm4, 5
811 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
812 pslld xmm5, 11
813
814 convertloop:
815 movdqu xmm0, [eax] // fetch 4 pixels of argb
816 movdqa xmm1, xmm0 // B
817 movdqa xmm2, xmm0 // G
818 pslld xmm0, 8 // R
819 psrld xmm1, 3 // B
820 psrld xmm2, 5 // G
821 psrad xmm0, 16 // R
822 pand xmm1, xmm3 // B
823 pand xmm2, xmm4 // G
824 pand xmm0, xmm5 // R
825 por xmm1, xmm2 // BG
826 por xmm0, xmm1 // BGR
827 packssdw xmm0, xmm0
828 lea eax, [eax + 16]
829 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
830 lea edx, [edx + 8]
831 sub ecx, 4
832 jg convertloop
833 ret
834 }
835 }
836
837 __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
838 uint8_t* dst_rgb,
839 const uint32_t dither4,
840 int width) {
841 __asm {
842
843 mov eax, [esp + 4] // src_argb
844 mov edx, [esp + 8] // dst_rgb
845 movd xmm6, [esp + 12] // dither4
846 mov ecx, [esp + 16] // width
847 punpcklbw xmm6, xmm6 // make dither 16 bytes
848 movdqa xmm7, xmm6
849 punpcklwd xmm6, xmm6
850 punpckhwd xmm7, xmm7
851 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
852 psrld xmm3, 27
853 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
854 psrld xmm4, 26
855 pslld xmm4, 5
856 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
857 pslld xmm5, 11
858
859 convertloop:
860 movdqu xmm0, [eax] // fetch 4 pixels of argb
861 paddusb xmm0, xmm6 // add dither
862 movdqa xmm1, xmm0 // B
863 movdqa xmm2, xmm0 // G
864 pslld xmm0, 8 // R
865 psrld xmm1, 3 // B
866 psrld xmm2, 5 // G
867 psrad xmm0, 16 // R
868 pand xmm1, xmm3 // B
869 pand xmm2, xmm4 // G
870 pand xmm0, xmm5 // R
871 por xmm1, xmm2 // BG
872 por xmm0, xmm1 // BGR
873 packssdw xmm0, xmm0
874 lea eax, [eax + 16]
875 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
876 lea edx, [edx + 8]
877 sub ecx, 4
878 jg convertloop
879 ret
880 }
881 }
882
883 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
884 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
885 uint8_t* dst_rgb,
886 const uint32_t dither4,
887 int width) {
888 __asm {
889 mov eax, [esp + 4] // src_argb
890 mov edx, [esp + 8] // dst_rgb
891 vbroadcastss xmm6, [esp + 12] // dither4
892 mov ecx, [esp + 16] // width
893 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
894 vpermq ymm6, ymm6, 0xd8
895 vpunpcklwd ymm6, ymm6, ymm6
896 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
897 vpsrld ymm3, ymm3, 27
898 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
899 vpsrld ymm4, ymm4, 26
900 vpslld ymm4, ymm4, 5
901 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
902
903 convertloop:
904 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
905 vpaddusb ymm0, ymm0, ymm6 // add dither
906 vpsrld ymm2, ymm0, 5 // G
907 vpsrld ymm1, ymm0, 3 // B
908 vpsrld ymm0, ymm0, 8 // R
909 vpand ymm2, ymm2, ymm4 // G
910 vpand ymm1, ymm1, ymm3 // B
911 vpand ymm0, ymm0, ymm5 // R
912 vpor ymm1, ymm1, ymm2 // BG
913 vpor ymm0, ymm0, ymm1 // BGR
914 vpackusdw ymm0, ymm0, ymm0
915 vpermq ymm0, ymm0, 0xd8
916 lea eax, [eax + 32]
917 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
918 lea edx, [edx + 16]
919 sub ecx, 8
920 jg convertloop
921 vzeroupper
922 ret
923 }
924 }
925 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
926
927 // TODO(fbarchard): Improve sign extension/packing.
928 __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
929 uint8_t* dst_rgb,
930 int width) {
931 __asm {
932 mov eax, [esp + 4] // src_argb
933 mov edx, [esp + 8] // dst_rgb
934 mov ecx, [esp + 12] // width
935 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
936 psrld xmm4, 27
937 movdqa xmm5, xmm4 // generate mask 0x000003e0
938 pslld xmm5, 5
939 movdqa xmm6, xmm4 // generate mask 0x00007c00
940 pslld xmm6, 10
941 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
942 pslld xmm7, 15
943
944 convertloop:
945 movdqu xmm0, [eax] // fetch 4 pixels of argb
946 movdqa xmm1, xmm0 // B
947 movdqa xmm2, xmm0 // G
948 movdqa xmm3, xmm0 // R
949 psrad xmm0, 16 // A
950 psrld xmm1, 3 // B
951 psrld xmm2, 6 // G
952 psrld xmm3, 9 // R
953 pand xmm0, xmm7 // A
954 pand xmm1, xmm4 // B
955 pand xmm2, xmm5 // G
956 pand xmm3, xmm6 // R
957 por xmm0, xmm1 // BA
958 por xmm2, xmm3 // GR
959 por xmm0, xmm2 // BGRA
960 packssdw xmm0, xmm0
961 lea eax, [eax + 16]
962 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
963 lea edx, [edx + 8]
964 sub ecx, 4
965 jg convertloop
966 ret
967 }
968 }
969
970 __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
971 uint8_t* dst_rgb,
972 int width) {
973 __asm {
974 mov eax, [esp + 4] // src_argb
975 mov edx, [esp + 8] // dst_rgb
976 mov ecx, [esp + 12] // width
977 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
978 psllw xmm4, 12
979 movdqa xmm3, xmm4 // generate mask 0x00f000f0
980 psrlw xmm3, 8
981
982 convertloop:
983 movdqu xmm0, [eax] // fetch 4 pixels of argb
984 movdqa xmm1, xmm0
985 pand xmm0, xmm3 // low nibble
986 pand xmm1, xmm4 // high nibble
987 psrld xmm0, 4
988 psrld xmm1, 8
989 por xmm0, xmm1
990 packuswb xmm0, xmm0
991 lea eax, [eax + 16]
992 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
993 lea edx, [edx + 8]
994 sub ecx, 4
995 jg convertloop
996 ret
997 }
998 }
999
1000 #ifdef HAS_ARGBTORGB565ROW_AVX2
1001 __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
1002 uint8_t* dst_rgb,
1003 int width) {
1004 __asm {
1005 mov eax, [esp + 4] // src_argb
1006 mov edx, [esp + 8] // dst_rgb
1007 mov ecx, [esp + 12] // width
1008 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
1009 vpsrld ymm3, ymm3, 27
1010 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
1011 vpsrld ymm4, ymm4, 26
1012 vpslld ymm4, ymm4, 5
1013 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
1014
1015 convertloop:
1016 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1017 vpsrld ymm2, ymm0, 5 // G
1018 vpsrld ymm1, ymm0, 3 // B
1019 vpsrld ymm0, ymm0, 8 // R
1020 vpand ymm2, ymm2, ymm4 // G
1021 vpand ymm1, ymm1, ymm3 // B
1022 vpand ymm0, ymm0, ymm5 // R
1023 vpor ymm1, ymm1, ymm2 // BG
1024 vpor ymm0, ymm0, ymm1 // BGR
1025 vpackusdw ymm0, ymm0, ymm0
1026 vpermq ymm0, ymm0, 0xd8
1027 lea eax, [eax + 32]
1028 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
1029 lea edx, [edx + 16]
1030 sub ecx, 8
1031 jg convertloop
1032 vzeroupper
1033 ret
1034 }
1035 }
1036 #endif // HAS_ARGBTORGB565ROW_AVX2
1037
1038 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
1039 __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
1040 uint8_t* dst_rgb,
1041 int width) {
1042 __asm {
1043 mov eax, [esp + 4] // src_argb
1044 mov edx, [esp + 8] // dst_rgb
1045 mov ecx, [esp + 12] // width
1046 vpcmpeqb ymm4, ymm4, ymm4
1047 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
1048 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
1049 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
1050 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
1051 vpslld ymm7, ymm7, 15
1052
1053 convertloop:
1054 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1055 vpsrld ymm3, ymm0, 9 // R
1056 vpsrld ymm2, ymm0, 6 // G
1057 vpsrld ymm1, ymm0, 3 // B
1058 vpsrad ymm0, ymm0, 16 // A
1059 vpand ymm3, ymm3, ymm6 // R
1060 vpand ymm2, ymm2, ymm5 // G
1061 vpand ymm1, ymm1, ymm4 // B
1062 vpand ymm0, ymm0, ymm7 // A
1063 vpor ymm0, ymm0, ymm1 // BA
1064 vpor ymm2, ymm2, ymm3 // GR
1065 vpor ymm0, ymm0, ymm2 // BGRA
1066 vpackssdw ymm0, ymm0, ymm0
1067 vpermq ymm0, ymm0, 0xd8
1068 lea eax, [eax + 32]
1069 vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
1070 lea edx, [edx + 16]
1071 sub ecx, 8
1072 jg convertloop
1073 vzeroupper
1074 ret
1075 }
1076 }
1077 #endif // HAS_ARGBTOARGB1555ROW_AVX2
1078
1079 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
1080 __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
1081 uint8_t* dst_rgb,
1082 int width) {
1083 __asm {
1084 mov eax, [esp + 4] // src_argb
1085 mov edx, [esp + 8] // dst_rgb
1086 mov ecx, [esp + 12] // width
1087 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
1088 vpsllw ymm4, ymm4, 12
1089 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
1090
1091 convertloop:
1092 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1093 vpand ymm1, ymm0, ymm4 // high nibble
1094 vpand ymm0, ymm0, ymm3 // low nibble
1095 vpsrld ymm1, ymm1, 8
1096 vpsrld ymm0, ymm0, 4
1097 vpor ymm0, ymm0, ymm1
1098 vpackuswb ymm0, ymm0, ymm0
1099 vpermq ymm0, ymm0, 0xd8
1100 lea eax, [eax + 32]
1101 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
1102 lea edx, [edx + 16]
1103 sub ecx, 8
1104 jg convertloop
1105 vzeroupper
1106 ret
1107 }
1108 }
1109 #endif // HAS_ARGBTOARGB4444ROW_AVX2
1110
1111 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1112 __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
1113 uint8_t* dst_y,
1114 int width) {
1115 __asm {
1116 mov eax, [esp + 4] /* src_argb */
1117 mov edx, [esp + 8] /* dst_y */
1118 mov ecx, [esp + 12] /* width */
1119 movdqa xmm4, xmmword ptr kARGBToY
1120 movdqa xmm5, xmmword ptr kAddY16
1121
1122 convertloop:
1123 movdqu xmm0, [eax]
1124 movdqu xmm1, [eax + 16]
1125 movdqu xmm2, [eax + 32]
1126 movdqu xmm3, [eax + 48]
1127 pmaddubsw xmm0, xmm4
1128 pmaddubsw xmm1, xmm4
1129 pmaddubsw xmm2, xmm4
1130 pmaddubsw xmm3, xmm4
1131 lea eax, [eax + 64]
1132 phaddw xmm0, xmm1
1133 phaddw xmm2, xmm3
1134 psrlw xmm0, 7
1135 psrlw xmm2, 7
1136 packuswb xmm0, xmm2
1137 paddb xmm0, xmm5
1138 movdqu [edx], xmm0
1139 lea edx, [edx + 16]
1140 sub ecx, 16
1141 jg convertloop
1142 ret
1143 }
1144 }
1145
1146 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1147 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1148 __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
1149 uint8_t* dst_y,
1150 int width) {
1151 __asm {
1152 mov eax, [esp + 4] /* src_argb */
1153 mov edx, [esp + 8] /* dst_y */
1154 mov ecx, [esp + 12] /* width */
1155 movdqa xmm4, xmmword ptr kARGBToYJ
1156 movdqa xmm5, xmmword ptr kAddYJ64
1157
1158 convertloop:
1159 movdqu xmm0, [eax]
1160 movdqu xmm1, [eax + 16]
1161 movdqu xmm2, [eax + 32]
1162 movdqu xmm3, [eax + 48]
1163 pmaddubsw xmm0, xmm4
1164 pmaddubsw xmm1, xmm4
1165 pmaddubsw xmm2, xmm4
1166 pmaddubsw xmm3, xmm4
1167 lea eax, [eax + 64]
1168 phaddw xmm0, xmm1
1169 phaddw xmm2, xmm3
1170 paddw xmm0, xmm5 // Add .5 for rounding.
1171 paddw xmm2, xmm5
1172 psrlw xmm0, 7
1173 psrlw xmm2, 7
1174 packuswb xmm0, xmm2
1175 movdqu [edx], xmm0
1176 lea edx, [edx + 16]
1177 sub ecx, 16
1178 jg convertloop
1179 ret
1180 }
1181 }
1182
1183 #ifdef HAS_ARGBTOYROW_AVX2
1184 // vpermd for vphaddw + vpackuswb vpermd.
1185 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1186
1187 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1188 __declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
1189 uint8_t* dst_y,
1190 int width) {
1191 __asm {
1192 mov eax, [esp + 4] /* src_argb */
1193 mov edx, [esp + 8] /* dst_y */
1194 mov ecx, [esp + 12] /* width */
1195 vbroadcastf128 ymm4, xmmword ptr kARGBToY
1196 vbroadcastf128 ymm5, xmmword ptr kAddY16
1197 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1198
1199 convertloop:
1200 vmovdqu ymm0, [eax]
1201 vmovdqu ymm1, [eax + 32]
1202 vmovdqu ymm2, [eax + 64]
1203 vmovdqu ymm3, [eax + 96]
1204 vpmaddubsw ymm0, ymm0, ymm4
1205 vpmaddubsw ymm1, ymm1, ymm4
1206 vpmaddubsw ymm2, ymm2, ymm4
1207 vpmaddubsw ymm3, ymm3, ymm4
1208 lea eax, [eax + 128]
1209 vphaddw ymm0, ymm0, ymm1 // mutates.
1210 vphaddw ymm2, ymm2, ymm3
1211 vpsrlw ymm0, ymm0, 7
1212 vpsrlw ymm2, ymm2, 7
1213 vpackuswb ymm0, ymm0, ymm2 // mutates.
1214 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1215 vpaddb ymm0, ymm0, ymm5 // add 16 for Y
1216 vmovdqu [edx], ymm0
1217 lea edx, [edx + 32]
1218 sub ecx, 32
1219 jg convertloop
1220 vzeroupper
1221 ret
1222 }
1223 }
1224 #endif // HAS_ARGBTOYROW_AVX2
1225
1226 #ifdef HAS_ARGBTOYJROW_AVX2
1227 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1228 __declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
1229 uint8_t* dst_y,
1230 int width) {
1231 __asm {
1232 mov eax, [esp + 4] /* src_argb */
1233 mov edx, [esp + 8] /* dst_y */
1234 mov ecx, [esp + 12] /* width */
1235 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1236 vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1237 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1238
1239 convertloop:
1240 vmovdqu ymm0, [eax]
1241 vmovdqu ymm1, [eax + 32]
1242 vmovdqu ymm2, [eax + 64]
1243 vmovdqu ymm3, [eax + 96]
1244 vpmaddubsw ymm0, ymm0, ymm4
1245 vpmaddubsw ymm1, ymm1, ymm4
1246 vpmaddubsw ymm2, ymm2, ymm4
1247 vpmaddubsw ymm3, ymm3, ymm4
1248 lea eax, [eax + 128]
1249 vphaddw ymm0, ymm0, ymm1 // mutates.
1250 vphaddw ymm2, ymm2, ymm3
1251 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
1252 vpaddw ymm2, ymm2, ymm5
1253 vpsrlw ymm0, ymm0, 7
1254 vpsrlw ymm2, ymm2, 7
1255 vpackuswb ymm0, ymm0, ymm2 // mutates.
1256 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1257 vmovdqu [edx], ymm0
1258 lea edx, [edx + 32]
1259 sub ecx, 32
1260 jg convertloop
1261
1262 vzeroupper
1263 ret
1264 }
1265 }
1266 #endif // HAS_ARGBTOYJROW_AVX2
1267
1268 __declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
1269 uint8_t* dst_y,
1270 int width) {
1271 __asm {
1272 mov eax, [esp + 4] /* src_argb */
1273 mov edx, [esp + 8] /* dst_y */
1274 mov ecx, [esp + 12] /* width */
1275 movdqa xmm4, xmmword ptr kBGRAToY
1276 movdqa xmm5, xmmword ptr kAddY16
1277
1278 convertloop:
1279 movdqu xmm0, [eax]
1280 movdqu xmm1, [eax + 16]
1281 movdqu xmm2, [eax + 32]
1282 movdqu xmm3, [eax + 48]
1283 pmaddubsw xmm0, xmm4
1284 pmaddubsw xmm1, xmm4
1285 pmaddubsw xmm2, xmm4
1286 pmaddubsw xmm3, xmm4
1287 lea eax, [eax + 64]
1288 phaddw xmm0, xmm1
1289 phaddw xmm2, xmm3
1290 psrlw xmm0, 7
1291 psrlw xmm2, 7
1292 packuswb xmm0, xmm2
1293 paddb xmm0, xmm5
1294 movdqu [edx], xmm0
1295 lea edx, [edx + 16]
1296 sub ecx, 16
1297 jg convertloop
1298 ret
1299 }
1300 }
1301
1302 __declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
1303 uint8_t* dst_y,
1304 int width) {
1305 __asm {
1306 mov eax, [esp + 4] /* src_argb */
1307 mov edx, [esp + 8] /* dst_y */
1308 mov ecx, [esp + 12] /* width */
1309 movdqa xmm4, xmmword ptr kABGRToY
1310 movdqa xmm5, xmmword ptr kAddY16
1311
1312 convertloop:
1313 movdqu xmm0, [eax]
1314 movdqu xmm1, [eax + 16]
1315 movdqu xmm2, [eax + 32]
1316 movdqu xmm3, [eax + 48]
1317 pmaddubsw xmm0, xmm4
1318 pmaddubsw xmm1, xmm4
1319 pmaddubsw xmm2, xmm4
1320 pmaddubsw xmm3, xmm4
1321 lea eax, [eax + 64]
1322 phaddw xmm0, xmm1
1323 phaddw xmm2, xmm3
1324 psrlw xmm0, 7
1325 psrlw xmm2, 7
1326 packuswb xmm0, xmm2
1327 paddb xmm0, xmm5
1328 movdqu [edx], xmm0
1329 lea edx, [edx + 16]
1330 sub ecx, 16
1331 jg convertloop
1332 ret
1333 }
1334 }
1335
1336 __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
1337 uint8_t* dst_y,
1338 int width) {
1339 __asm {
1340 mov eax, [esp + 4] /* src_argb */
1341 mov edx, [esp + 8] /* dst_y */
1342 mov ecx, [esp + 12] /* width */
1343 movdqa xmm4, xmmword ptr kRGBAToY
1344 movdqa xmm5, xmmword ptr kAddY16
1345
1346 convertloop:
1347 movdqu xmm0, [eax]
1348 movdqu xmm1, [eax + 16]
1349 movdqu xmm2, [eax + 32]
1350 movdqu xmm3, [eax + 48]
1351 pmaddubsw xmm0, xmm4
1352 pmaddubsw xmm1, xmm4
1353 pmaddubsw xmm2, xmm4
1354 pmaddubsw xmm3, xmm4
1355 lea eax, [eax + 64]
1356 phaddw xmm0, xmm1
1357 phaddw xmm2, xmm3
1358 psrlw xmm0, 7
1359 psrlw xmm2, 7
1360 packuswb xmm0, xmm2
1361 paddb xmm0, xmm5
1362 movdqu [edx], xmm0
1363 lea edx, [edx + 16]
1364 sub ecx, 16
1365 jg convertloop
1366 ret
1367 }
1368 }
1369
1370 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
1371 int src_stride_argb,
1372 uint8_t* dst_u,
1373 uint8_t* dst_v,
1374 int width) {
1375 __asm {
1376 push esi
1377 push edi
1378 mov eax, [esp + 8 + 4] // src_argb
1379 mov esi, [esp + 8 + 8] // src_stride_argb
1380 mov edx, [esp + 8 + 12] // dst_u
1381 mov edi, [esp + 8 + 16] // dst_v
1382 mov ecx, [esp + 8 + 20] // width
1383 movdqa xmm5, xmmword ptr kAddUV128
1384 movdqa xmm6, xmmword ptr kARGBToV
1385 movdqa xmm7, xmmword ptr kARGBToU
1386 sub edi, edx // stride from u to v
1387
1388 convertloop:
1389 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1390 movdqu xmm0, [eax]
1391 movdqu xmm4, [eax + esi]
1392 pavgb xmm0, xmm4
1393 movdqu xmm1, [eax + 16]
1394 movdqu xmm4, [eax + esi + 16]
1395 pavgb xmm1, xmm4
1396 movdqu xmm2, [eax + 32]
1397 movdqu xmm4, [eax + esi + 32]
1398 pavgb xmm2, xmm4
1399 movdqu xmm3, [eax + 48]
1400 movdqu xmm4, [eax + esi + 48]
1401 pavgb xmm3, xmm4
1402
1403 lea eax, [eax + 64]
1404 movdqa xmm4, xmm0
1405 shufps xmm0, xmm1, 0x88
1406 shufps xmm4, xmm1, 0xdd
1407 pavgb xmm0, xmm4
1408 movdqa xmm4, xmm2
1409 shufps xmm2, xmm3, 0x88
1410 shufps xmm4, xmm3, 0xdd
1411 pavgb xmm2, xmm4
1412
1413 // step 2 - convert to U and V
1414 // from here down is very similar to Y code except
1415 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1416 movdqa xmm1, xmm0
1417 movdqa xmm3, xmm2
1418 pmaddubsw xmm0, xmm7 // U
1419 pmaddubsw xmm2, xmm7
1420 pmaddubsw xmm1, xmm6 // V
1421 pmaddubsw xmm3, xmm6
1422 phaddw xmm0, xmm2
1423 phaddw xmm1, xmm3
1424 psraw xmm0, 8
1425 psraw xmm1, 8
1426 packsswb xmm0, xmm1
1427 paddb xmm0, xmm5 // -> unsigned
1428
1429 // step 3 - store 8 U and 8 V values
1430 movlps qword ptr [edx], xmm0 // U
1431 movhps qword ptr [edx + edi], xmm0 // V
1432 lea edx, [edx + 8]
1433 sub ecx, 16
1434 jg convertloop
1435
1436 pop edi
1437 pop esi
1438 ret
1439 }
1440 }
1441
1442 __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
1443 int src_stride_argb,
1444 uint8_t* dst_u,
1445 uint8_t* dst_v,
1446 int width) {
1447 __asm {
1448 push esi
1449 push edi
1450 mov eax, [esp + 8 + 4] // src_argb
1451 mov esi, [esp + 8 + 8] // src_stride_argb
1452 mov edx, [esp + 8 + 12] // dst_u
1453 mov edi, [esp + 8 + 16] // dst_v
1454 mov ecx, [esp + 8 + 20] // width
1455 movdqa xmm5, xmmword ptr kAddUVJ128
1456 movdqa xmm6, xmmword ptr kARGBToVJ
1457 movdqa xmm7, xmmword ptr kARGBToUJ
1458 sub edi, edx // stride from u to v
1459
1460 convertloop:
1461 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1462 movdqu xmm0, [eax]
1463 movdqu xmm4, [eax + esi]
1464 pavgb xmm0, xmm4
1465 movdqu xmm1, [eax + 16]
1466 movdqu xmm4, [eax + esi + 16]
1467 pavgb xmm1, xmm4
1468 movdqu xmm2, [eax + 32]
1469 movdqu xmm4, [eax + esi + 32]
1470 pavgb xmm2, xmm4
1471 movdqu xmm3, [eax + 48]
1472 movdqu xmm4, [eax + esi + 48]
1473 pavgb xmm3, xmm4
1474
1475 lea eax, [eax + 64]
1476 movdqa xmm4, xmm0
1477 shufps xmm0, xmm1, 0x88
1478 shufps xmm4, xmm1, 0xdd
1479 pavgb xmm0, xmm4
1480 movdqa xmm4, xmm2
1481 shufps xmm2, xmm3, 0x88
1482 shufps xmm4, xmm3, 0xdd
1483 pavgb xmm2, xmm4
1484
1485 // step 2 - convert to U and V
1486 // from here down is very similar to Y code except
1487 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1488 movdqa xmm1, xmm0
1489 movdqa xmm3, xmm2
1490 pmaddubsw xmm0, xmm7 // U
1491 pmaddubsw xmm2, xmm7
1492 pmaddubsw xmm1, xmm6 // V
1493 pmaddubsw xmm3, xmm6
1494 phaddw xmm0, xmm2
1495 phaddw xmm1, xmm3
1496 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1497 paddw xmm1, xmm5
1498 psraw xmm0, 8
1499 psraw xmm1, 8
1500 packsswb xmm0, xmm1
1501
1502 // step 3 - store 8 U and 8 V values
1503 movlps qword ptr [edx], xmm0 // U
1504 movhps qword ptr [edx + edi], xmm0 // V
1505 lea edx, [edx + 8]
1506 sub ecx, 16
1507 jg convertloop
1508
1509 pop edi
1510 pop esi
1511 ret
1512 }
1513 }
1514
1515 #ifdef HAS_ARGBTOUVROW_AVX2
1516 __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
1517 int src_stride_argb,
1518 uint8_t* dst_u,
1519 uint8_t* dst_v,
1520 int width) {
1521 __asm {
1522 push esi
1523 push edi
1524 mov eax, [esp + 8 + 4] // src_argb
1525 mov esi, [esp + 8 + 8] // src_stride_argb
1526 mov edx, [esp + 8 + 12] // dst_u
1527 mov edi, [esp + 8 + 16] // dst_v
1528 mov ecx, [esp + 8 + 20] // width
1529 vbroadcastf128 ymm5, xmmword ptr kAddUV128
1530 vbroadcastf128 ymm6, xmmword ptr kARGBToV
1531 vbroadcastf128 ymm7, xmmword ptr kARGBToU
1532 sub edi, edx // stride from u to v
1533
1534 convertloop:
1535 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1536 vmovdqu ymm0, [eax]
1537 vmovdqu ymm1, [eax + 32]
1538 vmovdqu ymm2, [eax + 64]
1539 vmovdqu ymm3, [eax + 96]
1540 vpavgb ymm0, ymm0, [eax + esi]
1541 vpavgb ymm1, ymm1, [eax + esi + 32]
1542 vpavgb ymm2, ymm2, [eax + esi + 64]
1543 vpavgb ymm3, ymm3, [eax + esi + 96]
1544 lea eax, [eax + 128]
1545 vshufps ymm4, ymm0, ymm1, 0x88
1546 vshufps ymm0, ymm0, ymm1, 0xdd
1547 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1548 vshufps ymm4, ymm2, ymm3, 0x88
1549 vshufps ymm2, ymm2, ymm3, 0xdd
1550 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1551
1552 // step 2 - convert to U and V
1553 // from here down is very similar to Y code except
1554 // instead of 32 different pixels, its 16 pixels of U and 16 of V
1555 vpmaddubsw ymm1, ymm0, ymm7 // U
1556 vpmaddubsw ymm3, ymm2, ymm7
1557 vpmaddubsw ymm0, ymm0, ymm6 // V
1558 vpmaddubsw ymm2, ymm2, ymm6
1559 vphaddw ymm1, ymm1, ymm3 // mutates
1560 vphaddw ymm0, ymm0, ymm2
1561 vpsraw ymm1, ymm1, 8
1562 vpsraw ymm0, ymm0, 8
1563 vpacksswb ymm0, ymm1, ymm0 // mutates
1564 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1565 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
1566 vpaddb ymm0, ymm0, ymm5 // -> unsigned
1567
1568 // step 3 - store 16 U and 16 V values
1569 vextractf128 [edx], ymm0, 0 // U
1570 vextractf128 [edx + edi], ymm0, 1 // V
1571 lea edx, [edx + 16]
1572 sub ecx, 32
1573 jg convertloop
1574
1575 pop edi
1576 pop esi
1577 vzeroupper
1578 ret
1579 }
1580 }
1581 #endif // HAS_ARGBTOUVROW_AVX2
1582
1583 #ifdef HAS_ARGBTOUVJROW_AVX2
1584 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
1585 int src_stride_argb,
1586 uint8_t* dst_u,
1587 uint8_t* dst_v,
1588 int width) {
1589 __asm {
1590 push esi
1591 push edi
1592 mov eax, [esp + 8 + 4] // src_argb
1593 mov esi, [esp + 8 + 8] // src_stride_argb
1594 mov edx, [esp + 8 + 12] // dst_u
1595 mov edi, [esp + 8 + 16] // dst_v
1596 mov ecx, [esp + 8 + 20] // width
1597 vbroadcastf128 ymm5, xmmword ptr kAddUV128
1598 vbroadcastf128 ymm6, xmmword ptr kARGBToV
1599 vbroadcastf128 ymm7, xmmword ptr kARGBToU
1600 sub edi, edx // stride from u to v
1601
1602 convertloop:
1603 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1604 vmovdqu ymm0, [eax]
1605 vmovdqu ymm1, [eax + 32]
1606 vmovdqu ymm2, [eax + 64]
1607 vmovdqu ymm3, [eax + 96]
1608 vpavgb ymm0, ymm0, [eax + esi]
1609 vpavgb ymm1, ymm1, [eax + esi + 32]
1610 vpavgb ymm2, ymm2, [eax + esi + 64]
1611 vpavgb ymm3, ymm3, [eax + esi + 96]
1612 lea eax, [eax + 128]
1613 vshufps ymm4, ymm0, ymm1, 0x88
1614 vshufps ymm0, ymm0, ymm1, 0xdd
1615 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1616 vshufps ymm4, ymm2, ymm3, 0x88
1617 vshufps ymm2, ymm2, ymm3, 0xdd
1618 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1619
1620 // step 2 - convert to U and V
1621 // from here down is very similar to Y code except
1622 // instead of 32 different pixels, its 16 pixels of U and 16 of V
1623 vpmaddubsw ymm1, ymm0, ymm7 // U
1624 vpmaddubsw ymm3, ymm2, ymm7
1625 vpmaddubsw ymm0, ymm0, ymm6 // V
1626 vpmaddubsw ymm2, ymm2, ymm6
1627 vphaddw ymm1, ymm1, ymm3 // mutates
1628 vphaddw ymm0, ymm0, ymm2
1629 vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
1630 vpaddw ymm0, ymm0, ymm5
1631 vpsraw ymm1, ymm1, 8
1632 vpsraw ymm0, ymm0, 8
1633 vpacksswb ymm0, ymm1, ymm0 // mutates
1634 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1635 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
1636
1637 // step 3 - store 16 U and 16 V values
1638 vextractf128 [edx], ymm0, 0 // U
1639 vextractf128 [edx + edi], ymm0, 1 // V
1640 lea edx, [edx + 16]
1641 sub ecx, 32
1642 jg convertloop
1643
1644 pop edi
1645 pop esi
1646 vzeroupper
1647 ret
1648 }
1649 }
1650 #endif // HAS_ARGBTOUVJROW_AVX2
1651
1652 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
1653 uint8_t* dst_u,
1654 uint8_t* dst_v,
1655 int width) {
1656 __asm {
1657 push edi
1658 mov eax, [esp + 4 + 4] // src_argb
1659 mov edx, [esp + 4 + 8] // dst_u
1660 mov edi, [esp + 4 + 12] // dst_v
1661 mov ecx, [esp + 4 + 16] // width
1662 movdqa xmm5, xmmword ptr kAddUV128
1663 movdqa xmm6, xmmword ptr kARGBToV
1664 movdqa xmm7, xmmword ptr kARGBToU
1665 sub edi, edx // stride from u to v
1666
1667 convertloop:
1668 /* convert to U and V */
1669 movdqu xmm0, [eax] // U
1670 movdqu xmm1, [eax + 16]
1671 movdqu xmm2, [eax + 32]
1672 movdqu xmm3, [eax + 48]
1673 pmaddubsw xmm0, xmm7
1674 pmaddubsw xmm1, xmm7
1675 pmaddubsw xmm2, xmm7
1676 pmaddubsw xmm3, xmm7
1677 phaddw xmm0, xmm1
1678 phaddw xmm2, xmm3
1679 psraw xmm0, 8
1680 psraw xmm2, 8
1681 packsswb xmm0, xmm2
1682 paddb xmm0, xmm5
1683 movdqu [edx], xmm0
1684
1685 movdqu xmm0, [eax] // V
1686 movdqu xmm1, [eax + 16]
1687 movdqu xmm2, [eax + 32]
1688 movdqu xmm3, [eax + 48]
1689 pmaddubsw xmm0, xmm6
1690 pmaddubsw xmm1, xmm6
1691 pmaddubsw xmm2, xmm6
1692 pmaddubsw xmm3, xmm6
1693 phaddw xmm0, xmm1
1694 phaddw xmm2, xmm3
1695 psraw xmm0, 8
1696 psraw xmm2, 8
1697 packsswb xmm0, xmm2
1698 paddb xmm0, xmm5
1699 lea eax, [eax + 64]
1700 movdqu [edx + edi], xmm0
1701 lea edx, [edx + 16]
1702 sub ecx, 16
1703 jg convertloop
1704
1705 pop edi
1706 ret
1707 }
1708 }
1709
1710 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
1711 int src_stride_argb,
1712 uint8_t* dst_u,
1713 uint8_t* dst_v,
1714 int width) {
1715 __asm {
1716 push esi
1717 push edi
1718 mov eax, [esp + 8 + 4] // src_argb
1719 mov esi, [esp + 8 + 8] // src_stride_argb
1720 mov edx, [esp + 8 + 12] // dst_u
1721 mov edi, [esp + 8 + 16] // dst_v
1722 mov ecx, [esp + 8 + 20] // width
1723 movdqa xmm5, xmmword ptr kAddUV128
1724 movdqa xmm6, xmmword ptr kBGRAToV
1725 movdqa xmm7, xmmword ptr kBGRAToU
1726 sub edi, edx // stride from u to v
1727
1728 convertloop:
1729 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1730 movdqu xmm0, [eax]
1731 movdqu xmm4, [eax + esi]
1732 pavgb xmm0, xmm4
1733 movdqu xmm1, [eax + 16]
1734 movdqu xmm4, [eax + esi + 16]
1735 pavgb xmm1, xmm4
1736 movdqu xmm2, [eax + 32]
1737 movdqu xmm4, [eax + esi + 32]
1738 pavgb xmm2, xmm4
1739 movdqu xmm3, [eax + 48]
1740 movdqu xmm4, [eax + esi + 48]
1741 pavgb xmm3, xmm4
1742
1743 lea eax, [eax + 64]
1744 movdqa xmm4, xmm0
1745 shufps xmm0, xmm1, 0x88
1746 shufps xmm4, xmm1, 0xdd
1747 pavgb xmm0, xmm4
1748 movdqa xmm4, xmm2
1749 shufps xmm2, xmm3, 0x88
1750 shufps xmm4, xmm3, 0xdd
1751 pavgb xmm2, xmm4
1752
1753 // step 2 - convert to U and V
1754 // from here down is very similar to Y code except
1755 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1756 movdqa xmm1, xmm0
1757 movdqa xmm3, xmm2
1758 pmaddubsw xmm0, xmm7 // U
1759 pmaddubsw xmm2, xmm7
1760 pmaddubsw xmm1, xmm6 // V
1761 pmaddubsw xmm3, xmm6
1762 phaddw xmm0, xmm2
1763 phaddw xmm1, xmm3
1764 psraw xmm0, 8
1765 psraw xmm1, 8
1766 packsswb xmm0, xmm1
1767 paddb xmm0, xmm5 // -> unsigned
1768
1769 // step 3 - store 8 U and 8 V values
1770 movlps qword ptr [edx], xmm0 // U
1771 movhps qword ptr [edx + edi], xmm0 // V
1772 lea edx, [edx + 8]
1773 sub ecx, 16
1774 jg convertloop
1775
1776 pop edi
1777 pop esi
1778 ret
1779 }
1780 }
1781
1782 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
1783 int src_stride_argb,
1784 uint8_t* dst_u,
1785 uint8_t* dst_v,
1786 int width) {
1787 __asm {
1788 push esi
1789 push edi
1790 mov eax, [esp + 8 + 4] // src_argb
1791 mov esi, [esp + 8 + 8] // src_stride_argb
1792 mov edx, [esp + 8 + 12] // dst_u
1793 mov edi, [esp + 8 + 16] // dst_v
1794 mov ecx, [esp + 8 + 20] // width
1795 movdqa xmm5, xmmword ptr kAddUV128
1796 movdqa xmm6, xmmword ptr kABGRToV
1797 movdqa xmm7, xmmword ptr kABGRToU
1798 sub edi, edx // stride from u to v
1799
1800 convertloop:
1801 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1802 movdqu xmm0, [eax]
1803 movdqu xmm4, [eax + esi]
1804 pavgb xmm0, xmm4
1805 movdqu xmm1, [eax + 16]
1806 movdqu xmm4, [eax + esi + 16]
1807 pavgb xmm1, xmm4
1808 movdqu xmm2, [eax + 32]
1809 movdqu xmm4, [eax + esi + 32]
1810 pavgb xmm2, xmm4
1811 movdqu xmm3, [eax + 48]
1812 movdqu xmm4, [eax + esi + 48]
1813 pavgb xmm3, xmm4
1814
1815 lea eax, [eax + 64]
1816 movdqa xmm4, xmm0
1817 shufps xmm0, xmm1, 0x88
1818 shufps xmm4, xmm1, 0xdd
1819 pavgb xmm0, xmm4
1820 movdqa xmm4, xmm2
1821 shufps xmm2, xmm3, 0x88
1822 shufps xmm4, xmm3, 0xdd
1823 pavgb xmm2, xmm4
1824
1825 // step 2 - convert to U and V
1826 // from here down is very similar to Y code except
1827 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1828 movdqa xmm1, xmm0
1829 movdqa xmm3, xmm2
1830 pmaddubsw xmm0, xmm7 // U
1831 pmaddubsw xmm2, xmm7
1832 pmaddubsw xmm1, xmm6 // V
1833 pmaddubsw xmm3, xmm6
1834 phaddw xmm0, xmm2
1835 phaddw xmm1, xmm3
1836 psraw xmm0, 8
1837 psraw xmm1, 8
1838 packsswb xmm0, xmm1
1839 paddb xmm0, xmm5 // -> unsigned
1840
1841 // step 3 - store 8 U and 8 V values
1842 movlps qword ptr [edx], xmm0 // U
1843 movhps qword ptr [edx + edi], xmm0 // V
1844 lea edx, [edx + 8]
1845 sub ecx, 16
1846 jg convertloop
1847
1848 pop edi
1849 pop esi
1850 ret
1851 }
1852 }
1853
1854 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
1855 int src_stride_argb,
1856 uint8_t* dst_u,
1857 uint8_t* dst_v,
1858 int width) {
1859 __asm {
1860 push esi
1861 push edi
1862 mov eax, [esp + 8 + 4] // src_argb
1863 mov esi, [esp + 8 + 8] // src_stride_argb
1864 mov edx, [esp + 8 + 12] // dst_u
1865 mov edi, [esp + 8 + 16] // dst_v
1866 mov ecx, [esp + 8 + 20] // width
1867 movdqa xmm5, xmmword ptr kAddUV128
1868 movdqa xmm6, xmmword ptr kRGBAToV
1869 movdqa xmm7, xmmword ptr kRGBAToU
1870 sub edi, edx // stride from u to v
1871
1872 convertloop:
1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874 movdqu xmm0, [eax]
1875 movdqu xmm4, [eax + esi]
1876 pavgb xmm0, xmm4
1877 movdqu xmm1, [eax + 16]
1878 movdqu xmm4, [eax + esi + 16]
1879 pavgb xmm1, xmm4
1880 movdqu xmm2, [eax + 32]
1881 movdqu xmm4, [eax + esi + 32]
1882 pavgb xmm2, xmm4
1883 movdqu xmm3, [eax + 48]
1884 movdqu xmm4, [eax + esi + 48]
1885 pavgb xmm3, xmm4
1886
1887 lea eax, [eax + 64]
1888 movdqa xmm4, xmm0
1889 shufps xmm0, xmm1, 0x88
1890 shufps xmm4, xmm1, 0xdd
1891 pavgb xmm0, xmm4
1892 movdqa xmm4, xmm2
1893 shufps xmm2, xmm3, 0x88
1894 shufps xmm4, xmm3, 0xdd
1895 pavgb xmm2, xmm4
1896
1897 // step 2 - convert to U and V
1898 // from here down is very similar to Y code except
1899 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1900 movdqa xmm1, xmm0
1901 movdqa xmm3, xmm2
1902 pmaddubsw xmm0, xmm7 // U
1903 pmaddubsw xmm2, xmm7
1904 pmaddubsw xmm1, xmm6 // V
1905 pmaddubsw xmm3, xmm6
1906 phaddw xmm0, xmm2
1907 phaddw xmm1, xmm3
1908 psraw xmm0, 8
1909 psraw xmm1, 8
1910 packsswb xmm0, xmm1
1911 paddb xmm0, xmm5 // -> unsigned
1912
1913 // step 3 - store 8 U and 8 V values
1914 movlps qword ptr [edx], xmm0 // U
1915 movhps qword ptr [edx + edi], xmm0 // V
1916 lea edx, [edx + 8]
1917 sub ecx, 16
1918 jg convertloop
1919
1920 pop edi
1921 pop esi
1922 ret
1923 }
1924 }
1925 #endif // HAS_ARGBTOYROW_SSSE3
1926
1927 // Read 16 UV from 444
1928 #define READYUV444_AVX2 \
1929 __asm { \
1930 __asm vmovdqu xmm0, [esi] /* U */ \
1931 __asm vmovdqu xmm1, [esi + edi] /* V */ \
1932 __asm lea esi, [esi + 16] \
1933 __asm vpermq ymm0, ymm0, 0xd8 \
1934 __asm vpermq ymm1, ymm1, 0xd8 \
1935 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1936 __asm vmovdqu xmm4, [eax] /* Y */ \
1937 __asm vpermq ymm4, ymm4, 0xd8 \
1938 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1939 __asm lea eax, [eax + 16]}
1940
1941 // Read 8 UV from 422, upsample to 16 UV.
1942 #define READYUV422_AVX2 \
1943 __asm { \
1944 __asm vmovq xmm0, qword ptr [esi] /* U */ \
1945 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
1946 __asm lea esi, [esi + 8] \
1947 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1948 __asm vpermq ymm0, ymm0, 0xd8 \
1949 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1950 __asm vmovdqu xmm4, [eax] /* Y */ \
1951 __asm vpermq ymm4, ymm4, 0xd8 \
1952 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1953 __asm lea eax, [eax + 16]}
1954
1955 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
1956 #define READYUVA422_AVX2 \
1957 __asm { \
1958 __asm vmovq xmm0, qword ptr [esi] /* U */ \
1959 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
1960 __asm lea esi, [esi + 8] \
1961 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1962 __asm vpermq ymm0, ymm0, 0xd8 \
1963 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1964 __asm vmovdqu xmm4, [eax] /* Y */ \
1965 __asm vpermq ymm4, ymm4, 0xd8 \
1966 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1967 __asm lea eax, [eax + 16] \
1968 __asm vmovdqu xmm5, [ebp] /* A */ \
1969 __asm vpermq ymm5, ymm5, 0xd8 \
1970 __asm lea ebp, [ebp + 16]}
1971
1972 // Read 8 UV from NV12, upsample to 16 UV.
1973 #define READNV12_AVX2 \
1974 __asm { \
1975 __asm vmovdqu xmm0, [esi] /* UV */ \
1976 __asm lea esi, [esi + 16] \
1977 __asm vpermq ymm0, ymm0, 0xd8 \
1978 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1979 __asm vmovdqu xmm4, [eax] /* Y */ \
1980 __asm vpermq ymm4, ymm4, 0xd8 \
1981 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1982 __asm lea eax, [eax + 16]}
1983
1984 // Read 8 UV from NV21, upsample to 16 UV.
1985 #define READNV21_AVX2 \
1986 __asm { \
1987 __asm vmovdqu xmm0, [esi] /* UV */ \
1988 __asm lea esi, [esi + 16] \
1989 __asm vpermq ymm0, ymm0, 0xd8 \
1990 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
1991 __asm vmovdqu xmm4, [eax] /* Y */ \
1992 __asm vpermq ymm4, ymm4, 0xd8 \
1993 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1994 __asm lea eax, [eax + 16]}
1995
1996 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
1997 #define READYUY2_AVX2 \
1998 __asm { \
1999 __asm vmovdqu ymm4, [eax] /* YUY2 */ \
2000 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
2001 __asm vmovdqu ymm0, [eax] /* UV */ \
2002 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
2003 __asm lea eax, [eax + 32]}
2004
2005 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2006 #define READUYVY_AVX2 \
2007 __asm { \
2008 __asm vmovdqu ymm4, [eax] /* UYVY */ \
2009 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
2010 __asm vmovdqu ymm0, [eax] /* UV */ \
2011 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
2012 __asm lea eax, [eax + 32]}
2013
2014 // Convert 16 pixels: 16 UV and 16 Y.
2015 #define YUVTORGB_AVX2(YuvConstants) \
2016 __asm { \
2017 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
2018 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
2019 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
2020 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
2021 __asm vpsubw ymm2, ymm3, ymm2 \
2022 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
2023 __asm vpsubw ymm1, ymm3, ymm1 \
2024 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
2025 __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
2026 __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
2027 __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
2028 __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
2029 __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
2030 __asm vpsraw ymm0, ymm0, 6 \
2031 __asm vpsraw ymm1, ymm1, 6 \
2032 __asm vpsraw ymm2, ymm2, 6 \
2033 __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
2034 __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
2035 __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
2036 }
2037
2038 // Store 16 ARGB values.
2039 #define STOREARGB_AVX2 \
2040 __asm { \
2041 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
2042 __asm vpermq ymm0, ymm0, 0xd8 \
2043 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
2044 __asm vpermq ymm2, ymm2, 0xd8 \
2045 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
2046 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
2047 __asm vmovdqu 0[edx], ymm1 \
2048 __asm vmovdqu 32[edx], ymm0 \
2049 __asm lea edx, [edx + 64]}
2050
2051 // Store 16 RGBA values.
2052 #define STORERGBA_AVX2 \
2053 __asm { \
2054 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
2055 __asm vpermq ymm1, ymm1, 0xd8 \
2056 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
2057 __asm vpermq ymm2, ymm2, 0xd8 \
2058 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
2059 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
2060 __asm vmovdqu [edx], ymm0 \
2061 __asm vmovdqu [edx + 32], ymm1 \
2062 __asm lea edx, [edx + 64]}
2063
2064 #ifdef HAS_I422TOARGBROW_AVX2
2065 // 16 pixels
2066 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2067 __declspec(naked) void I422ToARGBRow_AVX2(
2068 const uint8_t* y_buf,
2069 const uint8_t* u_buf,
2070 const uint8_t* v_buf,
2071 uint8_t* dst_argb,
2072 const struct YuvConstants* yuvconstants,
2073 int width) {
2074 __asm {
2075 push esi
2076 push edi
2077 push ebx
2078 mov eax, [esp + 12 + 4] // Y
2079 mov esi, [esp + 12 + 8] // U
2080 mov edi, [esp + 12 + 12] // V
2081 mov edx, [esp + 12 + 16] // argb
2082 mov ebx, [esp + 12 + 20] // yuvconstants
2083 mov ecx, [esp + 12 + 24] // width
2084 sub edi, esi
2085 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2086
2087 convertloop:
2088 READYUV422_AVX2
2089 YUVTORGB_AVX2(ebx)
2090 STOREARGB_AVX2
2091
2092 sub ecx, 16
2093 jg convertloop
2094
2095 pop ebx
2096 pop edi
2097 pop esi
2098 vzeroupper
2099 ret
2100 }
2101 }
2102 #endif // HAS_I422TOARGBROW_AVX2
2103
2104 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2105 // 16 pixels
2106 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2107 __declspec(naked) void I422AlphaToARGBRow_AVX2(
2108 const uint8_t* y_buf,
2109 const uint8_t* u_buf,
2110 const uint8_t* v_buf,
2111 const uint8_t* a_buf,
2112 uint8_t* dst_argb,
2113 const struct YuvConstants* yuvconstants,
2114 int width) {
2115 __asm {
2116 push esi
2117 push edi
2118 push ebx
2119 push ebp
2120 mov eax, [esp + 16 + 4] // Y
2121 mov esi, [esp + 16 + 8] // U
2122 mov edi, [esp + 16 + 12] // V
2123 mov ebp, [esp + 16 + 16] // A
2124 mov edx, [esp + 16 + 20] // argb
2125 mov ebx, [esp + 16 + 24] // yuvconstants
2126 mov ecx, [esp + 16 + 28] // width
2127 sub edi, esi
2128
2129 convertloop:
2130 READYUVA422_AVX2
2131 YUVTORGB_AVX2(ebx)
2132 STOREARGB_AVX2
2133
2134 sub ecx, 16
2135 jg convertloop
2136
2137 pop ebp
2138 pop ebx
2139 pop edi
2140 pop esi
2141 vzeroupper
2142 ret
2143 }
2144 }
2145 #endif // HAS_I422ALPHATOARGBROW_AVX2
2146
2147 #ifdef HAS_I444TOARGBROW_AVX2
2148 // 16 pixels
2149 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2150 __declspec(naked) void I444ToARGBRow_AVX2(
2151 const uint8_t* y_buf,
2152 const uint8_t* u_buf,
2153 const uint8_t* v_buf,
2154 uint8_t* dst_argb,
2155 const struct YuvConstants* yuvconstants,
2156 int width) {
2157 __asm {
2158 push esi
2159 push edi
2160 push ebx
2161 mov eax, [esp + 12 + 4] // Y
2162 mov esi, [esp + 12 + 8] // U
2163 mov edi, [esp + 12 + 12] // V
2164 mov edx, [esp + 12 + 16] // argb
2165 mov ebx, [esp + 12 + 20] // yuvconstants
2166 mov ecx, [esp + 12 + 24] // width
2167 sub edi, esi
2168 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2169 convertloop:
2170 READYUV444_AVX2
2171 YUVTORGB_AVX2(ebx)
2172 STOREARGB_AVX2
2173
2174 sub ecx, 16
2175 jg convertloop
2176
2177 pop ebx
2178 pop edi
2179 pop esi
2180 vzeroupper
2181 ret
2182 }
2183 }
2184 #endif // HAS_I444TOARGBROW_AVX2
2185
2186 #ifdef HAS_NV12TOARGBROW_AVX2
2187 // 16 pixels.
2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2189 __declspec(naked) void NV12ToARGBRow_AVX2(
2190 const uint8_t* y_buf,
2191 const uint8_t* uv_buf,
2192 uint8_t* dst_argb,
2193 const struct YuvConstants* yuvconstants,
2194 int width) {
2195 __asm {
2196 push esi
2197 push ebx
2198 mov eax, [esp + 8 + 4] // Y
2199 mov esi, [esp + 8 + 8] // UV
2200 mov edx, [esp + 8 + 12] // argb
2201 mov ebx, [esp + 8 + 16] // yuvconstants
2202 mov ecx, [esp + 8 + 20] // width
2203 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2204
2205 convertloop:
2206 READNV12_AVX2
2207 YUVTORGB_AVX2(ebx)
2208 STOREARGB_AVX2
2209
2210 sub ecx, 16
2211 jg convertloop
2212
2213 pop ebx
2214 pop esi
2215 vzeroupper
2216 ret
2217 }
2218 }
2219 #endif // HAS_NV12TOARGBROW_AVX2
2220
2221 #ifdef HAS_NV21TOARGBROW_AVX2
2222 // 16 pixels.
2223 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2224 __declspec(naked) void NV21ToARGBRow_AVX2(
2225 const uint8_t* y_buf,
2226 const uint8_t* vu_buf,
2227 uint8_t* dst_argb,
2228 const struct YuvConstants* yuvconstants,
2229 int width) {
2230 __asm {
2231 push esi
2232 push ebx
2233 mov eax, [esp + 8 + 4] // Y
2234 mov esi, [esp + 8 + 8] // VU
2235 mov edx, [esp + 8 + 12] // argb
2236 mov ebx, [esp + 8 + 16] // yuvconstants
2237 mov ecx, [esp + 8 + 20] // width
2238 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2239
2240 convertloop:
2241 READNV21_AVX2
2242 YUVTORGB_AVX2(ebx)
2243 STOREARGB_AVX2
2244
2245 sub ecx, 16
2246 jg convertloop
2247
2248 pop ebx
2249 pop esi
2250 vzeroupper
2251 ret
2252 }
2253 }
2254 #endif // HAS_NV21TOARGBROW_AVX2
2255
2256 #ifdef HAS_YUY2TOARGBROW_AVX2
2257 // 16 pixels.
2258 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2259 __declspec(naked) void YUY2ToARGBRow_AVX2(
2260 const uint8_t* src_yuy2,
2261 uint8_t* dst_argb,
2262 const struct YuvConstants* yuvconstants,
2263 int width) {
2264 __asm {
2265 push ebx
2266 mov eax, [esp + 4 + 4] // yuy2
2267 mov edx, [esp + 4 + 8] // argb
2268 mov ebx, [esp + 4 + 12] // yuvconstants
2269 mov ecx, [esp + 4 + 16] // width
2270 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2271
2272 convertloop:
2273 READYUY2_AVX2
2274 YUVTORGB_AVX2(ebx)
2275 STOREARGB_AVX2
2276
2277 sub ecx, 16
2278 jg convertloop
2279
2280 pop ebx
2281 vzeroupper
2282 ret
2283 }
2284 }
2285 #endif // HAS_YUY2TOARGBROW_AVX2
2286
2287 #ifdef HAS_UYVYTOARGBROW_AVX2
2288 // 16 pixels.
2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2290 __declspec(naked) void UYVYToARGBRow_AVX2(
2291 const uint8_t* src_uyvy,
2292 uint8_t* dst_argb,
2293 const struct YuvConstants* yuvconstants,
2294 int width) {
2295 __asm {
2296 push ebx
2297 mov eax, [esp + 4 + 4] // uyvy
2298 mov edx, [esp + 4 + 8] // argb
2299 mov ebx, [esp + 4 + 12] // yuvconstants
2300 mov ecx, [esp + 4 + 16] // width
2301 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2302
2303 convertloop:
2304 READUYVY_AVX2
2305 YUVTORGB_AVX2(ebx)
2306 STOREARGB_AVX2
2307
2308 sub ecx, 16
2309 jg convertloop
2310
2311 pop ebx
2312 vzeroupper
2313 ret
2314 }
2315 }
2316 #endif // HAS_UYVYTOARGBROW_AVX2
2317
2318 #ifdef HAS_I422TORGBAROW_AVX2
2319 // 16 pixels
2320 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2321 __declspec(naked) void I422ToRGBARow_AVX2(
2322 const uint8_t* y_buf,
2323 const uint8_t* u_buf,
2324 const uint8_t* v_buf,
2325 uint8_t* dst_argb,
2326 const struct YuvConstants* yuvconstants,
2327 int width) {
2328 __asm {
2329 push esi
2330 push edi
2331 push ebx
2332 mov eax, [esp + 12 + 4] // Y
2333 mov esi, [esp + 12 + 8] // U
2334 mov edi, [esp + 12 + 12] // V
2335 mov edx, [esp + 12 + 16] // abgr
2336 mov ebx, [esp + 12 + 20] // yuvconstants
2337 mov ecx, [esp + 12 + 24] // width
2338 sub edi, esi
2339 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2340
2341 convertloop:
2342 READYUV422_AVX2
2343 YUVTORGB_AVX2(ebx)
2344 STORERGBA_AVX2
2345
2346 sub ecx, 16
2347 jg convertloop
2348
2349 pop ebx
2350 pop edi
2351 pop esi
2352 vzeroupper
2353 ret
2354 }
2355 }
2356 #endif // HAS_I422TORGBAROW_AVX2
2357
2358 #if defined(HAS_I422TOARGBROW_SSSE3)
2359 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2360 // Allows a conversion with half size scaling.
2361
2362 // Read 8 UV from 444.
2363 #define READYUV444 \
2364 __asm { \
2365 __asm movq xmm0, qword ptr [esi] /* U */ \
2366 __asm movq xmm1, qword ptr [esi + edi] /* V */ \
2367 __asm lea esi, [esi + 8] \
2368 __asm punpcklbw xmm0, xmm1 /* UV */ \
2369 __asm movq xmm4, qword ptr [eax] \
2370 __asm punpcklbw xmm4, xmm4 \
2371 __asm lea eax, [eax + 8]}
2372
2373 // Read 4 UV from 422, upsample to 8 UV.
2374 #define READYUV422 \
2375 __asm { \
2376 __asm movd xmm0, [esi] /* U */ \
2377 __asm movd xmm1, [esi + edi] /* V */ \
2378 __asm lea esi, [esi + 4] \
2379 __asm punpcklbw xmm0, xmm1 /* UV */ \
2380 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2381 __asm movq xmm4, qword ptr [eax] \
2382 __asm punpcklbw xmm4, xmm4 \
2383 __asm lea eax, [eax + 8]}
2384
2385 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
2386 #define READYUVA422 \
2387 __asm { \
2388 __asm movd xmm0, [esi] /* U */ \
2389 __asm movd xmm1, [esi + edi] /* V */ \
2390 __asm lea esi, [esi + 4] \
2391 __asm punpcklbw xmm0, xmm1 /* UV */ \
2392 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2393 __asm movq xmm4, qword ptr [eax] /* Y */ \
2394 __asm punpcklbw xmm4, xmm4 \
2395 __asm lea eax, [eax + 8] \
2396 __asm movq xmm5, qword ptr [ebp] /* A */ \
2397 __asm lea ebp, [ebp + 8]}
2398
2399 // Read 4 UV from NV12, upsample to 8 UV.
2400 #define READNV12 \
2401 __asm { \
2402 __asm movq xmm0, qword ptr [esi] /* UV */ \
2403 __asm lea esi, [esi + 8] \
2404 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2405 __asm movq xmm4, qword ptr [eax] \
2406 __asm punpcklbw xmm4, xmm4 \
2407 __asm lea eax, [eax + 8]}
2408
2409 // Read 4 VU from NV21, upsample to 8 UV.
2410 #define READNV21 \
2411 __asm { \
2412 __asm movq xmm0, qword ptr [esi] /* UV */ \
2413 __asm lea esi, [esi + 8] \
2414 __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
2415 __asm movq xmm4, qword ptr [eax] \
2416 __asm punpcklbw xmm4, xmm4 \
2417 __asm lea eax, [eax + 8]}
2418
2419 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2420 #define READYUY2 \
2421 __asm { \
2422 __asm movdqu xmm4, [eax] /* YUY2 */ \
2423 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
2424 __asm movdqu xmm0, [eax] /* UV */ \
2425 __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
2426 __asm lea eax, [eax + 16]}
2427
2428 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2429 #define READUYVY \
2430 __asm { \
2431 __asm movdqu xmm4, [eax] /* UYVY */ \
2432 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
2433 __asm movdqu xmm0, [eax] /* UV */ \
2434 __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
2435 __asm lea eax, [eax + 16]}
2436
2437 // Convert 8 pixels: 8 UV and 8 Y.
2438 #define YUVTORGB(YuvConstants) \
2439 __asm { \
2440 __asm movdqa xmm1, xmm0 \
2441 __asm movdqa xmm2, xmm0 \
2442 __asm movdqa xmm3, xmm0 \
2443 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
2444 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
2445 __asm psubw xmm0, xmm1 \
2446 __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
2447 __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
2448 __asm psubw xmm1, xmm2 \
2449 __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
2450 __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
2451 __asm psubw xmm2, xmm3 \
2452 __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
2453 __asm paddsw xmm0, xmm4 /* B += Y */ \
2454 __asm paddsw xmm1, xmm4 /* G += Y */ \
2455 __asm paddsw xmm2, xmm4 /* R += Y */ \
2456 __asm psraw xmm0, 6 \
2457 __asm psraw xmm1, 6 \
2458 __asm psraw xmm2, 6 \
2459 __asm packuswb xmm0, xmm0 /* B */ \
2460 __asm packuswb xmm1, xmm1 /* G */ \
2461 __asm packuswb xmm2, xmm2 /* R */ \
2462 }
2463
2464 // Store 8 ARGB values.
2465 #define STOREARGB \
2466 __asm { \
2467 __asm punpcklbw xmm0, xmm1 /* BG */ \
2468 __asm punpcklbw xmm2, xmm5 /* RA */ \
2469 __asm movdqa xmm1, xmm0 \
2470 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
2471 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
2472 __asm movdqu 0[edx], xmm0 \
2473 __asm movdqu 16[edx], xmm1 \
2474 __asm lea edx, [edx + 32]}
2475
2476 // Store 8 BGRA values.
2477 #define STOREBGRA \
2478 __asm { \
2479 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2480 __asm punpcklbw xmm1, xmm0 /* GB */ \
2481 __asm punpcklbw xmm5, xmm2 /* AR */ \
2482 __asm movdqa xmm0, xmm5 \
2483 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
2484 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
2485 __asm movdqu 0[edx], xmm5 \
2486 __asm movdqu 16[edx], xmm0 \
2487 __asm lea edx, [edx + 32]}
2488
2489 // Store 8 RGBA values.
2490 #define STORERGBA \
2491 __asm { \
2492 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2493 __asm punpcklbw xmm1, xmm2 /* GR */ \
2494 __asm punpcklbw xmm5, xmm0 /* AB */ \
2495 __asm movdqa xmm0, xmm5 \
2496 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
2497 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
2498 __asm movdqu 0[edx], xmm5 \
2499 __asm movdqu 16[edx], xmm0 \
2500 __asm lea edx, [edx + 32]}
2501
2502 // Store 8 RGB24 values.
2503 #define STORERGB24 \
2504 __asm {/* Weave into RRGB */ \
2505 __asm punpcklbw xmm0, xmm1 /* BG */ \
2506 __asm punpcklbw xmm2, xmm2 /* RR */ \
2507 __asm movdqa xmm1, xmm0 \
2508 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2509 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
2510 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2511 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2512 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2513 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2514 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2515 __asm lea edx, [edx + 24]}
2516
2517 // Store 8 RGB565 values.
2518 #define STORERGB565 \
2519 __asm {/* Weave into RRGB */ \
2520 __asm punpcklbw xmm0, xmm1 /* BG */ \
2521 __asm punpcklbw xmm2, xmm2 /* RR */ \
2522 __asm movdqa xmm1, xmm0 \
2523 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2524 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
2525 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
2526 __asm movdqa xmm2, xmm0 /* G */ \
2527 __asm pslld xmm0, 8 /* R */ \
2528 __asm psrld xmm3, 3 /* B */ \
2529 __asm psrld xmm2, 5 /* G */ \
2530 __asm psrad xmm0, 16 /* R */ \
2531 __asm pand xmm3, xmm5 /* B */ \
2532 __asm pand xmm2, xmm6 /* G */ \
2533 __asm pand xmm0, xmm7 /* R */ \
2534 __asm por xmm3, xmm2 /* BG */ \
2535 __asm por xmm0, xmm3 /* BGR */ \
2536 __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
2537 __asm movdqa xmm2, xmm1 /* G */ \
2538 __asm pslld xmm1, 8 /* R */ \
2539 __asm psrld xmm3, 3 /* B */ \
2540 __asm psrld xmm2, 5 /* G */ \
2541 __asm psrad xmm1, 16 /* R */ \
2542 __asm pand xmm3, xmm5 /* B */ \
2543 __asm pand xmm2, xmm6 /* G */ \
2544 __asm pand xmm1, xmm7 /* R */ \
2545 __asm por xmm3, xmm2 /* BG */ \
2546 __asm por xmm1, xmm3 /* BGR */ \
2547 __asm packssdw xmm0, xmm1 \
2548 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
2549 __asm lea edx, [edx + 16]}
2550
2551 // 8 pixels.
2552 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2553 __declspec(naked) void I444ToARGBRow_SSSE3(
2554 const uint8_t* y_buf,
2555 const uint8_t* u_buf,
2556 const uint8_t* v_buf,
2557 uint8_t* dst_argb,
2558 const struct YuvConstants* yuvconstants,
2559 int width) {
2560 __asm {
2561 push esi
2562 push edi
2563 push ebx
2564 mov eax, [esp + 12 + 4] // Y
2565 mov esi, [esp + 12 + 8] // U
2566 mov edi, [esp + 12 + 12] // V
2567 mov edx, [esp + 12 + 16] // argb
2568 mov ebx, [esp + 12 + 20] // yuvconstants
2569 mov ecx, [esp + 12 + 24] // width
2570 sub edi, esi
2571 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2572
2573 convertloop:
2574 READYUV444
2575 YUVTORGB(ebx)
2576 STOREARGB
2577
2578 sub ecx, 8
2579 jg convertloop
2580
2581 pop ebx
2582 pop edi
2583 pop esi
2584 ret
2585 }
2586 }
2587
2588 // 8 pixels.
2589 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2590 __declspec(naked) void I422ToRGB24Row_SSSE3(
2591 const uint8_t* y_buf,
2592 const uint8_t* u_buf,
2593 const uint8_t* v_buf,
2594 uint8_t* dst_rgb24,
2595 const struct YuvConstants* yuvconstants,
2596 int width) {
2597 __asm {
2598 push esi
2599 push edi
2600 push ebx
2601 mov eax, [esp + 12 + 4] // Y
2602 mov esi, [esp + 12 + 8] // U
2603 mov edi, [esp + 12 + 12] // V
2604 mov edx, [esp + 12 + 16] // argb
2605 mov ebx, [esp + 12 + 20] // yuvconstants
2606 mov ecx, [esp + 12 + 24] // width
2607 sub edi, esi
2608 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2609 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2610
2611 convertloop:
2612 READYUV422
2613 YUVTORGB(ebx)
2614 STORERGB24
2615
2616 sub ecx, 8
2617 jg convertloop
2618
2619 pop ebx
2620 pop edi
2621 pop esi
2622 ret
2623 }
2624 }
2625
2626 // 8 pixels
2627 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2628 __declspec(naked) void I422ToRGB565Row_SSSE3(
2629 const uint8_t* y_buf,
2630 const uint8_t* u_buf,
2631 const uint8_t* v_buf,
2632 uint8_t* rgb565_buf,
2633 const struct YuvConstants* yuvconstants,
2634 int width) {
2635 __asm {
2636 push esi
2637 push edi
2638 push ebx
2639 mov eax, [esp + 12 + 4] // Y
2640 mov esi, [esp + 12 + 8] // U
2641 mov edi, [esp + 12 + 12] // V
2642 mov edx, [esp + 12 + 16] // argb
2643 mov ebx, [esp + 12 + 20] // yuvconstants
2644 mov ecx, [esp + 12 + 24] // width
2645 sub edi, esi
2646 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2647 psrld xmm5, 27
2648 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2649 psrld xmm6, 26
2650 pslld xmm6, 5
2651 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2652 pslld xmm7, 11
2653
2654 convertloop:
2655 READYUV422
2656 YUVTORGB(ebx)
2657 STORERGB565
2658
2659 sub ecx, 8
2660 jg convertloop
2661
2662 pop ebx
2663 pop edi
2664 pop esi
2665 ret
2666 }
2667 }
2668
2669 // 8 pixels.
2670 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2671 __declspec(naked) void I422ToARGBRow_SSSE3(
2672 const uint8_t* y_buf,
2673 const uint8_t* u_buf,
2674 const uint8_t* v_buf,
2675 uint8_t* dst_argb,
2676 const struct YuvConstants* yuvconstants,
2677 int width) {
2678 __asm {
2679 push esi
2680 push edi
2681 push ebx
2682 mov eax, [esp + 12 + 4] // Y
2683 mov esi, [esp + 12 + 8] // U
2684 mov edi, [esp + 12 + 12] // V
2685 mov edx, [esp + 12 + 16] // argb
2686 mov ebx, [esp + 12 + 20] // yuvconstants
2687 mov ecx, [esp + 12 + 24] // width
2688 sub edi, esi
2689 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2690
2691 convertloop:
2692 READYUV422
2693 YUVTORGB(ebx)
2694 STOREARGB
2695
2696 sub ecx, 8
2697 jg convertloop
2698
2699 pop ebx
2700 pop edi
2701 pop esi
2702 ret
2703 }
2704 }
2705
2706 // 8 pixels.
2707 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2708 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
2709 const uint8_t* y_buf,
2710 const uint8_t* u_buf,
2711 const uint8_t* v_buf,
2712 const uint8_t* a_buf,
2713 uint8_t* dst_argb,
2714 const struct YuvConstants* yuvconstants,
2715 int width) {
2716 __asm {
2717 push esi
2718 push edi
2719 push ebx
2720 push ebp
2721 mov eax, [esp + 16 + 4] // Y
2722 mov esi, [esp + 16 + 8] // U
2723 mov edi, [esp + 16 + 12] // V
2724 mov ebp, [esp + 16 + 16] // A
2725 mov edx, [esp + 16 + 20] // argb
2726 mov ebx, [esp + 16 + 24] // yuvconstants
2727 mov ecx, [esp + 16 + 28] // width
2728 sub edi, esi
2729
2730 convertloop:
2731 READYUVA422
2732 YUVTORGB(ebx)
2733 STOREARGB
2734
2735 sub ecx, 8
2736 jg convertloop
2737
2738 pop ebp
2739 pop ebx
2740 pop edi
2741 pop esi
2742 ret
2743 }
2744 }
2745
2746 // 8 pixels.
2747 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2748 __declspec(naked) void NV12ToARGBRow_SSSE3(
2749 const uint8_t* y_buf,
2750 const uint8_t* uv_buf,
2751 uint8_t* dst_argb,
2752 const struct YuvConstants* yuvconstants,
2753 int width) {
2754 __asm {
2755 push esi
2756 push ebx
2757 mov eax, [esp + 8 + 4] // Y
2758 mov esi, [esp + 8 + 8] // UV
2759 mov edx, [esp + 8 + 12] // argb
2760 mov ebx, [esp + 8 + 16] // yuvconstants
2761 mov ecx, [esp + 8 + 20] // width
2762 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2763
2764 convertloop:
2765 READNV12
2766 YUVTORGB(ebx)
2767 STOREARGB
2768
2769 sub ecx, 8
2770 jg convertloop
2771
2772 pop ebx
2773 pop esi
2774 ret
2775 }
2776 }
2777
2778 // 8 pixels.
2779 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2780 __declspec(naked) void NV21ToARGBRow_SSSE3(
2781 const uint8_t* y_buf,
2782 const uint8_t* vu_buf,
2783 uint8_t* dst_argb,
2784 const struct YuvConstants* yuvconstants,
2785 int width) {
2786 __asm {
2787 push esi
2788 push ebx
2789 mov eax, [esp + 8 + 4] // Y
2790 mov esi, [esp + 8 + 8] // VU
2791 mov edx, [esp + 8 + 12] // argb
2792 mov ebx, [esp + 8 + 16] // yuvconstants
2793 mov ecx, [esp + 8 + 20] // width
2794 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2795
2796 convertloop:
2797 READNV21
2798 YUVTORGB(ebx)
2799 STOREARGB
2800
2801 sub ecx, 8
2802 jg convertloop
2803
2804 pop ebx
2805 pop esi
2806 ret
2807 }
2808 }
2809
2810 // 8 pixels.
2811 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2812 __declspec(naked) void YUY2ToARGBRow_SSSE3(
2813 const uint8_t* src_yuy2,
2814 uint8_t* dst_argb,
2815 const struct YuvConstants* yuvconstants,
2816 int width) {
2817 __asm {
2818 push ebx
2819 mov eax, [esp + 4 + 4] // yuy2
2820 mov edx, [esp + 4 + 8] // argb
2821 mov ebx, [esp + 4 + 12] // yuvconstants
2822 mov ecx, [esp + 4 + 16] // width
2823 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2824
2825 convertloop:
2826 READYUY2
2827 YUVTORGB(ebx)
2828 STOREARGB
2829
2830 sub ecx, 8
2831 jg convertloop
2832
2833 pop ebx
2834 ret
2835 }
2836 }
2837
2838 // 8 pixels.
2839 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2840 __declspec(naked) void UYVYToARGBRow_SSSE3(
2841 const uint8_t* src_uyvy,
2842 uint8_t* dst_argb,
2843 const struct YuvConstants* yuvconstants,
2844 int width) {
2845 __asm {
2846 push ebx
2847 mov eax, [esp + 4 + 4] // uyvy
2848 mov edx, [esp + 4 + 8] // argb
2849 mov ebx, [esp + 4 + 12] // yuvconstants
2850 mov ecx, [esp + 4 + 16] // width
2851 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2852
2853 convertloop:
2854 READUYVY
2855 YUVTORGB(ebx)
2856 STOREARGB
2857
2858 sub ecx, 8
2859 jg convertloop
2860
2861 pop ebx
2862 ret
2863 }
2864 }
2865
2866 __declspec(naked) void I422ToRGBARow_SSSE3(
2867 const uint8_t* y_buf,
2868 const uint8_t* u_buf,
2869 const uint8_t* v_buf,
2870 uint8_t* dst_rgba,
2871 const struct YuvConstants* yuvconstants,
2872 int width) {
2873 __asm {
2874 push esi
2875 push edi
2876 push ebx
2877 mov eax, [esp + 12 + 4] // Y
2878 mov esi, [esp + 12 + 8] // U
2879 mov edi, [esp + 12 + 12] // V
2880 mov edx, [esp + 12 + 16] // argb
2881 mov ebx, [esp + 12 + 20] // yuvconstants
2882 mov ecx, [esp + 12 + 24] // width
2883 sub edi, esi
2884
2885 convertloop:
2886 READYUV422
2887 YUVTORGB(ebx)
2888 STORERGBA
2889
2890 sub ecx, 8
2891 jg convertloop
2892
2893 pop ebx
2894 pop edi
2895 pop esi
2896 ret
2897 }
2898 }
2899 #endif // HAS_I422TOARGBROW_SSSE3
2900
2901 #ifdef HAS_I400TOARGBROW_SSE2
2902 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2903 __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
2904 uint8_t* rgb_buf,
2905 int width) {
2906 __asm {
2907 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
2908 movd xmm2, eax
2909 pshufd xmm2, xmm2,0
2910 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2911 movd xmm3, eax
2912 pshufd xmm3, xmm3, 0
2913 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
2914 pslld xmm4, 24
2915
2916 mov eax, [esp + 4] // Y
2917 mov edx, [esp + 8] // rgb
2918 mov ecx, [esp + 12] // width
2919
2920 convertloop:
2921 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2922 movq xmm0, qword ptr [eax]
2923 lea eax, [eax + 8]
2924 punpcklbw xmm0, xmm0 // Y.Y
2925 pmulhuw xmm0, xmm2
2926 psubusw xmm0, xmm3
2927 psrlw xmm0, 6
2928 packuswb xmm0, xmm0 // G
2929
2930 // Step 2: Weave into ARGB
2931 punpcklbw xmm0, xmm0 // GG
2932 movdqa xmm1, xmm0
2933 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
2934 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
2935 por xmm0, xmm4
2936 por xmm1, xmm4
2937 movdqu [edx], xmm0
2938 movdqu [edx + 16], xmm1
2939 lea edx, [edx + 32]
2940 sub ecx, 8
2941 jg convertloop
2942 ret
2943 }
2944 }
2945 #endif // HAS_I400TOARGBROW_SSE2
2946
2947 #ifdef HAS_I400TOARGBROW_AVX2
2948 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2949 // note: vpunpcklbw mutates and vpackuswb unmutates.
2950 __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
2951 uint8_t* rgb_buf,
2952 int width) {
2953 __asm {
2954 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
2955 vmovd xmm2, eax
2956 vbroadcastss ymm2, xmm2
2957 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2958 vmovd xmm3, eax
2959 vbroadcastss ymm3, xmm3
2960 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
2961 vpslld ymm4, ymm4, 24
2962
2963 mov eax, [esp + 4] // Y
2964 mov edx, [esp + 8] // rgb
2965 mov ecx, [esp + 12] // width
2966
2967 convertloop:
2968 // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
2969 vmovdqu xmm0, [eax]
2970 lea eax, [eax + 16]
2971 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
2972 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
2973 vpmulhuw ymm0, ymm0, ymm2
2974 vpsubusw ymm0, ymm0, ymm3
2975 vpsrlw ymm0, ymm0, 6
2976 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
2977
2978 // TODO(fbarchard): Weave alpha with unpack.
2979 // Step 2: Weave into ARGB
2980 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
2981 vpermq ymm1, ymm1, 0xd8
2982 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
2983 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
2984 vpor ymm0, ymm0, ymm4
2985 vpor ymm1, ymm1, ymm4
2986 vmovdqu [edx], ymm0
2987 vmovdqu [edx + 32], ymm1
2988 lea edx, [edx + 64]
2989 sub ecx, 16
2990 jg convertloop
2991 vzeroupper
2992 ret
2993 }
2994 }
2995 #endif // HAS_I400TOARGBROW_AVX2
2996
2997 #ifdef HAS_MIRRORROW_SSSE3
2998 // Shuffle table for reversing the bytes.
2999 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3000 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3001
3002 // TODO(fbarchard): Replace lea with -16 offset.
3003 __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
3004 uint8_t* dst,
3005 int width) {
3006 __asm {
3007 mov eax, [esp + 4] // src
3008 mov edx, [esp + 8] // dst
3009 mov ecx, [esp + 12] // width
3010 movdqa xmm5, xmmword ptr kShuffleMirror
3011
3012 convertloop:
3013 movdqu xmm0, [eax - 16 + ecx]
3014 pshufb xmm0, xmm5
3015 movdqu [edx], xmm0
3016 lea edx, [edx + 16]
3017 sub ecx, 16
3018 jg convertloop
3019 ret
3020 }
3021 }
3022 #endif // HAS_MIRRORROW_SSSE3
3023
3024 #ifdef HAS_MIRRORROW_AVX2
3025 __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
3026 uint8_t* dst,
3027 int width) {
3028 __asm {
3029 mov eax, [esp + 4] // src
3030 mov edx, [esp + 8] // dst
3031 mov ecx, [esp + 12] // width
3032 vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3033
3034 convertloop:
3035 vmovdqu ymm0, [eax - 32 + ecx]
3036 vpshufb ymm0, ymm0, ymm5
3037 vpermq ymm0, ymm0, 0x4e // swap high and low halfs
3038 vmovdqu [edx], ymm0
3039 lea edx, [edx + 32]
3040 sub ecx, 32
3041 jg convertloop
3042 vzeroupper
3043 ret
3044 }
3045 }
3046 #endif // HAS_MIRRORROW_AVX2
3047
3048 #ifdef HAS_MIRRORUVROW_SSSE3
3049 // Shuffle table for reversing the bytes of UV channels.
3050 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3051 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3052
3053 __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
3054 uint8_t* dst_u,
3055 uint8_t* dst_v,
3056 int width) {
3057 __asm {
3058 push edi
3059 mov eax, [esp + 4 + 4] // src
3060 mov edx, [esp + 4 + 8] // dst_u
3061 mov edi, [esp + 4 + 12] // dst_v
3062 mov ecx, [esp + 4 + 16] // width
3063 movdqa xmm1, xmmword ptr kShuffleMirrorUV
3064 lea eax, [eax + ecx * 2 - 16]
3065 sub edi, edx
3066
3067 convertloop:
3068 movdqu xmm0, [eax]
3069 lea eax, [eax - 16]
3070 pshufb xmm0, xmm1
3071 movlpd qword ptr [edx], xmm0
3072 movhpd qword ptr [edx + edi], xmm0
3073 lea edx, [edx + 8]
3074 sub ecx, 8
3075 jg convertloop
3076
3077 pop edi
3078 ret
3079 }
3080 }
3081 #endif // HAS_MIRRORUVROW_SSSE3
3082
3083 #ifdef HAS_ARGBMIRRORROW_SSE2
3084 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
3085 uint8_t* dst,
3086 int width) {
3087 __asm {
3088 mov eax, [esp + 4] // src
3089 mov edx, [esp + 8] // dst
3090 mov ecx, [esp + 12] // width
3091 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3092
3093 convertloop:
3094 movdqu xmm0, [eax]
3095 lea eax, [eax - 16]
3096 pshufd xmm0, xmm0, 0x1b
3097 movdqu [edx], xmm0
3098 lea edx, [edx + 16]
3099 sub ecx, 4
3100 jg convertloop
3101 ret
3102 }
3103 }
3104 #endif // HAS_ARGBMIRRORROW_SSE2
3105
3106 #ifdef HAS_ARGBMIRRORROW_AVX2
3107 // Shuffle table for reversing the bytes.
3108 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3109
3110 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
3111 uint8_t* dst,
3112 int width) {
3113 __asm {
3114 mov eax, [esp + 4] // src
3115 mov edx, [esp + 8] // dst
3116 mov ecx, [esp + 12] // width
3117 vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3118
3119 convertloop:
3120 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
3121 vmovdqu [edx], ymm0
3122 lea edx, [edx + 32]
3123 sub ecx, 8
3124 jg convertloop
3125 vzeroupper
3126 ret
3127 }
3128 }
3129 #endif // HAS_ARGBMIRRORROW_AVX2
3130
3131 #ifdef HAS_SPLITUVROW_SSE2
3132 __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
3133 uint8_t* dst_u,
3134 uint8_t* dst_v,
3135 int width) {
3136 __asm {
3137 push edi
3138 mov eax, [esp + 4 + 4] // src_uv
3139 mov edx, [esp + 4 + 8] // dst_u
3140 mov edi, [esp + 4 + 12] // dst_v
3141 mov ecx, [esp + 4 + 16] // width
3142 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3143 psrlw xmm5, 8
3144 sub edi, edx
3145
3146 convertloop:
3147 movdqu xmm0, [eax]
3148 movdqu xmm1, [eax + 16]
3149 lea eax, [eax + 32]
3150 movdqa xmm2, xmm0
3151 movdqa xmm3, xmm1
3152 pand xmm0, xmm5 // even bytes
3153 pand xmm1, xmm5
3154 packuswb xmm0, xmm1
3155 psrlw xmm2, 8 // odd bytes
3156 psrlw xmm3, 8
3157 packuswb xmm2, xmm3
3158 movdqu [edx], xmm0
3159 movdqu [edx + edi], xmm2
3160 lea edx, [edx + 16]
3161 sub ecx, 16
3162 jg convertloop
3163
3164 pop edi
3165 ret
3166 }
3167 }
3168
3169 #endif // HAS_SPLITUVROW_SSE2
3170
3171 #ifdef HAS_SPLITUVROW_AVX2
3172 __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
3173 uint8_t* dst_u,
3174 uint8_t* dst_v,
3175 int width) {
3176 __asm {
3177 push edi
3178 mov eax, [esp + 4 + 4] // src_uv
3179 mov edx, [esp + 4 + 8] // dst_u
3180 mov edi, [esp + 4 + 12] // dst_v
3181 mov ecx, [esp + 4 + 16] // width
3182 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3183 vpsrlw ymm5, ymm5, 8
3184 sub edi, edx
3185
3186 convertloop:
3187 vmovdqu ymm0, [eax]
3188 vmovdqu ymm1, [eax + 32]
3189 lea eax, [eax + 64]
3190 vpsrlw ymm2, ymm0, 8 // odd bytes
3191 vpsrlw ymm3, ymm1, 8
3192 vpand ymm0, ymm0, ymm5 // even bytes
3193 vpand ymm1, ymm1, ymm5
3194 vpackuswb ymm0, ymm0, ymm1
3195 vpackuswb ymm2, ymm2, ymm3
3196 vpermq ymm0, ymm0, 0xd8
3197 vpermq ymm2, ymm2, 0xd8
3198 vmovdqu [edx], ymm0
3199 vmovdqu [edx + edi], ymm2
3200 lea edx, [edx + 32]
3201 sub ecx, 32
3202 jg convertloop
3203
3204 pop edi
3205 vzeroupper
3206 ret
3207 }
3208 }
3209 #endif // HAS_SPLITUVROW_AVX2
3210
3211 #ifdef HAS_MERGEUVROW_SSE2
3212 __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
3213 const uint8_t* src_v,
3214 uint8_t* dst_uv,
3215 int width) {
3216 __asm {
3217 push edi
3218 mov eax, [esp + 4 + 4] // src_u
3219 mov edx, [esp + 4 + 8] // src_v
3220 mov edi, [esp + 4 + 12] // dst_uv
3221 mov ecx, [esp + 4 + 16] // width
3222 sub edx, eax
3223
3224 convertloop:
3225 movdqu xmm0, [eax] // read 16 U's
3226 movdqu xmm1, [eax + edx] // and 16 V's
3227 lea eax, [eax + 16]
3228 movdqa xmm2, xmm0
3229 punpcklbw xmm0, xmm1 // first 8 UV pairs
3230 punpckhbw xmm2, xmm1 // next 8 UV pairs
3231 movdqu [edi], xmm0
3232 movdqu [edi + 16], xmm2
3233 lea edi, [edi + 32]
3234 sub ecx, 16
3235 jg convertloop
3236
3237 pop edi
3238 ret
3239 }
3240 }
3241 #endif // HAS_MERGEUVROW_SSE2
3242
3243 #ifdef HAS_MERGEUVROW_AVX2
3244 __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
3245 const uint8_t* src_v,
3246 uint8_t* dst_uv,
3247 int width) {
3248 __asm {
3249 push edi
3250 mov eax, [esp + 4 + 4] // src_u
3251 mov edx, [esp + 4 + 8] // src_v
3252 mov edi, [esp + 4 + 12] // dst_uv
3253 mov ecx, [esp + 4 + 16] // width
3254 sub edx, eax
3255
3256 convertloop:
3257 vmovdqu ymm0, [eax] // read 32 U's
3258 vmovdqu ymm1, [eax + edx] // and 32 V's
3259 lea eax, [eax + 32]
3260 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3261 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3262 vextractf128 [edi], ymm2, 0 // bytes 0..15
3263 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
3264 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
3265 vextractf128 [edi + 48], ymm0, 1 // bytes 47..63
3266 lea edi, [edi + 64]
3267 sub ecx, 32
3268 jg convertloop
3269
3270 pop edi
3271 vzeroupper
3272 ret
3273 }
3274 }
3275 #endif // HAS_MERGEUVROW_AVX2
3276
3277 #ifdef HAS_COPYROW_SSE2
3278 // CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
3279 __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
3280 uint8_t* dst,
3281 int width) {
3282 __asm {
3283 mov eax, [esp + 4] // src
3284 mov edx, [esp + 8] // dst
3285 mov ecx, [esp + 12] // width
3286 test eax, 15
3287 jne convertloopu
3288 test edx, 15
3289 jne convertloopu
3290
3291 convertloopa:
3292 movdqa xmm0, [eax]
3293 movdqa xmm1, [eax + 16]
3294 lea eax, [eax + 32]
3295 movdqa [edx], xmm0
3296 movdqa [edx + 16], xmm1
3297 lea edx, [edx + 32]
3298 sub ecx, 32
3299 jg convertloopa
3300 ret
3301
3302 convertloopu:
3303 movdqu xmm0, [eax]
3304 movdqu xmm1, [eax + 16]
3305 lea eax, [eax + 32]
3306 movdqu [edx], xmm0
3307 movdqu [edx + 16], xmm1
3308 lea edx, [edx + 32]
3309 sub ecx, 32
3310 jg convertloopu
3311 ret
3312 }
3313 }
3314 #endif // HAS_COPYROW_SSE2
3315
3316 #ifdef HAS_COPYROW_AVX
3317 // CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
3318 __declspec(naked) void CopyRow_AVX(const uint8_t* src,
3319 uint8_t* dst,
3320 int width) {
3321 __asm {
3322 mov eax, [esp + 4] // src
3323 mov edx, [esp + 8] // dst
3324 mov ecx, [esp + 12] // width
3325
3326 convertloop:
3327 vmovdqu ymm0, [eax]
3328 vmovdqu ymm1, [eax + 32]
3329 lea eax, [eax + 64]
3330 vmovdqu [edx], ymm0
3331 vmovdqu [edx + 32], ymm1
3332 lea edx, [edx + 64]
3333 sub ecx, 64
3334 jg convertloop
3335
3336 vzeroupper
3337 ret
3338 }
3339 }
3340 #endif // HAS_COPYROW_AVX
3341
3342 // Multiple of 1.
3343 __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
3344 uint8_t* dst,
3345 int width) {
3346 __asm {
3347 mov eax, esi
3348 mov edx, edi
3349 mov esi, [esp + 4] // src
3350 mov edi, [esp + 8] // dst
3351 mov ecx, [esp + 12] // width
3352 rep movsb
3353 mov edi, edx
3354 mov esi, eax
3355 ret
3356 }
3357 }
3358
3359 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3360 // width in pixels
3361 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
3362 uint8_t* dst,
3363 int width) {
3364 __asm {
3365 mov eax, [esp + 4] // src
3366 mov edx, [esp + 8] // dst
3367 mov ecx, [esp + 12] // width
3368 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3369 pslld xmm0, 24
3370 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3371 psrld xmm1, 8
3372
3373 convertloop:
3374 movdqu xmm2, [eax]
3375 movdqu xmm3, [eax + 16]
3376 lea eax, [eax + 32]
3377 movdqu xmm4, [edx]
3378 movdqu xmm5, [edx + 16]
3379 pand xmm2, xmm0
3380 pand xmm3, xmm0
3381 pand xmm4, xmm1
3382 pand xmm5, xmm1
3383 por xmm2, xmm4
3384 por xmm3, xmm5
3385 movdqu [edx], xmm2
3386 movdqu [edx + 16], xmm3
3387 lea edx, [edx + 32]
3388 sub ecx, 8
3389 jg convertloop
3390
3391 ret
3392 }
3393 }
3394 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3395
3396 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3397 // width in pixels
3398 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
3399 uint8_t* dst,
3400 int width) {
3401 __asm {
3402 mov eax, [esp + 4] // src
3403 mov edx, [esp + 8] // dst
3404 mov ecx, [esp + 12] // width
3405 vpcmpeqb ymm0, ymm0, ymm0
3406 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3407
3408 convertloop:
3409 vmovdqu ymm1, [eax]
3410 vmovdqu ymm2, [eax + 32]
3411 lea eax, [eax + 64]
3412 vpblendvb ymm1, ymm1, [edx], ymm0
3413 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3414 vmovdqu [edx], ymm1
3415 vmovdqu [edx + 32], ymm2
3416 lea edx, [edx + 64]
3417 sub ecx, 16
3418 jg convertloop
3419
3420 vzeroupper
3421 ret
3422 }
3423 }
3424 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3425
3426 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3427 // width in pixels
3428 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
3429 uint8_t* dst_a,
3430 int width) {
3431 __asm {
3432 mov eax, [esp + 4] // src_argb
3433 mov edx, [esp + 8] // dst_a
3434 mov ecx, [esp + 12] // width
3435
3436 extractloop:
3437 movdqu xmm0, [eax]
3438 movdqu xmm1, [eax + 16]
3439 lea eax, [eax + 32]
3440 psrld xmm0, 24
3441 psrld xmm1, 24
3442 packssdw xmm0, xmm1
3443 packuswb xmm0, xmm0
3444 movq qword ptr [edx], xmm0
3445 lea edx, [edx + 8]
3446 sub ecx, 8
3447 jg extractloop
3448
3449 ret
3450 }
3451 }
3452 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
3453
3454 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3455 // width in pixels
3456 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
3457 uint8_t* dst_a,
3458 int width) {
3459 __asm {
3460 mov eax, [esp + 4] // src_argb
3461 mov edx, [esp + 8] // dst_a
3462 mov ecx, [esp + 12] // width
3463 vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
3464
3465 extractloop:
3466 vmovdqu ymm0, [eax]
3467 vmovdqu ymm1, [eax + 32]
3468 vpsrld ymm0, ymm0, 24
3469 vpsrld ymm1, ymm1, 24
3470 vmovdqu ymm2, [eax + 64]
3471 vmovdqu ymm3, [eax + 96]
3472 lea eax, [eax + 128]
3473 vpackssdw ymm0, ymm0, ymm1 // mutates
3474 vpsrld ymm2, ymm2, 24
3475 vpsrld ymm3, ymm3, 24
3476 vpackssdw ymm2, ymm2, ymm3 // mutates
3477 vpackuswb ymm0, ymm0, ymm2 // mutates
3478 vpermd ymm0, ymm4, ymm0 // unmutate
3479 vmovdqu [edx], ymm0
3480 lea edx, [edx + 32]
3481 sub ecx, 32
3482 jg extractloop
3483
3484 vzeroupper
3485 ret
3486 }
3487 }
3488 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
3489
3490 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3491 // width in pixels
3492 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
3493 uint8_t* dst,
3494 int width) {
3495 __asm {
3496 mov eax, [esp + 4] // src
3497 mov edx, [esp + 8] // dst
3498 mov ecx, [esp + 12] // width
3499 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3500 pslld xmm0, 24
3501 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3502 psrld xmm1, 8
3503
3504 convertloop:
3505 movq xmm2, qword ptr [eax] // 8 Y's
3506 lea eax, [eax + 8]
3507 punpcklbw xmm2, xmm2
3508 punpckhwd xmm3, xmm2
3509 punpcklwd xmm2, xmm2
3510 movdqu xmm4, [edx]
3511 movdqu xmm5, [edx + 16]
3512 pand xmm2, xmm0
3513 pand xmm3, xmm0
3514 pand xmm4, xmm1
3515 pand xmm5, xmm1
3516 por xmm2, xmm4
3517 por xmm3, xmm5
3518 movdqu [edx], xmm2
3519 movdqu [edx + 16], xmm3
3520 lea edx, [edx + 32]
3521 sub ecx, 8
3522 jg convertloop
3523
3524 ret
3525 }
3526 }
3527 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3528
3529 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3530 // width in pixels
3531 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
3532 uint8_t* dst,
3533 int width) {
3534 __asm {
3535 mov eax, [esp + 4] // src
3536 mov edx, [esp + 8] // dst
3537 mov ecx, [esp + 12] // width
3538 vpcmpeqb ymm0, ymm0, ymm0
3539 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3540
3541 convertloop:
3542 vpmovzxbd ymm1, qword ptr [eax]
3543 vpmovzxbd ymm2, qword ptr [eax + 8]
3544 lea eax, [eax + 16]
3545 vpslld ymm1, ymm1, 24
3546 vpslld ymm2, ymm2, 24
3547 vpblendvb ymm1, ymm1, [edx], ymm0
3548 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3549 vmovdqu [edx], ymm1
3550 vmovdqu [edx + 32], ymm2
3551 lea edx, [edx + 64]
3552 sub ecx, 16
3553 jg convertloop
3554
3555 vzeroupper
3556 ret
3557 }
3558 }
3559 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3560
3561 #ifdef HAS_SETROW_X86
3562 // Write 'width' bytes using an 8 bit value repeated.
3563 // width should be multiple of 4.
3564 __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
3565 __asm {
3566 movzx eax, byte ptr [esp + 8] // v8
3567 mov edx, 0x01010101 // Duplicate byte to all bytes.
3568 mul edx // overwrites edx with upper part of result.
3569 mov edx, edi
3570 mov edi, [esp + 4] // dst
3571 mov ecx, [esp + 12] // width
3572 shr ecx, 2
3573 rep stosd
3574 mov edi, edx
3575 ret
3576 }
3577 }
3578
3579 // Write 'width' bytes using an 8 bit value repeated.
3580 __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
3581 __asm {
3582 mov edx, edi
3583 mov edi, [esp + 4] // dst
3584 mov eax, [esp + 8] // v8
3585 mov ecx, [esp + 12] // width
3586 rep stosb
3587 mov edi, edx
3588 ret
3589 }
3590 }
3591
3592 // Write 'width' 32 bit values.
3593 __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
3594 uint32_t v32,
3595 int width) {
3596 __asm {
3597 mov edx, edi
3598 mov edi, [esp + 4] // dst
3599 mov eax, [esp + 8] // v32
3600 mov ecx, [esp + 12] // width
3601 rep stosd
3602 mov edi, edx
3603 ret
3604 }
3605 }
3606 #endif // HAS_SETROW_X86
3607
3608 #ifdef HAS_YUY2TOYROW_AVX2
3609 __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
3610 uint8_t* dst_y,
3611 int width) {
3612 __asm {
3613 mov eax, [esp + 4] // src_yuy2
3614 mov edx, [esp + 8] // dst_y
3615 mov ecx, [esp + 12] // width
3616 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3617 vpsrlw ymm5, ymm5, 8
3618
3619 convertloop:
3620 vmovdqu ymm0, [eax]
3621 vmovdqu ymm1, [eax + 32]
3622 lea eax, [eax + 64]
3623 vpand ymm0, ymm0, ymm5 // even bytes are Y
3624 vpand ymm1, ymm1, ymm5
3625 vpackuswb ymm0, ymm0, ymm1 // mutates.
3626 vpermq ymm0, ymm0, 0xd8
3627 vmovdqu [edx], ymm0
3628 lea edx, [edx + 32]
3629 sub ecx, 32
3630 jg convertloop
3631 vzeroupper
3632 ret
3633 }
3634 }
3635
3636 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
3637 int stride_yuy2,
3638 uint8_t* dst_u,
3639 uint8_t* dst_v,
3640 int width) {
3641 __asm {
3642 push esi
3643 push edi
3644 mov eax, [esp + 8 + 4] // src_yuy2
3645 mov esi, [esp + 8 + 8] // stride_yuy2
3646 mov edx, [esp + 8 + 12] // dst_u
3647 mov edi, [esp + 8 + 16] // dst_v
3648 mov ecx, [esp + 8 + 20] // width
3649 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3650 vpsrlw ymm5, ymm5, 8
3651 sub edi, edx
3652
3653 convertloop:
3654 vmovdqu ymm0, [eax]
3655 vmovdqu ymm1, [eax + 32]
3656 vpavgb ymm0, ymm0, [eax + esi]
3657 vpavgb ymm1, ymm1, [eax + esi + 32]
3658 lea eax, [eax + 64]
3659 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3660 vpsrlw ymm1, ymm1, 8
3661 vpackuswb ymm0, ymm0, ymm1 // mutates.
3662 vpermq ymm0, ymm0, 0xd8
3663 vpand ymm1, ymm0, ymm5 // U
3664 vpsrlw ymm0, ymm0, 8 // V
3665 vpackuswb ymm1, ymm1, ymm1 // mutates.
3666 vpackuswb ymm0, ymm0, ymm0 // mutates.
3667 vpermq ymm1, ymm1, 0xd8
3668 vpermq ymm0, ymm0, 0xd8
3669 vextractf128 [edx], ymm1, 0 // U
3670 vextractf128 [edx + edi], ymm0, 0 // V
3671 lea edx, [edx + 16]
3672 sub ecx, 32
3673 jg convertloop
3674
3675 pop edi
3676 pop esi
3677 vzeroupper
3678 ret
3679 }
3680 }
3681
3682 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
3683 uint8_t* dst_u,
3684 uint8_t* dst_v,
3685 int width) {
3686 __asm {
3687 push edi
3688 mov eax, [esp + 4 + 4] // src_yuy2
3689 mov edx, [esp + 4 + 8] // dst_u
3690 mov edi, [esp + 4 + 12] // dst_v
3691 mov ecx, [esp + 4 + 16] // width
3692 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3693 vpsrlw ymm5, ymm5, 8
3694 sub edi, edx
3695
3696 convertloop:
3697 vmovdqu ymm0, [eax]
3698 vmovdqu ymm1, [eax + 32]
3699 lea eax, [eax + 64]
3700 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3701 vpsrlw ymm1, ymm1, 8
3702 vpackuswb ymm0, ymm0, ymm1 // mutates.
3703 vpermq ymm0, ymm0, 0xd8
3704 vpand ymm1, ymm0, ymm5 // U
3705 vpsrlw ymm0, ymm0, 8 // V
3706 vpackuswb ymm1, ymm1, ymm1 // mutates.
3707 vpackuswb ymm0, ymm0, ymm0 // mutates.
3708 vpermq ymm1, ymm1, 0xd8
3709 vpermq ymm0, ymm0, 0xd8
3710 vextractf128 [edx], ymm1, 0 // U
3711 vextractf128 [edx + edi], ymm0, 0 // V
3712 lea edx, [edx + 16]
3713 sub ecx, 32
3714 jg convertloop
3715
3716 pop edi
3717 vzeroupper
3718 ret
3719 }
3720 }
3721
3722 __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
3723 uint8_t* dst_y,
3724 int width) {
3725 __asm {
3726 mov eax, [esp + 4] // src_uyvy
3727 mov edx, [esp + 8] // dst_y
3728 mov ecx, [esp + 12] // width
3729
3730 convertloop:
3731 vmovdqu ymm0, [eax]
3732 vmovdqu ymm1, [eax + 32]
3733 lea eax, [eax + 64]
3734 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3735 vpsrlw ymm1, ymm1, 8
3736 vpackuswb ymm0, ymm0, ymm1 // mutates.
3737 vpermq ymm0, ymm0, 0xd8
3738 vmovdqu [edx], ymm0
3739 lea edx, [edx + 32]
3740 sub ecx, 32
3741 jg convertloop
3742 vzeroupper
3743 ret
3744 }
3745 }
3746
3747 __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
3748 int stride_uyvy,
3749 uint8_t* dst_u,
3750 uint8_t* dst_v,
3751 int width) {
3752 __asm {
3753 push esi
3754 push edi
3755 mov eax, [esp + 8 + 4] // src_yuy2
3756 mov esi, [esp + 8 + 8] // stride_yuy2
3757 mov edx, [esp + 8 + 12] // dst_u
3758 mov edi, [esp + 8 + 16] // dst_v
3759 mov ecx, [esp + 8 + 20] // width
3760 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3761 vpsrlw ymm5, ymm5, 8
3762 sub edi, edx
3763
3764 convertloop:
3765 vmovdqu ymm0, [eax]
3766 vmovdqu ymm1, [eax + 32]
3767 vpavgb ymm0, ymm0, [eax + esi]
3768 vpavgb ymm1, ymm1, [eax + esi + 32]
3769 lea eax, [eax + 64]
3770 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3771 vpand ymm1, ymm1, ymm5
3772 vpackuswb ymm0, ymm0, ymm1 // mutates.
3773 vpermq ymm0, ymm0, 0xd8
3774 vpand ymm1, ymm0, ymm5 // U
3775 vpsrlw ymm0, ymm0, 8 // V
3776 vpackuswb ymm1, ymm1, ymm1 // mutates.
3777 vpackuswb ymm0, ymm0, ymm0 // mutates.
3778 vpermq ymm1, ymm1, 0xd8
3779 vpermq ymm0, ymm0, 0xd8
3780 vextractf128 [edx], ymm1, 0 // U
3781 vextractf128 [edx + edi], ymm0, 0 // V
3782 lea edx, [edx + 16]
3783 sub ecx, 32
3784 jg convertloop
3785
3786 pop edi
3787 pop esi
3788 vzeroupper
3789 ret
3790 }
3791 }
3792
3793 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
3794 uint8_t* dst_u,
3795 uint8_t* dst_v,
3796 int width) {
3797 __asm {
3798 push edi
3799 mov eax, [esp + 4 + 4] // src_yuy2
3800 mov edx, [esp + 4 + 8] // dst_u
3801 mov edi, [esp + 4 + 12] // dst_v
3802 mov ecx, [esp + 4 + 16] // width
3803 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3804 vpsrlw ymm5, ymm5, 8
3805 sub edi, edx
3806
3807 convertloop:
3808 vmovdqu ymm0, [eax]
3809 vmovdqu ymm1, [eax + 32]
3810 lea eax, [eax + 64]
3811 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3812 vpand ymm1, ymm1, ymm5
3813 vpackuswb ymm0, ymm0, ymm1 // mutates.
3814 vpermq ymm0, ymm0, 0xd8
3815 vpand ymm1, ymm0, ymm5 // U
3816 vpsrlw ymm0, ymm0, 8 // V
3817 vpackuswb ymm1, ymm1, ymm1 // mutates.
3818 vpackuswb ymm0, ymm0, ymm0 // mutates.
3819 vpermq ymm1, ymm1, 0xd8
3820 vpermq ymm0, ymm0, 0xd8
3821 vextractf128 [edx], ymm1, 0 // U
3822 vextractf128 [edx + edi], ymm0, 0 // V
3823 lea edx, [edx + 16]
3824 sub ecx, 32
3825 jg convertloop
3826
3827 pop edi
3828 vzeroupper
3829 ret
3830 }
3831 }
3832 #endif // HAS_YUY2TOYROW_AVX2
3833
3834 #ifdef HAS_YUY2TOYROW_SSE2
3835 __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
3836 uint8_t* dst_y,
3837 int width) {
3838 __asm {
3839 mov eax, [esp + 4] // src_yuy2
3840 mov edx, [esp + 8] // dst_y
3841 mov ecx, [esp + 12] // width
3842 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3843 psrlw xmm5, 8
3844
3845 convertloop:
3846 movdqu xmm0, [eax]
3847 movdqu xmm1, [eax + 16]
3848 lea eax, [eax + 32]
3849 pand xmm0, xmm5 // even bytes are Y
3850 pand xmm1, xmm5
3851 packuswb xmm0, xmm1
3852 movdqu [edx], xmm0
3853 lea edx, [edx + 16]
3854 sub ecx, 16
3855 jg convertloop
3856 ret
3857 }
3858 }
3859
3860 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
3861 int stride_yuy2,
3862 uint8_t* dst_u,
3863 uint8_t* dst_v,
3864 int width) {
3865 __asm {
3866 push esi
3867 push edi
3868 mov eax, [esp + 8 + 4] // src_yuy2
3869 mov esi, [esp + 8 + 8] // stride_yuy2
3870 mov edx, [esp + 8 + 12] // dst_u
3871 mov edi, [esp + 8 + 16] // dst_v
3872 mov ecx, [esp + 8 + 20] // width
3873 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3874 psrlw xmm5, 8
3875 sub edi, edx
3876
3877 convertloop:
3878 movdqu xmm0, [eax]
3879 movdqu xmm1, [eax + 16]
3880 movdqu xmm2, [eax + esi]
3881 movdqu xmm3, [eax + esi + 16]
3882 lea eax, [eax + 32]
3883 pavgb xmm0, xmm2
3884 pavgb xmm1, xmm3
3885 psrlw xmm0, 8 // YUYV -> UVUV
3886 psrlw xmm1, 8
3887 packuswb xmm0, xmm1
3888 movdqa xmm1, xmm0
3889 pand xmm0, xmm5 // U
3890 packuswb xmm0, xmm0
3891 psrlw xmm1, 8 // V
3892 packuswb xmm1, xmm1
3893 movq qword ptr [edx], xmm0
3894 movq qword ptr [edx + edi], xmm1
3895 lea edx, [edx + 8]
3896 sub ecx, 16
3897 jg convertloop
3898
3899 pop edi
3900 pop esi
3901 ret
3902 }
3903 }
3904
3905 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
3906 uint8_t* dst_u,
3907 uint8_t* dst_v,
3908 int width) {
3909 __asm {
3910 push edi
3911 mov eax, [esp + 4 + 4] // src_yuy2
3912 mov edx, [esp + 4 + 8] // dst_u
3913 mov edi, [esp + 4 + 12] // dst_v
3914 mov ecx, [esp + 4 + 16] // width
3915 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3916 psrlw xmm5, 8
3917 sub edi, edx
3918
3919 convertloop:
3920 movdqu xmm0, [eax]
3921 movdqu xmm1, [eax + 16]
3922 lea eax, [eax + 32]
3923 psrlw xmm0, 8 // YUYV -> UVUV
3924 psrlw xmm1, 8
3925 packuswb xmm0, xmm1
3926 movdqa xmm1, xmm0
3927 pand xmm0, xmm5 // U
3928 packuswb xmm0, xmm0
3929 psrlw xmm1, 8 // V
3930 packuswb xmm1, xmm1
3931 movq qword ptr [edx], xmm0
3932 movq qword ptr [edx + edi], xmm1
3933 lea edx, [edx + 8]
3934 sub ecx, 16
3935 jg convertloop
3936
3937 pop edi
3938 ret
3939 }
3940 }
3941
3942 __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
3943 uint8_t* dst_y,
3944 int width) {
3945 __asm {
3946 mov eax, [esp + 4] // src_uyvy
3947 mov edx, [esp + 8] // dst_y
3948 mov ecx, [esp + 12] // width
3949
3950 convertloop:
3951 movdqu xmm0, [eax]
3952 movdqu xmm1, [eax + 16]
3953 lea eax, [eax + 32]
3954 psrlw xmm0, 8 // odd bytes are Y
3955 psrlw xmm1, 8
3956 packuswb xmm0, xmm1
3957 movdqu [edx], xmm0
3958 lea edx, [edx + 16]
3959 sub ecx, 16
3960 jg convertloop
3961 ret
3962 }
3963 }
3964
3965 __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
3966 int stride_uyvy,
3967 uint8_t* dst_u,
3968 uint8_t* dst_v,
3969 int width) {
3970 __asm {
3971 push esi
3972 push edi
3973 mov eax, [esp + 8 + 4] // src_yuy2
3974 mov esi, [esp + 8 + 8] // stride_yuy2
3975 mov edx, [esp + 8 + 12] // dst_u
3976 mov edi, [esp + 8 + 16] // dst_v
3977 mov ecx, [esp + 8 + 20] // width
3978 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3979 psrlw xmm5, 8
3980 sub edi, edx
3981
3982 convertloop:
3983 movdqu xmm0, [eax]
3984 movdqu xmm1, [eax + 16]
3985 movdqu xmm2, [eax + esi]
3986 movdqu xmm3, [eax + esi + 16]
3987 lea eax, [eax + 32]
3988 pavgb xmm0, xmm2
3989 pavgb xmm1, xmm3
3990 pand xmm0, xmm5 // UYVY -> UVUV
3991 pand xmm1, xmm5
3992 packuswb xmm0, xmm1
3993 movdqa xmm1, xmm0
3994 pand xmm0, xmm5 // U
3995 packuswb xmm0, xmm0
3996 psrlw xmm1, 8 // V
3997 packuswb xmm1, xmm1
3998 movq qword ptr [edx], xmm0
3999 movq qword ptr [edx + edi], xmm1
4000 lea edx, [edx + 8]
4001 sub ecx, 16
4002 jg convertloop
4003
4004 pop edi
4005 pop esi
4006 ret
4007 }
4008 }
4009
4010 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4011 uint8_t* dst_u,
4012 uint8_t* dst_v,
4013 int width) {
4014 __asm {
4015 push edi
4016 mov eax, [esp + 4 + 4] // src_yuy2
4017 mov edx, [esp + 4 + 8] // dst_u
4018 mov edi, [esp + 4 + 12] // dst_v
4019 mov ecx, [esp + 4 + 16] // width
4020 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4021 psrlw xmm5, 8
4022 sub edi, edx
4023
4024 convertloop:
4025 movdqu xmm0, [eax]
4026 movdqu xmm1, [eax + 16]
4027 lea eax, [eax + 32]
4028 pand xmm0, xmm5 // UYVY -> UVUV
4029 pand xmm1, xmm5
4030 packuswb xmm0, xmm1
4031 movdqa xmm1, xmm0
4032 pand xmm0, xmm5 // U
4033 packuswb xmm0, xmm0
4034 psrlw xmm1, 8 // V
4035 packuswb xmm1, xmm1
4036 movq qword ptr [edx], xmm0
4037 movq qword ptr [edx + edi], xmm1
4038 lea edx, [edx + 8]
4039 sub ecx, 16
4040 jg convertloop
4041
4042 pop edi
4043 ret
4044 }
4045 }
4046 #endif // HAS_YUY2TOYROW_SSE2
4047
4048 #ifdef HAS_BLENDPLANEROW_SSSE3
4049 // Blend 8 pixels at a time.
4050 // unsigned version of math
4051 // =((A2*C2)+(B2*(255-C2))+255)/256
4052 // signed version of math
4053 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4054 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
4055 const uint8_t* src1,
4056 const uint8_t* alpha,
4057 uint8_t* dst,
4058 int width) {
4059 __asm {
4060 push esi
4061 push edi
4062 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4063 psllw xmm5, 8
4064 mov eax, 0x80808080 // 128 for biasing image to signed.
4065 movd xmm6, eax
4066 pshufd xmm6, xmm6, 0x00
4067
4068 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4069 movd xmm7, eax
4070 pshufd xmm7, xmm7, 0x00
4071 mov eax, [esp + 8 + 4] // src0
4072 mov edx, [esp + 8 + 8] // src1
4073 mov esi, [esp + 8 + 12] // alpha
4074 mov edi, [esp + 8 + 16] // dst
4075 mov ecx, [esp + 8 + 20] // width
4076 sub eax, esi
4077 sub edx, esi
4078 sub edi, esi
4079
4080 // 8 pixel loop.
4081 convertloop8:
4082 movq xmm0, qword ptr [esi] // alpha
4083 punpcklbw xmm0, xmm0
4084 pxor xmm0, xmm5 // a, 255-a
4085 movq xmm1, qword ptr [eax + esi] // src0
4086 movq xmm2, qword ptr [edx + esi] // src1
4087 punpcklbw xmm1, xmm2
4088 psubb xmm1, xmm6 // bias src0/1 - 128
4089 pmaddubsw xmm0, xmm1
4090 paddw xmm0, xmm7 // unbias result - 32768 and round.
4091 psrlw xmm0, 8
4092 packuswb xmm0, xmm0
4093 movq qword ptr [edi + esi], xmm0
4094 lea esi, [esi + 8]
4095 sub ecx, 8
4096 jg convertloop8
4097
4098 pop edi
4099 pop esi
4100 ret
4101 }
4102 }
4103 #endif // HAS_BLENDPLANEROW_SSSE3
4104
4105 #ifdef HAS_BLENDPLANEROW_AVX2
4106 // Blend 32 pixels at a time.
4107 // unsigned version of math
4108 // =((A2*C2)+(B2*(255-C2))+255)/256
4109 // signed version of math
4110 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4111 __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
4112 const uint8_t* src1,
4113 const uint8_t* alpha,
4114 uint8_t* dst,
4115 int width) {
4116 __asm {
4117 push esi
4118 push edi
4119 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
4120 vpsllw ymm5, ymm5, 8
4121 mov eax, 0x80808080 // 128 for biasing image to signed.
4122 vmovd xmm6, eax
4123 vbroadcastss ymm6, xmm6
4124 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4125 vmovd xmm7, eax
4126 vbroadcastss ymm7, xmm7
4127 mov eax, [esp + 8 + 4] // src0
4128 mov edx, [esp + 8 + 8] // src1
4129 mov esi, [esp + 8 + 12] // alpha
4130 mov edi, [esp + 8 + 16] // dst
4131 mov ecx, [esp + 8 + 20] // width
4132 sub eax, esi
4133 sub edx, esi
4134 sub edi, esi
4135
4136 // 32 pixel loop.
4137 convertloop32:
4138 vmovdqu ymm0, [esi] // alpha
4139 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
4140 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
4141 vpxor ymm3, ymm3, ymm5 // a, 255-a
4142 vpxor ymm0, ymm0, ymm5 // a, 255-a
4143 vmovdqu ymm1, [eax + esi] // src0
4144 vmovdqu ymm2, [edx + esi] // src1
4145 vpunpckhbw ymm4, ymm1, ymm2
4146 vpunpcklbw ymm1, ymm1, ymm2
4147 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
4148 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
4149 vpmaddubsw ymm3, ymm3, ymm4
4150 vpmaddubsw ymm0, ymm0, ymm1
4151 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
4152 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
4153 vpsrlw ymm3, ymm3, 8
4154 vpsrlw ymm0, ymm0, 8
4155 vpackuswb ymm0, ymm0, ymm3
4156 vmovdqu [edi + esi], ymm0
4157 lea esi, [esi + 32]
4158 sub ecx, 32
4159 jg convertloop32
4160
4161 pop edi
4162 pop esi
4163 vzeroupper
4164 ret
4165 }
4166 }
4167 #endif // HAS_BLENDPLANEROW_AVX2
4168
4169 #ifdef HAS_ARGBBLENDROW_SSSE3
4170 // Shuffle table for isolating alpha.
4171 static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4172 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4173
4174 // Blend 8 pixels at a time.
4175 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
4176 const uint8_t* src_argb1,
4177 uint8_t* dst_argb,
4178 int width) {
4179 __asm {
4180 push esi
4181 mov eax, [esp + 4 + 4] // src_argb0
4182 mov esi, [esp + 4 + 8] // src_argb1
4183 mov edx, [esp + 4 + 12] // dst_argb
4184 mov ecx, [esp + 4 + 16] // width
4185 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4186 psrlw xmm7, 15
4187 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4188 psrlw xmm6, 8
4189 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4190 psllw xmm5, 8
4191 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4192 pslld xmm4, 24
4193 sub ecx, 4
4194 jl convertloop4b // less than 4 pixels?
4195
4196 // 4 pixel loop.
4197 convertloop4:
4198 movdqu xmm3, [eax] // src argb
4199 lea eax, [eax + 16]
4200 movdqa xmm0, xmm3 // src argb
4201 pxor xmm3, xmm4 // ~alpha
4202 movdqu xmm2, [esi] // _r_b
4203 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
4204 pand xmm2, xmm6 // _r_b
4205 paddw xmm3, xmm7 // 256 - alpha
4206 pmullw xmm2, xmm3 // _r_b * alpha
4207 movdqu xmm1, [esi] // _a_g
4208 lea esi, [esi + 16]
4209 psrlw xmm1, 8 // _a_g
4210 por xmm0, xmm4 // set alpha to 255
4211 pmullw xmm1, xmm3 // _a_g * alpha
4212 psrlw xmm2, 8 // _r_b convert to 8 bits again
4213 paddusb xmm0, xmm2 // + src argb
4214 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4215 paddusb xmm0, xmm1 // + src argb
4216 movdqu [edx], xmm0
4217 lea edx, [edx + 16]
4218 sub ecx, 4
4219 jge convertloop4
4220
4221 convertloop4b:
4222 add ecx, 4 - 1
4223 jl convertloop1b
4224
4225 // 1 pixel loop.
4226 convertloop1:
4227 movd xmm3, [eax] // src argb
4228 lea eax, [eax + 4]
4229 movdqa xmm0, xmm3 // src argb
4230 pxor xmm3, xmm4 // ~alpha
4231 movd xmm2, [esi] // _r_b
4232 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
4233 pand xmm2, xmm6 // _r_b
4234 paddw xmm3, xmm7 // 256 - alpha
4235 pmullw xmm2, xmm3 // _r_b * alpha
4236 movd xmm1, [esi] // _a_g
4237 lea esi, [esi + 4]
4238 psrlw xmm1, 8 // _a_g
4239 por xmm0, xmm4 // set alpha to 255
4240 pmullw xmm1, xmm3 // _a_g * alpha
4241 psrlw xmm2, 8 // _r_b convert to 8 bits again
4242 paddusb xmm0, xmm2 // + src argb
4243 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4244 paddusb xmm0, xmm1 // + src argb
4245 movd [edx], xmm0
4246 lea edx, [edx + 4]
4247 sub ecx, 1
4248 jge convertloop1
4249
4250 convertloop1b:
4251 pop esi
4252 ret
4253 }
4254 }
4255 #endif // HAS_ARGBBLENDROW_SSSE3
4256
4257 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4258 // Shuffle table duplicating alpha.
4259 static const uvec8 kShuffleAlpha0 = {
4260 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4261 };
4262 static const uvec8 kShuffleAlpha1 = {
4263 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4264 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4265 };
4266 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
4267 uint8_t* dst_argb,
4268 int width) {
4269 __asm {
4270 mov eax, [esp + 4] // src_argb0
4271 mov edx, [esp + 8] // dst_argb
4272 mov ecx, [esp + 12] // width
4273 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4274 pslld xmm3, 24
4275 movdqa xmm4, xmmword ptr kShuffleAlpha0
4276 movdqa xmm5, xmmword ptr kShuffleAlpha1
4277
4278 convertloop:
4279 movdqu xmm0, [eax] // read 4 pixels
4280 pshufb xmm0, xmm4 // isolate first 2 alphas
4281 movdqu xmm1, [eax] // read 4 pixels
4282 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4283 pmulhuw xmm0, xmm1 // rgb * a
4284 movdqu xmm1, [eax] // read 4 pixels
4285 pshufb xmm1, xmm5 // isolate next 2 alphas
4286 movdqu xmm2, [eax] // read 4 pixels
4287 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4288 pmulhuw xmm1, xmm2 // rgb * a
4289 movdqu xmm2, [eax] // mask original alpha
4290 lea eax, [eax + 16]
4291 pand xmm2, xmm3
4292 psrlw xmm0, 8
4293 psrlw xmm1, 8
4294 packuswb xmm0, xmm1
4295 por xmm0, xmm2 // copy original alpha
4296 movdqu [edx], xmm0
4297 lea edx, [edx + 16]
4298 sub ecx, 4
4299 jg convertloop
4300
4301 ret
4302 }
4303 }
4304 #endif // HAS_ARGBATTENUATEROW_SSSE3
4305
4306 #ifdef HAS_ARGBATTENUATEROW_AVX2
4307 // Shuffle table duplicating alpha.
4308 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
4309 128u, 128u, 14u, 15u, 14u, 15u,
4310 14u, 15u, 128u, 128u};
4311 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
4312 uint8_t* dst_argb,
4313 int width) {
4314 __asm {
4315 mov eax, [esp + 4] // src_argb0
4316 mov edx, [esp + 8] // dst_argb
4317 mov ecx, [esp + 12] // width
4318 sub edx, eax
4319 vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4320 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4321 vpslld ymm5, ymm5, 24
4322
4323 convertloop:
4324 vmovdqu ymm6, [eax] // read 8 pixels.
4325 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4326 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4327 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4328 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4329 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4330 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4331 vpand ymm6, ymm6, ymm5 // isolate alpha
4332 vpsrlw ymm0, ymm0, 8
4333 vpsrlw ymm1, ymm1, 8
4334 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4335 vpor ymm0, ymm0, ymm6 // copy original alpha
4336 vmovdqu [eax + edx], ymm0
4337 lea eax, [eax + 32]
4338 sub ecx, 8
4339 jg convertloop
4340
4341 vzeroupper
4342 ret
4343 }
4344 }
4345 #endif // HAS_ARGBATTENUATEROW_AVX2
4346
4347 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4348 // Unattenuate 4 pixels at a time.
4349 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
4350 uint8_t* dst_argb,
4351 int width) {
4352 __asm {
4353 push ebx
4354 push esi
4355 push edi
4356 mov eax, [esp + 12 + 4] // src_argb
4357 mov edx, [esp + 12 + 8] // dst_argb
4358 mov ecx, [esp + 12 + 12] // width
4359 lea ebx, fixed_invtbl8
4360
4361 convertloop:
4362 movdqu xmm0, [eax] // read 4 pixels
4363 movzx esi, byte ptr [eax + 3] // first alpha
4364 movzx edi, byte ptr [eax + 7] // second alpha
4365 punpcklbw xmm0, xmm0 // first 2
4366 movd xmm2, dword ptr [ebx + esi * 4]
4367 movd xmm3, dword ptr [ebx + edi * 4]
4368 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4369 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4370 movlhps xmm2, xmm3
4371 pmulhuw xmm0, xmm2 // rgb * a
4372
4373 movdqu xmm1, [eax] // read 4 pixels
4374 movzx esi, byte ptr [eax + 11] // third alpha
4375 movzx edi, byte ptr [eax + 15] // forth alpha
4376 punpckhbw xmm1, xmm1 // next 2
4377 movd xmm2, dword ptr [ebx + esi * 4]
4378 movd xmm3, dword ptr [ebx + edi * 4]
4379 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4380 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4381 movlhps xmm2, xmm3
4382 pmulhuw xmm1, xmm2 // rgb * a
4383 lea eax, [eax + 16]
4384 packuswb xmm0, xmm1
4385 movdqu [edx], xmm0
4386 lea edx, [edx + 16]
4387 sub ecx, 4
4388 jg convertloop
4389
4390 pop edi
4391 pop esi
4392 pop ebx
4393 ret
4394 }
4395 }
4396 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4397
4398 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4399 // Shuffle table duplicating alpha.
4400 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4401 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4402 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4403 // USE_GATHER is not on by default, due to being a slow instruction.
4404 #ifdef USE_GATHER
4405 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4406 uint8_t* dst_argb,
4407 int width) {
4408 __asm {
4409 mov eax, [esp + 4] // src_argb0
4410 mov edx, [esp + 8] // dst_argb
4411 mov ecx, [esp + 12] // width
4412 sub edx, eax
4413 vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4414
4415 convertloop:
4416 vmovdqu ymm6, [eax] // read 8 pixels.
4417 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4418 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4419 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4420 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4421 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4422 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4423 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4424 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4425 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4426 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4427 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4428 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4429 vmovdqu [eax + edx], ymm0
4430 lea eax, [eax + 32]
4431 sub ecx, 8
4432 jg convertloop
4433
4434 vzeroupper
4435 ret
4436 }
4437 }
4438 #else // USE_GATHER
4439 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4440 uint8_t* dst_argb,
4441 int width) {
4442 __asm {
4443
4444 push ebx
4445 push esi
4446 push edi
4447 mov eax, [esp + 12 + 4] // src_argb
4448 mov edx, [esp + 12 + 8] // dst_argb
4449 mov ecx, [esp + 12 + 12] // width
4450 sub edx, eax
4451 lea ebx, fixed_invtbl8
4452 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4453
4454 convertloop:
4455 // replace VPGATHER
4456 movzx esi, byte ptr [eax + 3] // alpha0
4457 movzx edi, byte ptr [eax + 7] // alpha1
4458 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
4459 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
4460 movzx esi, byte ptr [eax + 11] // alpha2
4461 movzx edi, byte ptr [eax + 15] // alpha3
4462 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4463 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
4464 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
4465 movzx esi, byte ptr [eax + 19] // alpha4
4466 movzx edi, byte ptr [eax + 23] // alpha5
4467 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4468 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
4469 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
4470 movzx esi, byte ptr [eax + 27] // alpha6
4471 movzx edi, byte ptr [eax + 31] // alpha7
4472 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4473 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
4474 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
4475 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4476 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4477 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4478 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4479 // end of VPGATHER
4480
4481 vmovdqu ymm6, [eax] // read 8 pixels.
4482 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4483 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4484 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4485 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4486 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
4487 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
4488 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4489 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4490 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4491 vmovdqu [eax + edx], ymm0
4492 lea eax, [eax + 32]
4493 sub ecx, 8
4494 jg convertloop
4495
4496 pop edi
4497 pop esi
4498 pop ebx
4499 vzeroupper
4500 ret
4501 }
4502 }
4503 #endif // USE_GATHER
4504 #endif // HAS_ARGBATTENUATEROW_AVX2
4505
4506 #ifdef HAS_ARGBGRAYROW_SSSE3
4507 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4508 __declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
4509 uint8_t* dst_argb,
4510 int width) {
4511 __asm {
4512 mov eax, [esp + 4] /* src_argb */
4513 mov edx, [esp + 8] /* dst_argb */
4514 mov ecx, [esp + 12] /* width */
4515 movdqa xmm4, xmmword ptr kARGBToYJ
4516 movdqa xmm5, xmmword ptr kAddYJ64
4517
4518 convertloop:
4519 movdqu xmm0, [eax] // G
4520 movdqu xmm1, [eax + 16]
4521 pmaddubsw xmm0, xmm4
4522 pmaddubsw xmm1, xmm4
4523 phaddw xmm0, xmm1
4524 paddw xmm0, xmm5 // Add .5 for rounding.
4525 psrlw xmm0, 7
4526 packuswb xmm0, xmm0 // 8 G bytes
4527 movdqu xmm2, [eax] // A
4528 movdqu xmm3, [eax + 16]
4529 lea eax, [eax + 32]
4530 psrld xmm2, 24
4531 psrld xmm3, 24
4532 packuswb xmm2, xmm3
4533 packuswb xmm2, xmm2 // 8 A bytes
4534 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
4535 punpcklbw xmm0, xmm0 // 8 GG words
4536 punpcklbw xmm3, xmm2 // 8 GA words
4537 movdqa xmm1, xmm0
4538 punpcklwd xmm0, xmm3 // GGGA first 4
4539 punpckhwd xmm1, xmm3 // GGGA next 4
4540 movdqu [edx], xmm0
4541 movdqu [edx + 16], xmm1
4542 lea edx, [edx + 32]
4543 sub ecx, 8
4544 jg convertloop
4545 ret
4546 }
4547 }
4548 #endif // HAS_ARGBGRAYROW_SSSE3
4549
4550 #ifdef HAS_ARGBSEPIAROW_SSSE3
4551 // b = (r * 35 + g * 68 + b * 17) >> 7
4552 // g = (r * 45 + g * 88 + b * 22) >> 7
4553 // r = (r * 50 + g * 98 + b * 24) >> 7
4554 // Constant for ARGB color to sepia tone.
4555 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
4556 17, 68, 35, 0, 17, 68, 35, 0};
4557
4558 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
4559 22, 88, 45, 0, 22, 88, 45, 0};
4560
4561 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
4562 24, 98, 50, 0, 24, 98, 50, 0};
4563
4564 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4565 __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
4566 __asm {
4567 mov eax, [esp + 4] /* dst_argb */
4568 mov ecx, [esp + 8] /* width */
4569 movdqa xmm2, xmmword ptr kARGBToSepiaB
4570 movdqa xmm3, xmmword ptr kARGBToSepiaG
4571 movdqa xmm4, xmmword ptr kARGBToSepiaR
4572
4573 convertloop:
4574 movdqu xmm0, [eax] // B
4575 movdqu xmm6, [eax + 16]
4576 pmaddubsw xmm0, xmm2
4577 pmaddubsw xmm6, xmm2
4578 phaddw xmm0, xmm6
4579 psrlw xmm0, 7
4580 packuswb xmm0, xmm0 // 8 B values
4581 movdqu xmm5, [eax] // G
4582 movdqu xmm1, [eax + 16]
4583 pmaddubsw xmm5, xmm3
4584 pmaddubsw xmm1, xmm3
4585 phaddw xmm5, xmm1
4586 psrlw xmm5, 7
4587 packuswb xmm5, xmm5 // 8 G values
4588 punpcklbw xmm0, xmm5 // 8 BG values
4589 movdqu xmm5, [eax] // R
4590 movdqu xmm1, [eax + 16]
4591 pmaddubsw xmm5, xmm4
4592 pmaddubsw xmm1, xmm4
4593 phaddw xmm5, xmm1
4594 psrlw xmm5, 7
4595 packuswb xmm5, xmm5 // 8 R values
4596 movdqu xmm6, [eax] // A
4597 movdqu xmm1, [eax + 16]
4598 psrld xmm6, 24
4599 psrld xmm1, 24
4600 packuswb xmm6, xmm1
4601 packuswb xmm6, xmm6 // 8 A values
4602 punpcklbw xmm5, xmm6 // 8 RA values
4603 movdqa xmm1, xmm0 // Weave BG, RA together
4604 punpcklwd xmm0, xmm5 // BGRA first 4
4605 punpckhwd xmm1, xmm5 // BGRA next 4
4606 movdqu [eax], xmm0
4607 movdqu [eax + 16], xmm1
4608 lea eax, [eax + 32]
4609 sub ecx, 8
4610 jg convertloop
4611 ret
4612 }
4613 }
4614 #endif // HAS_ARGBSEPIAROW_SSSE3
4615
4616 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4617 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
4618 // Same as Sepia except matrix is provided.
4619 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4620 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
4621 __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
4622 uint8_t* dst_argb,
4623 const int8_t* matrix_argb,
4624 int width) {
4625 __asm {
4626 mov eax, [esp + 4] /* src_argb */
4627 mov edx, [esp + 8] /* dst_argb */
4628 mov ecx, [esp + 12] /* matrix_argb */
4629 movdqu xmm5, [ecx]
4630 pshufd xmm2, xmm5, 0x00
4631 pshufd xmm3, xmm5, 0x55
4632 pshufd xmm4, xmm5, 0xaa
4633 pshufd xmm5, xmm5, 0xff
4634 mov ecx, [esp + 16] /* width */
4635
4636 convertloop:
4637 movdqu xmm0, [eax] // B
4638 movdqu xmm7, [eax + 16]
4639 pmaddubsw xmm0, xmm2
4640 pmaddubsw xmm7, xmm2
4641 movdqu xmm6, [eax] // G
4642 movdqu xmm1, [eax + 16]
4643 pmaddubsw xmm6, xmm3
4644 pmaddubsw xmm1, xmm3
4645 phaddsw xmm0, xmm7 // B
4646 phaddsw xmm6, xmm1 // G
4647 psraw xmm0, 6 // B
4648 psraw xmm6, 6 // G
4649 packuswb xmm0, xmm0 // 8 B values
4650 packuswb xmm6, xmm6 // 8 G values
4651 punpcklbw xmm0, xmm6 // 8 BG values
4652 movdqu xmm1, [eax] // R
4653 movdqu xmm7, [eax + 16]
4654 pmaddubsw xmm1, xmm4
4655 pmaddubsw xmm7, xmm4
4656 phaddsw xmm1, xmm7 // R
4657 movdqu xmm6, [eax] // A
4658 movdqu xmm7, [eax + 16]
4659 pmaddubsw xmm6, xmm5
4660 pmaddubsw xmm7, xmm5
4661 phaddsw xmm6, xmm7 // A
4662 psraw xmm1, 6 // R
4663 psraw xmm6, 6 // A
4664 packuswb xmm1, xmm1 // 8 R values
4665 packuswb xmm6, xmm6 // 8 A values
4666 punpcklbw xmm1, xmm6 // 8 RA values
4667 movdqa xmm6, xmm0 // Weave BG, RA together
4668 punpcklwd xmm0, xmm1 // BGRA first 4
4669 punpckhwd xmm6, xmm1 // BGRA next 4
4670 movdqu [edx], xmm0
4671 movdqu [edx + 16], xmm6
4672 lea eax, [eax + 32]
4673 lea edx, [edx + 32]
4674 sub ecx, 8
4675 jg convertloop
4676 ret
4677 }
4678 }
4679 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
4680
4681 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4682 // Quantize 4 ARGB pixels (16 bytes).
4683 __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
4684 int scale,
4685 int interval_size,
4686 int interval_offset,
4687 int width) {
4688 __asm {
4689 mov eax, [esp + 4] /* dst_argb */
4690 movd xmm2, [esp + 8] /* scale */
4691 movd xmm3, [esp + 12] /* interval_size */
4692 movd xmm4, [esp + 16] /* interval_offset */
4693 mov ecx, [esp + 20] /* width */
4694 pshuflw xmm2, xmm2, 040h
4695 pshufd xmm2, xmm2, 044h
4696 pshuflw xmm3, xmm3, 040h
4697 pshufd xmm3, xmm3, 044h
4698 pshuflw xmm4, xmm4, 040h
4699 pshufd xmm4, xmm4, 044h
4700 pxor xmm5, xmm5 // constant 0
4701 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
4702 pslld xmm6, 24
4703
4704 convertloop:
4705 movdqu xmm0, [eax] // read 4 pixels
4706 punpcklbw xmm0, xmm5 // first 2 pixels
4707 pmulhuw xmm0, xmm2 // pixel * scale >> 16
4708 movdqu xmm1, [eax] // read 4 pixels
4709 punpckhbw xmm1, xmm5 // next 2 pixels
4710 pmulhuw xmm1, xmm2
4711 pmullw xmm0, xmm3 // * interval_size
4712 movdqu xmm7, [eax] // read 4 pixels
4713 pmullw xmm1, xmm3
4714 pand xmm7, xmm6 // mask alpha
4715 paddw xmm0, xmm4 // + interval_size / 2
4716 paddw xmm1, xmm4
4717 packuswb xmm0, xmm1
4718 por xmm0, xmm7
4719 movdqu [eax], xmm0
4720 lea eax, [eax + 16]
4721 sub ecx, 4
4722 jg convertloop
4723 ret
4724 }
4725 }
4726 #endif // HAS_ARGBQUANTIZEROW_SSE2
4727
4728 #ifdef HAS_ARGBSHADEROW_SSE2
4729 // Shade 4 pixels at a time by specified value.
4730 __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
4731 uint8_t* dst_argb,
4732 int width,
4733 uint32_t value) {
4734 __asm {
4735 mov eax, [esp + 4] // src_argb
4736 mov edx, [esp + 8] // dst_argb
4737 mov ecx, [esp + 12] // width
4738 movd xmm2, [esp + 16] // value
4739 punpcklbw xmm2, xmm2
4740 punpcklqdq xmm2, xmm2
4741
4742 convertloop:
4743 movdqu xmm0, [eax] // read 4 pixels
4744 lea eax, [eax + 16]
4745 movdqa xmm1, xmm0
4746 punpcklbw xmm0, xmm0 // first 2
4747 punpckhbw xmm1, xmm1 // next 2
4748 pmulhuw xmm0, xmm2 // argb * value
4749 pmulhuw xmm1, xmm2 // argb * value
4750 psrlw xmm0, 8
4751 psrlw xmm1, 8
4752 packuswb xmm0, xmm1
4753 movdqu [edx], xmm0
4754 lea edx, [edx + 16]
4755 sub ecx, 4
4756 jg convertloop
4757
4758 ret
4759 }
4760 }
4761 #endif // HAS_ARGBSHADEROW_SSE2
4762
4763 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4764 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4765 __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
4766 const uint8_t* src_argb1,
4767 uint8_t* dst_argb,
4768 int width) {
4769 __asm {
4770 push esi
4771 mov eax, [esp + 4 + 4] // src_argb0
4772 mov esi, [esp + 4 + 8] // src_argb1
4773 mov edx, [esp + 4 + 12] // dst_argb
4774 mov ecx, [esp + 4 + 16] // width
4775 pxor xmm5, xmm5 // constant 0
4776
4777 convertloop:
4778 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4779 movdqu xmm2, [esi] // read 4 pixels from src_argb1
4780 movdqu xmm1, xmm0
4781 movdqu xmm3, xmm2
4782 punpcklbw xmm0, xmm0 // first 2
4783 punpckhbw xmm1, xmm1 // next 2
4784 punpcklbw xmm2, xmm5 // first 2
4785 punpckhbw xmm3, xmm5 // next 2
4786 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
4787 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
4788 lea eax, [eax + 16]
4789 lea esi, [esi + 16]
4790 packuswb xmm0, xmm1
4791 movdqu [edx], xmm0
4792 lea edx, [edx + 16]
4793 sub ecx, 4
4794 jg convertloop
4795
4796 pop esi
4797 ret
4798 }
4799 }
4800 #endif // HAS_ARGBMULTIPLYROW_SSE2
4801
4802 #ifdef HAS_ARGBADDROW_SSE2
4803 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4804 // TODO(fbarchard): Port this to posix, neon and other math functions.
4805 __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
4806 const uint8_t* src_argb1,
4807 uint8_t* dst_argb,
4808 int width) {
4809 __asm {
4810 push esi
4811 mov eax, [esp + 4 + 4] // src_argb0
4812 mov esi, [esp + 4 + 8] // src_argb1
4813 mov edx, [esp + 4 + 12] // dst_argb
4814 mov ecx, [esp + 4 + 16] // width
4815
4816 sub ecx, 4
4817 jl convertloop49
4818
4819 convertloop4:
4820 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4821 lea eax, [eax + 16]
4822 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4823 lea esi, [esi + 16]
4824 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4825 movdqu [edx], xmm0
4826 lea edx, [edx + 16]
4827 sub ecx, 4
4828 jge convertloop4
4829
4830 convertloop49:
4831 add ecx, 4 - 1
4832 jl convertloop19
4833
4834 convertloop1:
4835 movd xmm0, [eax] // read 1 pixels from src_argb0
4836 lea eax, [eax + 4]
4837 movd xmm1, [esi] // read 1 pixels from src_argb1
4838 lea esi, [esi + 4]
4839 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4840 movd [edx], xmm0
4841 lea edx, [edx + 4]
4842 sub ecx, 1
4843 jge convertloop1
4844
4845 convertloop19:
4846 pop esi
4847 ret
4848 }
4849 }
4850 #endif // HAS_ARGBADDROW_SSE2
4851
4852 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4853 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4854 __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
4855 const uint8_t* src_argb1,
4856 uint8_t* dst_argb,
4857 int width) {
4858 __asm {
4859 push esi
4860 mov eax, [esp + 4 + 4] // src_argb0
4861 mov esi, [esp + 4 + 8] // src_argb1
4862 mov edx, [esp + 4 + 12] // dst_argb
4863 mov ecx, [esp + 4 + 16] // width
4864
4865 convertloop:
4866 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4867 lea eax, [eax + 16]
4868 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4869 lea esi, [esi + 16]
4870 psubusb xmm0, xmm1 // src_argb0 - src_argb1
4871 movdqu [edx], xmm0
4872 lea edx, [edx + 16]
4873 sub ecx, 4
4874 jg convertloop
4875
4876 pop esi
4877 ret
4878 }
4879 }
4880 #endif // HAS_ARGBSUBTRACTROW_SSE2
4881
4882 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4883 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4884 __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
4885 const uint8_t* src_argb1,
4886 uint8_t* dst_argb,
4887 int width) {
4888 __asm {
4889 push esi
4890 mov eax, [esp + 4 + 4] // src_argb0
4891 mov esi, [esp + 4 + 8] // src_argb1
4892 mov edx, [esp + 4 + 12] // dst_argb
4893 mov ecx, [esp + 4 + 16] // width
4894 vpxor ymm5, ymm5, ymm5 // constant 0
4895
4896 convertloop:
4897 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
4898 lea eax, [eax + 32]
4899 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
4900 lea esi, [esi + 32]
4901 vpunpcklbw ymm0, ymm1, ymm1 // low 4
4902 vpunpckhbw ymm1, ymm1, ymm1 // high 4
4903 vpunpcklbw ymm2, ymm3, ymm5 // low 4
4904 vpunpckhbw ymm3, ymm3, ymm5 // high 4
4905 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
4906 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
4907 vpackuswb ymm0, ymm0, ymm1
4908 vmovdqu [edx], ymm0
4909 lea edx, [edx + 32]
4910 sub ecx, 8
4911 jg convertloop
4912
4913 pop esi
4914 vzeroupper
4915 ret
4916 }
4917 }
4918 #endif // HAS_ARGBMULTIPLYROW_AVX2
4919
4920 #ifdef HAS_ARGBADDROW_AVX2
4921 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4922 __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
4923 const uint8_t* src_argb1,
4924 uint8_t* dst_argb,
4925 int width) {
4926 __asm {
4927 push esi
4928 mov eax, [esp + 4 + 4] // src_argb0
4929 mov esi, [esp + 4 + 8] // src_argb1
4930 mov edx, [esp + 4 + 12] // dst_argb
4931 mov ecx, [esp + 4 + 16] // width
4932
4933 convertloop:
4934 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4935 lea eax, [eax + 32]
4936 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
4937 lea esi, [esi + 32]
4938 vmovdqu [edx], ymm0
4939 lea edx, [edx + 32]
4940 sub ecx, 8
4941 jg convertloop
4942
4943 pop esi
4944 vzeroupper
4945 ret
4946 }
4947 }
4948 #endif // HAS_ARGBADDROW_AVX2
4949
4950 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4951 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4952 __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
4953 const uint8_t* src_argb1,
4954 uint8_t* dst_argb,
4955 int width) {
4956 __asm {
4957 push esi
4958 mov eax, [esp + 4 + 4] // src_argb0
4959 mov esi, [esp + 4 + 8] // src_argb1
4960 mov edx, [esp + 4 + 12] // dst_argb
4961 mov ecx, [esp + 4 + 16] // width
4962
4963 convertloop:
4964 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4965 lea eax, [eax + 32]
4966 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
4967 lea esi, [esi + 32]
4968 vmovdqu [edx], ymm0
4969 lea edx, [edx + 32]
4970 sub ecx, 8
4971 jg convertloop
4972
4973 pop esi
4974 vzeroupper
4975 ret
4976 }
4977 }
4978 #endif // HAS_ARGBSUBTRACTROW_AVX2
4979
4980 #ifdef HAS_SOBELXROW_SSE2
4981 // SobelX as a matrix is
4982 // -1 0 1
4983 // -2 0 2
4984 // -1 0 1
4985 __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
4986 const uint8_t* src_y1,
4987 const uint8_t* src_y2,
4988 uint8_t* dst_sobelx,
4989 int width) {
4990 __asm {
4991 push esi
4992 push edi
4993 mov eax, [esp + 8 + 4] // src_y0
4994 mov esi, [esp + 8 + 8] // src_y1
4995 mov edi, [esp + 8 + 12] // src_y2
4996 mov edx, [esp + 8 + 16] // dst_sobelx
4997 mov ecx, [esp + 8 + 20] // width
4998 sub esi, eax
4999 sub edi, eax
5000 sub edx, eax
5001 pxor xmm5, xmm5 // constant 0
5002
5003 convertloop:
5004 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5005 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5006 punpcklbw xmm0, xmm5
5007 punpcklbw xmm1, xmm5
5008 psubw xmm0, xmm1
5009 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5010 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5011 punpcklbw xmm1, xmm5
5012 punpcklbw xmm2, xmm5
5013 psubw xmm1, xmm2
5014 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
5015 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
5016 punpcklbw xmm2, xmm5
5017 punpcklbw xmm3, xmm5
5018 psubw xmm2, xmm3
5019 paddw xmm0, xmm2
5020 paddw xmm0, xmm1
5021 paddw xmm0, xmm1
5022 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5023 psubw xmm1, xmm0
5024 pmaxsw xmm0, xmm1
5025 packuswb xmm0, xmm0
5026 movq qword ptr [eax + edx], xmm0
5027 lea eax, [eax + 8]
5028 sub ecx, 8
5029 jg convertloop
5030
5031 pop edi
5032 pop esi
5033 ret
5034 }
5035 }
5036 #endif // HAS_SOBELXROW_SSE2
5037
5038 #ifdef HAS_SOBELYROW_SSE2
5039 // SobelY as a matrix is
5040 // -1 -2 -1
5041 // 0 0 0
5042 // 1 2 1
5043 __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
5044 const uint8_t* src_y1,
5045 uint8_t* dst_sobely,
5046 int width) {
5047 __asm {
5048 push esi
5049 mov eax, [esp + 4 + 4] // src_y0
5050 mov esi, [esp + 4 + 8] // src_y1
5051 mov edx, [esp + 4 + 12] // dst_sobely
5052 mov ecx, [esp + 4 + 16] // width
5053 sub esi, eax
5054 sub edx, eax
5055 pxor xmm5, xmm5 // constant 0
5056
5057 convertloop:
5058 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5059 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5060 punpcklbw xmm0, xmm5
5061 punpcklbw xmm1, xmm5
5062 psubw xmm0, xmm1
5063 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5064 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5065 punpcklbw xmm1, xmm5
5066 punpcklbw xmm2, xmm5
5067 psubw xmm1, xmm2
5068 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5069 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5070 punpcklbw xmm2, xmm5
5071 punpcklbw xmm3, xmm5
5072 psubw xmm2, xmm3
5073 paddw xmm0, xmm2
5074 paddw xmm0, xmm1
5075 paddw xmm0, xmm1
5076 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5077 psubw xmm1, xmm0
5078 pmaxsw xmm0, xmm1
5079 packuswb xmm0, xmm0
5080 movq qword ptr [eax + edx], xmm0
5081 lea eax, [eax + 8]
5082 sub ecx, 8
5083 jg convertloop
5084
5085 pop esi
5086 ret
5087 }
5088 }
5089 #endif // HAS_SOBELYROW_SSE2
5090
5091 #ifdef HAS_SOBELROW_SSE2
5092 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5093 // A = 255
5094 // R = Sobel
5095 // G = Sobel
5096 // B = Sobel
5097 __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
5098 const uint8_t* src_sobely,
5099 uint8_t* dst_argb,
5100 int width) {
5101 __asm {
5102 push esi
5103 mov eax, [esp + 4 + 4] // src_sobelx
5104 mov esi, [esp + 4 + 8] // src_sobely
5105 mov edx, [esp + 4 + 12] // dst_argb
5106 mov ecx, [esp + 4 + 16] // width
5107 sub esi, eax
5108 pcmpeqb xmm5, xmm5 // alpha 255
5109 pslld xmm5, 24 // 0xff000000
5110
5111 convertloop:
5112 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5113 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5114 lea eax, [eax + 16]
5115 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5116 movdqa xmm2, xmm0 // GG
5117 punpcklbw xmm2, xmm0 // First 8
5118 punpckhbw xmm0, xmm0 // Next 8
5119 movdqa xmm1, xmm2 // GGGG
5120 punpcklwd xmm1, xmm2 // First 4
5121 punpckhwd xmm2, xmm2 // Next 4
5122 por xmm1, xmm5 // GGGA
5123 por xmm2, xmm5
5124 movdqa xmm3, xmm0 // GGGG
5125 punpcklwd xmm3, xmm0 // Next 4
5126 punpckhwd xmm0, xmm0 // Last 4
5127 por xmm3, xmm5 // GGGA
5128 por xmm0, xmm5
5129 movdqu [edx], xmm1
5130 movdqu [edx + 16], xmm2
5131 movdqu [edx + 32], xmm3
5132 movdqu [edx + 48], xmm0
5133 lea edx, [edx + 64]
5134 sub ecx, 16
5135 jg convertloop
5136
5137 pop esi
5138 ret
5139 }
5140 }
5141 #endif // HAS_SOBELROW_SSE2
5142
5143 #ifdef HAS_SOBELTOPLANEROW_SSE2
5144 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5145 __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
5146 const uint8_t* src_sobely,
5147 uint8_t* dst_y,
5148 int width) {
5149 __asm {
5150 push esi
5151 mov eax, [esp + 4 + 4] // src_sobelx
5152 mov esi, [esp + 4 + 8] // src_sobely
5153 mov edx, [esp + 4 + 12] // dst_argb
5154 mov ecx, [esp + 4 + 16] // width
5155 sub esi, eax
5156
5157 convertloop:
5158 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5159 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5160 lea eax, [eax + 16]
5161 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5162 movdqu [edx], xmm0
5163 lea edx, [edx + 16]
5164 sub ecx, 16
5165 jg convertloop
5166
5167 pop esi
5168 ret
5169 }
5170 }
5171 #endif // HAS_SOBELTOPLANEROW_SSE2
5172
5173 #ifdef HAS_SOBELXYROW_SSE2
5174 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5175 // A = 255
5176 // R = Sobel X
5177 // G = Sobel
5178 // B = Sobel Y
5179 __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
5180 const uint8_t* src_sobely,
5181 uint8_t* dst_argb,
5182 int width) {
5183 __asm {
5184 push esi
5185 mov eax, [esp + 4 + 4] // src_sobelx
5186 mov esi, [esp + 4 + 8] // src_sobely
5187 mov edx, [esp + 4 + 12] // dst_argb
5188 mov ecx, [esp + 4 + 16] // width
5189 sub esi, eax
5190 pcmpeqb xmm5, xmm5 // alpha 255
5191
5192 convertloop:
5193 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5194 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5195 lea eax, [eax + 16]
5196 movdqa xmm2, xmm0
5197 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5198 movdqa xmm3, xmm0 // XA
5199 punpcklbw xmm3, xmm5
5200 punpckhbw xmm0, xmm5
5201 movdqa xmm4, xmm1 // YS
5202 punpcklbw xmm4, xmm2
5203 punpckhbw xmm1, xmm2
5204 movdqa xmm6, xmm4 // YSXA
5205 punpcklwd xmm6, xmm3 // First 4
5206 punpckhwd xmm4, xmm3 // Next 4
5207 movdqa xmm7, xmm1 // YSXA
5208 punpcklwd xmm7, xmm0 // Next 4
5209 punpckhwd xmm1, xmm0 // Last 4
5210 movdqu [edx], xmm6
5211 movdqu [edx + 16], xmm4
5212 movdqu [edx + 32], xmm7
5213 movdqu [edx + 48], xmm1
5214 lea edx, [edx + 64]
5215 sub ecx, 16
5216 jg convertloop
5217
5218 pop esi
5219 ret
5220 }
5221 }
5222 #endif // HAS_SOBELXYROW_SSE2
5223
5224 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5225 // Consider float CumulativeSum.
5226 // Consider calling CumulativeSum one row at time as needed.
5227 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5228 // Convert cumulative sum for an area to an average for 1 pixel.
5229 // topleft is pointer to top left of CumulativeSum buffer for area.
5230 // botleft is pointer to bottom left of CumulativeSum buffer.
5231 // width is offset from left to right of area in CumulativeSum buffer measured
5232 // in number of ints.
5233 // area is the number of pixels in the area being averaged.
5234 // dst points to pixel to store result to.
5235 // count is number of averaged pixels to produce.
5236 // Does 4 pixels at a time.
5237 // This function requires alignment on accumulation buffer pointers.
5238 void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
5239 const int32_t* botleft,
5240 int width,
5241 int area,
5242 uint8_t* dst,
5243 int count) {
5244 __asm {
5245 mov eax, topleft // eax topleft
5246 mov esi, botleft // esi botleft
5247 mov edx, width
5248 movd xmm5, area
5249 mov edi, dst
5250 mov ecx, count
5251 cvtdq2ps xmm5, xmm5
5252 rcpss xmm4, xmm5 // 1.0f / area
5253 pshufd xmm4, xmm4, 0
5254 sub ecx, 4
5255 jl l4b
5256
5257 cmp area, 128 // 128 pixels will not overflow 15 bits.
5258 ja l4
5259
5260 pshufd xmm5, xmm5, 0 // area
5261 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5262 psrld xmm6, 16
5263 cvtdq2ps xmm6, xmm6
5264 addps xmm5, xmm6 // (65536.0 + area - 1)
5265 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5266 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5267 packssdw xmm5, xmm5 // 16 bit shorts
5268
5269 // 4 pixel loop small blocks.
5270 s4:
5271 // top left
5272 movdqu xmm0, [eax]
5273 movdqu xmm1, [eax + 16]
5274 movdqu xmm2, [eax + 32]
5275 movdqu xmm3, [eax + 48]
5276
5277 // - top right
5278 psubd xmm0, [eax + edx * 4]
5279 psubd xmm1, [eax + edx * 4 + 16]
5280 psubd xmm2, [eax + edx * 4 + 32]
5281 psubd xmm3, [eax + edx * 4 + 48]
5282 lea eax, [eax + 64]
5283
5284 // - bottom left
5285 psubd xmm0, [esi]
5286 psubd xmm1, [esi + 16]
5287 psubd xmm2, [esi + 32]
5288 psubd xmm3, [esi + 48]
5289
5290 // + bottom right
5291 paddd xmm0, [esi + edx * 4]
5292 paddd xmm1, [esi + edx * 4 + 16]
5293 paddd xmm2, [esi + edx * 4 + 32]
5294 paddd xmm3, [esi + edx * 4 + 48]
5295 lea esi, [esi + 64]
5296
5297 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
5298 packssdw xmm2, xmm3
5299
5300 pmulhuw xmm0, xmm5
5301 pmulhuw xmm2, xmm5
5302
5303 packuswb xmm0, xmm2
5304 movdqu [edi], xmm0
5305 lea edi, [edi + 16]
5306 sub ecx, 4
5307 jge s4
5308
5309 jmp l4b
5310
5311 // 4 pixel loop
5312 l4:
5313 // top left
5314 movdqu xmm0, [eax]
5315 movdqu xmm1, [eax + 16]
5316 movdqu xmm2, [eax + 32]
5317 movdqu xmm3, [eax + 48]
5318
5319 // - top right
5320 psubd xmm0, [eax + edx * 4]
5321 psubd xmm1, [eax + edx * 4 + 16]
5322 psubd xmm2, [eax + edx * 4 + 32]
5323 psubd xmm3, [eax + edx * 4 + 48]
5324 lea eax, [eax + 64]
5325
5326 // - bottom left
5327 psubd xmm0, [esi]
5328 psubd xmm1, [esi + 16]
5329 psubd xmm2, [esi + 32]
5330 psubd xmm3, [esi + 48]
5331
5332 // + bottom right
5333 paddd xmm0, [esi + edx * 4]
5334 paddd xmm1, [esi + edx * 4 + 16]
5335 paddd xmm2, [esi + edx * 4 + 32]
5336 paddd xmm3, [esi + edx * 4 + 48]
5337 lea esi, [esi + 64]
5338
5339 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
5340 cvtdq2ps xmm1, xmm1
5341 mulps xmm0, xmm4
5342 mulps xmm1, xmm4
5343 cvtdq2ps xmm2, xmm2
5344 cvtdq2ps xmm3, xmm3
5345 mulps xmm2, xmm4
5346 mulps xmm3, xmm4
5347 cvtps2dq xmm0, xmm0
5348 cvtps2dq xmm1, xmm1
5349 cvtps2dq xmm2, xmm2
5350 cvtps2dq xmm3, xmm3
5351 packssdw xmm0, xmm1
5352 packssdw xmm2, xmm3
5353 packuswb xmm0, xmm2
5354 movdqu [edi], xmm0
5355 lea edi, [edi + 16]
5356 sub ecx, 4
5357 jge l4
5358
5359 l4b:
5360 add ecx, 4 - 1
5361 jl l1b
5362
5363 // 1 pixel loop
5364 l1:
5365 movdqu xmm0, [eax]
5366 psubd xmm0, [eax + edx * 4]
5367 lea eax, [eax + 16]
5368 psubd xmm0, [esi]
5369 paddd xmm0, [esi + edx * 4]
5370 lea esi, [esi + 16]
5371 cvtdq2ps xmm0, xmm0
5372 mulps xmm0, xmm4
5373 cvtps2dq xmm0, xmm0
5374 packssdw xmm0, xmm0
5375 packuswb xmm0, xmm0
5376 movd dword ptr [edi], xmm0
5377 lea edi, [edi + 4]
5378 sub ecx, 1
5379 jge l1
5380 l1b:
5381 }
5382 }
5383 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5384
5385 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5386 // Creates a table of cumulative sums where each value is a sum of all values
5387 // above and to the left of the value.
5388 void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
5389 int32_t* cumsum,
5390 const int32_t* previous_cumsum,
5391 int width) {
5392 __asm {
5393 mov eax, row
5394 mov edx, cumsum
5395 mov esi, previous_cumsum
5396 mov ecx, width
5397 pxor xmm0, xmm0
5398 pxor xmm1, xmm1
5399
5400 sub ecx, 4
5401 jl l4b
5402 test edx, 15
5403 jne l4b
5404
5405 // 4 pixel loop
5406 l4:
5407 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
5408 lea eax, [eax + 16]
5409 movdqa xmm4, xmm2
5410
5411 punpcklbw xmm2, xmm1
5412 movdqa xmm3, xmm2
5413 punpcklwd xmm2, xmm1
5414 punpckhwd xmm3, xmm1
5415
5416 punpckhbw xmm4, xmm1
5417 movdqa xmm5, xmm4
5418 punpcklwd xmm4, xmm1
5419 punpckhwd xmm5, xmm1
5420
5421 paddd xmm0, xmm2
5422 movdqu xmm2, [esi] // previous row above.
5423 paddd xmm2, xmm0
5424
5425 paddd xmm0, xmm3
5426 movdqu xmm3, [esi + 16]
5427 paddd xmm3, xmm0
5428
5429 paddd xmm0, xmm4
5430 movdqu xmm4, [esi + 32]
5431 paddd xmm4, xmm0
5432
5433 paddd xmm0, xmm5
5434 movdqu xmm5, [esi + 48]
5435 lea esi, [esi + 64]
5436 paddd xmm5, xmm0
5437
5438 movdqu [edx], xmm2
5439 movdqu [edx + 16], xmm3
5440 movdqu [edx + 32], xmm4
5441 movdqu [edx + 48], xmm5
5442
5443 lea edx, [edx + 64]
5444 sub ecx, 4
5445 jge l4
5446
5447 l4b:
5448 add ecx, 4 - 1
5449 jl l1b
5450
5451 // 1 pixel loop
5452 l1:
5453 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
5454 lea eax, [eax + 4]
5455 punpcklbw xmm2, xmm1
5456 punpcklwd xmm2, xmm1
5457 paddd xmm0, xmm2
5458 movdqu xmm2, [esi]
5459 lea esi, [esi + 16]
5460 paddd xmm2, xmm0
5461 movdqu [edx], xmm2
5462 lea edx, [edx + 16]
5463 sub ecx, 1
5464 jge l1
5465
5466 l1b:
5467 }
5468 }
5469 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
5470
5471 #ifdef HAS_ARGBAFFINEROW_SSE2
5472 // Copy ARGB pixels from source image with slope to a row of destination.
5473 __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
5474 int src_argb_stride,
5475 uint8_t* dst_argb,
5476 const float* uv_dudv,
5477 int width) {
5478 __asm {
5479 push esi
5480 push edi
5481 mov eax, [esp + 12] // src_argb
5482 mov esi, [esp + 16] // stride
5483 mov edx, [esp + 20] // dst_argb
5484 mov ecx, [esp + 24] // pointer to uv_dudv
5485 movq xmm2, qword ptr [ecx] // uv
5486 movq xmm7, qword ptr [ecx + 8] // dudv
5487 mov ecx, [esp + 28] // width
5488 shl esi, 16 // 4, stride
5489 add esi, 4
5490 movd xmm5, esi
5491 sub ecx, 4
5492 jl l4b
5493
5494 // setup for 4 pixel loop
5495 pshufd xmm7, xmm7, 0x44 // dup dudv
5496 pshufd xmm5, xmm5, 0 // dup 4, stride
5497 movdqa xmm0, xmm2 // x0, y0, x1, y1
5498 addps xmm0, xmm7
5499 movlhps xmm2, xmm0
5500 movdqa xmm4, xmm7
5501 addps xmm4, xmm4 // dudv *= 2
5502 movdqa xmm3, xmm2 // x2, y2, x3, y3
5503 addps xmm3, xmm4
5504 addps xmm4, xmm4 // dudv *= 4
5505
5506 // 4 pixel loop
5507 l4:
5508 cvttps2dq xmm0, xmm2 // x, y float to int first 2
5509 cvttps2dq xmm1, xmm3 // x, y float to int next 2
5510 packssdw xmm0, xmm1 // x, y as 8 shorts
5511 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
5512 movd esi, xmm0
5513 pshufd xmm0, xmm0, 0x39 // shift right
5514 movd edi, xmm0
5515 pshufd xmm0, xmm0, 0x39 // shift right
5516 movd xmm1, [eax + esi] // read pixel 0
5517 movd xmm6, [eax + edi] // read pixel 1
5518 punpckldq xmm1, xmm6 // combine pixel 0 and 1
5519 addps xmm2, xmm4 // x, y += dx, dy first 2
5520 movq qword ptr [edx], xmm1
5521 movd esi, xmm0
5522 pshufd xmm0, xmm0, 0x39 // shift right
5523 movd edi, xmm0
5524 movd xmm6, [eax + esi] // read pixel 2
5525 movd xmm0, [eax + edi] // read pixel 3
5526 punpckldq xmm6, xmm0 // combine pixel 2 and 3
5527 addps xmm3, xmm4 // x, y += dx, dy next 2
5528 movq qword ptr 8[edx], xmm6
5529 lea edx, [edx + 16]
5530 sub ecx, 4
5531 jge l4
5532
5533 l4b:
5534 add ecx, 4 - 1
5535 jl l1b
5536
5537 // 1 pixel loop
5538 l1:
5539 cvttps2dq xmm0, xmm2 // x, y float to int
5540 packssdw xmm0, xmm0 // x, y as shorts
5541 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
5542 addps xmm2, xmm7 // x, y += dx, dy
5543 movd esi, xmm0
5544 movd xmm0, [eax + esi] // copy a pixel
5545 movd [edx], xmm0
5546 lea edx, [edx + 4]
5547 sub ecx, 1
5548 jge l1
5549 l1b:
5550 pop edi
5551 pop esi
5552 ret
5553 }
5554 }
5555 #endif // HAS_ARGBAFFINEROW_SSE2
5556
5557 #ifdef HAS_INTERPOLATEROW_AVX2
5558 // Bilinear filter 32x2 -> 32x1
5559 __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
5560 const uint8_t* src_ptr,
5561 ptrdiff_t src_stride,
5562 int dst_width,
5563 int source_y_fraction) {
5564 __asm {
5565 push esi
5566 push edi
5567 mov edi, [esp + 8 + 4] // dst_ptr
5568 mov esi, [esp + 8 + 8] // src_ptr
5569 mov edx, [esp + 8 + 12] // src_stride
5570 mov ecx, [esp + 8 + 16] // dst_width
5571 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5572 // Dispatch to specialized filters if applicable.
5573 cmp eax, 0
5574 je xloop100 // 0 / 256. Blend 100 / 0.
5575 sub edi, esi
5576 cmp eax, 128
5577 je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
5578
5579 vmovd xmm0, eax // high fraction 0..255
5580 neg eax
5581 add eax, 256
5582 vmovd xmm5, eax // low fraction 256..1
5583 vpunpcklbw xmm5, xmm5, xmm0
5584 vpunpcklwd xmm5, xmm5, xmm5
5585 vbroadcastss ymm5, xmm5
5586
5587 mov eax, 0x80808080 // 128b for bias and rounding.
5588 vmovd xmm4, eax
5589 vbroadcastss ymm4, xmm4
5590
5591 xloop:
5592 vmovdqu ymm0, [esi]
5593 vmovdqu ymm2, [esi + edx]
5594 vpunpckhbw ymm1, ymm0, ymm2 // mutates
5595 vpunpcklbw ymm0, ymm0, ymm2
5596 vpsubb ymm1, ymm1, ymm4 // bias to signed image
5597 vpsubb ymm0, ymm0, ymm4
5598 vpmaddubsw ymm1, ymm5, ymm1
5599 vpmaddubsw ymm0, ymm5, ymm0
5600 vpaddw ymm1, ymm1, ymm4 // unbias and round
5601 vpaddw ymm0, ymm0, ymm4
5602 vpsrlw ymm1, ymm1, 8
5603 vpsrlw ymm0, ymm0, 8
5604 vpackuswb ymm0, ymm0, ymm1 // unmutates
5605 vmovdqu [esi + edi], ymm0
5606 lea esi, [esi + 32]
5607 sub ecx, 32
5608 jg xloop
5609 jmp xloop99
5610
5611 // Blend 50 / 50.
5612 xloop50:
5613 vmovdqu ymm0, [esi]
5614 vpavgb ymm0, ymm0, [esi + edx]
5615 vmovdqu [esi + edi], ymm0
5616 lea esi, [esi + 32]
5617 sub ecx, 32
5618 jg xloop50
5619 jmp xloop99
5620
5621 // Blend 100 / 0 - Copy row unchanged.
5622 xloop100:
5623 rep movsb
5624
5625 xloop99:
5626 pop edi
5627 pop esi
5628 vzeroupper
5629 ret
5630 }
5631 }
5632 #endif // HAS_INTERPOLATEROW_AVX2
5633
5634 // Bilinear filter 16x2 -> 16x1
5635 // TODO(fbarchard): Consider allowing 256 using memcpy.
5636 __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
5637 const uint8_t* src_ptr,
5638 ptrdiff_t src_stride,
5639 int dst_width,
5640 int source_y_fraction) {
5641 __asm {
5642 push esi
5643 push edi
5644
5645 mov edi, [esp + 8 + 4] // dst_ptr
5646 mov esi, [esp + 8 + 8] // src_ptr
5647 mov edx, [esp + 8 + 12] // src_stride
5648 mov ecx, [esp + 8 + 16] // dst_width
5649 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5650 sub edi, esi
5651 // Dispatch to specialized filters if applicable.
5652 cmp eax, 0
5653 je xloop100 // 0 /256. Blend 100 / 0.
5654 cmp eax, 128
5655 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
5656
5657 movd xmm0, eax // high fraction 0..255
5658 neg eax
5659 add eax, 256
5660 movd xmm5, eax // low fraction 255..1
5661 punpcklbw xmm5, xmm0
5662 punpcklwd xmm5, xmm5
5663 pshufd xmm5, xmm5, 0
5664 mov eax, 0x80808080 // 128 for biasing image to signed.
5665 movd xmm4, eax
5666 pshufd xmm4, xmm4, 0x00
5667
5668 xloop:
5669 movdqu xmm0, [esi]
5670 movdqu xmm2, [esi + edx]
5671 movdqu xmm1, xmm0
5672 punpcklbw xmm0, xmm2
5673 punpckhbw xmm1, xmm2
5674 psubb xmm0, xmm4 // bias image by -128
5675 psubb xmm1, xmm4
5676 movdqa xmm2, xmm5
5677 movdqa xmm3, xmm5
5678 pmaddubsw xmm2, xmm0
5679 pmaddubsw xmm3, xmm1
5680 paddw xmm2, xmm4
5681 paddw xmm3, xmm4
5682 psrlw xmm2, 8
5683 psrlw xmm3, 8
5684 packuswb xmm2, xmm3
5685 movdqu [esi + edi], xmm2
5686 lea esi, [esi + 16]
5687 sub ecx, 16
5688 jg xloop
5689 jmp xloop99
5690
5691 // Blend 50 / 50.
5692 xloop50:
5693 movdqu xmm0, [esi]
5694 movdqu xmm1, [esi + edx]
5695 pavgb xmm0, xmm1
5696 movdqu [esi + edi], xmm0
5697 lea esi, [esi + 16]
5698 sub ecx, 16
5699 jg xloop50
5700 jmp xloop99
5701
5702 // Blend 100 / 0 - Copy row unchanged.
5703 xloop100:
5704 movdqu xmm0, [esi]
5705 movdqu [esi + edi], xmm0
5706 lea esi, [esi + 16]
5707 sub ecx, 16
5708 jg xloop100
5709
5710 xloop99:
5711 pop edi
5712 pop esi
5713 ret
5714 }
5715 }
5716
5717 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5718 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
5719 uint8_t* dst_argb,
5720 const uint8_t* shuffler,
5721 int width) {
5722 __asm {
5723 mov eax, [esp + 4] // src_argb
5724 mov edx, [esp + 8] // dst_argb
5725 mov ecx, [esp + 12] // shuffler
5726 movdqu xmm5, [ecx]
5727 mov ecx, [esp + 16] // width
5728
5729 wloop:
5730 movdqu xmm0, [eax]
5731 movdqu xmm1, [eax + 16]
5732 lea eax, [eax + 32]
5733 pshufb xmm0, xmm5
5734 pshufb xmm1, xmm5
5735 movdqu [edx], xmm0
5736 movdqu [edx + 16], xmm1
5737 lea edx, [edx + 32]
5738 sub ecx, 8
5739 jg wloop
5740 ret
5741 }
5742 }
5743
5744 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5745 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
5746 uint8_t* dst_argb,
5747 const uint8_t* shuffler,
5748 int width) {
5749 __asm {
5750 mov eax, [esp + 4] // src_argb
5751 mov edx, [esp + 8] // dst_argb
5752 mov ecx, [esp + 12] // shuffler
5753 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
5754 mov ecx, [esp + 16] // width
5755
5756 wloop:
5757 vmovdqu ymm0, [eax]
5758 vmovdqu ymm1, [eax + 32]
5759 lea eax, [eax + 64]
5760 vpshufb ymm0, ymm0, ymm5
5761 vpshufb ymm1, ymm1, ymm5
5762 vmovdqu [edx], ymm0
5763 vmovdqu [edx + 32], ymm1
5764 lea edx, [edx + 64]
5765 sub ecx, 16
5766 jg wloop
5767
5768 vzeroupper
5769 ret
5770 }
5771 }
5772 #endif // HAS_ARGBSHUFFLEROW_AVX2
5773
5774 // YUY2 - Macro-pixel = 2 image pixels
5775 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5776
5777 // UYVY - Macro-pixel = 2 image pixels
5778 // U0Y0V0Y1
5779
5780 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
5781 const uint8_t* src_u,
5782 const uint8_t* src_v,
5783 uint8_t* dst_frame,
5784 int width) {
5785 __asm {
5786 push esi
5787 push edi
5788 mov eax, [esp + 8 + 4] // src_y
5789 mov esi, [esp + 8 + 8] // src_u
5790 mov edx, [esp + 8 + 12] // src_v
5791 mov edi, [esp + 8 + 16] // dst_frame
5792 mov ecx, [esp + 8 + 20] // width
5793 sub edx, esi
5794
5795 convertloop:
5796 movq xmm2, qword ptr [esi] // U
5797 movq xmm3, qword ptr [esi + edx] // V
5798 lea esi, [esi + 8]
5799 punpcklbw xmm2, xmm3 // UV
5800 movdqu xmm0, [eax] // Y
5801 lea eax, [eax + 16]
5802 movdqa xmm1, xmm0
5803 punpcklbw xmm0, xmm2 // YUYV
5804 punpckhbw xmm1, xmm2
5805 movdqu [edi], xmm0
5806 movdqu [edi + 16], xmm1
5807 lea edi, [edi + 32]
5808 sub ecx, 16
5809 jg convertloop
5810
5811 pop edi
5812 pop esi
5813 ret
5814 }
5815 }
5816
5817 __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
5818 const uint8_t* src_u,
5819 const uint8_t* src_v,
5820 uint8_t* dst_frame,
5821 int width) {
5822 __asm {
5823 push esi
5824 push edi
5825 mov eax, [esp + 8 + 4] // src_y
5826 mov esi, [esp + 8 + 8] // src_u
5827 mov edx, [esp + 8 + 12] // src_v
5828 mov edi, [esp + 8 + 16] // dst_frame
5829 mov ecx, [esp + 8 + 20] // width
5830 sub edx, esi
5831
5832 convertloop:
5833 movq xmm2, qword ptr [esi] // U
5834 movq xmm3, qword ptr [esi + edx] // V
5835 lea esi, [esi + 8]
5836 punpcklbw xmm2, xmm3 // UV
5837 movdqu xmm0, [eax] // Y
5838 movdqa xmm1, xmm2
5839 lea eax, [eax + 16]
5840 punpcklbw xmm1, xmm0 // UYVY
5841 punpckhbw xmm2, xmm0
5842 movdqu [edi], xmm1
5843 movdqu [edi + 16], xmm2
5844 lea edi, [edi + 32]
5845 sub ecx, 16
5846 jg convertloop
5847
5848 pop edi
5849 pop esi
5850 ret
5851 }
5852 }
5853
5854 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
5855 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
5856 uint8_t* dst_argb,
5857 const float* poly,
5858 int width) {
5859 __asm {
5860 push esi
5861 mov eax, [esp + 4 + 4] /* src_argb */
5862 mov edx, [esp + 4 + 8] /* dst_argb */
5863 mov esi, [esp + 4 + 12] /* poly */
5864 mov ecx, [esp + 4 + 16] /* width */
5865 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
5866
5867 // 2 pixel loop.
5868 convertloop:
5869 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
5870 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
5871 movq xmm0, qword ptr [eax] // BGRABGRA
5872 lea eax, [eax + 8]
5873 punpcklbw xmm0, xmm3
5874 movdqa xmm4, xmm0
5875 punpcklwd xmm0, xmm3 // pixel 0
5876 punpckhwd xmm4, xmm3 // pixel 1
5877 cvtdq2ps xmm0, xmm0 // 4 floats
5878 cvtdq2ps xmm4, xmm4
5879 movdqa xmm1, xmm0 // X
5880 movdqa xmm5, xmm4
5881 mulps xmm0, [esi + 16] // C1 * X
5882 mulps xmm4, [esi + 16]
5883 addps xmm0, [esi] // result = C0 + C1 * X
5884 addps xmm4, [esi]
5885 movdqa xmm2, xmm1
5886 movdqa xmm6, xmm5
5887 mulps xmm2, xmm1 // X * X
5888 mulps xmm6, xmm5
5889 mulps xmm1, xmm2 // X * X * X
5890 mulps xmm5, xmm6
5891 mulps xmm2, [esi + 32] // C2 * X * X
5892 mulps xmm6, [esi + 32]
5893 mulps xmm1, [esi + 48] // C3 * X * X * X
5894 mulps xmm5, [esi + 48]
5895 addps xmm0, xmm2 // result += C2 * X * X
5896 addps xmm4, xmm6
5897 addps xmm0, xmm1 // result += C3 * X * X * X
5898 addps xmm4, xmm5
5899 cvttps2dq xmm0, xmm0
5900 cvttps2dq xmm4, xmm4
5901 packuswb xmm0, xmm4
5902 packuswb xmm0, xmm0
5903 movq qword ptr [edx], xmm0
5904 lea edx, [edx + 8]
5905 sub ecx, 2
5906 jg convertloop
5907 pop esi
5908 ret
5909 }
5910 }
5911 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
5912
5913 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
5914 __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
5915 uint8_t* dst_argb,
5916 const float* poly,
5917 int width) {
5918 __asm {
5919 mov eax, [esp + 4] /* src_argb */
5920 mov edx, [esp + 8] /* dst_argb */
5921 mov ecx, [esp + 12] /* poly */
5922 vbroadcastf128 ymm4, [ecx] // C0
5923 vbroadcastf128 ymm5, [ecx + 16] // C1
5924 vbroadcastf128 ymm6, [ecx + 32] // C2
5925 vbroadcastf128 ymm7, [ecx + 48] // C3
5926 mov ecx, [esp + 16] /* width */
5927
5928 // 2 pixel loop.
5929 convertloop:
5930 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
5931 lea eax, [eax + 8]
5932 vcvtdq2ps ymm0, ymm0 // X 8 floats
5933 vmulps ymm2, ymm0, ymm0 // X * X
5934 vmulps ymm3, ymm0, ymm7 // C3 * X
5935 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
5936 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
5937 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
5938 vcvttps2dq ymm0, ymm0
5939 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
5940 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
5941 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
5942 vmovq qword ptr [edx], xmm0
5943 lea edx, [edx + 8]
5944 sub ecx, 2
5945 jg convertloop
5946 vzeroupper
5947 ret
5948 }
5949 }
5950 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
5951
5952 #ifdef HAS_HALFFLOATROW_SSE2
5953 static float kExpBias = 1.9259299444e-34f;
5954 __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
5955 uint16_t* dst,
5956 float scale,
5957 int width) {
5958 __asm {
5959 mov eax, [esp + 4] /* src */
5960 mov edx, [esp + 8] /* dst */
5961 movd xmm4, dword ptr [esp + 12] /* scale */
5962 mov ecx, [esp + 16] /* width */
5963 mulss xmm4, kExpBias
5964 pshufd xmm4, xmm4, 0
5965 pxor xmm5, xmm5
5966 sub edx, eax
5967
5968 // 8 pixel loop.
5969 convertloop:
5970 movdqu xmm2, xmmword ptr [eax] // 8 shorts
5971 add eax, 16
5972 movdqa xmm3, xmm2
5973 punpcklwd xmm2, xmm5
5974 cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
5975 punpckhwd xmm3, xmm5
5976 cvtdq2ps xmm3, xmm3
5977 mulps xmm2, xmm4
5978 mulps xmm3, xmm4
5979 psrld xmm2, 13
5980 psrld xmm3, 13
5981 packssdw xmm2, xmm3
5982 movdqu [eax + edx - 16], xmm2
5983 sub ecx, 8
5984 jg convertloop
5985 ret
5986 }
5987 }
5988 #endif // HAS_HALFFLOATROW_SSE2
5989
5990 #ifdef HAS_HALFFLOATROW_AVX2
5991 __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
5992 uint16_t* dst,
5993 float scale,
5994 int width) {
5995 __asm {
5996 mov eax, [esp + 4] /* src */
5997 mov edx, [esp + 8] /* dst */
5998 movd xmm4, dword ptr [esp + 12] /* scale */
5999 mov ecx, [esp + 16] /* width */
6000
6001 vmulss xmm4, xmm4, kExpBias
6002 vbroadcastss ymm4, xmm4
6003 vpxor ymm5, ymm5, ymm5
6004 sub edx, eax
6005
6006 // 16 pixel loop.
6007 convertloop:
6008 vmovdqu ymm2, [eax] // 16 shorts
6009 add eax, 32
6010 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
6011 vpunpcklwd ymm2, ymm2, ymm5
6012 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
6013 vcvtdq2ps ymm2, ymm2
6014 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
6015 vmulps ymm2, ymm2, ymm4
6016 vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
6017 vpsrld ymm2, ymm2, 13
6018 vpackssdw ymm2, ymm2, ymm3
6019 vmovdqu [eax + edx - 32], ymm2
6020 sub ecx, 16
6021 jg convertloop
6022 vzeroupper
6023 ret
6024 }
6025 }
6026 #endif // HAS_HALFFLOATROW_AVX2
6027
6028 #ifdef HAS_HALFFLOATROW_F16C
6029 __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
6030 uint16_t* dst,
6031 float scale,
6032 int width) {
6033 __asm {
6034 mov eax, [esp + 4] /* src */
6035 mov edx, [esp + 8] /* dst */
6036 vbroadcastss ymm4, [esp + 12] /* scale */
6037 mov ecx, [esp + 16] /* width */
6038 sub edx, eax
6039
6040 // 16 pixel loop.
6041 convertloop:
6042 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
6043 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
6044 add eax, 32
6045 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
6046 vcvtdq2ps ymm3, ymm3
6047 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
6048 vmulps ymm3, ymm3, ymm4
6049 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
6050 vcvtps2ph xmm3, ymm3, 3
6051 vmovdqu [eax + edx + 32], xmm2
6052 vmovdqu [eax + edx + 32 + 16], xmm3
6053 sub ecx, 16
6054 jg convertloop
6055 vzeroupper
6056 ret
6057 }
6058 }
6059 #endif // HAS_HALFFLOATROW_F16C
6060
6061 #ifdef HAS_ARGBCOLORTABLEROW_X86
6062 // Tranform ARGB pixels with color table.
6063 __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
6064 const uint8_t* table_argb,
6065 int width) {
6066 __asm {
6067 push esi
6068 mov eax, [esp + 4 + 4] /* dst_argb */
6069 mov esi, [esp + 4 + 8] /* table_argb */
6070 mov ecx, [esp + 4 + 12] /* width */
6071
6072 // 1 pixel loop.
6073 convertloop:
6074 movzx edx, byte ptr [eax]
6075 lea eax, [eax + 4]
6076 movzx edx, byte ptr [esi + edx * 4]
6077 mov byte ptr [eax - 4], dl
6078 movzx edx, byte ptr [eax - 4 + 1]
6079 movzx edx, byte ptr [esi + edx * 4 + 1]
6080 mov byte ptr [eax - 4 + 1], dl
6081 movzx edx, byte ptr [eax - 4 + 2]
6082 movzx edx, byte ptr [esi + edx * 4 + 2]
6083 mov byte ptr [eax - 4 + 2], dl
6084 movzx edx, byte ptr [eax - 4 + 3]
6085 movzx edx, byte ptr [esi + edx * 4 + 3]
6086 mov byte ptr [eax - 4 + 3], dl
6087 dec ecx
6088 jg convertloop
6089 pop esi
6090 ret
6091 }
6092 }
6093 #endif // HAS_ARGBCOLORTABLEROW_X86
6094
6095 #ifdef HAS_RGBCOLORTABLEROW_X86
6096 // Tranform RGB pixels with color table.
6097 __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
6098 const uint8_t* table_argb,
6099 int width) {
6100 __asm {
6101 push esi
6102 mov eax, [esp + 4 + 4] /* dst_argb */
6103 mov esi, [esp + 4 + 8] /* table_argb */
6104 mov ecx, [esp + 4 + 12] /* width */
6105
6106 // 1 pixel loop.
6107 convertloop:
6108 movzx edx, byte ptr [eax]
6109 lea eax, [eax + 4]
6110 movzx edx, byte ptr [esi + edx * 4]
6111 mov byte ptr [eax - 4], dl
6112 movzx edx, byte ptr [eax - 4 + 1]
6113 movzx edx, byte ptr [esi + edx * 4 + 1]
6114 mov byte ptr [eax - 4 + 1], dl
6115 movzx edx, byte ptr [eax - 4 + 2]
6116 movzx edx, byte ptr [esi + edx * 4 + 2]
6117 mov byte ptr [eax - 4 + 2], dl
6118 dec ecx
6119 jg convertloop
6120
6121 pop esi
6122 ret
6123 }
6124 }
6125 #endif // HAS_RGBCOLORTABLEROW_X86
6126
6127 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6128 // Tranform RGB pixels with luma table.
6129 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
6130 uint8_t* dst_argb,
6131 int width,
6132 const uint8_t* luma,
6133 uint32_t lumacoeff) {
6134 __asm {
6135 push esi
6136 push edi
6137 mov eax, [esp + 8 + 4] /* src_argb */
6138 mov edi, [esp + 8 + 8] /* dst_argb */
6139 mov ecx, [esp + 8 + 12] /* width */
6140 movd xmm2, dword ptr [esp + 8 + 16] // luma table
6141 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
6142 pshufd xmm2, xmm2, 0
6143 pshufd xmm3, xmm3, 0
6144 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
6145 psllw xmm4, 8
6146 pxor xmm5, xmm5
6147
6148 // 4 pixel loop.
6149 convertloop:
6150 movdqu xmm0, xmmword ptr [eax] // generate luma ptr
6151 pmaddubsw xmm0, xmm3
6152 phaddw xmm0, xmm0
6153 pand xmm0, xmm4 // mask out low bits
6154 punpcklwd xmm0, xmm5
6155 paddd xmm0, xmm2 // add table base
6156 movd esi, xmm0
6157 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6158
6159 movzx edx, byte ptr [eax]
6160 movzx edx, byte ptr [esi + edx]
6161 mov byte ptr [edi], dl
6162 movzx edx, byte ptr [eax + 1]
6163 movzx edx, byte ptr [esi + edx]
6164 mov byte ptr [edi + 1], dl
6165 movzx edx, byte ptr [eax + 2]
6166 movzx edx, byte ptr [esi + edx]
6167 mov byte ptr [edi + 2], dl
6168 movzx edx, byte ptr [eax + 3] // copy alpha.
6169 mov byte ptr [edi + 3], dl
6170
6171 movd esi, xmm0
6172 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6173
6174 movzx edx, byte ptr [eax + 4]
6175 movzx edx, byte ptr [esi + edx]
6176 mov byte ptr [edi + 4], dl
6177 movzx edx, byte ptr [eax + 5]
6178 movzx edx, byte ptr [esi + edx]
6179 mov byte ptr [edi + 5], dl
6180 movzx edx, byte ptr [eax + 6]
6181 movzx edx, byte ptr [esi + edx]
6182 mov byte ptr [edi + 6], dl
6183 movzx edx, byte ptr [eax + 7] // copy alpha.
6184 mov byte ptr [edi + 7], dl
6185
6186 movd esi, xmm0
6187 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6188
6189 movzx edx, byte ptr [eax + 8]
6190 movzx edx, byte ptr [esi + edx]
6191 mov byte ptr [edi + 8], dl
6192 movzx edx, byte ptr [eax + 9]
6193 movzx edx, byte ptr [esi + edx]
6194 mov byte ptr [edi + 9], dl
6195 movzx edx, byte ptr [eax + 10]
6196 movzx edx, byte ptr [esi + edx]
6197 mov byte ptr [edi + 10], dl
6198 movzx edx, byte ptr [eax + 11] // copy alpha.
6199 mov byte ptr [edi + 11], dl
6200
6201 movd esi, xmm0
6202
6203 movzx edx, byte ptr [eax + 12]
6204 movzx edx, byte ptr [esi + edx]
6205 mov byte ptr [edi + 12], dl
6206 movzx edx, byte ptr [eax + 13]
6207 movzx edx, byte ptr [esi + edx]
6208 mov byte ptr [edi + 13], dl
6209 movzx edx, byte ptr [eax + 14]
6210 movzx edx, byte ptr [esi + edx]
6211 mov byte ptr [edi + 14], dl
6212 movzx edx, byte ptr [eax + 15] // copy alpha.
6213 mov byte ptr [edi + 15], dl
6214
6215 lea eax, [eax + 16]
6216 lea edi, [edi + 16]
6217 sub ecx, 4
6218 jg convertloop
6219
6220 pop edi
6221 pop esi
6222 ret
6223 }
6224 }
6225 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6226
6227 #endif // defined(_M_X64)
6228
6229 #ifdef __cplusplus
6230 } // extern "C"
6231 } // namespace libyuv
6232 #endif
6233
6234 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6235