/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
#define PSHUFD_IS_FAST 0

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

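/* Expand four 16-bit 565 pixels (one in the low word of each 32-bit lane)
 * to x8r8g8b8.  After each channel is masked into position, its top bits
 * are replicated into the freshly opened low bits, so that e.g. the 5-bit
 * maximum 0x1f expands to 8-bit 0xff rather than 0xf8.
 */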
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

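/* Pack eight packed-8888 pixels (two registers) down to eight 565 pixels.
 * Red and blue are moved into their 565 positions with a single
 * _mm_madd_epi16 against mask_565_pack_multiplier (whose lanes, set up at
 * initialization, act as per-channel shifts whose products are summed),
 * green is masked in separately, and the final shift/pack sequence
 * emulates _mm_packus_epi32, which is unavailable in plain SSE2.
 */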
static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

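/* _mm_movemask_epi8 yields one bit per byte; bits 3, 7, 11 and 15 are the
 * alpha bytes of the four x8r8g8b8 pixels, hence the 0x8888 mask: all four
 * pixels are opaque (or transparent, below) exactly when all four alpha
 * positions compare equal to 0xff (or 0x00).
 */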
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

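/* With pixels unpacked to one 16-bit lane per channel, the alpha channel
 * of each pixel sits in the highest lane of its 4-lane group; the two
 * shuffles below broadcast it across the group, giving an "alpha pixel"
 * (a, a, a, a) that can be fed straight into pix_multiply.
 */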
static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

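/* Per-channel multiply of two unpacked images, with the usual exact
 * divide-by-255: for each 16-bit lane,
 *
 *     t = x * a + 0x80;
 *     result = (t + (t >> 8)) >> 8;    == (x * a) / 255, rounded
 *
 * The second line is computed as the high half of t * 0x0101, which is
 * what the _mm_mulhi_epu16 against mask_0101 does.
 */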
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

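/* The save_128_write_combining variant below uses a non-temporal store
 * (_mm_stream_si128), which writes around the cache; this pays off for
 * large destination sweeps that will not be re-read soon.
 */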
/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels to a 16-byte-aligned address, using write-combining
 * (non-temporal) memory semantics
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

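/* Porter-Duff OVER for a single premultiplied pixel:
 *
 *     dst = src + dst * (1 - src.alpha)
 *
 * with two fast paths: a fully opaque source replaces the destination
 * outright, and a fully zero source leaves it untouched.
 */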
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

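/* Fetch one source pixel, scaled by the alpha of the corresponding mask
 * pixel when a mask is present.  The memcpy is a strict-aliasing-safe way
 * of doing an unaligned 32-bit load.
 */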
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s;
    memcpy (&s, ps, sizeof (uint32_t));

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

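/* All of the combiners below follow the same pattern: a scalar head loop
 * runs until the destination pointer reaches a 16-byte boundary, the main
 * loop then processes four pixels per iteration with aligned destination
 * stores, and a scalar tail loop finishes the remaining (w % 4) pixels.
 */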
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *       pd,
                               const uint32_t * ps,
                               const uint32_t * pm,
                               int              w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *       pd,
                                  const uint32_t * ps,
                                  int              w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4-pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

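/* Porter-Duff ATOP for one premultiplied pixel:
 *
 *     dst = src * dst.alpha + dst * (1 - src.alpha)
 */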
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

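/* Porter-Duff XOR for one premultiplied pixel:
 *
 *     dst = src * (1 - dst.alpha) + dst * (1 - src.alpha)
 */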
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

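/* SATURATE: add as much of the source as the destination's remaining
 * alpha (~dst.alpha) can absorb.  When src.alpha exceeds that headroom,
 * the source is first scaled by ~dst.alpha / src.alpha, so the final
 * saturating add cannot overflow a channel before the alpha does.
 */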
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some source alpha is greater than the respective ~dst alpha,
         * fall back to the scalar path for these four pixels
         */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

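/* Component-alpha (ca) variants: the mask carries a separate alpha value
 * per channel, so it is multiplied into the source channel by channel, and
 * the "alpha" fed to the over/in helpers is likewise a full 4-channel
 * vector rather than a single broadcast value.
 */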
static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

        w--;
    }
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                    unpack_32_1x128 (m),
                    expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                    unpack_32_1x128 (m),
                    expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }
}

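/* Component-alpha ATOP:
 *
 *     dst = src * mask * dst.alpha + dst * (1 - src.alpha * mask)
 */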
2022 static force_inline uint32_t
core_combine_atop_ca_pixel_sse2(uint32_t src,uint32_t mask,uint32_t dst)2023 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2024 uint32_t mask,
2025 uint32_t dst)
2026 {
2027 __m128i m = unpack_32_1x128 (mask);
2028 __m128i s = unpack_32_1x128 (src);
2029 __m128i d = unpack_32_1x128 (dst);
2030 __m128i sa = expand_alpha_1x128 (s);
2031 __m128i da = expand_alpha_1x128 (d);
2032
2033 s = pix_multiply_1x128 (s, m);
2034 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035
2036 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2037 }
2038
2039 static void
sse2_combine_atop_ca(pixman_implementation_t * imp,pixman_op_t op,uint32_t * pd,const uint32_t * ps,const uint32_t * pm,int w)2040 sse2_combine_atop_ca (pixman_implementation_t *imp,
2041 pixman_op_t op,
2042 uint32_t * pd,
2043 const uint32_t * ps,
2044 const uint32_t * pm,
2045 int w)
2046 {
2047 uint32_t s, m, d;
2048
2049 __m128i xmm_src_lo, xmm_src_hi;
2050 __m128i xmm_dst_lo, xmm_dst_hi;
2051 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2052 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2053 __m128i xmm_mask_lo, xmm_mask_hi;
2054
2055 while (w && (uintptr_t)pd & 15)
2056 {
2057 s = *ps++;
2058 m = *pm++;
2059 d = *pd;
2060
2061 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2062 w--;
2063 }
2064
2065 while (w >= 4)
2066 {
2067 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2068 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2069 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070
2071 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2072 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2073 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074
2075 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2076 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2077 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2078 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079
2080 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2081 &xmm_mask_lo, &xmm_mask_hi,
2082 &xmm_src_lo, &xmm_src_hi);
2083 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2084 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2085 &xmm_mask_lo, &xmm_mask_hi);
2086
2087 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088
2089 pix_add_multiply_2x128 (
2090 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092 &xmm_dst_lo, &xmm_dst_hi);
2093
2094 save_128_aligned (
2095 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096
2097 ps += 4;
2098 pd += 4;
2099 pm += 4;
2100 w -= 4;
2101 }
2102
2103 while (w)
2104 {
2105 s = *ps++;
2106 m = *pm++;
2107 d = *pd;
2108
2109 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2110 w--;
2111 }
2112 }
2113
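/* Component-alpha ATOP-reverse for one pixel, per channel:
 * dst = src * mask * (1 - dst_alpha) + dst * (mask * src_alpha)
 */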
2114 static force_inline uint32_t
2115 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2116 uint32_t mask,
2117 uint32_t dst)
2118 {
2119 __m128i m = unpack_32_1x128 (mask);
2120 __m128i s = unpack_32_1x128 (src);
2121 __m128i d = unpack_32_1x128 (dst);
2122
2123 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2124 __m128i sa = expand_alpha_1x128 (s);
2125
2126 s = pix_multiply_1x128 (s, m);
2127 m = pix_multiply_1x128 (m, sa);
2128
2129 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2130 }
2131
2132 static void
2133 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2134 pixman_op_t op,
2135 uint32_t * pd,
2136 const uint32_t * ps,
2137 const uint32_t * pm,
2138 int w)
2139 {
2140 uint32_t s, m, d;
2141
2142 __m128i xmm_src_lo, xmm_src_hi;
2143 __m128i xmm_dst_lo, xmm_dst_hi;
2144 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146 __m128i xmm_mask_lo, xmm_mask_hi;
2147
2148 while (w && (uintptr_t)pd & 15)
2149 {
2150 s = *ps++;
2151 m = *pm++;
2152 d = *pd;
2153
2154 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2155 w--;
2156 }
2157
2158 while (w >= 4)
2159 {
2160 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163
2164 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167
2168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172
2173 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174 &xmm_mask_lo, &xmm_mask_hi,
2175 &xmm_src_lo, &xmm_src_hi);
2176 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178 &xmm_mask_lo, &xmm_mask_hi);
2179
2180 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182
2183 pix_add_multiply_2x128 (
2184 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2185 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2186 &xmm_dst_lo, &xmm_dst_hi);
2187
2188 save_128_aligned (
2189 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190
2191 ps += 4;
2192 pd += 4;
2193 pm += 4;
2194 w -= 4;
2195 }
2196
2197 while (w)
2198 {
2199 s = *ps++;
2200 m = *pm++;
2201 d = *pd;
2202
2203 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2204 w--;
2205 }
2206 }
2207
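/* Component-alpha XOR for one pixel, per channel:
 * dst = src * mask * (1 - dst_alpha) + dst * (1 - mask * src_alpha)
 */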
2208 static force_inline uint32_t
2209 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2210 uint32_t mask,
2211 uint32_t dst)
2212 {
2213 __m128i a = unpack_32_1x128 (mask);
2214 __m128i s = unpack_32_1x128 (src);
2215 __m128i d = unpack_32_1x128 (dst);
2216
2217 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2218 a, expand_alpha_1x128 (s)));
2219 __m128i dest = pix_multiply_1x128 (s, a);
2220 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221
2222 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2223 &alpha_dst,
2224 &dest,
2225 &alpha_src));
2226 }
2227
2228 static void
2229 sse2_combine_xor_ca (pixman_implementation_t *imp,
2230 pixman_op_t op,
2231 uint32_t * pd,
2232 const uint32_t * ps,
2233 const uint32_t * pm,
2234 int w)
2235 {
2236 uint32_t s, m, d;
2237
2238 __m128i xmm_src_lo, xmm_src_hi;
2239 __m128i xmm_dst_lo, xmm_dst_hi;
2240 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2241 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2242 __m128i xmm_mask_lo, xmm_mask_hi;
2243
2244 while (w && (uintptr_t)pd & 15)
2245 {
2246 s = *ps++;
2247 m = *pm++;
2248 d = *pd;
2249
2250 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2251 w--;
2252 }
2253
2254 while (w >= 4)
2255 {
2256 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2257 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2258 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259
2260 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2261 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2262 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263
2264 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2265 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2266 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2267 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268
2269 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2270 &xmm_mask_lo, &xmm_mask_hi,
2271 &xmm_src_lo, &xmm_src_hi);
2272 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2273 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2274 &xmm_mask_lo, &xmm_mask_hi);
2275
2276 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2277 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2278 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2279 &xmm_mask_lo, &xmm_mask_hi);
2280
2281 pix_add_multiply_2x128 (
2282 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2283 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2284 &xmm_dst_lo, &xmm_dst_hi);
2285
2286 save_128_aligned (
2287 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2288
2289 ps += 4;
2290 pd += 4;
2291 pm += 4;
2292 w -= 4;
2293 }
2294
2295 while (w)
2296 {
2297 s = *ps++;
2298 m = *pm++;
2299 d = *pd;
2300
2301 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2302 w--;
2303 }
2304 }
2305
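/* Component-alpha ADD: dst = clamp (src * mask + dst); the
 * saturating byte add clamps each channel at 0xff.
 */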
2306 static void
2307 sse2_combine_add_ca (pixman_implementation_t *imp,
2308 pixman_op_t op,
2309 uint32_t * pd,
2310 const uint32_t * ps,
2311 const uint32_t * pm,
2312 int w)
2313 {
2314 uint32_t s, m, d;
2315
2316 __m128i xmm_src_lo, xmm_src_hi;
2317 __m128i xmm_dst_lo, xmm_dst_hi;
2318 __m128i xmm_mask_lo, xmm_mask_hi;
2319
2320 while (w && (uintptr_t)pd & 15)
2321 {
2322 s = *ps++;
2323 m = *pm++;
2324 d = *pd;
2325
2326 *pd++ = pack_1x128_32 (
2327 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2328 unpack_32_1x128 (m)),
2329 unpack_32_1x128 (d)));
2330 w--;
2331 }
2332
2333 while (w >= 4)
2334 {
2335 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2336 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2337 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338
2339 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2340 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2341 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342
2343 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2344 &xmm_mask_lo, &xmm_mask_hi,
2345 &xmm_src_lo, &xmm_src_hi);
2346
2347 save_128_aligned (
2348 (__m128i*)pd, pack_2x128_128 (
2349 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2350 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2351
2352 ps += 4;
2353 pd += 4;
2354 pm += 4;
2355 w -= 4;
2356 }
2357
2358 while (w)
2359 {
2360 s = *ps++;
2361 m = *pm++;
2362 d = *pd;
2363
2364 *pd++ = pack_1x128_32 (
2365 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2366 unpack_32_1x128 (m)),
2367 unpack_32_1x128 (d)));
2368 w--;
2369 }
2370 }
2371
2372 static force_inline __m128i
2373 create_mask_16_128 (uint16_t mask)
2374 {
2375 return _mm_set1_epi16 (mask);
2376 }
2377
2378 /* Work around a code generation bug in Sun Studio 12. */
2379 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2380 # define create_mask_2x32_128(mask0, mask1) \
2381 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2382 #else
2383 static force_inline __m128i
2384 create_mask_2x32_128 (uint32_t mask0,
2385 uint32_t mask1)
2386 {
2387 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2388 }
2389 #endif
2390
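/* The composite fast paths below all share the same loop shape: a
 * scalar head runs until dst reaches a 16-byte boundary, an SSE2
 * body then processes whole aligned __m128i blocks, and a scalar
 * tail finishes the remaining pixels.
 */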
2391 static void
2392 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2393 pixman_composite_info_t *info)
2394 {
2395 PIXMAN_COMPOSITE_ARGS (info);
2396 uint32_t src;
2397 uint32_t *dst_line, *dst, d;
2398 int32_t w;
2399 int dst_stride;
2400 __m128i xmm_src, xmm_alpha;
2401 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402
2403 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2404
2405 if (src == 0)
2406 return;
2407
2408 PIXMAN_IMAGE_GET_LINE (
2409 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2410
2411 xmm_src = expand_pixel_32_1x128 (src);
2412 xmm_alpha = expand_alpha_1x128 (xmm_src);
2413
2414 while (height--)
2415 {
2416 dst = dst_line;
2417
2418 dst_line += dst_stride;
2419 w = width;
2420
2421 while (w && (uintptr_t)dst & 15)
2422 {
2423 d = *dst;
2424 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2425 xmm_alpha,
2426 unpack_32_1x128 (d)));
2427 w--;
2428 }
2429
2430 while (w >= 4)
2431 {
2432 xmm_dst = load_128_aligned ((__m128i*)dst);
2433
2434 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435
2436 over_2x128 (&xmm_src, &xmm_src,
2437 &xmm_alpha, &xmm_alpha,
2438 &xmm_dst_lo, &xmm_dst_hi);
2439
2440 /* rebuild the 4 pixels and save */
2441 save_128_aligned (
2442 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2443
2444 w -= 4;
2445 dst += 4;
2446 }
2447
2448 while (w)
2449 {
2450 d = *dst;
2451 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2452 xmm_alpha,
2453 unpack_32_1x128 (d)));
2454 w--;
2455 }
2456
2457 }
2458 }
2459
2460 static void
2461 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2462 pixman_composite_info_t *info)
2463 {
2464 PIXMAN_COMPOSITE_ARGS (info);
2465 uint32_t src;
2466 uint16_t *dst_line, *dst, d;
2467 int32_t w;
2468 int dst_stride;
2469 __m128i xmm_src, xmm_alpha;
2470 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471
2472 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2473
2474 if (src == 0)
2475 return;
2476
2477 PIXMAN_IMAGE_GET_LINE (
2478 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479
2480 xmm_src = expand_pixel_32_1x128 (src);
2481 xmm_alpha = expand_alpha_1x128 (xmm_src);
2482
2483 while (height--)
2484 {
2485 dst = dst_line;
2486
2487 dst_line += dst_stride;
2488 w = width;
2489
2490 while (w && (uintptr_t)dst & 15)
2491 {
2492 d = *dst;
2493
2494 *dst++ = pack_565_32_16 (
2495 pack_1x128_32 (over_1x128 (xmm_src,
2496 xmm_alpha,
2497 expand565_16_1x128 (d))));
2498 w--;
2499 }
2500
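/* One aligned load covers eight r5g6b5 pixels; they are widened
 * into four registers of two 8888 pixels each and composited two
 * pixels at a time.
 */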
2501 while (w >= 8)
2502 {
2503 xmm_dst = load_128_aligned ((__m128i*)dst);
2504
2505 unpack_565_128_4x128 (xmm_dst,
2506 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507
2508 over_2x128 (&xmm_src, &xmm_src,
2509 &xmm_alpha, &xmm_alpha,
2510 &xmm_dst0, &xmm_dst1);
2511 over_2x128 (&xmm_src, &xmm_src,
2512 &xmm_alpha, &xmm_alpha,
2513 &xmm_dst2, &xmm_dst3);
2514
2515 xmm_dst = pack_565_4x128_128 (
2516 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517
2518 save_128_aligned ((__m128i*)dst, xmm_dst);
2519
2520 dst += 8;
2521 w -= 8;
2522 }
2523
2524 while (w--)
2525 {
2526 d = *dst;
2527 *dst++ = pack_565_32_16 (
2528 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2529 expand565_16_1x128 (d))));
2530 }
2531 }
2532
2533 }
2534
2535 static void
2536 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2537 pixman_composite_info_t *info)
2538 {
2539 PIXMAN_COMPOSITE_ARGS (info);
2540 uint32_t src;
2541 uint32_t *dst_line, d;
2542 uint32_t *mask_line, m;
2543 uint32_t pack_cmp;
2544 int dst_stride, mask_stride;
2545
2546 __m128i xmm_src;
2547 __m128i xmm_dst;
2548 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549
2550 __m128i mmx_src, mmx_mask, mmx_dest;
2551
2552 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553
2554 if (src == 0)
2555 return;
2556
2557 PIXMAN_IMAGE_GET_LINE (
2558 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2559 PIXMAN_IMAGE_GET_LINE (
2560 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561
2562 xmm_src = _mm_unpacklo_epi8 (
2563 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2564 mmx_src = xmm_src;
2565
2566 while (height--)
2567 {
2568 int w = width;
2569 const uint32_t *pm = (uint32_t *)mask_line;
2570 uint32_t *pd = (uint32_t *)dst_line;
2571
2572 dst_line += dst_stride;
2573 mask_line += mask_stride;
2574
2575 while (w && (uintptr_t)pd & 15)
2576 {
2577 m = *pm++;
2578
2579 if (m)
2580 {
2581 d = *pd;
2582
2583 mmx_mask = unpack_32_1x128 (m);
2584 mmx_dest = unpack_32_1x128 (d);
2585
2586 *pd = pack_1x128_32 (
2587 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2588 mmx_dest));
2589 }
2590
2591 pd++;
2592 w--;
2593 }
2594
2595 while (w >= 4)
2596 {
2597 xmm_mask = load_128_unaligned ((__m128i*)pm);
2598
2599 pack_cmp =
2600 _mm_movemask_epi8 (
2601 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602
2603 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2604 if (pack_cmp != 0xffff)
2605 {
2606 xmm_dst = load_128_aligned ((__m128i*)pd);
2607
2608 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609
2610 pix_multiply_2x128 (&xmm_src, &xmm_src,
2611 &xmm_mask_lo, &xmm_mask_hi,
2612 &xmm_mask_lo, &xmm_mask_hi);
2613 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614
2615 save_128_aligned (
2616 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2617 }
2618
2619 pd += 4;
2620 pm += 4;
2621 w -= 4;
2622 }
2623
2624 while (w)
2625 {
2626 m = *pm++;
2627
2628 if (m)
2629 {
2630 d = *pd;
2631
2632 mmx_mask = unpack_32_1x128 (m);
2633 mmx_dest = unpack_32_1x128 (d);
2634
2635 *pd = pack_1x128_32 (
2636 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2637 mmx_dest));
2638 }
2639
2640 pd++;
2641 w--;
2642 }
2643 }
2644
2645 }
2646
2647 static void
2648 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2649 pixman_composite_info_t *info)
2650 {
2651 PIXMAN_COMPOSITE_ARGS (info);
2652 uint32_t src;
2653 uint32_t *dst_line, d;
2654 uint32_t *mask_line, m;
2655 uint32_t pack_cmp;
2656 int dst_stride, mask_stride;
2657
2658 __m128i xmm_src, xmm_alpha;
2659 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2660 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661
2662 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663
2664 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2665
2666 if (src == 0)
2667 return;
2668
2669 PIXMAN_IMAGE_GET_LINE (
2670 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2671 PIXMAN_IMAGE_GET_LINE (
2672 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2673
2674 xmm_src = _mm_unpacklo_epi8 (
2675 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2676 xmm_alpha = expand_alpha_1x128 (xmm_src);
2677 mmx_src = xmm_src;
2678 mmx_alpha = xmm_alpha;
2679
2680 while (height--)
2681 {
2682 int w = width;
2683 const uint32_t *pm = (uint32_t *)mask_line;
2684 uint32_t *pd = (uint32_t *)dst_line;
2685
2686 dst_line += dst_stride;
2687 mask_line += mask_stride;
2688
2689 while (w && (uintptr_t)pd & 15)
2690 {
2691 m = *pm++;
2692
2693 if (m)
2694 {
2695 d = *pd;
2696 mmx_mask = unpack_32_1x128 (m);
2697 mmx_dest = unpack_32_1x128 (d);
2698
2699 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2700 &mmx_alpha,
2701 &mmx_mask,
2702 &mmx_dest));
2703 }
2704
2705 pd++;
2706 w--;
2707 }
2708
2709 while (w >= 4)
2710 {
2711 xmm_mask = load_128_unaligned ((__m128i*)pm);
2712
2713 pack_cmp =
2714 _mm_movemask_epi8 (
2715 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716
2717 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2718 if (pack_cmp != 0xffff)
2719 {
2720 xmm_dst = load_128_aligned ((__m128i*)pd);
2721
2722 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2723 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724
2725 in_over_2x128 (&xmm_src, &xmm_src,
2726 &xmm_alpha, &xmm_alpha,
2727 &xmm_mask_lo, &xmm_mask_hi,
2728 &xmm_dst_lo, &xmm_dst_hi);
2729
2730 save_128_aligned (
2731 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2732 }
2733
2734 pd += 4;
2735 pm += 4;
2736 w -= 4;
2737 }
2738
2739 while (w)
2740 {
2741 m = *pm++;
2742
2743 if (m)
2744 {
2745 d = *pd;
2746 mmx_mask = unpack_32_1x128 (m);
2747 mmx_dest = unpack_32_1x128 (d);
2748
2749 *pd = pack_1x128_32 (
2750 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2751 }
2752
2753 pd++;
2754 w--;
2755 }
2756 }
2757
2758 }
2759
2760 static void
2761 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2762 pixman_composite_info_t *info)
2763 {
2764 PIXMAN_COMPOSITE_ARGS (info);
2765 uint32_t *dst_line, *dst;
2766 uint32_t *src_line, *src;
2767 uint32_t mask;
2768 int32_t w;
2769 int dst_stride, src_stride;
2770
2771 __m128i xmm_mask;
2772 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2773 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2774 __m128i xmm_alpha_lo, xmm_alpha_hi;
2775
2776 PIXMAN_IMAGE_GET_LINE (
2777 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2778 PIXMAN_IMAGE_GET_LINE (
2779 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780
2781 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2782
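/* Only the alpha byte of the solid mask is used; it is replicated
 * across every 16-bit lane of xmm_mask.
 */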
2783 xmm_mask = create_mask_16_128 (mask >> 24);
2784
2785 while (height--)
2786 {
2787 dst = dst_line;
2788 dst_line += dst_stride;
2789 src = src_line;
2790 src_line += src_stride;
2791 w = width;
2792
2793 while (w && (uintptr_t)dst & 15)
2794 {
2795 uint32_t s = *src++;
2796
2797 if (s)
2798 {
2799 uint32_t d = *dst;
2800
2801 __m128i ms = unpack_32_1x128 (s);
2802 __m128i alpha = expand_alpha_1x128 (ms);
2803 __m128i dest = xmm_mask;
2804 __m128i alpha_dst = unpack_32_1x128 (d);
2805
2806 *dst = pack_1x128_32 (
2807 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2808 }
2809 dst++;
2810 w--;
2811 }
2812
2813 while (w >= 4)
2814 {
2815 xmm_src = load_128_unaligned ((__m128i*)src);
2816
2817 if (!is_zero (xmm_src))
2818 {
2819 xmm_dst = load_128_aligned ((__m128i*)dst);
2820
2821 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2822 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2823 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2824 &xmm_alpha_lo, &xmm_alpha_hi);
2825
2826 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2827 &xmm_alpha_lo, &xmm_alpha_hi,
2828 &xmm_mask, &xmm_mask,
2829 &xmm_dst_lo, &xmm_dst_hi);
2830
2831 save_128_aligned (
2832 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2833 }
2834
2835 dst += 4;
2836 src += 4;
2837 w -= 4;
2838 }
2839
2840 while (w)
2841 {
2842 uint32_t s = *src++;
2843
2844 if (s)
2845 {
2846 uint32_t d = *dst;
2847
2848 __m128i ms = unpack_32_1x128 (s);
2849 __m128i alpha = expand_alpha_1x128 (ms);
2850 __m128i mask = xmm_mask;
2851 __m128i dest = unpack_32_1x128 (d);
2852
2853 *dst = pack_1x128_32 (
2854 in_over_1x128 (&ms, &alpha, &mask, &dest));
2855 }
2856
2857 dst++;
2858 w--;
2859 }
2860 }
2861
2862 }
2863
2864 static void
2865 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2866 pixman_composite_info_t *info)
2867 {
2868 PIXMAN_COMPOSITE_ARGS (info);
2869 uint16_t *dst_line, *dst;
2870 uint32_t *src_line, *src, s;
2871 int dst_stride, src_stride;
2872 int32_t w;
2873
2874 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2875 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2876
2877 while (height--)
2878 {
2879 dst = dst_line;
2880 dst_line += dst_stride;
2881 src = src_line;
2882 src_line += src_stride;
2883 w = width;
2884
2885 while (w && (uintptr_t)dst & 15)
2886 {
2887 s = *src++;
2888 *dst = convert_8888_to_0565 (s);
2889 dst++;
2890 w--;
2891 }
2892
2893 while (w >= 8)
2894 {
2895 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2896 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897
2898 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2899
2900 w -= 8;
2901 src += 8;
2902 dst += 8;
2903 }
2904
2905 while (w)
2906 {
2907 s = *src++;
2908 *dst = convert_8888_to_0565 (s);
2909 dst++;
2910 w--;
2911 }
2912 }
2913 }
2914
2915 static void
2916 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2917 pixman_composite_info_t *info)
2918 {
2919 PIXMAN_COMPOSITE_ARGS (info);
2920 uint32_t *dst_line, *dst;
2921 uint32_t *src_line, *src;
2922 int32_t w;
2923 int dst_stride, src_stride;
2924
2926 PIXMAN_IMAGE_GET_LINE (
2927 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2928 PIXMAN_IMAGE_GET_LINE (
2929 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2930
2931 while (height--)
2932 {
2933 dst = dst_line;
2934 dst_line += dst_stride;
2935 src = src_line;
2936 src_line += src_stride;
2937 w = width;
2938
2939 while (w && (uintptr_t)dst & 15)
2940 {
2941 *dst++ = *src++ | 0xff000000;
2942 w--;
2943 }
2944
2945 while (w >= 16)
2946 {
2947 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948
2949 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2950 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2951 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2952 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953
2954 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2955 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2956 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2957 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2958
2959 dst += 16;
2960 src += 16;
2961 w -= 16;
2962 }
2963
2964 while (w)
2965 {
2966 *dst++ = *src++ | 0xff000000;
2967 w--;
2968 }
2969 }
2970
2971 }
2972
2973 static void
2974 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2975 pixman_composite_info_t *info)
2976 {
2977 PIXMAN_COMPOSITE_ARGS (info);
2978 uint32_t *dst_line, *dst;
2979 uint32_t *src_line, *src;
2980 uint32_t mask;
2981 int dst_stride, src_stride;
2982 int32_t w;
2983
2984 __m128i xmm_mask, xmm_alpha;
2985 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2986 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987
2988 PIXMAN_IMAGE_GET_LINE (
2989 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2990 PIXMAN_IMAGE_GET_LINE (
2991 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992
2993 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994
2995 xmm_mask = create_mask_16_128 (mask >> 24);
2996 xmm_alpha = mask_00ff;
2997
2998 while (height--)
2999 {
3000 dst = dst_line;
3001 dst_line += dst_stride;
3002 src = src_line;
3003 src_line += src_stride;
3004 w = width;
3005
3006 while (w && (uintptr_t)dst & 15)
3007 {
3008 uint32_t s = (*src++) | 0xff000000;
3009 uint32_t d = *dst;
3010
3011 __m128i src = unpack_32_1x128 (s);
3012 __m128i alpha = xmm_alpha;
3013 __m128i mask = xmm_mask;
3014 __m128i dest = unpack_32_1x128 (d);
3015
3016 *dst++ = pack_1x128_32 (
3017 in_over_1x128 (&src, &alpha, &mask, &dest));
3018
3019 w--;
3020 }
3021
3022 while (w >= 4)
3023 {
3024 xmm_src = _mm_or_si128 (
3025 load_128_unaligned ((__m128i*)src), mask_ff000000);
3026 xmm_dst = load_128_aligned ((__m128i*)dst);
3027
3028 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3029 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030
3031 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3032 &xmm_alpha, &xmm_alpha,
3033 &xmm_mask, &xmm_mask,
3034 &xmm_dst_lo, &xmm_dst_hi);
3035
3036 save_128_aligned (
3037 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038
3039 dst += 4;
3040 src += 4;
3041 w -= 4;
3042
3043 }
3044
3045 while (w)
3046 {
3047 uint32_t s = (*src++) | 0xff000000;
3048 uint32_t d = *dst;
3049
3050 __m128i src = unpack_32_1x128 (s);
3051 __m128i alpha = xmm_alpha;
3052 __m128i mask = xmm_mask;
3053 __m128i dest = unpack_32_1x128 (d);
3054
3055 *dst++ = pack_1x128_32 (
3056 in_over_1x128 (&src, &alpha, &mask, &dest));
3057
3058 w--;
3059 }
3060 }
3061
3062 }
3063
3064 static void
3065 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3066 pixman_composite_info_t *info)
3067 {
3068 PIXMAN_COMPOSITE_ARGS (info);
3069 int dst_stride, src_stride;
3070 uint32_t *dst_line, *dst;
3071 uint32_t *src_line, *src;
3072
3073 PIXMAN_IMAGE_GET_LINE (
3074 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3075 PIXMAN_IMAGE_GET_LINE (
3076 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077
3078 dst = dst_line;
3079 src = src_line;
3080
3081 while (height--)
3082 {
3083 sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084
3085 dst += dst_stride;
3086 src += src_stride;
3087 }
3088 }
3089
3090 static force_inline uint16_t
3091 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 {
3093 __m128i ms;
3094
3095 ms = unpack_32_1x128 (src);
3096 return pack_565_32_16 (
3097 pack_1x128_32 (
3098 over_1x128 (
3099 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3100 }
3101
3102 static void
3103 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3104 pixman_composite_info_t *info)
3105 {
3106 PIXMAN_COMPOSITE_ARGS (info);
3107 uint16_t *dst_line, *dst, d;
3108 uint32_t *src_line, *src, s;
3109 int dst_stride, src_stride;
3110 int32_t w;
3111
3112 __m128i xmm_alpha_lo, xmm_alpha_hi;
3113 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3114 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115
3116 PIXMAN_IMAGE_GET_LINE (
3117 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3118 PIXMAN_IMAGE_GET_LINE (
3119 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3120
3121 while (height--)
3122 {
3123 dst = dst_line;
3124 src = src_line;
3125
3126 dst_line += dst_stride;
3127 src_line += src_stride;
3128 w = width;
3129
3130 /* Align dst on a 16-byte boundary */
3131 while (w &&
3132 ((uintptr_t)dst & 15))
3133 {
3134 s = *src++;
3135 d = *dst;
3136
3137 *dst++ = composite_over_8888_0565pixel (s, d);
3138 w--;
3139 }
3140
3141 /* It's an 8-pixel loop */
3142 while (w >= 8)
3143 {
3144 /* Load the source unaligned, since its address
3145  * may not be 16-byte aligned.
3146  */
3147 xmm_src = load_128_unaligned ((__m128i*) src);
3148 xmm_dst = load_128_aligned ((__m128i*) dst);
3149
3150 /* Unpacking */
3151 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3152 unpack_565_128_4x128 (xmm_dst,
3153 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3154 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3155 &xmm_alpha_lo, &xmm_alpha_hi);
3156
3157 /* Load the next 4 source pixels early so the
3158  * memory read overlaps with the computation.
3159  */
3160 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161
3162 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3163 &xmm_alpha_lo, &xmm_alpha_hi,
3164 &xmm_dst0, &xmm_dst1);
3165
3166 /* Unpacking */
3167 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3169 &xmm_alpha_lo, &xmm_alpha_hi);
3170
3171 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3172 &xmm_alpha_lo, &xmm_alpha_hi,
3173 &xmm_dst2, &xmm_dst3);
3174
3175 save_128_aligned (
3176 (__m128i*)dst, pack_565_4x128_128 (
3177 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3178
3179 w -= 8;
3180 dst += 8;
3181 src += 8;
3182 }
3183
3184 while (w--)
3185 {
3186 s = *src++;
3187 d = *dst;
3188
3189 *dst++ = composite_over_8888_0565pixel (s, d);
3190 }
3191 }
3192
3193 }
3194
3195 static void
3196 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3197 pixman_composite_info_t *info)
3198 {
3199 PIXMAN_COMPOSITE_ARGS (info);
3200 uint32_t src, srca;
3201 uint32_t *dst_line, *dst;
3202 uint8_t *mask_line, *mask;
3203 int dst_stride, mask_stride;
3204 int32_t w;
3205 uint32_t m, d;
3206
3207 __m128i xmm_src, xmm_alpha, xmm_def;
3208 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3209 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210
3211 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212
3213 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3214
3215 srca = src >> 24;
3216 if (src == 0)
3217 return;
3218
3219 PIXMAN_IMAGE_GET_LINE (
3220 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3221 PIXMAN_IMAGE_GET_LINE (
3222 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3223
3224 xmm_def = create_mask_2x32_128 (src, src);
3225 xmm_src = expand_pixel_32_1x128 (src);
3226 xmm_alpha = expand_alpha_1x128 (xmm_src);
3227 mmx_src = xmm_src;
3228 mmx_alpha = xmm_alpha;
3229
3230 while (height--)
3231 {
3232 dst = dst_line;
3233 dst_line += dst_stride;
3234 mask = mask_line;
3235 mask_line += mask_stride;
3236 w = width;
3237
3238 while (w && (uintptr_t)dst & 15)
3239 {
3240 uint8_t m = *mask++;
3241
3242 if (m)
3243 {
3244 d = *dst;
3245 mmx_mask = expand_pixel_8_1x128 (m);
3246 mmx_dest = unpack_32_1x128 (d);
3247
3248 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3249 &mmx_alpha,
3250 &mmx_mask,
3251 &mmx_dest));
3252 }
3253
3254 w--;
3255 dst++;
3256 }
3257
3258 while (w >= 4)
3259 {
3260 memcpy(&m, mask, sizeof(uint32_t));
3261
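/* m now holds four a8 mask values; memcpy performs the possibly
 * unaligned 32-bit read safely. A fully opaque mask with an opaque
 * source stores the solid color directly, and an all-zero mask
 * leaves the destination untouched.
 */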
3262 if (srca == 0xff && m == 0xffffffff)
3263 {
3264 save_128_aligned ((__m128i*)dst, xmm_def);
3265 }
3266 else if (m)
3267 {
3268 xmm_dst = load_128_aligned ((__m128i*) dst);
3269 xmm_mask = unpack_32_1x128 (m);
3270 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3271
3272 /* Unpacking */
3273 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3274 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3275
3276 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3277 &xmm_mask_lo, &xmm_mask_hi);
3278
3279 in_over_2x128 (&xmm_src, &xmm_src,
3280 &xmm_alpha, &xmm_alpha,
3281 &xmm_mask_lo, &xmm_mask_hi,
3282 &xmm_dst_lo, &xmm_dst_hi);
3283
3284 save_128_aligned (
3285 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3286 }
3287
3288 w -= 4;
3289 dst += 4;
3290 mask += 4;
3291 }
3292
3293 while (w)
3294 {
3295 uint8_t m = *mask++;
3296
3297 if (m)
3298 {
3299 d = *dst;
3300 mmx_mask = expand_pixel_8_1x128 (m);
3301 mmx_dest = unpack_32_1x128 (d);
3302
3303 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3304 &mmx_alpha,
3305 &mmx_mask,
3306 &mmx_dest));
3307 }
3308
3309 w--;
3310 dst++;
3311 }
3312 }
3313
3314 }
3315
3316 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3317 __attribute__((__force_align_arg_pointer__))
3318 #endif
3319 static pixman_bool_t
3320 sse2_fill (pixman_implementation_t *imp,
3321 uint32_t * bits,
3322 int stride,
3323 int bpp,
3324 int x,
3325 int y,
3326 int width,
3327 int height,
3328 uint32_t filler)
3329 {
3330 uint32_t byte_width;
3331 uint8_t *byte_line;
3332
3333 __m128i xmm_def;
3334
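/* The incoming stride is in uint32_t units; convert it to a byte
 * stride for the given bpp, and replicate the filler so that a
 * full 32-bit word (and later a whole XMM register) stores it.
 */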
3335 if (bpp == 8)
3336 {
3337 uint32_t b;
3338 uint32_t w;
3339
3340 stride = stride * (int) sizeof (uint32_t) / 1;
3341 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3342 byte_width = width;
3343 stride *= 1;
3344
3345 b = filler & 0xff;
3346 w = (b << 8) | b;
3347 filler = (w << 16) | w;
3348 }
3349 else if (bpp == 16)
3350 {
3351 stride = stride * (int) sizeof (uint32_t) / 2;
3352 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3353 byte_width = 2 * width;
3354 stride *= 2;
3355
3356 filler = (filler & 0xffff) * 0x00010001;
3357 }
3358 else if (bpp == 32)
3359 {
3360 stride = stride * (int) sizeof (uint32_t) / 4;
3361 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3362 byte_width = 4 * width;
3363 stride *= 4;
3364 }
3365 else
3366 {
3367 return FALSE;
3368 }
3369
3370 xmm_def = create_mask_2x32_128 (filler, filler);
3371
3372 while (height--)
3373 {
3374 int w;
3375 uint8_t *d = byte_line;
3376 byte_line += stride;
3377 w = byte_width;
3378
3379 if (w >= 1 && ((uintptr_t)d & 1))
3380 {
3381 *(uint8_t *)d = filler;
3382 w -= 1;
3383 d += 1;
3384 }
3385
3386 while (w >= 2 && ((uintptr_t)d & 3))
3387 {
3388 *(uint16_t *)d = filler;
3389 w -= 2;
3390 d += 2;
3391 }
3392
3393 while (w >= 4 && ((uintptr_t)d & 15))
3394 {
3395 *(uint32_t *)d = filler;
3396
3397 w -= 4;
3398 d += 4;
3399 }
3400
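/* Store in decreasing power-of-two chunks: 128, 64, 32 and 16
 * bytes of aligned stores, then a scalar tail.
 */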
3401 while (w >= 128)
3402 {
3403 save_128_aligned ((__m128i*)(d), xmm_def);
3404 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3405 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3406 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3407 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3408 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3409 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3410 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3411
3412 d += 128;
3413 w -= 128;
3414 }
3415
3416 if (w >= 64)
3417 {
3418 save_128_aligned ((__m128i*)(d), xmm_def);
3419 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3420 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3421 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3422
3423 d += 64;
3424 w -= 64;
3425 }
3426
3427 if (w >= 32)
3428 {
3429 save_128_aligned ((__m128i*)(d), xmm_def);
3430 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3431
3432 d += 32;
3433 w -= 32;
3434 }
3435
3436 if (w >= 16)
3437 {
3438 save_128_aligned ((__m128i*)(d), xmm_def);
3439
3440 d += 16;
3441 w -= 16;
3442 }
3443
3444 while (w >= 4)
3445 {
3446 *(uint32_t *)d = filler;
3447
3448 w -= 4;
3449 d += 4;
3450 }
3451
3452 if (w >= 2)
3453 {
3454 *(uint16_t *)d = filler;
3455 w -= 2;
3456 d += 2;
3457 }
3458
3459 if (w >= 1)
3460 {
3461 *(uint8_t *)d = filler;
3462 w -= 1;
3463 d += 1;
3464 }
3465 }
3466
3467 return TRUE;
3468 }
3469
3470 static void
3471 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3472 pixman_composite_info_t *info)
3473 {
3474 PIXMAN_COMPOSITE_ARGS (info);
3475 uint32_t src, srca;
3476 uint32_t *dst_line, *dst;
3477 uint8_t *mask_line, *mask;
3478 int dst_stride, mask_stride;
3479 int32_t w;
3480 uint32_t m;
3481
3482 __m128i xmm_src, xmm_def;
3483 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484
3485 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3486
3487 srca = src >> 24;
3488 if (src == 0)
3489 {
3490 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3491 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3492 dest_x, dest_y, width, height, 0);
3493 return;
3494 }
3495
3496 PIXMAN_IMAGE_GET_LINE (
3497 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3498 PIXMAN_IMAGE_GET_LINE (
3499 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500
3501 xmm_def = create_mask_2x32_128 (src, src);
3502 xmm_src = expand_pixel_32_1x128 (src);
3503
3504 while (height--)
3505 {
3506 dst = dst_line;
3507 dst_line += dst_stride;
3508 mask = mask_line;
3509 mask_line += mask_stride;
3510 w = width;
3511
3512 while (w && (uintptr_t)dst & 15)
3513 {
3514 uint8_t m = *mask++;
3515
3516 if (m)
3517 {
3518 *dst = pack_1x128_32 (
3519 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3520 }
3521 else
3522 {
3523 *dst = 0;
3524 }
3525
3526 w--;
3527 dst++;
3528 }
3529
3530 while (w >= 4)
3531 {
3532 memcpy(&m, mask, sizeof(uint32_t));
3533
3534 if (srca == 0xff && m == 0xffffffff)
3535 {
3536 save_128_aligned ((__m128i*)dst, xmm_def);
3537 }
3538 else if (m)
3539 {
3540 xmm_mask = unpack_32_1x128 (m);
3541 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3542
3543 /* Unpacking */
3544 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3545
3546 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3547 &xmm_mask_lo, &xmm_mask_hi);
3548
3549 pix_multiply_2x128 (&xmm_src, &xmm_src,
3550 &xmm_mask_lo, &xmm_mask_hi,
3551 &xmm_mask_lo, &xmm_mask_hi);
3552
3553 save_128_aligned (
3554 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3555 }
3556 else
3557 {
3558 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3559 }
3560
3561 w -= 4;
3562 dst += 4;
3563 mask += 4;
3564 }
3565
3566 while (w)
3567 {
3568 uint8_t m = *mask++;
3569
3570 if (m)
3571 {
3572 *dst = pack_1x128_32 (
3573 pix_multiply_1x128 (
3574 xmm_src, expand_pixel_8_1x128 (m)));
3575 }
3576 else
3577 {
3578 *dst = 0;
3579 }
3580
3581 w--;
3582 dst++;
3583 }
3584 }
3585
3586 }
3587
3588 static void
3589 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3590 pixman_composite_info_t *info)
3591 {
3592 PIXMAN_COMPOSITE_ARGS (info);
3593 uint32_t src;
3594 uint16_t *dst_line, *dst, d;
3595 uint8_t *mask_line, *mask;
3596 int dst_stride, mask_stride;
3597 int32_t w;
3598 uint32_t m;
3599 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600
3601 __m128i xmm_src, xmm_alpha;
3602 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3603 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604
3605 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3606
3607 if (src == 0)
3608 return;
3609
3610 PIXMAN_IMAGE_GET_LINE (
3611 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3612 PIXMAN_IMAGE_GET_LINE (
3613 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614
3615 xmm_src = expand_pixel_32_1x128 (src);
3616 xmm_alpha = expand_alpha_1x128 (xmm_src);
3617 mmx_src = xmm_src;
3618 mmx_alpha = xmm_alpha;
3619
3620 while (height--)
3621 {
3622 dst = dst_line;
3623 dst_line += dst_stride;
3624 mask = mask_line;
3625 mask_line += mask_stride;
3626 w = width;
3627
3628 while (w && (uintptr_t)dst & 15)
3629 {
3630 m = *mask++;
3631
3632 if (m)
3633 {
3634 d = *dst;
3635 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3636 mmx_dest = expand565_16_1x128 (d);
3637
3638 *dst = pack_565_32_16 (
3639 pack_1x128_32 (
3640 in_over_1x128 (
3641 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3642 }
3643
3644 w--;
3645 dst++;
3646 }
3647
3648 while (w >= 8)
3649 {
3650 xmm_dst = load_128_aligned ((__m128i*) dst);
3651 unpack_565_128_4x128 (xmm_dst,
3652 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3653
3654 memcpy(&m, mask, sizeof(uint32_t));
3655 mask += 4;
3656
3657 if (m)
3658 {
3659 xmm_mask = unpack_32_1x128 (m);
3660 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3661
3662 /* Unpacking */
3663 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3664
3665 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3666 &xmm_mask_lo, &xmm_mask_hi);
3667
3668 in_over_2x128 (&xmm_src, &xmm_src,
3669 &xmm_alpha, &xmm_alpha,
3670 &xmm_mask_lo, &xmm_mask_hi,
3671 &xmm_dst0, &xmm_dst1);
3672 }
3673
3674 memcpy(&m, mask, sizeof(uint32_t));
3675 mask += 4;
3676
3677 if (m)
3678 {
3679 xmm_mask = unpack_32_1x128 (m);
3680 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3681
3682 /* Unpacking */
3683 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3684
3685 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3686 &xmm_mask_lo, &xmm_mask_hi);
3687 in_over_2x128 (&xmm_src, &xmm_src,
3688 &xmm_alpha, &xmm_alpha,
3689 &xmm_mask_lo, &xmm_mask_hi,
3690 &xmm_dst2, &xmm_dst3);
3691 }
3692
3693 save_128_aligned (
3694 (__m128i*)dst, pack_565_4x128_128 (
3695 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3696
3697 w -= 8;
3698 dst += 8;
3699 }
3700
3701 while (w)
3702 {
3703 m = *mask++;
3704
3705 if (m)
3706 {
3707 d = *dst;
3708 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3709 mmx_dest = expand565_16_1x128 (d);
3710
3711 *dst = pack_565_32_16 (
3712 pack_1x128_32 (
3713 in_over_1x128 (
3714 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3715 }
3716
3717 w--;
3718 dst++;
3719 }
3720 }
3721
3722 }
3723
3724 static void
3725 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3726 pixman_composite_info_t *info)
3727 {
3728 PIXMAN_COMPOSITE_ARGS (info);
3729 uint16_t *dst_line, *dst, d;
3730 uint32_t *src_line, *src, s;
3731 int dst_stride, src_stride;
3732 int32_t w;
3733 uint32_t opaque, zero;
3734
3735 __m128i ms;
3736 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3737 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3738
3739 PIXMAN_IMAGE_GET_LINE (
3740 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3741 PIXMAN_IMAGE_GET_LINE (
3742 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3743
3744 while (height--)
3745 {
3746 dst = dst_line;
3747 dst_line += dst_stride;
3748 src = src_line;
3749 src_line += src_stride;
3750 w = width;
3751
3752 while (w && (uintptr_t)dst & 15)
3753 {
3754 s = *src++;
3755 d = *dst;
3756
3757 ms = unpack_32_1x128 (s);
3758
3759 *dst++ = pack_565_32_16 (
3760 pack_1x128_32 (
3761 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3762 w--;
3763 }
3764
3765 while (w >= 8)
3766 {
3767 /* First round */
3768 xmm_src = load_128_unaligned ((__m128i*)src);
3769 xmm_dst = load_128_aligned ((__m128i*)dst);
3770
3771 opaque = is_opaque (xmm_src);
3772 zero = is_zero (xmm_src);
3773
3774 unpack_565_128_4x128 (xmm_dst,
3775 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3776 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3777
3778 /* preload next round */
3779 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3780
3781 if (opaque)
3782 {
3783 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3784 &xmm_dst0, &xmm_dst1);
3785 }
3786 else if (!zero)
3787 {
3788 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3789 &xmm_dst0, &xmm_dst1);
3790 }
3791
3792 /* Second round */
3793 opaque = is_opaque (xmm_src);
3794 zero = is_zero (xmm_src);
3795
3796 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3797
3798 if (opaque)
3799 {
3800 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3801 &xmm_dst2, &xmm_dst3);
3802 }
3803 else if (!zero)
3804 {
3805 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3806 &xmm_dst2, &xmm_dst3);
3807 }
3808
3809 save_128_aligned (
3810 (__m128i*)dst, pack_565_4x128_128 (
3811 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3812
3813 w -= 8;
3814 src += 8;
3815 dst += 8;
3816 }
3817
3818 while (w)
3819 {
3820 s = *src++;
3821 d = *dst;
3822
3823 ms = unpack_32_1x128 (s);
3824
3825 *dst++ = pack_565_32_16 (
3826 pack_1x128_32 (
3827 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3828 w--;
3829 }
3830 }
3831
3832 }
3833
3834 static void
3835 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3836 pixman_composite_info_t *info)
3837 {
3838 PIXMAN_COMPOSITE_ARGS (info);
3839 uint32_t *dst_line, *dst, d;
3840 uint32_t *src_line, *src, s;
3841 int dst_stride, src_stride;
3842 int32_t w;
3843 uint32_t opaque, zero;
3844
3845 __m128i xmm_src_lo, xmm_src_hi;
3846 __m128i xmm_dst_lo, xmm_dst_hi;
3847
3848 PIXMAN_IMAGE_GET_LINE (
3849 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3850 PIXMAN_IMAGE_GET_LINE (
3851 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3852
3853 while (height--)
3854 {
3855 dst = dst_line;
3856 dst_line += dst_stride;
3857 src = src_line;
3858 src_line += src_stride;
3859 w = width;
3860
3861 while (w && (uintptr_t)dst & 15)
3862 {
3863 s = *src++;
3864 d = *dst;
3865
3866 *dst++ = pack_1x128_32 (
3867 over_rev_non_pre_1x128 (
3868 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3869
3870 w--;
3871 }
3872
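/* is_opaque and is_zero classify each block of four source pixels:
 * fully opaque blocks only need the invert_colors_2x128 channel
 * swap, fully transparent blocks are skipped entirely, and only
 * mixed blocks pay for the full over_rev_non_pre_2x128 blend.
 */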
3873 while (w >= 4)
3874 {
3875 xmm_src_hi = load_128_unaligned ((__m128i*)src);
3876
3877 opaque = is_opaque (xmm_src_hi);
3878 zero = is_zero (xmm_src_hi);
3879
3880 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3881
3882 if (opaque)
3883 {
3884 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3885 &xmm_dst_lo, &xmm_dst_hi);
3886
3887 save_128_aligned (
3888 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3889 }
3890 else if (!zero)
3891 {
3892 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3893
3894 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3895
3896 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3897 &xmm_dst_lo, &xmm_dst_hi);
3898
3899 save_128_aligned (
3900 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3901 }
3902
3903 w -= 4;
3904 dst += 4;
3905 src += 4;
3906 }
3907
3908 while (w)
3909 {
3910 s = *src++;
3911 d = *dst;
3912
3913 *dst++ = pack_1x128_32 (
3914 over_rev_non_pre_1x128 (
3915 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3916
3917 w--;
3918 }
3919 }
3920
3921 }
3922
3923 static void
3924 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3925 pixman_composite_info_t *info)
3926 {
3927 PIXMAN_COMPOSITE_ARGS (info);
3928 uint32_t src;
3929 uint16_t *dst_line, *dst, d;
3930 uint32_t *mask_line, *mask, m;
3931 int dst_stride, mask_stride;
3932 int w;
3933 uint32_t pack_cmp;
3934
3935 __m128i xmm_src, xmm_alpha;
3936 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3937 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3938
3939 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3940
3941 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3942
3943 if (src == 0)
3944 return;
3945
3946 PIXMAN_IMAGE_GET_LINE (
3947 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3948 PIXMAN_IMAGE_GET_LINE (
3949 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3950
3951 xmm_src = expand_pixel_32_1x128 (src);
3952 xmm_alpha = expand_alpha_1x128 (xmm_src);
3953 mmx_src = xmm_src;
3954 mmx_alpha = xmm_alpha;
3955
3956 while (height--)
3957 {
3958 w = width;
3959 mask = mask_line;
3960 dst = dst_line;
3961 mask_line += mask_stride;
3962 dst_line += dst_stride;
3963
3964 while (w && ((uintptr_t)dst & 15))
3965 {
3966 m = *(uint32_t *) mask;
3967
3968 if (m)
3969 {
3970 d = *dst;
3971 mmx_mask = unpack_32_1x128 (m);
3972 mmx_dest = expand565_16_1x128 (d);
3973
3974 *dst = pack_565_32_16 (
3975 pack_1x128_32 (
3976 in_over_1x128 (
3977 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3978 }
3979
3980 w--;
3981 dst++;
3982 mask++;
3983 }
3984
3985 while (w >= 8)
3986 {
3987 /* First round */
3988 xmm_mask = load_128_unaligned ((__m128i*)mask);
3989 xmm_dst = load_128_aligned ((__m128i*)dst);
3990
3991 pack_cmp = _mm_movemask_epi8 (
3992 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3993
3994 unpack_565_128_4x128 (xmm_dst,
3995 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3996 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3997
3998 /* preload next round */
3999 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4000
4002 if (pack_cmp != 0xffff)
4003 {
4004 in_over_2x128 (&xmm_src, &xmm_src,
4005 &xmm_alpha, &xmm_alpha,
4006 &xmm_mask_lo, &xmm_mask_hi,
4007 &xmm_dst0, &xmm_dst1);
4008 }
4009
4010 /* Second round */
4011 pack_cmp = _mm_movemask_epi8 (
4012 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4013
4014 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4015
4016 if (pack_cmp != 0xffff)
4017 {
4018 in_over_2x128 (&xmm_src, &xmm_src,
4019 &xmm_alpha, &xmm_alpha,
4020 &xmm_mask_lo, &xmm_mask_hi,
4021 &xmm_dst2, &xmm_dst3);
4022 }
4023
4024 save_128_aligned (
4025 (__m128i*)dst, pack_565_4x128_128 (
4026 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4027
4028 w -= 8;
4029 dst += 8;
4030 mask += 8;
4031 }
4032
4033 while (w)
4034 {
4035 m = *(uint32_t *) mask;
4036
4037 if (m)
4038 {
4039 d = *dst;
4040 mmx_mask = unpack_32_1x128 (m);
4041 mmx_dest = expand565_16_1x128 (d);
4042
4043 *dst = pack_565_32_16 (
4044 pack_1x128_32 (
4045 in_over_1x128 (
4046 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4047 }
4048
4049 w--;
4050 dst++;
4051 mask++;
4052 }
4053 }
4054
4055 }
4056
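/* IN with a solid source and an a8 mask:
 * dst = src_alpha * mask * dst, per channel.
 */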
4057 static void
4058 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4059 pixman_composite_info_t *info)
4060 {
4061 PIXMAN_COMPOSITE_ARGS (info);
4062 uint8_t *dst_line, *dst;
4063 uint8_t *mask_line, *mask;
4064 int dst_stride, mask_stride;
4065 uint32_t d, m;
4066 uint32_t src;
4067 int32_t w;
4068
4069 __m128i xmm_alpha;
4070 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4071 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4072
4073 PIXMAN_IMAGE_GET_LINE (
4074 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4075 PIXMAN_IMAGE_GET_LINE (
4076 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4077
4078 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4079
4080 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4081
4082 while (height--)
4083 {
4084 dst = dst_line;
4085 dst_line += dst_stride;
4086 mask = mask_line;
4087 mask_line += mask_stride;
4088 w = width;
4089
4090 while (w && ((uintptr_t)dst & 15))
4091 {
4092 m = (uint32_t) *mask++;
4093 d = (uint32_t) *dst;
4094
4095 *dst++ = (uint8_t) pack_1x128_32 (
4096 pix_multiply_1x128 (
4097 pix_multiply_1x128 (xmm_alpha,
4098 unpack_32_1x128 (m)),
4099 unpack_32_1x128 (d)));
4100 w--;
4101 }
4102
4103 while (w >= 16)
4104 {
4105 xmm_mask = load_128_unaligned ((__m128i*)mask);
4106 xmm_dst = load_128_aligned ((__m128i*)dst);
4107
4108 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4109 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4110
4111 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4112 &xmm_mask_lo, &xmm_mask_hi,
4113 &xmm_mask_lo, &xmm_mask_hi);
4114
4115 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4116 &xmm_dst_lo, &xmm_dst_hi,
4117 &xmm_dst_lo, &xmm_dst_hi);
4118
4119 save_128_aligned (
4120 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4121
4122 mask += 16;
4123 dst += 16;
4124 w -= 16;
4125 }
4126
4127 while (w)
4128 {
4129 m = (uint32_t) *mask++;
4130 d = (uint32_t) *dst;
4131
4132 *dst++ = (uint8_t) pack_1x128_32 (
4133 pix_multiply_1x128 (
4134 pix_multiply_1x128 (
4135 xmm_alpha, unpack_32_1x128 (m)),
4136 unpack_32_1x128 (d)));
4137 w--;
4138 }
4139 }
4140
4141 }
4142
4143 static void
4144 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4145 pixman_composite_info_t *info)
4146 {
4147 PIXMAN_COMPOSITE_ARGS (info);
4148 uint8_t *dst_line, *dst;
4149 int dst_stride;
4150 uint32_t d;
4151 uint32_t src;
4152 int32_t w;
4153
4154 __m128i xmm_alpha;
4155 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4156
4157 PIXMAN_IMAGE_GET_LINE (
4158 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4159
4160 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4161
4162 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4163
4164 src = src >> 24;
4165
4166 if (src == 0xff)
4167 return;
4168
4169 if (src == 0x00)
4170 {
4171 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4172 8, dest_x, dest_y, width, height, src);
4173
4174 return;
4175 }
4176
4177 while (height--)
4178 {
4179 dst = dst_line;
4180 dst_line += dst_stride;
4181 w = width;
4182
4183 while (w && ((uintptr_t)dst & 15))
4184 {
4185 d = (uint32_t) *dst;
4186
4187 *dst++ = (uint8_t) pack_1x128_32 (
4188 pix_multiply_1x128 (
4189 xmm_alpha,
4190 unpack_32_1x128 (d)));
4191 w--;
4192 }
4193
4194 while (w >= 16)
4195 {
4196 xmm_dst = load_128_aligned ((__m128i*)dst);
4197
4198 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4199
4200 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4201 &xmm_dst_lo, &xmm_dst_hi,
4202 &xmm_dst_lo, &xmm_dst_hi);
4203
4204 save_128_aligned (
4205 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4206
4207 dst += 16;
4208 w -= 16;
4209 }
4210
4211 while (w)
4212 {
4213 d = (uint32_t) *dst;
4214
4215 *dst++ = (uint8_t) pack_1x128_32 (
4216 pix_multiply_1x128 (
4217 xmm_alpha,
4218 unpack_32_1x128 (d)));
4219 w--;
4220 }
4221 }
4222
4223 }
4224
4225 static void
4226 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4227 pixman_composite_info_t *info)
4228 {
4229 PIXMAN_COMPOSITE_ARGS (info);
4230 uint8_t *dst_line, *dst;
4231 uint8_t *src_line, *src;
4232 int src_stride, dst_stride;
4233 int32_t w;
4234 uint32_t s, d;
4235
4236 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4237 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4238
4239 PIXMAN_IMAGE_GET_LINE (
4240 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4241 PIXMAN_IMAGE_GET_LINE (
4242 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4243
4244 while (height--)
4245 {
4246 dst = dst_line;
4247 dst_line += dst_stride;
4248 src = src_line;
4249 src_line += src_stride;
4250 w = width;
4251
4252 while (w && ((uintptr_t)dst & 15))
4253 {
4254 s = (uint32_t) *src++;
4255 d = (uint32_t) *dst;
4256
4257 *dst++ = (uint8_t) pack_1x128_32 (
4258 pix_multiply_1x128 (
4259 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4260 w--;
4261 }
4262
4263 while (w >= 16)
4264 {
4265 xmm_src = load_128_unaligned ((__m128i*)src);
4266 xmm_dst = load_128_aligned ((__m128i*)dst);
4267
4268 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4269 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4270
4271 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4272 &xmm_dst_lo, &xmm_dst_hi,
4273 &xmm_dst_lo, &xmm_dst_hi);
4274
4275 save_128_aligned (
4276 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4277
4278 src += 16;
4279 dst += 16;
4280 w -= 16;
4281 }
4282
4283 while (w)
4284 {
4285 s = (uint32_t) *src++;
4286 d = (uint32_t) *dst;
4287
4288 *dst++ = (uint8_t) pack_1x128_32 (
4289 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4290 w--;
4291 }
4292 }
4293
4294 }
4295
4296 static void
4297 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4298 pixman_composite_info_t *info)
4299 {
4300 PIXMAN_COMPOSITE_ARGS (info);
4301 uint8_t *dst_line, *dst;
4302 uint8_t *mask_line, *mask;
4303 int dst_stride, mask_stride;
4304 int32_t w;
4305 uint32_t src;
4306 uint32_t m, d;
4307
4308 __m128i xmm_alpha;
4309 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4310 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4311
4312 PIXMAN_IMAGE_GET_LINE (
4313 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4314 PIXMAN_IMAGE_GET_LINE (
4315 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4316
4317 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4318
4319 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4320
4321 while (height--)
4322 {
4323 dst = dst_line;
4324 dst_line += dst_stride;
4325 mask = mask_line;
4326 mask_line += mask_stride;
4327 w = width;
4328
4329 while (w && ((uintptr_t)dst & 15))
4330 {
4331 m = (uint32_t) *mask++;
4332 d = (uint32_t) *dst;
4333
4334 *dst++ = (uint8_t) pack_1x128_32 (
4335 _mm_adds_epu16 (
4336 pix_multiply_1x128 (
4337 xmm_alpha, unpack_32_1x128 (m)),
4338 unpack_32_1x128 (d)));
4339 w--;
4340 }
4341
4342 while (w >= 16)
4343 {
4344 xmm_mask = load_128_unaligned ((__m128i*)mask);
4345 xmm_dst = load_128_aligned ((__m128i*)dst);
4346
4347 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4348 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4349
4350 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4351 &xmm_mask_lo, &xmm_mask_hi,
4352 &xmm_mask_lo, &xmm_mask_hi);
4353
4354 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4355 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4356
4357 save_128_aligned (
4358 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4359
4360 mask += 16;
4361 dst += 16;
4362 w -= 16;
4363 }
4364
4365 while (w)
4366 {
4367 m = (uint32_t) *mask++;
4368 d = (uint32_t) *dst;
4369
4370 *dst++ = (uint8_t) pack_1x128_32 (
4371 _mm_adds_epu16 (
4372 pix_multiply_1x128 (
4373 xmm_alpha, unpack_32_1x128 (m)),
4374 unpack_32_1x128 (d)));
4375
4376 w--;
4377 }
4378 }
4379
4380 }
4381
4382 static void
4383 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4384 pixman_composite_info_t *info)
4385 {
4386 PIXMAN_COMPOSITE_ARGS (info);
4387 uint8_t *dst_line, *dst;
4388 int dst_stride;
4389 int32_t w;
4390 uint32_t src;
4391
4392 __m128i xmm_src;
4393
4394 PIXMAN_IMAGE_GET_LINE (
4395 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4396
4397 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4398
4399 src >>= 24;
4400
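/* Only the alpha byte matters for an a8 destination: adding zero is a
 * no-op, and adding 0xff saturates every pixel, so that case reduces to
 * a plain fill. */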
4401 if (src == 0x00)
4402 return;
4403
4404 if (src == 0xff)
4405 {
4406 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4407 8, dest_x, dest_y, width, height, 0xff);
4408
4409 return;
4410 }
4411
4412 src = (src << 24) | (src << 16) | (src << 8) | src;
4413 xmm_src = _mm_set_epi32 (src, src, src, src);
4414
4415 while (height--)
4416 {
4417 dst = dst_line;
4418 dst_line += dst_stride;
4419 w = width;
4420
4421 while (w && ((uintptr_t)dst & 15))
4422 {
4423 *dst = (uint8_t)_mm_cvtsi128_si32 (
4424 _mm_adds_epu8 (
4425 xmm_src,
4426 _mm_cvtsi32_si128 (*dst)));
4427
4428 w--;
4429 dst++;
4430 }
4431
4432 while (w >= 16)
4433 {
4434 save_128_aligned (
4435 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4436
4437 dst += 16;
4438 w -= 16;
4439 }
4440
4441 while (w)
4442 {
4443 *dst = (uint8_t)_mm_cvtsi128_si32 (
4444 _mm_adds_epu8 (
4445 xmm_src,
4446 _mm_cvtsi32_si128 (*dst)));
4447
4448 w--;
4449 dst++;
4450 }
4451 }
4452
4453 }
4454
4455 static void
4456 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4457 pixman_composite_info_t *info)
4458 {
4459 PIXMAN_COMPOSITE_ARGS (info);
4460 uint8_t *dst_line, *dst;
4461 uint8_t *src_line, *src;
4462 int dst_stride, src_stride;
4463 int32_t w;
4464 uint16_t t;
4465
4466 PIXMAN_IMAGE_GET_LINE (
4467 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4468 PIXMAN_IMAGE_GET_LINE (
4469 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4470
4471 while (height--)
4472 {
4473 dst = dst_line;
4474 src = src_line;
4475
4476 dst_line += dst_stride;
4477 src_line += src_stride;
4478 w = width;
4479
4480 /* Small head */
4481 while (w && (uintptr_t)dst & 3)
4482 {
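/* Branchless 8-bit saturating add: if the 16-bit sum t overflows a
 * byte, (t >> 8) is 1 and (0 - (t >> 8)) is all ones, so the OR
 * clamps the result to 0xff; otherwise it leaves the sum unchanged. */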
4483 t = (*dst) + (*src++);
4484 *dst++ = t | (0 - (t >> 8));
4485 w--;
4486 }
4487
4488 sse2_combine_add_u (imp, op,
4489 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4490
4491 /* Small tail */
4492 dst += w & 0xfffc;
4493 src += w & 0xfffc;
4494
4495 w &= 3;
4496
4497 while (w)
4498 {
4499 t = (*dst) + (*src++);
4500 *dst++ = t | (0 - (t >> 8));
4501 w--;
4502 }
4503 }
4504
4505 }
4506
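/* ADD between two a8r8g8b8 images delegates entirely to the streaming
 * sse2_combine_add_u combiner, one scanline at a time. */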
4507 static void
4508 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4509 pixman_composite_info_t *info)
4510 {
4511 PIXMAN_COMPOSITE_ARGS (info);
4512 uint32_t *dst_line, *dst;
4513 uint32_t *src_line, *src;
4514 int dst_stride, src_stride;
4515
4516 PIXMAN_IMAGE_GET_LINE (
4517 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4518 PIXMAN_IMAGE_GET_LINE (
4519 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4520
4521 while (height--)
4522 {
4523 dst = dst_line;
4524 dst_line += dst_stride;
4525 src = src_line;
4526 src_line += src_stride;
4527
4528 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4529 }
4530 }
4531
4532 static void
4533 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4534 pixman_composite_info_t *info)
4535 {
4536 PIXMAN_COMPOSITE_ARGS (info);
4537 uint32_t *dst_line, *dst, src;
4538 int dst_stride;
4539
4540 __m128i xmm_src;
4541
4542 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4543
4544 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4545 if (src == 0)
4546 return;
4547
4548 if (src == ~0)
4549 {
4550 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4551 dest_x, dest_y, width, height, ~0);
4552
4553 return;
4554 }
4555
4556 xmm_src = _mm_set_epi32 (src, src, src, src);
4557 while (height--)
4558 {
4559 int w = width;
4560 uint32_t d;
4561
4562 dst = dst_line;
4563 dst_line += dst_stride;
4564
4565 while (w && (uintptr_t)dst & 15)
4566 {
4567 d = *dst;
4568 *dst++ =
4569 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4570 w--;
4571 }
4572
4573 while (w >= 4)
4574 {
4575 save_128_aligned
4576 ((__m128i*)dst,
4577 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4578
4579 dst += 4;
4580 w -= 4;
4581 }
4582
4583 while (w--)
4584 {
4585 d = *dst;
4586 *dst++ =
4587 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4588 _mm_cvtsi32_si128 (d)));
4589 }
4590 }
4591 }
4592
4593 static void
4594 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4595 pixman_composite_info_t *info)
4596 {
4597 PIXMAN_COMPOSITE_ARGS (info);
4598 uint32_t *dst_line, *dst;
4599 uint8_t *mask_line, *mask;
4600 int dst_stride, mask_stride;
4601 int32_t w;
4602 uint32_t src;
4603
4604 __m128i xmm_src;
4605
4606 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4607 if (src == 0)
4608 return;
4609 xmm_src = expand_pixel_32_1x128 (src);
4610
4611 PIXMAN_IMAGE_GET_LINE (
4612 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4613 PIXMAN_IMAGE_GET_LINE (
4614 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4615
4616 while (height--)
4617 {
4618 dst = dst_line;
4619 dst_line += dst_stride;
4620 mask = mask_line;
4621 mask_line += mask_stride;
4622 w = width;
4623
4624 while (w && ((uintptr_t)dst & 15))
4625 {
4626 uint8_t m = *mask++;
4627 if (m)
4628 {
4629 *dst = pack_1x128_32
4630 (_mm_adds_epu16
4631 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4632 unpack_32_1x128 (*dst)));
4633 }
4634 dst++;
4635 w--;
4636 }
4637
4638 while (w >= 4)
4639 {
4640 uint32_t m;
4641 memcpy(&m, mask, sizeof(uint32_t));
4642
4643 if (m)
4644 {
4645 __m128i xmm_mask_lo, xmm_mask_hi;
4646 __m128i xmm_dst_lo, xmm_dst_hi;
4647
4648 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4649 __m128i xmm_mask =
4650 _mm_unpacklo_epi8 (unpack_32_1x128(m),
4651 _mm_setzero_si128 ());
4652
4653 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4654 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4655
4656 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4657 &xmm_mask_lo, &xmm_mask_hi);
4658
4659 pix_multiply_2x128 (&xmm_src, &xmm_src,
4660 &xmm_mask_lo, &xmm_mask_hi,
4661 &xmm_mask_lo, &xmm_mask_hi);
4662
4663 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4664 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4665
4666 save_128_aligned (
4667 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4668 }
4669
4670 w -= 4;
4671 dst += 4;
4672 mask += 4;
4673 }
4674
4675 while (w)
4676 {
4677 uint8_t m = *mask++;
4678 if (m)
4679 {
4680 *dst = pack_1x128_32
4681 (_mm_adds_epu16
4682 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4683 unpack_32_1x128 (*dst)));
4684 }
4685 dst++;
4686 w--;
4687 }
4688 }
4689 }
4690
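/* Raw rectangle copy for 16 bpp and 32 bpp images.  Strides arrive in
 * uint32_t units and are converted to byte strides; each scanline is
 * copied with small head/tail loops around an aligned SSE2 body. */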
4691 static pixman_bool_t
4692 sse2_blt (pixman_implementation_t *imp,
4693 uint32_t * src_bits,
4694 uint32_t * dst_bits,
4695 int src_stride,
4696 int dst_stride,
4697 int src_bpp,
4698 int dst_bpp,
4699 int src_x,
4700 int src_y,
4701 int dest_x,
4702 int dest_y,
4703 int width,
4704 int height)
4705 {
4706 uint8_t * src_bytes;
4707 uint8_t * dst_bytes;
4708 int byte_width;
4709
4710 if (src_bpp != dst_bpp)
4711 return FALSE;
4712
4713 if (src_bpp == 16)
4714 {
4715 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4716 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4717 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4718 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4719 byte_width = 2 * width;
4720 src_stride *= 2;
4721 dst_stride *= 2;
4722 }
4723 else if (src_bpp == 32)
4724 {
4725 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4726 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4727 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4728 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4729 byte_width = 4 * width;
4730 src_stride *= 4;
4731 dst_stride *= 4;
4732 }
4733 else
4734 {
4735 return FALSE;
4736 }
4737
4738 while (height--)
4739 {
4740 int w;
4741 uint8_t *s = src_bytes;
4742 uint8_t *d = dst_bytes;
4743 src_bytes += src_stride;
4744 dst_bytes += dst_stride;
4745 w = byte_width;
4746
4747 while (w >= 2 && ((uintptr_t)d & 3))
4748 {
4749 memmove(d, s, 2);
4750 w -= 2;
4751 s += 2;
4752 d += 2;
4753 }
4754
4755 while (w >= 4 && ((uintptr_t)d & 15))
4756 {
4757 memmove(d, s, 4);
4758
4759 w -= 4;
4760 s += 4;
4761 d += 4;
4762 }
4763
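/* Copy 64 bytes (typically a whole cache line) per iteration: four
 * unaligned loads from the source, four aligned stores to the now
 * 16-byte-aligned destination. */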
4764 while (w >= 64)
4765 {
4766 __m128i xmm0, xmm1, xmm2, xmm3;
4767
4768 xmm0 = load_128_unaligned ((__m128i*)(s));
4769 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4770 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4771 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4772
4773 save_128_aligned ((__m128i*)(d), xmm0);
4774 save_128_aligned ((__m128i*)(d + 16), xmm1);
4775 save_128_aligned ((__m128i*)(d + 32), xmm2);
4776 save_128_aligned ((__m128i*)(d + 48), xmm3);
4777
4778 s += 64;
4779 d += 64;
4780 w -= 64;
4781 }
4782
4783 while (w >= 16)
4784 {
4785 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4786
4787 w -= 16;
4788 d += 16;
4789 s += 16;
4790 }
4791
4792 while (w >= 4)
4793 {
4794 memmove(d, s, 4);
4795
4796 w -= 4;
4797 s += 4;
4798 d += 4;
4799 }
4800
4801 if (w >= 2)
4802 {
4803 memmove(d, s, 2);
4804 w -= 2;
4805 s += 2;
4806 d += 2;
4807 }
4808 }
4809
4810 return TRUE;
4811 }
4812
4813 static void
4814 sse2_composite_copy_area (pixman_implementation_t *imp,
4815 pixman_composite_info_t *info)
4816 {
4817 PIXMAN_COMPOSITE_ARGS (info);
4818 sse2_blt (imp, src_image->bits.bits,
4819 dest_image->bits.bits,
4820 src_image->bits.rowstride,
4821 dest_image->bits.rowstride,
4822 PIXMAN_FORMAT_BPP (src_image->bits.format),
4823 PIXMAN_FORMAT_BPP (dest_image->bits.format),
4824 src_x, src_y, dest_x, dest_y, width, height);
4825 }
4826
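/* OVER of an x8r8g8b8 source through an a8 mask.  The source is forced
 * opaque by OR-ing in 0xff000000, so in_over runs with a constant
 * fully-opaque source alpha (mask_00ff); a 0xffffffff mask for a whole
 * 4-pixel block becomes a plain store. */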
4827 static void
4828 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4829 pixman_composite_info_t *info)
4830 {
4831 PIXMAN_COMPOSITE_ARGS (info);
4832 uint32_t *src, *src_line, s;
4833 uint32_t *dst, *dst_line, d;
4834 uint8_t *mask, *mask_line;
4835 uint32_t m;
4836 int src_stride, mask_stride, dst_stride;
4837 int32_t w;
4838 __m128i ms;
4839
4840 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4841 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4842 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4843
4844 PIXMAN_IMAGE_GET_LINE (
4845 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4846 PIXMAN_IMAGE_GET_LINE (
4847 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4848 PIXMAN_IMAGE_GET_LINE (
4849 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4850
4851 while (height--)
4852 {
4853 src = src_line;
4854 src_line += src_stride;
4855 dst = dst_line;
4856 dst_line += dst_stride;
4857 mask = mask_line;
4858 mask_line += mask_stride;
4859
4860 w = width;
4861
4862 while (w && (uintptr_t)dst & 15)
4863 {
4864 s = 0xff000000 | *src++;
4865 m = (uint32_t) *mask++;
4866 d = *dst;
4867 ms = unpack_32_1x128 (s);
4868
4869 if (m != 0xff)
4870 {
4871 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4872 __m128i md = unpack_32_1x128 (d);
4873
4874 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4875 }
4876
4877 *dst++ = pack_1x128_32 (ms);
4878 w--;
4879 }
4880
4881 while (w >= 4)
4882 {
4883 memcpy(&m, mask, sizeof(uint32_t));
4884 xmm_src = _mm_or_si128 (
4885 load_128_unaligned ((__m128i*)src), mask_ff000000);
4886
4887 if (m == 0xffffffff)
4888 {
4889 save_128_aligned ((__m128i*)dst, xmm_src);
4890 }
4891 else
4892 {
4893 xmm_dst = load_128_aligned ((__m128i*)dst);
4894
4895 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4896
4897 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4898 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4899 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4900
4901 expand_alpha_rev_2x128 (
4902 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4903
4904 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4905 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4906 &xmm_dst_lo, &xmm_dst_hi);
4907
4908 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4909 }
4910
4911 src += 4;
4912 dst += 4;
4913 mask += 4;
4914 w -= 4;
4915 }
4916
4917 while (w)
4918 {
4919 m = (uint32_t) *mask++;
4920
4921 if (m)
4922 {
4923 s = 0xff000000 | *src;
4924
4925 if (m == 0xff)
4926 {
4927 *dst = s;
4928 }
4929 else
4930 {
4931 __m128i ma, md, ms;
4932
4933 d = *dst;
4934
4935 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4936 md = unpack_32_1x128 (d);
4937 ms = unpack_32_1x128 (s);
4938
4939 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4940 }
4941
4942 }
4943
4944 src++;
4945 dst++;
4946 w--;
4947 }
4948 }
4949
4950 }
4951
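/* OVER of an a8r8g8b8 source through an a8 mask.  Four mask bytes are
 * tested at a time: an all-zero block is skipped entirely, and an
 * all-0xff mask over an opaque source block is stored directly. */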
4952 static void
4953 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4954 pixman_composite_info_t *info)
4955 {
4956 PIXMAN_COMPOSITE_ARGS (info);
4957 uint32_t *src, *src_line, s;
4958 uint32_t *dst, *dst_line, d;
4959 uint8_t *mask, *mask_line;
4960 uint32_t m;
4961 int src_stride, mask_stride, dst_stride;
4962 int32_t w;
4963
4964 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4965 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4966 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4967
4968 PIXMAN_IMAGE_GET_LINE (
4969 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4970 PIXMAN_IMAGE_GET_LINE (
4971 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4972 PIXMAN_IMAGE_GET_LINE (
4973 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4974
4975 while (height--)
4976 {
4977 src = src_line;
4978 src_line += src_stride;
4979 dst = dst_line;
4980 dst_line += dst_stride;
4981 mask = mask_line;
4982 mask_line += mask_stride;
4983
4984 w = width;
4985
4986 while (w && (uintptr_t)dst & 15)
4987 {
4988 uint32_t sa;
4989
4990 s = *src++;
4991 m = (uint32_t) *mask++;
4992 d = *dst;
4993
4994 sa = s >> 24;
4995
4996 if (m)
4997 {
4998 if (sa == 0xff && m == 0xff)
4999 {
5000 *dst = s;
5001 }
5002 else
5003 {
5004 __m128i ms, md, ma, msa;
5005
5006 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5007 ms = unpack_32_1x128 (s);
5008 md = unpack_32_1x128 (d);
5009
5010 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5011
5012 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5013 }
5014 }
5015
5016 dst++;
5017 w--;
5018 }
5019
5020 while (w >= 4)
5021 {
5022 memcpy(&m, mask, sizeof(uint32_t));
5023
5024 if (m)
5025 {
5026 xmm_src = load_128_unaligned ((__m128i*)src);
5027
5028 if (m == 0xffffffff && is_opaque (xmm_src))
5029 {
5030 save_128_aligned ((__m128i *)dst, xmm_src);
5031 }
5032 else
5033 {
5034 xmm_dst = load_128_aligned ((__m128i *)dst);
5035
5036 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5037
5038 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5039 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5040 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5041
5042 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5043 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5044
5045 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5046 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5047
5048 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5049 }
5050 }
5051
5052 src += 4;
5053 dst += 4;
5054 mask += 4;
5055 w -= 4;
5056 }
5057
5058 while (w)
5059 {
5060 uint32_t sa;
5061
5062 s = *src++;
5063 m = (uint32_t) *mask++;
5064 d = *dst;
5065
5066 sa = s >> 24;
5067
5068 if (m)
5069 {
5070 if (sa == 0xff && m == 0xff)
5071 {
5072 *dst = s;
5073 }
5074 else
5075 {
5076 __m128i ms, md, ma, msa;
5077
5078 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5079 ms = unpack_32_1x128 (s);
5080 md = unpack_32_1x128 (d);
5081
5082 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5083
5084 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5085 }
5086 }
5087
5088 dst++;
5089 w--;
5090 }
5091 }
5092
5093 }
5094
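/* OVER_REVERSE with a solid source: the destination is composited over
 * the constant source, so over_1x128/over_2x128 are called with the
 * destination pixels acting as the "source" operand. */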
5095 static void
5096 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5097 pixman_composite_info_t *info)
5098 {
5099 PIXMAN_COMPOSITE_ARGS (info);
5100 uint32_t src;
5101 uint32_t *dst_line, *dst;
5102 __m128i xmm_src;
5103 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5104 __m128i xmm_dsta_hi, xmm_dsta_lo;
5105 int dst_stride;
5106 int32_t w;
5107
5108 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5109
5110 if (src == 0)
5111 return;
5112
5113 PIXMAN_IMAGE_GET_LINE (
5114 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5115
5116 xmm_src = expand_pixel_32_1x128 (src);
5117
5118 while (height--)
5119 {
5120 dst = dst_line;
5121
5122 dst_line += dst_stride;
5123 w = width;
5124
5125 while (w && (uintptr_t)dst & 15)
5126 {
5127 __m128i vd;
5128
5129 vd = unpack_32_1x128 (*dst);
5130
5131 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5132 xmm_src));
5133 w--;
5134 dst++;
5135 }
5136
5137 while (w >= 4)
5138 {
5139 __m128i tmp_lo, tmp_hi;
5140
5141 xmm_dst = load_128_aligned ((__m128i*)dst);
5142
5143 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5144 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5145
5146 tmp_lo = xmm_src;
5147 tmp_hi = xmm_src;
5148
5149 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5150 &xmm_dsta_lo, &xmm_dsta_hi,
5151 &tmp_lo, &tmp_hi);
5152
5153 save_128_aligned (
5154 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5155
5156 w -= 4;
5157 dst += 4;
5158 }
5159
5160 while (w)
5161 {
5162 __m128i vd;
5163
5164 vd = unpack_32_1x128 (*dst);
5165
5166 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5167 xmm_src));
5168 w--;
5169 dst++;
5170 }
5171
5172 }
5173
5174 }
5175
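/* OVER with an a8r8g8b8 mask, of which only the alpha byte (m >> 24) is
 * used; whole 4-pixel blocks are short-circuited with the
 * is_transparent/is_opaque tests. */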
5176 static void
5177 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5178 pixman_composite_info_t *info)
5179 {
5180 PIXMAN_COMPOSITE_ARGS (info);
5181 uint32_t *src, *src_line, s;
5182 uint32_t *dst, *dst_line, d;
5183 uint32_t *mask, *mask_line;
5184 uint32_t m;
5185 int src_stride, mask_stride, dst_stride;
5186 int32_t w;
5187
5188 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5189 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5190 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5191
5192 PIXMAN_IMAGE_GET_LINE (
5193 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5194 PIXMAN_IMAGE_GET_LINE (
5195 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5196 PIXMAN_IMAGE_GET_LINE (
5197 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5198
5199 while (height--)
5200 {
5201 src = src_line;
5202 src_line += src_stride;
5203 dst = dst_line;
5204 dst_line += dst_stride;
5205 mask = mask_line;
5206 mask_line += mask_stride;
5207
5208 w = width;
5209
5210 while (w && (uintptr_t)dst & 15)
5211 {
5212 uint32_t sa;
5213
5214 s = *src++;
5215 m = (*mask++) >> 24;
5216 d = *dst;
5217
5218 sa = s >> 24;
5219
5220 if (m)
5221 {
5222 if (sa == 0xff && m == 0xff)
5223 {
5224 *dst = s;
5225 }
5226 else
5227 {
5228 __m128i ms, md, ma, msa;
5229
5230 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5231 ms = unpack_32_1x128 (s);
5232 md = unpack_32_1x128 (d);
5233
5234 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5235
5236 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5237 }
5238 }
5239
5240 dst++;
5241 w--;
5242 }
5243
5244 while (w >= 4)
5245 {
5246 xmm_mask = load_128_unaligned ((__m128i*)mask);
5247
5248 if (!is_transparent (xmm_mask))
5249 {
5250 xmm_src = load_128_unaligned ((__m128i*)src);
5251
5252 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5253 {
5254 save_128_aligned ((__m128i *)dst, xmm_src);
5255 }
5256 else
5257 {
5258 xmm_dst = load_128_aligned ((__m128i *)dst);
5259
5260 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5261 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5262 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5263
5264 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5265 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5266
5267 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5268 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5269
5270 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5271 }
5272 }
5273
5274 src += 4;
5275 dst += 4;
5276 mask += 4;
5277 w -= 4;
5278 }
5279
5280 while (w)
5281 {
5282 uint32_t sa;
5283
5284 s = *src++;
5285 m = (*mask++) >> 24;
5286 d = *dst;
5287
5288 sa = s >> 24;
5289
5290 if (m)
5291 {
5292 if (sa == 0xff && m == 0xff)
5293 {
5294 *dst = s;
5295 }
5296 else
5297 {
5298 __m128i ms, md, ma, msa;
5299
5300 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5301 ms = unpack_32_1x128 (s);
5302 md = unpack_32_1x128 (d);
5303
5304 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5305
5306 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5307 }
5308 }
5309
5310 dst++;
5311 w--;
5312 }
5313 }
5314
5315 }
5316
5317 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5318 static force_inline void
5319 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5320 const uint32_t* ps,
5321 int32_t w,
5322 pixman_fixed_t vx,
5323 pixman_fixed_t unit_x,
5324 pixman_fixed_t src_width_fixed,
5325 pixman_bool_t fully_transparent_src)
5326 {
5327 uint32_t s, d;
5328 const uint32_t* pm = NULL;
5329
5330 __m128i xmm_dst_lo, xmm_dst_hi;
5331 __m128i xmm_src_lo, xmm_src_hi;
5332 __m128i xmm_alpha_lo, xmm_alpha_hi;
5333
5334 if (fully_transparent_src)
5335 return;
5336
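/* After every advance, vx is wrapped back below zero by subtracting the
 * fixed-point source width; the NORMAL-repeat main loop is expected to
 * bias 'ps' so these negative offsets index valid pixels. */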
5337 /* Align dst on a 16-byte boundary */
5338 while (w && ((uintptr_t)pd & 15))
5339 {
5340 d = *pd;
5341 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5342 vx += unit_x;
5343 while (vx >= 0)
5344 vx -= src_width_fixed;
5345
5346 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5347 if (pm)
5348 pm++;
5349 w--;
5350 }
5351
5352 while (w >= 4)
5353 {
5354 __m128i tmp;
5355 uint32_t tmp1, tmp2, tmp3, tmp4;
5356
5357 tmp1 = *(ps + pixman_fixed_to_int (vx));
5358 vx += unit_x;
5359 while (vx >= 0)
5360 vx -= src_width_fixed;
5361 tmp2 = *(ps + pixman_fixed_to_int (vx));
5362 vx += unit_x;
5363 while (vx >= 0)
5364 vx -= src_width_fixed;
5365 tmp3 = *(ps + pixman_fixed_to_int (vx));
5366 vx += unit_x;
5367 while (vx >= 0)
5368 vx -= src_width_fixed;
5369 tmp4 = *(ps + pixman_fixed_to_int (vx));
5370 vx += unit_x;
5371 while (vx >= 0)
5372 vx -= src_width_fixed;
5373
5374 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5375
5376 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5377
5378 if (is_opaque (xmm_src_hi))
5379 {
5380 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5381 }
5382 else if (!is_zero (xmm_src_hi))
5383 {
5384 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5385
5386 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5387 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5388
5389 expand_alpha_2x128 (
5390 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5391
5392 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5393 &xmm_alpha_lo, &xmm_alpha_hi,
5394 &xmm_dst_lo, &xmm_dst_hi);
5395
5396 /* rebuild the 4 pixel data and save */
5397 save_128_aligned ((__m128i*)pd,
5398 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5399 }
5400
5401 w -= 4;
5402 pd += 4;
5403 if (pm)
5404 pm += 4;
5405 }
5406
5407 while (w)
5408 {
5409 d = *pd;
5410 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5411 vx += unit_x;
5412 while (vx >= 0)
5413 vx -= src_width_fixed;
5414
5415 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5416 if (pm)
5417 pm++;
5418
5419 w--;
5420 }
5421 }
5422
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5424 scaled_nearest_scanline_sse2_8888_8888_OVER,
5425 uint32_t, uint32_t, COVER)
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5427 scaled_nearest_scanline_sse2_8888_8888_OVER,
5428 uint32_t, uint32_t, NONE)
5429 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5430 scaled_nearest_scanline_sse2_8888_8888_OVER,
5431 uint32_t, uint32_t, PAD)
5432 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5433 scaled_nearest_scanline_sse2_8888_8888_OVER,
5434 uint32_t, uint32_t, NORMAL)
5435
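/* Nearest-neighbour OVER with a solid mask: structured like the plain
 * nearest OVER above, but feeding the constant expanded mask into
 * in_over. */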
5436 static force_inline void
5437 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5438 uint32_t * dst,
5439 const uint32_t * src,
5440 int32_t w,
5441 pixman_fixed_t vx,
5442 pixman_fixed_t unit_x,
5443 pixman_fixed_t src_width_fixed,
5444 pixman_bool_t zero_src)
5445 {
5446 __m128i xmm_mask;
5447 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5448 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5449 __m128i xmm_alpha_lo, xmm_alpha_hi;
5450
5451 if (zero_src || (*mask >> 24) == 0)
5452 return;
5453
5454 xmm_mask = create_mask_16_128 (*mask >> 24);
5455
5456 while (w && (uintptr_t)dst & 15)
5457 {
5458 uint32_t s = *(src + pixman_fixed_to_int (vx));
5459 vx += unit_x;
5460 while (vx >= 0)
5461 vx -= src_width_fixed;
5462
5463 if (s)
5464 {
5465 uint32_t d = *dst;
5466
5467 __m128i ms = unpack_32_1x128 (s);
5468 __m128i alpha = expand_alpha_1x128 (ms);
5469 __m128i dest = xmm_mask;
5470 __m128i alpha_dst = unpack_32_1x128 (d);
5471
5472 *dst = pack_1x128_32 (
5473 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5474 }
5475 dst++;
5476 w--;
5477 }
5478
5479 while (w >= 4)
5480 {
5481 uint32_t tmp1, tmp2, tmp3, tmp4;
5482
5483 tmp1 = *(src + pixman_fixed_to_int (vx));
5484 vx += unit_x;
5485 while (vx >= 0)
5486 vx -= src_width_fixed;
5487 tmp2 = *(src + pixman_fixed_to_int (vx));
5488 vx += unit_x;
5489 while (vx >= 0)
5490 vx -= src_width_fixed;
5491 tmp3 = *(src + pixman_fixed_to_int (vx));
5492 vx += unit_x;
5493 while (vx >= 0)
5494 vx -= src_width_fixed;
5495 tmp4 = *(src + pixman_fixed_to_int (vx));
5496 vx += unit_x;
5497 while (vx >= 0)
5498 vx -= src_width_fixed;
5499
5500 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5501
5502 if (!is_zero (xmm_src))
5503 {
5504 xmm_dst = load_128_aligned ((__m128i*)dst);
5505
5506 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5507 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5508 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5509 &xmm_alpha_lo, &xmm_alpha_hi);
5510
5511 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5512 &xmm_alpha_lo, &xmm_alpha_hi,
5513 &xmm_mask, &xmm_mask,
5514 &xmm_dst_lo, &xmm_dst_hi);
5515
5516 save_128_aligned (
5517 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5518 }
5519
5520 dst += 4;
5521 w -= 4;
5522 }
5523
5524 while (w)
5525 {
5526 uint32_t s = *(src + pixman_fixed_to_int (vx));
5527 vx += unit_x;
5528 while (vx >= 0)
5529 vx -= src_width_fixed;
5530
5531 if (s)
5532 {
5533 uint32_t d = *dst;
5534
5535 __m128i ms = unpack_32_1x128 (s);
5536 __m128i alpha = expand_alpha_1x128 (ms);
5537 __m128i mask = xmm_mask;
5538 __m128i dest = unpack_32_1x128 (d);
5539
5540 *dst = pack_1x128_32 (
5541 in_over_1x128 (&ms, &alpha, &mask, &dest));
5542 }
5543
5544 dst++;
5545 w--;
5546 }
5547
5548 }
5549
5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5556 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5557 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5558 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5559 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5560 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5561 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5562
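/* Bilinear interpolation helpers.  The fixed-point x coordinate is kept
 * as interleaved (vx, -(vx + 1)) 16-bit pairs: shifting both right by
 * (16 - BILINEAR_INTERPOLATION_BITS) and adding (0, 1) yields the
 * complementary horizontal weight pair that _mm_madd_epi16 combines
 * with the two vertically interpolated pixels. */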
5563 #if PSHUFD_IS_FAST
5564
5565 /***********************************************************************************/
5566
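/* In this variant the weights for a 4-pixel group are computed once
 * (phase 0) and re-broadcast per pixel with PSHUFD, trading the
 * per-pixel shift/add for a shuffle; hence the PSHUFD_IS_FAST guard. */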
5567 # define BILINEAR_DECLARE_VARIABLES \
5568 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5569 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5570 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5571 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5572 unit_x, -unit_x, unit_x, -unit_x); \
5573 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5574 unit_x * 4, -unit_x * 4, \
5575 unit_x * 4, -unit_x * 4, \
5576 unit_x * 4, -unit_x * 4); \
5577 const __m128i xmm_zero = _mm_setzero_si128 (); \
5578 __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
5579 vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
5580 vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
5581 vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
5582 __m128i xmm_wh_state;
5583
5584 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
5585 do { \
5586 int phase = phase_; \
5587 __m128i xmm_wh, xmm_a, xmm_b; \
5588 /* fetch 2x2 pixel block into sse2 registers */ \
5589 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5590 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5591 vx += unit_x; \
5592 /* vertical interpolation */ \
5593 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5594 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5595 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5596 /* calculate horizontal weights */ \
5597 if (phase <= 0) \
5598 { \
5599 xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5600 16 - BILINEAR_INTERPOLATION_BITS)); \
5601 xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
5602 phase = 0; \
5603 } \
5604 xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
5605 phase, phase)); \
5606 /* horizontal interpolation */ \
5607 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
5608 xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
5609 /* shift the result */ \
5610 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5611 } while (0)
5612
5613 #else /************************************************************************/
5614
5615 # define BILINEAR_DECLARE_VARIABLES \
5616 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5617 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5618 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5619 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5620 unit_x, -unit_x, unit_x, -unit_x); \
5621 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5622 unit_x * 4, -unit_x * 4, \
5623 unit_x * 4, -unit_x * 4, \
5624 unit_x * 4, -unit_x * 4); \
5625 const __m128i xmm_zero = _mm_setzero_si128 (); \
5626 __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
5627 vx, -(vx + 1), vx, -(vx + 1))
5628
5629 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
5630 do { \
5631 __m128i xmm_wh, xmm_a, xmm_b; \
5632 /* fetch 2x2 pixel block into sse2 registers */ \
5633 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5634 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5635 (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
5636 vx += unit_x; \
5637 /* vertical interpolation */ \
5638 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5639 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5640 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5641 /* calculate horizontal weights */ \
5642 xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5643 16 - BILINEAR_INTERPOLATION_BITS)); \
5644 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5645 /* horizontal interpolation */ \
5646 xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
5647 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
5648 /* shift the result */ \
5649 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5650 } while (0)
5651
5652 /***********************************************************************************/
5653
5654 #endif
5655
5656 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
5657 do { \
5658 __m128i xmm_pix; \
5659 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
5660 xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
5661 xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
5662 pix = _mm_cvtsi128_si32 (xmm_pix); \
5663 } while(0)
5664
5665 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix) \
5666 do { \
5667 __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
5668 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
5669 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
5670 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
5671 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
5672 xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
5673 xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
5674 pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
5675 } while(0)
5676
5677 #define BILINEAR_SKIP_ONE_PIXEL() \
5678 do { \
5679 vx += unit_x; \
5680 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5681 } while(0)
5682
5683 #define BILINEAR_SKIP_FOUR_PIXELS() \
5684 do { \
5685 vx += unit_x * 4; \
5686 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
5687 } while(0)
5688
5689 /***********************************************************************************/
5690
5691 static force_inline void
5692 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5693 const uint32_t * mask,
5694 const uint32_t * src_top,
5695 const uint32_t * src_bottom,
5696 int32_t w,
5697 int wt,
5698 int wb,
5699 pixman_fixed_t vx_,
5700 pixman_fixed_t unit_x_,
5701 pixman_fixed_t max_vx,
5702 pixman_bool_t zero_src)
5703 {
5704 intptr_t vx = vx_;
5705 intptr_t unit_x = unit_x_;
5706 BILINEAR_DECLARE_VARIABLES;
5707 uint32_t pix1, pix2;
5708
5709 while (w && ((uintptr_t)dst & 15))
5710 {
5711 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5712 *dst++ = pix1;
5713 w--;
5714 }
5715
5716 while ((w -= 4) >= 0) {
5717 __m128i xmm_src;
5718 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5719 _mm_store_si128 ((__m128i *)dst, xmm_src);
5720 dst += 4;
5721 }
5722
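/* w went negative in the loop above, but two's complement keeps its low
 * two bits equal to the remaining pixel count (0..3), so the masks
 * below still select the correct 2- and 1-pixel tails. */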
5723 if (w & 2)
5724 {
5725 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5726 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5727 *dst++ = pix1;
5728 *dst++ = pix2;
5729 }
5730
5731 if (w & 1)
5732 {
5733 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5734 *dst = pix1;
5735 }
5736
5737 }
5738
5739 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5740 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5741 uint32_t, uint32_t, uint32_t,
5742 COVER, FLAG_NONE)
5743 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5744 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5745 uint32_t, uint32_t, uint32_t,
5746 PAD, FLAG_NONE)
5747 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5748 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5749 uint32_t, uint32_t, uint32_t,
5750 NONE, FLAG_NONE)
5751 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5752 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5753 uint32_t, uint32_t, uint32_t,
5754 NORMAL, FLAG_NONE)
5755
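/* Same as the 8888 SRC scanline above, but the source has no alpha
 * channel, so every produced pixel is forced opaque with 0xFF000000. */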
5756 static force_inline void
5757 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t * dst,
5758 const uint32_t * mask,
5759 const uint32_t * src_top,
5760 const uint32_t * src_bottom,
5761 int32_t w,
5762 int wt,
5763 int wb,
5764 pixman_fixed_t vx_,
5765 pixman_fixed_t unit_x_,
5766 pixman_fixed_t max_vx,
5767 pixman_bool_t zero_src)
5768 {
5769 intptr_t vx = vx_;
5770 intptr_t unit_x = unit_x_;
5771 BILINEAR_DECLARE_VARIABLES;
5772 uint32_t pix1, pix2;
5773
5774 while (w && ((uintptr_t)dst & 15))
5775 {
5776 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5777 *dst++ = pix1 | 0xFF000000;
5778 w--;
5779 }
5780
5781 while ((w -= 4) >= 0) {
5782 __m128i xmm_src;
5783 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5784 _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5785 dst += 4;
5786 }
5787
5788 if (w & 2)
5789 {
5790 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5791 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5792 *dst++ = pix1 | 0xFF000000;
5793 *dst++ = pix2 | 0xFF000000;
5794 }
5795
5796 if (w & 1)
5797 {
5798 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5799 *dst = pix1 | 0xFF000000;
5800 }
5801 }
5802
5803 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5804 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5805 uint32_t, uint32_t, uint32_t,
5806 COVER, FLAG_NONE)
5807 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5808 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5809 uint32_t, uint32_t, uint32_t,
5810 PAD, FLAG_NONE)
5811 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5812 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5813 uint32_t, uint32_t, uint32_t,
5814 NORMAL, FLAG_NONE)
5815
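/* Bilinear OVER: interpolate four source pixels at a time, skip the
 * blend when the block is fully transparent and store directly when it
 * is fully opaque. */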
5816 static force_inline void
5817 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5818 const uint32_t * mask,
5819 const uint32_t * src_top,
5820 const uint32_t * src_bottom,
5821 int32_t w,
5822 int wt,
5823 int wb,
5824 pixman_fixed_t vx_,
5825 pixman_fixed_t unit_x_,
5826 pixman_fixed_t max_vx,
5827 pixman_bool_t zero_src)
5828 {
5829 intptr_t vx = vx_;
5830 intptr_t unit_x = unit_x_;
5831 BILINEAR_DECLARE_VARIABLES;
5832 uint32_t pix1, pix2;
5833
5834 while (w && ((uintptr_t)dst & 15))
5835 {
5836 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5837
5838 if (pix1)
5839 {
5840 pix2 = *dst;
5841 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5842 }
5843
5844 w--;
5845 dst++;
5846 }
5847
5848 while (w >= 4)
5849 {
5850 __m128i xmm_src;
5851 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5852 __m128i xmm_alpha_hi, xmm_alpha_lo;
5853
5854 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5855
5856 if (!is_zero (xmm_src))
5857 {
5858 if (is_opaque (xmm_src))
5859 {
5860 save_128_aligned ((__m128i *)dst, xmm_src);
5861 }
5862 else
5863 {
5864 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5865
5866 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5867 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5868
5869 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5870 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5871 &xmm_dst_lo, &xmm_dst_hi);
5872
5873 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5874 }
5875 }
5876
5877 w -= 4;
5878 dst += 4;
5879 }
5880
5881 while (w)
5882 {
5883 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5884
5885 if (pix1)
5886 {
5887 pix2 = *dst;
5888 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5889 }
5890
5891 w--;
5892 dst++;
5893 }
5894 }
5895
5896 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5897 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5898 uint32_t, uint32_t, uint32_t,
5899 COVER, FLAG_NONE)
5900 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5901 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5902 uint32_t, uint32_t, uint32_t,
5903 PAD, FLAG_NONE)
5904 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5905 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5906 uint32_t, uint32_t, uint32_t,
5907 NONE, FLAG_NONE)
5908 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5909 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5910 uint32_t, uint32_t, uint32_t,
5911 NORMAL, FLAG_NONE)
5912
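/* Bilinear OVER through an a8 mask: the mask is fetched four bytes at a
 * time, and an all-zero mask skips the interpolation as well as the
 * blend (BILINEAR_SKIP_FOUR_PIXELS only advances the coordinates). */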
5913 static force_inline void
5914 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5915 const uint8_t * mask,
5916 const uint32_t * src_top,
5917 const uint32_t * src_bottom,
5918 int32_t w,
5919 int wt,
5920 int wb,
5921 pixman_fixed_t vx_,
5922 pixman_fixed_t unit_x_,
5923 pixman_fixed_t max_vx,
5924 pixman_bool_t zero_src)
5925 {
5926 intptr_t vx = vx_;
5927 intptr_t unit_x = unit_x_;
5928 BILINEAR_DECLARE_VARIABLES;
5929 uint32_t pix1, pix2;
5930 uint32_t m;
5931
5932 while (w && ((uintptr_t)dst & 15))
5933 {
5934 uint32_t sa;
5935
5936 m = (uint32_t) *mask++;
5937
5938 if (m)
5939 {
5940 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5941 sa = pix1 >> 24;
5942
5943 if (sa == 0xff && m == 0xff)
5944 {
5945 *dst = pix1;
5946 }
5947 else
5948 {
5949 __m128i ms, md, ma, msa;
5950
5951 pix2 = *dst;
5952 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5953 ms = unpack_32_1x128 (pix1);
5954 md = unpack_32_1x128 (pix2);
5955
5956 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5957
5958 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5959 }
5960 }
5961 else
5962 {
5963 BILINEAR_SKIP_ONE_PIXEL ();
5964 }
5965
5966 w--;
5967 dst++;
5968 }
5969
5970 while (w >= 4)
5971 {
5972 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5973 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5974 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5975
5976 memcpy(&m, mask, sizeof(uint32_t));
5977
5978 if (m)
5979 {
5980 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5981
5982 if (m == 0xffffffff && is_opaque (xmm_src))
5983 {
5984 save_128_aligned ((__m128i *)dst, xmm_src);
5985 }
5986 else
5987 {
5988 xmm_dst = load_128_aligned ((__m128i *)dst);
5989
5990 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5991
5992 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5993 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5994 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5995
5996 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5997 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5998
5999 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6000 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6001
6002 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6003 }
6004 }
6005 else
6006 {
6007 BILINEAR_SKIP_FOUR_PIXELS ();
6008 }
6009
6010 w -= 4;
6011 dst += 4;
6012 mask += 4;
6013 }
6014
6015 while (w)
6016 {
6017 uint32_t sa;
6018
6019 m = (uint32_t) *mask++;
6020
6021 if (m)
6022 {
6023 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6024 sa = pix1 >> 24;
6025
6026 if (sa == 0xff && m == 0xff)
6027 {
6028 *dst = pix1;
6029 }
6030 else
6031 {
6032 __m128i ms, md, ma, msa;
6033
6034 pix2 = *dst;
6035 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6036 ms = unpack_32_1x128 (pix1);
6037 md = unpack_32_1x128 (pix2);
6038
6039 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6040
6041 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6042 }
6043 }
6044 else
6045 {
6046 BILINEAR_SKIP_ONE_PIXEL ();
6047 }
6048
6049 w--;
6050 dst++;
6051 }
6052 }
6053
6054 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6055 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6056 uint32_t, uint8_t, uint32_t,
6057 COVER, FLAG_HAVE_NON_SOLID_MASK)
6058 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6059 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6060 uint32_t, uint8_t, uint32_t,
6061 PAD, FLAG_HAVE_NON_SOLID_MASK)
6062 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6063 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6064 uint32_t, uint8_t, uint32_t,
6065 NONE, FLAG_HAVE_NON_SOLID_MASK)
6066 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6067 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6068 uint32_t, uint8_t, uint32_t,
6069 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
6070
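/* Bilinear OVER with a solid mask: only the mask's alpha is used.  It
 * is expanded once into xmm_mask, and a zero alpha makes the whole
 * scanline call a no-op. */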
6071 static force_inline void
6072 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
6073 const uint32_t * mask,
6074 const uint32_t * src_top,
6075 const uint32_t * src_bottom,
6076 int32_t w,
6077 int wt,
6078 int wb,
6079 pixman_fixed_t vx_,
6080 pixman_fixed_t unit_x_,
6081 pixman_fixed_t max_vx,
6082 pixman_bool_t zero_src)
6083 {
6084 intptr_t vx = vx_;
6085 intptr_t unit_x = unit_x_;
6086 BILINEAR_DECLARE_VARIABLES;
6087 uint32_t pix1;
6088 __m128i xmm_mask;
6089
6090 if (zero_src || (*mask >> 24) == 0)
6091 return;
6092
6093 xmm_mask = create_mask_16_128 (*mask >> 24);
6094
6095 while (w && ((uintptr_t)dst & 15))
6096 {
6097 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6098 if (pix1)
6099 {
6100 uint32_t d = *dst;
6101
6102 __m128i ms = unpack_32_1x128 (pix1);
6103 __m128i alpha = expand_alpha_1x128 (ms);
6104 __m128i dest = xmm_mask;
6105 __m128i alpha_dst = unpack_32_1x128 (d);
6106
6107 *dst = pack_1x128_32
6108 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6109 }
6110
6111 dst++;
6112 w--;
6113 }
6114
6115 while (w >= 4)
6116 {
6117 __m128i xmm_src;
6118 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6119
6120 if (!is_zero (xmm_src))
6121 {
6122 __m128i xmm_src_lo, xmm_src_hi;
6123 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6124 __m128i xmm_alpha_lo, xmm_alpha_hi;
6125
6126 xmm_dst = load_128_aligned ((__m128i*)dst);
6127
6128 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6129 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6130 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6131 &xmm_alpha_lo, &xmm_alpha_hi);
6132
6133 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6134 &xmm_alpha_lo, &xmm_alpha_hi,
6135 &xmm_mask, &xmm_mask,
6136 &xmm_dst_lo, &xmm_dst_hi);
6137
6138 save_128_aligned
6139 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6140 }
6141
6142 dst += 4;
6143 w -= 4;
6144 }
6145
6146 while (w)
6147 {
6148 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6149 if (pix1)
6150 {
6151 uint32_t d = *dst;
6152
6153 __m128i ms = unpack_32_1x128 (pix1);
6154 __m128i alpha = expand_alpha_1x128 (ms);
6155 __m128i dest = xmm_mask;
6156 __m128i alpha_dst = unpack_32_1x128 (d);
6157
6158 *dst = pack_1x128_32
6159 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6160 }
6161
6162 dst++;
6163 w--;
6164 }
6165 }
6166
6167 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6168 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6169 uint32_t, uint32_t, uint32_t,
6170 COVER, FLAG_HAVE_SOLID_MASK)
6171 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6172 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6173 uint32_t, uint32_t, uint32_t,
6174 PAD, FLAG_HAVE_SOLID_MASK)
6175 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6176 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6177 uint32_t, uint32_t, uint32_t,
6178 NONE, FLAG_HAVE_SOLID_MASK)
6179 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6180 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6181 uint32_t, uint32_t, uint32_t,
6182 NORMAL, FLAG_HAVE_SOLID_MASK)
6183
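/* Table mapping (operator, source, mask, destination) combinations to
 * the SSE2 fast paths above; pixman uses the first matching entry. */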
6184 static const pixman_fast_path_t sse2_fast_paths[] =
6185 {
6186 /* PIXMAN_OP_OVER */
6187 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6188 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6189 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6190 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6191 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6192 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6193 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6194 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6195 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6196 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6197 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6198 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6199 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6200 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6201 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6202 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6203 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6204 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6205 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6206 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6207 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6208 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6209 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6210 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6211 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6212 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6213 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6214 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6215 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6216 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6217 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6218 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6219 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6220 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6221 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6222 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6223 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6224 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6225 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6226 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6227 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6228 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6229 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6230 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6231 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6232 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6233 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6234
6235 /* PIXMAN_OP_OVER_REVERSE */
6236 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6237 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6238
6239 /* PIXMAN_OP_ADD */
6240 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6241 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6242 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6243 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6244 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6245 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6246 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6247 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6248 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6249 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6250 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6251 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6252 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6253 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6254
6255 /* PIXMAN_OP_SRC */
6256 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6257 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6258 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6259 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6260 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6261 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6262 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6263 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6264 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6265 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6266 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6267 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6268 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6269 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6270 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6271 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6272 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6273 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6274
6275 /* PIXMAN_OP_IN */
6276 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6277 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6278 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6279
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),

    { PIXMAN_OP_NONE },
};

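/* Scanline fetchers for the iterator interface: each call converts one
 * scanline of the source image to (premultiplied) a8r8g8b8 in
 * iter->buffer and advances iter->bits to the next scanline.  The
 * narrow fetchers below do not use the mask argument.
 */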
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

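    /* The usual head/body/tail pattern: copy single pixels until dst is
     * 16-byte aligned, then process four pixels per iteration with
     * aligned stores, then finish the remainder one pixel at a time.
     */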
    while (w && ((uintptr_t)dst) & 0x0f)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    while (w >= 4)
    {
	save_128_aligned (
	    (__m128i *)dst, _mm_or_si128 (
		load_128_unaligned ((__m128i *)src), ff000000));

	dst += 4;
	src += 4;
	w -= 4;
    }

    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    return iter->buffer;
}

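/* Expanding r5g6b5 to 8888: the 5- and 6-bit channels are shifted into
 * position and their high bits are replicated into the low bits, so
 * that 0x1f maps to 0xff rather than 0xf8; for a 5-bit red value r the
 * 8-bit result is (r << 3) | (r >> 2).  The vector loop below does this
 * four pixels at a time via unpack_565_to_8888 () and then forces the
 * alpha byte to 0xff.
 */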
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

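    /* Eight 16-bit pixels per iteration: load them unaligned, widen the
     * low and high halves to 32 bits, convert each half to 8888 and set
     * the alpha byte before the aligned stores.
     */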
    while (w >= 8)
    {
	__m128i lo, hi, s;

	s = _mm_loadu_si128 ((__m128i *)src);

	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    return iter->buffer;
}

static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = (uint8_t *)iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
	*dst++ = (uint32_t)(*(src++)) << 24;
	w--;
    }

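    /* The two unpack passes interleave the alpha bytes with zeros:
     * after the epi8 unpack each 16-bit lane holds a << 8, and after
     * the epi16 unpack each 32-bit lane holds a << 24, i.e. an
     * a8r8g8b8 pixel with r, g and b set to zero.
     */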
    while (w >= 16)
    {
	xmm0 = _mm_loadu_si128 ((__m128i *)src);

	xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128 (), xmm0);
	xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128 (), xmm0);
	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);

	_mm_store_si128 (((__m128i *)(dst + 0)), xmm3);
	_mm_store_si128 (((__m128i *)(dst + 4)), xmm4);
	_mm_store_si128 (((__m128i *)(dst + 8)), xmm5);
	_mm_store_si128 (((__m128i *)(dst + 12)), xmm6);

	dst += 16;
	src += 16;
	w -= 16;
    }

    while (w)
    {
	*dst++ = (uint32_t)(*(src++)) << 24;
	w--;
    }

    return iter->buffer;
}

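/* Flags an image must satisfy before one of the iterators below is
 * used for it: a plain bits image with an identity transform whose
 * samples fully cover the clip, so that fetching reduces to reading
 * one scanline at a time.
 */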
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t sse2_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
    },
    { PIXMAN_null },
};

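/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment, while
 * SSE2 spills need 16 bytes, so GCC is asked to realign the stack
 * frame on entry.
 */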
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

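    /* The mask constants are filled in at run time rather than with
     * static initializers; presumably this keeps the file buildable
     * with compilers that cannot statically initialize an __m128i.
     */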
    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);

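    /* combine_32 entries handle unified alpha (a single alpha value per
     * mask pixel); combine_32_ca entries handle component alpha, where
     * each of r, g and b carries its own mask value.
     */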
    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->iter_info = sse2_iters;

    return imp;
}