/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
#define PSHUFD_IS_FAST 0

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

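/* Pixels are a8r8g8b8 (32 bpp).  The arithmetic helpers below operate
 * on pixels unpacked to 16 bits per channel, so one __m128i holds two
 * unpacked pixels; the "2x128" variants process four pixels as a
 * lo/hi pair of such registers.
 */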
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t = _mm_and_si128 (rb, mask_565_fix_rb);
    t = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

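/* Convert eight packed a8r8g8b8 pixels (two registers of four) to
 * r5g6b5.  The multiply-add places red and blue simultaneously:
 * within each pixel, _mm_madd_epi16 computes
 * (b & 0xf8) * 4 + (r & 0xf8) * 0x2000, leaving blue in bits 5-9 and
 * red in bits 16-20 of the 32-bit lane.  Green is OR'd in, then the
 * shift pair narrows 32 -> 16 bits (_mm_packus_epi32 is SSE4.1 and
 * thus unavailable here).
 */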
static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

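/* In the tests below, the 0x8888 mask selects the _mm_movemask_epi8
 * bits that correspond to byte 3 of each 32-bit pixel, i.e. the four
 * alpha bytes.
 */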
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

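/* Per-channel multiply with correct rounding: for t = x * a, the
 * sequence below computes (t + 0x80 + ((t + 0x80) >> 8)) >> 8 as
 * ((t + 0x80) * 0x0101) >> 16, which is exactly t / 255 rounded to
 * nearest.  _mm_mulhi_epu16 with mask_0101 provides the final >> 16.
 */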
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

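/* The core OVER operator: dst = src + dst * (255 - srca) / 255, with
 * the final add saturating at 8 bits per channel.  src is expected to
 * be premultiplied, and "alpha" holds the source alpha replicated to
 * all channels.
 */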
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

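/* combine1/combine4 fetch one and four source pixels respectively,
 * multiplying them by the alpha channel of the corresponding mask
 * pixels when a mask is present (these "_u" combiners use only the
 * mask's alpha channel).  combine4 also shortcuts a fully transparent
 * mask to zero.
 */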
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s;
    memcpy (&s, ps, sizeof (uint32_t));

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *      pd,
                               const uint32_t* ps,
                               const uint32_t* pm,
                               int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t* ps,
                                  int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

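/* ATOP: result = src * dsta / 255 + dst * (255 - srca) / 255.  Note
 * that the destination alpha is left unchanged, since
 * sa * da + da * (255 - sa) == 255 * da (all divided by 255).
 */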
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

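/* XOR: result = src * (255 - dsta) / 255 + dst * (255 - srca) / 255. */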
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

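/* SATURATE adds src to dst, but first scales src down whenever its
 * alpha exceeds the free space (255 - dsta) in the destination, so
 * that the per-channel sums of premultiplied pixels cannot overflow.
 */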
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some source alpha is greater than the respective ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

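/* The "_ca" combiners below take a full a8r8g8b8 mask and apply it
 * per channel (component alpha), rather than using only the mask's
 * alpha channel as the "_u" variants above do.
 */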
static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

        w--;
    }
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }
}

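/* Component-alpha ATOP: with m the per-channel mask,
 * result = src * m * dsta / 255 + dst * (255 - m * srca) / 255.
 */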
2022 static force_inline uint32_t
core_combine_atop_ca_pixel_sse2(uint32_t src,uint32_t mask,uint32_t dst)2023 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2024 uint32_t mask,
2025 uint32_t dst)
2026 {
2027 __m128i m = unpack_32_1x128 (mask);
2028 __m128i s = unpack_32_1x128 (src);
2029 __m128i d = unpack_32_1x128 (dst);
2030 __m128i sa = expand_alpha_1x128 (s);
2031 __m128i da = expand_alpha_1x128 (d);
2032
2033 s = pix_multiply_1x128 (s, m);
2034 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035
2036 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2037 }
2038
2039 static void
sse2_combine_atop_ca(pixman_implementation_t * imp,pixman_op_t op,uint32_t * pd,const uint32_t * ps,const uint32_t * pm,int w)2040 sse2_combine_atop_ca (pixman_implementation_t *imp,
2041 pixman_op_t op,
2042 uint32_t * pd,
2043 const uint32_t * ps,
2044 const uint32_t * pm,
2045 int w)
2046 {
2047 uint32_t s, m, d;
2048
2049 __m128i xmm_src_lo, xmm_src_hi;
2050 __m128i xmm_dst_lo, xmm_dst_hi;
2051 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2052 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2053 __m128i xmm_mask_lo, xmm_mask_hi;
2054
2055 while (w && (uintptr_t)pd & 15)
2056 {
2057 s = *ps++;
2058 m = *pm++;
2059 d = *pd;
2060
2061 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2062 w--;
2063 }
2064
2065 while (w >= 4)
2066 {
2067 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2068 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2069 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070
2071 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2072 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2073 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074
2075 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2076 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2077 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2078 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079
2080 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2081 &xmm_mask_lo, &xmm_mask_hi,
2082 &xmm_src_lo, &xmm_src_hi);
2083 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2084 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2085 &xmm_mask_lo, &xmm_mask_hi);
2086
2087 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088
2089 pix_add_multiply_2x128 (
2090 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092 &xmm_dst_lo, &xmm_dst_hi);
2093
2094 save_128_aligned (
2095 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096
2097 ps += 4;
2098 pd += 4;
2099 pm += 4;
2100 w -= 4;
2101 }
2102
2103 while (w)
2104 {
2105 s = *ps++;
2106 m = *pm++;
2107 d = *pd;
2108
2109 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2110 w--;
2111 }
2112 }
2113
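/*
 * Component-alpha ATOP-reverse for one pixel:
 *
 *     dest = src * m * (1 - da) + dst * (m * sa)
 *
 * i.e. the two alpha factors trade places relative to
 * core_combine_atop_ca_pixel_sse2 () above.
 */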
2114 static force_inline uint32_t
2115 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2116 uint32_t mask,
2117 uint32_t dst)
2118 {
2119 __m128i m = unpack_32_1x128 (mask);
2120 __m128i s = unpack_32_1x128 (src);
2121 __m128i d = unpack_32_1x128 (dst);
2122
2123 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2124 __m128i sa = expand_alpha_1x128 (s);
2125
2126 s = pix_multiply_1x128 (s, m);
2127 m = pix_multiply_1x128 (m, sa);
2128
2129 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2130 }
2131
2132 static void
2133 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2134 pixman_op_t op,
2135 uint32_t * pd,
2136 const uint32_t * ps,
2137 const uint32_t * pm,
2138 int w)
2139 {
2140 uint32_t s, m, d;
2141
2142 __m128i xmm_src_lo, xmm_src_hi;
2143 __m128i xmm_dst_lo, xmm_dst_hi;
2144 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146 __m128i xmm_mask_lo, xmm_mask_hi;
2147
2148 while (w && (uintptr_t)pd & 15)
2149 {
2150 s = *ps++;
2151 m = *pm++;
2152 d = *pd;
2153
2154 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2155 w--;
2156 }
2157
2158 while (w >= 4)
2159 {
2160 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163
2164 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167
2168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172
2173 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174 &xmm_mask_lo, &xmm_mask_hi,
2175 &xmm_src_lo, &xmm_src_hi);
2176 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178 &xmm_mask_lo, &xmm_mask_hi);
2179
2180 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182
2183 pix_add_multiply_2x128 (
2184 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2185 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2186 &xmm_dst_lo, &xmm_dst_hi);
2187
2188 save_128_aligned (
2189 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190
2191 ps += 4;
2192 pd += 4;
2193 pm += 4;
2194 w -= 4;
2195 }
2196
2197 while (w)
2198 {
2199 s = *ps++;
2200 m = *pm++;
2201 d = *pd;
2202
2203 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2204 w--;
2205 }
2206 }
2207
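/*
 * Component-alpha XOR for one pixel:
 *
 *     dest = src * m * (1 - da) + dst * (1 - m * sa)
 *
 * Each layer keeps only the part not covered by the other, as in the
 * Porter-Duff XOR operator.
 */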
2208 static force_inline uint32_t
2209 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2210 uint32_t mask,
2211 uint32_t dst)
2212 {
2213 __m128i a = unpack_32_1x128 (mask);
2214 __m128i s = unpack_32_1x128 (src);
2215 __m128i d = unpack_32_1x128 (dst);
2216
2217 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2218 a, expand_alpha_1x128 (s)));
2219 __m128i dest = pix_multiply_1x128 (s, a);
2220 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221
2222 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2223 &alpha_dst,
2224 &dest,
2225 &alpha_src));
2226 }
2227
2228 static void
2229 sse2_combine_xor_ca (pixman_implementation_t *imp,
2230 pixman_op_t op,
2231 uint32_t * pd,
2232 const uint32_t * ps,
2233 const uint32_t * pm,
2234 int w)
2235 {
2236 uint32_t s, m, d;
2237
2238 __m128i xmm_src_lo, xmm_src_hi;
2239 __m128i xmm_dst_lo, xmm_dst_hi;
2240 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2241 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2242 __m128i xmm_mask_lo, xmm_mask_hi;
2243
2244 while (w && (uintptr_t)pd & 15)
2245 {
2246 s = *ps++;
2247 m = *pm++;
2248 d = *pd;
2249
2250 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2251 w--;
2252 }
2253
2254 while (w >= 4)
2255 {
2256 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2257 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2258 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259
2260 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2261 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2262 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263
2264 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2265 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2266 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2267 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268
2269 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2270 &xmm_mask_lo, &xmm_mask_hi,
2271 &xmm_src_lo, &xmm_src_hi);
2272 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2273 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2274 &xmm_mask_lo, &xmm_mask_hi);
2275
2276 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2277 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2278 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2279 &xmm_mask_lo, &xmm_mask_hi);
2280
2281 pix_add_multiply_2x128 (
2282 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2283 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2284 &xmm_dst_lo, &xmm_dst_hi);
2285
2286 save_128_aligned (
2287 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2288
2289 ps += 4;
2290 pd += 4;
2291 pm += 4;
2292 w -= 4;
2293 }
2294
2295 while (w)
2296 {
2297 s = *ps++;
2298 m = *pm++;
2299 d = *pd;
2300
2301 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2302 w--;
2303 }
2304 }
2305
2306 static void
2307 sse2_combine_add_ca (pixman_implementation_t *imp,
2308 pixman_op_t op,
2309 uint32_t * pd,
2310 const uint32_t * ps,
2311 const uint32_t * pm,
2312 int w)
2313 {
2314 uint32_t s, m, d;
2315
2316 __m128i xmm_src_lo, xmm_src_hi;
2317 __m128i xmm_dst_lo, xmm_dst_hi;
2318 __m128i xmm_mask_lo, xmm_mask_hi;
2319
2320 while (w && (uintptr_t)pd & 15)
2321 {
2322 s = *ps++;
2323 m = *pm++;
2324 d = *pd;
2325
2326 *pd++ = pack_1x128_32 (
2327 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2328 unpack_32_1x128 (m)),
2329 unpack_32_1x128 (d)));
2330 w--;
2331 }
2332
2333 while (w >= 4)
2334 {
2335 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2336 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2337 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338
2339 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2340 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2341 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342
2343 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2344 &xmm_mask_lo, &xmm_mask_hi,
2345 &xmm_src_lo, &xmm_src_hi);
2346
2347 save_128_aligned (
2348 (__m128i*)pd, pack_2x128_128 (
2349 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2350 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2351
2352 ps += 4;
2353 pd += 4;
2354 pm += 4;
2355 w -= 4;
2356 }
2357
2358 while (w)
2359 {
2360 s = *ps++;
2361 m = *pm++;
2362 d = *pd;
2363
2364 *pd++ = pack_1x128_32 (
2365 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2366 unpack_32_1x128 (m)),
2367 unpack_32_1x128 (d)));
2368 w--;
2369 }
2370 }
2371
2372 static force_inline __m128i
2373 create_mask_16_128 (uint16_t mask)
2374 {
2375 return _mm_set1_epi16 (mask);
2376 }
2377
2378 /* Work around a code generation bug in Sun Studio 12. */
2379 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2380 # define create_mask_2x32_128(mask0, mask1) \
2381 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2382 #else
2383 static force_inline __m128i
2384 create_mask_2x32_128 (uint32_t mask0,
2385 uint32_t mask1)
2386 {
2387 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2388 }
2389 #endif
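/* For example, create_mask_2x32_128 (0xff000000, 0x00000000) yields the
 * 128-bit pattern ff000000 00000000 ff000000 00000000 (high word first),
 * i.e. the two 32-bit values repeated across both halves. */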
2390
2391 static void
2392 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2393 pixman_composite_info_t *info)
2394 {
2395 PIXMAN_COMPOSITE_ARGS (info);
2396 uint32_t src;
2397 uint32_t *dst_line, *dst, d;
2398 int32_t w;
2399 int dst_stride;
2400 __m128i xmm_src, xmm_alpha;
2401 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402
2403 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2404
2405 if (src == 0)
2406 return;
2407
2408 PIXMAN_IMAGE_GET_LINE (
2409 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2410
2411 xmm_src = expand_pixel_32_1x128 (src);
2412 xmm_alpha = expand_alpha_1x128 (xmm_src);
2413
2414 while (height--)
2415 {
2416 dst = dst_line;
2417
2418 dst_line += dst_stride;
2419 w = width;
2420
2421 while (w && (uintptr_t)dst & 15)
2422 {
2423 d = *dst;
2424 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2425 xmm_alpha,
2426 unpack_32_1x128 (d)));
2427 w--;
2428 }
2429
2430 while (w >= 4)
2431 {
2432 xmm_dst = load_128_aligned ((__m128i*)dst);
2433
2434 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435
2436 over_2x128 (&xmm_src, &xmm_src,
2437 &xmm_alpha, &xmm_alpha,
2438 &xmm_dst_lo, &xmm_dst_hi);
2439
2440 /* rebuild the 4 pixel data and save */
2441 save_128_aligned (
2442 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2443
2444 w -= 4;
2445 dst += 4;
2446 }
2447
2448 while (w)
2449 {
2450 d = *dst;
2451 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2452 xmm_alpha,
2453 unpack_32_1x128 (d)));
2454 w--;
2455 }
2456
2457 }
2458 }
2459
2460 static void
2461 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2462 pixman_composite_info_t *info)
2463 {
2464 PIXMAN_COMPOSITE_ARGS (info);
2465 uint32_t src;
2466 uint16_t *dst_line, *dst, d;
2467 int32_t w;
2468 int dst_stride;
2469 __m128i xmm_src, xmm_alpha;
2470 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471
2472 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2473
2474 if (src == 0)
2475 return;
2476
2477 PIXMAN_IMAGE_GET_LINE (
2478 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479
2480 xmm_src = expand_pixel_32_1x128 (src);
2481 xmm_alpha = expand_alpha_1x128 (xmm_src);
2482
2483 while (height--)
2484 {
2485 dst = dst_line;
2486
2487 dst_line += dst_stride;
2488 w = width;
2489
2490 while (w && (uintptr_t)dst & 15)
2491 {
2492 d = *dst;
2493
2494 *dst++ = pack_565_32_16 (
2495 pack_1x128_32 (over_1x128 (xmm_src,
2496 xmm_alpha,
2497 expand565_16_1x128 (d))));
2498 w--;
2499 }
2500
2501 while (w >= 8)
2502 {
2503 xmm_dst = load_128_aligned ((__m128i*)dst);
2504
2505 unpack_565_128_4x128 (xmm_dst,
2506 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507
2508 over_2x128 (&xmm_src, &xmm_src,
2509 &xmm_alpha, &xmm_alpha,
2510 &xmm_dst0, &xmm_dst1);
2511 over_2x128 (&xmm_src, &xmm_src,
2512 &xmm_alpha, &xmm_alpha,
2513 &xmm_dst2, &xmm_dst3);
2514
2515 xmm_dst = pack_565_4x128_128 (
2516 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517
2518 save_128_aligned ((__m128i*)dst, xmm_dst);
2519
2520 dst += 8;
2521 w -= 8;
2522 }
2523
2524 while (w--)
2525 {
2526 d = *dst;
2527 *dst++ = pack_565_32_16 (
2528 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2529 expand565_16_1x128 (d))));
2530 }
2531 }
2532
2533 }
2534
2535 static void
2536 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2537 pixman_composite_info_t *info)
2538 {
2539 PIXMAN_COMPOSITE_ARGS (info);
2540 uint32_t src;
2541 uint32_t *dst_line, d;
2542 uint32_t *mask_line, m;
2543 uint32_t pack_cmp;
2544 int dst_stride, mask_stride;
2545
2546 __m128i xmm_src;
2547 __m128i xmm_dst;
2548 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549
2550 __m128i mmx_src, mmx_mask, mmx_dest;
2551
2552 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553
2554 if (src == 0)
2555 return;
2556
2557 PIXMAN_IMAGE_GET_LINE (
2558 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2559 PIXMAN_IMAGE_GET_LINE (
2560 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561
2562 xmm_src = _mm_unpacklo_epi8 (
2563 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2564 mmx_src = xmm_src;
2565
2566 while (height--)
2567 {
2568 int w = width;
2569 const uint32_t *pm = (uint32_t *)mask_line;
2570 uint32_t *pd = (uint32_t *)dst_line;
2571
2572 dst_line += dst_stride;
2573 mask_line += mask_stride;
2574
2575 while (w && (uintptr_t)pd & 15)
2576 {
2577 m = *pm++;
2578
2579 if (m)
2580 {
2581 d = *pd;
2582
2583 mmx_mask = unpack_32_1x128 (m);
2584 mmx_dest = unpack_32_1x128 (d);
2585
2586 *pd = pack_1x128_32 (
2587 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2588 mmx_dest));
2589 }
2590
2591 pd++;
2592 w--;
2593 }
2594
2595 while (w >= 4)
2596 {
2597 xmm_mask = load_128_unaligned ((__m128i*)pm);
2598
2599 pack_cmp =
2600 _mm_movemask_epi8 (
2601 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602
2603 /* if all four mask pixels are zero, pack_cmp is equal to 0xffff */
2604 if (pack_cmp != 0xffff)
2605 {
2606 xmm_dst = load_128_aligned ((__m128i*)pd);
2607
2608 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609
2610 pix_multiply_2x128 (&xmm_src, &xmm_src,
2611 &xmm_mask_lo, &xmm_mask_hi,
2612 &xmm_mask_lo, &xmm_mask_hi);
2613 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614
2615 save_128_aligned (
2616 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2617 }
2618
2619 pd += 4;
2620 pm += 4;
2621 w -= 4;
2622 }
2623
2624 while (w)
2625 {
2626 m = *pm++;
2627
2628 if (m)
2629 {
2630 d = *pd;
2631
2632 mmx_mask = unpack_32_1x128 (m);
2633 mmx_dest = unpack_32_1x128 (d);
2634
2635 *pd = pack_1x128_32 (
2636 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2637 mmx_dest));
2638 }
2639
2640 pd++;
2641 w--;
2642 }
2643 }
2644
2645 }
2646
2647 static void
2648 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2649 pixman_composite_info_t *info)
2650 {
2651 PIXMAN_COMPOSITE_ARGS (info);
2652 uint32_t src;
2653 uint32_t *dst_line, d;
2654 uint32_t *mask_line, m;
2655 uint32_t pack_cmp;
2656 int dst_stride, mask_stride;
2657
2658 __m128i xmm_src, xmm_alpha;
2659 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2660 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661
2662 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663
2664 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2665
2666 if (src == 0)
2667 return;
2668
2669 PIXMAN_IMAGE_GET_LINE (
2670 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2671 PIXMAN_IMAGE_GET_LINE (
2672 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2673
2674 xmm_src = _mm_unpacklo_epi8 (
2675 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2676 xmm_alpha = expand_alpha_1x128 (xmm_src);
2677 mmx_src = xmm_src;
2678 mmx_alpha = xmm_alpha;
2679
2680 while (height--)
2681 {
2682 int w = width;
2683 const uint32_t *pm = (uint32_t *)mask_line;
2684 uint32_t *pd = (uint32_t *)dst_line;
2685
2686 dst_line += dst_stride;
2687 mask_line += mask_stride;
2688
2689 while (w && (uintptr_t)pd & 15)
2690 {
2691 m = *pm++;
2692
2693 if (m)
2694 {
2695 d = *pd;
2696 mmx_mask = unpack_32_1x128 (m);
2697 mmx_dest = unpack_32_1x128 (d);
2698
2699 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2700 &mmx_alpha,
2701 &mmx_mask,
2702 &mmx_dest));
2703 }
2704
2705 pd++;
2706 w--;
2707 }
2708
2709 while (w >= 4)
2710 {
2711 xmm_mask = load_128_unaligned ((__m128i*)pm);
2712
2713 pack_cmp =
2714 _mm_movemask_epi8 (
2715 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716
2717 /* if all four mask pixels are zero, pack_cmp is equal to 0xffff */
2718 if (pack_cmp != 0xffff)
2719 {
2720 xmm_dst = load_128_aligned ((__m128i*)pd);
2721
2722 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2723 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724
2725 in_over_2x128 (&xmm_src, &xmm_src,
2726 &xmm_alpha, &xmm_alpha,
2727 &xmm_mask_lo, &xmm_mask_hi,
2728 &xmm_dst_lo, &xmm_dst_hi);
2729
2730 save_128_aligned (
2731 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2732 }
2733
2734 pd += 4;
2735 pm += 4;
2736 w -= 4;
2737 }
2738
2739 while (w)
2740 {
2741 m = *pm++;
2742
2743 if (m)
2744 {
2745 d = *pd;
2746 mmx_mask = unpack_32_1x128 (m);
2747 mmx_dest = unpack_32_1x128 (d);
2748
2749 *pd = pack_1x128_32 (
2750 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2751 }
2752
2753 pd++;
2754 w--;
2755 }
2756 }
2757
2758 }
2759
2760 static void
2761 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2762 pixman_composite_info_t *info)
2763 {
2764 PIXMAN_COMPOSITE_ARGS (info);
2765 uint32_t *dst_line, *dst;
2766 uint32_t *src_line, *src;
2767 uint32_t mask;
2768 int32_t w;
2769 int dst_stride, src_stride;
2770
2771 __m128i xmm_mask;
2772 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2773 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2774 __m128i xmm_alpha_lo, xmm_alpha_hi;
2775
2776 PIXMAN_IMAGE_GET_LINE (
2777 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2778 PIXMAN_IMAGE_GET_LINE (
2779 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780
2781 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2782
2783 xmm_mask = create_mask_16_128 (mask >> 24);
2784
2785 while (height--)
2786 {
2787 dst = dst_line;
2788 dst_line += dst_stride;
2789 src = src_line;
2790 src_line += src_stride;
2791 w = width;
2792
2793 while (w && (uintptr_t)dst & 15)
2794 {
2795 uint32_t s = *src++;
2796
2797 if (s)
2798 {
2799 uint32_t d = *dst;
2800
2801 __m128i ms = unpack_32_1x128 (s);
2802 __m128i alpha = expand_alpha_1x128 (ms);
2803 __m128i dest = xmm_mask;
2804 __m128i alpha_dst = unpack_32_1x128 (d);
2805
2806 *dst = pack_1x128_32 (
2807 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2808 }
2809 dst++;
2810 w--;
2811 }
2812
2813 while (w >= 4)
2814 {
2815 xmm_src = load_128_unaligned ((__m128i*)src);
2816
2817 if (!is_zero (xmm_src))
2818 {
2819 xmm_dst = load_128_aligned ((__m128i*)dst);
2820
2821 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2822 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2823 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2824 &xmm_alpha_lo, &xmm_alpha_hi);
2825
2826 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2827 &xmm_alpha_lo, &xmm_alpha_hi,
2828 &xmm_mask, &xmm_mask,
2829 &xmm_dst_lo, &xmm_dst_hi);
2830
2831 save_128_aligned (
2832 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2833 }
2834
2835 dst += 4;
2836 src += 4;
2837 w -= 4;
2838 }
2839
2840 while (w)
2841 {
2842 uint32_t s = *src++;
2843
2844 if (s)
2845 {
2846 uint32_t d = *dst;
2847
2848 __m128i ms = unpack_32_1x128 (s);
2849 __m128i alpha = expand_alpha_1x128 (ms);
2850 __m128i mask = xmm_mask;
2851 __m128i dest = unpack_32_1x128 (d);
2852
2853 *dst = pack_1x128_32 (
2854 in_over_1x128 (&ms, &alpha, &mask, &dest));
2855 }
2856
2857 dst++;
2858 w--;
2859 }
2860 }
2861
2862 }
2863
2864 static void
2865 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2866 pixman_composite_info_t *info)
2867 {
2868 PIXMAN_COMPOSITE_ARGS (info);
2869 uint16_t *dst_line, *dst;
2870 uint32_t *src_line, *src, s;
2871 int dst_stride, src_stride;
2872 int32_t w;
2873
2874 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2875 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2876
2877 while (height--)
2878 {
2879 dst = dst_line;
2880 dst_line += dst_stride;
2881 src = src_line;
2882 src_line += src_stride;
2883 w = width;
2884
2885 while (w && (uintptr_t)dst & 15)
2886 {
2887 s = *src++;
2888 *dst = convert_8888_to_0565 (s);
2889 dst++;
2890 w--;
2891 }
2892
2893 while (w >= 8)
2894 {
2895 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2896 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897
2898 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2899
2900 w -= 8;
2901 src += 8;
2902 dst += 8;
2903 }
2904
2905 while (w)
2906 {
2907 s = *src++;
2908 *dst = convert_8888_to_0565 (s);
2909 dst++;
2910 w--;
2911 }
2912 }
2913 }
2914
2915 static void
2916 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2917 pixman_composite_info_t *info)
2918 {
2919 PIXMAN_COMPOSITE_ARGS (info);
2920 uint32_t *dst_line, *dst;
2921 uint32_t *src_line, *src;
2922 int32_t w;
2923 int dst_stride, src_stride;
2924
2925
2926 PIXMAN_IMAGE_GET_LINE (
2927 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2928 PIXMAN_IMAGE_GET_LINE (
2929 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2930
2931 while (height--)
2932 {
2933 dst = dst_line;
2934 dst_line += dst_stride;
2935 src = src_line;
2936 src_line += src_stride;
2937 w = width;
2938
2939 while (w && (uintptr_t)dst & 15)
2940 {
2941 *dst++ = *src++ | 0xff000000;
2942 w--;
2943 }
2944
2945 while (w >= 16)
2946 {
2947 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948
2949 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2950 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2951 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2952 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953
2954 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2955 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2956 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2957 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2958
2959 dst += 16;
2960 src += 16;
2961 w -= 16;
2962 }
2963
2964 while (w)
2965 {
2966 *dst++ = *src++ | 0xff000000;
2967 w--;
2968 }
2969 }
2970
2971 }
2972
2973 static void
2974 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2975 pixman_composite_info_t *info)
2976 {
2977 PIXMAN_COMPOSITE_ARGS (info);
2978 uint32_t *dst_line, *dst;
2979 uint32_t *src_line, *src;
2980 uint32_t mask;
2981 int dst_stride, src_stride;
2982 int32_t w;
2983
2984 __m128i xmm_mask, xmm_alpha;
2985 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2986 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987
2988 PIXMAN_IMAGE_GET_LINE (
2989 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2990 PIXMAN_IMAGE_GET_LINE (
2991 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992
2993 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994
2995 xmm_mask = create_mask_16_128 (mask >> 24);
2996 xmm_alpha = mask_00ff;
2997
2998 while (height--)
2999 {
3000 dst = dst_line;
3001 dst_line += dst_stride;
3002 src = src_line;
3003 src_line += src_stride;
3004 w = width;
3005
3006 while (w && (uintptr_t)dst & 15)
3007 {
3008 uint32_t s = (*src++) | 0xff000000;
3009 uint32_t d = *dst;
3010
3011 __m128i src = unpack_32_1x128 (s);
3012 __m128i alpha = xmm_alpha;
3013 __m128i mask = xmm_mask;
3014 __m128i dest = unpack_32_1x128 (d);
3015
3016 *dst++ = pack_1x128_32 (
3017 in_over_1x128 (&src, &alpha, &mask, &dest));
3018
3019 w--;
3020 }
3021
3022 while (w >= 4)
3023 {
3024 xmm_src = _mm_or_si128 (
3025 load_128_unaligned ((__m128i*)src), mask_ff000000);
3026 xmm_dst = load_128_aligned ((__m128i*)dst);
3027
3028 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3029 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030
3031 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3032 &xmm_alpha, &xmm_alpha,
3033 &xmm_mask, &xmm_mask,
3034 &xmm_dst_lo, &xmm_dst_hi);
3035
3036 save_128_aligned (
3037 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038
3039 dst += 4;
3040 src += 4;
3041 w -= 4;
3042
3043 }
3044
3045 while (w)
3046 {
3047 uint32_t s = (*src++) | 0xff000000;
3048 uint32_t d = *dst;
3049
3050 __m128i src = unpack_32_1x128 (s);
3051 __m128i alpha = xmm_alpha;
3052 __m128i mask = xmm_mask;
3053 __m128i dest = unpack_32_1x128 (d);
3054
3055 *dst++ = pack_1x128_32 (
3056 in_over_1x128 (&src, &alpha, &mask, &dest));
3057
3058 w--;
3059 }
3060 }
3061
3062 }
3063
3064 static void
3065 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3066 pixman_composite_info_t *info)
3067 {
3068 PIXMAN_COMPOSITE_ARGS (info);
3069 int dst_stride, src_stride;
3070 uint32_t *dst_line, *dst;
3071 uint32_t *src_line, *src;
3072
3073 PIXMAN_IMAGE_GET_LINE (
3074 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3075 PIXMAN_IMAGE_GET_LINE (
3076 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077
3078 dst = dst_line;
3079 src = src_line;
3080
3081 while (height--)
3082 {
3083 sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084
3085 dst += dst_stride;
3086 src += src_stride;
3087 }
3088 }
3089
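/* OVER one a8r8g8b8 source pixel onto one r5g6b5 destination pixel:
 * widen the 565 value to 8888 (replicating the high bits into the
 * low bits), blend with the usual OVER, then truncate back to 5-6-5. */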
3090 static force_inline uint16_t
3091 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 {
3093 __m128i ms;
3094
3095 ms = unpack_32_1x128 (src);
3096 return pack_565_32_16 (
3097 pack_1x128_32 (
3098 over_1x128 (
3099 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3100 }
3101
3102 static void
3103 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3104 pixman_composite_info_t *info)
3105 {
3106 PIXMAN_COMPOSITE_ARGS (info);
3107 uint16_t *dst_line, *dst, d;
3108 uint32_t *src_line, *src, s;
3109 int dst_stride, src_stride;
3110 int32_t w;
3111
3112 __m128i xmm_alpha_lo, xmm_alpha_hi;
3113 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3114 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115
3116 PIXMAN_IMAGE_GET_LINE (
3117 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3118 PIXMAN_IMAGE_GET_LINE (
3119 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3120
3121 while (height--)
3122 {
3123 dst = dst_line;
3124 src = src_line;
3125
3126 dst_line += dst_stride;
3127 src_line += src_stride;
3128 w = width;
3129
3130 /* Align dst on a 16-byte boundary */
3131 while (w &&
3132 ((uintptr_t)dst & 15))
3133 {
3134 s = *src++;
3135 d = *dst;
3136
3137 *dst++ = composite_over_8888_0565pixel (s, d);
3138 w--;
3139 }
3140
3141 /* It's an 8-pixel loop */
3142 while (w >= 8)
3143 {
3144 /* Load unaligned because the source address is not
3145 * guaranteed to be 16-byte aligned.
3146 */
3147 xmm_src = load_128_unaligned ((__m128i*) src);
3148 xmm_dst = load_128_aligned ((__m128i*) dst);
3149
3150 /* Unpacking */
3151 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3152 unpack_565_128_4x128 (xmm_dst,
3153 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3154 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3155 &xmm_alpha_lo, &xmm_alpha_hi);
3156
3157 /* Load the next 4 pixels from memory early so the
3158 * read overlaps with the computation below.
3159 */
3160 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161
3162 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3163 &xmm_alpha_lo, &xmm_alpha_hi,
3164 &xmm_dst0, &xmm_dst1);
3165
3166 /* Unpacking */
3167 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3168 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3169 &xmm_alpha_lo, &xmm_alpha_hi);
3170
3171 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3172 &xmm_alpha_lo, &xmm_alpha_hi,
3173 &xmm_dst2, &xmm_dst3);
3174
3175 save_128_aligned (
3176 (__m128i*)dst, pack_565_4x128_128 (
3177 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3178
3179 w -= 8;
3180 dst += 8;
3181 src += 8;
3182 }
3183
3184 while (w--)
3185 {
3186 s = *src++;
3187 d = *dst;
3188
3189 *dst++ = composite_over_8888_0565pixel (s, d);
3190 }
3191 }
3192
3193 }
3194
3195 static void
3196 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3197 pixman_composite_info_t *info)
3198 {
3199 PIXMAN_COMPOSITE_ARGS (info);
3200 uint32_t src, srca;
3201 uint32_t *dst_line, *dst;
3202 uint8_t *mask_line, *mask;
3203 int dst_stride, mask_stride;
3204 int32_t w;
3205 uint32_t d;
3206
3207 __m128i xmm_src, xmm_alpha, xmm_def;
3208 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3209 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210
3211 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212
3213 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3214
3215 srca = src >> 24;
3216 if (src == 0)
3217 return;
3218
3219 PIXMAN_IMAGE_GET_LINE (
3220 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3221 PIXMAN_IMAGE_GET_LINE (
3222 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3223
3224 xmm_def = create_mask_2x32_128 (src, src);
3225 xmm_src = expand_pixel_32_1x128 (src);
3226 xmm_alpha = expand_alpha_1x128 (xmm_src);
3227 mmx_src = xmm_src;
3228 mmx_alpha = xmm_alpha;
3229
3230 while (height--)
3231 {
3232 dst = dst_line;
3233 dst_line += dst_stride;
3234 mask = mask_line;
3235 mask_line += mask_stride;
3236 w = width;
3237
3238 while (w && (uintptr_t)dst & 15)
3239 {
3240 uint8_t m = *mask++;
3241
3242 if (m)
3243 {
3244 d = *dst;
3245 mmx_mask = expand_pixel_8_1x128 (m);
3246 mmx_dest = unpack_32_1x128 (d);
3247
3248 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3249 &mmx_alpha,
3250 &mmx_mask,
3251 &mmx_dest));
3252 }
3253
3254 w--;
3255 dst++;
3256 }
3257
3258 while (w >= 4)
3259 {
3260 uint32_t m;
3261 memcpy(&m, mask, sizeof(uint32_t));
3262
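/* Fast path: with an opaque source and all four mask bytes set,
 * OVER degenerates into a plain store of the solid source. */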
3263 if (srca == 0xff && m == 0xffffffff)
3264 {
3265 save_128_aligned ((__m128i*)dst, xmm_def);
3266 }
3267 else if (m)
3268 {
3269 xmm_dst = load_128_aligned ((__m128i*) dst);
3270 xmm_mask = unpack_32_1x128 (m);
3271 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3272
3273 /* Unpacking */
3274 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3275 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3276
3277 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3278 &xmm_mask_lo, &xmm_mask_hi);
3279
3280 in_over_2x128 (&xmm_src, &xmm_src,
3281 &xmm_alpha, &xmm_alpha,
3282 &xmm_mask_lo, &xmm_mask_hi,
3283 &xmm_dst_lo, &xmm_dst_hi);
3284
3285 save_128_aligned (
3286 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3287 }
3288
3289 w -= 4;
3290 dst += 4;
3291 mask += 4;
3292 }
3293
3294 while (w)
3295 {
3296 uint8_t m = *mask++;
3297
3298 if (m)
3299 {
3300 d = *dst;
3301 mmx_mask = expand_pixel_8_1x128 (m);
3302 mmx_dest = unpack_32_1x128 (d);
3303
3304 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3305 &mmx_alpha,
3306 &mmx_mask,
3307 &mmx_dest));
3308 }
3309
3310 w--;
3311 dst++;
3312 }
3313 }
3314
3315 }
3316
3317 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3318 __attribute__((__force_align_arg_pointer__))
3319 #endif
3320 static pixman_bool_t
3321 sse2_fill (pixman_implementation_t *imp,
3322 uint32_t * bits,
3323 int stride,
3324 int bpp,
3325 int x,
3326 int y,
3327 int width,
3328 int height,
3329 uint32_t filler)
3330 {
3331 uint32_t byte_width;
3332 uint8_t *byte_line;
3333
3334 __m128i xmm_def;
3335
3336 if (bpp == 8)
3337 {
3338 uint32_t b;
3339 uint32_t w;
3340
3341 stride = stride * (int) sizeof (uint32_t) / 1;
3342 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3343 byte_width = width;
3344 stride *= 1;
3345
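/* Replicate the fill byte into all four bytes of the filler,
 * e.g. 0x000000ab becomes 0xabababab. */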
3346 b = filler & 0xff;
3347 w = (b << 8) | b;
3348 filler = (w << 16) | w;
3349 }
3350 else if (bpp == 16)
3351 {
3352 stride = stride * (int) sizeof (uint32_t) / 2;
3353 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3354 byte_width = 2 * width;
3355 stride *= 2;
3356
3357 filler = (filler & 0xffff) * 0x00010001;
3358 }
3359 else if (bpp == 32)
3360 {
3361 stride = stride * (int) sizeof (uint32_t) / 4;
3362 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3363 byte_width = 4 * width;
3364 stride *= 4;
3365 }
3366 else
3367 {
3368 return FALSE;
3369 }
3370
3371 xmm_def = create_mask_2x32_128 (filler, filler);
3372
3373 while (height--)
3374 {
3375 int w;
3376 uint8_t *d = byte_line;
3377 byte_line += stride;
3378 w = byte_width;
3379
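/* Store 1-, 2- and 4-byte chunks until d is 16-byte aligned, then
 * fall through to the wide aligned stores below. */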
3380 if (w >= 1 && ((uintptr_t)d & 1))
3381 {
3382 *(uint8_t *)d = filler;
3383 w -= 1;
3384 d += 1;
3385 }
3386
3387 while (w >= 2 && ((uintptr_t)d & 3))
3388 {
3389 *(uint16_t *)d = filler;
3390 w -= 2;
3391 d += 2;
3392 }
3393
3394 while (w >= 4 && ((uintptr_t)d & 15))
3395 {
3396 *(uint32_t *)d = filler;
3397
3398 w -= 4;
3399 d += 4;
3400 }
3401
3402 while (w >= 128)
3403 {
3404 save_128_aligned ((__m128i*)(d), xmm_def);
3405 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3406 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3407 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3408 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3409 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3410 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3411 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3412
3413 d += 128;
3414 w -= 128;
3415 }
3416
3417 if (w >= 64)
3418 {
3419 save_128_aligned ((__m128i*)(d), xmm_def);
3420 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3421 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3422 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3423
3424 d += 64;
3425 w -= 64;
3426 }
3427
3428 if (w >= 32)
3429 {
3430 save_128_aligned ((__m128i*)(d), xmm_def);
3431 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3432
3433 d += 32;
3434 w -= 32;
3435 }
3436
3437 if (w >= 16)
3438 {
3439 save_128_aligned ((__m128i*)(d), xmm_def);
3440
3441 d += 16;
3442 w -= 16;
3443 }
3444
3445 while (w >= 4)
3446 {
3447 *(uint32_t *)d = filler;
3448
3449 w -= 4;
3450 d += 4;
3451 }
3452
3453 if (w >= 2)
3454 {
3455 *(uint16_t *)d = filler;
3456 w -= 2;
3457 d += 2;
3458 }
3459
3460 if (w >= 1)
3461 {
3462 *(uint8_t *)d = filler;
3463 w -= 1;
3464 d += 1;
3465 }
3466 }
3467
3468 return TRUE;
3469 }
3470
3471 static void
3472 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3473 pixman_composite_info_t *info)
3474 {
3475 PIXMAN_COMPOSITE_ARGS (info);
3476 uint32_t src, srca;
3477 uint32_t *dst_line, *dst;
3478 uint8_t *mask_line, *mask;
3479 int dst_stride, mask_stride;
3480 int32_t w;
3481
3482 __m128i xmm_src, xmm_def;
3483 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484
3485 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3486
3487 srca = src >> 24;
3488 if (src == 0)
3489 {
3490 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3491 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3492 dest_x, dest_y, width, height, 0);
3493 return;
3494 }
3495
3496 PIXMAN_IMAGE_GET_LINE (
3497 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3498 PIXMAN_IMAGE_GET_LINE (
3499 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500
3501 xmm_def = create_mask_2x32_128 (src, src);
3502 xmm_src = expand_pixel_32_1x128 (src);
3503
3504 while (height--)
3505 {
3506 dst = dst_line;
3507 dst_line += dst_stride;
3508 mask = mask_line;
3509 mask_line += mask_stride;
3510 w = width;
3511
3512 while (w && (uintptr_t)dst & 15)
3513 {
3514 uint8_t m = *mask++;
3515
3516 if (m)
3517 {
3518 *dst = pack_1x128_32 (
3519 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3520 }
3521 else
3522 {
3523 *dst = 0;
3524 }
3525
3526 w--;
3527 dst++;
3528 }
3529
3530 while (w >= 4)
3531 {
3532 uint32_t m;
3533 memcpy(&m, mask, sizeof(uint32_t));
3534
3535 if (srca == 0xff && m == 0xffffffff)
3536 {
3537 save_128_aligned ((__m128i*)dst, xmm_def);
3538 }
3539 else if (m)
3540 {
3541 xmm_mask = unpack_32_1x128 (m);
3542 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3543
3544 /* Unpacking */
3545 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3546
3547 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3548 &xmm_mask_lo, &xmm_mask_hi);
3549
3550 pix_multiply_2x128 (&xmm_src, &xmm_src,
3551 &xmm_mask_lo, &xmm_mask_hi,
3552 &xmm_mask_lo, &xmm_mask_hi);
3553
3554 save_128_aligned (
3555 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3556 }
3557 else
3558 {
3559 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3560 }
3561
3562 w -= 4;
3563 dst += 4;
3564 mask += 4;
3565 }
3566
3567 while (w)
3568 {
3569 uint8_t m = *mask++;
3570
3571 if (m)
3572 {
3573 *dst = pack_1x128_32 (
3574 pix_multiply_1x128 (
3575 xmm_src, expand_pixel_8_1x128 (m)));
3576 }
3577 else
3578 {
3579 *dst = 0;
3580 }
3581
3582 w--;
3583 dst++;
3584 }
3585 }
3586
3587 }
3588
3589 static void
3590 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3591 pixman_composite_info_t *info)
3592 {
3593 PIXMAN_COMPOSITE_ARGS (info);
3594 uint32_t src;
3595 uint16_t *dst_line, *dst, d;
3596 uint8_t *mask_line, *mask;
3597 int dst_stride, mask_stride;
3598 int32_t w;
3599 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600
3601 __m128i xmm_src, xmm_alpha;
3602 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3603 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604
3605 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3606
3607 if (src == 0)
3608 return;
3609
3610 PIXMAN_IMAGE_GET_LINE (
3611 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3612 PIXMAN_IMAGE_GET_LINE (
3613 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614
3615 xmm_src = expand_pixel_32_1x128 (src);
3616 xmm_alpha = expand_alpha_1x128 (xmm_src);
3617 mmx_src = xmm_src;
3618 mmx_alpha = xmm_alpha;
3619
3620 while (height--)
3621 {
3622 dst = dst_line;
3623 dst_line += dst_stride;
3624 mask = mask_line;
3625 mask_line += mask_stride;
3626 w = width;
3627
3628 while (w && (uintptr_t)dst & 15)
3629 {
3630 uint8_t m = *mask++;
3631
3632 if (m)
3633 {
3634 d = *dst;
3635 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3636 mmx_dest = expand565_16_1x128 (d);
3637
3638 *dst = pack_565_32_16 (
3639 pack_1x128_32 (
3640 in_over_1x128 (
3641 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3642 }
3643
3644 w--;
3645 dst++;
3646 }
3647
3648 while (w >= 8)
3649 {
3650 uint32_t m;
3651
3652 xmm_dst = load_128_aligned ((__m128i*) dst);
3653 unpack_565_128_4x128 (xmm_dst,
3654 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3655
3656 memcpy(&m, mask, sizeof(uint32_t));
3657 mask += 4;
3658
3659 if (m)
3660 {
3661 xmm_mask = unpack_32_1x128 (m);
3662 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3663
3664 /* Unpacking */
3665 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3666
3667 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3668 &xmm_mask_lo, &xmm_mask_hi);
3669
3670 in_over_2x128 (&xmm_src, &xmm_src,
3671 &xmm_alpha, &xmm_alpha,
3672 &xmm_mask_lo, &xmm_mask_hi,
3673 &xmm_dst0, &xmm_dst1);
3674 }
3675
3676 memcpy(&m, mask, sizeof(uint32_t));
3677 mask += 4;
3678
3679 if (m)
3680 {
3681 xmm_mask = unpack_32_1x128 (m);
3682 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3683
3684 /* Unpacking */
3685 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3686
3687 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3688 &xmm_mask_lo, &xmm_mask_hi);
3689 in_over_2x128 (&xmm_src, &xmm_src,
3690 &xmm_alpha, &xmm_alpha,
3691 &xmm_mask_lo, &xmm_mask_hi,
3692 &xmm_dst2, &xmm_dst3);
3693 }
3694
3695 save_128_aligned (
3696 (__m128i*)dst, pack_565_4x128_128 (
3697 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3698
3699 w -= 8;
3700 dst += 8;
3701 }
3702
3703 while (w)
3704 {
3705 uint8_t m = *mask++;
3706
3707 if (m)
3708 {
3709 d = *dst;
3710 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3711 mmx_dest = expand565_16_1x128 (d);
3712
3713 *dst = pack_565_32_16 (
3714 pack_1x128_32 (
3715 in_over_1x128 (
3716 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3717 }
3718
3719 w--;
3720 dst++;
3721 }
3722 }
3723
3724 }
3725
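/* Pixbuf sources are treated as non-premultiplied pixels with
 * reversed channel order, hence over_rev_non_pre_*: it premultiplies
 * and swaps the color channels before the usual OVER.  In the
 * 8-pixel loop below, fully opaque groups only need the channel
 * inversion and fully transparent groups are skipped entirely. */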
3726 static void
3727 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3728 pixman_composite_info_t *info)
3729 {
3730 PIXMAN_COMPOSITE_ARGS (info);
3731 uint16_t *dst_line, *dst, d;
3732 uint32_t *src_line, *src, s;
3733 int dst_stride, src_stride;
3734 int32_t w;
3735 uint32_t opaque, zero;
3736
3737 __m128i ms;
3738 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3739 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3740
3741 PIXMAN_IMAGE_GET_LINE (
3742 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3743 PIXMAN_IMAGE_GET_LINE (
3744 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3745
3746 while (height--)
3747 {
3748 dst = dst_line;
3749 dst_line += dst_stride;
3750 src = src_line;
3751 src_line += src_stride;
3752 w = width;
3753
3754 while (w && (uintptr_t)dst & 15)
3755 {
3756 s = *src++;
3757 d = *dst;
3758
3759 ms = unpack_32_1x128 (s);
3760
3761 *dst++ = pack_565_32_16 (
3762 pack_1x128_32 (
3763 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3764 w--;
3765 }
3766
3767 while (w >= 8)
3768 {
3769 /* First round */
3770 xmm_src = load_128_unaligned ((__m128i*)src);
3771 xmm_dst = load_128_aligned ((__m128i*)dst);
3772
3773 opaque = is_opaque (xmm_src);
3774 zero = is_zero (xmm_src);
3775
3776 unpack_565_128_4x128 (xmm_dst,
3777 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3778 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3779
3780 /* preload next round */
3781 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3782
3783 if (opaque)
3784 {
3785 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3786 &xmm_dst0, &xmm_dst1);
3787 }
3788 else if (!zero)
3789 {
3790 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3791 &xmm_dst0, &xmm_dst1);
3792 }
3793
3794 /* Second round */
3795 opaque = is_opaque (xmm_src);
3796 zero = is_zero (xmm_src);
3797
3798 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3799
3800 if (opaque)
3801 {
3802 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3803 &xmm_dst2, &xmm_dst3);
3804 }
3805 else if (!zero)
3806 {
3807 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3808 &xmm_dst2, &xmm_dst3);
3809 }
3810
3811 save_128_aligned (
3812 (__m128i*)dst, pack_565_4x128_128 (
3813 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3814
3815 w -= 8;
3816 src += 8;
3817 dst += 8;
3818 }
3819
3820 while (w)
3821 {
3822 s = *src++;
3823 d = *dst;
3824
3825 ms = unpack_32_1x128 (s);
3826
3827 *dst++ = pack_565_32_16 (
3828 pack_1x128_32 (
3829 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3830 w--;
3831 }
3832 }
3833
3834 }
3835
3836 static void
3837 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3838 pixman_composite_info_t *info)
3839 {
3840 PIXMAN_COMPOSITE_ARGS (info);
3841 uint32_t *dst_line, *dst, d;
3842 uint32_t *src_line, *src, s;
3843 int dst_stride, src_stride;
3844 int32_t w;
3845 uint32_t opaque, zero;
3846
3847 __m128i xmm_src_lo, xmm_src_hi;
3848 __m128i xmm_dst_lo, xmm_dst_hi;
3849
3850 PIXMAN_IMAGE_GET_LINE (
3851 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3852 PIXMAN_IMAGE_GET_LINE (
3853 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3854
3855 while (height--)
3856 {
3857 dst = dst_line;
3858 dst_line += dst_stride;
3859 src = src_line;
3860 src_line += src_stride;
3861 w = width;
3862
3863 while (w && (uintptr_t)dst & 15)
3864 {
3865 s = *src++;
3866 d = *dst;
3867
3868 *dst++ = pack_1x128_32 (
3869 over_rev_non_pre_1x128 (
3870 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3871
3872 w--;
3873 }
3874
3875 while (w >= 4)
3876 {
3877 xmm_src_hi = load_128_unaligned ((__m128i*)src);
3878
3879 opaque = is_opaque (xmm_src_hi);
3880 zero = is_zero (xmm_src_hi);
3881
3882 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3883
3884 if (opaque)
3885 {
3886 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3887 &xmm_dst_lo, &xmm_dst_hi);
3888
3889 save_128_aligned (
3890 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3891 }
3892 else if (!zero)
3893 {
3894 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3895
3896 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3897
3898 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3899 &xmm_dst_lo, &xmm_dst_hi);
3900
3901 save_128_aligned (
3902 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3903 }
3904
3905 w -= 4;
3906 dst += 4;
3907 src += 4;
3908 }
3909
3910 while (w)
3911 {
3912 s = *src++;
3913 d = *dst;
3914
3915 *dst++ = pack_1x128_32 (
3916 over_rev_non_pre_1x128 (
3917 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3918
3919 w--;
3920 }
3921 }
3922
3923 }
3924
3925 static void
3926 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3927 pixman_composite_info_t *info)
3928 {
3929 PIXMAN_COMPOSITE_ARGS (info);
3930 uint32_t src;
3931 uint16_t *dst_line, *dst, d;
3932 uint32_t *mask_line, *mask, m;
3933 int dst_stride, mask_stride;
3934 int w;
3935 uint32_t pack_cmp;
3936
3937 __m128i xmm_src, xmm_alpha;
3938 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3939 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3940
3941 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3942
3943 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3944
3945 if (src == 0)
3946 return;
3947
3948 PIXMAN_IMAGE_GET_LINE (
3949 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3950 PIXMAN_IMAGE_GET_LINE (
3951 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3952
3953 xmm_src = expand_pixel_32_1x128 (src);
3954 xmm_alpha = expand_alpha_1x128 (xmm_src);
3955 mmx_src = xmm_src;
3956 mmx_alpha = xmm_alpha;
3957
3958 while (height--)
3959 {
3960 w = width;
3961 mask = mask_line;
3962 dst = dst_line;
3963 mask_line += mask_stride;
3964 dst_line += dst_stride;
3965
3966 while (w && ((uintptr_t)dst & 15))
3967 {
3968 m = *(uint32_t *) mask;
3969
3970 if (m)
3971 {
3972 d = *dst;
3973 mmx_mask = unpack_32_1x128 (m);
3974 mmx_dest = expand565_16_1x128 (d);
3975
3976 *dst = pack_565_32_16 (
3977 pack_1x128_32 (
3978 in_over_1x128 (
3979 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3980 }
3981
3982 w--;
3983 dst++;
3984 mask++;
3985 }
3986
3987 while (w >= 8)
3988 {
3989 /* First round */
3990 xmm_mask = load_128_unaligned ((__m128i*)mask);
3991 xmm_dst = load_128_aligned ((__m128i*)dst);
3992
3993 pack_cmp = _mm_movemask_epi8 (
3994 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3995
3996 unpack_565_128_4x128 (xmm_dst,
3997 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3998 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3999
4000 /* preload next round */
4001 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4002
4003
4004 if (pack_cmp != 0xffff)
4005 {
4006 in_over_2x128 (&xmm_src, &xmm_src,
4007 &xmm_alpha, &xmm_alpha,
4008 &xmm_mask_lo, &xmm_mask_hi,
4009 &xmm_dst0, &xmm_dst1);
4010 }
4011
4012 /* Second round */
4013 pack_cmp = _mm_movemask_epi8 (
4014 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4015
4016 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4017
4018 if (pack_cmp != 0xffff)
4019 {
4020 in_over_2x128 (&xmm_src, &xmm_src,
4021 &xmm_alpha, &xmm_alpha,
4022 &xmm_mask_lo, &xmm_mask_hi,
4023 &xmm_dst2, &xmm_dst3);
4024 }
4025
4026 save_128_aligned (
4027 (__m128i*)dst, pack_565_4x128_128 (
4028 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4029
4030 w -= 8;
4031 dst += 8;
4032 mask += 8;
4033 }
4034
4035 while (w)
4036 {
4037 m = *(uint32_t *) mask;
4038
4039 if (m)
4040 {
4041 d = *dst;
4042 mmx_mask = unpack_32_1x128 (m);
4043 mmx_dest = expand565_16_1x128 (d);
4044
4045 *dst = pack_565_32_16 (
4046 pack_1x128_32 (
4047 in_over_1x128 (
4048 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4049 }
4050
4051 w--;
4052 dst++;
4053 mask++;
4054 }
4055 }
4056
4057 }
4058
4059 static void
4060 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4061 pixman_composite_info_t *info)
4062 {
4063 PIXMAN_COMPOSITE_ARGS (info);
4064 uint8_t *dst_line, *dst;
4065 uint8_t *mask_line, *mask;
4066 int dst_stride, mask_stride;
4067 uint32_t d;
4068 uint32_t src;
4069 int32_t w;
4070
4071 __m128i xmm_alpha;
4072 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4073 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4074
4075 PIXMAN_IMAGE_GET_LINE (
4076 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4077 PIXMAN_IMAGE_GET_LINE (
4078 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4079
4080 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4081
4082 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4083
4084 while (height--)
4085 {
4086 dst = dst_line;
4087 dst_line += dst_stride;
4088 mask = mask_line;
4089 mask_line += mask_stride;
4090 w = width;
4091
4092 while (w && ((uintptr_t)dst & 15))
4093 {
4094 uint8_t m = *mask++;
4095 d = (uint32_t) *dst;
4096
4097 *dst++ = (uint8_t) pack_1x128_32 (
4098 pix_multiply_1x128 (
4099 pix_multiply_1x128 (xmm_alpha,
4100 unpack_32_1x128 (m)),
4101 unpack_32_1x128 (d)));
4102 w--;
4103 }
4104
4105 while (w >= 16)
4106 {
4107 xmm_mask = load_128_unaligned ((__m128i*)mask);
4108 xmm_dst = load_128_aligned ((__m128i*)dst);
4109
4110 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4111 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4112
4113 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4114 &xmm_mask_lo, &xmm_mask_hi,
4115 &xmm_mask_lo, &xmm_mask_hi);
4116
4117 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4118 &xmm_dst_lo, &xmm_dst_hi,
4119 &xmm_dst_lo, &xmm_dst_hi);
4120
4121 save_128_aligned (
4122 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4123
4124 mask += 16;
4125 dst += 16;
4126 w -= 16;
4127 }
4128
4129 while (w)
4130 {
4131 uint8_t m = *mask++;
4132 d = (uint32_t) *dst;
4133
4134 *dst++ = (uint8_t) pack_1x128_32 (
4135 pix_multiply_1x128 (
4136 pix_multiply_1x128 (
4137 xmm_alpha, unpack_32_1x128 (m)),
4138 unpack_32_1x128 (d)));
4139 w--;
4140 }
4141 }
4142
4143 }
4144
4145 static void
4146 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4147 pixman_composite_info_t *info)
4148 {
4149 PIXMAN_COMPOSITE_ARGS (info);
4150 uint8_t *dst_line, *dst;
4151 int dst_stride;
4152 uint32_t d;
4153 uint32_t src;
4154 int32_t w;
4155
4156 __m128i xmm_alpha;
4157 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4158
4159 PIXMAN_IMAGE_GET_LINE (
4160 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4161
4162 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4163
4164 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4165
4166 src = src >> 24;
4167
4168 if (src == 0xff)
4169 return;
4170
4171 if (src == 0x00)
4172 {
4173 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4174 8, dest_x, dest_y, width, height, src);
4175
4176 return;
4177 }
4178
4179 while (height--)
4180 {
4181 dst = dst_line;
4182 dst_line += dst_stride;
4183 w = width;
4184
4185 while (w && ((uintptr_t)dst & 15))
4186 {
4187 d = (uint32_t) *dst;
4188
4189 *dst++ = (uint8_t) pack_1x128_32 (
4190 pix_multiply_1x128 (
4191 xmm_alpha,
4192 unpack_32_1x128 (d)));
4193 w--;
4194 }
4195
4196 while (w >= 16)
4197 {
4198 xmm_dst = load_128_aligned ((__m128i*)dst);
4199
4200 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4201
4202 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4203 &xmm_dst_lo, &xmm_dst_hi,
4204 &xmm_dst_lo, &xmm_dst_hi);
4205
4206 save_128_aligned (
4207 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4208
4209 dst += 16;
4210 w -= 16;
4211 }
4212
4213 while (w)
4214 {
4215 d = (uint32_t) *dst;
4216
4217 *dst++ = (uint8_t) pack_1x128_32 (
4218 pix_multiply_1x128 (
4219 xmm_alpha,
4220 unpack_32_1x128 (d)));
4221 w--;
4222 }
4223 }
4224
4225 }
4226
4227 static void
4228 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4229 pixman_composite_info_t *info)
4230 {
4231 PIXMAN_COMPOSITE_ARGS (info);
4232 uint8_t *dst_line, *dst;
4233 uint8_t *src_line, *src;
4234 int src_stride, dst_stride;
4235 int32_t w;
4236 uint32_t s, d;
4237
4238 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4239 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4240
4241 PIXMAN_IMAGE_GET_LINE (
4242 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4243 PIXMAN_IMAGE_GET_LINE (
4244 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4245
4246 while (height--)
4247 {
4248 dst = dst_line;
4249 dst_line += dst_stride;
4250 src = src_line;
4251 src_line += src_stride;
4252 w = width;
4253
4254 while (w && ((uintptr_t)dst & 15))
4255 {
4256 s = (uint32_t) *src++;
4257 d = (uint32_t) *dst;
4258
4259 *dst++ = (uint8_t) pack_1x128_32 (
4260 pix_multiply_1x128 (
4261 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4262 w--;
4263 }
4264
4265 while (w >= 16)
4266 {
4267 xmm_src = load_128_unaligned ((__m128i*)src);
4268 xmm_dst = load_128_aligned ((__m128i*)dst);
4269
4270 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4271 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4272
4273 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4274 &xmm_dst_lo, &xmm_dst_hi,
4275 &xmm_dst_lo, &xmm_dst_hi);
4276
4277 save_128_aligned (
4278 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4279
4280 src += 16;
4281 dst += 16;
4282 w -= 16;
4283 }
4284
4285 while (w)
4286 {
4287 s = (uint32_t) *src++;
4288 d = (uint32_t) *dst;
4289
4290 *dst++ = (uint8_t) pack_1x128_32 (
4291 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4292 w--;
4293 }
4294 }
4295
4296 }
4297
4298 static void
4299 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4300 pixman_composite_info_t *info)
4301 {
4302 PIXMAN_COMPOSITE_ARGS (info);
4303 uint8_t *dst_line, *dst;
4304 uint8_t *mask_line, *mask;
4305 int dst_stride, mask_stride;
4306 int32_t w;
4307 uint32_t src;
4308 uint32_t d;
4309
4310 __m128i xmm_alpha;
4311 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4312 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4313
4314 PIXMAN_IMAGE_GET_LINE (
4315 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4316 PIXMAN_IMAGE_GET_LINE (
4317 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4318
4319 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4320
4321 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4322
4323 while (height--)
4324 {
4325 dst = dst_line;
4326 dst_line += dst_stride;
4327 mask = mask_line;
4328 mask_line += mask_stride;
4329 w = width;
4330
4331 while (w && ((uintptr_t)dst & 15))
4332 {
4333 uint8_t m = *mask++;
4334 d = (uint32_t) *dst;
4335
4336 *dst++ = (uint8_t) pack_1x128_32 (
4337 _mm_adds_epu16 (
4338 pix_multiply_1x128 (
4339 xmm_alpha, unpack_32_1x128 (m)),
4340 unpack_32_1x128 (d)));
4341 w--;
4342 }
4343
4344 while (w >= 16)
4345 {
4346 xmm_mask = load_128_unaligned ((__m128i*)mask);
4347 xmm_dst = load_128_aligned ((__m128i*)dst);
4348
4349 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4350 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4351
4352 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4353 &xmm_mask_lo, &xmm_mask_hi,
4354 &xmm_mask_lo, &xmm_mask_hi);
4355
4356 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4357 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4358
4359 save_128_aligned (
4360 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4361
4362 mask += 16;
4363 dst += 16;
4364 w -= 16;
4365 }
4366
4367 while (w)
4368 {
4369 uint8_t m = *mask++;
4370 d = (uint32_t) *dst;
4371
4372 *dst++ = (uint8_t) pack_1x128_32 (
4373 _mm_adds_epu16 (
4374 pix_multiply_1x128 (
4375 xmm_alpha, unpack_32_1x128 (m)),
4376 unpack_32_1x128 (d)));
4377
4378 w--;
4379 }
4380 }
4381
4382 }
4383
4384 static void
4385 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4386 pixman_composite_info_t *info)
4387 {
4388 PIXMAN_COMPOSITE_ARGS (info);
4389 uint8_t *dst_line, *dst;
4390 int dst_stride;
4391 int32_t w;
4392 uint32_t src;
4393
4394 __m128i xmm_src;
4395
4396 PIXMAN_IMAGE_GET_LINE (
4397 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4398
4399 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4400
4401 src >>= 24;
4402
4403 if (src == 0x00)
4404 return;
4405
4406 if (src == 0xff)
4407 {
4408 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4409 8, dest_x, dest_y, width, height, 0xff);
4410
4411 return;
4412 }
4413
4414 src = (src << 24) | (src << 16) | (src << 8) | src;
4415 xmm_src = _mm_set_epi32 (src, src, src, src);
4416
4417 while (height--)
4418 {
4419 dst = dst_line;
4420 dst_line += dst_stride;
4421 w = width;
4422
4423 while (w && ((uintptr_t)dst & 15))
4424 {
4425 *dst = (uint8_t)_mm_cvtsi128_si32 (
4426 _mm_adds_epu8 (
4427 xmm_src,
4428 _mm_cvtsi32_si128 (*dst)));
4429
4430 w--;
4431 dst++;
4432 }
4433
4434 while (w >= 16)
4435 {
4436 save_128_aligned (
4437 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4438
4439 dst += 16;
4440 w -= 16;
4441 }
4442
4443 while (w)
4444 {
4445 *dst = (uint8_t)_mm_cvtsi128_si32 (
4446 _mm_adds_epu8 (
4447 xmm_src,
4448 _mm_cvtsi32_si128 (*dst)));
4449
4450 w--;
4451 dst++;
4452 }
4453 }
4454
4455 }
4456
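/* ADD operator for a8 on a8.  The unaligned head and tail use a scalar
 * saturating add: t holds the 16-bit sum, so (t >> 8) is 1 exactly when
 * the sum overflows a byte, and (0 - (t >> 8)) truncates to 0xff,
 * clamping the result.  The aligned middle is handed to
 * sse2_combine_add_u as w >> 2 dwords; dst and src then advance by
 * w & 0xfffc, i.e. w rounded down to a multiple of four (image widths
 * stay well below 2^16).
 */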
4457 static void
4458 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4459 pixman_composite_info_t *info)
4460 {
4461 PIXMAN_COMPOSITE_ARGS (info);
4462 uint8_t *dst_line, *dst;
4463 uint8_t *src_line, *src;
4464 int dst_stride, src_stride;
4465 int32_t w;
4466 uint16_t t;
4467
4468 PIXMAN_IMAGE_GET_LINE (
4469 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4470 PIXMAN_IMAGE_GET_LINE (
4471 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4472
4473 while (height--)
4474 {
4475 dst = dst_line;
4476 src = src_line;
4477
4478 dst_line += dst_stride;
4479 src_line += src_stride;
4480 w = width;
4481
4482 /* Small head */
4483 while (w && (uintptr_t)dst & 3)
4484 {
4485 t = (*dst) + (*src++);
4486 *dst++ = t | (0 - (t >> 8));
4487 w--;
4488 }
4489
4490 sse2_combine_add_u (imp, op,
4491 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4492
4493 /* Small tail */
4494 dst += w & 0xfffc;
4495 src += w & 0xfffc;
4496
4497 w &= 3;
4498
4499 while (w)
4500 {
4501 t = (*dst) + (*src++);
4502 *dst++ = t | (0 - (t >> 8));
4503 w--;
4504 }
4505 }
4506
4507 }
4508
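/* ADD operator for 8888 on 8888: simply delegates each scanline to
 * sse2_combine_add_u.
 */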
4509 static void
4510 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4511 pixman_composite_info_t *info)
4512 {
4513 PIXMAN_COMPOSITE_ARGS (info);
4514 uint32_t *dst_line, *dst;
4515 uint32_t *src_line, *src;
4516 int dst_stride, src_stride;
4517
4518 PIXMAN_IMAGE_GET_LINE (
4519 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4520 PIXMAN_IMAGE_GET_LINE (
4521 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4522
4523 while (height--)
4524 {
4525 dst = dst_line;
4526 dst_line += dst_stride;
4527 src = src_line;
4528 src_line += src_stride;
4529
4530 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4531 }
4532 }
4533
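/* ADD operator with a solid source on an 8888 destination.  A zero
 * source is a no-op and an all-ones source saturates every pixel, so
 * both become early returns; otherwise the solid color is replicated
 * into an XMM register and added with unsigned byte saturation.
 */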
4534 static void
4535 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4536 pixman_composite_info_t *info)
4537 {
4538 PIXMAN_COMPOSITE_ARGS (info);
4539 uint32_t *dst_line, *dst, src;
4540 int dst_stride;
4541
4542 __m128i xmm_src;
4543
4544 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4545
4546 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4547 if (src == 0)
4548 return;
4549
4550 if (src == ~0)
4551 {
4552 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4553 dest_x, dest_y, width, height, ~0);
4554
4555 return;
4556 }
4557
4558 xmm_src = _mm_set_epi32 (src, src, src, src);
4559 while (height--)
4560 {
4561 int w = width;
4562 uint32_t d;
4563
4564 dst = dst_line;
4565 dst_line += dst_stride;
4566
4567 while (w && (uintptr_t)dst & 15)
4568 {
4569 d = *dst;
4570 *dst++ =
4571 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4572 w--;
4573 }
4574
4575 while (w >= 4)
4576 {
4577 save_128_aligned
4578 ((__m128i*)dst,
4579 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4580
4581 dst += 4;
4582 w -= 4;
4583 }
4584
4585 while (w--)
4586 {
4587 d = *dst;
4588 *dst++ =
4589 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4590 _mm_cvtsi32_si128 (d)));
4591 }
4592 }
4593 }
4594
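/* ADD operator with a solid source and an a8 mask on an 8888
 * destination: dest += src * m with unsigned saturation, where the
 * mask byte m is expanded across all four channels.  Zero mask bytes
 * are skipped in all three loops.
 */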
4595 static void
4596 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4597 pixman_composite_info_t *info)
4598 {
4599 PIXMAN_COMPOSITE_ARGS (info);
4600 uint32_t *dst_line, *dst;
4601 uint8_t *mask_line, *mask;
4602 int dst_stride, mask_stride;
4603 int32_t w;
4604 uint32_t src;
4605
4606 __m128i xmm_src;
4607
4608 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4609 if (src == 0)
4610 return;
4611 xmm_src = expand_pixel_32_1x128 (src);
4612
4613 PIXMAN_IMAGE_GET_LINE (
4614 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4615 PIXMAN_IMAGE_GET_LINE (
4616 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4617
4618 while (height--)
4619 {
4620 dst = dst_line;
4621 dst_line += dst_stride;
4622 mask = mask_line;
4623 mask_line += mask_stride;
4624 w = width;
4625
4626 while (w && ((uintptr_t)dst & 15))
4627 {
4628 uint8_t m = *mask++;
4629 if (m)
4630 {
4631 *dst = pack_1x128_32
4632 (_mm_adds_epu16
4633 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4634 unpack_32_1x128 (*dst)));
4635 }
4636 dst++;
4637 w--;
4638 }
4639
4640 while (w >= 4)
4641 {
4642 uint32_t m;
4643 memcpy(&m, mask, sizeof(uint32_t));
4644
4645 if (m)
4646 {
4647 __m128i xmm_mask_lo, xmm_mask_hi;
4648 __m128i xmm_dst_lo, xmm_dst_hi;
4649
4650 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4651 __m128i xmm_mask =
4652 _mm_unpacklo_epi8 (unpack_32_1x128(m),
4653 _mm_setzero_si128 ());
4654
4655 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4656 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4657
4658 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4659 &xmm_mask_lo, &xmm_mask_hi);
4660
4661 pix_multiply_2x128 (&xmm_src, &xmm_src,
4662 &xmm_mask_lo, &xmm_mask_hi,
4663 &xmm_mask_lo, &xmm_mask_hi);
4664
4665 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4666 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4667
4668 save_128_aligned (
4669 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4670 }
4671
4672 w -= 4;
4673 dst += 4;
4674 mask += 4;
4675 }
4676
4677 while (w)
4678 {
4679 uint8_t m = *mask++;
4680 if (m)
4681 {
4682 *dst = pack_1x128_32
4683 (_mm_adds_epu16
4684 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4685 unpack_32_1x128 (*dst)));
4686 }
4687 dst++;
4688 w--;
4689 }
4690 }
4691 }
4692
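/* Generic same-format blit for 16 and 32 bpp images.  Row strides,
 * given in uint32_t units, are converted to bytes; each row is then
 * copied with unaligned loads and aligned stores, unrolled to 64 bytes
 * per iteration in the bulk of the row, with memmove handling the
 * sub-16-byte head and tail pieces.
 */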
4693 static pixman_bool_t
4694 sse2_blt (pixman_implementation_t *imp,
4695 uint32_t * src_bits,
4696 uint32_t * dst_bits,
4697 int src_stride,
4698 int dst_stride,
4699 int src_bpp,
4700 int dst_bpp,
4701 int src_x,
4702 int src_y,
4703 int dest_x,
4704 int dest_y,
4705 int width,
4706 int height)
4707 {
4708 uint8_t * src_bytes;
4709 uint8_t * dst_bytes;
4710 int byte_width;
4711
4712 if (src_bpp != dst_bpp)
4713 return FALSE;
4714
4715 if (src_bpp == 16)
4716 {
4717 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4718 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4719 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4720 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4721 byte_width = 2 * width;
4722 src_stride *= 2;
4723 dst_stride *= 2;
4724 }
4725 else if (src_bpp == 32)
4726 {
4727 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4728 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4729 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4730 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4731 byte_width = 4 * width;
4732 src_stride *= 4;
4733 dst_stride *= 4;
4734 }
4735 else
4736 {
4737 return FALSE;
4738 }
4739
4740 while (height--)
4741 {
4742 int w;
4743 uint8_t *s = src_bytes;
4744 uint8_t *d = dst_bytes;
4745 src_bytes += src_stride;
4746 dst_bytes += dst_stride;
4747 w = byte_width;
4748
4749 while (w >= 2 && ((uintptr_t)d & 3))
4750 {
4751 memmove(d, s, 2);
4752 w -= 2;
4753 s += 2;
4754 d += 2;
4755 }
4756
4757 while (w >= 4 && ((uintptr_t)d & 15))
4758 {
4759 memmove(d, s, 4);
4760
4761 w -= 4;
4762 s += 4;
4763 d += 4;
4764 }
4765
4766 while (w >= 64)
4767 {
4768 __m128i xmm0, xmm1, xmm2, xmm3;
4769
4770 xmm0 = load_128_unaligned ((__m128i*)(s));
4771 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4772 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4773 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4774
4775 save_128_aligned ((__m128i*)(d), xmm0);
4776 save_128_aligned ((__m128i*)(d + 16), xmm1);
4777 save_128_aligned ((__m128i*)(d + 32), xmm2);
4778 save_128_aligned ((__m128i*)(d + 48), xmm3);
4779
4780 s += 64;
4781 d += 64;
4782 w -= 64;
4783 }
4784
4785 while (w >= 16)
4786 {
4787 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4788
4789 w -= 16;
4790 d += 16;
4791 s += 16;
4792 }
4793
4794 while (w >= 4)
4795 {
4796 memmove(d, s, 4);
4797
4798 w -= 4;
4799 s += 4;
4800 d += 4;
4801 }
4802
4803 if (w >= 2)
4804 {
4805 memmove(d, s, 2);
4806 w -= 2;
4807 s += 2;
4808 d += 2;
4809 }
4810 }
4811
4812 return TRUE;
4813 }
4814
4815 static void
4816 sse2_composite_copy_area (pixman_implementation_t *imp,
4817 pixman_composite_info_t *info)
4818 {
4819 PIXMAN_COMPOSITE_ARGS (info);
4820 sse2_blt (imp, src_image->bits.bits,
4821 dest_image->bits.bits,
4822 src_image->bits.rowstride,
4823 dest_image->bits.rowstride,
4824 PIXMAN_FORMAT_BPP (src_image->bits.format),
4825 PIXMAN_FORMAT_BPP (dest_image->bits.format),
4826 src_x, src_y, dest_x, dest_y, width, height);
4827 }
4828
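/* OVER with an x888 source and an a8 mask: the source is forced opaque
 * by OR-ing in 0xff000000, so the mask alone controls blending.  A
 * 0xff mask therefore degenerates into a straight copy of the source
 * pixel.
 */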
4829 static void
4830 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4831 pixman_composite_info_t *info)
4832 {
4833 PIXMAN_COMPOSITE_ARGS (info);
4834 uint32_t *src, *src_line, s;
4835 uint32_t *dst, *dst_line, d;
4836 uint8_t *mask, *mask_line;
4837 int src_stride, mask_stride, dst_stride;
4838 int32_t w;
4839 __m128i ms;
4840
4841 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4842 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4843 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4844
4845 PIXMAN_IMAGE_GET_LINE (
4846 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4847 PIXMAN_IMAGE_GET_LINE (
4848 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4849 PIXMAN_IMAGE_GET_LINE (
4850 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4851
4852 while (height--)
4853 {
4854 src = src_line;
4855 src_line += src_stride;
4856 dst = dst_line;
4857 dst_line += dst_stride;
4858 mask = mask_line;
4859 mask_line += mask_stride;
4860
4861 w = width;
4862
4863 while (w && (uintptr_t)dst & 15)
4864 {
4865 uint8_t m = *mask++;
4866 s = 0xff000000 | *src++;
4867 d = *dst;
4868 ms = unpack_32_1x128 (s);
4869
4870 if (m != 0xff)
4871 {
4872 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4873 __m128i md = unpack_32_1x128 (d);
4874
4875 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4876 }
4877
4878 *dst++ = pack_1x128_32 (ms);
4879 w--;
4880 }
4881
4882 while (w >= 4)
4883 {
4884 uint32_t m;
4885 memcpy(&m, mask, sizeof(uint32_t));
4886 xmm_src = _mm_or_si128 (
4887 load_128_unaligned ((__m128i*)src), mask_ff000000);
4888
4889 if (m == 0xffffffff)
4890 {
4891 save_128_aligned ((__m128i*)dst, xmm_src);
4892 }
4893 else
4894 {
4895 xmm_dst = load_128_aligned ((__m128i*)dst);
4896
4897 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4898
4899 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4900 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4901 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4902
4903 expand_alpha_rev_2x128 (
4904 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4905
4906 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4907 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4908 &xmm_dst_lo, &xmm_dst_hi);
4909
4910 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4911 }
4912
4913 src += 4;
4914 dst += 4;
4915 mask += 4;
4916 w -= 4;
4917 }
4918
4919 while (w)
4920 {
4921 uint8_t m = *mask++;
4922
4923 if (m)
4924 {
4925 s = 0xff000000 | *src;
4926
4927 if (m == 0xff)
4928 {
4929 *dst = s;
4930 }
4931 else
4932 {
4933 __m128i ma, md, ms;
4934
4935 d = *dst;
4936
4937 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4938 md = unpack_32_1x128 (d);
4939 ms = unpack_32_1x128 (s);
4940
4941 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4942 }
4943
4944 }
4945
4946 src++;
4947 dst++;
4948 w--;
4949 }
4950 }
4951
4952 }
4953
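/* OVER with an a8r8g8b8 source and an a8 mask.  Pixels are copied
 * directly when both the source alpha and the mask are 0xff; otherwise
 * the full in_over computation runs, one pixel at a time at the edges
 * and four at a time in the aligned middle loop.
 */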
4954 static void
4955 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4956 pixman_composite_info_t *info)
4957 {
4958 PIXMAN_COMPOSITE_ARGS (info);
4959 uint32_t *src, *src_line, s;
4960 uint32_t *dst, *dst_line, d;
4961 uint8_t *mask, *mask_line;
4962 int src_stride, mask_stride, dst_stride;
4963 int32_t w;
4964
4965 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4966 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4967 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4968
4969 PIXMAN_IMAGE_GET_LINE (
4970 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4971 PIXMAN_IMAGE_GET_LINE (
4972 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4973 PIXMAN_IMAGE_GET_LINE (
4974 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4975
4976 while (height--)
4977 {
4978 src = src_line;
4979 src_line += src_stride;
4980 dst = dst_line;
4981 dst_line += dst_stride;
4982 mask = mask_line;
4983 mask_line += mask_stride;
4984
4985 w = width;
4986
4987 while (w && (uintptr_t)dst & 15)
4988 {
4989 uint32_t sa;
4990 uint8_t m = *mask++;
4991
4992 s = *src++;
4993 d = *dst;
4994
4995 sa = s >> 24;
4996
4997 if (m)
4998 {
4999 if (sa == 0xff && m == 0xff)
5000 {
5001 *dst = s;
5002 }
5003 else
5004 {
5005 __m128i ms, md, ma, msa;
5006
5007 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5008 ms = unpack_32_1x128 (s);
5009 md = unpack_32_1x128 (d);
5010
5011 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5012
5013 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5014 }
5015 }
5016
5017 dst++;
5018 w--;
5019 }
5020
5021 while (w >= 4)
5022 {
5023 uint32_t m;
5024 memcpy(&m, mask, sizeof(uint32_t));
5025
5026 if (m)
5027 {
5028 xmm_src = load_128_unaligned ((__m128i*)src);
5029
5030 if (m == 0xffffffff && is_opaque (xmm_src))
5031 {
5032 save_128_aligned ((__m128i *)dst, xmm_src);
5033 }
5034 else
5035 {
5036 xmm_dst = load_128_aligned ((__m128i *)dst);
5037
5038 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5039
5040 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5041 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5042 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5043
5044 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5045 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5046
5047 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5048 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5049
5050 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5051 }
5052 }
5053
5054 src += 4;
5055 dst += 4;
5056 mask += 4;
5057 w -= 4;
5058 }
5059
5060 while (w)
5061 {
5062 uint32_t sa;
5063 uint8_t m = *mask++;
5064
5065 s = *src++;
5066 d = *dst;
5067
5068 sa = s >> 24;
5069
5070 if (m)
5071 {
5072 if (sa == 0xff && m == 0xff)
5073 {
5074 *dst = s;
5075 }
5076 else
5077 {
5078 __m128i ms, md, ma, msa;
5079
5080 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5081 ms = unpack_32_1x128 (s);
5082 md = unpack_32_1x128 (d);
5083
5084 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5085
5086 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5087 }
5088 }
5089
5090 dst++;
5091 w--;
5092 }
5093 }
5094
5095 }
5096
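/* OVER_REVERSE with a solid source: the destination is composited over
 * the replicated source color, so the roles of source and destination
 * in over_1x128/over_2x128 are swapped relative to the plain OVER
 * paths.
 */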
5097 static void
5098 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5099 pixman_composite_info_t *info)
5100 {
5101 PIXMAN_COMPOSITE_ARGS (info);
5102 uint32_t src;
5103 uint32_t *dst_line, *dst;
5104 __m128i xmm_src;
5105 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5106 __m128i xmm_dsta_hi, xmm_dsta_lo;
5107 int dst_stride;
5108 int32_t w;
5109
5110 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5111
5112 if (src == 0)
5113 return;
5114
5115 PIXMAN_IMAGE_GET_LINE (
5116 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5117
5118 xmm_src = expand_pixel_32_1x128 (src);
5119
5120 while (height--)
5121 {
5122 dst = dst_line;
5123
5124 dst_line += dst_stride;
5125 w = width;
5126
5127 while (w && (uintptr_t)dst & 15)
5128 {
5129 __m128i vd;
5130
5131 vd = unpack_32_1x128 (*dst);
5132
5133 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5134 xmm_src));
5135 w--;
5136 dst++;
5137 }
5138
5139 while (w >= 4)
5140 {
5141 __m128i tmp_lo, tmp_hi;
5142
5143 xmm_dst = load_128_aligned ((__m128i*)dst);
5144
5145 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5146 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5147
5148 tmp_lo = xmm_src;
5149 tmp_hi = xmm_src;
5150
5151 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5152 &xmm_dsta_lo, &xmm_dsta_hi,
5153 &tmp_lo, &tmp_hi);
5154
5155 save_128_aligned (
5156 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5157
5158 w -= 4;
5159 dst += 4;
5160 }
5161
5162 while (w)
5163 {
5164 __m128i vd;
5165
5166 vd = unpack_32_1x128 (*dst);
5167
5168 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5169 xmm_src));
5170 w--;
5171 dst++;
5172 }
5173
5174 }
5175
5176 }
5177
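/* OVER with an a8r8g8b8 source and an a8r8g8b8 mask; only the alpha
 * channel of the mask is used.  The vector loop skips work when four
 * mask pixels are fully transparent and stores the source directly
 * when both mask and source are fully opaque.
 */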
5178 static void
5179 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5180 pixman_composite_info_t *info)
5181 {
5182 PIXMAN_COMPOSITE_ARGS (info);
5183 uint32_t *src, *src_line, s;
5184 uint32_t *dst, *dst_line, d;
5185 uint32_t *mask, *mask_line;
5186 uint32_t m;
5187 int src_stride, mask_stride, dst_stride;
5188 int32_t w;
5189
5190 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5191 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5192 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5193
5194 PIXMAN_IMAGE_GET_LINE (
5195 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5196 PIXMAN_IMAGE_GET_LINE (
5197 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5198 PIXMAN_IMAGE_GET_LINE (
5199 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5200
5201 while (height--)
5202 {
5203 src = src_line;
5204 src_line += src_stride;
5205 dst = dst_line;
5206 dst_line += dst_stride;
5207 mask = mask_line;
5208 mask_line += mask_stride;
5209
5210 w = width;
5211
5212 while (w && (uintptr_t)dst & 15)
5213 {
5214 uint32_t sa;
5215
5216 s = *src++;
5217 m = (*mask++) >> 24;
5218 d = *dst;
5219
5220 sa = s >> 24;
5221
5222 if (m)
5223 {
5224 if (sa == 0xff && m == 0xff)
5225 {
5226 *dst = s;
5227 }
5228 else
5229 {
5230 __m128i ms, md, ma, msa;
5231
5232 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5233 ms = unpack_32_1x128 (s);
5234 md = unpack_32_1x128 (d);
5235
5236 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5237
5238 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5239 }
5240 }
5241
5242 dst++;
5243 w--;
5244 }
5245
5246 while (w >= 4)
5247 {
5248 xmm_mask = load_128_unaligned ((__m128i*)mask);
5249
5250 if (!is_transparent (xmm_mask))
5251 {
5252 xmm_src = load_128_unaligned ((__m128i*)src);
5253
5254 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5255 {
5256 save_128_aligned ((__m128i *)dst, xmm_src);
5257 }
5258 else
5259 {
5260 xmm_dst = load_128_aligned ((__m128i *)dst);
5261
5262 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5263 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5264 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5265
5266 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5267 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5268
5269 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5270 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5271
5272 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5273 }
5274 }
5275
5276 src += 4;
5277 dst += 4;
5278 mask += 4;
5279 w -= 4;
5280 }
5281
5282 while (w)
5283 {
5284 uint32_t sa;
5285
5286 s = *src++;
5287 m = (*mask++) >> 24;
5288 d = *dst;
5289
5290 sa = s >> 24;
5291
5292 if (m)
5293 {
5294 if (sa == 0xff && m == 0xff)
5295 {
5296 *dst = s;
5297 }
5298 else
5299 {
5300 __m128i ms, md, ma, msa;
5301
5302 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5303 ms = unpack_32_1x128 (s);
5304 md = unpack_32_1x128 (d);
5305
5306 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5307
5308 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5309 }
5310 }
5311
5312 dst++;
5313 w--;
5314 }
5315 }
5316
5317 }
5318
5319 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5320 static force_inline void
5321 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5322 const uint32_t* ps,
5323 int32_t w,
5324 pixman_fixed_t vx,
5325 pixman_fixed_t unit_x,
5326 pixman_fixed_t src_width_fixed,
5327 pixman_bool_t fully_transparent_src)
5328 {
5329 uint32_t s, d;
5330 const uint32_t* pm = NULL;
5331
5332 __m128i xmm_dst_lo, xmm_dst_hi;
5333 __m128i xmm_src_lo, xmm_src_hi;
5334 __m128i xmm_alpha_lo, xmm_alpha_hi;
5335
5336 if (fully_transparent_src)
5337 return;
5338
5339 /* Align dst on a 16-byte boundary */
5340 while (w && ((uintptr_t)pd & 15))
5341 {
5342 d = *pd;
5343 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5344 vx += unit_x;
5345 while (vx >= 0)
5346 vx -= src_width_fixed;
5347
5348 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5349 if (pm)
5350 pm++;
5351 w--;
5352 }
5353
5354 while (w >= 4)
5355 {
5356 __m128i tmp;
5357 uint32_t tmp1, tmp2, tmp3, tmp4;
5358
5359 tmp1 = *(ps + pixman_fixed_to_int (vx));
5360 vx += unit_x;
5361 while (vx >= 0)
5362 vx -= src_width_fixed;
5363 tmp2 = *(ps + pixman_fixed_to_int (vx));
5364 vx += unit_x;
5365 while (vx >= 0)
5366 vx -= src_width_fixed;
5367 tmp3 = *(ps + pixman_fixed_to_int (vx));
5368 vx += unit_x;
5369 while (vx >= 0)
5370 vx -= src_width_fixed;
5371 tmp4 = *(ps + pixman_fixed_to_int (vx));
5372 vx += unit_x;
5373 while (vx >= 0)
5374 vx -= src_width_fixed;
5375
5376 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5377
5378 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5379
5380 if (is_opaque (xmm_src_hi))
5381 {
5382 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5383 }
5384 else if (!is_zero (xmm_src_hi))
5385 {
5386 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5387
5388 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5389 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5390
5391 expand_alpha_2x128 (
5392 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5393
5394 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5395 &xmm_alpha_lo, &xmm_alpha_hi,
5396 &xmm_dst_lo, &xmm_dst_hi);
5397
5398 /* rebuild the 4 pixel data and save */
5399 save_128_aligned ((__m128i*)pd,
5400 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5401 }
5402
5403 w -= 4;
5404 pd += 4;
5405 if (pm)
5406 pm += 4;
5407 }
5408
5409 while (w)
5410 {
5411 d = *pd;
5412 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5413 vx += unit_x;
5414 while (vx >= 0)
5415 vx -= src_width_fixed;
5416
5417 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5418 if (pm)
5419 pm++;
5420
5421 w--;
5422 }
5423 }
5424
5425 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5426 scaled_nearest_scanline_sse2_8888_8888_OVER,
5427 uint32_t, uint32_t, COVER)
5428 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5429 scaled_nearest_scanline_sse2_8888_8888_OVER,
5430 uint32_t, uint32_t, NONE)
5431 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5432 scaled_nearest_scanline_sse2_8888_8888_OVER,
5433 uint32_t, uint32_t, PAD)
5434 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5435 scaled_nearest_scanline_sse2_8888_8888_OVER,
5436 uint32_t, uint32_t, NORMAL)
5437
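/* Nearest-neighbour scaled OVER with a solid mask.  Fully transparent
 * masks return immediately; otherwise the mask alpha is replicated
 * once into xmm_mask, and vx steps through the source in fixed-point
 * increments of unit_x, wrapping by src_width_fixed for repeat
 * handling.
 */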
5438 static force_inline void
5439 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5440 uint32_t * dst,
5441 const uint32_t * src,
5442 int32_t w,
5443 pixman_fixed_t vx,
5444 pixman_fixed_t unit_x,
5445 pixman_fixed_t src_width_fixed,
5446 pixman_bool_t zero_src)
5447 {
5448 __m128i xmm_mask;
5449 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5450 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5451 __m128i xmm_alpha_lo, xmm_alpha_hi;
5452
5453 if (zero_src || (*mask >> 24) == 0)
5454 return;
5455
5456 xmm_mask = create_mask_16_128 (*mask >> 24);
5457
5458 while (w && (uintptr_t)dst & 15)
5459 {
5460 uint32_t s = *(src + pixman_fixed_to_int (vx));
5461 vx += unit_x;
5462 while (vx >= 0)
5463 vx -= src_width_fixed;
5464
5465 if (s)
5466 {
5467 uint32_t d = *dst;
5468
5469 __m128i ms = unpack_32_1x128 (s);
5470 __m128i alpha = expand_alpha_1x128 (ms);
5471 __m128i mask = xmm_mask;
5472 __m128i dest = unpack_32_1x128 (d);
5473
5474 *dst = pack_1x128_32 (
5475 in_over_1x128 (&ms, &alpha, &mask, &dest));
5476 }
5477 dst++;
5478 w--;
5479 }
5480
5481 while (w >= 4)
5482 {
5483 uint32_t tmp1, tmp2, tmp3, tmp4;
5484
5485 tmp1 = *(src + pixman_fixed_to_int (vx));
5486 vx += unit_x;
5487 while (vx >= 0)
5488 vx -= src_width_fixed;
5489 tmp2 = *(src + pixman_fixed_to_int (vx));
5490 vx += unit_x;
5491 while (vx >= 0)
5492 vx -= src_width_fixed;
5493 tmp3 = *(src + pixman_fixed_to_int (vx));
5494 vx += unit_x;
5495 while (vx >= 0)
5496 vx -= src_width_fixed;
5497 tmp4 = *(src + pixman_fixed_to_int (vx));
5498 vx += unit_x;
5499 while (vx >= 0)
5500 vx -= src_width_fixed;
5501
5502 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5503
5504 if (!is_zero (xmm_src))
5505 {
5506 xmm_dst = load_128_aligned ((__m128i*)dst);
5507
5508 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5509 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5510 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5511 &xmm_alpha_lo, &xmm_alpha_hi);
5512
5513 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5514 &xmm_alpha_lo, &xmm_alpha_hi,
5515 &xmm_mask, &xmm_mask,
5516 &xmm_dst_lo, &xmm_dst_hi);
5517
5518 save_128_aligned (
5519 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5520 }
5521
5522 dst += 4;
5523 w -= 4;
5524 }
5525
5526 while (w)
5527 {
5528 uint32_t s = *(src + pixman_fixed_to_int (vx));
5529 vx += unit_x;
5530 while (vx >= 0)
5531 vx -= src_width_fixed;
5532
5533 if (s)
5534 {
5535 uint32_t d = *dst;
5536
5537 __m128i ms = unpack_32_1x128 (s);
5538 __m128i alpha = expand_alpha_1x128 (ms);
5539 __m128i mask = xmm_mask;
5540 __m128i dest = unpack_32_1x128 (d);
5541
5542 *dst = pack_1x128_32 (
5543 in_over_1x128 (&ms, &alpha, &mask, &dest));
5544 }
5545
5546 dst++;
5547 w--;
5548 }
5549
5550 }
5551
5552 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5553 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5554 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5555 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5556 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5557 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5558 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5559 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5560 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5561 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5562 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5563 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5564
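/* Bilinear interpolation notes: xmm_x packs the pairs (vx, -(vx + 1))
 * (i.e. vx and ~vx) into 16-bit lanes.  Shifting right by
 * (16 - BILINEAR_INTERPOLATION_BITS) and adding the (0, 1) constant in
 * xmm_addc in effect yields the two horizontal weights f and
 * (1 << BILINEAR_INTERPOLATION_BITS) - f, which _mm_madd_epi16 then
 * applies to the vertically interpolated pixel pairs.
 */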
5565 #if PSHUFD_IS_FAST
5566
5567 /***********************************************************************************/
5568
5569 # define BILINEAR_DECLARE_VARIABLES \
5570 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5571 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5572 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5573 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5574 unit_x, -unit_x, unit_x, -unit_x); \
5575 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5576 unit_x * 4, -unit_x * 4, \
5577 unit_x * 4, -unit_x * 4, \
5578 unit_x * 4, -unit_x * 4); \
5579 const __m128i xmm_zero = _mm_setzero_si128 (); \
5580 __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
5581 vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
5582 vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
5583 vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
5584 __m128i xmm_wh_state;
5585
5586 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
5587 do { \
5588 int phase = phase_; \
5589 __m128i xmm_wh, xmm_a, xmm_b; \
5590 /* fetch 2x2 pixel block into sse2 registers */ \
5591 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5592 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5593 vx += unit_x; \
5594 /* vertical interpolation */ \
5595 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5596 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5597 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5598 /* calculate horizontal weights */ \
5599 if (phase <= 0) \
5600 { \
5601 xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5602 16 - BILINEAR_INTERPOLATION_BITS)); \
5603 xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
5604 phase = 0; \
5605 } \
5606 xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
5607 phase, phase)); \
5608 /* horizontal interpolation */ \
5609 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
5610 xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
5611 /* shift the result */ \
5612 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5613 } while (0)
5614
5615 #else /************************************************************************/
5616
5617 # define BILINEAR_DECLARE_VARIABLES \
5618 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5619 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5620 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5621 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5622 unit_x, -unit_x, unit_x, -unit_x); \
5623 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5624 unit_x * 4, -unit_x * 4, \
5625 unit_x * 4, -unit_x * 4, \
5626 unit_x * 4, -unit_x * 4); \
5627 const __m128i xmm_zero = _mm_setzero_si128 (); \
5628 __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
5629 vx, -(vx + 1), vx, -(vx + 1))
5630
5631 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
5632 do { \
5633 __m128i xmm_wh, xmm_a, xmm_b; \
5634 /* fetch 2x2 pixel block into sse2 registers */ \
5635 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5636 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5637 (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
5638 vx += unit_x; \
5639 /* vertical interpolation */ \
5640 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5641 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5642 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5643 /* calculate horizontal weights */ \
5644 xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5645 16 - BILINEAR_INTERPOLATION_BITS)); \
5646 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5647 /* horizontal interpolation */ \
5648 xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
5649 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
5650 /* shift the result */ \
5651 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5652 } while (0)
5653
5654 /***********************************************************************************/
5655
5656 #endif
5657
5658 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
5659 do { \
5660 __m128i xmm_pix; \
5661 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
5662 xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
5663 xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
5664 pix = _mm_cvtsi128_si32 (xmm_pix); \
5665 } while(0)
5666
5667 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix) \
5668 do { \
5669 __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
5670 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
5671 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
5672 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
5673 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
5674 xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
5675 xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
5676 pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
5677 } while(0)
5678
5679 #define BILINEAR_SKIP_ONE_PIXEL() \
5680 do { \
5681 vx += unit_x; \
5682 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5683 } while(0)
5684
5685 #define BILINEAR_SKIP_FOUR_PIXELS() \
5686 do { \
5687 vx += unit_x * 4; \
5688 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
5689 } while(0)
5690
5691 /***********************************************************************************/
5692
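/* Bilinear-scaled SRC: each destination pixel is the weighted average
 * of a 2x2 source block.  After aligning dst, four pixels are
 * interpolated per iteration; the (w & 2) and (w & 1) tails reuse the
 * single-pixel path.
 */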
5693 static force_inline void
5694 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5695 const uint32_t * mask,
5696 const uint32_t * src_top,
5697 const uint32_t * src_bottom,
5698 int32_t w,
5699 int wt,
5700 int wb,
5701 pixman_fixed_t vx_,
5702 pixman_fixed_t unit_x_,
5703 pixman_fixed_t max_vx,
5704 pixman_bool_t zero_src)
5705 {
5706 intptr_t vx = vx_;
5707 intptr_t unit_x = unit_x_;
5708 BILINEAR_DECLARE_VARIABLES;
5709 uint32_t pix1, pix2;
5710
5711 while (w && ((uintptr_t)dst & 15))
5712 {
5713 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5714 *dst++ = pix1;
5715 w--;
5716 }
5717
5718 while ((w -= 4) >= 0) {
5719 __m128i xmm_src;
5720 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5721 _mm_store_si128 ((__m128i *)dst, xmm_src);
5722 dst += 4;
5723 }
5724
5725 if (w & 2)
5726 {
5727 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5728 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5729 *dst++ = pix1;
5730 *dst++ = pix2;
5731 }
5732
5733 if (w & 1)
5734 {
5735 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5736 *dst = pix1;
5737 }
5738
5739 }
5740
5741 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5742 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5743 uint32_t, uint32_t, uint32_t,
5744 COVER, FLAG_NONE)
5745 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5746 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5747 uint32_t, uint32_t, uint32_t,
5748 PAD, FLAG_NONE)
5749 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5750 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5751 uint32_t, uint32_t, uint32_t,
5752 NONE, FLAG_NONE)
5753 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5754 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5755 uint32_t, uint32_t, uint32_t,
5756 NORMAL, FLAG_NONE)
5757
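/* Same as the 8888 SRC scanline above, except the source carries no
 * alpha, so 0xff000000 is OR-ed into every result to produce opaque
 * a8r8g8b8 output.
 */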
5758 static force_inline void
5759 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t * dst,
5760 const uint32_t * mask,
5761 const uint32_t * src_top,
5762 const uint32_t * src_bottom,
5763 int32_t w,
5764 int wt,
5765 int wb,
5766 pixman_fixed_t vx_,
5767 pixman_fixed_t unit_x_,
5768 pixman_fixed_t max_vx,
5769 pixman_bool_t zero_src)
5770 {
5771 intptr_t vx = vx_;
5772 intptr_t unit_x = unit_x_;
5773 BILINEAR_DECLARE_VARIABLES;
5774 uint32_t pix1, pix2;
5775
5776 while (w && ((uintptr_t)dst & 15))
5777 {
5778 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5779 *dst++ = pix1 | 0xFF000000;
5780 w--;
5781 }
5782
5783 while ((w -= 4) >= 0) {
5784 __m128i xmm_src;
5785 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5786 _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5787 dst += 4;
5788 }
5789
5790 if (w & 2)
5791 {
5792 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5793 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5794 *dst++ = pix1 | 0xFF000000;
5795 *dst++ = pix2 | 0xFF000000;
5796 }
5797
5798 if (w & 1)
5799 {
5800 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5801 *dst = pix1 | 0xFF000000;
5802 }
5803 }
5804
5805 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5806 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5807 uint32_t, uint32_t, uint32_t,
5808 COVER, FLAG_NONE)
5809 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5810 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5811 uint32_t, uint32_t, uint32_t,
5812 PAD, FLAG_NONE)
5813 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5814 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5815 uint32_t, uint32_t, uint32_t,
5816 NORMAL, FLAG_NONE)
5817
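/* Bilinear-scaled OVER: interpolated source pixels that are zero are
 * skipped, fully opaque blocks of four are stored directly, and
 * everything else goes through the unpacked over_2x128 path.
 */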
5818 static force_inline void
5819 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5820 const uint32_t * mask,
5821 const uint32_t * src_top,
5822 const uint32_t * src_bottom,
5823 int32_t w,
5824 int wt,
5825 int wb,
5826 pixman_fixed_t vx_,
5827 pixman_fixed_t unit_x_,
5828 pixman_fixed_t max_vx,
5829 pixman_bool_t zero_src)
5830 {
5831 intptr_t vx = vx_;
5832 intptr_t unit_x = unit_x_;
5833 BILINEAR_DECLARE_VARIABLES;
5834 uint32_t pix1, pix2;
5835
5836 while (w && ((uintptr_t)dst & 15))
5837 {
5838 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5839
5840 if (pix1)
5841 {
5842 pix2 = *dst;
5843 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5844 }
5845
5846 w--;
5847 dst++;
5848 }
5849
5850 while (w >= 4)
5851 {
5852 __m128i xmm_src;
5853 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5854 __m128i xmm_alpha_hi, xmm_alpha_lo;
5855
5856 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5857
5858 if (!is_zero (xmm_src))
5859 {
5860 if (is_opaque (xmm_src))
5861 {
5862 save_128_aligned ((__m128i *)dst, xmm_src);
5863 }
5864 else
5865 {
5866 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5867
5868 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5869 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5870
5871 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5872 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5873 &xmm_dst_lo, &xmm_dst_hi);
5874
5875 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5876 }
5877 }
5878
5879 w -= 4;
5880 dst += 4;
5881 }
5882
5883 while (w)
5884 {
5885 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5886
5887 if (pix1)
5888 {
5889 pix2 = *dst;
5890 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5891 }
5892
5893 w--;
5894 dst++;
5895 }
5896 }
5897
5898 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5899 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5900 uint32_t, uint32_t, uint32_t,
5901 COVER, FLAG_NONE)
5902 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5903 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5904 uint32_t, uint32_t, uint32_t,
5905 PAD, FLAG_NONE)
5906 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5907 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5908 uint32_t, uint32_t, uint32_t,
5909 NONE, FLAG_NONE)
5910 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5911 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5912 uint32_t, uint32_t, uint32_t,
5913 NORMAL, FLAG_NONE)
5914
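/* Bilinear-scaled OVER with an a8 mask.  Zero mask bytes skip the
 * interpolation entirely via the BILINEAR_SKIP_*_PIXELS macros, which
 * still advance vx and the packed weight state.
 */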
5915 static force_inline void
5916 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5917 const uint8_t * mask,
5918 const uint32_t * src_top,
5919 const uint32_t * src_bottom,
5920 int32_t w,
5921 int wt,
5922 int wb,
5923 pixman_fixed_t vx_,
5924 pixman_fixed_t unit_x_,
5925 pixman_fixed_t max_vx,
5926 pixman_bool_t zero_src)
5927 {
5928 intptr_t vx = vx_;
5929 intptr_t unit_x = unit_x_;
5930 BILINEAR_DECLARE_VARIABLES;
5931 uint32_t pix1, pix2;
5932
5933 while (w && ((uintptr_t)dst & 15))
5934 {
5935 uint32_t sa;
5936 uint8_t m = *mask++;
5937
5938 if (m)
5939 {
5940 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5941 sa = pix1 >> 24;
5942
5943 if (sa == 0xff && m == 0xff)
5944 {
5945 *dst = pix1;
5946 }
5947 else
5948 {
5949 __m128i ms, md, ma, msa;
5950
5951 pix2 = *dst;
5952 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5953 ms = unpack_32_1x128 (pix1);
5954 md = unpack_32_1x128 (pix2);
5955
5956 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5957
5958 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5959 }
5960 }
5961 else
5962 {
5963 BILINEAR_SKIP_ONE_PIXEL ();
5964 }
5965
5966 w--;
5967 dst++;
5968 }
5969
5970 while (w >= 4)
5971 {
5972 uint32_t m;
5973
5974 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5975 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5976 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5977
5978 memcpy(&m, mask, sizeof(uint32_t));
5979
5980 if (m)
5981 {
5982 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5983
5984 if (m == 0xffffffff && is_opaque (xmm_src))
5985 {
5986 save_128_aligned ((__m128i *)dst, xmm_src);
5987 }
5988 else
5989 {
5990 xmm_dst = load_128_aligned ((__m128i *)dst);
5991
5992 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5993
5994 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5995 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5996 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5997
5998 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5999 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
6000
6001 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6002 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6003
6004 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6005 }
6006 }
6007 else
6008 {
6009 BILINEAR_SKIP_FOUR_PIXELS ();
6010 }
6011
6012 w -= 4;
6013 dst += 4;
6014 mask += 4;
6015 }
6016
6017 while (w)
6018 {
6019 uint32_t sa;
6020 uint8_t m = *mask++;
6021
6022 if (m)
6023 {
6024 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6025 sa = pix1 >> 24;
6026
6027 if (sa == 0xff && m == 0xff)
6028 {
6029 *dst = pix1;
6030 }
6031 else
6032 {
6033 __m128i ms, md, ma, msa;
6034
6035 pix2 = *dst;
6036 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6037 ms = unpack_32_1x128 (pix1);
6038 md = unpack_32_1x128 (pix2);
6039
6040 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6041
6042 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6043 }
6044 }
6045 else
6046 {
6047 BILINEAR_SKIP_ONE_PIXEL ();
6048 }
6049
6050 w--;
6051 dst++;
6052 }
6053 }
6054
6055 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6056 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6057 uint32_t, uint8_t, uint32_t,
6058 COVER, FLAG_HAVE_NON_SOLID_MASK)
6059 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6060 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6061 uint32_t, uint8_t, uint32_t,
6062 PAD, FLAG_HAVE_NON_SOLID_MASK)
6063 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6064 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6065 uint32_t, uint8_t, uint32_t,
6066 NONE, FLAG_HAVE_NON_SOLID_MASK)
6067 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6068 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6069 uint32_t, uint8_t, uint32_t,
6070 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
6071
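/* Bilinear-scaled OVER with a solid mask, analogous to the nearest
 * variant above: the mask alpha is expanded once into xmm_mask and
 * reused for every pixel.
 */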
6072 static force_inline void
6073 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
6074 const uint32_t * mask,
6075 const uint32_t * src_top,
6076 const uint32_t * src_bottom,
6077 int32_t w,
6078 int wt,
6079 int wb,
6080 pixman_fixed_t vx_,
6081 pixman_fixed_t unit_x_,
6082 pixman_fixed_t max_vx,
6083 pixman_bool_t zero_src)
6084 {
6085 intptr_t vx = vx_;
6086 intptr_t unit_x = unit_x_;
6087 BILINEAR_DECLARE_VARIABLES;
6088 uint32_t pix1;
6089 __m128i xmm_mask;
6090
6091 if (zero_src || (*mask >> 24) == 0)
6092 return;
6093
6094 xmm_mask = create_mask_16_128 (*mask >> 24);
6095
6096 while (w && ((uintptr_t)dst & 15))
6097 {
6098 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6099 if (pix1)
6100 {
6101 uint32_t d = *dst;
6102
6103 __m128i ms = unpack_32_1x128 (pix1);
6104 __m128i alpha = expand_alpha_1x128 (ms);
6105 __m128i dest = xmm_mask;
6106 __m128i alpha_dst = unpack_32_1x128 (d);
6107
6108 *dst = pack_1x128_32
6109 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6110 }
6111
6112 dst++;
6113 w--;
6114 }
6115
6116 while (w >= 4)
6117 {
6118 __m128i xmm_src;
6119 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6120
6121 if (!is_zero (xmm_src))
6122 {
6123 __m128i xmm_src_lo, xmm_src_hi;
6124 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6125 __m128i xmm_alpha_lo, xmm_alpha_hi;
6126
6127 xmm_dst = load_128_aligned ((__m128i*)dst);
6128
6129 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6130 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6131 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6132 &xmm_alpha_lo, &xmm_alpha_hi);
6133
6134 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6135 &xmm_alpha_lo, &xmm_alpha_hi,
6136 &xmm_mask, &xmm_mask,
6137 &xmm_dst_lo, &xmm_dst_hi);
6138
6139 save_128_aligned
6140 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6141 }
6142
6143 dst += 4;
6144 w -= 4;
6145 }
6146
6147 while (w)
6148 {
6149 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6150 if (pix1)
6151 {
6152 uint32_t d = *dst;
6153
6154 __m128i ms = unpack_32_1x128 (pix1);
6155 __m128i alpha = expand_alpha_1x128 (ms);
6156 __m128i dest = xmm_mask;
6157 __m128i alpha_dst = unpack_32_1x128 (d);
6158
6159 *dst = pack_1x128_32
6160 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6161 }
6162
6163 dst++;
6164 w--;
6165 }
6166 }
6167
6168 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6169 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6170 uint32_t, uint32_t, uint32_t,
6171 COVER, FLAG_HAVE_SOLID_MASK)
6172 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6173 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6174 uint32_t, uint32_t, uint32_t,
6175 PAD, FLAG_HAVE_SOLID_MASK)
6176 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6177 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6178 uint32_t, uint32_t, uint32_t,
6179 NONE, FLAG_HAVE_SOLID_MASK)
6180 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6181 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6182 uint32_t, uint32_t, uint32_t,
6183 NORMAL, FLAG_HAVE_SOLID_MASK)
6184
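/* The table below maps (operator, source, mask, destination) format
 * combinations to the SSE2 fast paths implemented in this file.  It is
 * searched in order, first match wins, so more specific entries should
 * precede more general ones.
 */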
6185 static const pixman_fast_path_t sse2_fast_paths[] =
6186 {
6187 /* PIXMAN_OP_OVER */
6188 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6189 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6190 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6191 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6192 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6193 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6194 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6195 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6196 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6197 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6198 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6199 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6200 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6201 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6202 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6203 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6204 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6205 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6206 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6207 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6208 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6209 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6210 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6211 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6212 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6213 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6214 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6215 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6216 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6217 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6218 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6219 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6220 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6221 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6222 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6223 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6224 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6225 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6226 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6227 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6228 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6229 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6230 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6231 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6232 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6233 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6234 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6235
6236 /* PIXMAN_OP_OVER_REVERSE */
6237 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6238 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6239
6240 /* PIXMAN_OP_ADD */
6241 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6242 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6243 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6244 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6245 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6246 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6247 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6248 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6249 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6250 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),

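    /* The all-zero PIXMAN_OP_NONE entry terminates the table. */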
    { PIXMAN_OP_NONE },
};

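/* Iterator fetcher for x8r8g8b8 scanlines: copies one scanline into
 * iter->buffer, forcing the unused high byte to 0xff so the result is
 * valid a8r8g8b8.  The head and tail loops cover the pixels before and
 * after 16-byte destination alignment.  (The mask argument is part of
 * the iterator interface; these fetchers ignore it.)
 */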
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

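    /* Main loop: unaligned load of four source pixels, OR in opaque
     * alpha, aligned store into the (now 16-byte aligned) buffer. */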
    while (w >= 4)
    {
        save_128_aligned (
            (__m128i *)dst, _mm_or_si128 (
                load_128_unaligned ((__m128i *)src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

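/* Iterator fetcher for r5g6b5 scanlines: expands each 16-bit 565 pixel
 * to a8r8g8b8.  unpack_565_to_8888() widens the channels and replicates
 * their top bits into the low bits (so 0x1f and 0x3f expand to 0xff);
 * the alpha byte is then forced to 0xff.
 */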
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

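    /* Main loop: load eight 565 pixels, widen each half to four 8888
     * pixels, and force the alpha byte to 0xff. */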
    while (w >= 8)
    {
        __m128i lo, hi, s;

        s = _mm_loadu_si128 ((__m128i *)src);

        lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
        hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

        save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
        save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    return iter->buffer;
}

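/* Iterator fetcher for a8 scanlines: expands each 8-bit alpha value a
 * to the 32-bit pixel (a << 24), i.e. alpha in the top byte with zero
 * RGB.  The vector loop below does this with nothing but interleaves
 * against zero, sixteen pixels per iteration.
 */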
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = (uint32_t)(*(src++)) << 24;
        w--;
    }

    while (w >= 16)
    {
        xmm0 = _mm_loadu_si128 ((__m128i *)src);

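        /* Interleave zeros *below* each alpha byte: unpack(0, x) yields
         * 16-bit lanes holding (a << 8), and a second unpack against
         * zero widens those to 32-bit lanes holding (a << 24) -- the
         * finished pixel, alpha in the top byte and RGB zero. */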
        xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128 (), xmm0);
        xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128 (), xmm0);
        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);

        _mm_store_si128 ((__m128i *)(dst + 0), xmm3);
        _mm_store_si128 ((__m128i *)(dst + 4), xmm4);
        _mm_store_si128 ((__m128i *)(dst + 8), xmm5);
        _mm_store_si128 ((__m128i *)(dst + 12), xmm6);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = (uint32_t)(*(src++)) << 24;
        w--;
    }

    return iter->buffer;
}

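/* Flags an image must satisfy before the iterator fast paths below may
 * be used: an untransformed bits image whose samples cover the clip
 * region. */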
#define IMAGE_FLAGS \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t sse2_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
    },
    { PIXMAN_null },
};

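/* The 32-bit x86 ABI only guarantees 4-byte stack alignment, so on that
 * target ask GCC to realign the stack on entry; otherwise 16-byte SSE2
 * spills could fault when we are called from unaligned code. */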
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
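    /* The masks above are file-scope statics used throughout this
     * file's routines, so they must be initialized here before the
     * implementation is returned and any SSE2 path can run. */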

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->iter_info = sse2_iters;

    return imp;
}
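
/*
 * Usage sketch (hedged): pixman assembles a delegation chain of
 * implementations at startup, each level falling back to the next for
 * anything it cannot handle.  Roughly -- with the CPU-detection helper
 * below being illustrative rather than the exact pixman-x86.c code:
 *
 *     imp = _pixman_implementation_create_general ();
 *     imp = _pixman_implementation_create_fast_path (imp);
 * #ifdef USE_SSE2
 *     if (detect_sse2 ())   // hypothetical runtime CPUID check
 *         imp = _pixman_implementation_create_sse2 (imp);
 * #endif
 *
 * Operations the SSE2 paths reject are forwarded to `fallback'.
 */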