1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32 
33 /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
34 #define PSHUFD_IS_FAST 0
35 
36 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
37 #include <emmintrin.h> /* for SSE2 intrinsics */
38 #include "pixman-private.h"
39 #include "pixman-combine32.h"
40 #include "pixman-inlines.h"
41 
42 static __m128i mask_0080;
43 static __m128i mask_00ff;
44 static __m128i mask_0101;
45 static __m128i mask_ffff;
46 static __m128i mask_ff000000;
47 static __m128i mask_alpha;
48 
49 static __m128i mask_565_r;
50 static __m128i mask_565_g1, mask_565_g2;
51 static __m128i mask_565_b;
52 static __m128i mask_red;
53 static __m128i mask_green;
54 static __m128i mask_blue;
55 
56 static __m128i mask_565_fix_rb;
57 static __m128i mask_565_fix_g;
58 
59 static __m128i mask_565_rb;
60 static __m128i mask_565_pack_multiplier;
61 
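/* Commentary (added, not from the original authors): the mask_* constants
 * above are filled in once when the SSE2 implementation is created, outside
 * this excerpt.  Pixels are a8r8g8b8; "unpacking" widens each 8-bit channel
 * into a 16-bit lane with _mm_unpacklo/hi_epi8 so per-channel arithmetic
 * cannot overflow, and "packing" narrows back to 8 bits with unsigned
 * saturation.
 */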
62 static force_inline __m128i
63 unpack_32_1x128 (uint32_t data)
64 {
65     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
66 }
67 
68 static force_inline void
69 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
70 {
71     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
72     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
73 }
74 
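/* Commentary: a 16-bit r5g6b5 value is expanded to 8-bit channels by
 * shifting each field into position and then replicating its top bits into
 * the freshly opened low bits, so that e.g. 0x1f maps to 0xff rather than
 * 0xf8 (assuming the mask_565_fix_* constants select those top bits).
 * Scalar equivalent per channel:
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *     g8 = (g6 << 2) | (g6 >> 4);
 *     b8 = (b5 << 3) | (b5 >> 2);
 */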
75 static force_inline __m128i
76 unpack_565_to_8888 (__m128i lo)
77 {
78     __m128i r, g, b, rb, t;
79 
80     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
81     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
82     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
83 
84     rb = _mm_or_si128 (r, b);
85     t  = _mm_and_si128 (rb, mask_565_fix_rb);
86     t  = _mm_srli_epi32 (t, 5);
87     rb = _mm_or_si128 (rb, t);
88 
89     t  = _mm_and_si128 (g, mask_565_fix_g);
90     t  = _mm_srli_epi32 (t, 6);
91     g  = _mm_or_si128 (g, t);
92 
93     return _mm_or_si128 (rb, g);
94 }
95 
96 static force_inline void
97 unpack_565_128_4x128 (__m128i  data,
98                       __m128i* data0,
99                       __m128i* data1,
100                       __m128i* data2,
101                       __m128i* data3)
102 {
103     __m128i lo, hi;
104 
105     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
106     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
107 
108     lo = unpack_565_to_8888 (lo);
109     hi = unpack_565_to_8888 (hi);
110 
111     unpack_128_2x128 (lo, data0, data1);
112     unpack_128_2x128 (hi, data2, data3);
113 }
114 
115 static force_inline uint16_t
116 pack_565_32_16 (uint32_t pixel)
117 {
118     return (uint16_t) (((pixel >> 8) & 0xf800) |
119 		       ((pixel >> 5) & 0x07e0) |
120 		       ((pixel >> 3) & 0x001f));
121 }
122 
123 static force_inline __m128i
124 pack_2x128_128 (__m128i lo, __m128i hi)
125 {
126     return _mm_packus_epi16 (lo, hi);
127 }
128 
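/* Commentary: _mm_madd_epi16 multiplies each 16-bit word by a per-word
 * factor and sums adjacent products, which moves the red and blue words
 * into their r5g6b5 positions and merges them in one step.  Green is OR'ed
 * in separately, and the shift/packs sequence below emulates
 * _mm_packus_epi32 (unsigned 32->16 packing), which SSE2 does not provide.
 */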
129 static force_inline __m128i
130 pack_565_2packedx128_128 (__m128i lo, __m128i hi)
131 {
132     __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
133     __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
134 
135     __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
136     __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
137 
138     __m128i g0 = _mm_and_si128 (lo, mask_green);
139     __m128i g1 = _mm_and_si128 (hi, mask_green);
140 
141     t0 = _mm_or_si128 (t0, g0);
142     t1 = _mm_or_si128 (t1, g1);
143 
144     /* Simulates _mm_packus_epi32 */
145     t0 = _mm_slli_epi32 (t0, 16 - 5);
146     t1 = _mm_slli_epi32 (t1, 16 - 5);
147     t0 = _mm_srai_epi32 (t0, 16);
148     t1 = _mm_srai_epi32 (t1, 16);
149     return _mm_packs_epi32 (t0, t1);
150 }
151 
152 static force_inline __m128i
153 pack_565_2x128_128 (__m128i lo, __m128i hi)
154 {
155     __m128i data;
156     __m128i r, g1, g2, b;
157 
158     data = pack_2x128_128 (lo, hi);
159 
160     r  = _mm_and_si128 (data, mask_565_r);
161     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
162     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
163     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
164 
165     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
166 }
167 
168 static force_inline __m128i
169 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
170 {
171     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
172 			     pack_565_2x128_128 (*xmm2, *xmm3));
173 }
174 
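/* Commentary: these three predicates test four a8r8g8b8 pixels at once.
 * _mm_movemask_epi8 gathers one bit per byte; the 0x8888 mask selects byte
 * 3 of each 32-bit pixel, i.e. the alpha bytes.  So is_opaque() means "all
 * four alphas are 0xff", is_transparent() means "all four alphas are 0x00",
 * and is_zero() means "all 16 bytes are zero".
 */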
175 static force_inline int
176 is_opaque (__m128i x)
177 {
178     __m128i ffs = _mm_cmpeq_epi8 (x, x);
179 
180     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
181 }
182 
183 static force_inline int
184 is_zero (__m128i x)
185 {
186     return _mm_movemask_epi8 (
187 	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
188 }
189 
190 static force_inline int
191 is_transparent (__m128i x)
192 {
193     return (_mm_movemask_epi8 (
194 		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
195 }
196 
197 static force_inline __m128i
198 expand_pixel_32_1x128 (uint32_t data)
199 {
200     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
201 }
202 
203 static force_inline __m128i
204 expand_alpha_1x128 (__m128i data)
205 {
206     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
207 						     _MM_SHUFFLE (3, 3, 3, 3)),
208 				_MM_SHUFFLE (3, 3, 3, 3));
209 }
210 
211 static force_inline void
212 expand_alpha_2x128 (__m128i  data_lo,
213                     __m128i  data_hi,
214                     __m128i* alpha_lo,
215                     __m128i* alpha_hi)
216 {
217     __m128i lo, hi;
218 
219     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
220     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
221 
222     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
223     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
224 }
225 
226 static force_inline void
227 expand_alpha_rev_2x128 (__m128i  data_lo,
228                         __m128i  data_hi,
229                         __m128i* alpha_lo,
230                         __m128i* alpha_hi)
231 {
232     __m128i lo, hi;
233 
234     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
235     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
236     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
237     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
238 }
239 
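/* Commentary: pix_multiply_* computes the correctly rounded x*a/255 per
 * 16-bit channel without a division.  With t = x*a, the scalar equivalent
 * is
 *
 *     ((t + 0x80) + ((t + 0x80) >> 8)) >> 8
 *
 * which is exactly what the saturating add of 0x0080 followed by
 * _mm_mulhi_epu16 with 0x0101 (i.e. ((t + 0x80) * 257) >> 16) evaluates to.
 */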
240 static force_inline void
241 pix_multiply_2x128 (__m128i* data_lo,
242                     __m128i* data_hi,
243                     __m128i* alpha_lo,
244                     __m128i* alpha_hi,
245                     __m128i* ret_lo,
246                     __m128i* ret_hi)
247 {
248     __m128i lo, hi;
249 
250     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
251     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
252     lo = _mm_adds_epu16 (lo, mask_0080);
253     hi = _mm_adds_epu16 (hi, mask_0080);
254     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
255     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
256 }
257 
258 static force_inline void
259 pix_add_multiply_2x128 (__m128i* src_lo,
260                         __m128i* src_hi,
261                         __m128i* alpha_dst_lo,
262                         __m128i* alpha_dst_hi,
263                         __m128i* dst_lo,
264                         __m128i* dst_hi,
265                         __m128i* alpha_src_lo,
266                         __m128i* alpha_src_hi,
267                         __m128i* ret_lo,
268                         __m128i* ret_hi)
269 {
270     __m128i t1_lo, t1_hi;
271     __m128i t2_lo, t2_hi;
272 
273     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
274     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
275 
276     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
277     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
278 }
279 
280 static force_inline void
281 negate_2x128 (__m128i  data_lo,
282               __m128i  data_hi,
283               __m128i* neg_lo,
284               __m128i* neg_hi)
285 {
286     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
287     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
288 }
289 
290 static force_inline void
291 invert_colors_2x128 (__m128i  data_lo,
292                      __m128i  data_hi,
293                      __m128i* inv_lo,
294                      __m128i* inv_hi)
295 {
296     __m128i lo, hi;
297 
298     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
299     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
300     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
301     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
302 }
303 
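/* Commentary: over_* implements the premultiplied Porter-Duff OVER
 * operator, per channel
 *
 *     dst = src + dst * (255 - alpha_src) / 255
 *
 * using negate_* for (255 - alpha), pix_multiply_* for the by-255 scaling,
 * and a saturating byte add for the final sum.
 */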
304 static force_inline void
305 over_2x128 (__m128i* src_lo,
306             __m128i* src_hi,
307             __m128i* alpha_lo,
308             __m128i* alpha_hi,
309             __m128i* dst_lo,
310             __m128i* dst_hi)
311 {
312     __m128i t1, t2;
313 
314     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
315 
316     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
317 
318     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
319     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
320 }
321 
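/* Commentary (interpretation of the code below, hedged): the source here is
 * non-premultiplied and has its color channels in reversed order, so before
 * the normal OVER the code swaps the R and B words (invert_colors_*),
 * forces the alpha word to 0xff via mask_alpha so the alpha itself is not
 * scaled, and multiplies the colors by the source alpha to obtain a
 * premultiplied pixel.
 */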
322 static force_inline void
323 over_rev_non_pre_2x128 (__m128i  src_lo,
324                         __m128i  src_hi,
325                         __m128i* dst_lo,
326                         __m128i* dst_hi)
327 {
328     __m128i lo, hi;
329     __m128i alpha_lo, alpha_hi;
330 
331     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
332 
333     lo = _mm_or_si128 (alpha_lo, mask_alpha);
334     hi = _mm_or_si128 (alpha_hi, mask_alpha);
335 
336     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
337 
338     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
339 
340     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
341 }
342 
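/* Commentary: in_over_* is the fused "IN then OVER" used for masked
 * compositing; per channel it computes
 *
 *     dst = src * mask / 255 + dst * (255 - alpha_src * mask / 255) / 255
 *
 * by first multiplying both the source and its expanded alpha by the mask
 * and then applying the plain OVER above.
 */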
343 static force_inline void
344 in_over_2x128 (__m128i* src_lo,
345                __m128i* src_hi,
346                __m128i* alpha_lo,
347                __m128i* alpha_hi,
348                __m128i* mask_lo,
349                __m128i* mask_hi,
350                __m128i* dst_lo,
351                __m128i* dst_hi)
352 {
353     __m128i s_lo, s_hi;
354     __m128i a_lo, a_hi;
355 
356     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
357     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
358 
359     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
360 }
361 
362 /* load 4 pixels from a 16-byte-aligned address */
363 static force_inline __m128i
364 load_128_aligned (__m128i* src)
365 {
366     return _mm_load_si128 (src);
367 }
368 
369 /* load 4 pixels from an unaligned address */
370 static force_inline __m128i
371 load_128_unaligned (const __m128i* src)
372 {
373     return _mm_loadu_si128 (src);
374 }
375 
376 /* save 4 pixels to a 16-byte-aligned address using a
377  * non-temporal (write-combining) store
378  */
379 static force_inline void
380 save_128_write_combining (__m128i* dst,
381                           __m128i  data)
382 {
383     _mm_stream_si128 (dst, data);
384 }
385 
386 /* save 4 pixels to a 16-byte-aligned address */
387 static force_inline void
388 save_128_aligned (__m128i* dst,
389                   __m128i  data)
390 {
391     _mm_store_si128 (dst, data);
392 }
393 
394 /* save 4 pixels to an unaligned address */
395 static force_inline void
396 save_128_unaligned (__m128i* dst,
397                     __m128i  data)
398 {
399     _mm_storeu_si128 (dst, data);
400 }
401 
402 static force_inline __m128i
403 load_32_1x128 (uint32_t data)
404 {
405     return _mm_cvtsi32_si128 (data);
406 }
407 
408 static force_inline __m128i
409 expand_alpha_rev_1x128 (__m128i data)
410 {
411     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
412 }
413 
414 static force_inline __m128i
415 expand_pixel_8_1x128 (uint8_t data)
416 {
417     return _mm_shufflelo_epi16 (
418 	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
419 }
420 
421 static force_inline __m128i
422 pix_multiply_1x128 (__m128i data,
423 		    __m128i alpha)
424 {
425     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
426 					    mask_0080),
427 			    mask_0101);
428 }
429 
430 static force_inline __m128i
431 pix_add_multiply_1x128 (__m128i* src,
432 			__m128i* alpha_dst,
433 			__m128i* dst,
434 			__m128i* alpha_src)
435 {
436     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
437     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
438 
439     return _mm_adds_epu8 (t1, t2);
440 }
441 
442 static force_inline __m128i
443 negate_1x128 (__m128i data)
444 {
445     return _mm_xor_si128 (data, mask_00ff);
446 }
447 
448 static force_inline __m128i
449 invert_colors_1x128 (__m128i data)
450 {
451     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
452 }
453 
454 static force_inline __m128i
455 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
456 {
457     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
458 }
459 
460 static force_inline __m128i
461 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
462 {
463     return over_1x128 (pix_multiply_1x128 (*src, *mask),
464 		       pix_multiply_1x128 (*alpha, *mask),
465 		       *dst);
466 }
467 
468 static force_inline __m128i
469 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
470 {
471     __m128i alpha = expand_alpha_1x128 (src);
472 
473     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
474 					   _mm_or_si128 (alpha, mask_alpha)),
475 		       alpha,
476 		       dst);
477 }
478 
479 static force_inline uint32_t
480 pack_1x128_32 (__m128i data)
481 {
482     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
483 }
484 
485 static force_inline __m128i
486 expand565_16_1x128 (uint16_t pixel)
487 {
488     __m128i m = _mm_cvtsi32_si128 (pixel);
489 
490     m = unpack_565_to_8888 (m);
491 
492     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
493 }
494 
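/* Commentary: single-pixel OVER with the two cheap special cases handled
 * first: a fully opaque source replaces the destination outright, and an
 * all-zero source leaves it untouched; only the general case pays for the
 * unpack/blend/pack sequence.
 */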
495 static force_inline uint32_t
496 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
497 {
498     uint8_t a;
499     __m128i xmms;
500 
501     a = src >> 24;
502 
503     if (a == 0xff)
504     {
505 	return src;
506     }
507     else if (src)
508     {
509 	xmms = unpack_32_1x128 (src);
510 	return pack_1x128_32 (
511 	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
512 			unpack_32_1x128 (dst)));
513     }
514 
515     return dst;
516 }
517 
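/* Commentary: combine1()/combine4() apply the optional mask to one or four
 * source pixels by multiplying each source by the expanded alpha of the
 * corresponding mask pixel.  The memcpy is an alignment- and aliasing-safe
 * way to load a possibly unaligned 32-bit value, and combine4()
 * short-circuits to zero when all four mask pixels are transparent.
 */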
518 static force_inline uint32_t
519 combine1 (const uint32_t *ps, const uint32_t *pm)
520 {
521     uint32_t s;
522     memcpy(&s, ps, sizeof(uint32_t));
523 
524     if (pm)
525     {
526 	__m128i ms, mm;
527 
528 	mm = unpack_32_1x128 (*pm);
529 	mm = expand_alpha_1x128 (mm);
530 
531 	ms = unpack_32_1x128 (s);
532 	ms = pix_multiply_1x128 (ms, mm);
533 
534 	s = pack_1x128_32 (ms);
535     }
536 
537     return s;
538 }
539 
540 static force_inline __m128i
541 combine4 (const __m128i *ps, const __m128i *pm)
542 {
543     __m128i xmm_src_lo, xmm_src_hi;
544     __m128i xmm_msk_lo, xmm_msk_hi;
545     __m128i s;
546 
547     if (pm)
548     {
549 	xmm_msk_lo = load_128_unaligned (pm);
550 
551 	if (is_transparent (xmm_msk_lo))
552 	    return _mm_setzero_si128 ();
553     }
554 
555     s = load_128_unaligned (ps);
556 
557     if (pm)
558     {
559 	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
560 	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
561 
562 	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
563 
564 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
565 			    &xmm_msk_lo, &xmm_msk_hi,
566 			    &xmm_src_lo, &xmm_src_hi);
567 
568 	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
569     }
570 
571     return s;
572 }
573 
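/* Commentary: the combiners below all follow the same pattern: a scalar
 * loop runs until the destination pointer reaches a 16-byte boundary, a
 * SIMD loop then processes four pixels per iteration against the aligned
 * destination (with is_zero()/is_opaque() fast paths where they apply),
 * and a final scalar loop handles the remaining tail pixels.
 */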
574 static force_inline void
575 core_combine_over_u_sse2_mask (uint32_t *	  pd,
576 			       const uint32_t*    ps,
577 			       const uint32_t*    pm,
578 			       int                w)
579 {
580     uint32_t s, d;
581 
582     /* Align dst on a 16-byte boundary */
583     while (w && ((uintptr_t)pd & 15))
584     {
585 	d = *pd;
586 	s = combine1 (ps, pm);
587 
588 	if (s)
589 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
590 	pd++;
591 	ps++;
592 	pm++;
593 	w--;
594     }
595 
596     while (w >= 4)
597     {
598 	__m128i mask = load_128_unaligned ((__m128i *)pm);
599 
600 	if (!is_zero (mask))
601 	{
602 	    __m128i src;
603 	    __m128i src_hi, src_lo;
604 	    __m128i mask_hi, mask_lo;
605 	    __m128i alpha_hi, alpha_lo;
606 
607 	    src = load_128_unaligned ((__m128i *)ps);
608 
609 	    if (is_opaque (_mm_and_si128 (src, mask)))
610 	    {
611 		save_128_aligned ((__m128i *)pd, src);
612 	    }
613 	    else
614 	    {
615 		__m128i dst = load_128_aligned ((__m128i *)pd);
616 		__m128i dst_hi, dst_lo;
617 
618 		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
619 		unpack_128_2x128 (src, &src_lo, &src_hi);
620 
621 		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
622 		pix_multiply_2x128 (&src_lo, &src_hi,
623 				    &mask_lo, &mask_hi,
624 				    &src_lo, &src_hi);
625 
626 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
627 
628 		expand_alpha_2x128 (src_lo, src_hi,
629 				    &alpha_lo, &alpha_hi);
630 
631 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
632 			    &dst_lo, &dst_hi);
633 
634 		save_128_aligned (
635 		    (__m128i *)pd,
636 		    pack_2x128_128 (dst_lo, dst_hi));
637 	    }
638 	}
639 
640 	pm += 4;
641 	ps += 4;
642 	pd += 4;
643 	w -= 4;
644     }
645     while (w)
646     {
647 	d = *pd;
648 	s = combine1 (ps, pm);
649 
650 	if (s)
651 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
652 	pd++;
653 	ps++;
654 	pm++;
655 
656 	w--;
657     }
658 }
659 
660 static force_inline void
661 core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
662 				  const uint32_t*    ps,
663 				  int                w)
664 {
665     uint32_t s, d;
666 
667     /* Align dst on a 16-byte boundary */
668     while (w && ((uintptr_t)pd & 15))
669     {
670 	d = *pd;
671 	s = *ps;
672 
673 	if (s)
674 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
675 	pd++;
676 	ps++;
677 	w--;
678     }
679 
680     while (w >= 4)
681     {
682 	__m128i src;
683 	__m128i src_hi, src_lo, dst_hi, dst_lo;
684 	__m128i alpha_hi, alpha_lo;
685 
686 	src = load_128_unaligned ((__m128i *)ps);
687 
688 	if (!is_zero (src))
689 	{
690 	    if (is_opaque (src))
691 	    {
692 		save_128_aligned ((__m128i *)pd, src);
693 	    }
694 	    else
695 	    {
696 		__m128i dst = load_128_aligned ((__m128i *)pd);
697 
698 		unpack_128_2x128 (src, &src_lo, &src_hi);
699 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
700 
701 		expand_alpha_2x128 (src_lo, src_hi,
702 				    &alpha_lo, &alpha_hi);
703 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
704 			    &dst_lo, &dst_hi);
705 
706 		save_128_aligned (
707 		    (__m128i *)pd,
708 		    pack_2x128_128 (dst_lo, dst_hi));
709 	    }
710 	}
711 
712 	ps += 4;
713 	pd += 4;
714 	w -= 4;
715     }
716     while (w)
717     {
718 	d = *pd;
719 	s = *ps;
720 
721 	if (s)
722 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
723 	pd++;
724 	ps++;
725 
726 	w--;
727     }
728 }
729 
730 static force_inline void
731 sse2_combine_over_u (pixman_implementation_t *imp,
732                      pixman_op_t              op,
733                      uint32_t *               pd,
734                      const uint32_t *         ps,
735                      const uint32_t *         pm,
736                      int                      w)
737 {
738     if (pm)
739 	core_combine_over_u_sse2_mask (pd, ps, pm, w);
740     else
741 	core_combine_over_u_sse2_no_mask (pd, ps, w);
742 }
743 
744 static void
745 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
746                              pixman_op_t              op,
747                              uint32_t *               pd,
748                              const uint32_t *         ps,
749                              const uint32_t *         pm,
750                              int                      w)
751 {
752     uint32_t s, d;
753 
754     __m128i xmm_dst_lo, xmm_dst_hi;
755     __m128i xmm_src_lo, xmm_src_hi;
756     __m128i xmm_alpha_lo, xmm_alpha_hi;
757 
758     /* Align dst on a 16-byte boundary */
759     while (w &&
760            ((uintptr_t)pd & 15))
761     {
762 	d = *pd;
763 	s = combine1 (ps, pm);
764 
765 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
766 	w--;
767 	ps++;
768 	if (pm)
769 	    pm++;
770     }
771 
772     while (w >= 4)
773     {
774 	/* I'm loading unaligned because I'm not sure
775 	 * about the address alignment.
776 	 */
777 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
778 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 
780 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
781 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 
783 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
784 			    &xmm_alpha_lo, &xmm_alpha_hi);
785 
786 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
787 		    &xmm_alpha_lo, &xmm_alpha_hi,
788 		    &xmm_src_lo, &xmm_src_hi);
789 
790 	/* rebuild the 4 pixels and save */
791 	save_128_aligned ((__m128i*)pd,
792 			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
793 
794 	w -= 4;
795 	ps += 4;
796 	pd += 4;
797 
798 	if (pm)
799 	    pm += 4;
800     }
801 
802     while (w)
803     {
804 	d = *pd;
805 	s = combine1 (ps, pm);
806 
807 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
808 	ps++;
809 	w--;
810 	if (pm)
811 	    pm++;
812     }
813 }
814 
815 static force_inline uint32_t
816 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
817 {
818     uint32_t maska = src >> 24;
819 
820     if (maska == 0)
821     {
822 	return 0;
823     }
824     else if (maska != 0xff)
825     {
826 	return pack_1x128_32 (
827 	    pix_multiply_1x128 (unpack_32_1x128 (dst),
828 				expand_alpha_1x128 (unpack_32_1x128 (src))));
829     }
830 
831     return dst;
832 }
833 
834 static void
835 sse2_combine_in_u (pixman_implementation_t *imp,
836                    pixman_op_t              op,
837                    uint32_t *               pd,
838                    const uint32_t *         ps,
839                    const uint32_t *         pm,
840                    int                      w)
841 {
842     uint32_t s, d;
843 
844     __m128i xmm_src_lo, xmm_src_hi;
845     __m128i xmm_dst_lo, xmm_dst_hi;
846 
847     while (w && ((uintptr_t)pd & 15))
848     {
849 	s = combine1 (ps, pm);
850 	d = *pd;
851 
852 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
853 	w--;
854 	ps++;
855 	if (pm)
856 	    pm++;
857     }
858 
859     while (w >= 4)
860     {
861 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
862 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
863 
864 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
865 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
866 
867 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
868 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
869 			    &xmm_dst_lo, &xmm_dst_hi,
870 			    &xmm_dst_lo, &xmm_dst_hi);
871 
872 	save_128_aligned ((__m128i*)pd,
873 			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
874 
875 	ps += 4;
876 	pd += 4;
877 	w -= 4;
878 	if (pm)
879 	    pm += 4;
880     }
881 
882     while (w)
883     {
884 	s = combine1 (ps, pm);
885 	d = *pd;
886 
887 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
888 	w--;
889 	ps++;
890 	if (pm)
891 	    pm++;
892     }
893 }
894 
895 static void
896 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
897                            pixman_op_t              op,
898                            uint32_t *               pd,
899                            const uint32_t *         ps,
900                            const uint32_t *         pm,
901                            int                      w)
902 {
903     uint32_t s, d;
904 
905     __m128i xmm_src_lo, xmm_src_hi;
906     __m128i xmm_dst_lo, xmm_dst_hi;
907 
908     while (w && ((uintptr_t)pd & 15))
909     {
910 	s = combine1 (ps, pm);
911 	d = *pd;
912 
913 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
914 	ps++;
915 	w--;
916 	if (pm)
917 	    pm++;
918     }
919 
920     while (w >= 4)
921     {
922 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
923 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
924 
925 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
926 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
927 
928 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
929 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
930 			    &xmm_src_lo, &xmm_src_hi,
931 			    &xmm_dst_lo, &xmm_dst_hi);
932 
933 	save_128_aligned (
934 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
935 
936 	ps += 4;
937 	pd += 4;
938 	w -= 4;
939 	if (pm)
940 	    pm += 4;
941     }
942 
943     while (w)
944     {
945 	s = combine1 (ps, pm);
946 	d = *pd;
947 
948 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
949 	w--;
950 	ps++;
951 	if (pm)
952 	    pm++;
953     }
954 }
955 
956 static void
957 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
958                             pixman_op_t              op,
959                             uint32_t *               pd,
960                             const uint32_t *         ps,
961                             const uint32_t *         pm,
962                             int                      w)
963 {
964     while (w && ((uintptr_t)pd & 15))
965     {
966 	uint32_t s = combine1 (ps, pm);
967 	uint32_t d = *pd;
968 
969 	*pd++ = pack_1x128_32 (
970 	    pix_multiply_1x128 (
971 		unpack_32_1x128 (d), negate_1x128 (
972 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
973 
974 	if (pm)
975 	    pm++;
976 	ps++;
977 	w--;
978     }
979 
980     while (w >= 4)
981     {
982 	__m128i xmm_src_lo, xmm_src_hi;
983 	__m128i xmm_dst_lo, xmm_dst_hi;
984 
985 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
986 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
987 
988 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
989 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
990 
991 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
992 	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
993 
994 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
995 			    &xmm_src_lo, &xmm_src_hi,
996 			    &xmm_dst_lo, &xmm_dst_hi);
997 
998 	save_128_aligned (
999 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1000 
1001 	ps += 4;
1002 	pd += 4;
1003 	if (pm)
1004 	    pm += 4;
1005 
1006 	w -= 4;
1007     }
1008 
1009     while (w)
1010     {
1011 	uint32_t s = combine1 (ps, pm);
1012 	uint32_t d = *pd;
1013 
1014 	*pd++ = pack_1x128_32 (
1015 	    pix_multiply_1x128 (
1016 		unpack_32_1x128 (d), negate_1x128 (
1017 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1018 	ps++;
1019 	if (pm)
1020 	    pm++;
1021 	w--;
1022     }
1023 }
1024 
1025 static void
1026 sse2_combine_out_u (pixman_implementation_t *imp,
1027                     pixman_op_t              op,
1028                     uint32_t *               pd,
1029                     const uint32_t *         ps,
1030                     const uint32_t *         pm,
1031                     int                      w)
1032 {
1033     while (w && ((uintptr_t)pd & 15))
1034     {
1035 	uint32_t s = combine1 (ps, pm);
1036 	uint32_t d = *pd;
1037 
1038 	*pd++ = pack_1x128_32 (
1039 	    pix_multiply_1x128 (
1040 		unpack_32_1x128 (s), negate_1x128 (
1041 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1042 	w--;
1043 	ps++;
1044 	if (pm)
1045 	    pm++;
1046     }
1047 
1048     while (w >= 4)
1049     {
1050 	__m128i xmm_src_lo, xmm_src_hi;
1051 	__m128i xmm_dst_lo, xmm_dst_hi;
1052 
1053 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1054 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1055 
1056 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1057 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1058 
1059 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1060 	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1061 
1062 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1063 			    &xmm_dst_lo, &xmm_dst_hi,
1064 			    &xmm_dst_lo, &xmm_dst_hi);
1065 
1066 	save_128_aligned (
1067 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1068 
1069 	ps += 4;
1070 	pd += 4;
1071 	w -= 4;
1072 	if (pm)
1073 	    pm += 4;
1074     }
1075 
1076     while (w)
1077     {
1078 	uint32_t s = combine1 (ps, pm);
1079 	uint32_t d = *pd;
1080 
1081 	*pd++ = pack_1x128_32 (
1082 	    pix_multiply_1x128 (
1083 		unpack_32_1x128 (s), negate_1x128 (
1084 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1085 	w--;
1086 	ps++;
1087 	if (pm)
1088 	    pm++;
1089     }
1090 }
1091 
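/* Commentary: ATOP keeps the destination's shape; per channel
 *
 *     dst = src * alpha_dst / 255 + dst * (255 - alpha_src) / 255
 *
 * which is exactly what pix_add_multiply_* computes once the source alpha
 * has been negated.
 */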
1092 static force_inline uint32_t
1093 core_combine_atop_u_pixel_sse2 (uint32_t src,
1094                                 uint32_t dst)
1095 {
1096     __m128i s = unpack_32_1x128 (src);
1097     __m128i d = unpack_32_1x128 (dst);
1098 
1099     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1100     __m128i da = expand_alpha_1x128 (d);
1101 
1102     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1103 }
1104 
1105 static void
1106 sse2_combine_atop_u (pixman_implementation_t *imp,
1107                      pixman_op_t              op,
1108                      uint32_t *               pd,
1109                      const uint32_t *         ps,
1110                      const uint32_t *         pm,
1111                      int                      w)
1112 {
1113     uint32_t s, d;
1114 
1115     __m128i xmm_src_lo, xmm_src_hi;
1116     __m128i xmm_dst_lo, xmm_dst_hi;
1117     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1118     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1119 
1120     while (w && ((uintptr_t)pd & 15))
1121     {
1122 	s = combine1 (ps, pm);
1123 	d = *pd;
1124 
1125 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1126 	w--;
1127 	ps++;
1128 	if (pm)
1129 	    pm++;
1130     }
1131 
1132     while (w >= 4)
1133     {
1134 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1135 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1136 
1137 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1138 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1139 
1140 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1141 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1142 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1143 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1144 
1145 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1146 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1147 
1148 	pix_add_multiply_2x128 (
1149 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1150 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1151 	    &xmm_dst_lo, &xmm_dst_hi);
1152 
1153 	save_128_aligned (
1154 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1155 
1156 	ps += 4;
1157 	pd += 4;
1158 	w -= 4;
1159 	if (pm)
1160 	    pm += 4;
1161     }
1162 
1163     while (w)
1164     {
1165 	s = combine1 (ps, pm);
1166 	d = *pd;
1167 
1168 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1169 	w--;
1170 	ps++;
1171 	if (pm)
1172 	    pm++;
1173     }
1174 }
1175 
1176 static force_inline uint32_t
1177 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1178                                         uint32_t dst)
1179 {
1180     __m128i s = unpack_32_1x128 (src);
1181     __m128i d = unpack_32_1x128 (dst);
1182 
1183     __m128i sa = expand_alpha_1x128 (s);
1184     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1185 
1186     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1187 }
1188 
1189 static void
1190 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1191                              pixman_op_t              op,
1192                              uint32_t *               pd,
1193                              const uint32_t *         ps,
1194                              const uint32_t *         pm,
1195                              int                      w)
1196 {
1197     uint32_t s, d;
1198 
1199     __m128i xmm_src_lo, xmm_src_hi;
1200     __m128i xmm_dst_lo, xmm_dst_hi;
1201     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1202     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1203 
1204     while (w && ((uintptr_t)pd & 15))
1205     {
1206 	s = combine1 (ps, pm);
1207 	d = *pd;
1208 
1209 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1210 	ps++;
1211 	w--;
1212 	if (pm)
1213 	    pm++;
1214     }
1215 
1216     while (w >= 4)
1217     {
1218 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1219 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1220 
1221 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1222 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1223 
1224 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1225 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1226 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1227 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1228 
1229 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1230 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1231 
1232 	pix_add_multiply_2x128 (
1233 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1234 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1235 	    &xmm_dst_lo, &xmm_dst_hi);
1236 
1237 	save_128_aligned (
1238 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1239 
1240 	ps += 4;
1241 	pd += 4;
1242 	w -= 4;
1243 	if (pm)
1244 	    pm += 4;
1245     }
1246 
1247     while (w)
1248     {
1249 	s = combine1 (ps, pm);
1250 	d = *pd;
1251 
1252 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1253 	ps++;
1254 	w--;
1255 	if (pm)
1256 	    pm++;
1257     }
1258 }
1259 
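/* Commentary: Porter-Duff XOR keeps the non-overlapping parts of both
 * pixels; per channel
 *
 *     dst = src * (255 - alpha_dst) / 255 + dst * (255 - alpha_src) / 255
 */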
1260 static force_inline uint32_t
1261 core_combine_xor_u_pixel_sse2 (uint32_t src,
1262                                uint32_t dst)
1263 {
1264     __m128i s = unpack_32_1x128 (src);
1265     __m128i d = unpack_32_1x128 (dst);
1266 
1267     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1268     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1269 
1270     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1271 }
1272 
1273 static void
1274 sse2_combine_xor_u (pixman_implementation_t *imp,
1275                     pixman_op_t              op,
1276                     uint32_t *               dst,
1277                     const uint32_t *         src,
1278                     const uint32_t *         mask,
1279                     int                      width)
1280 {
1281     int w = width;
1282     uint32_t s, d;
1283     uint32_t* pd = dst;
1284     const uint32_t* ps = src;
1285     const uint32_t* pm = mask;
1286 
1287     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1288     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1289     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1290     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1291 
1292     while (w && ((uintptr_t)pd & 15))
1293     {
1294 	s = combine1 (ps, pm);
1295 	d = *pd;
1296 
1297 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1298 	w--;
1299 	ps++;
1300 	if (pm)
1301 	    pm++;
1302     }
1303 
1304     while (w >= 4)
1305     {
1306 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1307 	xmm_dst = load_128_aligned ((__m128i*) pd);
1308 
1309 	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1310 	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1311 
1312 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1313 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1314 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1315 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1316 
1317 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1318 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1319 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1320 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1321 
1322 	pix_add_multiply_2x128 (
1323 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1324 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1325 	    &xmm_dst_lo, &xmm_dst_hi);
1326 
1327 	save_128_aligned (
1328 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1329 
1330 	ps += 4;
1331 	pd += 4;
1332 	w -= 4;
1333 	if (pm)
1334 	    pm += 4;
1335     }
1336 
1337     while (w)
1338     {
1339 	s = combine1 (ps, pm);
1340 	d = *pd;
1341 
1342 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1343 	w--;
1344 	ps++;
1345 	if (pm)
1346 	    pm++;
1347     }
1348 }
1349 
1350 static force_inline void
1351 sse2_combine_add_u (pixman_implementation_t *imp,
1352                     pixman_op_t              op,
1353                     uint32_t *               dst,
1354                     const uint32_t *         src,
1355                     const uint32_t *         mask,
1356                     int                      width)
1357 {
1358     int w = width;
1359     uint32_t s, d;
1360     uint32_t* pd = dst;
1361     const uint32_t* ps = src;
1362     const uint32_t* pm = mask;
1363 
1364     while (w && (uintptr_t)pd & 15)
1365     {
1366 	s = combine1 (ps, pm);
1367 	d = *pd;
1368 
1369 	ps++;
1370 	if (pm)
1371 	    pm++;
1372 	*pd++ = _mm_cvtsi128_si32 (
1373 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1374 	w--;
1375     }
1376 
1377     while (w >= 4)
1378     {
1379 	__m128i s;
1380 
1381 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
1382 
1383 	save_128_aligned (
1384 	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1385 
1386 	pd += 4;
1387 	ps += 4;
1388 	if (pm)
1389 	    pm += 4;
1390 	w -= 4;
1391     }
1392 
1393     while (w--)
1394     {
1395 	s = combine1 (ps, pm);
1396 	d = *pd;
1397 
1398 	ps++;
1399 	*pd++ = _mm_cvtsi128_si32 (
1400 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1401 	if (pm)
1402 	    pm++;
1403     }
1404 }
1405 
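/* Commentary: SATURATE adds the source into the destination, but when the
 * source alpha exceeds the destination's remaining headroom (~alpha_dst)
 * the source is first scaled by DIV_UN8 (da, sa), i.e. by da/sa, so the
 * subsequent saturating add does not clip channels that the plain ADD
 * operator would have overflowed.
 */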
1406 static force_inline uint32_t
1407 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1408                                     uint32_t dst)
1409 {
1410     __m128i ms = unpack_32_1x128 (src);
1411     __m128i md = unpack_32_1x128 (dst);
1412     uint32_t sa = src >> 24;
1413     uint32_t da = ~dst >> 24;
1414 
1415     if (sa > da)
1416     {
1417 	ms = pix_multiply_1x128 (
1418 	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1419     }
1420 
1421     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1422 }
1423 
1424 static void
1425 sse2_combine_saturate_u (pixman_implementation_t *imp,
1426                          pixman_op_t              op,
1427                          uint32_t *               pd,
1428                          const uint32_t *         ps,
1429                          const uint32_t *         pm,
1430                          int                      w)
1431 {
1432     uint32_t s, d;
1433 
1434     uint32_t pack_cmp;
1435     __m128i xmm_src, xmm_dst;
1436 
1437     while (w && (uintptr_t)pd & 15)
1438     {
1439 	s = combine1 (ps, pm);
1440 	d = *pd;
1441 
1442 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1443 	w--;
1444 	ps++;
1445 	if (pm)
1446 	    pm++;
1447     }
1448 
1449     while (w >= 4)
1450     {
1451 	xmm_dst = load_128_aligned  ((__m128i*)pd);
1452 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1453 
1454 	pack_cmp = _mm_movemask_epi8 (
1455 	    _mm_cmpgt_epi32 (
1456 		_mm_srli_epi32 (xmm_src, 24),
1457 		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1458 
1459 	/* if some source alpha is greater than the corresponding ~dst alpha */
1460 	if (pack_cmp)
1461 	{
1462 	    s = combine1 (ps++, pm);
1463 	    d = *pd;
1464 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1465 	    if (pm)
1466 		pm++;
1467 
1468 	    s = combine1 (ps++, pm);
1469 	    d = *pd;
1470 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1471 	    if (pm)
1472 		pm++;
1473 
1474 	    s = combine1 (ps++, pm);
1475 	    d = *pd;
1476 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1477 	    if (pm)
1478 		pm++;
1479 
1480 	    s = combine1 (ps++, pm);
1481 	    d = *pd;
1482 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1483 	    if (pm)
1484 		pm++;
1485 	}
1486 	else
1487 	{
1488 	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1489 
1490 	    pd += 4;
1491 	    ps += 4;
1492 	    if (pm)
1493 		pm += 4;
1494 	}
1495 
1496 	w -= 4;
1497     }
1498 
1499     while (w--)
1500     {
1501 	s = combine1 (ps, pm);
1502 	d = *pd;
1503 
1504 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1505 	ps++;
1506 	if (pm)
1507 	    pm++;
1508     }
1509 }
1510 
1511 static void
1512 sse2_combine_src_ca (pixman_implementation_t *imp,
1513                      pixman_op_t              op,
1514                      uint32_t *               pd,
1515                      const uint32_t *         ps,
1516                      const uint32_t *         pm,
1517                      int                      w)
1518 {
1519     uint32_t s, m;
1520 
1521     __m128i xmm_src_lo, xmm_src_hi;
1522     __m128i xmm_mask_lo, xmm_mask_hi;
1523     __m128i xmm_dst_lo, xmm_dst_hi;
1524 
1525     while (w && (uintptr_t)pd & 15)
1526     {
1527 	s = *ps++;
1528 	m = *pm++;
1529 	*pd++ = pack_1x128_32 (
1530 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1531 	w--;
1532     }
1533 
1534     while (w >= 4)
1535     {
1536 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1537 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1538 
1539 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1540 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1541 
1542 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1543 			    &xmm_mask_lo, &xmm_mask_hi,
1544 			    &xmm_dst_lo, &xmm_dst_hi);
1545 
1546 	save_128_aligned (
1547 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1548 
1549 	ps += 4;
1550 	pd += 4;
1551 	pm += 4;
1552 	w -= 4;
1553     }
1554 
1555     while (w)
1556     {
1557 	s = *ps++;
1558 	m = *pm++;
1559 	*pd++ = pack_1x128_32 (
1560 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1561 	w--;
1562     }
1563 }
1564 
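/* Commentary: in the component-alpha ("_ca") combiners the mask carries
 * independent per-channel values rather than a single alpha, so it is
 * applied channel-wise; for OVER this is
 *
 *     dst = src * mask / 255 + dst * (255 - alpha_src * mask / 255) / 255
 *
 * which in_over_1x128()/in_over_2x128() compute directly.
 */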
1565 static force_inline uint32_t
1566 core_combine_over_ca_pixel_sse2 (uint32_t src,
1567                                  uint32_t mask,
1568                                  uint32_t dst)
1569 {
1570     __m128i s = unpack_32_1x128 (src);
1571     __m128i expAlpha = expand_alpha_1x128 (s);
1572     __m128i unpk_mask = unpack_32_1x128 (mask);
1573     __m128i unpk_dst  = unpack_32_1x128 (dst);
1574 
1575     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1576 }
1577 
1578 static void
1579 sse2_combine_over_ca (pixman_implementation_t *imp,
1580                       pixman_op_t              op,
1581                       uint32_t *               pd,
1582                       const uint32_t *         ps,
1583                       const uint32_t *         pm,
1584                       int                      w)
1585 {
1586     uint32_t s, m, d;
1587 
1588     __m128i xmm_alpha_lo, xmm_alpha_hi;
1589     __m128i xmm_src_lo, xmm_src_hi;
1590     __m128i xmm_dst_lo, xmm_dst_hi;
1591     __m128i xmm_mask_lo, xmm_mask_hi;
1592 
1593     while (w && (uintptr_t)pd & 15)
1594     {
1595 	s = *ps++;
1596 	m = *pm++;
1597 	d = *pd;
1598 
1599 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1600 	w--;
1601     }
1602 
1603     while (w >= 4)
1604     {
1605 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1606 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1607 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1608 
1609 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1610 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1611 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1612 
1613 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1614 			    &xmm_alpha_lo, &xmm_alpha_hi);
1615 
1616 	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1617 		       &xmm_alpha_lo, &xmm_alpha_hi,
1618 		       &xmm_mask_lo, &xmm_mask_hi,
1619 		       &xmm_dst_lo, &xmm_dst_hi);
1620 
1621 	save_128_aligned (
1622 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1623 
1624 	ps += 4;
1625 	pd += 4;
1626 	pm += 4;
1627 	w -= 4;
1628     }
1629 
1630     while (w)
1631     {
1632 	s = *ps++;
1633 	m = *pm++;
1634 	d = *pd;
1635 
1636 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1637 	w--;
1638     }
1639 }
1640 
1641 static force_inline uint32_t
1642 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1643                                          uint32_t mask,
1644                                          uint32_t dst)
1645 {
1646     __m128i d = unpack_32_1x128 (dst);
1647 
1648     return pack_1x128_32 (
1649 	over_1x128 (d, expand_alpha_1x128 (d),
1650 		    pix_multiply_1x128 (unpack_32_1x128 (src),
1651 					unpack_32_1x128 (mask))));
1652 }
1653 
1654 static void
1655 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1656                               pixman_op_t              op,
1657                               uint32_t *               pd,
1658                               const uint32_t *         ps,
1659                               const uint32_t *         pm,
1660                               int                      w)
1661 {
1662     uint32_t s, m, d;
1663 
1664     __m128i xmm_alpha_lo, xmm_alpha_hi;
1665     __m128i xmm_src_lo, xmm_src_hi;
1666     __m128i xmm_dst_lo, xmm_dst_hi;
1667     __m128i xmm_mask_lo, xmm_mask_hi;
1668 
1669     while (w && (uintptr_t)pd & 15)
1670     {
1671 	s = *ps++;
1672 	m = *pm++;
1673 	d = *pd;
1674 
1675 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1676 	w--;
1677     }
1678 
1679     while (w >= 4)
1680     {
1681 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1682 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1683 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1684 
1685 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1686 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1687 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1688 
1689 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1690 			    &xmm_alpha_lo, &xmm_alpha_hi);
1691 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1692 			    &xmm_mask_lo, &xmm_mask_hi,
1693 			    &xmm_mask_lo, &xmm_mask_hi);
1694 
1695 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1696 		    &xmm_alpha_lo, &xmm_alpha_hi,
1697 		    &xmm_mask_lo, &xmm_mask_hi);
1698 
1699 	save_128_aligned (
1700 	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1701 
1702 	ps += 4;
1703 	pd += 4;
1704 	pm += 4;
1705 	w -= 4;
1706     }
1707 
1708     while (w)
1709     {
1710 	s = *ps++;
1711 	m = *pm++;
1712 	d = *pd;
1713 
1714 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1715 	w--;
1716     }
1717 }
1718 
1719 static void
1720 sse2_combine_in_ca (pixman_implementation_t *imp,
1721                     pixman_op_t              op,
1722                     uint32_t *               pd,
1723                     const uint32_t *         ps,
1724                     const uint32_t *         pm,
1725                     int                      w)
1726 {
1727     uint32_t s, m, d;
1728 
1729     __m128i xmm_alpha_lo, xmm_alpha_hi;
1730     __m128i xmm_src_lo, xmm_src_hi;
1731     __m128i xmm_dst_lo, xmm_dst_hi;
1732     __m128i xmm_mask_lo, xmm_mask_hi;
1733 
1734     while (w && (uintptr_t)pd & 15)
1735     {
1736 	s = *ps++;
1737 	m = *pm++;
1738 	d = *pd;
1739 
1740 	*pd++ = pack_1x128_32 (
1741 	    pix_multiply_1x128 (
1742 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1743 		expand_alpha_1x128 (unpack_32_1x128 (d))));
1744 
1745 	w--;
1746     }
1747 
1748     while (w >= 4)
1749     {
1750 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1751 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1752 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1753 
1754 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1755 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1756 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1757 
1758 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1759 			    &xmm_alpha_lo, &xmm_alpha_hi);
1760 
1761 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1762 			    &xmm_mask_lo, &xmm_mask_hi,
1763 			    &xmm_dst_lo, &xmm_dst_hi);
1764 
1765 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1766 			    &xmm_alpha_lo, &xmm_alpha_hi,
1767 			    &xmm_dst_lo, &xmm_dst_hi);
1768 
1769 	save_128_aligned (
1770 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1771 
1772 	ps += 4;
1773 	pd += 4;
1774 	pm += 4;
1775 	w -= 4;
1776     }
1777 
1778     while (w)
1779     {
1780 	s = *ps++;
1781 	m = *pm++;
1782 	d = *pd;
1783 
1784 	*pd++ = pack_1x128_32 (
1785 	    pix_multiply_1x128 (
1786 		pix_multiply_1x128 (
1787 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1788 		expand_alpha_1x128 (unpack_32_1x128 (d))));
1789 
1790 	w--;
1791     }
1792 }
1793 
1794 static void
1795 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1796                             pixman_op_t              op,
1797                             uint32_t *               pd,
1798                             const uint32_t *         ps,
1799                             const uint32_t *         pm,
1800                             int                      w)
1801 {
1802     uint32_t s, m, d;
1803 
1804     __m128i xmm_alpha_lo, xmm_alpha_hi;
1805     __m128i xmm_src_lo, xmm_src_hi;
1806     __m128i xmm_dst_lo, xmm_dst_hi;
1807     __m128i xmm_mask_lo, xmm_mask_hi;
1808 
1809     while (w && (uintptr_t)pd & 15)
1810     {
1811 	s = *ps++;
1812 	m = *pm++;
1813 	d = *pd;
1814 
1815 	*pd++ = pack_1x128_32 (
1816 	    pix_multiply_1x128 (
1817 		unpack_32_1x128 (d),
1818 		pix_multiply_1x128 (unpack_32_1x128 (m),
1819 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1820 	w--;
1821     }
1822 
1823     while (w >= 4)
1824     {
1825 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1826 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1827 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1828 
1829 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1830 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1831 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1832 
1833 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1834 			    &xmm_alpha_lo, &xmm_alpha_hi);
1835 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1836 			    &xmm_alpha_lo, &xmm_alpha_hi,
1837 			    &xmm_alpha_lo, &xmm_alpha_hi);
1838 
1839 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1840 			    &xmm_alpha_lo, &xmm_alpha_hi,
1841 			    &xmm_dst_lo, &xmm_dst_hi);
1842 
1843 	save_128_aligned (
1844 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1845 
1846 	ps += 4;
1847 	pd += 4;
1848 	pm += 4;
1849 	w -= 4;
1850     }
1851 
1852     while (w)
1853     {
1854 	s = *ps++;
1855 	m = *pm++;
1856 	d = *pd;
1857 
1858 	*pd++ = pack_1x128_32 (
1859 	    pix_multiply_1x128 (
1860 		unpack_32_1x128 (d),
1861 		pix_multiply_1x128 (unpack_32_1x128 (m),
1862 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1863 	w--;
1864     }
1865 }
1866 
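/* Component-alpha OUT: dest = (src * mask) * (1 - dest.alpha). */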
1867 static void
1868 sse2_combine_out_ca (pixman_implementation_t *imp,
1869                      pixman_op_t              op,
1870                      uint32_t *               pd,
1871                      const uint32_t *         ps,
1872                      const uint32_t *         pm,
1873                      int                      w)
1874 {
1875     uint32_t s, m, d;
1876 
1877     __m128i xmm_alpha_lo, xmm_alpha_hi;
1878     __m128i xmm_src_lo, xmm_src_hi;
1879     __m128i xmm_dst_lo, xmm_dst_hi;
1880     __m128i xmm_mask_lo, xmm_mask_hi;
1881 
1882     while (w && (uintptr_t)pd & 15)
1883     {
1884 	s = *ps++;
1885 	m = *pm++;
1886 	d = *pd;
1887 
1888 	*pd++ = pack_1x128_32 (
1889 	    pix_multiply_1x128 (
1890 		pix_multiply_1x128 (
1891 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1892 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1893 	w--;
1894     }
1895 
1896     while (w >= 4)
1897     {
1898 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1899 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1900 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1901 
1902 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1903 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1904 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1905 
1906 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1907 			    &xmm_alpha_lo, &xmm_alpha_hi);
1908 	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1909 		      &xmm_alpha_lo, &xmm_alpha_hi);
1910 
1911 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1912 			    &xmm_mask_lo, &xmm_mask_hi,
1913 			    &xmm_dst_lo, &xmm_dst_hi);
1914 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1915 			    &xmm_alpha_lo, &xmm_alpha_hi,
1916 			    &xmm_dst_lo, &xmm_dst_hi);
1917 
1918 	save_128_aligned (
1919 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1920 
1921 	ps += 4;
1922 	pd += 4;
1923 	pm += 4;
1924 	w -= 4;
1925     }
1926 
1927     while (w)
1928     {
1929 	s = *ps++;
1930 	m = *pm++;
1931 	d = *pd;
1932 
1933 	*pd++ = pack_1x128_32 (
1934 	    pix_multiply_1x128 (
1935 		pix_multiply_1x128 (
1936 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1937 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1938 
1939 	w--;
1940     }
1941 }
1942 
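/* Component-alpha OUT reverse: dest = dest * (1 - mask * src.alpha). */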
1943 static void
1944 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1945                              pixman_op_t              op,
1946                              uint32_t *               pd,
1947                              const uint32_t *         ps,
1948                              const uint32_t *         pm,
1949                              int                      w)
1950 {
1951     uint32_t s, m, d;
1952 
1953     __m128i xmm_alpha_lo, xmm_alpha_hi;
1954     __m128i xmm_src_lo, xmm_src_hi;
1955     __m128i xmm_dst_lo, xmm_dst_hi;
1956     __m128i xmm_mask_lo, xmm_mask_hi;
1957 
1958     while (w && (uintptr_t)pd & 15)
1959     {
1960 	s = *ps++;
1961 	m = *pm++;
1962 	d = *pd;
1963 
1964 	*pd++ = pack_1x128_32 (
1965 	    pix_multiply_1x128 (
1966 		unpack_32_1x128 (d),
1967 		negate_1x128 (pix_multiply_1x128 (
1968 				 unpack_32_1x128 (m),
1969 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1970 	w--;
1971     }
1972 
1973     while (w >= 4)
1974     {
1975 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978 
1979 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982 
1983 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984 			    &xmm_alpha_lo, &xmm_alpha_hi);
1985 
1986 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1987 			    &xmm_alpha_lo, &xmm_alpha_hi,
1988 			    &xmm_mask_lo, &xmm_mask_hi);
1989 
1990 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1991 		      &xmm_mask_lo, &xmm_mask_hi);
1992 
1993 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1994 			    &xmm_mask_lo, &xmm_mask_hi,
1995 			    &xmm_dst_lo, &xmm_dst_hi);
1996 
1997 	save_128_aligned (
1998 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1999 
2000 	ps += 4;
2001 	pd += 4;
2002 	pm += 4;
2003 	w -= 4;
2004     }
2005 
2006     while (w)
2007     {
2008 	s = *ps++;
2009 	m = *pm++;
2010 	d = *pd;
2011 
2012 	*pd++ = pack_1x128_32 (
2013 	    pix_multiply_1x128 (
2014 		unpack_32_1x128 (d),
2015 		negate_1x128 (pix_multiply_1x128 (
2016 				 unpack_32_1x128 (m),
2017 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2018 	w--;
2019     }
2020 }
2021 
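/* Single-pixel helper for component-alpha ATOP:
 * dest = (src * mask) * dest.alpha + dest * (1 - mask * src.alpha)
 */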
2022 static force_inline uint32_t
2023 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2024                                  uint32_t mask,
2025                                  uint32_t dst)
2026 {
2027     __m128i m = unpack_32_1x128 (mask);
2028     __m128i s = unpack_32_1x128 (src);
2029     __m128i d = unpack_32_1x128 (dst);
2030     __m128i sa = expand_alpha_1x128 (s);
2031     __m128i da = expand_alpha_1x128 (d);
2032 
2033     s = pix_multiply_1x128 (s, m);
2034     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035 
2036     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2037 }
2038 
2039 static void
2040 sse2_combine_atop_ca (pixman_implementation_t *imp,
2041                       pixman_op_t              op,
2042                       uint32_t *               pd,
2043                       const uint32_t *         ps,
2044                       const uint32_t *         pm,
2045                       int                      w)
2046 {
2047     uint32_t s, m, d;
2048 
2049     __m128i xmm_src_lo, xmm_src_hi;
2050     __m128i xmm_dst_lo, xmm_dst_hi;
2051     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2052     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2053     __m128i xmm_mask_lo, xmm_mask_hi;
2054 
2055     while (w && (uintptr_t)pd & 15)
2056     {
2057 	s = *ps++;
2058 	m = *pm++;
2059 	d = *pd;
2060 
2061 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2062 	w--;
2063     }
2064 
2065     while (w >= 4)
2066     {
2067 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2068 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2069 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 
2071 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2072 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2073 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 
2075 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2076 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2077 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2078 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079 
2080 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2081 			    &xmm_mask_lo, &xmm_mask_hi,
2082 			    &xmm_src_lo, &xmm_src_hi);
2083 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2084 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2085 			    &xmm_mask_lo, &xmm_mask_hi);
2086 
2087 	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088 
2089 	pix_add_multiply_2x128 (
2090 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092 	    &xmm_dst_lo, &xmm_dst_hi);
2093 
2094 	save_128_aligned (
2095 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096 
2097 	ps += 4;
2098 	pd += 4;
2099 	pm += 4;
2100 	w -= 4;
2101     }
2102 
2103     while (w)
2104     {
2105 	s = *ps++;
2106 	m = *pm++;
2107 	d = *pd;
2108 
2109 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2110 	w--;
2111     }
2112 }
2113 
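/* Single-pixel helper for component-alpha ATOP reverse:
 * dest = (src * mask) * (1 - dest.alpha) + dest * (mask * src.alpha)
 */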
2114 static force_inline uint32_t
2115 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2116                                          uint32_t mask,
2117                                          uint32_t dst)
2118 {
2119     __m128i m = unpack_32_1x128 (mask);
2120     __m128i s = unpack_32_1x128 (src);
2121     __m128i d = unpack_32_1x128 (dst);
2122 
2123     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2124     __m128i sa = expand_alpha_1x128 (s);
2125 
2126     s = pix_multiply_1x128 (s, m);
2127     m = pix_multiply_1x128 (m, sa);
2128 
2129     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2130 }
2131 
2132 static void
2133 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2134                               pixman_op_t              op,
2135                               uint32_t *               pd,
2136                               const uint32_t *         ps,
2137                               const uint32_t *         pm,
2138                               int                      w)
2139 {
2140     uint32_t s, m, d;
2141 
2142     __m128i xmm_src_lo, xmm_src_hi;
2143     __m128i xmm_dst_lo, xmm_dst_hi;
2144     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146     __m128i xmm_mask_lo, xmm_mask_hi;
2147 
2148     while (w && (uintptr_t)pd & 15)
2149     {
2150 	s = *ps++;
2151 	m = *pm++;
2152 	d = *pd;
2153 
2154 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2155 	w--;
2156     }
2157 
2158     while (w >= 4)
2159     {
2160 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163 
2164 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167 
2168 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172 
2173 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174 			    &xmm_mask_lo, &xmm_mask_hi,
2175 			    &xmm_src_lo, &xmm_src_hi);
2176 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178 			    &xmm_mask_lo, &xmm_mask_hi);
2179 
2180 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 
2183 	pix_add_multiply_2x128 (
2184 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2185 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2186 	    &xmm_dst_lo, &xmm_dst_hi);
2187 
2188 	save_128_aligned (
2189 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190 
2191 	ps += 4;
2192 	pd += 4;
2193 	pm += 4;
2194 	w -= 4;
2195     }
2196 
2197     while (w)
2198     {
2199 	s = *ps++;
2200 	m = *pm++;
2201 	d = *pd;
2202 
2203 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2204 	w--;
2205     }
2206 }
2207 
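/* Single-pixel helper for component-alpha XOR:
 * dest = (src * mask) * (1 - dest.alpha) + dest * (1 - mask * src.alpha)
 */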
2208 static force_inline uint32_t
2209 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2210                                 uint32_t mask,
2211                                 uint32_t dst)
2212 {
2213     __m128i a = unpack_32_1x128 (mask);
2214     __m128i s = unpack_32_1x128 (src);
2215     __m128i d = unpack_32_1x128 (dst);
2216 
2217     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2218 				       a, expand_alpha_1x128 (s)));
2219     __m128i dest      = pix_multiply_1x128 (s, a);
2220     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221 
2222     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2223                                                 &alpha_dst,
2224                                                 &dest,
2225                                                 &alpha_src));
2226 }
2227 
2228 static void
2229 sse2_combine_xor_ca (pixman_implementation_t *imp,
2230                      pixman_op_t              op,
2231                      uint32_t *               pd,
2232                      const uint32_t *         ps,
2233                      const uint32_t *         pm,
2234                      int                      w)
2235 {
2236     uint32_t s, m, d;
2237 
2238     __m128i xmm_src_lo, xmm_src_hi;
2239     __m128i xmm_dst_lo, xmm_dst_hi;
2240     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2241     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2242     __m128i xmm_mask_lo, xmm_mask_hi;
2243 
2244     while (w && (uintptr_t)pd & 15)
2245     {
2246 	s = *ps++;
2247 	m = *pm++;
2248 	d = *pd;
2249 
2250 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2251 	w--;
2252     }
2253 
2254     while (w >= 4)
2255     {
2256 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2257 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2258 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259 
2260 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2261 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2262 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263 
2264 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2265 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2266 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2267 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268 
2269 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2270 			    &xmm_mask_lo, &xmm_mask_hi,
2271 			    &xmm_src_lo, &xmm_src_hi);
2272 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2273 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2274 			    &xmm_mask_lo, &xmm_mask_hi);
2275 
2276 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2277 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2278 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2279 		      &xmm_mask_lo, &xmm_mask_hi);
2280 
2281 	pix_add_multiply_2x128 (
2282 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2283 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2284 	    &xmm_dst_lo, &xmm_dst_hi);
2285 
2286 	save_128_aligned (
2287 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2288 
2289 	ps += 4;
2290 	pd += 4;
2291 	pm += 4;
2292 	w -= 4;
2293     }
2294 
2295     while (w)
2296     {
2297 	s = *ps++;
2298 	m = *pm++;
2299 	d = *pd;
2300 
2301 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2302 	w--;
2303     }
2304 }
2305 
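/* Component-alpha ADD: dest = clamp (src * mask + dest), using the
 * saturating byte add _mm_adds_epu8 so overflowing channels stick at 0xff.
 */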
2306 static void
2307 sse2_combine_add_ca (pixman_implementation_t *imp,
2308                      pixman_op_t              op,
2309                      uint32_t *               pd,
2310                      const uint32_t *         ps,
2311                      const uint32_t *         pm,
2312                      int                      w)
2313 {
2314     uint32_t s, m, d;
2315 
2316     __m128i xmm_src_lo, xmm_src_hi;
2317     __m128i xmm_dst_lo, xmm_dst_hi;
2318     __m128i xmm_mask_lo, xmm_mask_hi;
2319 
2320     while (w && (uintptr_t)pd & 15)
2321     {
2322 	s = *ps++;
2323 	m = *pm++;
2324 	d = *pd;
2325 
2326 	*pd++ = pack_1x128_32 (
2327 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2328 					       unpack_32_1x128 (m)),
2329 			   unpack_32_1x128 (d)));
2330 	w--;
2331     }
2332 
2333     while (w >= 4)
2334     {
2335 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2336 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2337 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338 
2339 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2340 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2341 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342 
2343 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2344 			    &xmm_mask_lo, &xmm_mask_hi,
2345 			    &xmm_src_lo, &xmm_src_hi);
2346 
2347 	save_128_aligned (
2348 	    (__m128i*)pd, pack_2x128_128 (
2349 		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2350 		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2351 
2352 	ps += 4;
2353 	pd += 4;
2354 	pm += 4;
2355 	w -= 4;
2356     }
2357 
2358     while (w)
2359     {
2360 	s = *ps++;
2361 	m = *pm++;
2362 	d = *pd;
2363 
2364 	*pd++ = pack_1x128_32 (
2365 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2366 					       unpack_32_1x128 (m)),
2367 			   unpack_32_1x128 (d)));
2368 	w--;
2369     }
2370 }
2371 
2372 static force_inline __m128i
2373 create_mask_16_128 (uint16_t mask)
2374 {
2375     return _mm_set1_epi16 (mask);
2376 }
2377 
2378 /* Work around a code generation bug in Sun Studio 12. */
2379 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2380 # define create_mask_2x32_128(mask0, mask1)				\
2381     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2382 #else
2383 static force_inline __m128i
2384 create_mask_2x32_128 (uint32_t mask0,
2385                       uint32_t mask1)
2386 {
2387     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2388 }
2389 #endif
2390 
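/* OVER of a solid color onto a8r8g8b8: dest = src + dest * (1 - src.alpha).
 * The source and its expanded alpha are computed once outside the loops;
 * each scanline is processed one pixel at a time until dst is 16-byte
 * aligned, then four pixels per SSE2 iteration, then a scalar tail.
 */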
2391 static void
2392 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2393                             pixman_composite_info_t *info)
2394 {
2395     PIXMAN_COMPOSITE_ARGS (info);
2396     uint32_t src;
2397     uint32_t    *dst_line, *dst, d;
2398     int32_t w;
2399     int dst_stride;
2400     __m128i xmm_src, xmm_alpha;
2401     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402 
2403     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2404 
2405     if (src == 0)
2406 	return;
2407 
2408     PIXMAN_IMAGE_GET_LINE (
2409 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2410 
2411     xmm_src = expand_pixel_32_1x128 (src);
2412     xmm_alpha = expand_alpha_1x128 (xmm_src);
2413 
2414     while (height--)
2415     {
2416 	dst = dst_line;
2417 
2418 	dst_line += dst_stride;
2419 	w = width;
2420 
2421 	while (w && (uintptr_t)dst & 15)
2422 	{
2423 	    d = *dst;
2424 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2425 						xmm_alpha,
2426 						unpack_32_1x128 (d)));
2427 	    w--;
2428 	}
2429 
2430 	while (w >= 4)
2431 	{
2432 	    xmm_dst = load_128_aligned ((__m128i*)dst);
2433 
2434 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435 
2436 	    over_2x128 (&xmm_src, &xmm_src,
2437 			&xmm_alpha, &xmm_alpha,
2438 			&xmm_dst_lo, &xmm_dst_hi);
2439 
2440 	    /* rebuild the 4 pixel data and save */
2441 	    save_128_aligned (
2442 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2443 
2444 	    w -= 4;
2445 	    dst += 4;
2446 	}
2447 
2448 	while (w)
2449 	{
2450 	    d = *dst;
2451 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2452 						xmm_alpha,
2453 						unpack_32_1x128 (d)));
2454 	    w--;
2455 	}
2456 
2457     }
2458 }
2459 
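/* OVER of a solid color onto r5g6b5.  Destination pixels are expanded from
 * 565 to 8888, blended, and packed back; the vector loop handles eight
 * 16-bit pixels (one aligned 128-bit load/store) per iteration.
 */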
2460 static void
2461 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2462                             pixman_composite_info_t *info)
2463 {
2464     PIXMAN_COMPOSITE_ARGS (info);
2465     uint32_t src;
2466     uint16_t    *dst_line, *dst, d;
2467     int32_t w;
2468     int dst_stride;
2469     __m128i xmm_src, xmm_alpha;
2470     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471 
2472     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2473 
2474     if (src == 0)
2475 	return;
2476 
2477     PIXMAN_IMAGE_GET_LINE (
2478 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479 
2480     xmm_src = expand_pixel_32_1x128 (src);
2481     xmm_alpha = expand_alpha_1x128 (xmm_src);
2482 
2483     while (height--)
2484     {
2485 	dst = dst_line;
2486 
2487 	dst_line += dst_stride;
2488 	w = width;
2489 
2490 	while (w && (uintptr_t)dst & 15)
2491 	{
2492 	    d = *dst;
2493 
2494 	    *dst++ = pack_565_32_16 (
2495 		pack_1x128_32 (over_1x128 (xmm_src,
2496 					   xmm_alpha,
2497 					   expand565_16_1x128 (d))));
2498 	    w--;
2499 	}
2500 
2501 	while (w >= 8)
2502 	{
2503 	    xmm_dst = load_128_aligned ((__m128i*)dst);
2504 
2505 	    unpack_565_128_4x128 (xmm_dst,
2506 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507 
2508 	    over_2x128 (&xmm_src, &xmm_src,
2509 			&xmm_alpha, &xmm_alpha,
2510 			&xmm_dst0, &xmm_dst1);
2511 	    over_2x128 (&xmm_src, &xmm_src,
2512 			&xmm_alpha, &xmm_alpha,
2513 			&xmm_dst2, &xmm_dst3);
2514 
2515 	    xmm_dst = pack_565_4x128_128 (
2516 		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517 
2518 	    save_128_aligned ((__m128i*)dst, xmm_dst);
2519 
2520 	    dst += 8;
2521 	    w -= 8;
2522 	}
2523 
2524 	while (w--)
2525 	{
2526 	    d = *dst;
2527 	    *dst++ = pack_565_32_16 (
2528 		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2529 					   expand565_16_1x128 (d))));
2530 	}
2531     }
2532 
2533 }
2534 
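/* ADD of a solid color through an a8r8g8b8 component-alpha mask:
 * dest = clamp (mask * src + dest).  Groups of four mask pixels that are
 * entirely zero are skipped without touching the destination.
 */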
2535 static void
2536 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2537 				   pixman_composite_info_t *info)
2538 {
2539     PIXMAN_COMPOSITE_ARGS (info);
2540     uint32_t src;
2541     uint32_t    *dst_line, d;
2542     uint32_t    *mask_line, m;
2543     uint32_t pack_cmp;
2544     int dst_stride, mask_stride;
2545 
2546     __m128i xmm_src;
2547     __m128i xmm_dst;
2548     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549 
2550     __m128i mmx_src, mmx_mask, mmx_dest;
2551 
2552     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553 
2554     if (src == 0)
2555 	return;
2556 
2557     PIXMAN_IMAGE_GET_LINE (
2558 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2559     PIXMAN_IMAGE_GET_LINE (
2560 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561 
2562     xmm_src = _mm_unpacklo_epi8 (
2563 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2564     mmx_src   = xmm_src;
2565 
2566     while (height--)
2567     {
2568 	int w = width;
2569 	const uint32_t *pm = (uint32_t *)mask_line;
2570 	uint32_t *pd = (uint32_t *)dst_line;
2571 
2572 	dst_line += dst_stride;
2573 	mask_line += mask_stride;
2574 
2575 	while (w && (uintptr_t)pd & 15)
2576 	{
2577 	    m = *pm++;
2578 
2579 	    if (m)
2580 	    {
2581 		d = *pd;
2582 
2583 		mmx_mask = unpack_32_1x128 (m);
2584 		mmx_dest = unpack_32_1x128 (d);
2585 
2586 		*pd = pack_1x128_32 (
2587 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2588 				   mmx_dest));
2589 	    }
2590 
2591 	    pd++;
2592 	    w--;
2593 	}
2594 
2595 	while (w >= 4)
2596 	{
2597 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2598 
2599 	    pack_cmp =
2600 		_mm_movemask_epi8 (
2601 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602 
2603 	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2604 	    if (pack_cmp != 0xffff)
2605 	    {
2606 		xmm_dst = load_128_aligned ((__m128i*)pd);
2607 
2608 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609 
2610 		pix_multiply_2x128 (&xmm_src, &xmm_src,
2611 				    &xmm_mask_lo, &xmm_mask_hi,
2612 				    &xmm_mask_lo, &xmm_mask_hi);
2613 		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614 
2615 		save_128_aligned (
2616 		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2617 	    }
2618 
2619 	    pd += 4;
2620 	    pm += 4;
2621 	    w -= 4;
2622 	}
2623 
2624 	while (w)
2625 	{
2626 	    m = *pm++;
2627 
2628 	    if (m)
2629 	    {
2630 		d = *pd;
2631 
2632 		mmx_mask = unpack_32_1x128 (m);
2633 		mmx_dest = unpack_32_1x128 (d);
2634 
2635 		*pd = pack_1x128_32 (
2636 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2637 				   mmx_dest));
2638 	    }
2639 
2640 	    pd++;
2641 	    w--;
2642 	}
2643     }
2644 
2645 }
2646 
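/* OVER of a solid color through an a8r8g8b8 component-alpha mask:
 * dest = mask * src + dest * (1 - mask * src.alpha), evaluated per channel
 * by in_over_*.  As above, all-zero mask quads are skipped.
 */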
2647 static void
2648 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2649                                     pixman_composite_info_t *info)
2650 {
2651     PIXMAN_COMPOSITE_ARGS (info);
2652     uint32_t src;
2653     uint32_t    *dst_line, d;
2654     uint32_t    *mask_line, m;
2655     uint32_t pack_cmp;
2656     int dst_stride, mask_stride;
2657 
2658     __m128i xmm_src, xmm_alpha;
2659     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2660     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661 
2662     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663 
2664     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2665 
2666     if (src == 0)
2667 	return;
2668 
2669     PIXMAN_IMAGE_GET_LINE (
2670 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2671     PIXMAN_IMAGE_GET_LINE (
2672 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2673 
2674     xmm_src = _mm_unpacklo_epi8 (
2675 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2676     xmm_alpha = expand_alpha_1x128 (xmm_src);
2677     mmx_src   = xmm_src;
2678     mmx_alpha = xmm_alpha;
2679 
2680     while (height--)
2681     {
2682 	int w = width;
2683 	const uint32_t *pm = (uint32_t *)mask_line;
2684 	uint32_t *pd = (uint32_t *)dst_line;
2685 
2686 	dst_line += dst_stride;
2687 	mask_line += mask_stride;
2688 
2689 	while (w && (uintptr_t)pd & 15)
2690 	{
2691 	    m = *pm++;
2692 
2693 	    if (m)
2694 	    {
2695 		d = *pd;
2696 		mmx_mask = unpack_32_1x128 (m);
2697 		mmx_dest = unpack_32_1x128 (d);
2698 
2699 		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2700 		                                  &mmx_alpha,
2701 		                                  &mmx_mask,
2702 		                                  &mmx_dest));
2703 	    }
2704 
2705 	    pd++;
2706 	    w--;
2707 	}
2708 
2709 	while (w >= 4)
2710 	{
2711 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2712 
2713 	    pack_cmp =
2714 		_mm_movemask_epi8 (
2715 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716 
2717 	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2718 	    if (pack_cmp != 0xffff)
2719 	    {
2720 		xmm_dst = load_128_aligned ((__m128i*)pd);
2721 
2722 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2723 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724 
2725 		in_over_2x128 (&xmm_src, &xmm_src,
2726 			       &xmm_alpha, &xmm_alpha,
2727 			       &xmm_mask_lo, &xmm_mask_hi,
2728 			       &xmm_dst_lo, &xmm_dst_hi);
2729 
2730 		save_128_aligned (
2731 		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2732 	    }
2733 
2734 	    pd += 4;
2735 	    pm += 4;
2736 	    w -= 4;
2737 	}
2738 
2739 	while (w)
2740 	{
2741 	    m = *pm++;
2742 
2743 	    if (m)
2744 	    {
2745 		d = *pd;
2746 		mmx_mask = unpack_32_1x128 (m);
2747 		mmx_dest = unpack_32_1x128 (d);
2748 
2749 		*pd = pack_1x128_32 (
2750 		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2751 	    }
2752 
2753 	    pd++;
2754 	    w--;
2755 	}
2756     }
2757 
2758 }
2759 
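/* OVER of an a8r8g8b8 source onto a8r8g8b8 through a solid mask.  Only the
 * alpha byte of the solid mask matters; it is replicated into xmm_mask and
 * fed to in_over_*.  All-zero source quads are skipped.
 */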
2760 static void
2761 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2762                                  pixman_composite_info_t *info)
2763 {
2764     PIXMAN_COMPOSITE_ARGS (info);
2765     uint32_t    *dst_line, *dst;
2766     uint32_t    *src_line, *src;
2767     uint32_t mask;
2768     int32_t w;
2769     int dst_stride, src_stride;
2770 
2771     __m128i xmm_mask;
2772     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2773     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2774     __m128i xmm_alpha_lo, xmm_alpha_hi;
2775 
2776     PIXMAN_IMAGE_GET_LINE (
2777 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2778     PIXMAN_IMAGE_GET_LINE (
2779 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780 
2781     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2782 
2783     xmm_mask = create_mask_16_128 (mask >> 24);
2784 
2785     while (height--)
2786     {
2787 	dst = dst_line;
2788 	dst_line += dst_stride;
2789 	src = src_line;
2790 	src_line += src_stride;
2791 	w = width;
2792 
2793 	while (w && (uintptr_t)dst & 15)
2794 	{
2795 	    uint32_t s = *src++;
2796 
2797 	    if (s)
2798 	    {
2799 		uint32_t d = *dst;
2800 
2801 		__m128i ms = unpack_32_1x128 (s);
2802 		__m128i alpha    = expand_alpha_1x128 (ms);
2803 		__m128i dest     = xmm_mask;
2804 		__m128i alpha_dst = unpack_32_1x128 (d);
2805 
2806 		*dst = pack_1x128_32 (
2807 		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2808 	    }
2809 	    dst++;
2810 	    w--;
2811 	}
2812 
2813 	while (w >= 4)
2814 	{
2815 	    xmm_src = load_128_unaligned ((__m128i*)src);
2816 
2817 	    if (!is_zero (xmm_src))
2818 	    {
2819 		xmm_dst = load_128_aligned ((__m128i*)dst);
2820 
2821 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2822 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2823 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2824 				    &xmm_alpha_lo, &xmm_alpha_hi);
2825 
2826 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2827 			       &xmm_alpha_lo, &xmm_alpha_hi,
2828 			       &xmm_mask, &xmm_mask,
2829 			       &xmm_dst_lo, &xmm_dst_hi);
2830 
2831 		save_128_aligned (
2832 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2833 	    }
2834 
2835 	    dst += 4;
2836 	    src += 4;
2837 	    w -= 4;
2838 	}
2839 
2840 	while (w)
2841 	{
2842 	    uint32_t s = *src++;
2843 
2844 	    if (s)
2845 	    {
2846 		uint32_t d = *dst;
2847 
2848 		__m128i ms = unpack_32_1x128 (s);
2849 		__m128i alpha = expand_alpha_1x128 (ms);
2850 		__m128i mask  = xmm_mask;
2851 		__m128i dest  = unpack_32_1x128 (d);
2852 
2853 		*dst = pack_1x128_32 (
2854 		    in_over_1x128 (&ms, &alpha, &mask, &dest));
2855 	    }
2856 
2857 	    dst++;
2858 	    w--;
2859 	}
2860     }
2861 
2862 }
2863 
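/* SRC conversion from x8r8g8b8 to r5g6b5: eight pixels per iteration are
 * loaded as two unaligned 128-bit vectors and repacked into one aligned
 * 128-bit store of 565 pixels.
 */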
2864 static void
2865 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2866                               pixman_composite_info_t *info)
2867 {
2868     PIXMAN_COMPOSITE_ARGS (info);
2869     uint16_t    *dst_line, *dst;
2870     uint32_t    *src_line, *src, s;
2871     int dst_stride, src_stride;
2872     int32_t w;
2873 
2874     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2875     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2876 
2877     while (height--)
2878     {
2879 	dst = dst_line;
2880 	dst_line += dst_stride;
2881 	src = src_line;
2882 	src_line += src_stride;
2883 	w = width;
2884 
2885 	while (w && (uintptr_t)dst & 15)
2886 	{
2887 	    s = *src++;
2888 	    *dst = convert_8888_to_0565 (s);
2889 	    dst++;
2890 	    w--;
2891 	}
2892 
2893 	while (w >= 8)
2894 	{
2895 	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2896 	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897 
2898 	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2899 
2900 	    w -= 8;
2901 	    src += 8;
2902 	    dst += 8;
2903 	}
2904 
2905 	while (w)
2906 	{
2907 	    s = *src++;
2908 	    *dst = convert_8888_to_0565 (s);
2909 	    dst++;
2910 	    w--;
2911 	}
2912     }
2913 }
2914 
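/* SRC copy from x8r8g8b8 to a8r8g8b8: the alpha byte is forced to 0xff by
 * ORing with mask_ff000000, sixteen pixels per vector iteration.
 */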
2915 static void
2916 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2917 			      pixman_composite_info_t *info)
2918 {
2919     PIXMAN_COMPOSITE_ARGS (info);
2920     uint32_t    *dst_line, *dst;
2921     uint32_t    *src_line, *src;
2922     int32_t w;
2923     int dst_stride, src_stride;
2924 
2925 
2926     PIXMAN_IMAGE_GET_LINE (
2927 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2928     PIXMAN_IMAGE_GET_LINE (
2929 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2930 
2931     while (height--)
2932     {
2933 	dst = dst_line;
2934 	dst_line += dst_stride;
2935 	src = src_line;
2936 	src_line += src_stride;
2937 	w = width;
2938 
2939 	while (w && (uintptr_t)dst & 15)
2940 	{
2941 	    *dst++ = *src++ | 0xff000000;
2942 	    w--;
2943 	}
2944 
2945 	while (w >= 16)
2946 	{
2947 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948 
2949 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2950 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2951 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2952 	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953 
2954 	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2955 	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2956 	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2957 	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2958 
2959 	    dst += 16;
2960 	    src += 16;
2961 	    w -= 16;
2962 	}
2963 
2964 	while (w)
2965 	{
2966 	    *dst++ = *src++ | 0xff000000;
2967 	    w--;
2968 	}
2969     }
2970 
2971 }
2972 
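/* OVER of an x8r8g8b8 source (treated as opaque) onto a8r8g8b8 through a
 * solid mask.  The source alpha is forced to 0xff, so the expanded source
 * alpha is simply the constant mask_00ff.
 */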
2973 static void
2974 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2975                                  pixman_composite_info_t *info)
2976 {
2977     PIXMAN_COMPOSITE_ARGS (info);
2978     uint32_t    *dst_line, *dst;
2979     uint32_t    *src_line, *src;
2980     uint32_t mask;
2981     int dst_stride, src_stride;
2982     int32_t w;
2983 
2984     __m128i xmm_mask, xmm_alpha;
2985     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2986     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987 
2988     PIXMAN_IMAGE_GET_LINE (
2989 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2990     PIXMAN_IMAGE_GET_LINE (
2991 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992 
2993     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994 
2995     xmm_mask = create_mask_16_128 (mask >> 24);
2996     xmm_alpha = mask_00ff;
2997 
2998     while (height--)
2999     {
3000 	dst = dst_line;
3001 	dst_line += dst_stride;
3002 	src = src_line;
3003 	src_line += src_stride;
3004 	w = width;
3005 
3006 	while (w && (uintptr_t)dst & 15)
3007 	{
3008 	    uint32_t s = (*src++) | 0xff000000;
3009 	    uint32_t d = *dst;
3010 
3011 	    __m128i src   = unpack_32_1x128 (s);
3012 	    __m128i alpha = xmm_alpha;
3013 	    __m128i mask  = xmm_mask;
3014 	    __m128i dest  = unpack_32_1x128 (d);
3015 
3016 	    *dst++ = pack_1x128_32 (
3017 		in_over_1x128 (&src, &alpha, &mask, &dest));
3018 
3019 	    w--;
3020 	}
3021 
3022 	while (w >= 4)
3023 	{
3024 	    xmm_src = _mm_or_si128 (
3025 		load_128_unaligned ((__m128i*)src), mask_ff000000);
3026 	    xmm_dst = load_128_aligned ((__m128i*)dst);
3027 
3028 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3029 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030 
3031 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3032 			   &xmm_alpha, &xmm_alpha,
3033 			   &xmm_mask, &xmm_mask,
3034 			   &xmm_dst_lo, &xmm_dst_hi);
3035 
3036 	    save_128_aligned (
3037 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038 
3039 	    dst += 4;
3040 	    src += 4;
3041 	    w -= 4;
3042 
3043 	}
3044 
3045 	while (w)
3046 	{
3047 	    uint32_t s = (*src++) | 0xff000000;
3048 	    uint32_t d = *dst;
3049 
3050 	    __m128i src  = unpack_32_1x128 (s);
3051 	    __m128i alpha = xmm_alpha;
3052 	    __m128i mask  = xmm_mask;
3053 	    __m128i dest  = unpack_32_1x128 (d);
3054 
3055 	    *dst++ = pack_1x128_32 (
3056 		in_over_1x128 (&src, &alpha, &mask, &dest));
3057 
3058 	    w--;
3059 	}
3060     }
3061 
3062 }
3063 
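/* OVER of a8r8g8b8 onto a8r8g8b8 with no mask: each scanline is handed to
 * the generic sse2_combine_over_u combiner.
 */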
3064 static void
3065 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3066                                pixman_composite_info_t *info)
3067 {
3068     PIXMAN_COMPOSITE_ARGS (info);
3069     int dst_stride, src_stride;
3070     uint32_t    *dst_line, *dst;
3071     uint32_t    *src_line, *src;
3072 
3073     PIXMAN_IMAGE_GET_LINE (
3074 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3075     PIXMAN_IMAGE_GET_LINE (
3076 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077 
3078     dst = dst_line;
3079     src = src_line;
3080 
3081     while (height--)
3082     {
3083 	sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084 
3085 	dst += dst_stride;
3086 	src += src_stride;
3087     }
3088 }
3089 
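/* OVER of a single a8r8g8b8 pixel onto a single r5g6b5 pixel: the
 * destination is expanded to 8888, blended, and packed back to 565.
 */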
3090 static force_inline uint16_t
3091 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 {
3093     __m128i ms;
3094 
3095     ms = unpack_32_1x128 (src);
3096     return pack_565_32_16 (
3097 	pack_1x128_32 (
3098 	    over_1x128 (
3099 		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3100 }
3101 
3102 static void
3103 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3104                                pixman_composite_info_t *info)
3105 {
3106     PIXMAN_COMPOSITE_ARGS (info);
3107     uint16_t    *dst_line, *dst, d;
3108     uint32_t    *src_line, *src, s;
3109     int dst_stride, src_stride;
3110     int32_t w;
3111 
3112     __m128i xmm_alpha_lo, xmm_alpha_hi;
3113     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3114     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115 
3116     PIXMAN_IMAGE_GET_LINE (
3117 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3118     PIXMAN_IMAGE_GET_LINE (
3119 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3120 
3121     while (height--)
3122     {
3123 	dst = dst_line;
3124 	src = src_line;
3125 
3126 	dst_line += dst_stride;
3127 	src_line += src_stride;
3128 	w = width;
3129 
3130 	/* Align dst on a 16-byte boundary */
3131 	while (w &&
3132 	       ((uintptr_t)dst & 15))
3133 	{
3134 	    s = *src++;
3135 	    d = *dst;
3136 
3137 	    *dst++ = composite_over_8888_0565pixel (s, d);
3138 	    w--;
3139 	}
3140 
3141 	/* This is an 8-pixel loop */
3142 	while (w >= 8)
3143 	{
3144 	    /* The source pointer is not known to be 16-byte aligned here,
3145 	     * so load it unaligned.
3146 	     */
3147 	    xmm_src = load_128_unaligned ((__m128i*) src);
3148 	    xmm_dst = load_128_aligned ((__m128i*) dst);
3149 
3150 	    /* Unpacking */
3151 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3152 	    unpack_565_128_4x128 (xmm_dst,
3153 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3154 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3155 				&xmm_alpha_lo, &xmm_alpha_hi);
3156 
3157 	    /* Load the next 4 source pixels early so the memory read
3158 	     * overlaps with the blending of the first 4.
3159 	     */
3160 	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161 
3162 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3163 			&xmm_alpha_lo, &xmm_alpha_hi,
3164 			&xmm_dst0, &xmm_dst1);
3165 
3166 	    /* Unpacking */
3167 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3168 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3169 				&xmm_alpha_lo, &xmm_alpha_hi);
3170 
3171 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3172 			&xmm_alpha_lo, &xmm_alpha_hi,
3173 			&xmm_dst2, &xmm_dst3);
3174 
3175 	    save_128_aligned (
3176 		(__m128i*)dst, pack_565_4x128_128 (
3177 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3178 
3179 	    w -= 8;
3180 	    dst += 8;
3181 	    src += 8;
3182 	}
3183 
3184 	while (w--)
3185 	{
3186 	    s = *src++;
3187 	    d = *dst;
3188 
3189 	    *dst++ = composite_over_8888_0565pixel (s, d);
3190 	}
3191     }
3192 
3193 }
3194 
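/* OVER of a solid color onto a8r8g8b8 through an a8 mask.  The vector loop
 * reads four mask bytes at a time: if they are all 0xff and the source is
 * opaque the solid color is stored directly, if they are all zero the
 * destination is left untouched, otherwise in_over_* does the full blend.
 */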
3195 static void
3196 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3197                               pixman_composite_info_t *info)
3198 {
3199     PIXMAN_COMPOSITE_ARGS (info);
3200     uint32_t src, srca;
3201     uint32_t *dst_line, *dst;
3202     uint8_t *mask_line, *mask;
3203     int dst_stride, mask_stride;
3204     int32_t w;
3205     uint32_t d;
3206 
3207     __m128i xmm_src, xmm_alpha, xmm_def;
3208     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3209     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210 
3211     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212 
3213     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3214 
3215     srca = src >> 24;
3216     if (src == 0)
3217 	return;
3218 
3219     PIXMAN_IMAGE_GET_LINE (
3220 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3221     PIXMAN_IMAGE_GET_LINE (
3222 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3223 
3224     xmm_def = create_mask_2x32_128 (src, src);
3225     xmm_src = expand_pixel_32_1x128 (src);
3226     xmm_alpha = expand_alpha_1x128 (xmm_src);
3227     mmx_src   = xmm_src;
3228     mmx_alpha = xmm_alpha;
3229 
3230     while (height--)
3231     {
3232 	dst = dst_line;
3233 	dst_line += dst_stride;
3234 	mask = mask_line;
3235 	mask_line += mask_stride;
3236 	w = width;
3237 
3238 	while (w && (uintptr_t)dst & 15)
3239 	{
3240 	    uint8_t m = *mask++;
3241 
3242 	    if (m)
3243 	    {
3244 		d = *dst;
3245 		mmx_mask = expand_pixel_8_1x128 (m);
3246 		mmx_dest = unpack_32_1x128 (d);
3247 
3248 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3249 		                                   &mmx_alpha,
3250 		                                   &mmx_mask,
3251 		                                   &mmx_dest));
3252 	    }
3253 
3254 	    w--;
3255 	    dst++;
3256 	}
3257 
3258 	while (w >= 4)
3259 	{
3260             uint32_t m;
3261             memcpy(&m, mask, sizeof(uint32_t));
3262 
3263 	    if (srca == 0xff && m == 0xffffffff)
3264 	    {
3265 		save_128_aligned ((__m128i*)dst, xmm_def);
3266 	    }
3267 	    else if (m)
3268 	    {
3269 		xmm_dst = load_128_aligned ((__m128i*) dst);
3270 		xmm_mask = unpack_32_1x128 (m);
3271 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3272 
3273 		/* Unpacking */
3274 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3275 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3276 
3277 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3278 					&xmm_mask_lo, &xmm_mask_hi);
3279 
3280 		in_over_2x128 (&xmm_src, &xmm_src,
3281 			       &xmm_alpha, &xmm_alpha,
3282 			       &xmm_mask_lo, &xmm_mask_hi,
3283 			       &xmm_dst_lo, &xmm_dst_hi);
3284 
3285 		save_128_aligned (
3286 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3287 	    }
3288 
3289 	    w -= 4;
3290 	    dst += 4;
3291 	    mask += 4;
3292 	}
3293 
3294 	while (w)
3295 	{
3296 	    uint8_t m = *mask++;
3297 
3298 	    if (m)
3299 	    {
3300 		d = *dst;
3301 		mmx_mask = expand_pixel_8_1x128 (m);
3302 		mmx_dest = unpack_32_1x128 (d);
3303 
3304 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3305 		                                   &mmx_alpha,
3306 		                                   &mmx_mask,
3307 		                                   &mmx_dest));
3308 	    }
3309 
3310 	    w--;
3311 	    dst++;
3312 	}
3313     }
3314 
3315 }
3316 
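/* Solid fill for 8, 16 and 32 bpp destinations.  The filler is replicated
 * to 32 bits, the start of each line is aligned with byte/word/dword
 * stores, and the bulk is written with aligned 128-bit stores in chunks of
 * 128, 64, 32 and 16 bytes before a small scalar tail.
 */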
3317 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3318 __attribute__((__force_align_arg_pointer__))
3319 #endif
3320 static pixman_bool_t
3321 sse2_fill (pixman_implementation_t *imp,
3322            uint32_t *               bits,
3323            int                      stride,
3324            int                      bpp,
3325            int                      x,
3326            int                      y,
3327            int                      width,
3328            int                      height,
3329            uint32_t		    filler)
3330 {
3331     uint32_t byte_width;
3332     uint8_t *byte_line;
3333 
3334     __m128i xmm_def;
3335 
3336     if (bpp == 8)
3337     {
3338 	uint32_t b;
3339 	uint32_t w;
3340 
3341 	stride = stride * (int) sizeof (uint32_t) / 1;
3342 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3343 	byte_width = width;
3344 	stride *= 1;
3345 
3346 	b = filler & 0xff;
3347 	w = (b << 8) | b;
3348 	filler = (w << 16) | w;
3349     }
3350     else if (bpp == 16)
3351     {
3352 	stride = stride * (int) sizeof (uint32_t) / 2;
3353 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3354 	byte_width = 2 * width;
3355 	stride *= 2;
3356 
3357         filler = (filler & 0xffff) * 0x00010001;
3358     }
3359     else if (bpp == 32)
3360     {
3361 	stride = stride * (int) sizeof (uint32_t) / 4;
3362 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3363 	byte_width = 4 * width;
3364 	stride *= 4;
3365     }
3366     else
3367     {
3368 	return FALSE;
3369     }
3370 
3371     xmm_def = create_mask_2x32_128 (filler, filler);
3372 
3373     while (height--)
3374     {
3375 	int w;
3376 	uint8_t *d = byte_line;
3377 	byte_line += stride;
3378 	w = byte_width;
3379 
3380 	if (w >= 1 && ((uintptr_t)d & 1))
3381 	{
3382 	    *(uint8_t *)d = filler;
3383 	    w -= 1;
3384 	    d += 1;
3385 	}
3386 
3387 	while (w >= 2 && ((uintptr_t)d & 3))
3388 	{
3389 	    *(uint16_t *)d = filler;
3390 	    w -= 2;
3391 	    d += 2;
3392 	}
3393 
3394 	while (w >= 4 && ((uintptr_t)d & 15))
3395 	{
3396 	    *(uint32_t *)d = filler;
3397 
3398 	    w -= 4;
3399 	    d += 4;
3400 	}
3401 
3402 	while (w >= 128)
3403 	{
3404 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3405 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3406 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3407 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3408 	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3409 	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3410 	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3411 	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
3412 
3413 	    d += 128;
3414 	    w -= 128;
3415 	}
3416 
3417 	if (w >= 64)
3418 	{
3419 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3420 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3421 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3422 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3423 
3424 	    d += 64;
3425 	    w -= 64;
3426 	}
3427 
3428 	if (w >= 32)
3429 	{
3430 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3431 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3432 
3433 	    d += 32;
3434 	    w -= 32;
3435 	}
3436 
3437 	if (w >= 16)
3438 	{
3439 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3440 
3441 	    d += 16;
3442 	    w -= 16;
3443 	}
3444 
3445 	while (w >= 4)
3446 	{
3447 	    *(uint32_t *)d = filler;
3448 
3449 	    w -= 4;
3450 	    d += 4;
3451 	}
3452 
3453 	if (w >= 2)
3454 	{
3455 	    *(uint16_t *)d = filler;
3456 	    w -= 2;
3457 	    d += 2;
3458 	}
3459 
3460 	if (w >= 1)
3461 	{
3462 	    *(uint8_t *)d = filler;
3463 	    w -= 1;
3464 	    d += 1;
3465 	}
3466     }
3467 
3468     return TRUE;
3469 }
3470 
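/* SRC of a solid color through an a8 mask: dest = src * mask, and zero
 * where the mask is zero.  A zero source degenerates to sse2_fill with 0.
 * Mask quads that are all 0xff (with an opaque source) store the solid
 * color directly; all-zero quads store zeros.
 */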
3471 static void
3472 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3473                              pixman_composite_info_t *info)
3474 {
3475     PIXMAN_COMPOSITE_ARGS (info);
3476     uint32_t src, srca;
3477     uint32_t    *dst_line, *dst;
3478     uint8_t     *mask_line, *mask;
3479     int dst_stride, mask_stride;
3480     int32_t w;
3481 
3482     __m128i xmm_src, xmm_def;
3483     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484 
3485     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3486 
3487     srca = src >> 24;
3488     if (src == 0)
3489     {
3490 	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3491 		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
3492 		   dest_x, dest_y, width, height, 0);
3493 	return;
3494     }
3495 
3496     PIXMAN_IMAGE_GET_LINE (
3497 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3498     PIXMAN_IMAGE_GET_LINE (
3499 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500 
3501     xmm_def = create_mask_2x32_128 (src, src);
3502     xmm_src = expand_pixel_32_1x128 (src);
3503 
3504     while (height--)
3505     {
3506 	dst = dst_line;
3507 	dst_line += dst_stride;
3508 	mask = mask_line;
3509 	mask_line += mask_stride;
3510 	w = width;
3511 
3512 	while (w && (uintptr_t)dst & 15)
3513 	{
3514 	    uint8_t m = *mask++;
3515 
3516 	    if (m)
3517 	    {
3518 		*dst = pack_1x128_32 (
3519 		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3520 	    }
3521 	    else
3522 	    {
3523 		*dst = 0;
3524 	    }
3525 
3526 	    w--;
3527 	    dst++;
3528 	}
3529 
3530 	while (w >= 4)
3531 	{
3532             uint32_t m;
3533             memcpy(&m, mask, sizeof(uint32_t));
3534 
3535 	    if (srca == 0xff && m == 0xffffffff)
3536 	    {
3537 		save_128_aligned ((__m128i*)dst, xmm_def);
3538 	    }
3539 	    else if (m)
3540 	    {
3541 		xmm_mask = unpack_32_1x128 (m);
3542 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3543 
3544 		/* Unpacking */
3545 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3546 
3547 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3548 					&xmm_mask_lo, &xmm_mask_hi);
3549 
3550 		pix_multiply_2x128 (&xmm_src, &xmm_src,
3551 				    &xmm_mask_lo, &xmm_mask_hi,
3552 				    &xmm_mask_lo, &xmm_mask_hi);
3553 
3554 		save_128_aligned (
3555 		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3556 	    }
3557 	    else
3558 	    {
3559 		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3560 	    }
3561 
3562 	    w -= 4;
3563 	    dst += 4;
3564 	    mask += 4;
3565 	}
3566 
3567 	while (w)
3568 	{
3569 	    uint8_t m = *mask++;
3570 
3571 	    if (m)
3572 	    {
3573 		*dst = pack_1x128_32 (
3574 		    pix_multiply_1x128 (
3575 			xmm_src, expand_pixel_8_1x128 (m)));
3576 	    }
3577 	    else
3578 	    {
3579 		*dst = 0;
3580 	    }
3581 
3582 	    w--;
3583 	    dst++;
3584 	}
3585     }
3586 
3587 }
3588 
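/* OVER of a solid color onto r5g6b5 through an a8 mask.  The vector loop
 * handles eight destination pixels at a time, expanding them to 8888 and
 * applying in_over_* in two four-pixel halves, each guarded by a check for
 * an all-zero group of mask bytes.
 */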
3589 static void
3590 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3591                               pixman_composite_info_t *info)
3592 {
3593     PIXMAN_COMPOSITE_ARGS (info);
3594     uint32_t src;
3595     uint16_t    *dst_line, *dst, d;
3596     uint8_t     *mask_line, *mask;
3597     int dst_stride, mask_stride;
3598     int32_t w;
3599     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600 
3601     __m128i xmm_src, xmm_alpha;
3602     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3603     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604 
3605     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3606 
3607     if (src == 0)
3608 	return;
3609 
3610     PIXMAN_IMAGE_GET_LINE (
3611 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3612     PIXMAN_IMAGE_GET_LINE (
3613 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614 
3615     xmm_src = expand_pixel_32_1x128 (src);
3616     xmm_alpha = expand_alpha_1x128 (xmm_src);
3617     mmx_src = xmm_src;
3618     mmx_alpha = xmm_alpha;
3619 
3620     while (height--)
3621     {
3622 	dst = dst_line;
3623 	dst_line += dst_stride;
3624 	mask = mask_line;
3625 	mask_line += mask_stride;
3626 	w = width;
3627 
3628 	while (w && (uintptr_t)dst & 15)
3629 	{
3630 	    uint8_t m = *mask++;
3631 
3632 	    if (m)
3633 	    {
3634 		d = *dst;
3635 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3636 		mmx_dest = expand565_16_1x128 (d);
3637 
3638 		*dst = pack_565_32_16 (
3639 		    pack_1x128_32 (
3640 			in_over_1x128 (
3641 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3642 	    }
3643 
3644 	    w--;
3645 	    dst++;
3646 	}
3647 
3648 	while (w >= 8)
3649 	{
3650             uint32_t m;
3651 
3652 	    xmm_dst = load_128_aligned ((__m128i*) dst);
3653 	    unpack_565_128_4x128 (xmm_dst,
3654 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3655 
3656             memcpy(&m, mask, sizeof(uint32_t));
3657 	    mask += 4;
3658 
3659 	    if (m)
3660 	    {
3661 		xmm_mask = unpack_32_1x128 (m);
3662 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3663 
3664 		/* Unpacking */
3665 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3666 
3667 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3668 					&xmm_mask_lo, &xmm_mask_hi);
3669 
3670 		in_over_2x128 (&xmm_src, &xmm_src,
3671 			       &xmm_alpha, &xmm_alpha,
3672 			       &xmm_mask_lo, &xmm_mask_hi,
3673 			       &xmm_dst0, &xmm_dst1);
3674 	    }
3675 
3676             memcpy(&m, mask, sizeof(uint32_t));
3677 	    mask += 4;
3678 
3679 	    if (m)
3680 	    {
3681 		xmm_mask = unpack_32_1x128 (m);
3682 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3683 
3684 		/* Unpacking */
3685 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3686 
3687 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3688 					&xmm_mask_lo, &xmm_mask_hi);
3689 		in_over_2x128 (&xmm_src, &xmm_src,
3690 			       &xmm_alpha, &xmm_alpha,
3691 			       &xmm_mask_lo, &xmm_mask_hi,
3692 			       &xmm_dst2, &xmm_dst3);
3693 	    }
3694 
3695 	    save_128_aligned (
3696 		(__m128i*)dst, pack_565_4x128_128 (
3697 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3698 
3699 	    w -= 8;
3700 	    dst += 8;
3701 	}
3702 
3703 	while (w)
3704 	{
3705 	    uint8_t m = *mask++;
3706 
3707 	    if (m)
3708 	    {
3709 		d = *dst;
3710 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3711 		mmx_dest = expand565_16_1x128 (d);
3712 
3713 		*dst = pack_565_32_16 (
3714 		    pack_1x128_32 (
3715 			in_over_1x128 (
3716 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3717 	    }
3718 
3719 	    w--;
3720 	    dst++;
3721 	}
3722     }
3723 
3724 }
3725 
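/* OVER of a non-premultiplied "pixbuf" source onto r5g6b5.  The
 * over_rev_non_pre_* helpers premultiply and swap the source channels
 * before blending; fully opaque source groups take an invert-colors-only
 * shortcut and fully transparent groups are skipped.
 */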
3726 static void
3727 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3728                                  pixman_composite_info_t *info)
3729 {
3730     PIXMAN_COMPOSITE_ARGS (info);
3731     uint16_t    *dst_line, *dst, d;
3732     uint32_t    *src_line, *src, s;
3733     int dst_stride, src_stride;
3734     int32_t w;
3735     uint32_t opaque, zero;
3736 
3737     __m128i ms;
3738     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3739     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3740 
3741     PIXMAN_IMAGE_GET_LINE (
3742 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3743     PIXMAN_IMAGE_GET_LINE (
3744 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3745 
3746     while (height--)
3747     {
3748 	dst = dst_line;
3749 	dst_line += dst_stride;
3750 	src = src_line;
3751 	src_line += src_stride;
3752 	w = width;
3753 
3754 	while (w && (uintptr_t)dst & 15)
3755 	{
3756 	    s = *src++;
3757 	    d = *dst;
3758 
3759 	    ms = unpack_32_1x128 (s);
3760 
3761 	    *dst++ = pack_565_32_16 (
3762 		pack_1x128_32 (
3763 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3764 	    w--;
3765 	}
3766 
3767 	while (w >= 8)
3768 	{
3769 	    /* First round */
3770 	    xmm_src = load_128_unaligned ((__m128i*)src);
3771 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
3772 
3773 	    opaque = is_opaque (xmm_src);
3774 	    zero = is_zero (xmm_src);
3775 
3776 	    unpack_565_128_4x128 (xmm_dst,
3777 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3778 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3779 
3780 	    /* preload next round */
3781 	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3782 
3783 	    if (opaque)
3784 	    {
3785 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3786 				     &xmm_dst0, &xmm_dst1);
3787 	    }
3788 	    else if (!zero)
3789 	    {
3790 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3791 					&xmm_dst0, &xmm_dst1);
3792 	    }
3793 
3794 	    /* Second round */
3795 	    opaque = is_opaque (xmm_src);
3796 	    zero = is_zero (xmm_src);
3797 
3798 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3799 
3800 	    if (opaque)
3801 	    {
3802 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3803 				     &xmm_dst2, &xmm_dst3);
3804 	    }
3805 	    else if (!zero)
3806 	    {
3807 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3808 					&xmm_dst2, &xmm_dst3);
3809 	    }
3810 
3811 	    save_128_aligned (
3812 		(__m128i*)dst, pack_565_4x128_128 (
3813 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3814 
3815 	    w -= 8;
3816 	    src += 8;
3817 	    dst += 8;
3818 	}
3819 
3820 	while (w)
3821 	{
3822 	    s = *src++;
3823 	    d = *dst;
3824 
3825 	    ms = unpack_32_1x128 (s);
3826 
3827 	    *dst++ = pack_565_32_16 (
3828 		pack_1x128_32 (
3829 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3830 	    w--;
3831 	}
3832     }
3833 
3834 }
3835 
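/*
 * Same as the 0565 variant above, but compositing the non-premultiplied
 * "pixbuf" source onto an a8r8g8b8 destination: opaque 4-pixel groups are
 * stored with just an R/B swap, transparent groups are skipped, and the
 * general case uses over_rev_non_pre_2x128.
 */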
3836 static void
3837 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3838                                  pixman_composite_info_t *info)
3839 {
3840     PIXMAN_COMPOSITE_ARGS (info);
3841     uint32_t    *dst_line, *dst, d;
3842     uint32_t    *src_line, *src, s;
3843     int dst_stride, src_stride;
3844     int32_t w;
3845     uint32_t opaque, zero;
3846 
3847     __m128i xmm_src_lo, xmm_src_hi;
3848     __m128i xmm_dst_lo, xmm_dst_hi;
3849 
3850     PIXMAN_IMAGE_GET_LINE (
3851 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3852     PIXMAN_IMAGE_GET_LINE (
3853 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3854 
3855     while (height--)
3856     {
3857 	dst = dst_line;
3858 	dst_line += dst_stride;
3859 	src = src_line;
3860 	src_line += src_stride;
3861 	w = width;
3862 
3863 	while (w && (uintptr_t)dst & 15)
3864 	{
3865 	    s = *src++;
3866 	    d = *dst;
3867 
3868 	    *dst++ = pack_1x128_32 (
3869 		over_rev_non_pre_1x128 (
3870 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3871 
3872 	    w--;
3873 	}
3874 
3875 	while (w >= 4)
3876 	{
3877 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
3878 
3879 	    opaque = is_opaque (xmm_src_hi);
3880 	    zero = is_zero (xmm_src_hi);
3881 
3882 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3883 
3884 	    if (opaque)
3885 	    {
3886 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3887 				     &xmm_dst_lo, &xmm_dst_hi);
3888 
3889 		save_128_aligned (
3890 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3891 	    }
3892 	    else if (!zero)
3893 	    {
3894 		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3895 
3896 		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3897 
3898 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3899 					&xmm_dst_lo, &xmm_dst_hi);
3900 
3901 		save_128_aligned (
3902 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3903 	    }
3904 
3905 	    w -= 4;
3906 	    dst += 4;
3907 	    src += 4;
3908 	}
3909 
3910 	while (w)
3911 	{
3912 	    s = *src++;
3913 	    d = *dst;
3914 
3915 	    *dst++ = pack_1x128_32 (
3916 		over_rev_non_pre_1x128 (
3917 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3918 
3919 	    w--;
3920 	}
3921     }
3922 
3923 }
3924 
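/*
 * Component-alpha OVER of a solid source onto an r5g6b5 destination.  The
 * mask is read as whole a8r8g8b8 pixels; _mm_movemask_epi8 of a compare
 * against zero (pack_cmp == 0xffff) detects 4-pixel groups whose mask is
 * entirely zero so they can be skipped.
 */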
3925 static void
3926 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3927                                     pixman_composite_info_t *info)
3928 {
3929     PIXMAN_COMPOSITE_ARGS (info);
3930     uint32_t src;
3931     uint16_t    *dst_line, *dst, d;
3932     uint32_t    *mask_line, *mask, m;
3933     int dst_stride, mask_stride;
3934     int w;
3935     uint32_t pack_cmp;
3936 
3937     __m128i xmm_src, xmm_alpha;
3938     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3939     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3940 
3941     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3942 
3943     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3944 
3945     if (src == 0)
3946 	return;
3947 
3948     PIXMAN_IMAGE_GET_LINE (
3949 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3950     PIXMAN_IMAGE_GET_LINE (
3951 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3952 
3953     xmm_src = expand_pixel_32_1x128 (src);
3954     xmm_alpha = expand_alpha_1x128 (xmm_src);
3955     mmx_src = xmm_src;
3956     mmx_alpha = xmm_alpha;
3957 
3958     while (height--)
3959     {
3960 	w = width;
3961 	mask = mask_line;
3962 	dst = dst_line;
3963 	mask_line += mask_stride;
3964 	dst_line += dst_stride;
3965 
3966 	while (w && ((uintptr_t)dst & 15))
3967 	{
3968 	    m = *(uint32_t *) mask;
3969 
3970 	    if (m)
3971 	    {
3972 		d = *dst;
3973 		mmx_mask = unpack_32_1x128 (m);
3974 		mmx_dest = expand565_16_1x128 (d);
3975 
3976 		*dst = pack_565_32_16 (
3977 		    pack_1x128_32 (
3978 			in_over_1x128 (
3979 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3980 	    }
3981 
3982 	    w--;
3983 	    dst++;
3984 	    mask++;
3985 	}
3986 
3987 	while (w >= 8)
3988 	{
3989 	    /* First round */
3990 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
3991 	    xmm_dst = load_128_aligned ((__m128i*)dst);
3992 
3993 	    pack_cmp = _mm_movemask_epi8 (
3994 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3995 
3996 	    unpack_565_128_4x128 (xmm_dst,
3997 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3998 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3999 
4000 	    /* preload next round */
4001 	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4002 
4003 	    /* composite only when at least one mask pixel is non-zero */
4004 	    if (pack_cmp != 0xffff)
4005 	    {
4006 		in_over_2x128 (&xmm_src, &xmm_src,
4007 			       &xmm_alpha, &xmm_alpha,
4008 			       &xmm_mask_lo, &xmm_mask_hi,
4009 			       &xmm_dst0, &xmm_dst1);
4010 	    }
4011 
4012 	    /* Second round */
4013 	    pack_cmp = _mm_movemask_epi8 (
4014 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4015 
4016 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4017 
4018 	    if (pack_cmp != 0xffff)
4019 	    {
4020 		in_over_2x128 (&xmm_src, &xmm_src,
4021 			       &xmm_alpha, &xmm_alpha,
4022 			       &xmm_mask_lo, &xmm_mask_hi,
4023 			       &xmm_dst2, &xmm_dst3);
4024 	    }
4025 
4026 	    save_128_aligned (
4027 		(__m128i*)dst, pack_565_4x128_128 (
4028 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4029 
4030 	    w -= 8;
4031 	    dst += 8;
4032 	    mask += 8;
4033 	}
4034 
4035 	while (w)
4036 	{
4037 	    m = *(uint32_t *) mask;
4038 
4039 	    if (m)
4040 	    {
4041 		d = *dst;
4042 		mmx_mask = unpack_32_1x128 (m);
4043 		mmx_dest = expand565_16_1x128 (d);
4044 
4045 		*dst = pack_565_32_16 (
4046 		    pack_1x128_32 (
4047 			in_over_1x128 (
4048 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4049 	    }
4050 
4051 	    w--;
4052 	    dst++;
4053 	    mask++;
4054 	}
4055     }
4056 
4057 }
4058 
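/*
 * IN of a solid source through an a8 mask onto an a8 destination:
 * dst = src.alpha * mask * dst, 16 destination bytes per vector iteration.
 */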
4059 static void
4060 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4061                          pixman_composite_info_t *info)
4062 {
4063     PIXMAN_COMPOSITE_ARGS (info);
4064     uint8_t     *dst_line, *dst;
4065     uint8_t     *mask_line, *mask;
4066     int dst_stride, mask_stride;
4067     uint32_t d;
4068     uint32_t src;
4069     int32_t w;
4070 
4071     __m128i xmm_alpha;
4072     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4073     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4074 
4075     PIXMAN_IMAGE_GET_LINE (
4076 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4077     PIXMAN_IMAGE_GET_LINE (
4078 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4079 
4080     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4081 
4082     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4083 
4084     while (height--)
4085     {
4086 	dst = dst_line;
4087 	dst_line += dst_stride;
4088 	mask = mask_line;
4089 	mask_line += mask_stride;
4090 	w = width;
4091 
4092 	while (w && ((uintptr_t)dst & 15))
4093 	{
4094 	    uint8_t m = *mask++;
4095 	    d = (uint32_t) *dst;
4096 
4097 	    *dst++ = (uint8_t) pack_1x128_32 (
4098 		pix_multiply_1x128 (
4099 		    pix_multiply_1x128 (xmm_alpha,
4100 				       unpack_32_1x128 (m)),
4101 		    unpack_32_1x128 (d)));
4102 	    w--;
4103 	}
4104 
4105 	while (w >= 16)
4106 	{
4107 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4108 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4109 
4110 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4111 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4112 
4113 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4114 				&xmm_mask_lo, &xmm_mask_hi,
4115 				&xmm_mask_lo, &xmm_mask_hi);
4116 
4117 	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4118 				&xmm_dst_lo, &xmm_dst_hi,
4119 				&xmm_dst_lo, &xmm_dst_hi);
4120 
4121 	    save_128_aligned (
4122 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4123 
4124 	    mask += 16;
4125 	    dst += 16;
4126 	    w -= 16;
4127 	}
4128 
4129 	while (w)
4130 	{
4131 	    uint8_t m = *mask++;
4132 	    d = (uint32_t) *dst;
4133 
4134 	    *dst++ = (uint8_t) pack_1x128_32 (
4135 		pix_multiply_1x128 (
4136 		    pix_multiply_1x128 (
4137 			xmm_alpha, unpack_32_1x128 (m)),
4138 		    unpack_32_1x128 (d)));
4139 	    w--;
4140 	}
4141     }
4142 
4143 }
4144 
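/*
 * IN of a solid source onto an a8 destination: dst = src.alpha * dst.
 * src.alpha == 0xff leaves the destination unchanged and src.alpha == 0
 * reduces to a fill with zero, so both cases are handled up front.
 */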
4145 static void
4146 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4147 		       pixman_composite_info_t *info)
4148 {
4149     PIXMAN_COMPOSITE_ARGS (info);
4150     uint8_t     *dst_line, *dst;
4151     int dst_stride;
4152     uint32_t d;
4153     uint32_t src;
4154     int32_t w;
4155 
4156     __m128i xmm_alpha;
4157     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4158 
4159     PIXMAN_IMAGE_GET_LINE (
4160 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4161 
4162     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4163 
4164     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4165 
4166     src = src >> 24;
4167 
4168     if (src == 0xff)
4169 	return;
4170 
4171     if (src == 0x00)
4172     {
4173 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4174 		     8, dest_x, dest_y, width, height, src);
4175 
4176 	return;
4177     }
4178 
4179     while (height--)
4180     {
4181 	dst = dst_line;
4182 	dst_line += dst_stride;
4183 	w = width;
4184 
4185 	while (w && ((uintptr_t)dst & 15))
4186 	{
4187 	    d = (uint32_t) *dst;
4188 
4189 	    *dst++ = (uint8_t) pack_1x128_32 (
4190 		pix_multiply_1x128 (
4191 		    xmm_alpha,
4192 		    unpack_32_1x128 (d)));
4193 	    w--;
4194 	}
4195 
4196 	while (w >= 16)
4197 	{
4198 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4199 
4200 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4201 
4202 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4203 				&xmm_dst_lo, &xmm_dst_hi,
4204 				&xmm_dst_lo, &xmm_dst_hi);
4205 
4206 	    save_128_aligned (
4207 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4208 
4209 	    dst += 16;
4210 	    w -= 16;
4211 	}
4212 
4213 	while (w)
4214 	{
4215 	    d = (uint32_t) *dst;
4216 
4217 	    *dst++ = (uint8_t) pack_1x128_32 (
4218 		pix_multiply_1x128 (
4219 		    xmm_alpha,
4220 		    unpack_32_1x128 (d)));
4221 	    w--;
4222 	}
4223     }
4224 
4225 }
4226 
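/* IN between two a8 buffers: dst = src * dst, 16 bytes per vector iteration. */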
4227 static void
4228 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4229                        pixman_composite_info_t *info)
4230 {
4231     PIXMAN_COMPOSITE_ARGS (info);
4232     uint8_t     *dst_line, *dst;
4233     uint8_t     *src_line, *src;
4234     int src_stride, dst_stride;
4235     int32_t w;
4236     uint32_t s, d;
4237 
4238     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4239     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4240 
4241     PIXMAN_IMAGE_GET_LINE (
4242 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4243     PIXMAN_IMAGE_GET_LINE (
4244 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4245 
4246     while (height--)
4247     {
4248 	dst = dst_line;
4249 	dst_line += dst_stride;
4250 	src = src_line;
4251 	src_line += src_stride;
4252 	w = width;
4253 
4254 	while (w && ((uintptr_t)dst & 15))
4255 	{
4256 	    s = (uint32_t) *src++;
4257 	    d = (uint32_t) *dst;
4258 
4259 	    *dst++ = (uint8_t) pack_1x128_32 (
4260 		pix_multiply_1x128 (
4261 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
4262 	    w--;
4263 	}
4264 
4265 	while (w >= 16)
4266 	{
4267 	    xmm_src = load_128_unaligned ((__m128i*)src);
4268 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4269 
4270 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4271 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4272 
4273 	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4274 				&xmm_dst_lo, &xmm_dst_hi,
4275 				&xmm_dst_lo, &xmm_dst_hi);
4276 
4277 	    save_128_aligned (
4278 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4279 
4280 	    src += 16;
4281 	    dst += 16;
4282 	    w -= 16;
4283 	}
4284 
4285 	while (w)
4286 	{
4287 	    s = (uint32_t) *src++;
4288 	    d = (uint32_t) *dst;
4289 
4290 	    *dst++ = (uint8_t) pack_1x128_32 (
4291 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4292 	    w--;
4293 	}
4294     }
4295 
4296 }
4297 
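/*
 * ADD of a solid source through an a8 mask onto an a8 destination:
 * dst = clamp (dst + src.alpha * mask), using saturating adds.
 */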
4298 static void
4299 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4300 			  pixman_composite_info_t *info)
4301 {
4302     PIXMAN_COMPOSITE_ARGS (info);
4303     uint8_t     *dst_line, *dst;
4304     uint8_t     *mask_line, *mask;
4305     int dst_stride, mask_stride;
4306     int32_t w;
4307     uint32_t src;
4308     uint32_t d;
4309 
4310     __m128i xmm_alpha;
4311     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4312     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4313 
4314     PIXMAN_IMAGE_GET_LINE (
4315 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4316     PIXMAN_IMAGE_GET_LINE (
4317 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4318 
4319     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4320 
4321     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4322 
4323     while (height--)
4324     {
4325 	dst = dst_line;
4326 	dst_line += dst_stride;
4327 	mask = mask_line;
4328 	mask_line += mask_stride;
4329 	w = width;
4330 
4331 	while (w && ((uintptr_t)dst & 15))
4332 	{
4333 	    uint8_t m = *mask++;
4334 	    d = (uint32_t) *dst;
4335 
4336 	    *dst++ = (uint8_t) pack_1x128_32 (
4337 		_mm_adds_epu16 (
4338 		    pix_multiply_1x128 (
4339 			xmm_alpha, unpack_32_1x128 (m)),
4340 		    unpack_32_1x128 (d)));
4341 	    w--;
4342 	}
4343 
4344 	while (w >= 16)
4345 	{
4346 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4347 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4348 
4349 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4350 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4351 
4352 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4353 				&xmm_mask_lo, &xmm_mask_hi,
4354 				&xmm_mask_lo, &xmm_mask_hi);
4355 
4356 	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4357 	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4358 
4359 	    save_128_aligned (
4360 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4361 
4362 	    mask += 16;
4363 	    dst += 16;
4364 	    w -= 16;
4365 	}
4366 
4367 	while (w)
4368 	{
4369 	    uint8_t m = *mask++;
4370 	    d = (uint32_t) *dst;
4371 
4372 	    *dst++ = (uint8_t) pack_1x128_32 (
4373 		_mm_adds_epu16 (
4374 		    pix_multiply_1x128 (
4375 			xmm_alpha, unpack_32_1x128 (m)),
4376 		    unpack_32_1x128 (d)));
4377 
4378 	    w--;
4379 	}
4380     }
4381 
4382 }
4383 
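/*
 * ADD of a solid source onto an a8 destination: dst = clamp (dst + src.alpha).
 * A zero alpha is a no-op and a 0xff alpha degenerates to a fill with 0xff,
 * so both are special-cased before the loops.
 */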
4384 static void
4385 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4386 			pixman_composite_info_t *info)
4387 {
4388     PIXMAN_COMPOSITE_ARGS (info);
4389     uint8_t     *dst_line, *dst;
4390     int dst_stride;
4391     int32_t w;
4392     uint32_t src;
4393 
4394     __m128i xmm_src;
4395 
4396     PIXMAN_IMAGE_GET_LINE (
4397 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4398 
4399     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4400 
4401     src >>= 24;
4402 
4403     if (src == 0x00)
4404 	return;
4405 
4406     if (src == 0xff)
4407     {
4408 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4409 		     8, dest_x, dest_y, width, height, 0xff);
4410 
4411 	return;
4412     }
4413 
4414     src = (src << 24) | (src << 16) | (src << 8) | src;
4415     xmm_src = _mm_set_epi32 (src, src, src, src);
4416 
4417     while (height--)
4418     {
4419 	dst = dst_line;
4420 	dst_line += dst_stride;
4421 	w = width;
4422 
4423 	while (w && ((uintptr_t)dst & 15))
4424 	{
4425 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4426 		_mm_adds_epu8 (
4427 		    xmm_src,
4428 		    _mm_cvtsi32_si128 (*dst)));
4429 
4430 	    w--;
4431 	    dst++;
4432 	}
4433 
4434 	while (w >= 16)
4435 	{
4436 	    save_128_aligned (
4437 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4438 
4439 	    dst += 16;
4440 	    w -= 16;
4441 	}
4442 
4443 	while (w)
4444 	{
4445 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4446 		_mm_adds_epu8 (
4447 		    xmm_src,
4448 		    _mm_cvtsi32_si128 (*dst)));
4449 
4450 	    w--;
4451 	    dst++;
4452 	}
4453     }
4454 
4455 }
4456 
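/*
 * ADD between two a8 buffers.  The unaligned head and tail use the scalar
 * clamp trick "t | (0 - (t >> 8))": t holds the 9-bit sum, so the OR
 * saturates it to 0xff exactly when it overflowed.  The 4-byte-aligned
 * middle of each row is delegated to sse2_combine_add_u by treating groups
 * of four bytes as one 32-bit pixel.
 */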
4457 static void
4458 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4459 			pixman_composite_info_t *info)
4460 {
4461     PIXMAN_COMPOSITE_ARGS (info);
4462     uint8_t     *dst_line, *dst;
4463     uint8_t     *src_line, *src;
4464     int dst_stride, src_stride;
4465     int32_t w;
4466     uint16_t t;
4467 
4468     PIXMAN_IMAGE_GET_LINE (
4469 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4470     PIXMAN_IMAGE_GET_LINE (
4471 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4472 
4473     while (height--)
4474     {
4475 	dst = dst_line;
4476 	src = src_line;
4477 
4478 	dst_line += dst_stride;
4479 	src_line += src_stride;
4480 	w = width;
4481 
4482 	/* Small head */
4483 	while (w && (uintptr_t)dst & 3)
4484 	{
4485 	    t = (*dst) + (*src++);
4486 	    *dst++ = t | (0 - (t >> 8));
4487 	    w--;
4488 	}
4489 
4490 	sse2_combine_add_u (imp, op,
4491 			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4492 
4493 	/* Small tail */
4494 	dst += w & 0xfffc;
4495 	src += w & 0xfffc;
4496 
4497 	w &= 3;
4498 
4499 	while (w)
4500 	{
4501 	    t = (*dst) + (*src++);
4502 	    *dst++ = t | (0 - (t >> 8));
4503 	    w--;
4504 	}
4505     }
4506 
4507 }
4508 
4509 static void
4510 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4511                               pixman_composite_info_t *info)
4512 {
4513     PIXMAN_COMPOSITE_ARGS (info);
4514     uint32_t    *dst_line, *dst;
4515     uint32_t    *src_line, *src;
4516     int dst_stride, src_stride;
4517 
4518     PIXMAN_IMAGE_GET_LINE (
4519 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4520     PIXMAN_IMAGE_GET_LINE (
4521 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4522 
4523     while (height--)
4524     {
4525 	dst = dst_line;
4526 	dst_line += dst_stride;
4527 	src = src_line;
4528 	src_line += src_stride;
4529 
4530 	sse2_combine_add_u (imp, op, dst, src, NULL, width);
4531     }
4532 }
4533 
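/*
 * ADD of a solid source onto an a8r8g8b8 destination: a saturating byte add
 * of the same 32-bit constant to every pixel, with the usual no-op / fill
 * shortcuts for a zero and an all-0xff source.
 */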
4534 static void
4535 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4536 			   pixman_composite_info_t *info)
4537 {
4538     PIXMAN_COMPOSITE_ARGS (info);
4539     uint32_t *dst_line, *dst, src;
4540     int dst_stride;
4541 
4542     __m128i xmm_src;
4543 
4544     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4545 
4546     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4547     if (src == 0)
4548 	return;
4549 
4550     if (src == ~0)
4551     {
4552 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4553 		     dest_x, dest_y, width, height, ~0);
4554 
4555 	return;
4556     }
4557 
4558     xmm_src = _mm_set_epi32 (src, src, src, src);
4559     while (height--)
4560     {
4561 	int w = width;
4562 	uint32_t d;
4563 
4564 	dst = dst_line;
4565 	dst_line += dst_stride;
4566 
4567 	while (w && (uintptr_t)dst & 15)
4568 	{
4569 	    d = *dst;
4570 	    *dst++ =
4571 		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4572 	    w--;
4573 	}
4574 
4575 	while (w >= 4)
4576 	{
4577 	    save_128_aligned
4578 		((__m128i*)dst,
4579 		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4580 
4581 	    dst += 4;
4582 	    w -= 4;
4583 	}
4584 
4585 	while (w--)
4586 	{
4587 	    d = *dst;
4588 	    *dst++ =
4589 		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4590 						  _mm_cvtsi32_si128 (d)));
4591 	}
4592     }
4593 }
4594 
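/*
 * ADD of a solid source through an a8 mask onto an a8r8g8b8 destination:
 * dst = clamp (dst + src * mask), skipping pixels (and whole 4-byte mask
 * groups) whose mask is zero.
 */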
4595 static void
4596 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4597 			     pixman_composite_info_t *info)
4598 {
4599     PIXMAN_COMPOSITE_ARGS (info);
4600     uint32_t     *dst_line, *dst;
4601     uint8_t     *mask_line, *mask;
4602     int dst_stride, mask_stride;
4603     int32_t w;
4604     uint32_t src;
4605 
4606     __m128i xmm_src;
4607 
4608     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4609     if (src == 0)
4610 	return;
4611     xmm_src = expand_pixel_32_1x128 (src);
4612 
4613     PIXMAN_IMAGE_GET_LINE (
4614 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4615     PIXMAN_IMAGE_GET_LINE (
4616 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4617 
4618     while (height--)
4619     {
4620 	dst = dst_line;
4621 	dst_line += dst_stride;
4622 	mask = mask_line;
4623 	mask_line += mask_stride;
4624 	w = width;
4625 
4626 	while (w && ((uintptr_t)dst & 15))
4627 	{
4628 	    uint8_t m = *mask++;
4629 	    if (m)
4630 	    {
4631 		*dst = pack_1x128_32
4632 		    (_mm_adds_epu16
4633 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4634 		      unpack_32_1x128 (*dst)));
4635 	    }
4636 	    dst++;
4637 	    w--;
4638 	}
4639 
4640 	while (w >= 4)
4641 	{
4642 	    uint32_t m;
4643             memcpy(&m, mask, sizeof(uint32_t));
4644 
4645 	    if (m)
4646 	    {
4647 		__m128i xmm_mask_lo, xmm_mask_hi;
4648 		__m128i xmm_dst_lo, xmm_dst_hi;
4649 
4650 		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4651 		__m128i xmm_mask =
4652 		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
4653 				       _mm_setzero_si128 ());
4654 
4655 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4656 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4657 
4658 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4659 					&xmm_mask_lo, &xmm_mask_hi);
4660 
4661 		pix_multiply_2x128 (&xmm_src, &xmm_src,
4662 				    &xmm_mask_lo, &xmm_mask_hi,
4663 				    &xmm_mask_lo, &xmm_mask_hi);
4664 
4665 		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4666 		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4667 
4668 		save_128_aligned (
4669 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4670 	    }
4671 
4672 	    w -= 4;
4673 	    dst += 4;
4674 	    mask += 4;
4675 	}
4676 
4677 	while (w)
4678 	{
4679 	    uint8_t m = *mask++;
4680 	    if (m)
4681 	    {
4682 		*dst = pack_1x128_32
4683 		    (_mm_adds_epu16
4684 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4685 		      unpack_32_1x128 (*dst)));
4686 	    }
4687 	    dst++;
4688 	    w--;
4689 	}
4690     }
4691 }
4692 
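/*
 * Plain copy between two buffers of equal depth (16 or 32 bpp only).  Each
 * row is copied with small memmoves until the destination is 16-byte
 * aligned, then in 64- and 16-byte vector chunks, then a small tail.
 * Returns FALSE for unsupported depths so the caller can fall back.
 */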
4693 static pixman_bool_t
4694 sse2_blt (pixman_implementation_t *imp,
4695           uint32_t *               src_bits,
4696           uint32_t *               dst_bits,
4697           int                      src_stride,
4698           int                      dst_stride,
4699           int                      src_bpp,
4700           int                      dst_bpp,
4701           int                      src_x,
4702           int                      src_y,
4703           int                      dest_x,
4704           int                      dest_y,
4705           int                      width,
4706           int                      height)
4707 {
4708     uint8_t *   src_bytes;
4709     uint8_t *   dst_bytes;
4710     int byte_width;
4711 
4712     if (src_bpp != dst_bpp)
4713 	return FALSE;
4714 
4715     if (src_bpp == 16)
4716     {
4717 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4718 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4719 	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4720 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4721 	byte_width = 2 * width;
4722 	src_stride *= 2;
4723 	dst_stride *= 2;
4724     }
4725     else if (src_bpp == 32)
4726     {
4727 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4728 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4729 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4730 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4731 	byte_width = 4 * width;
4732 	src_stride *= 4;
4733 	dst_stride *= 4;
4734     }
4735     else
4736     {
4737 	return FALSE;
4738     }
4739 
4740     while (height--)
4741     {
4742 	int w;
4743 	uint8_t *s = src_bytes;
4744 	uint8_t *d = dst_bytes;
4745 	src_bytes += src_stride;
4746 	dst_bytes += dst_stride;
4747 	w = byte_width;
4748 
4749 	while (w >= 2 && ((uintptr_t)d & 3))
4750 	{
4751             memmove(d, s, 2);
4752 	    w -= 2;
4753 	    s += 2;
4754 	    d += 2;
4755 	}
4756 
4757 	while (w >= 4 && ((uintptr_t)d & 15))
4758 	{
4759             memmove(d, s, 4);
4760 
4761 	    w -= 4;
4762 	    s += 4;
4763 	    d += 4;
4764 	}
4765 
4766 	while (w >= 64)
4767 	{
4768 	    __m128i xmm0, xmm1, xmm2, xmm3;
4769 
4770 	    xmm0 = load_128_unaligned ((__m128i*)(s));
4771 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4772 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4773 	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4774 
4775 	    save_128_aligned ((__m128i*)(d),    xmm0);
4776 	    save_128_aligned ((__m128i*)(d + 16), xmm1);
4777 	    save_128_aligned ((__m128i*)(d + 32), xmm2);
4778 	    save_128_aligned ((__m128i*)(d + 48), xmm3);
4779 
4780 	    s += 64;
4781 	    d += 64;
4782 	    w -= 64;
4783 	}
4784 
4785 	while (w >= 16)
4786 	{
4787 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4788 
4789 	    w -= 16;
4790 	    d += 16;
4791 	    s += 16;
4792 	}
4793 
4794 	while (w >= 4)
4795 	{
4796             memmove(d, s, 4);
4797 
4798 	    w -= 4;
4799 	    s += 4;
4800 	    d += 4;
4801 	}
4802 
4803 	if (w >= 2)
4804 	{
4805             memmove(d, s, 2);
4806 	    w -= 2;
4807 	    s += 2;
4808 	    d += 2;
4809 	}
4810     }
4811 
4812     return TRUE;
4813 }
4814 
4815 static void
4816 sse2_composite_copy_area (pixman_implementation_t *imp,
4817                           pixman_composite_info_t *info)
4818 {
4819     PIXMAN_COMPOSITE_ARGS (info);
4820     sse2_blt (imp, src_image->bits.bits,
4821 	      dest_image->bits.bits,
4822 	      src_image->bits.rowstride,
4823 	      dest_image->bits.rowstride,
4824 	      PIXMAN_FORMAT_BPP (src_image->bits.format),
4825 	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
4826 	      src_x, src_y, dest_x, dest_y, width, height);
4827 }
4828 
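/*
 * OVER of an x8r8g8b8 source through an a8 mask onto an a8r8g8b8 destination.
 * The source is forced opaque by OR-ing in 0xff000000, so a 0xff mask byte
 * becomes a plain copy and every other non-zero mask value goes through
 * in_over with mask_00ff as the source alpha.
 */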
4829 static void
4830 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4831                                  pixman_composite_info_t *info)
4832 {
4833     PIXMAN_COMPOSITE_ARGS (info);
4834     uint32_t    *src, *src_line, s;
4835     uint32_t    *dst, *dst_line, d;
4836     uint8_t         *mask, *mask_line;
4837     int src_stride, mask_stride, dst_stride;
4838     int32_t w;
4839     __m128i ms;
4840 
4841     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4842     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4843     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4844 
4845     PIXMAN_IMAGE_GET_LINE (
4846 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4847     PIXMAN_IMAGE_GET_LINE (
4848 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4849     PIXMAN_IMAGE_GET_LINE (
4850 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4851 
4852     while (height--)
4853     {
4854         src = src_line;
4855         src_line += src_stride;
4856         dst = dst_line;
4857         dst_line += dst_stride;
4858         mask = mask_line;
4859         mask_line += mask_stride;
4860 
4861         w = width;
4862 
4863         while (w && (uintptr_t)dst & 15)
4864         {
4865             uint8_t m = *mask++;
4866             s = 0xff000000 | *src++;
4867             d = *dst;
4868             ms = unpack_32_1x128 (s);
4869 
4870             if (m != 0xff)
4871             {
4872 		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4873 		__m128i md = unpack_32_1x128 (d);
4874 
4875                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4876             }
4877 
4878             *dst++ = pack_1x128_32 (ms);
4879             w--;
4880         }
4881 
4882         while (w >= 4)
4883         {
4884             uint32_t m;
4885             memcpy(&m, mask, sizeof(uint32_t));
4886             xmm_src = _mm_or_si128 (
4887 		load_128_unaligned ((__m128i*)src), mask_ff000000);
4888 
4889             if (m == 0xffffffff)
4890             {
4891                 save_128_aligned ((__m128i*)dst, xmm_src);
4892             }
4893             else
4894             {
4895                 xmm_dst = load_128_aligned ((__m128i*)dst);
4896 
4897                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4898 
4899                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4900                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4901                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4902 
4903                 expand_alpha_rev_2x128 (
4904 		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4905 
4906                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4907 			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4908 			       &xmm_dst_lo, &xmm_dst_hi);
4909 
4910                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4911             }
4912 
4913             src += 4;
4914             dst += 4;
4915             mask += 4;
4916             w -= 4;
4917         }
4918 
4919         while (w)
4920         {
4921             uint8_t m = *mask++;
4922 
4923             if (m)
4924             {
4925                 s = 0xff000000 | *src;
4926 
4927                 if (m == 0xff)
4928                 {
4929                     *dst = s;
4930                 }
4931                 else
4932                 {
4933 		    __m128i ma, md, ms;
4934 
4935                     d = *dst;
4936 
4937 		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4938 		    md = unpack_32_1x128 (d);
4939 		    ms = unpack_32_1x128 (s);
4940 
4941                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4942                 }
4943 
4944             }
4945 
4946             src++;
4947             dst++;
4948             w--;
4949         }
4950     }
4951 
4952 }
4953 
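/*
 * OVER of an a8r8g8b8 source through an a8 mask onto an a8r8g8b8 destination.
 * A fully opaque source pixel with a 0xff mask is a straight copy; otherwise
 * the source alpha and the mask are both expanded and combined with in_over.
 */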
4954 static void
4955 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4956                                  pixman_composite_info_t *info)
4957 {
4958     PIXMAN_COMPOSITE_ARGS (info);
4959     uint32_t    *src, *src_line, s;
4960     uint32_t    *dst, *dst_line, d;
4961     uint8_t         *mask, *mask_line;
4962     int src_stride, mask_stride, dst_stride;
4963     int32_t w;
4964 
4965     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4966     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4967     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4968 
4969     PIXMAN_IMAGE_GET_LINE (
4970 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4971     PIXMAN_IMAGE_GET_LINE (
4972 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4973     PIXMAN_IMAGE_GET_LINE (
4974 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4975 
4976     while (height--)
4977     {
4978         src = src_line;
4979         src_line += src_stride;
4980         dst = dst_line;
4981         dst_line += dst_stride;
4982         mask = mask_line;
4983         mask_line += mask_stride;
4984 
4985         w = width;
4986 
4987         while (w && (uintptr_t)dst & 15)
4988         {
4989 	    uint32_t sa;
4990             uint8_t m = *mask++;
4991 
4992             s = *src++;
4993             d = *dst;
4994 
4995 	    sa = s >> 24;
4996 
4997 	    if (m)
4998 	    {
4999 		if (sa == 0xff && m == 0xff)
5000 		{
5001 		    *dst = s;
5002 		}
5003 		else
5004 		{
5005 		    __m128i ms, md, ma, msa;
5006 
5007 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5008 		    ms = unpack_32_1x128 (s);
5009 		    md = unpack_32_1x128 (d);
5010 
5011 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5012 
5013 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5014 		}
5015 	    }
5016 
5017 	    dst++;
5018             w--;
5019         }
5020 
5021         while (w >= 4)
5022         {
5023             uint32_t m;
5024             memcpy(&m, mask, sizeof(uint32_t));
5025 
5026 	    if (m)
5027 	    {
5028 		xmm_src = load_128_unaligned ((__m128i*)src);
5029 
5030 		if (m == 0xffffffff && is_opaque (xmm_src))
5031 		{
5032 		    save_128_aligned ((__m128i *)dst, xmm_src);
5033 		}
5034 		else
5035 		{
5036 		    xmm_dst = load_128_aligned ((__m128i *)dst);
5037 
5038 		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5039 
5040 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5041 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5042 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5043 
5044 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5045 		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5046 
5047 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5048 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5049 
5050 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5051 		}
5052 	    }
5053 
5054             src += 4;
5055             dst += 4;
5056             mask += 4;
5057             w -= 4;
5058         }
5059 
5060         while (w)
5061         {
5062 	    uint32_t sa;
5063             uint8_t m = *mask++;
5064 
5065             s = *src++;
5066             d = *dst;
5067 
5068 	    sa = s >> 24;
5069 
5070 	    if (m)
5071 	    {
5072 		if (sa == 0xff && m == 0xff)
5073 		{
5074 		    *dst = s;
5075 		}
5076 		else
5077 		{
5078 		    __m128i ms, md, ma, msa;
5079 
5080 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5081 		    ms = unpack_32_1x128 (s);
5082 		    md = unpack_32_1x128 (d);
5083 
5084 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5085 
5086 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5087 		}
5088 	    }
5089 
5090 	    dst++;
5091             w--;
5092         }
5093     }
5094 
5095 }
5096 
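/*
 * OVER_REVERSE with a solid source: the existing destination is composited
 * over the constant source, i.e. dst = dst + (1 - dst.alpha) * src.
 */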
5097 static void
5098 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5099 				    pixman_composite_info_t *info)
5100 {
5101     PIXMAN_COMPOSITE_ARGS (info);
5102     uint32_t src;
5103     uint32_t    *dst_line, *dst;
5104     __m128i xmm_src;
5105     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5106     __m128i xmm_dsta_hi, xmm_dsta_lo;
5107     int dst_stride;
5108     int32_t w;
5109 
5110     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5111 
5112     if (src == 0)
5113 	return;
5114 
5115     PIXMAN_IMAGE_GET_LINE (
5116 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5117 
5118     xmm_src = expand_pixel_32_1x128 (src);
5119 
5120     while (height--)
5121     {
5122 	dst = dst_line;
5123 
5124 	dst_line += dst_stride;
5125 	w = width;
5126 
5127 	while (w && (uintptr_t)dst & 15)
5128 	{
5129 	    __m128i vd;
5130 
5131 	    vd = unpack_32_1x128 (*dst);
5132 
5133 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5134 					      xmm_src));
5135 	    w--;
5136 	    dst++;
5137 	}
5138 
5139 	while (w >= 4)
5140 	{
5141 	    __m128i tmp_lo, tmp_hi;
5142 
5143 	    xmm_dst = load_128_aligned ((__m128i*)dst);
5144 
5145 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5146 	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5147 
5148 	    tmp_lo = xmm_src;
5149 	    tmp_hi = xmm_src;
5150 
5151 	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5152 			&xmm_dsta_lo, &xmm_dsta_hi,
5153 			&tmp_lo, &tmp_hi);
5154 
5155 	    save_128_aligned (
5156 		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5157 
5158 	    w -= 4;
5159 	    dst += 4;
5160 	}
5161 
5162 	while (w)
5163 	{
5164 	    __m128i vd;
5165 
5166 	    vd = unpack_32_1x128 (*dst);
5167 
5168 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5169 					      xmm_src));
5170 	    w--;
5171 	    dst++;
5172 	}
5173 
5174     }
5175 
5176 }
5177 
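/*
 * OVER of an a8r8g8b8 source with an a8r8g8b8 mask (only the mask's alpha
 * channel is used) onto an a8r8g8b8 destination.  4-pixel groups with a
 * fully transparent mask are skipped, and groups where both mask and source
 * are fully opaque are stored directly.
 */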
5178 static void
5179 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5180 				    pixman_composite_info_t *info)
5181 {
5182     PIXMAN_COMPOSITE_ARGS (info);
5183     uint32_t    *src, *src_line, s;
5184     uint32_t    *dst, *dst_line, d;
5185     uint32_t    *mask, *mask_line;
5186     uint32_t    m;
5187     int src_stride, mask_stride, dst_stride;
5188     int32_t w;
5189 
5190     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5191     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5192     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5193 
5194     PIXMAN_IMAGE_GET_LINE (
5195 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5196     PIXMAN_IMAGE_GET_LINE (
5197 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5198     PIXMAN_IMAGE_GET_LINE (
5199 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5200 
5201     while (height--)
5202     {
5203         src = src_line;
5204         src_line += src_stride;
5205         dst = dst_line;
5206         dst_line += dst_stride;
5207         mask = mask_line;
5208         mask_line += mask_stride;
5209 
5210         w = width;
5211 
5212         while (w && (uintptr_t)dst & 15)
5213         {
5214 	    uint32_t sa;
5215 
5216             s = *src++;
5217             m = (*mask++) >> 24;
5218             d = *dst;
5219 
5220 	    sa = s >> 24;
5221 
5222 	    if (m)
5223 	    {
5224 		if (sa == 0xff && m == 0xff)
5225 		{
5226 		    *dst = s;
5227 		}
5228 		else
5229 		{
5230 		    __m128i ms, md, ma, msa;
5231 
5232 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5233 		    ms = unpack_32_1x128 (s);
5234 		    md = unpack_32_1x128 (d);
5235 
5236 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5237 
5238 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5239 		}
5240 	    }
5241 
5242 	    dst++;
5243             w--;
5244         }
5245 
5246         while (w >= 4)
5247         {
5248 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
5249 
5250 	    if (!is_transparent (xmm_mask))
5251 	    {
5252 		xmm_src = load_128_unaligned ((__m128i*)src);
5253 
5254 		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5255 		{
5256 		    save_128_aligned ((__m128i *)dst, xmm_src);
5257 		}
5258 		else
5259 		{
5260 		    xmm_dst = load_128_aligned ((__m128i *)dst);
5261 
5262 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5263 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5264 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5265 
5266 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5267 		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5268 
5269 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5270 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5271 
5272 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5273 		}
5274 	    }
5275 
5276             src += 4;
5277             dst += 4;
5278             mask += 4;
5279             w -= 4;
5280         }
5281 
5282         while (w)
5283         {
5284 	    uint32_t sa;
5285 
5286             s = *src++;
5287             m = (*mask++) >> 24;
5288             d = *dst;
5289 
5290 	    sa = s >> 24;
5291 
5292 	    if (m)
5293 	    {
5294 		if (sa == 0xff && m == 0xff)
5295 		{
5296 		    *dst = s;
5297 		}
5298 		else
5299 		{
5300 		    __m128i ms, md, ma, msa;
5301 
5302 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5303 		    ms = unpack_32_1x128 (s);
5304 		    md = unpack_32_1x128 (d);
5305 
5306 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5307 
5308 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5309 		}
5310 	    }
5311 
5312 	    dst++;
5313             w--;
5314         }
5315     }
5316 
5317 }
5318 
5319 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5320 static force_inline void
5321 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5322                                              const uint32_t* ps,
5323                                              int32_t         w,
5324                                              pixman_fixed_t  vx,
5325                                              pixman_fixed_t  unit_x,
5326                                              pixman_fixed_t  src_width_fixed,
5327                                              pixman_bool_t   fully_transparent_src)
5328 {
5329     uint32_t s, d;
5330     const uint32_t* pm = NULL;
5331 
5332     __m128i xmm_dst_lo, xmm_dst_hi;
5333     __m128i xmm_src_lo, xmm_src_hi;
5334     __m128i xmm_alpha_lo, xmm_alpha_hi;
5335 
5336     if (fully_transparent_src)
5337 	return;
5338 
5339     /* Align dst on a 16-byte boundary */
5340     while (w && ((uintptr_t)pd & 15))
5341     {
5342 	d = *pd;
5343 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5344 	vx += unit_x;
5345 	while (vx >= 0)
5346 	    vx -= src_width_fixed;
5347 
5348 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5349 	if (pm)
5350 	    pm++;
5351 	w--;
5352     }
5353 
5354     while (w >= 4)
5355     {
5356 	__m128i tmp;
5357 	uint32_t tmp1, tmp2, tmp3, tmp4;
5358 
5359 	tmp1 = *(ps + pixman_fixed_to_int (vx));
5360 	vx += unit_x;
5361 	while (vx >= 0)
5362 	    vx -= src_width_fixed;
5363 	tmp2 = *(ps + pixman_fixed_to_int (vx));
5364 	vx += unit_x;
5365 	while (vx >= 0)
5366 	    vx -= src_width_fixed;
5367 	tmp3 = *(ps + pixman_fixed_to_int (vx));
5368 	vx += unit_x;
5369 	while (vx >= 0)
5370 	    vx -= src_width_fixed;
5371 	tmp4 = *(ps + pixman_fixed_to_int (vx));
5372 	vx += unit_x;
5373 	while (vx >= 0)
5374 	    vx -= src_width_fixed;
5375 
5376 	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5377 
5378 	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5379 
5380 	if (is_opaque (xmm_src_hi))
5381 	{
5382 	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
5383 	}
5384 	else if (!is_zero (xmm_src_hi))
5385 	{
5386 	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5387 
5388 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5389 	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5390 
5391 	    expand_alpha_2x128 (
5392 		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5393 
5394 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
5395 			&xmm_alpha_lo, &xmm_alpha_hi,
5396 			&xmm_dst_lo, &xmm_dst_hi);
5397 
5398 	    /* rebuild the 4 pixel data and save */
5399 	    save_128_aligned ((__m128i*)pd,
5400 			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5401 	}
5402 
5403 	w -= 4;
5404 	pd += 4;
5405 	if (pm)
5406 	    pm += 4;
5407     }
5408 
5409     while (w)
5410     {
5411 	d = *pd;
5412 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5413 	vx += unit_x;
5414 	while (vx >= 0)
5415 	    vx -= src_width_fixed;
5416 
5417 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5418 	if (pm)
5419 	    pm++;
5420 
5421 	w--;
5422     }
5423 }
5424 
5425 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5426 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5427 		       uint32_t, uint32_t, COVER)
5428 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5429 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5430 		       uint32_t, uint32_t, NONE)
5431 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5432 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5433 		       uint32_t, uint32_t, PAD)
5434 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5435 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5436 		       uint32_t, uint32_t, NORMAL)
5437 
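/*
 * Nearest-neighbour scaled OVER of an a8r8g8b8 source modulated by a single
 * solid mask value (only *mask >> 24 is used, broadcast once up front).
 */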
5438 static force_inline void
5439 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5440 					       uint32_t *       dst,
5441 					       const uint32_t * src,
5442 					       int32_t          w,
5443 					       pixman_fixed_t   vx,
5444 					       pixman_fixed_t   unit_x,
5445 					       pixman_fixed_t   src_width_fixed,
5446 					       pixman_bool_t    zero_src)
5447 {
5448     __m128i xmm_mask;
5449     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5450     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5451     __m128i xmm_alpha_lo, xmm_alpha_hi;
5452 
5453     if (zero_src || (*mask >> 24) == 0)
5454 	return;
5455 
5456     xmm_mask = create_mask_16_128 (*mask >> 24);
5457 
5458     while (w && (uintptr_t)dst & 15)
5459     {
5460 	uint32_t s = *(src + pixman_fixed_to_int (vx));
5461 	vx += unit_x;
5462 	while (vx >= 0)
5463 	    vx -= src_width_fixed;
5464 
5465 	if (s)
5466 	{
5467 	    uint32_t d = *dst;
5468 
5469 	    __m128i ms = unpack_32_1x128 (s);
5470 	    __m128i alpha     = expand_alpha_1x128 (ms);
5471 	    __m128i dest      = xmm_mask;
5472 	    __m128i alpha_dst = unpack_32_1x128 (d);
5473 
5474 	    *dst = pack_1x128_32 (
5475 		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5476 	}
5477 	dst++;
5478 	w--;
5479     }
5480 
5481     while (w >= 4)
5482     {
5483 	uint32_t tmp1, tmp2, tmp3, tmp4;
5484 
5485 	tmp1 = *(src + pixman_fixed_to_int (vx));
5486 	vx += unit_x;
5487 	while (vx >= 0)
5488 	    vx -= src_width_fixed;
5489 	tmp2 = *(src + pixman_fixed_to_int (vx));
5490 	vx += unit_x;
5491 	while (vx >= 0)
5492 	    vx -= src_width_fixed;
5493 	tmp3 = *(src + pixman_fixed_to_int (vx));
5494 	vx += unit_x;
5495 	while (vx >= 0)
5496 	    vx -= src_width_fixed;
5497 	tmp4 = *(src + pixman_fixed_to_int (vx));
5498 	vx += unit_x;
5499 	while (vx >= 0)
5500 	    vx -= src_width_fixed;
5501 
5502 	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5503 
5504 	if (!is_zero (xmm_src))
5505 	{
5506 	    xmm_dst = load_128_aligned ((__m128i*)dst);
5507 
5508 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5509 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5510 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5511 			        &xmm_alpha_lo, &xmm_alpha_hi);
5512 
5513 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5514 			   &xmm_alpha_lo, &xmm_alpha_hi,
5515 			   &xmm_mask, &xmm_mask,
5516 			   &xmm_dst_lo, &xmm_dst_hi);
5517 
5518 	    save_128_aligned (
5519 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5520 	}
5521 
5522 	dst += 4;
5523 	w -= 4;
5524     }
5525 
5526     while (w)
5527     {
5528 	uint32_t s = *(src + pixman_fixed_to_int (vx));
5529 	vx += unit_x;
5530 	while (vx >= 0)
5531 	    vx -= src_width_fixed;
5532 
5533 	if (s)
5534 	{
5535 	    uint32_t d = *dst;
5536 
5537 	    __m128i ms = unpack_32_1x128 (s);
5538 	    __m128i alpha = expand_alpha_1x128 (ms);
5539 	    __m128i mask  = xmm_mask;
5540 	    __m128i dest  = unpack_32_1x128 (d);
5541 
5542 	    *dst = pack_1x128_32 (
5543 		in_over_1x128 (&ms, &alpha, &mask, &dest));
5544 	}
5545 
5546 	dst++;
5547 	w--;
5548     }
5549 
5550 }
5551 
5552 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5553 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5554 			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5555 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5556 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5557 			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5558 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5559 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5560 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5561 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5562 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5563 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5564 
5565 #if PSHUFD_IS_FAST
5566 
5567 /***********************************************************************************/
5568 
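/*
 * Bilinear scaling helpers.  xmm_x packs (vx, -(vx + 1)) pairs so that one
 * logical shift plus the xmm_addc constant produces each pixel's pair of
 * horizontal weights, which _mm_madd_epi16 then applies after the vertical
 * interpolation.  This PSHUFD_IS_FAST variant computes the weights for four
 * pixels at once and selects a pixel's pair with _mm_shuffle_epi32.
 */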
5569 # define BILINEAR_DECLARE_VARIABLES						\
5570     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5571     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5572     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
5573     const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
5574 					   unit_x, -unit_x, unit_x, -unit_x);	\
5575     const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
5576 					   unit_x * 4, -unit_x * 4,		\
5577 					   unit_x * 4, -unit_x * 4,		\
5578 					   unit_x * 4, -unit_x * 4);		\
5579     const __m128i xmm_zero = _mm_setzero_si128 ();				\
5580     __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\
5581 				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\
5582 				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\
5583 				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\
5584     __m128i xmm_wh_state;
5585 
5586 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\
5587 do {										\
5588     int phase = phase_;								\
5589     __m128i xmm_wh, xmm_a, xmm_b;						\
5590     /* fetch 2x2 pixel block into sse2 registers */				\
5591     __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
5592     __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
5593     vx += unit_x;								\
5594     /* vertical interpolation */						\
5595     xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
5596     xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
5597     xmm_a = _mm_add_epi16 (xmm_a, xmm_b);						\
5598     /* calculate horizontal weights */						\
5599     if (phase <= 0)								\
5600     {										\
5601 	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
5602 					16 - BILINEAR_INTERPOLATION_BITS));	\
5603 	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\
5604 	phase = 0;								\
5605     }										\
5606     xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\
5607 							   phase, phase));	\
5608     /* horizontal interpolation */						\
5609     xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
5610 		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);		\
5611     /* shift the result */							\
5612     pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
5613 } while (0)
5614 
5615 #else /************************************************************************/
5616 
5617 # define BILINEAR_DECLARE_VARIABLES						\
5618     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5619     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5620     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
5621     const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
5622 					  unit_x, -unit_x, unit_x, -unit_x);	\
5623     const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
5624 					   unit_x * 4, -unit_x * 4,		\
5625 					   unit_x * 4, -unit_x * 4,		\
5626 					   unit_x * 4, -unit_x * 4);		\
5627     const __m128i xmm_zero = _mm_setzero_si128 ();				\
5628     __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
5629 				   vx, -(vx + 1), vx, -(vx + 1))
5630 
5631 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\
5632 do {										\
5633     __m128i xmm_wh, xmm_a, xmm_b;						\
5634     /* fetch 2x2 pixel block into sse2 registers */				\
5635     __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
5636     __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
5637     (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\
5638     vx += unit_x;								\
5639     /* vertical interpolation */						\
5640     xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
5641     xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
5642     xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
5643     /* calculate horizontal weights */						\
5644     xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\
5645 					16 - BILINEAR_INTERPOLATION_BITS));	\
5646     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
5647     /* horizontal interpolation */						\
5648     xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\
5649     xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\
5650     /* shift the result */							\
5651     pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
5652 } while (0)
5653 
5654 /***********************************************************************************/
5655 
5656 #endif
5657 
5658 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
5659 do {										\
5660 	__m128i xmm_pix;							\
5661 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\
5662 	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\
5663 	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\
5664 	pix = _mm_cvtsi128_si32 (xmm_pix);					\
5665 } while(0)
5666 
5667 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix)					\
5668 do {										\
5669 	__m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\
5670 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\
5671 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\
5672 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\
5673 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\
5674 	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\
5675 	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\
5676 	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\
5677 } while(0)
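/* Each helper invocation above leaves one pixel's four channels in the 32-bit
 * lanes of its result; the two pack steps narrow them to bytes, so 'pix' ends
 * up holding four a8r8g8b8 pixels ready for a single 128-bit store.
 */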
5678 
5679 #define BILINEAR_SKIP_ONE_PIXEL()						\
5680 do {										\
5681     vx += unit_x;								\
5682     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
5683 } while(0)
5684 
5685 #define BILINEAR_SKIP_FOUR_PIXELS()						\
5686 do {										\
5687     vx += unit_x * 4;								\
5688     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\
5689 } while(0)
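/* The scanline workers below all follow the same shape: interpolate single
 * pixels until dst reaches 16-byte alignment, process four pixels per
 * iteration with aligned stores, then handle the remaining tail.  A minimal
 * sketch of that pattern (variable names illustrative only):
 *
 *     while (w && ((uintptr_t)dst & 15))
 *     {
 *         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
 *         *dst++ = pix1;
 *         w--;
 *     }
 *     while (w >= 4)
 *     {
 *         __m128i xmm_src;
 *         BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 *         save_128_aligned ((__m128i *)dst, xmm_src);
 *         dst += 4;
 *         w -= 4;
 *     }
 *     while (w--)
 *     {
 *         BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
 *         *dst++ = pix1;
 *     }
 */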
5690 
5691 /***********************************************************************************/
5692 
5693 static force_inline void
5694 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
5695 					     const uint32_t * mask,
5696 					     const uint32_t * src_top,
5697 					     const uint32_t * src_bottom,
5698 					     int32_t          w,
5699 					     int              wt,
5700 					     int              wb,
5701 					     pixman_fixed_t   vx_,
5702 					     pixman_fixed_t   unit_x_,
5703 					     pixman_fixed_t   max_vx,
5704 					     pixman_bool_t    zero_src)
5705 {
5706     intptr_t vx = vx_;
5707     intptr_t unit_x = unit_x_;
5708     BILINEAR_DECLARE_VARIABLES;
5709     uint32_t pix1, pix2;
5710 
5711     while (w && ((uintptr_t)dst & 15))
5712     {
5713 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5714 	*dst++ = pix1;
5715 	w--;
5716     }
5717 
5718     while ((w -= 4) >= 0) {
5719 	__m128i xmm_src;
5720 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5721 	_mm_store_si128 ((__m128i *)dst, xmm_src);
5722 	dst += 4;
5723     }
5724 
5725     if (w & 2)
5726     {
5727 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5728 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5729 	*dst++ = pix1;
5730 	*dst++ = pix2;
5731     }
5732 
5733     if (w & 1)
5734     {
5735 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5736 	*dst = pix1;
5737     }
5738 
5739 }
5740 
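/* FAST_BILINEAR_MAINLOOP_COMMON (declared in pixman-inlines.h) wraps a
 * scanline worker into a complete composite entry point; each invocation
 * below emits the variant for one repeat mode (COVER, PAD, NONE or NORMAL).
 */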
5741 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5742 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5743 			       uint32_t, uint32_t, uint32_t,
5744 			       COVER, FLAG_NONE)
5745 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5746 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5747 			       uint32_t, uint32_t, uint32_t,
5748 			       PAD, FLAG_NONE)
5749 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5750 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5751 			       uint32_t, uint32_t, uint32_t,
5752 			       NONE, FLAG_NONE)
5753 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5754 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5755 			       uint32_t, uint32_t, uint32_t,
5756 			       NORMAL, FLAG_NONE)
5757 
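/* Same as the 8888 SRC loop above, but the destination alpha byte is forced
 * to 0xff because x8r8g8b8/x8b8g8r8 sources carry no alpha.
 */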
5758 static force_inline void
5759 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
5760 					     const uint32_t * mask,
5761 					     const uint32_t * src_top,
5762 					     const uint32_t * src_bottom,
5763 					     int32_t          w,
5764 					     int              wt,
5765 					     int              wb,
5766 					     pixman_fixed_t   vx_,
5767 					     pixman_fixed_t   unit_x_,
5768 					     pixman_fixed_t   max_vx,
5769 					     pixman_bool_t    zero_src)
5770 {
5771     intptr_t vx = vx_;
5772     intptr_t unit_x = unit_x_;
5773     BILINEAR_DECLARE_VARIABLES;
5774     uint32_t pix1, pix2;
5775 
5776     while (w && ((uintptr_t)dst & 15))
5777     {
5778 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5779 	*dst++ = pix1 | 0xFF000000;
5780 	w--;
5781     }
5782 
5783     while ((w -= 4) >= 0) {
5784 	__m128i xmm_src;
5785 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5786 	_mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5787 	dst += 4;
5788     }
5789 
5790     if (w & 2)
5791     {
5792 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5793 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5794 	*dst++ = pix1 | 0xFF000000;
5795 	*dst++ = pix2 | 0xFF000000;
5796     }
5797 
5798     if (w & 1)
5799     {
5800 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5801 	*dst = pix1 | 0xFF000000;
5802     }
5803 }
5804 
5805 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5806 			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
5807 			       uint32_t, uint32_t, uint32_t,
5808 			       COVER, FLAG_NONE)
5809 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5810 			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
5811 			       uint32_t, uint32_t, uint32_t,
5812 			       PAD, FLAG_NONE)
5813 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5814 			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
5815 			       uint32_t, uint32_t, uint32_t,
5816 			       NORMAL, FLAG_NONE)
5817 
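/* OVER variant: per four-pixel block, a fully transparent source is skipped,
 * a fully opaque one is stored directly, and only the remaining blocks go
 * through the full unpack / expand-alpha / over / pack sequence.
 */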
5818 static force_inline void
5819 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
5820 					      const uint32_t * mask,
5821 					      const uint32_t * src_top,
5822 					      const uint32_t * src_bottom,
5823 					      int32_t          w,
5824 					      int              wt,
5825 					      int              wb,
5826 					      pixman_fixed_t   vx_,
5827 					      pixman_fixed_t   unit_x_,
5828 					      pixman_fixed_t   max_vx,
5829 					      pixman_bool_t    zero_src)
5830 {
5831     intptr_t vx = vx_;
5832     intptr_t unit_x = unit_x_;
5833     BILINEAR_DECLARE_VARIABLES;
5834     uint32_t pix1, pix2;
5835 
5836     while (w && ((uintptr_t)dst & 15))
5837     {
5838 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5839 
5840 	if (pix1)
5841 	{
5842 	    pix2 = *dst;
5843 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5844 	}
5845 
5846 	w--;
5847 	dst++;
5848     }
5849 
5850     while (w >= 4)
5851     {
5852 	__m128i xmm_src;
5853 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5854 	__m128i xmm_alpha_hi, xmm_alpha_lo;
5855 
5856 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5857 
5858 	if (!is_zero (xmm_src))
5859 	{
5860 	    if (is_opaque (xmm_src))
5861 	    {
5862 		save_128_aligned ((__m128i *)dst, xmm_src);
5863 	    }
5864 	    else
5865 	    {
5866 		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5867 
5868 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5869 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5870 
5871 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5872 		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5873 			    &xmm_dst_lo, &xmm_dst_hi);
5874 
5875 		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5876 	    }
5877 	}
5878 
5879 	w -= 4;
5880 	dst += 4;
5881     }
5882 
5883     while (w)
5884     {
5885 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5886 
5887 	if (pix1)
5888 	{
5889 	    pix2 = *dst;
5890 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5891 	}
5892 
5893 	w--;
5894 	dst++;
5895     }
5896 }
5897 
5898 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5899 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5900 			       uint32_t, uint32_t, uint32_t,
5901 			       COVER, FLAG_NONE)
5902 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5903 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5904 			       uint32_t, uint32_t, uint32_t,
5905 			       PAD, FLAG_NONE)
5906 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5907 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5908 			       uint32_t, uint32_t, uint32_t,
5909 			       NONE, FLAG_NONE)
5910 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5911 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5912 			       uint32_t, uint32_t, uint32_t,
5913 			       NORMAL, FLAG_NONE)
5914 
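/* a8-masked OVER: zero mask bytes skip the bilinear fetch entirely via the
 * BILINEAR_SKIP_* macros, and a 0xff mask over an opaque source pixel is
 * stored as-is.  Four mask bytes are loaded with memcpy to avoid an
 * unaligned uint32_t access.
 */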
5915 static force_inline void
5916 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
5917 						const uint8_t  * mask,
5918 						const uint32_t * src_top,
5919 						const uint32_t * src_bottom,
5920 						int32_t          w,
5921 						int              wt,
5922 						int              wb,
5923 						pixman_fixed_t   vx_,
5924 						pixman_fixed_t   unit_x_,
5925 						pixman_fixed_t   max_vx,
5926 						pixman_bool_t    zero_src)
5927 {
5928     intptr_t vx = vx_;
5929     intptr_t unit_x = unit_x_;
5930     BILINEAR_DECLARE_VARIABLES;
5931     uint32_t pix1, pix2;
5932 
5933     while (w && ((uintptr_t)dst & 15))
5934     {
5935 	uint32_t sa;
5936 	uint8_t m = *mask++;
5937 
5938 	if (m)
5939 	{
5940 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5941 	    sa = pix1 >> 24;
5942 
5943 	    if (sa == 0xff && m == 0xff)
5944 	    {
5945 		*dst = pix1;
5946 	    }
5947 	    else
5948 	    {
5949 		__m128i ms, md, ma, msa;
5950 
5951 		pix2 = *dst;
5952 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5953 		ms = unpack_32_1x128 (pix1);
5954 		md = unpack_32_1x128 (pix2);
5955 
5956 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5957 
5958 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5959 	    }
5960 	}
5961 	else
5962 	{
5963 	    BILINEAR_SKIP_ONE_PIXEL ();
5964 	}
5965 
5966 	w--;
5967 	dst++;
5968     }
5969 
5970     while (w >= 4)
5971     {
5972         uint32_t m;
5973 
5974 	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5975 	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5976 	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5977 
5978         memcpy(&m, mask, sizeof(uint32_t));
5979 
5980 	if (m)
5981 	{
5982 	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5983 
5984 	    if (m == 0xffffffff && is_opaque (xmm_src))
5985 	    {
5986 		save_128_aligned ((__m128i *)dst, xmm_src);
5987 	    }
5988 	    else
5989 	    {
5990 		xmm_dst = load_128_aligned ((__m128i *)dst);
5991 
5992 		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5993 
5994 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5995 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5996 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5997 
5998 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5999 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
6000 
6001 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6002 			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6003 
6004 		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6005 	    }
6006 	}
6007 	else
6008 	{
6009 	    BILINEAR_SKIP_FOUR_PIXELS ();
6010 	}
6011 
6012 	w -= 4;
6013 	dst += 4;
6014 	mask += 4;
6015     }
6016 
6017     while (w)
6018     {
6019 	uint32_t sa;
6020 	uint8_t m = *mask++;
6021 
6022 	if (m)
6023 	{
6024 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6025 	    sa = pix1 >> 24;
6026 
6027 	    if (sa == 0xff && m == 0xff)
6028 	    {
6029 		*dst = pix1;
6030 	    }
6031 	    else
6032 	    {
6033 		__m128i ms, md, ma, msa;
6034 
6035 		pix2 = *dst;
6036 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6037 		ms = unpack_32_1x128 (pix1);
6038 		md = unpack_32_1x128 (pix2);
6039 
6040 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6041 
6042 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6043 	    }
6044 	}
6045 	else
6046 	{
6047 	    BILINEAR_SKIP_ONE_PIXEL ();
6048 	}
6049 
6050 	w--;
6051 	dst++;
6052     }
6053 }
6054 
6055 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6056 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6057 			       uint32_t, uint8_t, uint32_t,
6058 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
6059 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6060 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6061 			       uint32_t, uint8_t, uint32_t,
6062 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
6063 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6064 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6065 			       uint32_t, uint8_t, uint32_t,
6066 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
6067 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6068 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6069 			       uint32_t, uint8_t, uint32_t,
6070 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
6071 
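/* Solid-mask OVER: the single mask alpha is expanded once into xmm_mask and
 * reused for the whole scanline; nothing is drawn when the source is known to
 * be zero or the mask alpha is 0.
 */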
6072 static force_inline void
6073 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
6074 						const uint32_t * mask,
6075 						const uint32_t * src_top,
6076 						const uint32_t * src_bottom,
6077 						int32_t          w,
6078 						int              wt,
6079 						int              wb,
6080 						pixman_fixed_t   vx_,
6081 						pixman_fixed_t   unit_x_,
6082 						pixman_fixed_t   max_vx,
6083 						pixman_bool_t    zero_src)
6084 {
6085     intptr_t vx = vx_;
6086     intptr_t unit_x = unit_x_;
6087     BILINEAR_DECLARE_VARIABLES;
6088     uint32_t pix1;
6089     __m128i xmm_mask;
6090 
6091     if (zero_src || (*mask >> 24) == 0)
6092 	return;
6093 
6094     xmm_mask = create_mask_16_128 (*mask >> 24);
6095 
6096     while (w && ((uintptr_t)dst & 15))
6097     {
6098 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6099 	if (pix1)
6100 	{
6101 		uint32_t d = *dst;
6102 
6103 		__m128i ms = unpack_32_1x128 (pix1);
6104 		__m128i alpha     = expand_alpha_1x128 (ms);
6105 		__m128i dest      = xmm_mask;
6106 		__m128i alpha_dst = unpack_32_1x128 (d);
6107 
6108 		*dst = pack_1x128_32
6109 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6110 	}
6111 
6112 	dst++;
6113 	w--;
6114     }
6115 
6116     while (w >= 4)
6117     {
6118 	__m128i xmm_src;
6119 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6120 
6121 	if (!is_zero (xmm_src))
6122 	{
6123 	    __m128i xmm_src_lo, xmm_src_hi;
6124 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6125 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
6126 
6127 	    xmm_dst = load_128_aligned ((__m128i*)dst);
6128 
6129 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6130 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6131 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6132 				&xmm_alpha_lo, &xmm_alpha_hi);
6133 
6134 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6135 			   &xmm_alpha_lo, &xmm_alpha_hi,
6136 			   &xmm_mask, &xmm_mask,
6137 			   &xmm_dst_lo, &xmm_dst_hi);
6138 
6139 	    save_128_aligned
6140 		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6141 	}
6142 
6143 	dst += 4;
6144 	w -= 4;
6145     }
6146 
6147     while (w)
6148     {
6149 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6150 	if (pix1)
6151 	{
6152 		uint32_t d = *dst;
6153 
6154 		__m128i ms = unpack_32_1x128 (pix1);
6155 		__m128i alpha     = expand_alpha_1x128 (ms);
6156 		__m128i dest      = xmm_mask;
6157 		__m128i alpha_dst = unpack_32_1x128 (d);
6158 
6159 		*dst = pack_1x128_32
6160 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6161 	}
6162 
6163 	dst++;
6164 	w--;
6165     }
6166 }
6167 
6168 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6169 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6170 			       uint32_t, uint32_t, uint32_t,
6171 			       COVER, FLAG_HAVE_SOLID_MASK)
6172 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6173 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6174 			       uint32_t, uint32_t, uint32_t,
6175 			       PAD, FLAG_HAVE_SOLID_MASK)
6176 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6177 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6178 			       uint32_t, uint32_t, uint32_t,
6179 			       NONE, FLAG_HAVE_SOLID_MASK)
6180 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6181 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6182 			       uint32_t, uint32_t, uint32_t,
6183 			       NORMAL, FLAG_HAVE_SOLID_MASK)
6184 
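/* Fast path table: maps (operator, source format, mask format, destination
 * format) combinations to the specialized routines above.  The compositing
 * core consults this table before falling back to the general path; it is
 * terminated by the PIXMAN_OP_NONE entry.
 */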
6185 static const pixman_fast_path_t sse2_fast_paths[] =
6186 {
6187     /* PIXMAN_OP_OVER */
6188     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6189     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6190     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6191     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6192     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6193     PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6194     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6195     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6196     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6197     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6198     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6199     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6200     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6201     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6202     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6203     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6204     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6205     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6206     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6207     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6208     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6209     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6210     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6211     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6212     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6213     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6214     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6215     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6216     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6217     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6218     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6219     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6220     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6221     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6222     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6223     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6224     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6225     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6226     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6227     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6228     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6229     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6230     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6231     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6232     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6233     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6234     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6235 
6236     /* PIXMAN_OP_OVER_REVERSE */
6237     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6238     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6239 
6240     /* PIXMAN_OP_ADD */
6241     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6242     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6243     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6244     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6245     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6246     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6247     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6248     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6249     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6250     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6251     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6252     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6253     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6254     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6255 
6256     /* PIXMAN_OP_SRC */
6257     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6258     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6259     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6260     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6261     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6262     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6263     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6264     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6265     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6266     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6267     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6268     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6269     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6270     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6271     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6272     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6273     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6274     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6275 
6276     /* PIXMAN_OP_IN */
6277     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6278     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6279     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6280 
6281     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6282     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6283     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6284     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6285 
6286     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6287     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6288     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6289     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6290 
6291     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6292     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6293     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6294     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6295     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6296     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6297 
6298     SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6299     SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6300     SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6301     SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6302     SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6303     SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6304 
6305     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6306     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6307     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6308     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6309 
6310     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6311     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6312     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6313     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6314 
6315     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6316     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6317     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6318     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6319 
6320     { PIXMAN_OP_NONE },
6321 };
6322 
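/* Source iterators: each fetcher below converts one scanline of the
 * underlying format into 8-bit-per-channel a8r8g8b8 in iter->buffer
 * (ITER_NARROW), advancing iter->bits by one stride per call.  Each uses an
 * unaligned head loop so the vector body can use aligned 128-bit stores.
 */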
6323 static uint32_t *
6324 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6325 {
6326     int w = iter->width;
6327     __m128i ff000000 = mask_ff000000;
6328     uint32_t *dst = iter->buffer;
6329     uint32_t *src = (uint32_t *)iter->bits;
6330 
6331     iter->bits += iter->stride;
6332 
6333     while (w && ((uintptr_t)dst) & 0x0f)
6334     {
6335 	*dst++ = (*src++) | 0xff000000;
6336 	w--;
6337     }
6338 
6339     while (w >= 4)
6340     {
6341 	save_128_aligned (
6342 	    (__m128i *)dst, _mm_or_si128 (
6343 		load_128_unaligned ((__m128i *)src), ff000000));
6344 
6345 	dst += 4;
6346 	src += 4;
6347 	w -= 4;
6348     }
6349 
6350     while (w)
6351     {
6352 	*dst++ = (*src++) | 0xff000000;
6353 	w--;
6354     }
6355 
6356     return iter->buffer;
6357 }
6358 
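/* r5g6b5 fetcher: widens eight 16-bit pixels per iteration and forces the
 * alpha byte of every result to 0xff.
 */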
6359 static uint32_t *
6360 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6361 {
6362     int w = iter->width;
6363     uint32_t *dst = iter->buffer;
6364     uint16_t *src = (uint16_t *)iter->bits;
6365     __m128i ff000000 = mask_ff000000;
6366 
6367     iter->bits += iter->stride;
6368 
6369     while (w && ((uintptr_t)dst) & 0x0f)
6370     {
6371 	uint16_t s = *src++;
6372 
6373 	*dst++ = convert_0565_to_8888 (s);
6374 	w--;
6375     }
6376 
6377     while (w >= 8)
6378     {
6379 	__m128i lo, hi, s;
6380 
6381 	s = _mm_loadu_si128 ((__m128i *)src);
6382 
6383 	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6384 	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6385 
6386 	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6387 	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6388 
6389 	dst += 8;
6390 	src += 8;
6391 	w -= 8;
6392     }
6393 
6394     while (w)
6395     {
6396 	uint16_t s = *src++;
6397 
6398 	*dst++ = convert_0565_to_8888 (s);
6399 	w--;
6400     }
6401 
6402     return iter->buffer;
6403 }
6404 
6405 static uint32_t *
6406 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6407 {
6408     int w = iter->width;
6409     uint32_t *dst = iter->buffer;
6410     uint8_t *src = iter->bits;
6411     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6412 
6413     iter->bits += iter->stride;
6414 
6415     while (w && (((uintptr_t)dst) & 15))
6416     {
6417         *dst++ = (uint32_t)(*(src++)) << 24;
6418         w--;
6419     }
6420 
6421     while (w >= 16)
6422     {
6423 	xmm0 = _mm_loadu_si128((__m128i *)src);
6424 
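	/* Interleaving zero bytes below each source byte, twice, moves the
	 * alpha value into bits 31..24 of every 32-bit lane, i.e. a << 24
	 * with the colour channels already zeroed.
	 */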
6425 	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
6426 	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
6427 	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6428 	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6429 	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6430 	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6431 
6432 	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
6433 	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
6434 	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
6435 	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6436 
6437 	dst += 16;
6438 	src += 16;
6439 	w -= 16;
6440     }
6441 
6442     while (w)
6443     {
6444 	*dst++ = (uint32_t)(*(src++)) << 24;
6445 	w--;
6446     }
6447 
6448     return iter->buffer;
6449 }
6450 
6451 #define IMAGE_FLAGS							\
6452     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
6453      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6454 
6455 static const pixman_iter_info_t sse2_iters[] =
6456 {
6457     { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
6458       _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
6459     },
6460     { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
6461       _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
6462     },
6463     { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
6464       _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
6465     },
6466     { PIXMAN_null },
6467 };
6468 
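/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment, so ask GCC to
 * realign the stack on entry; the SSE2 code may spill __m128i values that
 * require 16-byte alignment.
 */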
6469 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6470 __attribute__((__force_align_arg_pointer__))
6471 #endif
6472 pixman_implementation_t *
6473 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6474 {
6475     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6476 
6477     /* SSE2 constants */
6478     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6479     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6480     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6481     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6482     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6483     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6484     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6485     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6486     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6487     mask_0080 = create_mask_16_128 (0x0080);
6488     mask_00ff = create_mask_16_128 (0x00ff);
6489     mask_0101 = create_mask_16_128 (0x0101);
6490     mask_ffff = create_mask_16_128 (0xffff);
6491     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6492     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6493     mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6494     mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6495 
6496     /* Set up function pointers */
6497     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6498     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6499     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6500     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6501     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6502     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6503     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6504     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6505     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6506     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6507 
6508     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6509 
6510     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6511     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6512     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6513     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6514     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6515     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6516     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6517     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6518     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6519     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6520     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6521 
6522     imp->blt = sse2_blt;
6523     imp->fill = sse2_fill;
6524 
6525     imp->iter_info = sse2_iters;
6526 
6527     return imp;
6528 }
6529