1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32 
33 /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
34 #define PSHUFD_IS_FAST 0
35 
36 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
37 #include <emmintrin.h> /* for SSE2 intrinsics */
38 #include "pixman-private.h"
39 #include "pixman-combine32.h"
40 #include "pixman-inlines.h"
41 
42 static __m128i mask_0080;
43 static __m128i mask_00ff;
44 static __m128i mask_0101;
45 static __m128i mask_ffff;
46 static __m128i mask_ff000000;
47 static __m128i mask_alpha;
48 
49 static __m128i mask_565_r;
50 static __m128i mask_565_g1, mask_565_g2;
51 static __m128i mask_565_b;
52 static __m128i mask_red;
53 static __m128i mask_green;
54 static __m128i mask_blue;
55 
56 static __m128i mask_565_fix_rb;
57 static __m128i mask_565_fix_g;
58 
59 static __m128i mask_565_rb;
60 static __m128i mask_565_pack_multiplier;
61 
62 static force_inline __m128i
63 unpack_32_1x128 (uint32_t data)
64 {
65     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
66 }
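/* A worked example of the layout (a sketch; byte order assumes the usual
 * little-endian x86 target): an a8r8g8b8 pixel 0xAARRGGBB is stored as the
 * bytes BB GG RR AA, so after the zero-interleave the low 64 bits of the
 * result hold the 16-bit lanes 00BB 00GG 00RR 00AA, i.e. one channel per
 * 16-bit lane with 8 bits of headroom for the arithmetic below.
 */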
67 
68 static force_inline void
69 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
70 {
71     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
72     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
73 }
74 
75 static force_inline __m128i
76 unpack_565_to_8888 (__m128i lo)
77 {
78     __m128i r, g, b, rb, t;
79 
80     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
81     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
82     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
83 
84     rb = _mm_or_si128 (r, b);
85     t  = _mm_and_si128 (rb, mask_565_fix_rb);
86     t  = _mm_srli_epi32 (t, 5);
87     rb = _mm_or_si128 (rb, t);
88 
89     t  = _mm_and_si128 (g, mask_565_fix_g);
90     t  = _mm_srli_epi32 (t, 6);
91     g  = _mm_or_si128 (g, t);
92 
93     return _mm_or_si128 (rb, g);
94 }
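/* For reference, a scalar sketch of the same r5g6b5 -> x8r8g8b8 expansion
 * (illustrative only; nothing below calls it).  The `| (c >> n)` step
 * replicates the top bits of each field into the freshly opened low bits,
 * so that e.g. 0x1f widens to 0xff rather than 0xf8, which is what the
 * mask_565_fix_rb / mask_565_fix_g corrections above achieve.
 */
static force_inline uint32_t
unpack_565_to_8888_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}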
95 
96 static force_inline void
97 unpack_565_128_4x128 (__m128i  data,
98                       __m128i* data0,
99                       __m128i* data1,
100                       __m128i* data2,
101                       __m128i* data3)
102 {
103     __m128i lo, hi;
104 
105     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
106     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
107 
108     lo = unpack_565_to_8888 (lo);
109     hi = unpack_565_to_8888 (hi);
110 
111     unpack_128_2x128 (lo, data0, data1);
112     unpack_128_2x128 (hi, data2, data3);
113 }
114 
115 static force_inline uint16_t
116 pack_565_32_16 (uint32_t pixel)
117 {
118     return (uint16_t) (((pixel >> 8) & 0xf800) |
119 		       ((pixel >> 5) & 0x07e0) |
120 		       ((pixel >> 3) & 0x001f));
121 }
122 
123 static force_inline __m128i
124 pack_2x128_128 (__m128i lo, __m128i hi)
125 {
126     return _mm_packus_epi16 (lo, hi);
127 }
128 
129 static force_inline __m128i
130 pack_565_2packedx128_128 (__m128i lo, __m128i hi)
131 {
132     __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
133     __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
134 
135     __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
136     __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
137 
138     __m128i g0 = _mm_and_si128 (lo, mask_green);
139     __m128i g1 = _mm_and_si128 (hi, mask_green);
140 
141     t0 = _mm_or_si128 (t0, g0);
142     t1 = _mm_or_si128 (t1, g1);
143 
144     /* Simulates _mm_packus_epi32 */
145     t0 = _mm_slli_epi32 (t0, 16 - 5);
146     t1 = _mm_slli_epi32 (t1, 16 - 5);
147     t0 = _mm_srai_epi32 (t0, 16);
148     t1 = _mm_srai_epi32 (t1, 16);
149     return _mm_packs_epi32 (t0, t1);
150 }
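/* How the pmaddwd trick above works (a sketch; the exact constants are set
 * up in the mask initialization elsewhere in this file): mask_565_rb keeps
 * only the top five bits of red and blue, and _mm_madd_epi16 multiplies
 * blue by a small power of two and red by a large one, summing them into a
 * single 32-bit lane.  After OR-ing in the green field and the shift/pack
 * sequence that emulates _mm_packus_epi32, each pixel collapses to the
 * same r5g6b5 value pack_565_32_16() would produce.
 */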
151 
152 static force_inline __m128i
153 pack_565_2x128_128 (__m128i lo, __m128i hi)
154 {
155     __m128i data;
156     __m128i r, g1, g2, b;
157 
158     data = pack_2x128_128 (lo, hi);
159 
160     r  = _mm_and_si128 (data, mask_565_r);
161     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
162     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
163     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
164 
165     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
166 }
167 
168 static force_inline __m128i
169 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
170 {
171     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
172 			     pack_565_2x128_128 (*xmm2, *xmm3));
173 }
174 
175 static force_inline int
176 is_opaque (__m128i x)
177 {
178     __m128i ffs = _mm_cmpeq_epi8 (x, x);
179 
180     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
181 }
182 
183 static force_inline int
184 is_zero (__m128i x)
185 {
186     return _mm_movemask_epi8 (
187 	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
188 }
189 
190 static force_inline int
191 is_transparent (__m128i x)
192 {
193     return (_mm_movemask_epi8 (
194 		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
195 }
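/* In the three predicates above, _mm_movemask_epi8 yields one bit per byte.
 * Bit 3 of each 4-byte group corresponds to that pixel's alpha byte in
 * a8r8g8b8, so a result of 0x8888 in is_opaque/is_transparent means the
 * comparison held for the alpha of all four pixels, while is_zero requires
 * all sixteen bytes (0xffff) to compare equal to zero.
 */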
196 
197 static force_inline __m128i
198 expand_pixel_32_1x128 (uint32_t data)
199 {
200     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
201 }
202 
203 static force_inline __m128i
204 expand_alpha_1x128 (__m128i data)
205 {
206     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
207 						     _MM_SHUFFLE (3, 3, 3, 3)),
208 				_MM_SHUFFLE (3, 3, 3, 3));
209 }
210 
211 static force_inline void
212 expand_alpha_2x128 (__m128i  data_lo,
213                     __m128i  data_hi,
214                     __m128i* alpha_lo,
215                     __m128i* alpha_hi)
216 {
217     __m128i lo, hi;
218 
219     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
220     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
221 
222     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
223     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
224 }
225 
226 static force_inline void
227 expand_alpha_rev_2x128 (__m128i  data_lo,
228                         __m128i  data_hi,
229                         __m128i* alpha_lo,
230                         __m128i* alpha_hi)
231 {
232     __m128i lo, hi;
233 
234     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
235     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
236     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
237     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
238 }
239 
240 static force_inline void
241 pix_multiply_2x128 (__m128i* data_lo,
242                     __m128i* data_hi,
243                     __m128i* alpha_lo,
244                     __m128i* alpha_hi,
245                     __m128i* ret_lo,
246                     __m128i* ret_hi)
247 {
248     __m128i lo, hi;
249 
250     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
251     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
252     lo = _mm_adds_epu16 (lo, mask_0080);
253     hi = _mm_adds_epu16 (hi, mask_0080);
254     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
255     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
256 }
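/* A scalar sketch of the per-channel multiply above (illustrative only;
 * nothing calls it).  The SIMD code computes ((x * a + 0x80) * 0x0101) >> 16,
 * the usual exact x*a/255 rounding trick, assuming mask_0080 and mask_0101
 * hold 0x0080 and 0x0101 in every 16-bit lane (they are initialized
 * elsewhere, outside this excerpt).  The result matches the MUL_UN8 macro
 * used by the generic C paths.
 */
static force_inline uint8_t
pix_multiply_scalar (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t)x * a + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}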
257 
258 static force_inline void
259 pix_add_multiply_2x128 (__m128i* src_lo,
260                         __m128i* src_hi,
261                         __m128i* alpha_dst_lo,
262                         __m128i* alpha_dst_hi,
263                         __m128i* dst_lo,
264                         __m128i* dst_hi,
265                         __m128i* alpha_src_lo,
266                         __m128i* alpha_src_hi,
267                         __m128i* ret_lo,
268                         __m128i* ret_hi)
269 {
270     __m128i t1_lo, t1_hi;
271     __m128i t2_lo, t2_hi;
272 
273     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
274     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
275 
276     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
277     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
278 }
279 
280 static force_inline void
281 negate_2x128 (__m128i  data_lo,
282               __m128i  data_hi,
283               __m128i* neg_lo,
284               __m128i* neg_hi)
285 {
286     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
287     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
288 }
289 
290 static force_inline void
291 invert_colors_2x128 (__m128i  data_lo,
292                      __m128i  data_hi,
293                      __m128i* inv_lo,
294                      __m128i* inv_hi)
295 {
296     __m128i lo, hi;
297 
298     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
299     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
300     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
301     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
302 }
303 
304 static force_inline void
305 over_2x128 (__m128i* src_lo,
306             __m128i* src_hi,
307             __m128i* alpha_lo,
308             __m128i* alpha_hi,
309             __m128i* dst_lo,
310             __m128i* dst_hi)
311 {
312     __m128i t1, t2;
313 
314     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
315 
316     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
317 
318     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
319     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
320 }
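/* over_2x128 above and over_1x128 further down implement the premultiplied
 * OVER operator per channel:
 *
 *     dst = src + (0xff - alpha) * dst / 255
 *
 * where the multiply/divide is the rounded pix_multiply above and the final
 * add saturates, so a fully opaque source simply replaces the destination.
 */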
321 
322 static force_inline void
323 over_rev_non_pre_2x128 (__m128i  src_lo,
324                         __m128i  src_hi,
325                         __m128i* dst_lo,
326                         __m128i* dst_hi)
327 {
328     __m128i lo, hi;
329     __m128i alpha_lo, alpha_hi;
330 
331     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
332 
333     lo = _mm_or_si128 (alpha_lo, mask_alpha);
334     hi = _mm_or_si128 (alpha_hi, mask_alpha);
335 
336     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
337 
338     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
339 
340     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
341 }
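/* over_rev_non_pre_2x128 handles a non-premultiplied, component-reversed
 * source: the colors are swapped back with invert_colors_2x128 and
 * premultiplied by the source alpha (OR-ing in mask_alpha, presumably
 * 0x00ff in the alpha lane and initialized elsewhere, keeps the alpha
 * channel itself unscaled) before the ordinary OVER above is applied.
 */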
342 
343 static force_inline void
344 in_over_2x128 (__m128i* src_lo,
345                __m128i* src_hi,
346                __m128i* alpha_lo,
347                __m128i* alpha_hi,
348                __m128i* mask_lo,
349                __m128i* mask_hi,
350                __m128i* dst_lo,
351                __m128i* dst_hi)
352 {
353     __m128i s_lo, s_hi;
354     __m128i a_lo, a_hi;
355 
356     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
357     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
358 
359     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
360 }
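/* in_over_2x128 is the workhorse for masked OVER: it computes
 * (src IN mask) OVER dst, i.e. both the source and its alpha are first
 * multiplied by the (possibly per-component) mask before the OVER step.
 */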
361 
362 /* load 4 pixels from a 16-byte aligned address */
363 static force_inline __m128i
364 load_128_aligned (__m128i* src)
365 {
366     return _mm_load_si128 (src);
367 }
368 
369 /* load 4 pixels from an unaligned address */
370 static force_inline __m128i
371 load_128_unaligned (const __m128i* src)
372 {
373     return _mm_loadu_si128 (src);
374 }
375 
376 /* save 4 pixels to a 16-byte aligned address using a non-temporal
377  * (write-combining) store
378  */
379 static force_inline void
380 save_128_write_combining (__m128i* dst,
381                           __m128i  data)
382 {
383     _mm_stream_si128 (dst, data);
384 }
385 
386 /* save 4 pixels to a 16-byte aligned address */
387 static force_inline void
388 save_128_aligned (__m128i* dst,
389                   __m128i  data)
390 {
391     _mm_store_si128 (dst, data);
392 }
393 
394 /* save 4 pixels to an unaligned address */
395 static force_inline void
396 save_128_unaligned (__m128i* dst,
397                     __m128i  data)
398 {
399     _mm_storeu_si128 (dst, data);
400 }
401 
402 static force_inline __m128i
403 load_32_1x128 (uint32_t data)
404 {
405     return _mm_cvtsi32_si128 (data);
406 }
407 
408 static force_inline __m128i
409 expand_alpha_rev_1x128 (__m128i data)
410 {
411     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
412 }
413 
414 static force_inline __m128i
415 expand_pixel_8_1x128 (uint8_t data)
416 {
417     return _mm_shufflelo_epi16 (
418 	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
419 }
420 
421 static force_inline __m128i
422 pix_multiply_1x128 (__m128i data,
423 		    __m128i alpha)
424 {
425     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
426 					    mask_0080),
427 			    mask_0101);
428 }
429 
430 static force_inline __m128i
431 pix_add_multiply_1x128 (__m128i* src,
432 			__m128i* alpha_dst,
433 			__m128i* dst,
434 			__m128i* alpha_src)
435 {
436     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
437     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
438 
439     return _mm_adds_epu8 (t1, t2);
440 }
441 
442 static force_inline __m128i
443 negate_1x128 (__m128i data)
444 {
445     return _mm_xor_si128 (data, mask_00ff);
446 }
447 
448 static force_inline __m128i
449 invert_colors_1x128 (__m128i data)
450 {
451     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
452 }
453 
454 static force_inline __m128i
455 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
456 {
457     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
458 }
459 
460 static force_inline __m128i
461 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
462 {
463     return over_1x128 (pix_multiply_1x128 (*src, *mask),
464 		       pix_multiply_1x128 (*alpha, *mask),
465 		       *dst);
466 }
467 
468 static force_inline __m128i
469 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
470 {
471     __m128i alpha = expand_alpha_1x128 (src);
472 
473     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
474 					   _mm_or_si128 (alpha, mask_alpha)),
475 		       alpha,
476 		       dst);
477 }
478 
479 static force_inline uint32_t
480 pack_1x128_32 (__m128i data)
481 {
482     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
483 }
484 
485 static force_inline __m128i
486 expand565_16_1x128 (uint16_t pixel)
487 {
488     __m128i m = _mm_cvtsi32_si128 (pixel);
489 
490     m = unpack_565_to_8888 (m);
491 
492     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
493 }
494 
495 static force_inline uint32_t
496 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
497 {
498     uint8_t a;
499     __m128i xmms;
500 
501     a = src >> 24;
502 
503     if (a == 0xff)
504     {
505 	return src;
506     }
507     else if (src)
508     {
509 	xmms = unpack_32_1x128 (src);
510 	return pack_1x128_32 (
511 	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
512 			unpack_32_1x128 (dst)));
513     }
514 
515     return dst;
516 }
517 
518 static force_inline uint32_t
519 combine1 (const uint32_t *ps, const uint32_t *pm)
520 {
521     uint32_t s;
522     memcpy(&s, ps, sizeof(uint32_t));
523 
524     if (pm)
525     {
526 	__m128i ms, mm;
527 
528 	mm = unpack_32_1x128 (*pm);
529 	mm = expand_alpha_1x128 (mm);
530 
531 	ms = unpack_32_1x128 (s);
532 	ms = pix_multiply_1x128 (ms, mm);
533 
534 	s = pack_1x128_32 (ms);
535     }
536 
537     return s;
538 }
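/* In the unified (_u) combiners a mask, when present, contributes only its
 * alpha channel: combine1 above and combine4 below pre-multiply the source
 * by expand_alpha (mask).  The memcpy covers a potentially unaligned or
 * aliased source read without invoking undefined behaviour.
 */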
539 
540 static force_inline __m128i
541 combine4 (const __m128i *ps, const __m128i *pm)
542 {
543     __m128i xmm_src_lo, xmm_src_hi;
544     __m128i xmm_msk_lo, xmm_msk_hi;
545     __m128i s;
546 
547     if (pm)
548     {
549 	xmm_msk_lo = load_128_unaligned (pm);
550 
551 	if (is_transparent (xmm_msk_lo))
552 	    return _mm_setzero_si128 ();
553     }
554 
555     s = load_128_unaligned (ps);
556 
557     if (pm)
558     {
559 	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
560 	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
561 
562 	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
563 
564 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
565 			    &xmm_msk_lo, &xmm_msk_hi,
566 			    &xmm_src_lo, &xmm_src_hi);
567 
568 	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
569     }
570 
571     return s;
572 }
573 
574 static force_inline void
575 core_combine_over_u_sse2_mask (uint32_t *	  pd,
576 			       const uint32_t*    ps,
577 			       const uint32_t*    pm,
578 			       int                w)
579 {
580     uint32_t s, d;
581 
582     /* Align dst on a 16-byte boundary */
583     while (w && ((uintptr_t)pd & 15))
584     {
585 	d = *pd;
586 	s = combine1 (ps, pm);
587 
588 	if (s)
589 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
590 	pd++;
591 	ps++;
592 	pm++;
593 	w--;
594     }
595 
596     while (w >= 4)
597     {
598 	__m128i mask = load_128_unaligned ((__m128i *)pm);
599 
600 	if (!is_zero (mask))
601 	{
602 	    __m128i src;
603 	    __m128i src_hi, src_lo;
604 	    __m128i mask_hi, mask_lo;
605 	    __m128i alpha_hi, alpha_lo;
606 
607 	    src = load_128_unaligned ((__m128i *)ps);
608 
609 	    if (is_opaque (_mm_and_si128 (src, mask)))
610 	    {
611 		save_128_aligned ((__m128i *)pd, src);
612 	    }
613 	    else
614 	    {
615 		__m128i dst = load_128_aligned ((__m128i *)pd);
616 		__m128i dst_hi, dst_lo;
617 
618 		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
619 		unpack_128_2x128 (src, &src_lo, &src_hi);
620 
621 		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
622 		pix_multiply_2x128 (&src_lo, &src_hi,
623 				    &mask_lo, &mask_hi,
624 				    &src_lo, &src_hi);
625 
626 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
627 
628 		expand_alpha_2x128 (src_lo, src_hi,
629 				    &alpha_lo, &alpha_hi);
630 
631 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
632 			    &dst_lo, &dst_hi);
633 
634 		save_128_aligned (
635 		    (__m128i *)pd,
636 		    pack_2x128_128 (dst_lo, dst_hi));
637 	    }
638 	}
639 
640 	pm += 4;
641 	ps += 4;
642 	pd += 4;
643 	w -= 4;
644     }
645     while (w)
646     {
647 	d = *pd;
648 	s = combine1 (ps, pm);
649 
650 	if (s)
651 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
652 	pd++;
653 	ps++;
654 	pm++;
655 
656 	w--;
657     }
658 }
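/* The structure above (a scalar loop until pd reaches a 16-byte boundary,
 * a four-pixel SSE2 body using aligned destination stores, then a scalar
 * tail) is the pattern followed by essentially every combiner in this
 * file; only the per-pixel math changes.
 */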
659 
660 static force_inline void
661 core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
662 				  const uint32_t*    ps,
663 				  int                w)
664 {
665     uint32_t s, d;
666 
667     /* Align dst on a 16-byte boundary */
668     while (w && ((uintptr_t)pd & 15))
669     {
670 	d = *pd;
671 	s = *ps;
672 
673 	if (s)
674 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
675 	pd++;
676 	ps++;
677 	w--;
678     }
679 
680     while (w >= 4)
681     {
682 	__m128i src;
683 	__m128i src_hi, src_lo, dst_hi, dst_lo;
684 	__m128i alpha_hi, alpha_lo;
685 
686 	src = load_128_unaligned ((__m128i *)ps);
687 
688 	if (!is_zero (src))
689 	{
690 	    if (is_opaque (src))
691 	    {
692 		save_128_aligned ((__m128i *)pd, src);
693 	    }
694 	    else
695 	    {
696 		__m128i dst = load_128_aligned ((__m128i *)pd);
697 
698 		unpack_128_2x128 (src, &src_lo, &src_hi);
699 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
700 
701 		expand_alpha_2x128 (src_lo, src_hi,
702 				    &alpha_lo, &alpha_hi);
703 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
704 			    &dst_lo, &dst_hi);
705 
706 		save_128_aligned (
707 		    (__m128i *)pd,
708 		    pack_2x128_128 (dst_lo, dst_hi));
709 	    }
710 	}
711 
712 	ps += 4;
713 	pd += 4;
714 	w -= 4;
715     }
716     while (w)
717     {
718 	d = *pd;
719 	s = *ps;
720 
721 	if (s)
722 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
723 	pd++;
724 	ps++;
725 
726 	w--;
727     }
728 }
729 
730 static force_inline void
731 sse2_combine_over_u (pixman_implementation_t *imp,
732                      pixman_op_t              op,
733                      uint32_t *               pd,
734                      const uint32_t *         ps,
735                      const uint32_t *         pm,
736                      int                      w)
737 {
738     if (pm)
739 	core_combine_over_u_sse2_mask (pd, ps, pm, w);
740     else
741 	core_combine_over_u_sse2_no_mask (pd, ps, w);
742 }
743 
744 static void
745 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
746                              pixman_op_t              op,
747                              uint32_t *               pd,
748                              const uint32_t *         ps,
749                              const uint32_t *         pm,
750                              int                      w)
751 {
752     uint32_t s, d;
753 
754     __m128i xmm_dst_lo, xmm_dst_hi;
755     __m128i xmm_src_lo, xmm_src_hi;
756     __m128i xmm_alpha_lo, xmm_alpha_hi;
757 
758     /* Align dst on a 16-byte boundary */
759     while (w &&
760            ((uintptr_t)pd & 15))
761     {
762 	d = *pd;
763 	s = combine1 (ps, pm);
764 
765 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
766 	w--;
767 	ps++;
768 	if (pm)
769 	    pm++;
770     }
771 
772     while (w >= 4)
773     {
774 	/* I'm loading unaligned because I'm not sure
775 	 * about the address alignment.
776 	 */
777 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
778 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 
780 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
781 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 
783 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
784 			    &xmm_alpha_lo, &xmm_alpha_hi);
785 
786 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
787 		    &xmm_alpha_lo, &xmm_alpha_hi,
788 		    &xmm_src_lo, &xmm_src_hi);
789 
790 	/* rebuild the 4 pixel data and save */
791 	save_128_aligned ((__m128i*)pd,
792 			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
793 
794 	w -= 4;
795 	ps += 4;
796 	pd += 4;
797 
798 	if (pm)
799 	    pm += 4;
800     }
801 
802     while (w)
803     {
804 	d = *pd;
805 	s = combine1 (ps, pm);
806 
807 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
808 	ps++;
809 	w--;
810 	if (pm)
811 	    pm++;
812     }
813 }
814 
815 static force_inline uint32_t
816 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
817 {
818     uint32_t maska = src >> 24;
819 
820     if (maska == 0)
821     {
822 	return 0;
823     }
824     else if (maska != 0xff)
825     {
826 	return pack_1x128_32 (
827 	    pix_multiply_1x128 (unpack_32_1x128 (dst),
828 				expand_alpha_1x128 (unpack_32_1x128 (src))));
829     }
830 
831     return dst;
832 }
833 
834 static void
835 sse2_combine_in_u (pixman_implementation_t *imp,
836                    pixman_op_t              op,
837                    uint32_t *               pd,
838                    const uint32_t *         ps,
839                    const uint32_t *         pm,
840                    int                      w)
841 {
842     uint32_t s, d;
843 
844     __m128i xmm_src_lo, xmm_src_hi;
845     __m128i xmm_dst_lo, xmm_dst_hi;
846 
847     while (w && ((uintptr_t)pd & 15))
848     {
849 	s = combine1 (ps, pm);
850 	d = *pd;
851 
852 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
853 	w--;
854 	ps++;
855 	if (pm)
856 	    pm++;
857     }
858 
859     while (w >= 4)
860     {
861 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
862 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
863 
864 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
865 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
866 
867 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
868 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
869 			    &xmm_dst_lo, &xmm_dst_hi,
870 			    &xmm_dst_lo, &xmm_dst_hi);
871 
872 	save_128_aligned ((__m128i*)pd,
873 			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
874 
875 	ps += 4;
876 	pd += 4;
877 	w -= 4;
878 	if (pm)
879 	    pm += 4;
880     }
881 
882     while (w)
883     {
884 	s = combine1 (ps, pm);
885 	d = *pd;
886 
887 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
888 	w--;
889 	ps++;
890 	if (pm)
891 	    pm++;
892     }
893 }
894 
895 static void
896 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
897                            pixman_op_t              op,
898                            uint32_t *               pd,
899                            const uint32_t *         ps,
900                            const uint32_t *         pm,
901                            int                      w)
902 {
903     uint32_t s, d;
904 
905     __m128i xmm_src_lo, xmm_src_hi;
906     __m128i xmm_dst_lo, xmm_dst_hi;
907 
908     while (w && ((uintptr_t)pd & 15))
909     {
910 	s = combine1 (ps, pm);
911 	d = *pd;
912 
913 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
914 	ps++;
915 	w--;
916 	if (pm)
917 	    pm++;
918     }
919 
920     while (w >= 4)
921     {
922 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
923 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
924 
925 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
926 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
927 
928 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
929 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
930 			    &xmm_src_lo, &xmm_src_hi,
931 			    &xmm_dst_lo, &xmm_dst_hi);
932 
933 	save_128_aligned (
934 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
935 
936 	ps += 4;
937 	pd += 4;
938 	w -= 4;
939 	if (pm)
940 	    pm += 4;
941     }
942 
943     while (w)
944     {
945 	s = combine1 (ps, pm);
946 	d = *pd;
947 
948 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
949 	w--;
950 	ps++;
951 	if (pm)
952 	    pm++;
953     }
954 }
955 
956 static void
957 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
958                             pixman_op_t              op,
959                             uint32_t *               pd,
960                             const uint32_t *         ps,
961                             const uint32_t *         pm,
962                             int                      w)
963 {
964     while (w && ((uintptr_t)pd & 15))
965     {
966 	uint32_t s = combine1 (ps, pm);
967 	uint32_t d = *pd;
968 
969 	*pd++ = pack_1x128_32 (
970 	    pix_multiply_1x128 (
971 		unpack_32_1x128 (d), negate_1x128 (
972 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
973 
974 	if (pm)
975 	    pm++;
976 	ps++;
977 	w--;
978     }
979 
980     while (w >= 4)
981     {
982 	__m128i xmm_src_lo, xmm_src_hi;
983 	__m128i xmm_dst_lo, xmm_dst_hi;
984 
985 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
986 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
987 
988 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
989 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
990 
991 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
992 	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
993 
994 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
995 			    &xmm_src_lo, &xmm_src_hi,
996 			    &xmm_dst_lo, &xmm_dst_hi);
997 
998 	save_128_aligned (
999 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1000 
1001 	ps += 4;
1002 	pd += 4;
1003 	if (pm)
1004 	    pm += 4;
1005 
1006 	w -= 4;
1007     }
1008 
1009     while (w)
1010     {
1011 	uint32_t s = combine1 (ps, pm);
1012 	uint32_t d = *pd;
1013 
1014 	*pd++ = pack_1x128_32 (
1015 	    pix_multiply_1x128 (
1016 		unpack_32_1x128 (d), negate_1x128 (
1017 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1018 	ps++;
1019 	if (pm)
1020 	    pm++;
1021 	w--;
1022     }
1023 }
1024 
1025 static void
1026 sse2_combine_out_u (pixman_implementation_t *imp,
1027                     pixman_op_t              op,
1028                     uint32_t *               pd,
1029                     const uint32_t *         ps,
1030                     const uint32_t *         pm,
1031                     int                      w)
1032 {
1033     while (w && ((uintptr_t)pd & 15))
1034     {
1035 	uint32_t s = combine1 (ps, pm);
1036 	uint32_t d = *pd;
1037 
1038 	*pd++ = pack_1x128_32 (
1039 	    pix_multiply_1x128 (
1040 		unpack_32_1x128 (s), negate_1x128 (
1041 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1042 	w--;
1043 	ps++;
1044 	if (pm)
1045 	    pm++;
1046     }
1047 
1048     while (w >= 4)
1049     {
1050 	__m128i xmm_src_lo, xmm_src_hi;
1051 	__m128i xmm_dst_lo, xmm_dst_hi;
1052 
1053 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1054 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1055 
1056 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1057 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1058 
1059 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1060 	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1061 
1062 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1063 			    &xmm_dst_lo, &xmm_dst_hi,
1064 			    &xmm_dst_lo, &xmm_dst_hi);
1065 
1066 	save_128_aligned (
1067 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1068 
1069 	ps += 4;
1070 	pd += 4;
1071 	w -= 4;
1072 	if (pm)
1073 	    pm += 4;
1074     }
1075 
1076     while (w)
1077     {
1078 	uint32_t s = combine1 (ps, pm);
1079 	uint32_t d = *pd;
1080 
1081 	*pd++ = pack_1x128_32 (
1082 	    pix_multiply_1x128 (
1083 		unpack_32_1x128 (s), negate_1x128 (
1084 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1085 	w--;
1086 	ps++;
1087 	if (pm)
1088 	    pm++;
1089     }
1090 }
1091 
1092 static force_inline uint32_t
1093 core_combine_atop_u_pixel_sse2 (uint32_t src,
1094                                 uint32_t dst)
1095 {
1096     __m128i s = unpack_32_1x128 (src);
1097     __m128i d = unpack_32_1x128 (dst);
1098 
1099     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1100     __m128i da = expand_alpha_1x128 (d);
1101 
1102     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1103 }
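/* ATOP per channel: result = src * dst_alpha + dst * (1 - src_alpha), both
 * products computed with the rounded multiply above and the sum saturating.
 * The reverse variant below simply swaps which alpha is complemented.
 */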
1104 
1105 static void
1106 sse2_combine_atop_u (pixman_implementation_t *imp,
1107                      pixman_op_t              op,
1108                      uint32_t *               pd,
1109                      const uint32_t *         ps,
1110                      const uint32_t *         pm,
1111                      int                      w)
1112 {
1113     uint32_t s, d;
1114 
1115     __m128i xmm_src_lo, xmm_src_hi;
1116     __m128i xmm_dst_lo, xmm_dst_hi;
1117     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1118     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1119 
1120     while (w && ((uintptr_t)pd & 15))
1121     {
1122 	s = combine1 (ps, pm);
1123 	d = *pd;
1124 
1125 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1126 	w--;
1127 	ps++;
1128 	if (pm)
1129 	    pm++;
1130     }
1131 
1132     while (w >= 4)
1133     {
1134 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1135 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1136 
1137 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1138 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1139 
1140 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1141 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1142 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1143 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1144 
1145 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1146 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1147 
1148 	pix_add_multiply_2x128 (
1149 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1150 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1151 	    &xmm_dst_lo, &xmm_dst_hi);
1152 
1153 	save_128_aligned (
1154 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1155 
1156 	ps += 4;
1157 	pd += 4;
1158 	w -= 4;
1159 	if (pm)
1160 	    pm += 4;
1161     }
1162 
1163     while (w)
1164     {
1165 	s = combine1 (ps, pm);
1166 	d = *pd;
1167 
1168 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1169 	w--;
1170 	ps++;
1171 	if (pm)
1172 	    pm++;
1173     }
1174 }
1175 
1176 static force_inline uint32_t
1177 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1178                                         uint32_t dst)
1179 {
1180     __m128i s = unpack_32_1x128 (src);
1181     __m128i d = unpack_32_1x128 (dst);
1182 
1183     __m128i sa = expand_alpha_1x128 (s);
1184     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1185 
1186     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1187 }
1188 
1189 static void
1190 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1191                              pixman_op_t              op,
1192                              uint32_t *               pd,
1193                              const uint32_t *         ps,
1194                              const uint32_t *         pm,
1195                              int                      w)
1196 {
1197     uint32_t s, d;
1198 
1199     __m128i xmm_src_lo, xmm_src_hi;
1200     __m128i xmm_dst_lo, xmm_dst_hi;
1201     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1202     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1203 
1204     while (w && ((uintptr_t)pd & 15))
1205     {
1206 	s = combine1 (ps, pm);
1207 	d = *pd;
1208 
1209 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1210 	ps++;
1211 	w--;
1212 	if (pm)
1213 	    pm++;
1214     }
1215 
1216     while (w >= 4)
1217     {
1218 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1219 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1220 
1221 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1222 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1223 
1224 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1225 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1226 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1227 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1228 
1229 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1230 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1231 
1232 	pix_add_multiply_2x128 (
1233 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1234 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1235 	    &xmm_dst_lo, &xmm_dst_hi);
1236 
1237 	save_128_aligned (
1238 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1239 
1240 	ps += 4;
1241 	pd += 4;
1242 	w -= 4;
1243 	if (pm)
1244 	    pm += 4;
1245     }
1246 
1247     while (w)
1248     {
1249 	s = combine1 (ps, pm);
1250 	d = *pd;
1251 
1252 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1253 	ps++;
1254 	w--;
1255 	if (pm)
1256 	    pm++;
1257     }
1258 }
1259 
1260 static force_inline uint32_t
1261 core_combine_xor_u_pixel_sse2 (uint32_t src,
1262                                uint32_t dst)
1263 {
1264     __m128i s = unpack_32_1x128 (src);
1265     __m128i d = unpack_32_1x128 (dst);
1266 
1267     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1268     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1269 
1270     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1271 }
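/* XOR per channel: result = src * (1 - dst_alpha) + dst * (1 - src_alpha),
 * i.e. each operand only survives where the other is transparent.
 */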
1272 
1273 static void
1274 sse2_combine_xor_u (pixman_implementation_t *imp,
1275                     pixman_op_t              op,
1276                     uint32_t *               dst,
1277                     const uint32_t *         src,
1278                     const uint32_t *         mask,
1279                     int                      width)
1280 {
1281     int w = width;
1282     uint32_t s, d;
1283     uint32_t* pd = dst;
1284     const uint32_t* ps = src;
1285     const uint32_t* pm = mask;
1286 
1287     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1288     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1289     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1290     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1291 
1292     while (w && ((uintptr_t)pd & 15))
1293     {
1294 	s = combine1 (ps, pm);
1295 	d = *pd;
1296 
1297 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1298 	w--;
1299 	ps++;
1300 	if (pm)
1301 	    pm++;
1302     }
1303 
1304     while (w >= 4)
1305     {
1306 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1307 	xmm_dst = load_128_aligned ((__m128i*) pd);
1308 
1309 	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1310 	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1311 
1312 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1313 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1314 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1315 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1316 
1317 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1318 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1319 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1320 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1321 
1322 	pix_add_multiply_2x128 (
1323 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1324 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1325 	    &xmm_dst_lo, &xmm_dst_hi);
1326 
1327 	save_128_aligned (
1328 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1329 
1330 	ps += 4;
1331 	pd += 4;
1332 	w -= 4;
1333 	if (pm)
1334 	    pm += 4;
1335     }
1336 
1337     while (w)
1338     {
1339 	s = combine1 (ps, pm);
1340 	d = *pd;
1341 
1342 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1343 	w--;
1344 	ps++;
1345 	if (pm)
1346 	    pm++;
1347     }
1348 }
1349 
1350 static force_inline void
1351 sse2_combine_add_u (pixman_implementation_t *imp,
1352                     pixman_op_t              op,
1353                     uint32_t *               dst,
1354                     const uint32_t *         src,
1355                     const uint32_t *         mask,
1356                     int                      width)
1357 {
1358     int w = width;
1359     uint32_t s, d;
1360     uint32_t* pd = dst;
1361     const uint32_t* ps = src;
1362     const uint32_t* pm = mask;
1363 
1364     while (w && (uintptr_t)pd & 15)
1365     {
1366 	s = combine1 (ps, pm);
1367 	d = *pd;
1368 
1369 	ps++;
1370 	if (pm)
1371 	    pm++;
1372 	*pd++ = _mm_cvtsi128_si32 (
1373 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1374 	w--;
1375     }
1376 
1377     while (w >= 4)
1378     {
1379 	__m128i s;
1380 
1381 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
1382 
1383 	save_128_aligned (
1384 	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1385 
1386 	pd += 4;
1387 	ps += 4;
1388 	if (pm)
1389 	    pm += 4;
1390 	w -= 4;
1391     }
1392 
1393     while (w--)
1394     {
1395 	s = combine1 (ps, pm);
1396 	d = *pd;
1397 
1398 	ps++;
1399 	*pd++ = _mm_cvtsi128_si32 (
1400 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1401 	if (pm)
1402 	    pm++;
1403     }
1404 }
1405 
1406 static force_inline uint32_t
1407 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1408                                     uint32_t dst)
1409 {
1410     __m128i ms = unpack_32_1x128 (src);
1411     __m128i md = unpack_32_1x128 (dst);
1412     uint32_t sa = src >> 24;
1413     uint32_t da = ~dst >> 24;
1414 
1415     if (sa > da)
1416     {
1417 	ms = pix_multiply_1x128 (
1418 	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1419     }
1420 
1421     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1422 }
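/* SATURATE: when the incoming alpha exceeds the destination's remaining
 * coverage (~dst_alpha), the source is first scaled by DIV_UN8 (da, sa) so
 * that the following saturating add cannot overflow a channel; otherwise
 * the operation degenerates into a plain ADD.
 */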
1423 
1424 static void
1425 sse2_combine_saturate_u (pixman_implementation_t *imp,
1426                          pixman_op_t              op,
1427                          uint32_t *               pd,
1428                          const uint32_t *         ps,
1429                          const uint32_t *         pm,
1430                          int                      w)
1431 {
1432     uint32_t s, d;
1433 
1434     uint32_t pack_cmp;
1435     __m128i xmm_src, xmm_dst;
1436 
1437     while (w && (uintptr_t)pd & 15)
1438     {
1439 	s = combine1 (ps, pm);
1440 	d = *pd;
1441 
1442 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1443 	w--;
1444 	ps++;
1445 	if (pm)
1446 	    pm++;
1447     }
1448 
1449     while (w >= 4)
1450     {
1451 	xmm_dst = load_128_aligned  ((__m128i*)pd);
1452 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1453 
1454 	pack_cmp = _mm_movemask_epi8 (
1455 	    _mm_cmpgt_epi32 (
1456 		_mm_srli_epi32 (xmm_src, 24),
1457 		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1458 
1459 	/* if some source alpha is greater than the respective ~dst alpha */
1460 	if (pack_cmp)
1461 	{
1462 	    s = combine1 (ps++, pm);
1463 	    d = *pd;
1464 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1465 	    if (pm)
1466 		pm++;
1467 
1468 	    s = combine1 (ps++, pm);
1469 	    d = *pd;
1470 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1471 	    if (pm)
1472 		pm++;
1473 
1474 	    s = combine1 (ps++, pm);
1475 	    d = *pd;
1476 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1477 	    if (pm)
1478 		pm++;
1479 
1480 	    s = combine1 (ps++, pm);
1481 	    d = *pd;
1482 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1483 	    if (pm)
1484 		pm++;
1485 	}
1486 	else
1487 	{
1488 	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1489 
1490 	    pd += 4;
1491 	    ps += 4;
1492 	    if (pm)
1493 		pm += 4;
1494 	}
1495 
1496 	w -= 4;
1497     }
1498 
1499     while (w--)
1500     {
1501 	s = combine1 (ps, pm);
1502 	d = *pd;
1503 
1504 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1505 	ps++;
1506 	if (pm)
1507 	    pm++;
1508     }
1509 }
1510 
1511 static void
1512 sse2_combine_src_ca (pixman_implementation_t *imp,
1513                      pixman_op_t              op,
1514                      uint32_t *               pd,
1515                      const uint32_t *         ps,
1516                      const uint32_t *         pm,
1517                      int                      w)
1518 {
1519     uint32_t s, m;
1520 
1521     __m128i xmm_src_lo, xmm_src_hi;
1522     __m128i xmm_mask_lo, xmm_mask_hi;
1523     __m128i xmm_dst_lo, xmm_dst_hi;
1524 
1525     while (w && (uintptr_t)pd & 15)
1526     {
1527 	s = *ps++;
1528 	m = *pm++;
1529 	*pd++ = pack_1x128_32 (
1530 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1531 	w--;
1532     }
1533 
1534     while (w >= 4)
1535     {
1536 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1537 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1538 
1539 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1540 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1541 
1542 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1543 			    &xmm_mask_lo, &xmm_mask_hi,
1544 			    &xmm_dst_lo, &xmm_dst_hi);
1545 
1546 	save_128_aligned (
1547 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1548 
1549 	ps += 4;
1550 	pd += 4;
1551 	pm += 4;
1552 	w -= 4;
1553     }
1554 
1555     while (w)
1556     {
1557 	s = *ps++;
1558 	m = *pm++;
1559 	*pd++ = pack_1x128_32 (
1560 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1561 	w--;
1562     }
1563 }
1564 
1565 static force_inline uint32_t
1566 core_combine_over_ca_pixel_sse2 (uint32_t src,
1567                                  uint32_t mask,
1568                                  uint32_t dst)
1569 {
1570     __m128i s = unpack_32_1x128 (src);
1571     __m128i expAlpha = expand_alpha_1x128 (s);
1572     __m128i unpk_mask = unpack_32_1x128 (mask);
1573     __m128i unpk_dst  = unpack_32_1x128 (dst);
1574 
1575     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1576 }
1577 
1578 static void
1579 sse2_combine_over_ca (pixman_implementation_t *imp,
1580                       pixman_op_t              op,
1581                       uint32_t *               pd,
1582                       const uint32_t *         ps,
1583                       const uint32_t *         pm,
1584                       int                      w)
1585 {
1586     uint32_t s, m, d;
1587 
1588     __m128i xmm_alpha_lo, xmm_alpha_hi;
1589     __m128i xmm_src_lo, xmm_src_hi;
1590     __m128i xmm_dst_lo, xmm_dst_hi;
1591     __m128i xmm_mask_lo, xmm_mask_hi;
1592 
1593     while (w && (uintptr_t)pd & 15)
1594     {
1595 	s = *ps++;
1596 	m = *pm++;
1597 	d = *pd;
1598 
1599 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1600 	w--;
1601     }
1602 
1603     while (w >= 4)
1604     {
1605 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1606 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1607 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1608 
1609 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1610 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1611 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1612 
1613 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1614 			    &xmm_alpha_lo, &xmm_alpha_hi);
1615 
1616 	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1617 		       &xmm_alpha_lo, &xmm_alpha_hi,
1618 		       &xmm_mask_lo, &xmm_mask_hi,
1619 		       &xmm_dst_lo, &xmm_dst_hi);
1620 
1621 	save_128_aligned (
1622 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1623 
1624 	ps += 4;
1625 	pd += 4;
1626 	pm += 4;
1627 	w -= 4;
1628     }
1629 
1630     while (w)
1631     {
1632 	s = *ps++;
1633 	m = *pm++;
1634 	d = *pd;
1635 
1636 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1637 	w--;
1638     }
1639 }
1640 
1641 static force_inline uint32_t
1642 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1643                                          uint32_t mask,
1644                                          uint32_t dst)
1645 {
1646     __m128i d = unpack_32_1x128 (dst);
1647 
1648     return pack_1x128_32 (
1649 	over_1x128 (d, expand_alpha_1x128 (d),
1650 		    pix_multiply_1x128 (unpack_32_1x128 (src),
1651 					unpack_32_1x128 (mask))));
1652 }
1653 
1654 static void
1655 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1656                               pixman_op_t              op,
1657                               uint32_t *               pd,
1658                               const uint32_t *         ps,
1659                               const uint32_t *         pm,
1660                               int                      w)
1661 {
1662     uint32_t s, m, d;
1663 
1664     __m128i xmm_alpha_lo, xmm_alpha_hi;
1665     __m128i xmm_src_lo, xmm_src_hi;
1666     __m128i xmm_dst_lo, xmm_dst_hi;
1667     __m128i xmm_mask_lo, xmm_mask_hi;
1668 
1669     while (w && (uintptr_t)pd & 15)
1670     {
1671 	s = *ps++;
1672 	m = *pm++;
1673 	d = *pd;
1674 
1675 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1676 	w--;
1677     }
1678 
1679     while (w >= 4)
1680     {
1681 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1682 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1683 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1684 
1685 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1686 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1687 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1688 
1689 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1690 			    &xmm_alpha_lo, &xmm_alpha_hi);
1691 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1692 			    &xmm_mask_lo, &xmm_mask_hi,
1693 			    &xmm_mask_lo, &xmm_mask_hi);
1694 
1695 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1696 		    &xmm_alpha_lo, &xmm_alpha_hi,
1697 		    &xmm_mask_lo, &xmm_mask_hi);
1698 
1699 	save_128_aligned (
1700 	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1701 
1702 	ps += 4;
1703 	pd += 4;
1704 	pm += 4;
1705 	w -= 4;
1706     }
1707 
1708     while (w)
1709     {
1710 	s = *ps++;
1711 	m = *pm++;
1712 	d = *pd;
1713 
1714 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1715 	w--;
1716     }
1717 }
1718 
1719 static void
1720 sse2_combine_in_ca (pixman_implementation_t *imp,
1721                     pixman_op_t              op,
1722                     uint32_t *               pd,
1723                     const uint32_t *         ps,
1724                     const uint32_t *         pm,
1725                     int                      w)
1726 {
1727     uint32_t s, m, d;
1728 
1729     __m128i xmm_alpha_lo, xmm_alpha_hi;
1730     __m128i xmm_src_lo, xmm_src_hi;
1731     __m128i xmm_dst_lo, xmm_dst_hi;
1732     __m128i xmm_mask_lo, xmm_mask_hi;
1733 
1734     while (w && (uintptr_t)pd & 15)
1735     {
1736 	s = *ps++;
1737 	m = *pm++;
1738 	d = *pd;
1739 
1740 	*pd++ = pack_1x128_32 (
1741 	    pix_multiply_1x128 (
1742 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1743 		expand_alpha_1x128 (unpack_32_1x128 (d))));
1744 
1745 	w--;
1746     }
1747 
1748     while (w >= 4)
1749     {
1750 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1751 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1752 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1753 
1754 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1755 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1756 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1757 
1758 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1759 			    &xmm_alpha_lo, &xmm_alpha_hi);
1760 
1761 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1762 			    &xmm_mask_lo, &xmm_mask_hi,
1763 			    &xmm_dst_lo, &xmm_dst_hi);
1764 
1765 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1766 			    &xmm_alpha_lo, &xmm_alpha_hi,
1767 			    &xmm_dst_lo, &xmm_dst_hi);
1768 
1769 	save_128_aligned (
1770 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1771 
1772 	ps += 4;
1773 	pd += 4;
1774 	pm += 4;
1775 	w -= 4;
1776     }
1777 
1778     while (w)
1779     {
1780 	s = *ps++;
1781 	m = *pm++;
1782 	d = *pd;
1783 
1784 	*pd++ = pack_1x128_32 (
1785 	    pix_multiply_1x128 (
1786 		pix_multiply_1x128 (
1787 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1788 		expand_alpha_1x128 (unpack_32_1x128 (d))));
1789 
1790 	w--;
1791     }
1792 }
1793 
1794 static void
1795 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1796                             pixman_op_t              op,
1797                             uint32_t *               pd,
1798                             const uint32_t *         ps,
1799                             const uint32_t *         pm,
1800                             int                      w)
1801 {
1802     uint32_t s, m, d;
1803 
1804     __m128i xmm_alpha_lo, xmm_alpha_hi;
1805     __m128i xmm_src_lo, xmm_src_hi;
1806     __m128i xmm_dst_lo, xmm_dst_hi;
1807     __m128i xmm_mask_lo, xmm_mask_hi;
1808 
1809     while (w && (uintptr_t)pd & 15)
1810     {
1811 	s = *ps++;
1812 	m = *pm++;
1813 	d = *pd;
1814 
1815 	*pd++ = pack_1x128_32 (
1816 	    pix_multiply_1x128 (
1817 		unpack_32_1x128 (d),
1818 		pix_multiply_1x128 (unpack_32_1x128 (m),
1819 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1820 	w--;
1821     }
1822 
1823     while (w >= 4)
1824     {
1825 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1826 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1827 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1828 
1829 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1830 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1831 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1832 
1833 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1834 			    &xmm_alpha_lo, &xmm_alpha_hi);
1835 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1836 			    &xmm_alpha_lo, &xmm_alpha_hi,
1837 			    &xmm_alpha_lo, &xmm_alpha_hi);
1838 
1839 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1840 			    &xmm_alpha_lo, &xmm_alpha_hi,
1841 			    &xmm_dst_lo, &xmm_dst_hi);
1842 
1843 	save_128_aligned (
1844 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1845 
1846 	ps += 4;
1847 	pd += 4;
1848 	pm += 4;
1849 	w -= 4;
1850     }
1851 
1852     while (w)
1853     {
1854 	s = *ps++;
1855 	m = *pm++;
1856 	d = *pd;
1857 
1858 	*pd++ = pack_1x128_32 (
1859 	    pix_multiply_1x128 (
1860 		unpack_32_1x128 (d),
1861 		pix_multiply_1x128 (unpack_32_1x128 (m),
1862 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1863 	w--;
1864     }
1865 }
1866 
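/* Component-alpha OUT: dest = (src * mask) * (1 - dest_alpha). */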
1867 static void
1868 sse2_combine_out_ca (pixman_implementation_t *imp,
1869                      pixman_op_t              op,
1870                      uint32_t *               pd,
1871                      const uint32_t *         ps,
1872                      const uint32_t *         pm,
1873                      int                      w)
1874 {
1875     uint32_t s, m, d;
1876 
1877     __m128i xmm_alpha_lo, xmm_alpha_hi;
1878     __m128i xmm_src_lo, xmm_src_hi;
1879     __m128i xmm_dst_lo, xmm_dst_hi;
1880     __m128i xmm_mask_lo, xmm_mask_hi;
1881 
1882     while (w && (uintptr_t)pd & 15)
1883     {
1884 	s = *ps++;
1885 	m = *pm++;
1886 	d = *pd;
1887 
1888 	*pd++ = pack_1x128_32 (
1889 	    pix_multiply_1x128 (
1890 		pix_multiply_1x128 (
1891 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1892 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1893 	w--;
1894     }
1895 
1896     while (w >= 4)
1897     {
1898 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1899 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1900 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1901 
1902 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1903 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1904 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1905 
1906 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1907 			    &xmm_alpha_lo, &xmm_alpha_hi);
1908 	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1909 		      &xmm_alpha_lo, &xmm_alpha_hi);
1910 
1911 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1912 			    &xmm_mask_lo, &xmm_mask_hi,
1913 			    &xmm_dst_lo, &xmm_dst_hi);
1914 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1915 			    &xmm_alpha_lo, &xmm_alpha_hi,
1916 			    &xmm_dst_lo, &xmm_dst_hi);
1917 
1918 	save_128_aligned (
1919 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1920 
1921 	ps += 4;
1922 	pd += 4;
1923 	pm += 4;
1924 	w -= 4;
1925     }
1926 
1927     while (w)
1928     {
1929 	s = *ps++;
1930 	m = *pm++;
1931 	d = *pd;
1932 
1933 	*pd++ = pack_1x128_32 (
1934 	    pix_multiply_1x128 (
1935 		pix_multiply_1x128 (
1936 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1937 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1938 
1939 	w--;
1940     }
1941 }
1942 
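/* Component-alpha OUT_REVERSE: dest = dest * (1 - mask * src_alpha). */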
1943 static void
1944 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1945                              pixman_op_t              op,
1946                              uint32_t *               pd,
1947                              const uint32_t *         ps,
1948                              const uint32_t *         pm,
1949                              int                      w)
1950 {
1951     uint32_t s, m, d;
1952 
1953     __m128i xmm_alpha_lo, xmm_alpha_hi;
1954     __m128i xmm_src_lo, xmm_src_hi;
1955     __m128i xmm_dst_lo, xmm_dst_hi;
1956     __m128i xmm_mask_lo, xmm_mask_hi;
1957 
1958     while (w && (uintptr_t)pd & 15)
1959     {
1960 	s = *ps++;
1961 	m = *pm++;
1962 	d = *pd;
1963 
1964 	*pd++ = pack_1x128_32 (
1965 	    pix_multiply_1x128 (
1966 		unpack_32_1x128 (d),
1967 		negate_1x128 (pix_multiply_1x128 (
1968 				 unpack_32_1x128 (m),
1969 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1970 	w--;
1971     }
1972 
1973     while (w >= 4)
1974     {
1975 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1976 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1977 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978 
1979 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1980 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1981 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982 
1983 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1984 			    &xmm_alpha_lo, &xmm_alpha_hi);
1985 
1986 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1987 			    &xmm_alpha_lo, &xmm_alpha_hi,
1988 			    &xmm_mask_lo, &xmm_mask_hi);
1989 
1990 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1991 		      &xmm_mask_lo, &xmm_mask_hi);
1992 
1993 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1994 			    &xmm_mask_lo, &xmm_mask_hi,
1995 			    &xmm_dst_lo, &xmm_dst_hi);
1996 
1997 	save_128_aligned (
1998 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1999 
2000 	ps += 4;
2001 	pd += 4;
2002 	pm += 4;
2003 	w -= 4;
2004     }
2005 
2006     while (w)
2007     {
2008 	s = *ps++;
2009 	m = *pm++;
2010 	d = *pd;
2011 
2012 	*pd++ = pack_1x128_32 (
2013 	    pix_multiply_1x128 (
2014 		unpack_32_1x128 (d),
2015 		negate_1x128 (pix_multiply_1x128 (
2016 				 unpack_32_1x128 (m),
2017 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2018 	w--;
2019     }
2020 }
2021 
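/* Component-alpha ATOP for a single pixel:
 * dest = (src * mask) * dest_alpha + dest * (1 - mask * src_alpha).
 */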
2022 static force_inline uint32_t
2023 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2024                                  uint32_t mask,
2025                                  uint32_t dst)
2026 {
2027     __m128i m = unpack_32_1x128 (mask);
2028     __m128i s = unpack_32_1x128 (src);
2029     __m128i d = unpack_32_1x128 (dst);
2030     __m128i sa = expand_alpha_1x128 (s);
2031     __m128i da = expand_alpha_1x128 (d);
2032 
2033     s = pix_multiply_1x128 (s, m);
2034     m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035 
2036     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2037 }
2038 
2039 static void
2040 sse2_combine_atop_ca (pixman_implementation_t *imp,
2041                       pixman_op_t              op,
2042                       uint32_t *               pd,
2043                       const uint32_t *         ps,
2044                       const uint32_t *         pm,
2045                       int                      w)
2046 {
2047     uint32_t s, m, d;
2048 
2049     __m128i xmm_src_lo, xmm_src_hi;
2050     __m128i xmm_dst_lo, xmm_dst_hi;
2051     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2052     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2053     __m128i xmm_mask_lo, xmm_mask_hi;
2054 
2055     while (w && (uintptr_t)pd & 15)
2056     {
2057 	s = *ps++;
2058 	m = *pm++;
2059 	d = *pd;
2060 
2061 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2062 	w--;
2063     }
2064 
2065     while (w >= 4)
2066     {
2067 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2068 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2069 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 
2071 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2072 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2073 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 
2075 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2076 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2077 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2078 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079 
2080 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2081 			    &xmm_mask_lo, &xmm_mask_hi,
2082 			    &xmm_src_lo, &xmm_src_hi);
2083 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2084 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2085 			    &xmm_mask_lo, &xmm_mask_hi);
2086 
2087 	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088 
2089 	pix_add_multiply_2x128 (
2090 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2091 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2092 	    &xmm_dst_lo, &xmm_dst_hi);
2093 
2094 	save_128_aligned (
2095 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096 
2097 	ps += 4;
2098 	pd += 4;
2099 	pm += 4;
2100 	w -= 4;
2101     }
2102 
2103     while (w)
2104     {
2105 	s = *ps++;
2106 	m = *pm++;
2107 	d = *pd;
2108 
2109 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2110 	w--;
2111     }
2112 }
2113 
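/* Component-alpha ATOP_REVERSE for a single pixel:
 * dest = dest * (mask * src_alpha) + (src * mask) * (1 - dest_alpha).
 */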
2114 static force_inline uint32_t
2115 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2116                                          uint32_t mask,
2117                                          uint32_t dst)
2118 {
2119     __m128i m = unpack_32_1x128 (mask);
2120     __m128i s = unpack_32_1x128 (src);
2121     __m128i d = unpack_32_1x128 (dst);
2122 
2123     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2124     __m128i sa = expand_alpha_1x128 (s);
2125 
2126     s = pix_multiply_1x128 (s, m);
2127     m = pix_multiply_1x128 (m, sa);
2128 
2129     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2130 }
2131 
2132 static void
2133 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2134                               pixman_op_t              op,
2135                               uint32_t *               pd,
2136                               const uint32_t *         ps,
2137                               const uint32_t *         pm,
2138                               int                      w)
2139 {
2140     uint32_t s, m, d;
2141 
2142     __m128i xmm_src_lo, xmm_src_hi;
2143     __m128i xmm_dst_lo, xmm_dst_hi;
2144     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2145     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2146     __m128i xmm_mask_lo, xmm_mask_hi;
2147 
2148     while (w && (uintptr_t)pd & 15)
2149     {
2150 	s = *ps++;
2151 	m = *pm++;
2152 	d = *pd;
2153 
2154 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2155 	w--;
2156     }
2157 
2158     while (w >= 4)
2159     {
2160 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163 
2164 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167 
2168 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2170 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2171 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172 
2173 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2174 			    &xmm_mask_lo, &xmm_mask_hi,
2175 			    &xmm_src_lo, &xmm_src_hi);
2176 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2177 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2178 			    &xmm_mask_lo, &xmm_mask_hi);
2179 
2180 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2181 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 
2183 	pix_add_multiply_2x128 (
2184 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2185 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2186 	    &xmm_dst_lo, &xmm_dst_hi);
2187 
2188 	save_128_aligned (
2189 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190 
2191 	ps += 4;
2192 	pd += 4;
2193 	pm += 4;
2194 	w -= 4;
2195     }
2196 
2197     while (w)
2198     {
2199 	s = *ps++;
2200 	m = *pm++;
2201 	d = *pd;
2202 
2203 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2204 	w--;
2205     }
2206 }
2207 
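/* Component-alpha XOR for a single pixel:
 * dest = (src * mask) * (1 - dest_alpha) + dest * (1 - mask * src_alpha).
 */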
2208 static force_inline uint32_t
2209 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2210                                 uint32_t mask,
2211                                 uint32_t dst)
2212 {
2213     __m128i a = unpack_32_1x128 (mask);
2214     __m128i s = unpack_32_1x128 (src);
2215     __m128i d = unpack_32_1x128 (dst);
2216 
2217     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2218 				       a, expand_alpha_1x128 (s)));
2219     __m128i dest      = pix_multiply_1x128 (s, a);
2220     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221 
2222     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2223                                                 &alpha_dst,
2224                                                 &dest,
2225                                                 &alpha_src));
2226 }
2227 
2228 static void
2229 sse2_combine_xor_ca (pixman_implementation_t *imp,
2230                      pixman_op_t              op,
2231                      uint32_t *               pd,
2232                      const uint32_t *         ps,
2233                      const uint32_t *         pm,
2234                      int                      w)
2235 {
2236     uint32_t s, m, d;
2237 
2238     __m128i xmm_src_lo, xmm_src_hi;
2239     __m128i xmm_dst_lo, xmm_dst_hi;
2240     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2241     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2242     __m128i xmm_mask_lo, xmm_mask_hi;
2243 
2244     while (w && (uintptr_t)pd & 15)
2245     {
2246 	s = *ps++;
2247 	m = *pm++;
2248 	d = *pd;
2249 
2250 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2251 	w--;
2252     }
2253 
2254     while (w >= 4)
2255     {
2256 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2257 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2258 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259 
2260 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2261 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2262 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263 
2264 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2265 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2266 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2267 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268 
2269 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2270 			    &xmm_mask_lo, &xmm_mask_hi,
2271 			    &xmm_src_lo, &xmm_src_hi);
2272 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2273 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2274 			    &xmm_mask_lo, &xmm_mask_hi);
2275 
2276 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2277 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2278 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2279 		      &xmm_mask_lo, &xmm_mask_hi);
2280 
2281 	pix_add_multiply_2x128 (
2282 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2283 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2284 	    &xmm_dst_lo, &xmm_dst_hi);
2285 
2286 	save_128_aligned (
2287 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2288 
2289 	ps += 4;
2290 	pd += 4;
2291 	pm += 4;
2292 	w -= 4;
2293     }
2294 
2295     while (w)
2296     {
2297 	s = *ps++;
2298 	m = *pm++;
2299 	d = *pd;
2300 
2301 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2302 	w--;
2303     }
2304 }
2305 
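/* Component-alpha ADD: dest = clamp (src * mask + dest), using
 * saturating byte additions.
 */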
2306 static void
2307 sse2_combine_add_ca (pixman_implementation_t *imp,
2308                      pixman_op_t              op,
2309                      uint32_t *               pd,
2310                      const uint32_t *         ps,
2311                      const uint32_t *         pm,
2312                      int                      w)
2313 {
2314     uint32_t s, m, d;
2315 
2316     __m128i xmm_src_lo, xmm_src_hi;
2317     __m128i xmm_dst_lo, xmm_dst_hi;
2318     __m128i xmm_mask_lo, xmm_mask_hi;
2319 
2320     while (w && (uintptr_t)pd & 15)
2321     {
2322 	s = *ps++;
2323 	m = *pm++;
2324 	d = *pd;
2325 
2326 	*pd++ = pack_1x128_32 (
2327 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2328 					       unpack_32_1x128 (m)),
2329 			   unpack_32_1x128 (d)));
2330 	w--;
2331     }
2332 
2333     while (w >= 4)
2334     {
2335 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2336 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2337 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338 
2339 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2340 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2341 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342 
2343 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2344 			    &xmm_mask_lo, &xmm_mask_hi,
2345 			    &xmm_src_lo, &xmm_src_hi);
2346 
2347 	save_128_aligned (
2348 	    (__m128i*)pd, pack_2x128_128 (
2349 		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2350 		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2351 
2352 	ps += 4;
2353 	pd += 4;
2354 	pm += 4;
2355 	w -= 4;
2356     }
2357 
2358     while (w)
2359     {
2360 	s = *ps++;
2361 	m = *pm++;
2362 	d = *pd;
2363 
2364 	*pd++ = pack_1x128_32 (
2365 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2366 					       unpack_32_1x128 (m)),
2367 			   unpack_32_1x128 (d)));
2368 	w--;
2369     }
2370 }
2371 
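/* Broadcast a 16-bit value to all eight lanes of an __m128i. */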
2372 static force_inline __m128i
2373 create_mask_16_128 (uint16_t mask)
2374 {
2375     return _mm_set1_epi16 (mask);
2376 }
2377 
2378 /* Work around a code generation bug in Sun Studio 12. */
2379 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2380 # define create_mask_2x32_128(mask0, mask1)				\
2381     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2382 #else
2383 static force_inline __m128i
2384 create_mask_2x32_128 (uint32_t mask0,
2385                       uint32_t mask1)
2386 {
2387     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2388 }
2389 #endif
2390 
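/* Fast path: solid source OVER a 32-bit (8888) destination.  The
 * expanded source and its alpha are computed once, outside the loops.
 */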
2391 static void
2392 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2393                             pixman_composite_info_t *info)
2394 {
2395     PIXMAN_COMPOSITE_ARGS (info);
2396     uint32_t src;
2397     uint32_t    *dst_line, *dst, d;
2398     int32_t w;
2399     int dst_stride;
2400     __m128i xmm_src, xmm_alpha;
2401     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402 
2403     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2404 
2405     if (src == 0)
2406 	return;
2407 
2408     PIXMAN_IMAGE_GET_LINE (
2409 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2410 
2411     xmm_src = expand_pixel_32_1x128 (src);
2412     xmm_alpha = expand_alpha_1x128 (xmm_src);
2413 
2414     while (height--)
2415     {
2416 	dst = dst_line;
2417 
2418 	dst_line += dst_stride;
2419 	w = width;
2420 
2421 	while (w && (uintptr_t)dst & 15)
2422 	{
2423 	    d = *dst;
2424 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2425 						xmm_alpha,
2426 						unpack_32_1x128 (d)));
2427 	    w--;
2428 	}
2429 
2430 	while (w >= 4)
2431 	{
2432 	    xmm_dst = load_128_aligned ((__m128i*)dst);
2433 
2434 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435 
2436 	    over_2x128 (&xmm_src, &xmm_src,
2437 			&xmm_alpha, &xmm_alpha,
2438 			&xmm_dst_lo, &xmm_dst_hi);
2439 
2440 	    /* rebuild the 4 pixel data and save */
2441 	    save_128_aligned (
2442 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2443 
2444 	    w -= 4;
2445 	    dst += 4;
2446 	}
2447 
2448 	while (w)
2449 	{
2450 	    d = *dst;
2451 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2452 						xmm_alpha,
2453 						unpack_32_1x128 (d)));
2454 	    w--;
2455 	}
2456 
2457     }
2458 }
2459 
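/* Fast path: solid source OVER a 16-bit (565) destination.  Destination
 * pixels are expanded to 8888, blended, and packed back, eight at a time.
 */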
2460 static void
2461 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2462                             pixman_composite_info_t *info)
2463 {
2464     PIXMAN_COMPOSITE_ARGS (info);
2465     uint32_t src;
2466     uint16_t    *dst_line, *dst, d;
2467     int32_t w;
2468     int dst_stride;
2469     __m128i xmm_src, xmm_alpha;
2470     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471 
2472     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2473 
2474     if (src == 0)
2475 	return;
2476 
2477     PIXMAN_IMAGE_GET_LINE (
2478 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479 
2480     xmm_src = expand_pixel_32_1x128 (src);
2481     xmm_alpha = expand_alpha_1x128 (xmm_src);
2482 
2483     while (height--)
2484     {
2485 	dst = dst_line;
2486 
2487 	dst_line += dst_stride;
2488 	w = width;
2489 
2490 	while (w && (uintptr_t)dst & 15)
2491 	{
2492 	    d = *dst;
2493 
2494 	    *dst++ = pack_565_32_16 (
2495 		pack_1x128_32 (over_1x128 (xmm_src,
2496 					   xmm_alpha,
2497 					   expand565_16_1x128 (d))));
2498 	    w--;
2499 	}
2500 
2501 	while (w >= 8)
2502 	{
2503 	    xmm_dst = load_128_aligned ((__m128i*)dst);
2504 
2505 	    unpack_565_128_4x128 (xmm_dst,
2506 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507 
2508 	    over_2x128 (&xmm_src, &xmm_src,
2509 			&xmm_alpha, &xmm_alpha,
2510 			&xmm_dst0, &xmm_dst1);
2511 	    over_2x128 (&xmm_src, &xmm_src,
2512 			&xmm_alpha, &xmm_alpha,
2513 			&xmm_dst2, &xmm_dst3);
2514 
2515 	    xmm_dst = pack_565_4x128_128 (
2516 		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517 
2518 	    save_128_aligned ((__m128i*)dst, xmm_dst);
2519 
2520 	    dst += 8;
2521 	    w -= 8;
2522 	}
2523 
2524 	while (w--)
2525 	{
2526 	    d = *dst;
2527 	    *dst++ = pack_565_32_16 (
2528 		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2529 					   expand565_16_1x128 (d))));
2530 	}
2531     }
2532 
2533 }
2534 
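/* Fast path: solid source ADD with a per-channel (component-alpha) 8888
 * mask.  Four-pixel groups whose mask is entirely zero are skipped.
 */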
2535 static void
2536 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2537 				   pixman_composite_info_t *info)
2538 {
2539     PIXMAN_COMPOSITE_ARGS (info);
2540     uint32_t src;
2541     uint32_t    *dst_line, d;
2542     uint32_t    *mask_line, m;
2543     uint32_t pack_cmp;
2544     int dst_stride, mask_stride;
2545 
2546     __m128i xmm_src;
2547     __m128i xmm_dst;
2548     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549 
2550     __m128i mmx_src, mmx_mask, mmx_dest;
2551 
2552     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553 
2554     if (src == 0)
2555 	return;
2556 
2557     PIXMAN_IMAGE_GET_LINE (
2558 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2559     PIXMAN_IMAGE_GET_LINE (
2560 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561 
2562     xmm_src = _mm_unpacklo_epi8 (
2563 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2564     mmx_src   = xmm_src;
2565 
2566     while (height--)
2567     {
2568 	int w = width;
2569 	const uint32_t *pm = (uint32_t *)mask_line;
2570 	uint32_t *pd = (uint32_t *)dst_line;
2571 
2572 	dst_line += dst_stride;
2573 	mask_line += mask_stride;
2574 
2575 	while (w && (uintptr_t)pd & 15)
2576 	{
2577 	    m = *pm++;
2578 
2579 	    if (m)
2580 	    {
2581 		d = *pd;
2582 
2583 		mmx_mask = unpack_32_1x128 (m);
2584 		mmx_dest = unpack_32_1x128 (d);
2585 
2586 		*pd = pack_1x128_32 (
2587 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2588 				   mmx_dest));
2589 	    }
2590 
2591 	    pd++;
2592 	    w--;
2593 	}
2594 
2595 	while (w >= 4)
2596 	{
2597 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2598 
2599 	    pack_cmp =
2600 		_mm_movemask_epi8 (
2601 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602 
2603 	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2604 	    if (pack_cmp != 0xffff)
2605 	    {
2606 		xmm_dst = load_128_aligned ((__m128i*)pd);
2607 
2608 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609 
2610 		pix_multiply_2x128 (&xmm_src, &xmm_src,
2611 				    &xmm_mask_lo, &xmm_mask_hi,
2612 				    &xmm_mask_lo, &xmm_mask_hi);
2613 		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614 
2615 		save_128_aligned (
2616 		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2617 	    }
2618 
2619 	    pd += 4;
2620 	    pm += 4;
2621 	    w -= 4;
2622 	}
2623 
2624 	while (w)
2625 	{
2626 	    m = *pm++;
2627 
2628 	    if (m)
2629 	    {
2630 		d = *pd;
2631 
2632 		mmx_mask = unpack_32_1x128 (m);
2633 		mmx_dest = unpack_32_1x128 (d);
2634 
2635 		*pd = pack_1x128_32 (
2636 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2637 				   mmx_dest));
2638 	    }
2639 
2640 	    pd++;
2641 	    w--;
2642 	}
2643     }
2644 
2645 }
2646 
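/* Fast path: solid source OVER with a component-alpha 8888 mask.
 * Four-pixel groups whose mask is entirely zero leave the destination
 * untouched.
 */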
2647 static void
2648 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2649                                     pixman_composite_info_t *info)
2650 {
2651     PIXMAN_COMPOSITE_ARGS (info);
2652     uint32_t src;
2653     uint32_t    *dst_line, d;
2654     uint32_t    *mask_line, m;
2655     uint32_t pack_cmp;
2656     int dst_stride, mask_stride;
2657 
2658     __m128i xmm_src, xmm_alpha;
2659     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2660     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661 
2662     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663 
2664     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2665 
2666     if (src == 0)
2667 	return;
2668 
2669     PIXMAN_IMAGE_GET_LINE (
2670 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2671     PIXMAN_IMAGE_GET_LINE (
2672 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2673 
2674     xmm_src = _mm_unpacklo_epi8 (
2675 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2676     xmm_alpha = expand_alpha_1x128 (xmm_src);
2677     mmx_src   = xmm_src;
2678     mmx_alpha = xmm_alpha;
2679 
2680     while (height--)
2681     {
2682 	int w = width;
2683 	const uint32_t *pm = (uint32_t *)mask_line;
2684 	uint32_t *pd = (uint32_t *)dst_line;
2685 
2686 	dst_line += dst_stride;
2687 	mask_line += mask_stride;
2688 
2689 	while (w && (uintptr_t)pd & 15)
2690 	{
2691 	    m = *pm++;
2692 
2693 	    if (m)
2694 	    {
2695 		d = *pd;
2696 		mmx_mask = unpack_32_1x128 (m);
2697 		mmx_dest = unpack_32_1x128 (d);
2698 
2699 		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2700 		                                  &mmx_alpha,
2701 		                                  &mmx_mask,
2702 		                                  &mmx_dest));
2703 	    }
2704 
2705 	    pd++;
2706 	    w--;
2707 	}
2708 
2709 	while (w >= 4)
2710 	{
2711 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2712 
2713 	    pack_cmp =
2714 		_mm_movemask_epi8 (
2715 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716 
2717 	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2718 	    if (pack_cmp != 0xffff)
2719 	    {
2720 		xmm_dst = load_128_aligned ((__m128i*)pd);
2721 
2722 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2723 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724 
2725 		in_over_2x128 (&xmm_src, &xmm_src,
2726 			       &xmm_alpha, &xmm_alpha,
2727 			       &xmm_mask_lo, &xmm_mask_hi,
2728 			       &xmm_dst_lo, &xmm_dst_hi);
2729 
2730 		save_128_aligned (
2731 		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2732 	    }
2733 
2734 	    pd += 4;
2735 	    pm += 4;
2736 	    w -= 4;
2737 	}
2738 
2739 	while (w)
2740 	{
2741 	    m = *pm++;
2742 
2743 	    if (m)
2744 	    {
2745 		d = *pd;
2746 		mmx_mask = unpack_32_1x128 (m);
2747 		mmx_dest = unpack_32_1x128 (d);
2748 
2749 		*pd = pack_1x128_32 (
2750 		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2751 	    }
2752 
2753 	    pd++;
2754 	    w--;
2755 	}
2756     }
2757 
2758 }
2759 
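/* Fast path: 8888 source OVER an 8888 destination, modulated by the
 * alpha of a solid mask.  Four-pixel groups of all-zero source are skipped.
 */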
2760 static void
2761 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2762                                  pixman_composite_info_t *info)
2763 {
2764     PIXMAN_COMPOSITE_ARGS (info);
2765     uint32_t    *dst_line, *dst;
2766     uint32_t    *src_line, *src;
2767     uint32_t mask;
2768     int32_t w;
2769     int dst_stride, src_stride;
2770 
2771     __m128i xmm_mask;
2772     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2773     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2774     __m128i xmm_alpha_lo, xmm_alpha_hi;
2775 
2776     PIXMAN_IMAGE_GET_LINE (
2777 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2778     PIXMAN_IMAGE_GET_LINE (
2779 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780 
2781     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2782 
2783     xmm_mask = create_mask_16_128 (mask >> 24);
2784 
2785     while (height--)
2786     {
2787 	dst = dst_line;
2788 	dst_line += dst_stride;
2789 	src = src_line;
2790 	src_line += src_stride;
2791 	w = width;
2792 
2793 	while (w && (uintptr_t)dst & 15)
2794 	{
2795 	    uint32_t s = *src++;
2796 
2797 	    if (s)
2798 	    {
2799 		uint32_t d = *dst;
2800 
2801 		__m128i ms = unpack_32_1x128 (s);
2802 		__m128i alpha    = expand_alpha_1x128 (ms);
2803 		__m128i dest     = xmm_mask;
2804 		__m128i alpha_dst = unpack_32_1x128 (d);
2805 
2806 		*dst = pack_1x128_32 (
2807 		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2808 	    }
2809 	    dst++;
2810 	    w--;
2811 	}
2812 
2813 	while (w >= 4)
2814 	{
2815 	    xmm_src = load_128_unaligned ((__m128i*)src);
2816 
2817 	    if (!is_zero (xmm_src))
2818 	    {
2819 		xmm_dst = load_128_aligned ((__m128i*)dst);
2820 
2821 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2822 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2823 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2824 				    &xmm_alpha_lo, &xmm_alpha_hi);
2825 
2826 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2827 			       &xmm_alpha_lo, &xmm_alpha_hi,
2828 			       &xmm_mask, &xmm_mask,
2829 			       &xmm_dst_lo, &xmm_dst_hi);
2830 
2831 		save_128_aligned (
2832 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2833 	    }
2834 
2835 	    dst += 4;
2836 	    src += 4;
2837 	    w -= 4;
2838 	}
2839 
2840 	while (w)
2841 	{
2842 	    uint32_t s = *src++;
2843 
2844 	    if (s)
2845 	    {
2846 		uint32_t d = *dst;
2847 
2848 		__m128i ms = unpack_32_1x128 (s);
2849 		__m128i alpha = expand_alpha_1x128 (ms);
2850 		__m128i mask  = xmm_mask;
2851 		__m128i dest  = unpack_32_1x128 (d);
2852 
2853 		*dst = pack_1x128_32 (
2854 		    in_over_1x128 (&ms, &alpha, &mask, &dest));
2855 	    }
2856 
2857 	    dst++;
2858 	    w--;
2859 	}
2860     }
2861 
2862 }
2863 
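/* Fast path: SRC conversion from x888 to 565, eight pixels per iteration. */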
2864 static void
2865 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2866                               pixman_composite_info_t *info)
2867 {
2868     PIXMAN_COMPOSITE_ARGS (info);
2869     uint16_t    *dst_line, *dst;
2870     uint32_t    *src_line, *src, s;
2871     int dst_stride, src_stride;
2872     int32_t w;
2873 
2874     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2875     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2876 
2877     while (height--)
2878     {
2879 	dst = dst_line;
2880 	dst_line += dst_stride;
2881 	src = src_line;
2882 	src_line += src_stride;
2883 	w = width;
2884 
2885 	while (w && (uintptr_t)dst & 15)
2886 	{
2887 	    s = *src++;
2888 	    *dst = convert_8888_to_0565 (s);
2889 	    dst++;
2890 	    w--;
2891 	}
2892 
2893 	while (w >= 8)
2894 	{
2895 	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2896 	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897 
2898 	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2899 
2900 	    w -= 8;
2901 	    src += 8;
2902 	    dst += 8;
2903 	}
2904 
2905 	while (w)
2906 	{
2907 	    s = *src++;
2908 	    *dst = convert_8888_to_0565 (s);
2909 	    dst++;
2910 	    w--;
2911 	}
2912     }
2913 }
2914 
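/* Fast path: SRC copy from x888 to 8888; the alpha byte is forced to
 * 0xff, sixteen pixels per iteration.
 */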
2915 static void
2916 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2917 			      pixman_composite_info_t *info)
2918 {
2919     PIXMAN_COMPOSITE_ARGS (info);
2920     uint32_t    *dst_line, *dst;
2921     uint32_t    *src_line, *src;
2922     int32_t w;
2923     int dst_stride, src_stride;
2924 
2925 
2926     PIXMAN_IMAGE_GET_LINE (
2927 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2928     PIXMAN_IMAGE_GET_LINE (
2929 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2930 
2931     while (height--)
2932     {
2933 	dst = dst_line;
2934 	dst_line += dst_stride;
2935 	src = src_line;
2936 	src_line += src_stride;
2937 	w = width;
2938 
2939 	while (w && (uintptr_t)dst & 15)
2940 	{
2941 	    *dst++ = *src++ | 0xff000000;
2942 	    w--;
2943 	}
2944 
2945 	while (w >= 16)
2946 	{
2947 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948 
2949 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2950 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2951 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2952 	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953 
2954 	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2955 	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2956 	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2957 	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2958 
2959 	    dst += 16;
2960 	    src += 16;
2961 	    w -= 16;
2962 	}
2963 
2964 	while (w)
2965 	{
2966 	    *dst++ = *src++ | 0xff000000;
2967 	    w--;
2968 	}
2969     }
2970 
2971 }
2972 
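/* Fast path: x888 source (treated as opaque) OVER an 8888 destination,
 * modulated by the alpha of a solid mask.
 */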
2973 static void
2974 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2975                                  pixman_composite_info_t *info)
2976 {
2977     PIXMAN_COMPOSITE_ARGS (info);
2978     uint32_t    *dst_line, *dst;
2979     uint32_t    *src_line, *src;
2980     uint32_t mask;
2981     int dst_stride, src_stride;
2982     int32_t w;
2983 
2984     __m128i xmm_mask, xmm_alpha;
2985     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2986     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987 
2988     PIXMAN_IMAGE_GET_LINE (
2989 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2990     PIXMAN_IMAGE_GET_LINE (
2991 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992 
2993     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994 
2995     xmm_mask = create_mask_16_128 (mask >> 24);
2996     xmm_alpha = mask_00ff;
2997 
2998     while (height--)
2999     {
3000 	dst = dst_line;
3001 	dst_line += dst_stride;
3002 	src = src_line;
3003 	src_line += src_stride;
3004 	w = width;
3005 
3006 	while (w && (uintptr_t)dst & 15)
3007 	{
3008 	    uint32_t s = (*src++) | 0xff000000;
3009 	    uint32_t d = *dst;
3010 
3011 	    __m128i src   = unpack_32_1x128 (s);
3012 	    __m128i alpha = xmm_alpha;
3013 	    __m128i mask  = xmm_mask;
3014 	    __m128i dest  = unpack_32_1x128 (d);
3015 
3016 	    *dst++ = pack_1x128_32 (
3017 		in_over_1x128 (&src, &alpha, &mask, &dest));
3018 
3019 	    w--;
3020 	}
3021 
3022 	while (w >= 4)
3023 	{
3024 	    xmm_src = _mm_or_si128 (
3025 		load_128_unaligned ((__m128i*)src), mask_ff000000);
3026 	    xmm_dst = load_128_aligned ((__m128i*)dst);
3027 
3028 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3029 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030 
3031 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3032 			   &xmm_alpha, &xmm_alpha,
3033 			   &xmm_mask, &xmm_mask,
3034 			   &xmm_dst_lo, &xmm_dst_hi);
3035 
3036 	    save_128_aligned (
3037 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038 
3039 	    dst += 4;
3040 	    src += 4;
3041 	    w -= 4;
3042 
3043 	}
3044 
3045 	while (w)
3046 	{
3047 	    uint32_t s = (*src++) | 0xff000000;
3048 	    uint32_t d = *dst;
3049 
3050 	    __m128i src  = unpack_32_1x128 (s);
3051 	    __m128i alpha = xmm_alpha;
3052 	    __m128i mask  = xmm_mask;
3053 	    __m128i dest  = unpack_32_1x128 (d);
3054 
3055 	    *dst++ = pack_1x128_32 (
3056 		in_over_1x128 (&src, &alpha, &mask, &dest));
3057 
3058 	    w--;
3059 	}
3060     }
3061 
3062 }
3063 
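/* Fast path: 8888 source OVER an 8888 destination, implemented by
 * running the OVER combiner on each scanline.
 */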
3064 static void
3065 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3066                                pixman_composite_info_t *info)
3067 {
3068     PIXMAN_COMPOSITE_ARGS (info);
3069     int dst_stride, src_stride;
3070     uint32_t    *dst_line, *dst;
3071     uint32_t    *src_line, *src;
3072 
3073     PIXMAN_IMAGE_GET_LINE (
3074 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3075     PIXMAN_IMAGE_GET_LINE (
3076 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077 
3078     dst = dst_line;
3079     src = src_line;
3080 
3081     while (height--)
3082     {
3083 	sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084 
3085 	dst += dst_stride;
3086 	src += src_stride;
3087     }
3088 }
3089 
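/* OVER a single 8888 source pixel onto a single 565 destination pixel. */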
3090 static force_inline uint16_t
3091 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 {
3093     __m128i ms;
3094 
3095     ms = unpack_32_1x128 (src);
3096     return pack_565_32_16 (
3097 	pack_1x128_32 (
3098 	    over_1x128 (
3099 		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3100 }
3101 
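/* Fast path: 8888 source OVER a 565 destination, eight pixels per
 * iteration.
 */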
3102 static void
3103 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3104                                pixman_composite_info_t *info)
3105 {
3106     PIXMAN_COMPOSITE_ARGS (info);
3107     uint16_t    *dst_line, *dst, d;
3108     uint32_t    *src_line, *src, s;
3109     int dst_stride, src_stride;
3110     int32_t w;
3111 
3112     __m128i xmm_alpha_lo, xmm_alpha_hi;
3113     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3114     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115 
3116     PIXMAN_IMAGE_GET_LINE (
3117 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3118     PIXMAN_IMAGE_GET_LINE (
3119 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3120 
3121     while (height--)
3122     {
3123 	dst = dst_line;
3124 	src = src_line;
3125 
3126 	dst_line += dst_stride;
3127 	src_line += src_stride;
3128 	w = width;
3129 
3130 	/* Align dst on a 16-byte boundary */
3131 	while (w &&
3132 	       ((uintptr_t)dst & 15))
3133 	{
3134 	    s = *src++;
3135 	    d = *dst;
3136 
3137 	    *dst++ = composite_over_8888_0565pixel (s, d);
3138 	    w--;
3139 	}
3140 
3141 	/* It's an 8-pixel loop */
3142 	while (w >= 8)
3143 	{
3144 	    /* I'm loading unaligned because I'm not sure
3145 	     * about the address alignment.
3146 	     */
3147 	    xmm_src = load_128_unaligned ((__m128i*) src);
3148 	    xmm_dst = load_128_aligned ((__m128i*) dst);
3149 
3150 	    /* Unpacking */
3151 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3152 	    unpack_565_128_4x128 (xmm_dst,
3153 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3154 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3155 				&xmm_alpha_lo, &xmm_alpha_hi);
3156 
3157 	    /* I'm loading the next 4 pixels from memory
3158 	     * ahead of time to optimize the memory read.
3159 	     */
3160 	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161 
3162 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3163 			&xmm_alpha_lo, &xmm_alpha_hi,
3164 			&xmm_dst0, &xmm_dst1);
3165 
3166 	    /* Unpacking */
3167 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3168 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3169 				&xmm_alpha_lo, &xmm_alpha_hi);
3170 
3171 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3172 			&xmm_alpha_lo, &xmm_alpha_hi,
3173 			&xmm_dst2, &xmm_dst3);
3174 
3175 	    save_128_aligned (
3176 		(__m128i*)dst, pack_565_4x128_128 (
3177 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3178 
3179 	    w -= 8;
3180 	    dst += 8;
3181 	    src += 8;
3182 	}
3183 
3184 	while (w--)
3185 	{
3186 	    s = *src++;
3187 	    d = *dst;
3188 
3189 	    *dst++ = composite_over_8888_0565pixel (s, d);
3190 	}
3191     }
3192 
3193 }
3194 
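/* Fast path: solid source OVER an 8888 destination with an a8 mask.
 * Four-pixel groups with a zero mask are skipped; an opaque source under
 * a fully set mask is stored directly.
 */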
3195 static void
3196 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3197                               pixman_composite_info_t *info)
3198 {
3199     PIXMAN_COMPOSITE_ARGS (info);
3200     uint32_t src, srca;
3201     uint32_t *dst_line, *dst;
3202     uint8_t *mask_line, *mask;
3203     int dst_stride, mask_stride;
3204     int32_t w;
3205     uint32_t m, d;
3206 
3207     __m128i xmm_src, xmm_alpha, xmm_def;
3208     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3209     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210 
3211     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212 
3213     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3214 
3215     srca = src >> 24;
3216     if (src == 0)
3217 	return;
3218 
3219     PIXMAN_IMAGE_GET_LINE (
3220 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3221     PIXMAN_IMAGE_GET_LINE (
3222 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3223 
3224     xmm_def = create_mask_2x32_128 (src, src);
3225     xmm_src = expand_pixel_32_1x128 (src);
3226     xmm_alpha = expand_alpha_1x128 (xmm_src);
3227     mmx_src   = xmm_src;
3228     mmx_alpha = xmm_alpha;
3229 
3230     while (height--)
3231     {
3232 	dst = dst_line;
3233 	dst_line += dst_stride;
3234 	mask = mask_line;
3235 	mask_line += mask_stride;
3236 	w = width;
3237 
3238 	while (w && (uintptr_t)dst & 15)
3239 	{
3240 	    uint8_t m = *mask++;
3241 
3242 	    if (m)
3243 	    {
3244 		d = *dst;
3245 		mmx_mask = expand_pixel_8_1x128 (m);
3246 		mmx_dest = unpack_32_1x128 (d);
3247 
3248 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3249 		                                   &mmx_alpha,
3250 		                                   &mmx_mask,
3251 		                                   &mmx_dest));
3252 	    }
3253 
3254 	    w--;
3255 	    dst++;
3256 	}
3257 
3258 	while (w >= 4)
3259 	{
3260             memcpy(&m, mask, sizeof(uint32_t));
3261 
3262 	    if (srca == 0xff && m == 0xffffffff)
3263 	    {
3264 		save_128_aligned ((__m128i*)dst, xmm_def);
3265 	    }
3266 	    else if (m)
3267 	    {
3268 		xmm_dst = load_128_aligned ((__m128i*) dst);
3269 		xmm_mask = unpack_32_1x128 (m);
3270 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3271 
3272 		/* Unpacking */
3273 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3274 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3275 
3276 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3277 					&xmm_mask_lo, &xmm_mask_hi);
3278 
3279 		in_over_2x128 (&xmm_src, &xmm_src,
3280 			       &xmm_alpha, &xmm_alpha,
3281 			       &xmm_mask_lo, &xmm_mask_hi,
3282 			       &xmm_dst_lo, &xmm_dst_hi);
3283 
3284 		save_128_aligned (
3285 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3286 	    }
3287 
3288 	    w -= 4;
3289 	    dst += 4;
3290 	    mask += 4;
3291 	}
3292 
3293 	while (w)
3294 	{
3295 	    uint8_t m = *mask++;
3296 
3297 	    if (m)
3298 	    {
3299 		d = *dst;
3300 		mmx_mask = expand_pixel_8_1x128 (m);
3301 		mmx_dest = unpack_32_1x128 (d);
3302 
3303 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3304 		                                   &mmx_alpha,
3305 		                                   &mmx_mask,
3306 		                                   &mmx_dest));
3307 	    }
3308 
3309 	    w--;
3310 	    dst++;
3311 	}
3312     }
3313 
3314 }
3315 
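/* Solid fill for 8, 16 and 32 bpp destinations.  The filler is
 * replicated to 32 bits for the narrow formats, the destination pointer
 * is brought to 16-byte alignment with small stores, and the bulk is
 * written with aligned 128-bit stores.
 */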
3316 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3317 __attribute__((__force_align_arg_pointer__))
3318 #endif
3319 static pixman_bool_t
3320 sse2_fill (pixman_implementation_t *imp,
3321            uint32_t *               bits,
3322            int                      stride,
3323            int                      bpp,
3324            int                      x,
3325            int                      y,
3326            int                      width,
3327            int                      height,
3328            uint32_t		    filler)
3329 {
3330     uint32_t byte_width;
3331     uint8_t *byte_line;
3332 
3333     __m128i xmm_def;
3334 
3335     if (bpp == 8)
3336     {
3337 	uint32_t b;
3338 	uint32_t w;
3339 
3340 	stride = stride * (int) sizeof (uint32_t) / 1;
3341 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3342 	byte_width = width;
3343 	stride *= 1;
3344 
3345 	b = filler & 0xff;
3346 	w = (b << 8) | b;
3347 	filler = (w << 16) | w;
3348     }
3349     else if (bpp == 16)
3350     {
3351 	stride = stride * (int) sizeof (uint32_t) / 2;
3352 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3353 	byte_width = 2 * width;
3354 	stride *= 2;
3355 
3356         filler = (filler & 0xffff) * 0x00010001;
3357     }
3358     else if (bpp == 32)
3359     {
3360 	stride = stride * (int) sizeof (uint32_t) / 4;
3361 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3362 	byte_width = 4 * width;
3363 	stride *= 4;
3364     }
3365     else
3366     {
3367 	return FALSE;
3368     }
3369 
3370     xmm_def = create_mask_2x32_128 (filler, filler);
3371 
3372     while (height--)
3373     {
3374 	int w;
3375 	uint8_t *d = byte_line;
3376 	byte_line += stride;
3377 	w = byte_width;
3378 
3379 	if (w >= 1 && ((uintptr_t)d & 1))
3380 	{
3381 	    *(uint8_t *)d = filler;
3382 	    w -= 1;
3383 	    d += 1;
3384 	}
3385 
3386 	while (w >= 2 && ((uintptr_t)d & 3))
3387 	{
3388 	    *(uint16_t *)d = filler;
3389 	    w -= 2;
3390 	    d += 2;
3391 	}
3392 
3393 	while (w >= 4 && ((uintptr_t)d & 15))
3394 	{
3395 	    *(uint32_t *)d = filler;
3396 
3397 	    w -= 4;
3398 	    d += 4;
3399 	}
3400 
3401 	while (w >= 128)
3402 	{
3403 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3404 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3405 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3406 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3407 	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3408 	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3409 	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3410 	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
3411 
3412 	    d += 128;
3413 	    w -= 128;
3414 	}
3415 
3416 	if (w >= 64)
3417 	{
3418 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3419 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3420 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3421 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3422 
3423 	    d += 64;
3424 	    w -= 64;
3425 	}
3426 
3427 	if (w >= 32)
3428 	{
3429 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3430 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3431 
3432 	    d += 32;
3433 	    w -= 32;
3434 	}
3435 
3436 	if (w >= 16)
3437 	{
3438 	    save_128_aligned ((__m128i*)(d),     xmm_def);
3439 
3440 	    d += 16;
3441 	    w -= 16;
3442 	}
3443 
3444 	while (w >= 4)
3445 	{
3446 	    *(uint32_t *)d = filler;
3447 
3448 	    w -= 4;
3449 	    d += 4;
3450 	}
3451 
3452 	if (w >= 2)
3453 	{
3454 	    *(uint16_t *)d = filler;
3455 	    w -= 2;
3456 	    d += 2;
3457 	}
3458 
3459 	if (w >= 1)
3460 	{
3461 	    *(uint8_t *)d = filler;
3462 	    w -= 1;
3463 	    d += 1;
3464 	}
3465     }
3466 
3467     return TRUE;
3468 }
3469 
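/* Fast path: solid source IN an a8 mask, written with the SRC operator
 * to an 8888 destination.  A zero source degenerates to sse2_fill (0).
 */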
3470 static void
3471 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3472                              pixman_composite_info_t *info)
3473 {
3474     PIXMAN_COMPOSITE_ARGS (info);
3475     uint32_t src, srca;
3476     uint32_t    *dst_line, *dst;
3477     uint8_t     *mask_line, *mask;
3478     int dst_stride, mask_stride;
3479     int32_t w;
3480     uint32_t m;
3481 
3482     __m128i xmm_src, xmm_def;
3483     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484 
3485     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3486 
3487     srca = src >> 24;
3488     if (src == 0)
3489     {
3490 	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3491 		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
3492 		   dest_x, dest_y, width, height, 0);
3493 	return;
3494     }
3495 
3496     PIXMAN_IMAGE_GET_LINE (
3497 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3498     PIXMAN_IMAGE_GET_LINE (
3499 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500 
3501     xmm_def = create_mask_2x32_128 (src, src);
3502     xmm_src = expand_pixel_32_1x128 (src);
3503 
3504     while (height--)
3505     {
3506 	dst = dst_line;
3507 	dst_line += dst_stride;
3508 	mask = mask_line;
3509 	mask_line += mask_stride;
3510 	w = width;
3511 
3512 	while (w && (uintptr_t)dst & 15)
3513 	{
3514 	    uint8_t m = *mask++;
3515 
3516 	    if (m)
3517 	    {
3518 		*dst = pack_1x128_32 (
3519 		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3520 	    }
3521 	    else
3522 	    {
3523 		*dst = 0;
3524 	    }
3525 
3526 	    w--;
3527 	    dst++;
3528 	}
3529 
3530 	while (w >= 4)
3531 	{
3532             memcpy(&m, mask, sizeof(uint32_t));
3533 
3534 	    if (srca == 0xff && m == 0xffffffff)
3535 	    {
3536 		save_128_aligned ((__m128i*)dst, xmm_def);
3537 	    }
3538 	    else if (m)
3539 	    {
3540 		xmm_mask = unpack_32_1x128 (m);
3541 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3542 
3543 		/* Unpacking */
3544 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3545 
3546 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3547 					&xmm_mask_lo, &xmm_mask_hi);
3548 
3549 		pix_multiply_2x128 (&xmm_src, &xmm_src,
3550 				    &xmm_mask_lo, &xmm_mask_hi,
3551 				    &xmm_mask_lo, &xmm_mask_hi);
3552 
3553 		save_128_aligned (
3554 		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3555 	    }
3556 	    else
3557 	    {
3558 		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3559 	    }
3560 
3561 	    w -= 4;
3562 	    dst += 4;
3563 	    mask += 4;
3564 	}
3565 
3566 	while (w)
3567 	{
3568 	    uint8_t m = *mask++;
3569 
3570 	    if (m)
3571 	    {
3572 		*dst = pack_1x128_32 (
3573 		    pix_multiply_1x128 (
3574 			xmm_src, expand_pixel_8_1x128 (m)));
3575 	    }
3576 	    else
3577 	    {
3578 		*dst = 0;
3579 	    }
3580 
3581 	    w--;
3582 	    dst++;
3583 	}
3584     }
3585 
3586 }
3587 
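/* Fast path: solid source OVER a 565 destination with an a8 mask,
 * eight pixels per iteration.
 */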
3588 static void
3589 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3590                               pixman_composite_info_t *info)
3591 {
3592     PIXMAN_COMPOSITE_ARGS (info);
3593     uint32_t src;
3594     uint16_t    *dst_line, *dst, d;
3595     uint8_t     *mask_line, *mask;
3596     int dst_stride, mask_stride;
3597     int32_t w;
3598     uint32_t m;
3599     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600 
3601     __m128i xmm_src, xmm_alpha;
3602     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3603     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604 
3605     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3606 
3607     if (src == 0)
3608 	return;
3609 
3610     PIXMAN_IMAGE_GET_LINE (
3611 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3612     PIXMAN_IMAGE_GET_LINE (
3613 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614 
3615     xmm_src = expand_pixel_32_1x128 (src);
3616     xmm_alpha = expand_alpha_1x128 (xmm_src);
3617     mmx_src = xmm_src;
3618     mmx_alpha = xmm_alpha;
3619 
3620     while (height--)
3621     {
3622 	dst = dst_line;
3623 	dst_line += dst_stride;
3624 	mask = mask_line;
3625 	mask_line += mask_stride;
3626 	w = width;
3627 
3628 	while (w && (uintptr_t)dst & 15)
3629 	{
3630 	    m = *mask++;
3631 
3632 	    if (m)
3633 	    {
3634 		d = *dst;
3635 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3636 		mmx_dest = expand565_16_1x128 (d);
3637 
3638 		*dst = pack_565_32_16 (
3639 		    pack_1x128_32 (
3640 			in_over_1x128 (
3641 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3642 	    }
3643 
3644 	    w--;
3645 	    dst++;
3646 	}
3647 
3648 	while (w >= 8)
3649 	{
3650 	    xmm_dst = load_128_aligned ((__m128i*) dst);
3651 	    unpack_565_128_4x128 (xmm_dst,
3652 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3653 
3654             memcpy(&m, mask, sizeof(uint32_t));
3655 	    mask += 4;
3656 
3657 	    if (m)
3658 	    {
3659 		xmm_mask = unpack_32_1x128 (m);
3660 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3661 
3662 		/* Unpacking */
3663 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3664 
3665 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3666 					&xmm_mask_lo, &xmm_mask_hi);
3667 
3668 		in_over_2x128 (&xmm_src, &xmm_src,
3669 			       &xmm_alpha, &xmm_alpha,
3670 			       &xmm_mask_lo, &xmm_mask_hi,
3671 			       &xmm_dst0, &xmm_dst1);
3672 	    }
3673 
3674             memcpy(&m, mask, sizeof(uint32_t));
3675 	    mask += 4;
3676 
3677 	    if (m)
3678 	    {
3679 		xmm_mask = unpack_32_1x128 (m);
3680 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3681 
3682 		/* Unpacking */
3683 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3684 
3685 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3686 					&xmm_mask_lo, &xmm_mask_hi);
3687 		in_over_2x128 (&xmm_src, &xmm_src,
3688 			       &xmm_alpha, &xmm_alpha,
3689 			       &xmm_mask_lo, &xmm_mask_hi,
3690 			       &xmm_dst2, &xmm_dst3);
3691 	    }
3692 
3693 	    save_128_aligned (
3694 		(__m128i*)dst, pack_565_4x128_128 (
3695 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3696 
3697 	    w -= 8;
3698 	    dst += 8;
3699 	}
3700 
3701 	while (w)
3702 	{
3703 	    m = *mask++;
3704 
3705 	    if (m)
3706 	    {
3707 		d = *dst;
3708 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3709 		mmx_dest = expand565_16_1x128 (d);
3710 
3711 		*dst = pack_565_32_16 (
3712 		    pack_1x128_32 (
3713 			in_over_1x128 (
3714 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3715 	    }
3716 
3717 	    w--;
3718 	    dst++;
3719 	}
3720     }
3721 
3722 }
3723 
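/* Fast path: OVER from a non-premultiplied source with swapped R/B
 * channels (pixbuf) onto a 565 destination.  Fully opaque groups only
 * need their colors swapped; fully transparent groups are skipped.
 */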
3724 static void
3725 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3726                                  pixman_composite_info_t *info)
3727 {
3728     PIXMAN_COMPOSITE_ARGS (info);
3729     uint16_t    *dst_line, *dst, d;
3730     uint32_t    *src_line, *src, s;
3731     int dst_stride, src_stride;
3732     int32_t w;
3733     uint32_t opaque, zero;
3734 
3735     __m128i ms;
3736     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3737     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3738 
3739     PIXMAN_IMAGE_GET_LINE (
3740 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3741     PIXMAN_IMAGE_GET_LINE (
3742 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3743 
3744     while (height--)
3745     {
3746 	dst = dst_line;
3747 	dst_line += dst_stride;
3748 	src = src_line;
3749 	src_line += src_stride;
3750 	w = width;
3751 
3752 	while (w && (uintptr_t)dst & 15)
3753 	{
3754 	    s = *src++;
3755 	    d = *dst;
3756 
3757 	    ms = unpack_32_1x128 (s);
3758 
3759 	    *dst++ = pack_565_32_16 (
3760 		pack_1x128_32 (
3761 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3762 	    w--;
3763 	}
3764 
3765 	while (w >= 8)
3766 	{
3767 	    /* First round */
3768 	    xmm_src = load_128_unaligned ((__m128i*)src);
3769 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
3770 
3771 	    opaque = is_opaque (xmm_src);
3772 	    zero = is_zero (xmm_src);
3773 
3774 	    unpack_565_128_4x128 (xmm_dst,
3775 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3776 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3777 
3778 	    /* preload next round */
3779 	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3780 
3781 	    if (opaque)
3782 	    {
3783 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3784 				     &xmm_dst0, &xmm_dst1);
3785 	    }
3786 	    else if (!zero)
3787 	    {
3788 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3789 					&xmm_dst0, &xmm_dst1);
3790 	    }
3791 
3792 	    /* Second round */
3793 	    opaque = is_opaque (xmm_src);
3794 	    zero = is_zero (xmm_src);
3795 
3796 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3797 
3798 	    if (opaque)
3799 	    {
3800 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3801 				     &xmm_dst2, &xmm_dst3);
3802 	    }
3803 	    else if (!zero)
3804 	    {
3805 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3806 					&xmm_dst2, &xmm_dst3);
3807 	    }
3808 
3809 	    save_128_aligned (
3810 		(__m128i*)dst, pack_565_4x128_128 (
3811 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3812 
3813 	    w -= 8;
3814 	    src += 8;
3815 	    dst += 8;
3816 	}
3817 
3818 	while (w)
3819 	{
3820 	    s = *src++;
3821 	    d = *dst;
3822 
3823 	    ms = unpack_32_1x128 (s);
3824 
3825 	    *dst++ = pack_565_32_16 (
3826 		pack_1x128_32 (
3827 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3828 	    w--;
3829 	}
3830     }
3831 
3832 }
3833 
3834 static void
3835 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3836                                  pixman_composite_info_t *info)
3837 {
3838     PIXMAN_COMPOSITE_ARGS (info);
3839     uint32_t    *dst_line, *dst, d;
3840     uint32_t    *src_line, *src, s;
3841     int dst_stride, src_stride;
3842     int32_t w;
3843     uint32_t opaque, zero;
3844 
3845     __m128i xmm_src_lo, xmm_src_hi;
3846     __m128i xmm_dst_lo, xmm_dst_hi;
3847 
3848     PIXMAN_IMAGE_GET_LINE (
3849 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3850     PIXMAN_IMAGE_GET_LINE (
3851 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3852 
3853     while (height--)
3854     {
3855 	dst = dst_line;
3856 	dst_line += dst_stride;
3857 	src = src_line;
3858 	src_line += src_stride;
3859 	w = width;
3860 
3861 	while (w && (uintptr_t)dst & 15)
3862 	{
3863 	    s = *src++;
3864 	    d = *dst;
3865 
3866 	    *dst++ = pack_1x128_32 (
3867 		over_rev_non_pre_1x128 (
3868 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3869 
3870 	    w--;
3871 	}
3872 
3873 	while (w >= 4)
3874 	{
3875 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
3876 
3877 	    opaque = is_opaque (xmm_src_hi);
3878 	    zero = is_zero (xmm_src_hi);
3879 
3880 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3881 
3882 	    if (opaque)
3883 	    {
3884 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3885 				     &xmm_dst_lo, &xmm_dst_hi);
3886 
3887 		save_128_aligned (
3888 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3889 	    }
3890 	    else if (!zero)
3891 	    {
3892 		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3893 
3894 		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3895 
3896 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3897 					&xmm_dst_lo, &xmm_dst_hi);
3898 
3899 		save_128_aligned (
3900 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3901 	    }
3902 
3903 	    w -= 4;
3904 	    dst += 4;
3905 	    src += 4;
3906 	}
3907 
3908 	while (w)
3909 	{
3910 	    s = *src++;
3911 	    d = *dst;
3912 
3913 	    *dst++ = pack_1x128_32 (
3914 		over_rev_non_pre_1x128 (
3915 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3916 
3917 	    w--;
3918 	}
3919     }
3920 
3921 }
3922 
3923 static void
3924 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3925                                     pixman_composite_info_t *info)
3926 {
3927     PIXMAN_COMPOSITE_ARGS (info);
3928     uint32_t src;
3929     uint16_t    *dst_line, *dst, d;
3930     uint32_t    *mask_line, *mask, m;
3931     int dst_stride, mask_stride;
3932     int w;
3933     uint32_t pack_cmp;
3934 
3935     __m128i xmm_src, xmm_alpha;
3936     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3937     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3938 
3939     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3940 
3941     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3942 
3943     if (src == 0)
3944 	return;
3945 
3946     PIXMAN_IMAGE_GET_LINE (
3947 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3948     PIXMAN_IMAGE_GET_LINE (
3949 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3950 
3951     xmm_src = expand_pixel_32_1x128 (src);
3952     xmm_alpha = expand_alpha_1x128 (xmm_src);
3953     mmx_src = xmm_src;
3954     mmx_alpha = xmm_alpha;
3955 
3956     while (height--)
3957     {
3958 	w = width;
3959 	mask = mask_line;
3960 	dst = dst_line;
3961 	mask_line += mask_stride;
3962 	dst_line += dst_stride;
3963 
3964 	while (w && ((uintptr_t)dst & 15))
3965 	{
3966 	    m = *(uint32_t *) mask;
3967 
3968 	    if (m)
3969 	    {
3970 		d = *dst;
3971 		mmx_mask = unpack_32_1x128 (m);
3972 		mmx_dest = expand565_16_1x128 (d);
3973 
3974 		*dst = pack_565_32_16 (
3975 		    pack_1x128_32 (
3976 			in_over_1x128 (
3977 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3978 	    }
3979 
3980 	    w--;
3981 	    dst++;
3982 	    mask++;
3983 	}
3984 
3985 	while (w >= 8)
3986 	{
3987 	    /* First round */
3988 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
3989 	    xmm_dst = load_128_aligned ((__m128i*)dst);
3990 
3991 	    pack_cmp = _mm_movemask_epi8 (
3992 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3993 
3994 	    unpack_565_128_4x128 (xmm_dst,
3995 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3996 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3997 
3998 	    /* preload next round */
3999 	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4000 
4002 	    if (pack_cmp != 0xffff)
4003 	    {
4004 		in_over_2x128 (&xmm_src, &xmm_src,
4005 			       &xmm_alpha, &xmm_alpha,
4006 			       &xmm_mask_lo, &xmm_mask_hi,
4007 			       &xmm_dst0, &xmm_dst1);
4008 	    }
4009 
4010 	    /* Second round */
4011 	    pack_cmp = _mm_movemask_epi8 (
4012 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4013 
4014 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4015 
4016 	    if (pack_cmp != 0xffff)
4017 	    {
4018 		in_over_2x128 (&xmm_src, &xmm_src,
4019 			       &xmm_alpha, &xmm_alpha,
4020 			       &xmm_mask_lo, &xmm_mask_hi,
4021 			       &xmm_dst2, &xmm_dst3);
4022 	    }
4023 
4024 	    save_128_aligned (
4025 		(__m128i*)dst, pack_565_4x128_128 (
4026 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4027 
4028 	    w -= 8;
4029 	    dst += 8;
4030 	    mask += 8;
4031 	}
4032 
4033 	while (w)
4034 	{
4035 	    m = *(uint32_t *) mask;
4036 
4037 	    if (m)
4038 	    {
4039 		d = *dst;
4040 		mmx_mask = unpack_32_1x128 (m);
4041 		mmx_dest = expand565_16_1x128 (d);
4042 
4043 		*dst = pack_565_32_16 (
4044 		    pack_1x128_32 (
4045 			in_over_1x128 (
4046 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4047 	    }
4048 
4049 	    w--;
4050 	    dst++;
4051 	    mask++;
4052 	}
4053     }
4054 
4055 }
4056 
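/* Solid source, a8 mask, a8 destination (PIXMAN_OP_IN):
 * each destination byte becomes d * (src.a * m / 255) / 255.
 * The unaligned head and the tail are handled one pixel at a time;
 * the aligned body processes 16 a8 pixels per iteration.
 */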
4057 static void
4058 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4059                          pixman_composite_info_t *info)
4060 {
4061     PIXMAN_COMPOSITE_ARGS (info);
4062     uint8_t     *dst_line, *dst;
4063     uint8_t     *mask_line, *mask;
4064     int dst_stride, mask_stride;
4065     uint32_t d, m;
4066     uint32_t src;
4067     int32_t w;
4068 
4069     __m128i xmm_alpha;
4070     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4071     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4072 
4073     PIXMAN_IMAGE_GET_LINE (
4074 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4075     PIXMAN_IMAGE_GET_LINE (
4076 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4077 
4078     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4079 
4080     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4081 
4082     while (height--)
4083     {
4084 	dst = dst_line;
4085 	dst_line += dst_stride;
4086 	mask = mask_line;
4087 	mask_line += mask_stride;
4088 	w = width;
4089 
4090 	while (w && ((uintptr_t)dst & 15))
4091 	{
4092 	    m = (uint32_t) *mask++;
4093 	    d = (uint32_t) *dst;
4094 
4095 	    *dst++ = (uint8_t) pack_1x128_32 (
4096 		pix_multiply_1x128 (
4097 		    pix_multiply_1x128 (xmm_alpha,
4098 				       unpack_32_1x128 (m)),
4099 		    unpack_32_1x128 (d)));
4100 	    w--;
4101 	}
4102 
4103 	while (w >= 16)
4104 	{
4105 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4106 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4107 
4108 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4109 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4110 
4111 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4112 				&xmm_mask_lo, &xmm_mask_hi,
4113 				&xmm_mask_lo, &xmm_mask_hi);
4114 
4115 	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4116 				&xmm_dst_lo, &xmm_dst_hi,
4117 				&xmm_dst_lo, &xmm_dst_hi);
4118 
4119 	    save_128_aligned (
4120 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4121 
4122 	    mask += 16;
4123 	    dst += 16;
4124 	    w -= 16;
4125 	}
4126 
4127 	while (w)
4128 	{
4129 	    m = (uint32_t) *mask++;
4130 	    d = (uint32_t) *dst;
4131 
4132 	    *dst++ = (uint8_t) pack_1x128_32 (
4133 		pix_multiply_1x128 (
4134 		    pix_multiply_1x128 (
4135 			xmm_alpha, unpack_32_1x128 (m)),
4136 		    unpack_32_1x128 (d)));
4137 	    w--;
4138 	}
4139     }
4140 
4141 }
4142 
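/* Solid source IN a8 destination: d = d * src.a / 255.
 * src.a == 0xff leaves the destination unchanged, so we return early;
 * src.a == 0x00 degenerates to clearing the rectangle with pixman_fill().
 */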
4143 static void
4144 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4145 		       pixman_composite_info_t *info)
4146 {
4147     PIXMAN_COMPOSITE_ARGS (info);
4148     uint8_t     *dst_line, *dst;
4149     int dst_stride;
4150     uint32_t d;
4151     uint32_t src;
4152     int32_t w;
4153 
4154     __m128i xmm_alpha;
4155     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4156 
4157     PIXMAN_IMAGE_GET_LINE (
4158 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4159 
4160     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4161 
4162     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4163 
4164     src = src >> 24;
4165 
4166     if (src == 0xff)
4167 	return;
4168 
4169     if (src == 0x00)
4170     {
4171 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4172 		     8, dest_x, dest_y, width, height, src);
4173 
4174 	return;
4175     }
4176 
4177     while (height--)
4178     {
4179 	dst = dst_line;
4180 	dst_line += dst_stride;
4181 	w = width;
4182 
4183 	while (w && ((uintptr_t)dst & 15))
4184 	{
4185 	    d = (uint32_t) *dst;
4186 
4187 	    *dst++ = (uint8_t) pack_1x128_32 (
4188 		pix_multiply_1x128 (
4189 		    xmm_alpha,
4190 		    unpack_32_1x128 (d)));
4191 	    w--;
4192 	}
4193 
4194 	while (w >= 16)
4195 	{
4196 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4197 
4198 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4199 
4200 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4201 				&xmm_dst_lo, &xmm_dst_hi,
4202 				&xmm_dst_lo, &xmm_dst_hi);
4203 
4204 	    save_128_aligned (
4205 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4206 
4207 	    dst += 16;
4208 	    w -= 16;
4209 	}
4210 
4211 	while (w)
4212 	{
4213 	    d = (uint32_t) *dst;
4214 
4215 	    *dst++ = (uint8_t) pack_1x128_32 (
4216 		pix_multiply_1x128 (
4217 		    xmm_alpha,
4218 		    unpack_32_1x128 (d)));
4219 	    w--;
4220 	}
4221     }
4222 
4223 }
4224 
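/* a8 source IN a8 destination: d = s * d / 255, 16 pixels per SSE2
 * iteration with scalar head and tail loops.
 */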
4225 static void
4226 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4227                        pixman_composite_info_t *info)
4228 {
4229     PIXMAN_COMPOSITE_ARGS (info);
4230     uint8_t     *dst_line, *dst;
4231     uint8_t     *src_line, *src;
4232     int src_stride, dst_stride;
4233     int32_t w;
4234     uint32_t s, d;
4235 
4236     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4237     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4238 
4239     PIXMAN_IMAGE_GET_LINE (
4240 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4241     PIXMAN_IMAGE_GET_LINE (
4242 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4243 
4244     while (height--)
4245     {
4246 	dst = dst_line;
4247 	dst_line += dst_stride;
4248 	src = src_line;
4249 	src_line += src_stride;
4250 	w = width;
4251 
4252 	while (w && ((uintptr_t)dst & 15))
4253 	{
4254 	    s = (uint32_t) *src++;
4255 	    d = (uint32_t) *dst;
4256 
4257 	    *dst++ = (uint8_t) pack_1x128_32 (
4258 		pix_multiply_1x128 (
4259 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
4260 	    w--;
4261 	}
4262 
4263 	while (w >= 16)
4264 	{
4265 	    xmm_src = load_128_unaligned ((__m128i*)src);
4266 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4267 
4268 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4269 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4270 
4271 	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4272 				&xmm_dst_lo, &xmm_dst_hi,
4273 				&xmm_dst_lo, &xmm_dst_hi);
4274 
4275 	    save_128_aligned (
4276 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4277 
4278 	    src += 16;
4279 	    dst += 16;
4280 	    w -= 16;
4281 	}
4282 
4283 	while (w)
4284 	{
4285 	    s = (uint32_t) *src++;
4286 	    d = (uint32_t) *dst;
4287 
4288 	    *dst++ = (uint8_t) pack_1x128_32 (
4289 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4290 	    w--;
4291 	}
4292     }
4293 
4294 }
4295 
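/* Solid source, a8 mask, a8 destination (PIXMAN_OP_ADD):
 * d = saturate (d + src.a * m / 255).
 */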
4296 static void
4297 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4298 			  pixman_composite_info_t *info)
4299 {
4300     PIXMAN_COMPOSITE_ARGS (info);
4301     uint8_t     *dst_line, *dst;
4302     uint8_t     *mask_line, *mask;
4303     int dst_stride, mask_stride;
4304     int32_t w;
4305     uint32_t src;
4306     uint32_t m, d;
4307 
4308     __m128i xmm_alpha;
4309     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4310     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4311 
4312     PIXMAN_IMAGE_GET_LINE (
4313 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4314     PIXMAN_IMAGE_GET_LINE (
4315 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4316 
4317     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4318 
4319     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4320 
4321     while (height--)
4322     {
4323 	dst = dst_line;
4324 	dst_line += dst_stride;
4325 	mask = mask_line;
4326 	mask_line += mask_stride;
4327 	w = width;
4328 
4329 	while (w && ((uintptr_t)dst & 15))
4330 	{
4331 	    m = (uint32_t) *mask++;
4332 	    d = (uint32_t) *dst;
4333 
4334 	    *dst++ = (uint8_t) pack_1x128_32 (
4335 		_mm_adds_epu16 (
4336 		    pix_multiply_1x128 (
4337 			xmm_alpha, unpack_32_1x128 (m)),
4338 		    unpack_32_1x128 (d)));
4339 	    w--;
4340 	}
4341 
4342 	while (w >= 16)
4343 	{
4344 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4345 	    xmm_dst = load_128_aligned ((__m128i*)dst);
4346 
4347 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4348 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4349 
4350 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4351 				&xmm_mask_lo, &xmm_mask_hi,
4352 				&xmm_mask_lo, &xmm_mask_hi);
4353 
4354 	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4355 	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4356 
4357 	    save_128_aligned (
4358 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4359 
4360 	    mask += 16;
4361 	    dst += 16;
4362 	    w -= 16;
4363 	}
4364 
4365 	while (w)
4366 	{
4367 	    m = (uint32_t) *mask++;
4368 	    d = (uint32_t) *dst;
4369 
4370 	    *dst++ = (uint8_t) pack_1x128_32 (
4371 		_mm_adds_epu16 (
4372 		    pix_multiply_1x128 (
4373 			xmm_alpha, unpack_32_1x128 (m)),
4374 		    unpack_32_1x128 (d)));
4375 
4376 	    w--;
4377 	}
4378     }
4379 
4380 }
4381 
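/* Solid source added to an a8 destination: d = saturate (d + src.a).
 * src.a == 0 is a no-op; src.a == 0xff saturates every byte, which is
 * just a fill with 0xff.  Otherwise the alpha is replicated into all
 * sixteen byte lanes and added with _mm_adds_epu8.
 */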
4382 static void
4383 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4384 			pixman_composite_info_t *info)
4385 {
4386     PIXMAN_COMPOSITE_ARGS (info);
4387     uint8_t     *dst_line, *dst;
4388     int dst_stride;
4389     int32_t w;
4390     uint32_t src;
4391 
4392     __m128i xmm_src;
4393 
4394     PIXMAN_IMAGE_GET_LINE (
4395 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4396 
4397     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4398 
4399     src >>= 24;
4400 
4401     if (src == 0x00)
4402 	return;
4403 
4404     if (src == 0xff)
4405     {
4406 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4407 		     8, dest_x, dest_y, width, height, 0xff);
4408 
4409 	return;
4410     }
4411 
4412     src = (src << 24) | (src << 16) | (src << 8) | src;
4413     xmm_src = _mm_set_epi32 (src, src, src, src);
4414 
4415     while (height--)
4416     {
4417 	dst = dst_line;
4418 	dst_line += dst_stride;
4419 	w = width;
4420 
4421 	while (w && ((uintptr_t)dst & 15))
4422 	{
4423 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4424 		_mm_adds_epu8 (
4425 		    xmm_src,
4426 		    _mm_cvtsi32_si128 (*dst)));
4427 
4428 	    w--;
4429 	    dst++;
4430 	}
4431 
4432 	while (w >= 16)
4433 	{
4434 	    save_128_aligned (
4435 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4436 
4437 	    dst += 16;
4438 	    w -= 16;
4439 	}
4440 
4441 	while (w)
4442 	{
4443 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4444 		_mm_adds_epu8 (
4445 		    xmm_src,
4446 		    _mm_cvtsi32_si128 (*dst)));
4447 
4448 	    w--;
4449 	    dst++;
4450 	}
4451     }
4452 
4453 }
4454 
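/* a8 plus a8 with saturation.  The aligned middle of each scanline is
 * handed to sse2_combine_add_u(), treating groups of four a8 pixels as
 * one 32-bit unit.  The head and tail use a scalar trick: t holds the
 * 9-bit sum, so (t >> 8) is 1 on overflow, 0 - (t >> 8) is then 0xffff,
 * and t | 0xffff truncates to 0xff when stored back as a byte.
 * E.g. 200 + 100 = 0x12c, 0x12c | 0xffff -> stored as 0xff.
 */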
4455 static void
4456 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4457 			pixman_composite_info_t *info)
4458 {
4459     PIXMAN_COMPOSITE_ARGS (info);
4460     uint8_t     *dst_line, *dst;
4461     uint8_t     *src_line, *src;
4462     int dst_stride, src_stride;
4463     int32_t w;
4464     uint16_t t;
4465 
4466     PIXMAN_IMAGE_GET_LINE (
4467 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4468     PIXMAN_IMAGE_GET_LINE (
4469 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4470 
4471     while (height--)
4472     {
4473 	dst = dst_line;
4474 	src = src_line;
4475 
4476 	dst_line += dst_stride;
4477 	src_line += src_stride;
4478 	w = width;
4479 
4480 	/* Small head */
4481 	while (w && (uintptr_t)dst & 3)
4482 	{
4483 	    t = (*dst) + (*src++);
4484 	    *dst++ = t | (0 - (t >> 8));
4485 	    w--;
4486 	}
4487 
4488 	sse2_combine_add_u (imp, op,
4489 			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4490 
4491 	/* Small tail */
4492 	dst += w & 0xfffc;
4493 	src += w & 0xfffc;
4494 
4495 	w &= 3;
4496 
4497 	while (w)
4498 	{
4499 	    t = (*dst) + (*src++);
4500 	    *dst++ = t | (0 - (t >> 8));
4501 	    w--;
4502 	}
4503     }
4504 
4505 }
4506 
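/* Saturating ADD of two a8r8g8b8 images; each scanline is delegated to
 * sse2_combine_add_u().
 */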
4507 static void
4508 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4509                               pixman_composite_info_t *info)
4510 {
4511     PIXMAN_COMPOSITE_ARGS (info);
4512     uint32_t    *dst_line, *dst;
4513     uint32_t    *src_line, *src;
4514     int dst_stride, src_stride;
4515 
4516     PIXMAN_IMAGE_GET_LINE (
4517 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4518     PIXMAN_IMAGE_GET_LINE (
4519 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4520 
4521     while (height--)
4522     {
4523 	dst = dst_line;
4524 	dst_line += dst_stride;
4525 	src = src_line;
4526 	src_line += src_stride;
4527 
4528 	sse2_combine_add_u (imp, op, dst, src, NULL, width);
4529     }
4530 }
4531 
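/* Solid a8r8g8b8 source added to an a8r8g8b8 destination with per-byte
 * saturation.  A zero source is a no-op and an all-ones source is a
 * plain fill; everything else uses _mm_adds_epu8 on four pixels at a
 * time.
 */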
4532 static void
4533 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4534 			   pixman_composite_info_t *info)
4535 {
4536     PIXMAN_COMPOSITE_ARGS (info);
4537     uint32_t *dst_line, *dst, src;
4538     int dst_stride;
4539 
4540     __m128i xmm_src;
4541 
4542     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4543 
4544     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4545     if (src == 0)
4546 	return;
4547 
4548     if (src == ~0)
4549     {
4550 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4551 		     dest_x, dest_y, width, height, ~0);
4552 
4553 	return;
4554     }
4555 
4556     xmm_src = _mm_set_epi32 (src, src, src, src);
4557     while (height--)
4558     {
4559 	int w = width;
4560 	uint32_t d;
4561 
4562 	dst = dst_line;
4563 	dst_line += dst_stride;
4564 
4565 	while (w && (uintptr_t)dst & 15)
4566 	{
4567 	    d = *dst;
4568 	    *dst++ =
4569 		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4570 	    w--;
4571 	}
4572 
4573 	while (w >= 4)
4574 	{
4575 	    save_128_aligned
4576 		((__m128i*)dst,
4577 		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4578 
4579 	    dst += 4;
4580 	    w -= 4;
4581 	}
4582 
4583 	while (w--)
4584 	{
4585 	    d = *dst;
4586 	    *dst++ =
4587 		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4588 						  _mm_cvtsi32_si128 (d)));
4589 	}
4590     }
4591 }
4592 
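/* Solid source, a8 mask, a8r8g8b8 destination (PIXMAN_OP_ADD):
 * d = saturate (d + src * m / 255) per channel.  Groups of four mask
 * bytes that are all zero are skipped entirely.
 */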
4593 static void
4594 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4595 			     pixman_composite_info_t *info)
4596 {
4597     PIXMAN_COMPOSITE_ARGS (info);
4598     uint32_t     *dst_line, *dst;
4599     uint8_t     *mask_line, *mask;
4600     int dst_stride, mask_stride;
4601     int32_t w;
4602     uint32_t src;
4603 
4604     __m128i xmm_src;
4605 
4606     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4607     if (src == 0)
4608 	return;
4609     xmm_src = expand_pixel_32_1x128 (src);
4610 
4611     PIXMAN_IMAGE_GET_LINE (
4612 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4613     PIXMAN_IMAGE_GET_LINE (
4614 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4615 
4616     while (height--)
4617     {
4618 	dst = dst_line;
4619 	dst_line += dst_stride;
4620 	mask = mask_line;
4621 	mask_line += mask_stride;
4622 	w = width;
4623 
4624 	while (w && ((uintptr_t)dst & 15))
4625 	{
4626 	    uint8_t m = *mask++;
4627 	    if (m)
4628 	    {
4629 		*dst = pack_1x128_32
4630 		    (_mm_adds_epu16
4631 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4632 		      unpack_32_1x128 (*dst)));
4633 	    }
4634 	    dst++;
4635 	    w--;
4636 	}
4637 
4638 	while (w >= 4)
4639 	{
4640 	    uint32_t m;
4641             memcpy(&m, mask, sizeof(uint32_t));
4642 
4643 	    if (m)
4644 	    {
4645 		__m128i xmm_mask_lo, xmm_mask_hi;
4646 		__m128i xmm_dst_lo, xmm_dst_hi;
4647 
4648 		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4649 		__m128i xmm_mask =
4650 		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
4651 				       _mm_setzero_si128 ());
4652 
4653 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4654 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4655 
4656 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4657 					&xmm_mask_lo, &xmm_mask_hi);
4658 
4659 		pix_multiply_2x128 (&xmm_src, &xmm_src,
4660 				    &xmm_mask_lo, &xmm_mask_hi,
4661 				    &xmm_mask_lo, &xmm_mask_hi);
4662 
4663 		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4664 		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4665 
4666 		save_128_aligned (
4667 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4668 	    }
4669 
4670 	    w -= 4;
4671 	    dst += 4;
4672 	    mask += 4;
4673 	}
4674 
4675 	while (w)
4676 	{
4677 	    uint8_t m = *mask++;
4678 	    if (m)
4679 	    {
4680 		*dst = pack_1x128_32
4681 		    (_mm_adds_epu16
4682 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4683 		      unpack_32_1x128 (*dst)));
4684 	    }
4685 	    dst++;
4686 	    w--;
4687 	}
4688     }
4689 }
4690 
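/* Raw rectangle copy for matching 16 bpp or 32 bpp formats; any other
 * depth returns FALSE so the caller can fall back to a generic path.
 * Each row is copied with small 2/4 byte moves until the destination is
 * 16-byte aligned, then 64 bytes per iteration, then the remainder.
 */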
4691 static pixman_bool_t
4692 sse2_blt (pixman_implementation_t *imp,
4693           uint32_t *               src_bits,
4694           uint32_t *               dst_bits,
4695           int                      src_stride,
4696           int                      dst_stride,
4697           int                      src_bpp,
4698           int                      dst_bpp,
4699           int                      src_x,
4700           int                      src_y,
4701           int                      dest_x,
4702           int                      dest_y,
4703           int                      width,
4704           int                      height)
4705 {
4706     uint8_t *   src_bytes;
4707     uint8_t *   dst_bytes;
4708     int byte_width;
4709 
4710     if (src_bpp != dst_bpp)
4711 	return FALSE;
4712 
4713     if (src_bpp == 16)
4714     {
4715 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4716 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4717 	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4718 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4719 	byte_width = 2 * width;
4720 	src_stride *= 2;
4721 	dst_stride *= 2;
4722     }
4723     else if (src_bpp == 32)
4724     {
4725 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4726 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4727 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4728 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4729 	byte_width = 4 * width;
4730 	src_stride *= 4;
4731 	dst_stride *= 4;
4732     }
4733     else
4734     {
4735 	return FALSE;
4736     }
4737 
4738     while (height--)
4739     {
4740 	int w;
4741 	uint8_t *s = src_bytes;
4742 	uint8_t *d = dst_bytes;
4743 	src_bytes += src_stride;
4744 	dst_bytes += dst_stride;
4745 	w = byte_width;
4746 
4747 	while (w >= 2 && ((uintptr_t)d & 3))
4748 	{
4749             memmove(d, s, 2);
4750 	    w -= 2;
4751 	    s += 2;
4752 	    d += 2;
4753 	}
4754 
4755 	while (w >= 4 && ((uintptr_t)d & 15))
4756 	{
4757             memmove(d, s, 4);
4758 
4759 	    w -= 4;
4760 	    s += 4;
4761 	    d += 4;
4762 	}
4763 
4764 	while (w >= 64)
4765 	{
4766 	    __m128i xmm0, xmm1, xmm2, xmm3;
4767 
4768 	    xmm0 = load_128_unaligned ((__m128i*)(s));
4769 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4770 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4771 	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4772 
4773 	    save_128_aligned ((__m128i*)(d),    xmm0);
4774 	    save_128_aligned ((__m128i*)(d + 16), xmm1);
4775 	    save_128_aligned ((__m128i*)(d + 32), xmm2);
4776 	    save_128_aligned ((__m128i*)(d + 48), xmm3);
4777 
4778 	    s += 64;
4779 	    d += 64;
4780 	    w -= 64;
4781 	}
4782 
4783 	while (w >= 16)
4784 	{
4785 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4786 
4787 	    w -= 16;
4788 	    d += 16;
4789 	    s += 16;
4790 	}
4791 
4792 	while (w >= 4)
4793 	{
4794             memmove(d, s, 4);
4795 
4796 	    w -= 4;
4797 	    s += 4;
4798 	    d += 4;
4799 	}
4800 
4801 	if (w >= 2)
4802 	{
4803             memmove(d, s, 2);
4804 	    w -= 2;
4805 	    s += 2;
4806 	    d += 2;
4807 	}
4808     }
4809 
4810     return TRUE;
4811 }
4812 
4813 static void
4814 sse2_composite_copy_area (pixman_implementation_t *imp,
4815                           pixman_composite_info_t *info)
4816 {
4817     PIXMAN_COMPOSITE_ARGS (info);
4818     sse2_blt (imp, src_image->bits.bits,
4819 	      dest_image->bits.bits,
4820 	      src_image->bits.rowstride,
4821 	      dest_image->bits.rowstride,
4822 	      PIXMAN_FORMAT_BPP (src_image->bits.format),
4823 	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
4824 	      src_x, src_y, dest_x, dest_y, width, height);
4825 }
4826 
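/* x8r8g8b8 source OVER a8r8g8b8 destination through an a8 mask.  The
 * source alpha is forced to 0xff, so wherever the four mask bytes are
 * all 0xff the source can be stored directly; otherwise the mask is
 * expanded and a full in_over is performed.
 */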
4827 static void
4828 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4829                                  pixman_composite_info_t *info)
4830 {
4831     PIXMAN_COMPOSITE_ARGS (info);
4832     uint32_t    *src, *src_line, s;
4833     uint32_t    *dst, *dst_line, d;
4834     uint8_t         *mask, *mask_line;
4835     uint32_t m;
4836     int src_stride, mask_stride, dst_stride;
4837     int32_t w;
4838     __m128i ms;
4839 
4840     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4841     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4842     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4843 
4844     PIXMAN_IMAGE_GET_LINE (
4845 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4846     PIXMAN_IMAGE_GET_LINE (
4847 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4848     PIXMAN_IMAGE_GET_LINE (
4849 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4850 
4851     while (height--)
4852     {
4853         src = src_line;
4854         src_line += src_stride;
4855         dst = dst_line;
4856         dst_line += dst_stride;
4857         mask = mask_line;
4858         mask_line += mask_stride;
4859 
4860         w = width;
4861 
4862         while (w && (uintptr_t)dst & 15)
4863         {
4864             s = 0xff000000 | *src++;
4865             m = (uint32_t) *mask++;
4866             d = *dst;
4867             ms = unpack_32_1x128 (s);
4868 
4869             if (m != 0xff)
4870             {
4871 		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4872 		__m128i md = unpack_32_1x128 (d);
4873 
4874                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4875             }
4876 
4877             *dst++ = pack_1x128_32 (ms);
4878             w--;
4879         }
4880 
4881         while (w >= 4)
4882         {
4883             memcpy(&m, mask, sizeof(uint32_t));
4884             xmm_src = _mm_or_si128 (
4885 		load_128_unaligned ((__m128i*)src), mask_ff000000);
4886 
4887             if (m == 0xffffffff)
4888             {
4889                 save_128_aligned ((__m128i*)dst, xmm_src);
4890             }
4891             else
4892             {
4893                 xmm_dst = load_128_aligned ((__m128i*)dst);
4894 
4895                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4896 
4897                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4898                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4899                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4900 
4901                 expand_alpha_rev_2x128 (
4902 		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4903 
4904                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4905 			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4906 			       &xmm_dst_lo, &xmm_dst_hi);
4907 
4908                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4909             }
4910 
4911             src += 4;
4912             dst += 4;
4913             mask += 4;
4914             w -= 4;
4915         }
4916 
4917         while (w)
4918         {
4919             m = (uint32_t) *mask++;
4920 
4921             if (m)
4922             {
4923                 s = 0xff000000 | *src;
4924 
4925                 if (m == 0xff)
4926                 {
4927                     *dst = s;
4928                 }
4929                 else
4930                 {
4931 		    __m128i ma, md, ms;
4932 
4933                     d = *dst;
4934 
4935 		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4936 		    md = unpack_32_1x128 (d);
4937 		    ms = unpack_32_1x128 (s);
4938 
4939                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4940                 }
4941 
4942             }
4943 
4944             src++;
4945             dst++;
4946             w--;
4947         }
4948     }
4949 
4950 }
4951 
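/* a8r8g8b8 source OVER a8r8g8b8 destination through an a8 mask.
 * Four-pixel groups with a zero mask are skipped; groups with an
 * all-0xff mask and an opaque source are stored directly.
 */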
4952 static void
4953 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4954                                  pixman_composite_info_t *info)
4955 {
4956     PIXMAN_COMPOSITE_ARGS (info);
4957     uint32_t    *src, *src_line, s;
4958     uint32_t    *dst, *dst_line, d;
4959     uint8_t         *mask, *mask_line;
4960     uint32_t m;
4961     int src_stride, mask_stride, dst_stride;
4962     int32_t w;
4963 
4964     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4965     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4966     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4967 
4968     PIXMAN_IMAGE_GET_LINE (
4969 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4970     PIXMAN_IMAGE_GET_LINE (
4971 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4972     PIXMAN_IMAGE_GET_LINE (
4973 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4974 
4975     while (height--)
4976     {
4977         src = src_line;
4978         src_line += src_stride;
4979         dst = dst_line;
4980         dst_line += dst_stride;
4981         mask = mask_line;
4982         mask_line += mask_stride;
4983 
4984         w = width;
4985 
4986         while (w && (uintptr_t)dst & 15)
4987         {
4988 	    uint32_t sa;
4989 
4990             s = *src++;
4991             m = (uint32_t) *mask++;
4992             d = *dst;
4993 
4994 	    sa = s >> 24;
4995 
4996 	    if (m)
4997 	    {
4998 		if (sa == 0xff && m == 0xff)
4999 		{
5000 		    *dst = s;
5001 		}
5002 		else
5003 		{
5004 		    __m128i ms, md, ma, msa;
5005 
5006 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5007 		    ms = unpack_32_1x128 (s);
5008 		    md = unpack_32_1x128 (d);
5009 
5010 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5011 
5012 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5013 		}
5014 	    }
5015 
5016 	    dst++;
5017             w--;
5018         }
5019 
5020         while (w >= 4)
5021         {
5022             memcpy(&m, mask, sizeof(uint32_t));
5023 
5024 	    if (m)
5025 	    {
5026 		xmm_src = load_128_unaligned ((__m128i*)src);
5027 
5028 		if (m == 0xffffffff && is_opaque (xmm_src))
5029 		{
5030 		    save_128_aligned ((__m128i *)dst, xmm_src);
5031 		}
5032 		else
5033 		{
5034 		    xmm_dst = load_128_aligned ((__m128i *)dst);
5035 
5036 		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5037 
5038 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5039 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5040 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5041 
5042 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5043 		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5044 
5045 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5046 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5047 
5048 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5049 		}
5050 	    }
5051 
5052             src += 4;
5053             dst += 4;
5054             mask += 4;
5055             w -= 4;
5056         }
5057 
5058         while (w)
5059         {
5060 	    uint32_t sa;
5061 
5062             s = *src++;
5063             m = (uint32_t) *mask++;
5064             d = *dst;
5065 
5066 	    sa = s >> 24;
5067 
5068 	    if (m)
5069 	    {
5070 		if (sa == 0xff && m == 0xff)
5071 		{
5072 		    *dst = s;
5073 		}
5074 		else
5075 		{
5076 		    __m128i ms, md, ma, msa;
5077 
5078 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5079 		    ms = unpack_32_1x128 (s);
5080 		    md = unpack_32_1x128 (d);
5081 
5082 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5083 
5084 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5085 		}
5086 	    }
5087 
5088 	    dst++;
5089             w--;
5090         }
5091     }
5092 
5093 }
5094 
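/* PIXMAN_OP_OVER_REVERSE with a solid source: the destination stays on
 * top and the solid color only shows through where the destination is
 * translucent, i.e. d = d + src * (255 - d.a) / 255.
 */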
5095 static void
5096 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5097 				    pixman_composite_info_t *info)
5098 {
5099     PIXMAN_COMPOSITE_ARGS (info);
5100     uint32_t src;
5101     uint32_t    *dst_line, *dst;
5102     __m128i xmm_src;
5103     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5104     __m128i xmm_dsta_hi, xmm_dsta_lo;
5105     int dst_stride;
5106     int32_t w;
5107 
5108     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5109 
5110     if (src == 0)
5111 	return;
5112 
5113     PIXMAN_IMAGE_GET_LINE (
5114 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5115 
5116     xmm_src = expand_pixel_32_1x128 (src);
5117 
5118     while (height--)
5119     {
5120 	dst = dst_line;
5121 
5122 	dst_line += dst_stride;
5123 	w = width;
5124 
5125 	while (w && (uintptr_t)dst & 15)
5126 	{
5127 	    __m128i vd;
5128 
5129 	    vd = unpack_32_1x128 (*dst);
5130 
5131 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5132 					      xmm_src));
5133 	    w--;
5134 	    dst++;
5135 	}
5136 
5137 	while (w >= 4)
5138 	{
5139 	    __m128i tmp_lo, tmp_hi;
5140 
5141 	    xmm_dst = load_128_aligned ((__m128i*)dst);
5142 
5143 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5144 	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5145 
5146 	    tmp_lo = xmm_src;
5147 	    tmp_hi = xmm_src;
5148 
5149 	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5150 			&xmm_dsta_lo, &xmm_dsta_hi,
5151 			&tmp_lo, &tmp_hi);
5152 
5153 	    save_128_aligned (
5154 		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5155 
5156 	    w -= 4;
5157 	    dst += 4;
5158 	}
5159 
5160 	while (w)
5161 	{
5162 	    __m128i vd;
5163 
5164 	    vd = unpack_32_1x128 (*dst);
5165 
5166 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5167 					      xmm_src));
5168 	    w--;
5169 	    dst++;
5170 	}
5171 
5172     }
5173 
5174 }
5175 
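/* a8r8g8b8 source OVER a8r8g8b8 destination with an a8r8g8b8 mask, of
 * which only the alpha channel is used in this unified-alpha path.
 */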
5176 static void
5177 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5178 				    pixman_composite_info_t *info)
5179 {
5180     PIXMAN_COMPOSITE_ARGS (info);
5181     uint32_t    *src, *src_line, s;
5182     uint32_t    *dst, *dst_line, d;
5183     uint32_t    *mask, *mask_line;
5184     uint32_t    m;
5185     int src_stride, mask_stride, dst_stride;
5186     int32_t w;
5187 
5188     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5189     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5190     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5191 
5192     PIXMAN_IMAGE_GET_LINE (
5193 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5194     PIXMAN_IMAGE_GET_LINE (
5195 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5196     PIXMAN_IMAGE_GET_LINE (
5197 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5198 
5199     while (height--)
5200     {
5201         src = src_line;
5202         src_line += src_stride;
5203         dst = dst_line;
5204         dst_line += dst_stride;
5205         mask = mask_line;
5206         mask_line += mask_stride;
5207 
5208         w = width;
5209 
5210         while (w && (uintptr_t)dst & 15)
5211         {
5212 	    uint32_t sa;
5213 
5214             s = *src++;
5215             m = (*mask++) >> 24;
5216             d = *dst;
5217 
5218 	    sa = s >> 24;
5219 
5220 	    if (m)
5221 	    {
5222 		if (sa == 0xff && m == 0xff)
5223 		{
5224 		    *dst = s;
5225 		}
5226 		else
5227 		{
5228 		    __m128i ms, md, ma, msa;
5229 
5230 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5231 		    ms = unpack_32_1x128 (s);
5232 		    md = unpack_32_1x128 (d);
5233 
5234 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5235 
5236 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5237 		}
5238 	    }
5239 
5240 	    dst++;
5241             w--;
5242         }
5243 
5244         while (w >= 4)
5245         {
5246 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
5247 
5248 	    if (!is_transparent (xmm_mask))
5249 	    {
5250 		xmm_src = load_128_unaligned ((__m128i*)src);
5251 
5252 		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5253 		{
5254 		    save_128_aligned ((__m128i *)dst, xmm_src);
5255 		}
5256 		else
5257 		{
5258 		    xmm_dst = load_128_aligned ((__m128i *)dst);
5259 
5260 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5261 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5262 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5263 
5264 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5265 		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5266 
5267 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5268 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5269 
5270 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5271 		}
5272 	    }
5273 
5274             src += 4;
5275             dst += 4;
5276             mask += 4;
5277             w -= 4;
5278         }
5279 
5280         while (w)
5281         {
5282 	    uint32_t sa;
5283 
5284             s = *src++;
5285             m = (*mask++) >> 24;
5286             d = *dst;
5287 
5288 	    sa = s >> 24;
5289 
5290 	    if (m)
5291 	    {
5292 		if (sa == 0xff && m == 0xff)
5293 		{
5294 		    *dst = s;
5295 		}
5296 		else
5297 		{
5298 		    __m128i ms, md, ma, msa;
5299 
5300 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5301 		    ms = unpack_32_1x128 (s);
5302 		    md = unpack_32_1x128 (d);
5303 
5304 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5305 
5306 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5307 		}
5308 	    }
5309 
5310 	    dst++;
5311             w--;
5312         }
5313     }
5314 
5315 }
5316 
5317 /* A variant of 'sse2_combine_over_u' with minor tweaks */
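/* The source is sampled with nearest-neighbour scaling: vx is a 16.16
 * fixed-point coordinate advanced by unit_x per destination pixel and
 * kept in range by subtracting src_width_fixed.  Four source pixels
 * are gathered into one __m128i and then combined as in
 * sse2_combine_over_u, with opaque/zero fast paths.
 */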
5318 static force_inline void
5319 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5320                                              const uint32_t* ps,
5321                                              int32_t         w,
5322                                              pixman_fixed_t  vx,
5323                                              pixman_fixed_t  unit_x,
5324                                              pixman_fixed_t  src_width_fixed,
5325                                              pixman_bool_t   fully_transparent_src)
5326 {
5327     uint32_t s, d;
5328     const uint32_t* pm = NULL;
5329 
5330     __m128i xmm_dst_lo, xmm_dst_hi;
5331     __m128i xmm_src_lo, xmm_src_hi;
5332     __m128i xmm_alpha_lo, xmm_alpha_hi;
5333 
5334     if (fully_transparent_src)
5335 	return;
5336 
5337     /* Align dst on a 16-byte boundary */
5338     while (w && ((uintptr_t)pd & 15))
5339     {
5340 	d = *pd;
5341 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5342 	vx += unit_x;
5343 	while (vx >= 0)
5344 	    vx -= src_width_fixed;
5345 
5346 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5347 	if (pm)
5348 	    pm++;
5349 	w--;
5350     }
5351 
5352     while (w >= 4)
5353     {
5354 	__m128i tmp;
5355 	uint32_t tmp1, tmp2, tmp3, tmp4;
5356 
5357 	tmp1 = *(ps + pixman_fixed_to_int (vx));
5358 	vx += unit_x;
5359 	while (vx >= 0)
5360 	    vx -= src_width_fixed;
5361 	tmp2 = *(ps + pixman_fixed_to_int (vx));
5362 	vx += unit_x;
5363 	while (vx >= 0)
5364 	    vx -= src_width_fixed;
5365 	tmp3 = *(ps + pixman_fixed_to_int (vx));
5366 	vx += unit_x;
5367 	while (vx >= 0)
5368 	    vx -= src_width_fixed;
5369 	tmp4 = *(ps + pixman_fixed_to_int (vx));
5370 	vx += unit_x;
5371 	while (vx >= 0)
5372 	    vx -= src_width_fixed;
5373 
5374 	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5375 
5376 	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5377 
5378 	if (is_opaque (xmm_src_hi))
5379 	{
5380 	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
5381 	}
5382 	else if (!is_zero (xmm_src_hi))
5383 	{
5384 	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5385 
5386 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5387 	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5388 
5389 	    expand_alpha_2x128 (
5390 		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5391 
5392 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
5393 			&xmm_alpha_lo, &xmm_alpha_hi,
5394 			&xmm_dst_lo, &xmm_dst_hi);
5395 
5396 	    /* rebuild the 4 pixel data and save */
5397 	    save_128_aligned ((__m128i*)pd,
5398 			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5399 	}
5400 
5401 	w -= 4;
5402 	pd += 4;
5403 	if (pm)
5404 	    pm += 4;
5405     }
5406 
5407     while (w)
5408     {
5409 	d = *pd;
5410 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5411 	vx += unit_x;
5412 	while (vx >= 0)
5413 	    vx -= src_width_fixed;
5414 
5415 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5416 	if (pm)
5417 	    pm++;
5418 
5419 	w--;
5420     }
5421 }
5422 
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5424 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5425 		       uint32_t, uint32_t, COVER)
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5427 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5428 		       uint32_t, uint32_t, NONE)
5429 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5430 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5431 		       uint32_t, uint32_t, PAD)
5432 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5433 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5434 		       uint32_t, uint32_t, NORMAL)
5435 
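/* Same nearest-neighbour OVER as above, but additionally modulated by a
 * constant mask built from the alpha of the solid mask image; a zero
 * mask (or a fully transparent source) makes the whole scanline a
 * no-op.
 */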
5436 static force_inline void
5437 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5438 					       uint32_t *       dst,
5439 					       const uint32_t * src,
5440 					       int32_t          w,
5441 					       pixman_fixed_t   vx,
5442 					       pixman_fixed_t   unit_x,
5443 					       pixman_fixed_t   src_width_fixed,
5444 					       pixman_bool_t    zero_src)
5445 {
5446     __m128i xmm_mask;
5447     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5448     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5449     __m128i xmm_alpha_lo, xmm_alpha_hi;
5450 
5451     if (zero_src || (*mask >> 24) == 0)
5452 	return;
5453 
5454     xmm_mask = create_mask_16_128 (*mask >> 24);
5455 
5456     while (w && (uintptr_t)dst & 15)
5457     {
5458 	uint32_t s = *(src + pixman_fixed_to_int (vx));
5459 	vx += unit_x;
5460 	while (vx >= 0)
5461 	    vx -= src_width_fixed;
5462 
5463 	if (s)
5464 	{
5465 	    uint32_t d = *dst;
5466 
5467 	    __m128i ms = unpack_32_1x128 (s);
5468 	    __m128i alpha     = expand_alpha_1x128 (ms);
5469 	    __m128i dest      = xmm_mask;
5470 	    __m128i alpha_dst = unpack_32_1x128 (d);
5471 
5472 	    *dst = pack_1x128_32 (
5473 		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5474 	}
5475 	dst++;
5476 	w--;
5477     }
5478 
5479     while (w >= 4)
5480     {
5481 	uint32_t tmp1, tmp2, tmp3, tmp4;
5482 
5483 	tmp1 = *(src + pixman_fixed_to_int (vx));
5484 	vx += unit_x;
5485 	while (vx >= 0)
5486 	    vx -= src_width_fixed;
5487 	tmp2 = *(src + pixman_fixed_to_int (vx));
5488 	vx += unit_x;
5489 	while (vx >= 0)
5490 	    vx -= src_width_fixed;
5491 	tmp3 = *(src + pixman_fixed_to_int (vx));
5492 	vx += unit_x;
5493 	while (vx >= 0)
5494 	    vx -= src_width_fixed;
5495 	tmp4 = *(src + pixman_fixed_to_int (vx));
5496 	vx += unit_x;
5497 	while (vx >= 0)
5498 	    vx -= src_width_fixed;
5499 
5500 	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5501 
5502 	if (!is_zero (xmm_src))
5503 	{
5504 	    xmm_dst = load_128_aligned ((__m128i*)dst);
5505 
5506 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5507 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5508 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5509 			        &xmm_alpha_lo, &xmm_alpha_hi);
5510 
5511 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5512 			   &xmm_alpha_lo, &xmm_alpha_hi,
5513 			   &xmm_mask, &xmm_mask,
5514 			   &xmm_dst_lo, &xmm_dst_hi);
5515 
5516 	    save_128_aligned (
5517 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5518 	}
5519 
5520 	dst += 4;
5521 	w -= 4;
5522     }
5523 
5524     while (w)
5525     {
5526 	uint32_t s = *(src + pixman_fixed_to_int (vx));
5527 	vx += unit_x;
5528 	while (vx >= 0)
5529 	    vx -= src_width_fixed;
5530 
5531 	if (s)
5532 	{
5533 	    uint32_t d = *dst;
5534 
5535 	    __m128i ms = unpack_32_1x128 (s);
5536 	    __m128i alpha = expand_alpha_1x128 (ms);
5537 	    __m128i mask  = xmm_mask;
5538 	    __m128i dest  = unpack_32_1x128 (d);
5539 
5540 	    *dst = pack_1x128_32 (
5541 		in_over_1x128 (&ms, &alpha, &mask, &dest));
5542 	}
5543 
5544 	dst++;
5545 	w--;
5546     }
5547 
5548 }
5549 
5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5551 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552 			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5554 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555 			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5556 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5557 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5558 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5559 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5560 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5561 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5562 
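/* Helper macros for the bilinear fetchers below.  xmm_x keeps pairs of
 * (vx, -(vx + 1)) in its 16-bit lanes so that a single shift plus
 * xmm_addc yields both horizontal weights for a pixel, while
 * xmm_wt/xmm_wb hold the vertical weights.  The PSHUFD_IS_FAST variant
 * precomputes the horizontal weights for four pixels and selects them
 * with _mm_shuffle_epi32; the other variant recomputes them per pixel.
 */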
5563 #if PSHUFD_IS_FAST
5564 
5565 /***********************************************************************************/
5566 
5567 # define BILINEAR_DECLARE_VARIABLES						\
5568     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5569     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5570     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
5571     const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
5572 					   unit_x, -unit_x, unit_x, -unit_x);	\
5573     const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
5574 					   unit_x * 4, -unit_x * 4,		\
5575 					   unit_x * 4, -unit_x * 4,		\
5576 					   unit_x * 4, -unit_x * 4);		\
5577     const __m128i xmm_zero = _mm_setzero_si128 ();				\
5578     __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\
5579 				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\
5580 				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\
5581 				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\
5582     __m128i xmm_wh_state;
5583 
5584 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\
5585 do {										\
5586     int phase = phase_;								\
5587     __m128i xmm_wh, xmm_a, xmm_b;						\
5588     /* fetch 2x2 pixel block into sse2 registers */				\
5589     __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
5590     __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
5591     vx += unit_x;								\
5592     /* vertical interpolation */						\
5593     xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
5594     xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
5595     xmm_a = _mm_add_epi16 (xmm_a, xmm_b);						\
5596     /* calculate horizontal weights */						\
5597     if (phase <= 0)								\
5598     {										\
5599 	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
5600 					16 - BILINEAR_INTERPOLATION_BITS));	\
5601 	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\
5602 	phase = 0;								\
5603     }										\
5604     xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\
5605 							   phase, phase));	\
5606     /* horizontal interpolation */						\
5607     xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
5608 		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);		\
5609     /* shift the result */							\
5610     pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
5611 } while (0)
5612 
5613 #else /************************************************************************/
5614 
5615 # define BILINEAR_DECLARE_VARIABLES						\
5616     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5617     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5618     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
5619     const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
5620 					  unit_x, -unit_x, unit_x, -unit_x);	\
5621     const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
5622 					   unit_x * 4, -unit_x * 4,		\
5623 					   unit_x * 4, -unit_x * 4,		\
5624 					   unit_x * 4, -unit_x * 4);		\
5625     const __m128i xmm_zero = _mm_setzero_si128 ();				\
5626     __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
5627 				   vx, -(vx + 1), vx, -(vx + 1))
5628 
5629 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\
5630 do {										\
5631     __m128i xmm_wh, xmm_a, xmm_b;						\
5632     /* fetch 2x2 pixel block into sse2 registers */				\
5633     __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
5634     __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
5635     (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\
5636     vx += unit_x;								\
5637     /* vertical interpolation */						\
5638     xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
5639     xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
5640     xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
5641     /* calculate horizontal weights */						\
5642     xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\
5643 					16 - BILINEAR_INTERPOLATION_BITS));	\
5644     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
5645     /* horizontal interpolation */						\
5646     xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\
5647     xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\
5648     /* shift the result */							\
5649     pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
5650 } while (0)
5651 
5652 /***********************************************************************************/
5653 
5654 #endif
5655 
5656 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\
5657 do {										\
5658 	__m128i xmm_pix;							\
5659 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\
5660 	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\
5661 	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\
5662 	pix = _mm_cvtsi128_si32 (xmm_pix);					\
5663 } while(0)
5664 
5665 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\
5666 do {										\
5667 	__m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\
5668 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\
5669 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\
5670 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\
5671 	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\
5672 	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\
5673 	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\
5674 	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\
5675 } while(0)
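/*
 * The two wrappers above differ only in how the 32-bit-per-channel results
 * are repacked: BILINEAR_INTERPOLATE_ONE_PIXEL narrows a single result back
 * into one packed a8r8g8b8 word, while BILINEAR_INTERPOLATE_FOUR_PIXELS
 * packs four results into one __m128i so the scanline loops can issue a
 * single aligned 16-byte store.
 */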
5676 
5677 #define BILINEAR_SKIP_ONE_PIXEL()						\
5678 do {										\
5679     vx += unit_x;								\
5680     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
5681 } while(0)
5682 
5683 #define BILINEAR_SKIP_FOUR_PIXELS()						\
5684 do {										\
5685     vx += unit_x * 4;								\
5686     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\
5687 } while(0)
5688 
5689 /***********************************************************************************/
5690 
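/*
 * Bilinear SRC scanline, a8r8g8b8 -> a8r8g8b8.  All of the scanline workers
 * below share this structure: a head loop emits single pixels until dst is
 * 16-byte aligned, the main loop emits four pixels per iteration with
 * aligned stores, and a short tail handles the remaining pixels.  Note that
 * the main loop leaves w negative; its low two bits still encode the
 * remainder modulo 4, which is what the `w & 2' and `w & 1' tests rely on.
 */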
5691 static force_inline void
5692 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
5693 					     const uint32_t * mask,
5694 					     const uint32_t * src_top,
5695 					     const uint32_t * src_bottom,
5696 					     int32_t          w,
5697 					     int              wt,
5698 					     int              wb,
5699 					     pixman_fixed_t   vx_,
5700 					     pixman_fixed_t   unit_x_,
5701 					     pixman_fixed_t   max_vx,
5702 					     pixman_bool_t    zero_src)
5703 {
5704     intptr_t vx = vx_;
5705     intptr_t unit_x = unit_x_;
5706     BILINEAR_DECLARE_VARIABLES;
5707     uint32_t pix1, pix2;
5708 
5709     while (w && ((uintptr_t)dst & 15))
5710     {
5711 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5712 	*dst++ = pix1;
5713 	w--;
5714     }
5715 
5716     while ((w -= 4) >= 0) {
5717 	__m128i xmm_src;
5718 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5719 	_mm_store_si128 ((__m128i *)dst, xmm_src);
5720 	dst += 4;
5721     }
5722 
5723     if (w & 2)
5724     {
5725 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5726 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5727 	*dst++ = pix1;
5728 	*dst++ = pix2;
5729     }
5730 
5731     if (w & 1)
5732     {
5733 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5734 	*dst = pix1;
5735     }
5736 
5737 }
5738 
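/*
 * FAST_BILINEAR_MAINLOOP_COMMON (from pixman-inlines.h) wraps a scanline
 * worker into a full composite entry point: roughly speaking, it walks the
 * destination lines, computes the vertical weights and the pair of source
 * lines for each one, and then calls the worker, with one instantiation per
 * source repeat mode (COVER, PAD, NONE, NORMAL).
 */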
5739 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5740 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5741 			       uint32_t, uint32_t, uint32_t,
5742 			       COVER, FLAG_NONE)
5743 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5744 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5745 			       uint32_t, uint32_t, uint32_t,
5746 			       PAD, FLAG_NONE)
5747 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5748 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5749 			       uint32_t, uint32_t, uint32_t,
5750 			       NONE, FLAG_NONE)
5751 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5752 			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5753 			       uint32_t, uint32_t, uint32_t,
5754 			       NORMAL, FLAG_NONE)
5755 
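/*
 * Same as the 8888 -> 8888 SRC worker above, except that the x8r8g8b8
 * source has an undefined alpha byte, so every result is forced opaque with
 * | 0xFF000000 (or mask_ff000000 in the four-pixel loop).
 */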
5756 static force_inline void
5757 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
5758 					     const uint32_t * mask,
5759 					     const uint32_t * src_top,
5760 					     const uint32_t * src_bottom,
5761 					     int32_t          w,
5762 					     int              wt,
5763 					     int              wb,
5764 					     pixman_fixed_t   vx_,
5765 					     pixman_fixed_t   unit_x_,
5766 					     pixman_fixed_t   max_vx,
5767 					     pixman_bool_t    zero_src)
5768 {
5769     intptr_t vx = vx_;
5770     intptr_t unit_x = unit_x_;
5771     BILINEAR_DECLARE_VARIABLES;
5772     uint32_t pix1, pix2;
5773 
5774     while (w && ((uintptr_t)dst & 15))
5775     {
5776 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5777 	*dst++ = pix1 | 0xFF000000;
5778 	w--;
5779     }
5780 
5781     while ((w -= 4) >= 0) {
5782 	__m128i xmm_src;
5783 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5784 	_mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5785 	dst += 4;
5786     }
5787 
5788     if (w & 2)
5789     {
5790 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5791 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5792 	*dst++ = pix1 | 0xFF000000;
5793 	*dst++ = pix2 | 0xFF000000;
5794     }
5795 
5796     if (w & 1)
5797     {
5798 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5799 	*dst = pix1 | 0xFF000000;
5800     }
5801 }
5802 
5803 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5804 			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
5805 			       uint32_t, uint32_t, uint32_t,
5806 			       COVER, FLAG_NONE)
5807 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5808 			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
5809 			       uint32_t, uint32_t, uint32_t,
5810 			       PAD, FLAG_NONE)
5811 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5812 			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
5813 			       uint32_t, uint32_t, uint32_t,
5814 			       NORMAL, FLAG_NONE)
5815 
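/*
 * Bilinear OVER scanline.  For each block of four interpolated source
 * pixels: skip the store entirely when the source is fully transparent,
 * store it unchanged when it is fully opaque, and otherwise perform the
 * usual "dst = src + (1 - src.alpha) * dst" blend via over_2x128.
 */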
5816 static force_inline void
5817 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
5818 					      const uint32_t * mask,
5819 					      const uint32_t * src_top,
5820 					      const uint32_t * src_bottom,
5821 					      int32_t          w,
5822 					      int              wt,
5823 					      int              wb,
5824 					      pixman_fixed_t   vx_,
5825 					      pixman_fixed_t   unit_x_,
5826 					      pixman_fixed_t   max_vx,
5827 					      pixman_bool_t    zero_src)
5828 {
5829     intptr_t vx = vx_;
5830     intptr_t unit_x = unit_x_;
5831     BILINEAR_DECLARE_VARIABLES;
5832     uint32_t pix1, pix2;
5833 
5834     while (w && ((uintptr_t)dst & 15))
5835     {
5836 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5837 
5838 	if (pix1)
5839 	{
5840 	    pix2 = *dst;
5841 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5842 	}
5843 
5844 	w--;
5845 	dst++;
5846     }
5847 
5848     while (w  >= 4)
5849     {
5850 	__m128i xmm_src;
5851 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5852 	__m128i xmm_alpha_hi, xmm_alpha_lo;
5853 
5854 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5855 
5856 	if (!is_zero (xmm_src))
5857 	{
5858 	    if (is_opaque (xmm_src))
5859 	    {
5860 		save_128_aligned ((__m128i *)dst, xmm_src);
5861 	    }
5862 	    else
5863 	    {
5864 		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5865 
5866 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5867 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5868 
5869 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5870 		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5871 			    &xmm_dst_lo, &xmm_dst_hi);
5872 
5873 		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5874 	    }
5875 	}
5876 
5877 	w -= 4;
5878 	dst += 4;
5879     }
5880 
5881     while (w)
5882     {
5883 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5884 
5885 	if (pix1)
5886 	{
5887 	    pix2 = *dst;
5888 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5889 	}
5890 
5891 	w--;
5892 	dst++;
5893     }
5894 }
5895 
5896 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5897 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5898 			       uint32_t, uint32_t, uint32_t,
5899 			       COVER, FLAG_NONE)
5900 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5901 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5902 			       uint32_t, uint32_t, uint32_t,
5903 			       PAD, FLAG_NONE)
5904 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5905 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5906 			       uint32_t, uint32_t, uint32_t,
5907 			       NONE, FLAG_NONE)
5908 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5909 			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5910 			       uint32_t, uint32_t, uint32_t,
5911 			       NORMAL, FLAG_NONE)
5912 
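/*
 * Bilinear OVER scanline with an a8 mask: the interpolated source is first
 * multiplied by the mask (IN) and the result composited OVER the
 * destination.  Four mask bytes are fetched at a time with memcpy
 * (presumably to avoid an unaligned 32-bit load); a block whose mask is
 * entirely zero is skipped without interpolating, and the common
 * "source opaque, mask 0xff" case degenerates to a plain store.
 */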
5913 static force_inline void
5914 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
5915 						const uint8_t  * mask,
5916 						const uint32_t * src_top,
5917 						const uint32_t * src_bottom,
5918 						int32_t          w,
5919 						int              wt,
5920 						int              wb,
5921 						pixman_fixed_t   vx_,
5922 						pixman_fixed_t   unit_x_,
5923 						pixman_fixed_t   max_vx,
5924 						pixman_bool_t    zero_src)
5925 {
5926     intptr_t vx = vx_;
5927     intptr_t unit_x = unit_x_;
5928     BILINEAR_DECLARE_VARIABLES;
5929     uint32_t pix1, pix2;
5930     uint32_t m;
5931 
5932     while (w && ((uintptr_t)dst & 15))
5933     {
5934 	uint32_t sa;
5935 
5936 	m = (uint32_t) *mask++;
5937 
5938 	if (m)
5939 	{
5940 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5941 	    sa = pix1 >> 24;
5942 
5943 	    if (sa == 0xff && m == 0xff)
5944 	    {
5945 		*dst = pix1;
5946 	    }
5947 	    else
5948 	    {
5949 		__m128i ms, md, ma, msa;
5950 
5951 		pix2 = *dst;
5952 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5953 		ms = unpack_32_1x128 (pix1);
5954 		md = unpack_32_1x128 (pix2);
5955 
5956 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5957 
5958 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5959 	    }
5960 	}
5961 	else
5962 	{
5963 	    BILINEAR_SKIP_ONE_PIXEL ();
5964 	}
5965 
5966 	w--;
5967 	dst++;
5968     }
5969 
5970     while (w >= 4)
5971     {
5972 	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5973 	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5974 	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5975 
5976 	memcpy (&m, mask, sizeof (uint32_t));
5977 
5978 	if (m)
5979 	{
5980 	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5981 
5982 	    if (m == 0xffffffff && is_opaque (xmm_src))
5983 	    {
5984 		save_128_aligned ((__m128i *)dst, xmm_src);
5985 	    }
5986 	    else
5987 	    {
5988 		xmm_dst = load_128_aligned ((__m128i *)dst);
5989 
5990 		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5991 
5992 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5993 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5994 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5995 
5996 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5997 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5998 
5999 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
6000 			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
6001 
6002 		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6003 	    }
6004 	}
6005 	else
6006 	{
6007 	    BILINEAR_SKIP_FOUR_PIXELS ();
6008 	}
6009 
6010 	w -= 4;
6011 	dst += 4;
6012 	mask += 4;
6013     }
6014 
6015     while (w)
6016     {
6017 	uint32_t sa;
6018 
6019 	m = (uint32_t) *mask++;
6020 
6021 	if (m)
6022 	{
6023 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6024 	    sa = pix1 >> 24;
6025 
6026 	    if (sa == 0xff && m == 0xff)
6027 	    {
6028 		*dst = pix1;
6029 	    }
6030 	    else
6031 	    {
6032 		__m128i ms, md, ma, msa;
6033 
6034 		pix2 = *dst;
6035 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6036 		ms = unpack_32_1x128 (pix1);
6037 		md = unpack_32_1x128 (pix2);
6038 
6039 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6040 
6041 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6042 	    }
6043 	}
6044 	else
6045 	{
6046 	    BILINEAR_SKIP_ONE_PIXEL ();
6047 	}
6048 
6049 	w--;
6050 	dst++;
6051     }
6052 }
6053 
6054 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6055 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6056 			       uint32_t, uint8_t, uint32_t,
6057 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
6058 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6059 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6060 			       uint32_t, uint8_t, uint32_t,
6061 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
6062 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6063 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6064 			       uint32_t, uint8_t, uint32_t,
6065 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
6066 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6067 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6068 			       uint32_t, uint8_t, uint32_t,
6069 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
6070 
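/*
 * Bilinear OVER scanline with a solid mask.  Only the alpha byte of the
 * constant mask matters: it is expanded once into xmm_mask, the whole call
 * returns early when that alpha is zero (or the source is known to be
 * zero), and every interpolated pixel is multiplied by it before the OVER
 * blend.
 */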
6071 static force_inline void
6072 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
6073 						const uint32_t * mask,
6074 						const uint32_t * src_top,
6075 						const uint32_t * src_bottom,
6076 						int32_t          w,
6077 						int              wt,
6078 						int              wb,
6079 						pixman_fixed_t   vx_,
6080 						pixman_fixed_t   unit_x_,
6081 						pixman_fixed_t   max_vx,
6082 						pixman_bool_t    zero_src)
6083 {
6084     intptr_t vx = vx_;
6085     intptr_t unit_x = unit_x_;
6086     BILINEAR_DECLARE_VARIABLES;
6087     uint32_t pix1;
6088     __m128i xmm_mask;
6089 
6090     if (zero_src || (*mask >> 24) == 0)
6091 	return;
6092 
6093     xmm_mask = create_mask_16_128 (*mask >> 24);
6094 
6095     while (w && ((uintptr_t)dst & 15))
6096     {
6097 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6098 	if (pix1)
6099 	{
6100 		uint32_t d = *dst;
6101 
6102 		__m128i ms = unpack_32_1x128 (pix1);
6103 		__m128i alpha     = expand_alpha_1x128 (ms);
6104 		__m128i dest      = xmm_mask;
6105 		__m128i alpha_dst = unpack_32_1x128 (d);
6106 
6107 		*dst = pack_1x128_32
6108 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6109 	}
6110 
6111 	dst++;
6112 	w--;
6113     }
6114 
6115     while (w >= 4)
6116     {
6117 	__m128i xmm_src;
6118 	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6119 
6120 	if (!is_zero (xmm_src))
6121 	{
6122 	    __m128i xmm_src_lo, xmm_src_hi;
6123 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6124 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
6125 
6126 	    xmm_dst = load_128_aligned ((__m128i*)dst);
6127 
6128 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6129 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6130 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6131 				&xmm_alpha_lo, &xmm_alpha_hi);
6132 
6133 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6134 			   &xmm_alpha_lo, &xmm_alpha_hi,
6135 			   &xmm_mask, &xmm_mask,
6136 			   &xmm_dst_lo, &xmm_dst_hi);
6137 
6138 	    save_128_aligned
6139 		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6140 	}
6141 
6142 	dst += 4;
6143 	w -= 4;
6144     }
6145 
6146     while (w)
6147     {
6148 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6149 	if (pix1)
6150 	{
6151 		uint32_t d = *dst;
6152 
6153 		__m128i ms = unpack_32_1x128 (pix1);
6154 		__m128i alpha     = expand_alpha_1x128 (ms);
6155 		__m128i dest      = xmm_mask;
6156 		__m128i alpha_dst = unpack_32_1x128 (d);
6157 
6158 		*dst = pack_1x128_32
6159 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6160 	}
6161 
6162 	dst++;
6163 	w--;
6164     }
6165 }
6166 
6167 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6168 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6169 			       uint32_t, uint32_t, uint32_t,
6170 			       COVER, FLAG_HAVE_SOLID_MASK)
6171 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6172 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6173 			       uint32_t, uint32_t, uint32_t,
6174 			       PAD, FLAG_HAVE_SOLID_MASK)
6175 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6176 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6177 			       uint32_t, uint32_t, uint32_t,
6178 			       NONE, FLAG_HAVE_SOLID_MASK)
6179 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6180 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6181 			       uint32_t, uint32_t, uint32_t,
6182 			       NORMAL, FLAG_HAVE_SOLID_MASK)
6183 
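/*
 * The fast path table: each entry maps an (operator, source format, mask
 * format, destination format) combination, together with the required
 * flags, onto one of the specialized routines above.  At composite time
 * pixman scans the table and the first matching entry wins, falling back to
 * the next implementation in the chain when nothing matches.
 */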
6184 static const pixman_fast_path_t sse2_fast_paths[] =
6185 {
6186     /* PIXMAN_OP_OVER */
6187     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6188     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6189     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6190     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6191     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6192     PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6193     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6194     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6195     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6196     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6197     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6198     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6199     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6200     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6201     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6202     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6203     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6204     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6205     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6206     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6207     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6208     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6209     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6210     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6211     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6212     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6213     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6214     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6215     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6216     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6217     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6218     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6219     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6220     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6221     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6222     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6223     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6224     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6225     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6226     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6227     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6228     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6229     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6230     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6231     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6232     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6233     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6234 
6235     /* PIXMAN_OP_OVER_REVERSE */
6236     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6237     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6238 
6239     /* PIXMAN_OP_ADD */
6240     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6241     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6242     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6243     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6244     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6245     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6246     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6247     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6248     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6249     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6250     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6251     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6252     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6253     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6254 
6255     /* PIXMAN_OP_SRC */
6256     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6257     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6258     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6259     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6260     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6261     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6262     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6263     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6264     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6265     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6266     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6267     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6268     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6269     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6270     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6271     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6272     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6273     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6274 
6275     /* PIXMAN_OP_IN */
6276     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6277     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6278     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6279 
6280     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6281     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6282     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6283     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6284 
6285     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6286     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6287     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6288     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6289 
6290     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6291     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6292     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6293     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6294     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6295     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6296 
6297     SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6298     SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6299     SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6300     SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6301     SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6302     SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6303 
6304     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6305     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6306     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6307     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6308 
6309     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6310     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6311     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6312     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6313 
6314     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6315     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6316     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6317     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6318 
6319     { PIXMAN_OP_NONE },
6320 };
6321 
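/*
 * Source iterator fetchers.  Each one converts a single scanline of the
 * source image to a8r8g8b8 in iter->buffer and advances iter->bits to the
 * next line: x8r8g8b8 just gets an opaque alpha ORed in, r5g6b5 is widened
 * with unpack_565_to_8888, and a8 is moved into the alpha byte of otherwise
 * zero pixels.
 */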
6322 static uint32_t *
6323 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6324 {
6325     int w = iter->width;
6326     __m128i ff000000 = mask_ff000000;
6327     uint32_t *dst = iter->buffer;
6328     uint32_t *src = (uint32_t *)iter->bits;
6329 
6330     iter->bits += iter->stride;
6331 
6332     while (w && ((uintptr_t)dst) & 0x0f)
6333     {
6334 	*dst++ = (*src++) | 0xff000000;
6335 	w--;
6336     }
6337 
6338     while (w >= 4)
6339     {
6340 	save_128_aligned (
6341 	    (__m128i *)dst, _mm_or_si128 (
6342 		load_128_unaligned ((__m128i *)src), ff000000));
6343 
6344 	dst += 4;
6345 	src += 4;
6346 	w -= 4;
6347     }
6348 
6349     while (w)
6350     {
6351 	*dst++ = (*src++) | 0xff000000;
6352 	w--;
6353     }
6354 
6355     return iter->buffer;
6356 }
6357 
6358 static uint32_t *
6359 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6360 {
6361     int w = iter->width;
6362     uint32_t *dst = iter->buffer;
6363     uint16_t *src = (uint16_t *)iter->bits;
6364     __m128i ff000000 = mask_ff000000;
6365 
6366     iter->bits += iter->stride;
6367 
6368     while (w && ((uintptr_t)dst) & 0x0f)
6369     {
6370 	uint16_t s = *src++;
6371 
6372 	*dst++ = convert_0565_to_8888 (s);
6373 	w--;
6374     }
6375 
6376     while (w >= 8)
6377     {
6378 	__m128i lo, hi, s;
6379 
6380 	s = _mm_loadu_si128 ((__m128i *)src);
6381 
6382 	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6383 	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6384 
6385 	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6386 	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6387 
6388 	dst += 8;
6389 	src += 8;
6390 	w -= 8;
6391     }
6392 
6393     while (w)
6394     {
6395 	uint16_t s = *src++;
6396 
6397 	*dst++ = convert_0565_to_8888 (s);
6398 	w--;
6399     }
6400 
6401     return iter->buffer;
6402 }
6403 
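/*
 * The a8 fetcher relies on two rounds of zero-interleaving (bytes, then
 * 16-bit words): each unpack step pushes the source byte one position
 * higher inside the growing lane, so every alpha value ends up in bits
 * 24-31 of its own 32-bit pixel with the r, g and b bytes left at zero.
 */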
6404 static uint32_t *
6405 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6406 {
6407     int w = iter->width;
6408     uint32_t *dst = iter->buffer;
6409     uint8_t *src = iter->bits;
6410     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6411 
6412     iter->bits += iter->stride;
6413 
6414     while (w && (((uintptr_t)dst) & 15))
6415     {
6416         *dst++ = (uint32_t)(*(src++)) << 24;
6417         w--;
6418     }
6419 
6420     while (w >= 16)
6421     {
6422 	xmm0 = _mm_loadu_si128((__m128i *)src);
6423 
6424 	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
6425 	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
6426 	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6427 	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6428 	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6429 	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6430 
6431 	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
6432 	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
6433 	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
6434 	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6435 
6436 	dst += 16;
6437 	src += 16;
6438 	w -= 16;
6439     }
6440 
6441     while (w)
6442     {
6443 	*dst++ = (uint32_t)(*(src++)) << 24;
6444 	w--;
6445     }
6446 
6447     return iter->buffer;
6448 }
6449 
6450 #define IMAGE_FLAGS							\
6451     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
6452      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6453 
6454 static const pixman_iter_info_t sse2_iters[] =
6455 {
6456     { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
6457       _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
6458     },
6459     { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
6460       _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
6461     },
6462     { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
6463       _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
6464     },
6465     { PIXMAN_null },
6466 };
6467 
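/*
 * Registration of everything above.  On 32-bit GCC targets the incoming
 * stack is not guaranteed to be 16-byte aligned, hence the
 * force_align_arg_pointer attribute so that local __m128i spills stay safe.
 * The SSE2 constants declared at the top of the file are filled in here
 * rather than as static initializers (presumably because the _mm_set*
 * intrinsics are not constant expressions for every compiler), after which
 * the combiner, blt/fill and iterator tables of the new implementation are
 * populated.
 */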
6468 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6469 __attribute__((__force_align_arg_pointer__))
6470 #endif
6471 pixman_implementation_t *
6472 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6473 {
6474     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6475 
6476     /* SSE2 constants */
6477     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6478     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6479     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6480     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6481     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6482     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6483     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6484     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6485     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
6486     mask_0080 = create_mask_16_128 (0x0080);
6487     mask_00ff = create_mask_16_128 (0x00ff);
6488     mask_0101 = create_mask_16_128 (0x0101);
6489     mask_ffff = create_mask_16_128 (0xffff);
6490     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6491     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6492     mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6493     mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6494 
6495     /* Set up function pointers */
6496     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6497     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6498     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6499     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6500     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6501     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6502     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6503     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6504     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6505     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6506 
6507     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6508 
6509     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6510     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6511     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6512     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6513     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6514     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6515     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6516     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6517     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6518     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6519     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6520 
6521     imp->blt = sse2_blt;
6522     imp->fill = sse2_fill;
6523 
6524     imp->iter_info = sse2_iters;
6525 
6526     return imp;
6527 }
6528