1 /*
2  * Copyright © 2004, 2005 Red Hat, Inc.
3  * Copyright © 2004 Nicholas Miell
4  * Copyright © 2005 Trolltech AS
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of Red Hat not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission.  Red Hat makes no representations about the
13  * suitability of this software for any purpose.  It is provided "as is"
14  * without express or implied warranty.
15  *
16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23  * SOFTWARE.
24  *
25  * Author:  Søren Sandmann (sandmann@redhat.com)
26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28  *
29  * Based on work by Owen Taylor
30  */
31 
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35 
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
37 
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
40 #else
41 #include <mmintrin.h>
42 #endif
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
46 
47 #ifdef VERBOSE
48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
49 #else
50 #define CHECKPOINT()
51 #endif
52 
53 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
54 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
55 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
56 _mm_empty (void)
57 {
58 
59 }
60 #endif
61 
62 #ifdef USE_X86_MMX
63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
64 #  include <xmmintrin.h>
65 # else
66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
67  * instructions to be generated that we don't want. Just duplicate the
68  * functions we want to use.  */
69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_movemask_pi8 (__m64 __A)
71 {
72     int ret;
73 
74     asm ("pmovmskb %1, %0\n\t"
75 	: "=r" (ret)
76 	: "y" (__A)
77     );
78 
79     return ret;
80 }
81 
82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
84 {
85     asm ("pmulhuw %1, %0\n\t"
86 	: "+y" (__A)
87 	: "y" (__B)
88     );
89     return __A;
90 }
91 
92 # define _mm_shuffle_pi16(A, N)						\
93     ({									\
94 	__m64 ret;							\
95 									\
96 	asm ("pshufw %2, %1, %0\n\t"					\
97 	     : "=y" (ret)						\
98 	     : "y" (A), "K" ((const int8_t)N)				\
99 	);								\
100 									\
101 	ret;								\
102     })
103 # endif
104 #endif
105 
106 #ifndef _MSC_VER
107 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
108  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
109 #endif
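/* For example, _MM_SHUFFLE (3, 3, 3, 3) evaluates to 0xff, which tells
 * pshufw to replicate the top 16-bit word into all four lanes; this is how
 * expand_alpha() below broadcasts the alpha channel.
 */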
110 
111 /* Notes about writing mmx code
112  *
113  * Give memory operands as the second operand. If you give them as the
114  * first, gcc will first load them into a register and then use that
115  * register.
116  *
117  *   i.e. use
118  *
119  *         _mm_mullo_pi16 (x, mmx_constant);
120  *
121  *   not
122  *
123  *         _mm_mullo_pi16 (mmx_constant, x);
124  *
125  * Also try to minimize dependencies. i.e. when you need a value, try
126  * to calculate it from a value that was calculated as early as
127  * possible.
128  */
129 
130 /* --------------- MMX primitives ------------------------------------- */
131 
132 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
133  * the name of the member used to access the data.
134  * If __m64 requires using mm_cvt* intrinsics functions to convert between
135  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
136  * If __m64 and uint64_t values can just be cast to each other directly,
137  * then define USE_M64_CASTS.
138  * If __m64 is a double datatype, then define USE_M64_DOUBLE.
139  */
140 #ifdef _MSC_VER
141 # define M64_MEMBER m64_u64
142 #elif defined(__ICC)
143 # define USE_CVT_INTRINSICS
144 #elif defined(USE_LOONGSON_MMI)
145 # define USE_M64_DOUBLE
146 #elif defined(__GNUC__)
147 # define USE_M64_CASTS
148 #elif defined(__SUNPRO_C)
149 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
150 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
151  * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
152  * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
153  */
154 #  define USE_CVT_INTRINSICS
155 # else
156 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
157  * disabled, __m64 is defined as a struct containing "unsigned long long l_".
158  */
159 #  define M64_MEMBER l_
160 # endif
161 #endif
162 
163 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
164 typedef uint64_t mmxdatafield;
165 #else
166 typedef __m64 mmxdatafield;
167 #endif
168 
169 typedef struct
170 {
171     mmxdatafield mmx_4x00ff;
172     mmxdatafield mmx_4x0080;
173     mmxdatafield mmx_565_rgb;
174     mmxdatafield mmx_565_unpack_multiplier;
175     mmxdatafield mmx_565_pack_multiplier;
176     mmxdatafield mmx_565_r;
177     mmxdatafield mmx_565_g;
178     mmxdatafield mmx_565_b;
179     mmxdatafield mmx_packed_565_rb;
180     mmxdatafield mmx_packed_565_g;
181     mmxdatafield mmx_expand_565_g;
182     mmxdatafield mmx_expand_565_b;
183     mmxdatafield mmx_expand_565_r;
184 #ifndef USE_LOONGSON_MMI
185     mmxdatafield mmx_mask_0;
186     mmxdatafield mmx_mask_1;
187     mmxdatafield mmx_mask_2;
188     mmxdatafield mmx_mask_3;
189 #endif
190     mmxdatafield mmx_full_alpha;
191     mmxdatafield mmx_4x0101;
192     mmxdatafield mmx_ff000000;
193 } mmx_data_t;
194 
195 #if defined(_MSC_VER)
196 # define MMXDATA_INIT(field, val) { val ## UI64 }
197 #elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
198 # define MMXDATA_INIT(field, val) field =   { val ## ULL }
199 #else                           /* mmxdatafield is an integral type */
200 # define MMXDATA_INIT(field, val) field =   val ## ULL
201 #endif
202 
203 static const mmx_data_t c =
204 {
205     MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
206     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
207     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
208     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
209     MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
210     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
211     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
212     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
213     MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
214     MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
215     MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
216     MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
217     MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
218 #ifndef USE_LOONGSON_MMI
219     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
220     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
221     MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
222     MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
223 #endif
224     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
225     MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
226     MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
227 };
228 
229 #ifdef USE_CVT_INTRINSICS
230 #    define MC(x) to_m64 (c.mmx_ ## x)
231 #elif defined(USE_M64_CASTS)
232 #    define MC(x) ((__m64)c.mmx_ ## x)
233 #elif defined(USE_M64_DOUBLE)
234 #    define MC(x) (*(__m64 *)&c.mmx_ ## x)
235 #else
236 #    define MC(x) c.mmx_ ## x
237 #endif
238 
239 static force_inline __m64
240 to_m64 (uint64_t x)
241 {
242 #ifdef USE_CVT_INTRINSICS
243     return _mm_cvtsi64_m64 (x);
244 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
245     __m64 res;
246 
247     res.M64_MEMBER = x;
248     return res;
249 #elif defined USE_M64_DOUBLE
250     return *(__m64 *)&x;
251 #else /* USE_M64_CASTS */
252     return (__m64)x;
253 #endif
254 }
255 
256 static force_inline uint64_t
257 to_uint64 (__m64 x)
258 {
259 #ifdef USE_CVT_INTRINSICS
260     return _mm_cvtm64_si64 (x);
261 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
262     uint64_t res = x.M64_MEMBER;
263     return res;
264 #elif defined USE_M64_DOUBLE
265     return *(uint64_t *)&x;
266 #else /* USE_M64_CASTS */
267     return (uint64_t)x;
268 #endif
269 }
270 
271 static force_inline __m64
272 shift (__m64 v,
273        int   s)
274 {
275     if (s > 0)
276 	return _mm_slli_si64 (v, s);
277     else if (s < 0)
278 	return _mm_srli_si64 (v, -s);
279     else
280 	return v;
281 }
282 
283 static force_inline __m64
284 negate (__m64 mask)
285 {
286     return _mm_xor_si64 (mask, MC (4x00ff));
287 }
288 
289 /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
290  * and maps its result to the same range.
291  *
292  * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
293  * Notation, Notation, Notation", the first of which is
294  *
295  *   prod(a, b) = (a * b + 128) / 255.
296  *
297  * By approximating the division by 255 as 257/65536 it can be replaced by a
298  * multiply and a right shift. This is the implementation that we use in
299  * pix_multiply(), but we use _mm_mulhi_pu16() with 257 (part of SSE1 or Extended
300  * 3DNow!, and unavailable at the time of the book's publication) to perform
301  * the multiplication and right shift in a single operation.
302  *
303  *   prod(a, b) = ((a * b + 128) * 257) >> 16.
304  *
305  * There is also a third way (how pix_multiply() was implemented prior to
306  * 14208344) that performs the multiplication by 257 with adds and shifts.
307  *
308  * Where temp = a * b + 128
309  *
310  *   prod(a, b) = (temp + (temp >> 8)) >> 8.
311  */
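/* For example, with a = 0xff and b = 0x80, temp = 0xff * 0x80 + 128 = 32768
 * and all three formulations agree:
 *
 *   32768 / 255                     = 128
 *   (32768 * 257) >> 16             = 128
 *   (32768 + (32768 >> 8)) >> 8     = 128
 */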
312 static force_inline __m64
313 pix_multiply (__m64 a, __m64 b)
314 {
315     __m64 res;
316 
317     res = _mm_mullo_pi16 (a, b);
318     res = _mm_adds_pu16 (res, MC (4x0080));
319     res = _mm_mulhi_pu16 (res, MC (4x0101));
320 
321     return res;
322 }
323 
324 static force_inline __m64
325 pix_add (__m64 a, __m64 b)
326 {
327     return _mm_adds_pu8 (a, b);
328 }
329 
330 static force_inline __m64
331 expand_alpha (__m64 pixel)
332 {
333     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
334 }
335 
336 static force_inline __m64
337 expand_alpha_rev (__m64 pixel)
338 {
339     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
340 }
341 
342 static force_inline __m64
343 invert_colors (__m64 pixel)
344 {
345     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
346 }
347 
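/* Per channel, over (src, srca, dest) computes
 *
 *   src + dest * (255 - srca) / 255
 *
 * with a saturating add, i.e. the Porter-Duff OVER operator on
 * premultiplied pixels, where srca is the source alpha replicated into
 * every lane by expand_alpha().
 */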
348 static force_inline __m64
349 over (__m64 src,
350       __m64 srca,
351       __m64 dest)
352 {
353     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
354 }
355 
356 static force_inline __m64
357 over_rev_non_pre (__m64 src, __m64 dest)
358 {
359     __m64 srca = expand_alpha (src);
360     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
361 
362     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
363 }
364 
365 static force_inline __m64
366 in (__m64 src, __m64 mask)
367 {
368     return pix_multiply (src, mask);
369 }
370 
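/* Per channel, in_over (src, srca, mask, dest) computes
 *
 *   src * mask / 255 + dest * (255 - srca * mask / 255) / 255
 *
 * i.e. OVER applied after the source has been masked; this is the helper
 * used by the solid-mask and component-alpha paths below.
 */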
371 #ifndef _MSC_VER
372 static force_inline __m64
373 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
374 {
375     return over (in (src, mask), pix_multiply (srca, mask), dest);
376 }
377 
378 #else
379 
380 #define in_over(src, srca, mask, dest)					\
381     over (in (src, mask), pix_multiply (srca, mask), dest)
382 
383 #endif
384 
385 /* Elemental unaligned loads */
386 
387 static force_inline __m64 ldq_u(__m64 *p)
388 {
389 #ifdef USE_X86_MMX
390     /* x86's alignment restrictions are very relaxed, but that's no excuse */
391     __m64 r;
392     memcpy(&r, p, sizeof(__m64));
393     return r;
394 #elif defined USE_ARM_IWMMXT
395     int align = (uintptr_t)p & 7;
396     __m64 *aligned_p;
397     if (align == 0)
398 	return *p;
399     aligned_p = (__m64 *)((uintptr_t)p & ~7);
400     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
401 #else
402     struct __una_u64 { __m64 x __attribute__((packed)); };
403     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
404     return (__m64) ptr->x;
405 #endif
406 }
407 
408 static force_inline uint32_t ldl_u(const uint32_t *p)
409 {
410 #ifdef USE_X86_MMX
411     /* x86's alignment restrictions are very relaxed. */
412     uint32_t r;
413     memcpy(&r, p, sizeof(uint32_t));
414     return r;
415 #else
416     struct __una_u32 { uint32_t x __attribute__((packed)); };
417     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
418     return ptr->x;
419 #endif
420 }
421 
422 static force_inline __m64
423 load (const uint32_t *v)
424 {
425 #ifdef USE_LOONGSON_MMI
426     __m64 ret;
427     asm ("lwc1 %0, %1\n\t"
428 	: "=f" (ret)
429 	: "m" (*v)
430     );
431     return ret;
432 #else
433     return _mm_cvtsi32_si64 (*v);
434 #endif
435 }
436 
437 static force_inline __m64
438 load8888 (const uint32_t *v)
439 {
440 #ifdef USE_LOONGSON_MMI
441     return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
442 #else
443     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
444 #endif
445 }
446 
447 static force_inline __m64
448 load8888u (const uint32_t *v)
449 {
450     uint32_t l = ldl_u (v);
451     return load8888 (&l);
452 }
453 
454 static force_inline __m64
455 pack8888 (__m64 lo, __m64 hi)
456 {
457     return _mm_packs_pu16 (lo, hi);
458 }
459 
460 static force_inline void
461 store (uint32_t *dest, __m64 v)
462 {
463 #ifdef USE_LOONGSON_MMI
464     asm ("swc1 %1, %0\n\t"
465 	: "=m" (*dest)
466 	: "f" (v)
467 	: "memory"
468     );
469 #else
470     *dest = _mm_cvtsi64_si32 (v);
471 #endif
472 }
473 
474 static force_inline void
475 store8888 (uint32_t *dest, __m64 v)
476 {
477     v = pack8888 (v, _mm_setzero_si64 ());
478     store (dest, v);
479 }
480 
481 static force_inline pixman_bool_t
482 is_equal (__m64 a, __m64 b)
483 {
484 #ifdef USE_LOONGSON_MMI
485     /* __m64 is a double here, so we can compare the values directly. */
486     return a == b;
487 #else
488     return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
489 #endif
490 }
491 
492 static force_inline pixman_bool_t
493 is_opaque (__m64 v)
494 {
495 #ifdef USE_LOONGSON_MMI
496     return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
497 #else
498     __m64 ffs = _mm_cmpeq_pi8 (v, v);
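    /* ffs is all ones; bit 6 of the byte mask corresponds to the alpha byte
     * of the expanded 00AA00RR00GG00BB pixel, so the test below is non-zero
     * exactly when the alpha channel is 0xff. */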
499     return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
500 #endif
501 }
502 
503 static force_inline pixman_bool_t
504 is_zero (__m64 v)
505 {
506     return is_equal (v, _mm_setzero_si64 ());
507 }
508 
509 /* Expand 16 bits positioned at @pos (0-3) of an mmx register into
510  *
511  *    00RR00GG00BB
512  *
513  * --- Expanding 565 in the low word ---
514  *
515  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
516  * m = m & (01f0003f001f);
517  * m = m * (008404100840);
518  * m = m >> 8;
519  *
520  * Note the trick here - the top word is shifted by another nibble to
521  * avoid it bumping into the middle word
522  */
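/* For example, for pure red the low word is 0xf800.  After the two shifted
 * copies are OR'ed in and the result is masked with 01f0003f001f, only
 * 0x01f0 survives (in the red lane).  Multiplying that lane by 0x0084 gives
 * 0xffc0, and the final shift right by 8 yields 0x00ff, i.e. the 5-bit red
 * value 0x1f correctly expanded to 8 bits.
 */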
523 static force_inline __m64
524 expand565 (__m64 pixel, int pos)
525 {
526     __m64 p = pixel;
527     __m64 t1, t2;
528 
529     /* move pixel to low 16 bit and zero the rest */
530 #ifdef USE_LOONGSON_MMI
531     p = loongson_extract_pi16 (p, pos);
532 #else
533     p = shift (shift (p, (3 - pos) * 16), -48);
534 #endif
535 
536     t1 = shift (p, 36 - 11);
537     t2 = shift (p, 16 - 5);
538 
539     p = _mm_or_si64 (t1, p);
540     p = _mm_or_si64 (t2, p);
541     p = _mm_and_si64 (p, MC (565_rgb));
542 
543     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
544     return _mm_srli_pi16 (pixel, 8);
545 }
546 
547 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
548  *
549  *    AARRGGBB AARRGGBB
550  */
551 static force_inline void
552 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
553 {
554     __m64 t0, t1, alpha = _mm_setzero_si64 ();
555     __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
556     __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
557     __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
558     if (full_alpha)
559 	alpha = _mm_cmpeq_pi32 (alpha, alpha);
560 
561     /* Replicate high bits into empty low bits. */
562     r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
563     g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
564     b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
565 
566     r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
567     g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
568     b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
569 
570     t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
571     t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
572 
573     *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
574     *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
575 }
576 
577 static force_inline __m64
578 expand8888 (__m64 in, int pos)
579 {
580     if (pos == 0)
581 	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
582     else
583 	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
584 }
585 
586 static force_inline __m64
587 expandx888 (__m64 in, int pos)
588 {
589     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
590 }
591 
592 static force_inline void
593 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
594 {
595     __m64 v0, v1;
596     expand_4xpacked565 (vin, &v0, &v1, full_alpha);
597     *vout0 = expand8888 (v0, 0);
598     *vout1 = expand8888 (v0, 1);
599     *vout2 = expand8888 (v1, 0);
600     *vout3 = expand8888 (v1, 1);
601 }
602 
603 static force_inline __m64
604 pack_565 (__m64 pixel, __m64 target, int pos)
605 {
606     __m64 p = pixel;
607     __m64 t = target;
608     __m64 r, g, b;
609 
610     r = _mm_and_si64 (p, MC (565_r));
611     g = _mm_and_si64 (p, MC (565_g));
612     b = _mm_and_si64 (p, MC (565_b));
613 
614 #ifdef USE_LOONGSON_MMI
615     r = shift (r, -(32 - 8));
616     g = shift (g, -(16 - 3));
617     b = shift (b, -(0  + 3));
618 
619     p = _mm_or_si64 (r, g);
620     p = _mm_or_si64 (p, b);
621     return loongson_insert_pi16 (t, p, pos);
622 #else
623     r = shift (r, -(32 - 8) + pos * 16);
624     g = shift (g, -(16 - 3) + pos * 16);
625     b = shift (b, -(0  + 3) + pos * 16);
626 
627     if (pos == 0)
628 	t = _mm_and_si64 (t, MC (mask_0));
629     else if (pos == 1)
630 	t = _mm_and_si64 (t, MC (mask_1));
631     else if (pos == 2)
632 	t = _mm_and_si64 (t, MC (mask_2));
633     else if (pos == 3)
634 	t = _mm_and_si64 (t, MC (mask_3));
635 
636     p = _mm_or_si64 (r, t);
637     p = _mm_or_si64 (g, p);
638 
639     return _mm_or_si64 (b, p);
640 #endif
641 }
642 
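/* Pack two registers of two packed 8888 pixels each into one register of
 * four packed 565 pixels.  The multiply-add by 565_pack_multiplier places
 * the masked red and blue bits five positions above their final 565
 * locations within each 32-bit lane; after OR-ing in the green bits, a
 * single shift right by 5 drops all three fields into place.
 */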
643 static force_inline __m64
644 pack_4xpacked565 (__m64 a, __m64 b)
645 {
646     __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
647     __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
648 
649     __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
650     __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
651 
652     __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
653     __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
654 
655     t0 = _mm_or_si64 (t0, g0);
656     t1 = _mm_or_si64 (t1, g1);
657 
658     t0 = shift(t0, -5);
659 #ifdef USE_ARM_IWMMXT
660     t1 = shift(t1, -5);
661     return _mm_packs_pu32 (t0, t1);
662 #else
663     t1 = shift(t1, -5 + 16);
664     return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
665 #endif
666 }
667 
668 #ifndef _MSC_VER
669 
670 static force_inline __m64
671 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
672 {
673     return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
674 }
675 
676 static force_inline __m64
677 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
678 {
679     x = pix_multiply (x, a);
680     y = pix_multiply (y, b);
681 
682     return pix_add (x, y);
683 }
684 
685 #else
686 
687 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
688 
689 #define pack_4x565(v0, v1, v2, v3) \
690     pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
691 
692 #define pix_add_mul(x, a, y, b)	 \
693     ( x = pix_multiply (x, a),	 \
694       y = pix_multiply (y, b),	 \
695       pix_add (x, y) )
696 
697 #endif
698 
699 /* --------------- MMX code patch for fbcompose.c --------------------- */
700 
701 static force_inline __m64
702 combine (const uint32_t *src, const uint32_t *mask)
703 {
704     __m64 vsrc = load8888 (src);
705 
706     if (mask)
707     {
708 	__m64 m = load8888 (mask);
709 
710 	m = expand_alpha (m);
711 	vsrc = pix_multiply (vsrc, m);
712     }
713 
714     return vsrc;
715 }
716 
717 static force_inline __m64
718 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
719 {
720     vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
721 
722     if (is_opaque (vsrc))
723     {
724 	return vsrc;
725     }
726     else if (!is_zero (vsrc))
727     {
728 	return over (vsrc, expand_alpha (vsrc),
729 		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
730     }
731 
732     return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
733 }
734 
735 static void
736 mmx_combine_over_u (pixman_implementation_t *imp,
737                     pixman_op_t              op,
738                     uint32_t *               dest,
739                     const uint32_t *         src,
740                     const uint32_t *         mask,
741                     int                      width)
742 {
743     const uint32_t *end = dest + width;
744 
745     while (dest < end)
746     {
747 	__m64 vsrc = combine (src, mask);
748 
749 	if (is_opaque (vsrc))
750 	{
751 	    store8888 (dest, vsrc);
752 	}
753 	else if (!is_zero (vsrc))
754 	{
755 	    __m64 sa = expand_alpha (vsrc);
756 	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
757 	}
758 
759 	++dest;
760 	++src;
761 	if (mask)
762 	    ++mask;
763     }
764     _mm_empty ();
765 }
766 
767 static void
768 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
769                             pixman_op_t              op,
770                             uint32_t *               dest,
771                             const uint32_t *         src,
772                             const uint32_t *         mask,
773                             int                      width)
774 {
775     const uint32_t *end = dest + width;
776 
777     while (dest < end)
778     {
779 	__m64 d, da;
780 	__m64 s = combine (src, mask);
781 
782 	d = load8888 (dest);
783 	da = expand_alpha (d);
784 	store8888 (dest, over (d, da, s));
785 
786 	++dest;
787 	++src;
788 	if (mask)
789 	    mask++;
790     }
791     _mm_empty ();
792 }
793 
794 static void
795 mmx_combine_in_u (pixman_implementation_t *imp,
796                   pixman_op_t              op,
797                   uint32_t *               dest,
798                   const uint32_t *         src,
799                   const uint32_t *         mask,
800                   int                      width)
801 {
802     const uint32_t *end = dest + width;
803 
804     while (dest < end)
805     {
806 	__m64 a;
807 	__m64 x = combine (src, mask);
808 
809 	a = load8888 (dest);
810 	a = expand_alpha (a);
811 	x = pix_multiply (x, a);
812 
813 	store8888 (dest, x);
814 
815 	++dest;
816 	++src;
817 	if (mask)
818 	    mask++;
819     }
820     _mm_empty ();
821 }
822 
823 static void
824 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
825                           pixman_op_t              op,
826                           uint32_t *               dest,
827                           const uint32_t *         src,
828                           const uint32_t *         mask,
829                           int                      width)
830 {
831     const uint32_t *end = dest + width;
832 
833     while (dest < end)
834     {
835 	__m64 a = combine (src, mask);
836 	__m64 x;
837 
838 	x = load8888 (dest);
839 	a = expand_alpha (a);
840 	x = pix_multiply (x, a);
841 	store8888 (dest, x);
842 
843 	++dest;
844 	++src;
845 	if (mask)
846 	    mask++;
847     }
848     _mm_empty ();
849 }
850 
851 static void
852 mmx_combine_out_u (pixman_implementation_t *imp,
853                    pixman_op_t              op,
854                    uint32_t *               dest,
855                    const uint32_t *         src,
856                    const uint32_t *         mask,
857                    int                      width)
858 {
859     const uint32_t *end = dest + width;
860 
861     while (dest < end)
862     {
863 	__m64 a;
864 	__m64 x = combine (src, mask);
865 
866 	a = load8888 (dest);
867 	a = expand_alpha (a);
868 	a = negate (a);
869 	x = pix_multiply (x, a);
870 	store8888 (dest, x);
871 
872 	++dest;
873 	++src;
874 	if (mask)
875 	    mask++;
876     }
877     _mm_empty ();
878 }
879 
880 static void
881 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
882                            pixman_op_t              op,
883                            uint32_t *               dest,
884                            const uint32_t *         src,
885                            const uint32_t *         mask,
886                            int                      width)
887 {
888     const uint32_t *end = dest + width;
889 
890     while (dest < end)
891     {
892 	__m64 a = combine (src, mask);
893 	__m64 x;
894 
895 	x = load8888 (dest);
896 	a = expand_alpha (a);
897 	a = negate (a);
898 	x = pix_multiply (x, a);
899 
900 	store8888 (dest, x);
901 
902 	++dest;
903 	++src;
904 	if (mask)
905 	    mask++;
906     }
907     _mm_empty ();
908 }
909 
910 static void
911 mmx_combine_atop_u (pixman_implementation_t *imp,
912                     pixman_op_t              op,
913                     uint32_t *               dest,
914                     const uint32_t *         src,
915                     const uint32_t *         mask,
916                     int                      width)
917 {
918     const uint32_t *end = dest + width;
919 
920     while (dest < end)
921     {
922 	__m64 da, d, sia;
923 	__m64 s = combine (src, mask);
924 
925 	d = load8888 (dest);
926 	sia = expand_alpha (s);
927 	sia = negate (sia);
928 	da = expand_alpha (d);
929 	s = pix_add_mul (s, da, d, sia);
930 	store8888 (dest, s);
931 
932 	++dest;
933 	++src;
934 	if (mask)
935 	    mask++;
936     }
937     _mm_empty ();
938 }
939 
940 static void
941 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
942                             pixman_op_t              op,
943                             uint32_t *               dest,
944                             const uint32_t *         src,
945                             const uint32_t *         mask,
946                             int                      width)
947 {
948     const uint32_t *end;
949 
950     end = dest + width;
951 
952     while (dest < end)
953     {
954 	__m64 dia, d, sa;
955 	__m64 s = combine (src, mask);
956 
957 	d = load8888 (dest);
958 	sa = expand_alpha (s);
959 	dia = expand_alpha (d);
960 	dia = negate (dia);
961 	s = pix_add_mul (s, dia, d, sa);
962 	store8888 (dest, s);
963 
964 	++dest;
965 	++src;
966 	if (mask)
967 	    mask++;
968     }
969     _mm_empty ();
970 }
971 
972 static void
973 mmx_combine_xor_u (pixman_implementation_t *imp,
974                    pixman_op_t              op,
975                    uint32_t *               dest,
976                    const uint32_t *         src,
977                    const uint32_t *         mask,
978                    int                      width)
979 {
980     const uint32_t *end = dest + width;
981 
982     while (dest < end)
983     {
984 	__m64 dia, d, sia;
985 	__m64 s = combine (src, mask);
986 
987 	d = load8888 (dest);
988 	sia = expand_alpha (s);
989 	dia = expand_alpha (d);
990 	sia = negate (sia);
991 	dia = negate (dia);
992 	s = pix_add_mul (s, dia, d, sia);
993 	store8888 (dest, s);
994 
995 	++dest;
996 	++src;
997 	if (mask)
998 	    mask++;
999     }
1000     _mm_empty ();
1001 }
1002 
1003 static void
1004 mmx_combine_add_u (pixman_implementation_t *imp,
1005                    pixman_op_t              op,
1006                    uint32_t *               dest,
1007                    const uint32_t *         src,
1008                    const uint32_t *         mask,
1009                    int                      width)
1010 {
1011     const uint32_t *end = dest + width;
1012 
1013     while (dest < end)
1014     {
1015 	__m64 d;
1016 	__m64 s = combine (src, mask);
1017 
1018 	d = load8888 (dest);
1019 	s = pix_add (s, d);
1020 	store8888 (dest, s);
1021 
1022 	++dest;
1023 	++src;
1024 	if (mask)
1025 	    mask++;
1026     }
1027     _mm_empty ();
1028 }
1029 
1030 static void
1031 mmx_combine_saturate_u (pixman_implementation_t *imp,
1032                         pixman_op_t              op,
1033                         uint32_t *               dest,
1034                         const uint32_t *         src,
1035                         const uint32_t *         mask,
1036                         int                      width)
1037 {
1038     const uint32_t *end = dest + width;
1039 
1040     while (dest < end)
1041     {
1042 	uint32_t s, sa, da;
1043 	uint32_t d = *dest;
1044 	__m64 ms = combine (src, mask);
1045 	__m64 md = load8888 (dest);
1046 
1047 	store8888(&s, ms);
1048 	da = ~d >> 24;
1049 	sa = s >> 24;
1050 
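	/* da is the headroom left in the destination alpha.  When the source
	 * alpha exceeds it, the source is scaled by da/sa first, which is
	 * what distinguishes SATURATE from a plain ADD. */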
1051 	if (sa > da)
1052 	{
1053 	    uint32_t quot = DIV_UN8 (da, sa) << 24;
1054 	    __m64 msa = load8888 (&quot);
1055 	    msa = expand_alpha (msa);
1056 	    ms = pix_multiply (ms, msa);
1057 	}
1058 
1059 	md = pix_add (md, ms);
1060 	store8888 (dest, md);
1061 
1062 	++src;
1063 	++dest;
1064 	if (mask)
1065 	    mask++;
1066     }
1067     _mm_empty ();
1068 }
1069 
1070 static void
1071 mmx_combine_src_ca (pixman_implementation_t *imp,
1072                     pixman_op_t              op,
1073                     uint32_t *               dest,
1074                     const uint32_t *         src,
1075                     const uint32_t *         mask,
1076                     int                      width)
1077 {
1078     const uint32_t *end = src + width;
1079 
1080     while (src < end)
1081     {
1082 	__m64 a = load8888 (mask);
1083 	__m64 s = load8888 (src);
1084 
1085 	s = pix_multiply (s, a);
1086 	store8888 (dest, s);
1087 
1088 	++src;
1089 	++mask;
1090 	++dest;
1091     }
1092     _mm_empty ();
1093 }
1094 
1095 static void
1096 mmx_combine_over_ca (pixman_implementation_t *imp,
1097                      pixman_op_t              op,
1098                      uint32_t *               dest,
1099                      const uint32_t *         src,
1100                      const uint32_t *         mask,
1101                      int                      width)
1102 {
1103     const uint32_t *end = src + width;
1104 
1105     while (src < end)
1106     {
1107 	__m64 a = load8888 (mask);
1108 	__m64 s = load8888 (src);
1109 	__m64 d = load8888 (dest);
1110 	__m64 sa = expand_alpha (s);
1111 
1112 	store8888 (dest, in_over (s, sa, a, d));
1113 
1114 	++src;
1115 	++dest;
1116 	++mask;
1117     }
1118     _mm_empty ();
1119 }
1120 
1121 static void
1122 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1123                              pixman_op_t              op,
1124                              uint32_t *               dest,
1125                              const uint32_t *         src,
1126                              const uint32_t *         mask,
1127                              int                      width)
1128 {
1129     const uint32_t *end = src + width;
1130 
1131     while (src < end)
1132     {
1133 	__m64 a = load8888 (mask);
1134 	__m64 s = load8888 (src);
1135 	__m64 d = load8888 (dest);
1136 	__m64 da = expand_alpha (d);
1137 
1138 	store8888 (dest, over (d, da, in (s, a)));
1139 
1140 	++src;
1141 	++dest;
1142 	++mask;
1143     }
1144     _mm_empty ();
1145 }
1146 
1147 static void
1148 mmx_combine_in_ca (pixman_implementation_t *imp,
1149                    pixman_op_t              op,
1150                    uint32_t *               dest,
1151                    const uint32_t *         src,
1152                    const uint32_t *         mask,
1153                    int                      width)
1154 {
1155     const uint32_t *end = src + width;
1156 
1157     while (src < end)
1158     {
1159 	__m64 a = load8888 (mask);
1160 	__m64 s = load8888 (src);
1161 	__m64 d = load8888 (dest);
1162 	__m64 da = expand_alpha (d);
1163 
1164 	s = pix_multiply (s, a);
1165 	s = pix_multiply (s, da);
1166 	store8888 (dest, s);
1167 
1168 	++src;
1169 	++dest;
1170 	++mask;
1171     }
1172     _mm_empty ();
1173 }
1174 
1175 static void
1176 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1177                            pixman_op_t              op,
1178                            uint32_t *               dest,
1179                            const uint32_t *         src,
1180                            const uint32_t *         mask,
1181                            int                      width)
1182 {
1183     const uint32_t *end = src + width;
1184 
1185     while (src < end)
1186     {
1187 	__m64 a = load8888 (mask);
1188 	__m64 s = load8888 (src);
1189 	__m64 d = load8888 (dest);
1190 	__m64 sa = expand_alpha (s);
1191 
1192 	a = pix_multiply (a, sa);
1193 	d = pix_multiply (d, a);
1194 	store8888 (dest, d);
1195 
1196 	++src;
1197 	++dest;
1198 	++mask;
1199     }
1200     _mm_empty ();
1201 }
1202 
1203 static void
1204 mmx_combine_out_ca (pixman_implementation_t *imp,
1205                     pixman_op_t              op,
1206                     uint32_t *               dest,
1207                     const uint32_t *         src,
1208                     const uint32_t *         mask,
1209                     int                      width)
1210 {
1211     const uint32_t *end = src + width;
1212 
1213     while (src < end)
1214     {
1215 	__m64 a = load8888 (mask);
1216 	__m64 s = load8888 (src);
1217 	__m64 d = load8888 (dest);
1218 	__m64 da = expand_alpha (d);
1219 
1220 	da = negate (da);
1221 	s = pix_multiply (s, a);
1222 	s = pix_multiply (s, da);
1223 	store8888 (dest, s);
1224 
1225 	++src;
1226 	++dest;
1227 	++mask;
1228     }
1229     _mm_empty ();
1230 }
1231 
1232 static void
1233 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1234                             pixman_op_t              op,
1235                             uint32_t *               dest,
1236                             const uint32_t *         src,
1237                             const uint32_t *         mask,
1238                             int                      width)
1239 {
1240     const uint32_t *end = src + width;
1241 
1242     while (src < end)
1243     {
1244 	__m64 a = load8888 (mask);
1245 	__m64 s = load8888 (src);
1246 	__m64 d = load8888 (dest);
1247 	__m64 sa = expand_alpha (s);
1248 
1249 	a = pix_multiply (a, sa);
1250 	a = negate (a);
1251 	d = pix_multiply (d, a);
1252 	store8888 (dest, d);
1253 
1254 	++src;
1255 	++dest;
1256 	++mask;
1257     }
1258     _mm_empty ();
1259 }
1260 
1261 static void
1262 mmx_combine_atop_ca (pixman_implementation_t *imp,
1263                      pixman_op_t              op,
1264                      uint32_t *               dest,
1265                      const uint32_t *         src,
1266                      const uint32_t *         mask,
1267                      int                      width)
1268 {
1269     const uint32_t *end = src + width;
1270 
1271     while (src < end)
1272     {
1273 	__m64 a = load8888 (mask);
1274 	__m64 s = load8888 (src);
1275 	__m64 d = load8888 (dest);
1276 	__m64 da = expand_alpha (d);
1277 	__m64 sa = expand_alpha (s);
1278 
1279 	s = pix_multiply (s, a);
1280 	a = pix_multiply (a, sa);
1281 	a = negate (a);
1282 	d = pix_add_mul (d, a, s, da);
1283 	store8888 (dest, d);
1284 
1285 	++src;
1286 	++dest;
1287 	++mask;
1288     }
1289     _mm_empty ();
1290 }
1291 
1292 static void
1293 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1294                              pixman_op_t              op,
1295                              uint32_t *               dest,
1296                              const uint32_t *         src,
1297                              const uint32_t *         mask,
1298                              int                      width)
1299 {
1300     const uint32_t *end = src + width;
1301 
1302     while (src < end)
1303     {
1304 	__m64 a = load8888 (mask);
1305 	__m64 s = load8888 (src);
1306 	__m64 d = load8888 (dest);
1307 	__m64 da = expand_alpha (d);
1308 	__m64 sa = expand_alpha (s);
1309 
1310 	s = pix_multiply (s, a);
1311 	a = pix_multiply (a, sa);
1312 	da = negate (da);
1313 	d = pix_add_mul (d, a, s, da);
1314 	store8888 (dest, d);
1315 
1316 	++src;
1317 	++dest;
1318 	++mask;
1319     }
1320     _mm_empty ();
1321 }
1322 
1323 static void
1324 mmx_combine_xor_ca (pixman_implementation_t *imp,
1325                     pixman_op_t              op,
1326                     uint32_t *               dest,
1327                     const uint32_t *         src,
1328                     const uint32_t *         mask,
1329                     int                      width)
1330 {
1331     const uint32_t *end = src + width;
1332 
1333     while (src < end)
1334     {
1335 	__m64 a = load8888 (mask);
1336 	__m64 s = load8888 (src);
1337 	__m64 d = load8888 (dest);
1338 	__m64 da = expand_alpha (d);
1339 	__m64 sa = expand_alpha (s);
1340 
1341 	s = pix_multiply (s, a);
1342 	a = pix_multiply (a, sa);
1343 	da = negate (da);
1344 	a = negate (a);
1345 	d = pix_add_mul (d, a, s, da);
1346 	store8888 (dest, d);
1347 
1348 	++src;
1349 	++dest;
1350 	++mask;
1351     }
1352     _mm_empty ();
1353 }
1354 
1355 static void
1356 mmx_combine_add_ca (pixman_implementation_t *imp,
1357                     pixman_op_t              op,
1358                     uint32_t *               dest,
1359                     const uint32_t *         src,
1360                     const uint32_t *         mask,
1361                     int                      width)
1362 {
1363     const uint32_t *end = src + width;
1364 
1365     while (src < end)
1366     {
1367 	__m64 a = load8888 (mask);
1368 	__m64 s = load8888 (src);
1369 	__m64 d = load8888 (dest);
1370 
1371 	s = pix_multiply (s, a);
1372 	d = pix_add (s, d);
1373 	store8888 (dest, d);
1374 
1375 	++src;
1376 	++dest;
1377 	++mask;
1378     }
1379     _mm_empty ();
1380 }
1381 
1382 /* ------------- MMX code paths called from fbpict.c -------------------- */
1383 
1384 static void
1385 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1386                            pixman_composite_info_t *info)
1387 {
1388     PIXMAN_COMPOSITE_ARGS (info);
1389     uint32_t src;
1390     uint32_t    *dst_line, *dst;
1391     int32_t w;
1392     int dst_stride;
1393     __m64 vsrc, vsrca;
1394 
1395     CHECKPOINT ();
1396 
1397     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1398 
1399     if (src == 0)
1400 	return;
1401 
1402     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1403 
1404     vsrc = load8888 (&src);
1405     vsrca = expand_alpha (vsrc);
1406 
1407     while (height--)
1408     {
1409 	dst = dst_line;
1410 	dst_line += dst_stride;
1411 	w = width;
1412 
1413 	CHECKPOINT ();
1414 
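	/* Handle leading pixels one at a time until dst is 8-byte aligned,
	 * then composite two pixels per 64-bit load/store in the main loop. */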
1415 	while (w && (uintptr_t)dst & 7)
1416 	{
1417 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1418 
1419 	    w--;
1420 	    dst++;
1421 	}
1422 
1423 	while (w >= 2)
1424 	{
1425 	    __m64 vdest;
1426 	    __m64 dest0, dest1;
1427 
1428 	    vdest = *(__m64 *)dst;
1429 
1430 	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1431 	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1432 
1433 	    *(__m64 *)dst = pack8888 (dest0, dest1);
1434 
1435 	    dst += 2;
1436 	    w -= 2;
1437 	}
1438 
1439 	CHECKPOINT ();
1440 
1441 	if (w)
1442 	{
1443 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1444 	}
1445     }
1446 
1447     _mm_empty ();
1448 }
1449 
1450 static void
1451 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1452                            pixman_composite_info_t *info)
1453 {
1454     PIXMAN_COMPOSITE_ARGS (info);
1455     uint32_t src;
1456     uint16_t    *dst_line, *dst;
1457     int32_t w;
1458     int dst_stride;
1459     __m64 vsrc, vsrca;
1460 
1461     CHECKPOINT ();
1462 
1463     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1464 
1465     if (src == 0)
1466 	return;
1467 
1468     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1469 
1470     vsrc = load8888 (&src);
1471     vsrca = expand_alpha (vsrc);
1472 
1473     while (height--)
1474     {
1475 	dst = dst_line;
1476 	dst_line += dst_stride;
1477 	w = width;
1478 
1479 	CHECKPOINT ();
1480 
1481 	while (w && (uintptr_t)dst & 7)
1482 	{
1483 	    uint64_t d = *dst;
1484 	    __m64 vdest = expand565 (to_m64 (d), 0);
1485 
1486 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1487 	    *dst = to_uint64 (vdest);
1488 
1489 	    w--;
1490 	    dst++;
1491 	}
1492 
1493 	while (w >= 4)
1494 	{
1495 	    __m64 vdest = *(__m64 *)dst;
1496 	    __m64 v0, v1, v2, v3;
1497 
1498 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1499 
1500 	    v0 = over (vsrc, vsrca, v0);
1501 	    v1 = over (vsrc, vsrca, v1);
1502 	    v2 = over (vsrc, vsrca, v2);
1503 	    v3 = over (vsrc, vsrca, v3);
1504 
1505 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1506 
1507 	    dst += 4;
1508 	    w -= 4;
1509 	}
1510 
1511 	CHECKPOINT ();
1512 
1513 	while (w)
1514 	{
1515 	    uint64_t d = *dst;
1516 	    __m64 vdest = expand565 (to_m64 (d), 0);
1517 
1518 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1519 	    *dst = to_uint64 (vdest);
1520 
1521 	    w--;
1522 	    dst++;
1523 	}
1524     }
1525 
1526     _mm_empty ();
1527 }
1528 
1529 static void
1530 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1531                                    pixman_composite_info_t *info)
1532 {
1533     PIXMAN_COMPOSITE_ARGS (info);
1534     uint32_t src;
1535     uint32_t    *dst_line;
1536     uint32_t    *mask_line;
1537     int dst_stride, mask_stride;
1538     __m64 vsrc, vsrca;
1539 
1540     CHECKPOINT ();
1541 
1542     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1543 
1544     if (src == 0)
1545 	return;
1546 
1547     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1548     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1549 
1550     vsrc = load8888 (&src);
1551     vsrca = expand_alpha (vsrc);
1552 
1553     while (height--)
1554     {
1555 	int twidth = width;
1556 	uint32_t *p = (uint32_t *)mask_line;
1557 	uint32_t *q = (uint32_t *)dst_line;
1558 
1559 	while (twidth && (uintptr_t)q & 7)
1560 	{
1561 	    uint32_t m = *(uint32_t *)p;
1562 
1563 	    if (m)
1564 	    {
1565 		__m64 vdest = load8888 (q);
1566 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1567 		store8888 (q, vdest);
1568 	    }
1569 
1570 	    twidth--;
1571 	    p++;
1572 	    q++;
1573 	}
1574 
1575 	while (twidth >= 2)
1576 	{
1577 	    uint32_t m0, m1;
1578 	    m0 = *p;
1579 	    m1 = *(p + 1);
1580 
1581 	    if (m0 | m1)
1582 	    {
1583 		__m64 dest0, dest1;
1584 		__m64 vdest = *(__m64 *)q;
1585 
1586 		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1587 		                 expand8888 (vdest, 0));
1588 		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1589 		                 expand8888 (vdest, 1));
1590 
1591 		*(__m64 *)q = pack8888 (dest0, dest1);
1592 	    }
1593 
1594 	    p += 2;
1595 	    q += 2;
1596 	    twidth -= 2;
1597 	}
1598 
1599 	if (twidth)
1600 	{
1601 	    uint32_t m = *(uint32_t *)p;
1602 
1603 	    if (m)
1604 	    {
1605 		__m64 vdest = load8888 (q);
1606 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1607 		store8888 (q, vdest);
1608 	    }
1609 
1610 	    twidth--;
1611 	    p++;
1612 	    q++;
1613 	}
1614 
1615 	dst_line += dst_stride;
1616 	mask_line += mask_stride;
1617     }
1618 
1619     _mm_empty ();
1620 }
1621 
1622 static void
1623 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1624                                 pixman_composite_info_t *info)
1625 {
1626     PIXMAN_COMPOSITE_ARGS (info);
1627     uint32_t    *dst_line, *dst;
1628     uint32_t    *src_line, *src;
1629     uint32_t mask;
1630     __m64 vmask;
1631     int dst_stride, src_stride;
1632     int32_t w;
1633 
1634     CHECKPOINT ();
1635 
1636     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1637     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1638 
1639     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1640     vmask = expand_alpha (load8888 (&mask));
1641 
1642     while (height--)
1643     {
1644 	dst = dst_line;
1645 	dst_line += dst_stride;
1646 	src = src_line;
1647 	src_line += src_stride;
1648 	w = width;
1649 
1650 	while (w && (uintptr_t)dst & 7)
1651 	{
1652 	    __m64 s = load8888 (src);
1653 	    __m64 d = load8888 (dst);
1654 
1655 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1656 
1657 	    w--;
1658 	    dst++;
1659 	    src++;
1660 	}
1661 
1662 	while (w >= 2)
1663 	{
1664 	    __m64 vs = ldq_u ((__m64 *)src);
1665 	    __m64 vd = *(__m64 *)dst;
1666 	    __m64 vsrc0 = expand8888 (vs, 0);
1667 	    __m64 vsrc1 = expand8888 (vs, 1);
1668 
1669 	    *(__m64 *)dst = pack8888 (
1670 	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1671 	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1672 
1673 	    w -= 2;
1674 	    dst += 2;
1675 	    src += 2;
1676 	}
1677 
1678 	if (w)
1679 	{
1680 	    __m64 s = load8888 (src);
1681 	    __m64 d = load8888 (dst);
1682 
1683 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1684 	}
1685     }
1686 
1687     _mm_empty ();
1688 }
1689 
1690 static void
1691 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1692                                 pixman_composite_info_t *info)
1693 {
1694     PIXMAN_COMPOSITE_ARGS (info);
1695     uint32_t *dst_line, *dst;
1696     uint32_t *src_line, *src;
1697     uint32_t mask;
1698     __m64 vmask;
1699     int dst_stride, src_stride;
1700     int32_t w;
1701     __m64 srca;
1702 
1703     CHECKPOINT ();
1704 
1705     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1706     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1707     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1708 
1709     vmask = expand_alpha (load8888 (&mask));
1710     srca = MC (4x00ff);
1711 
1712     while (height--)
1713     {
1714 	dst = dst_line;
1715 	dst_line += dst_stride;
1716 	src = src_line;
1717 	src_line += src_stride;
1718 	w = width;
1719 
1720 	while (w && (uintptr_t)dst & 7)
1721 	{
1722 	    uint32_t ssrc = *src | 0xff000000;
1723 	    __m64 s = load8888 (&ssrc);
1724 	    __m64 d = load8888 (dst);
1725 
1726 	    store8888 (dst, in_over (s, srca, vmask, d));
1727 
1728 	    w--;
1729 	    dst++;
1730 	    src++;
1731 	}
1732 
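	/* Main loop, unrolled to 16 pixels (eight 64-bit quadwords) per
	 * iteration; expandx888 () forces the source alpha to 0xff, matching
	 * the "| 0xff000000" in the single-pixel loops. */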
1733 	while (w >= 16)
1734 	{
1735 	    __m64 vd0 = *(__m64 *)(dst + 0);
1736 	    __m64 vd1 = *(__m64 *)(dst + 2);
1737 	    __m64 vd2 = *(__m64 *)(dst + 4);
1738 	    __m64 vd3 = *(__m64 *)(dst + 6);
1739 	    __m64 vd4 = *(__m64 *)(dst + 8);
1740 	    __m64 vd5 = *(__m64 *)(dst + 10);
1741 	    __m64 vd6 = *(__m64 *)(dst + 12);
1742 	    __m64 vd7 = *(__m64 *)(dst + 14);
1743 
1744 	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1745 	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1746 	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1747 	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1748 	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1749 	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1750 	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1751 	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1752 
1753 	    vd0 = pack8888 (
1754 	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1755 	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1756 
1757 	    vd1 = pack8888 (
1758 	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1759 	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1760 
1761 	    vd2 = pack8888 (
1762 	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1763 	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1764 
1765 	    vd3 = pack8888 (
1766 	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1767 	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1768 
1769 	    vd4 = pack8888 (
1770 	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1771 	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1772 
1773 	    vd5 = pack8888 (
1774 	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1775 	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1776 
1777 	    vd6 = pack8888 (
1778 	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1779 	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1780 
1781 	    vd7 = pack8888 (
1782 	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1783 	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1784 
1785 	    *(__m64 *)(dst + 0) = vd0;
1786 	    *(__m64 *)(dst + 2) = vd1;
1787 	    *(__m64 *)(dst + 4) = vd2;
1788 	    *(__m64 *)(dst + 6) = vd3;
1789 	    *(__m64 *)(dst + 8) = vd4;
1790 	    *(__m64 *)(dst + 10) = vd5;
1791 	    *(__m64 *)(dst + 12) = vd6;
1792 	    *(__m64 *)(dst + 14) = vd7;
1793 
1794 	    w -= 16;
1795 	    dst += 16;
1796 	    src += 16;
1797 	}
1798 
1799 	while (w)
1800 	{
1801 	    uint32_t ssrc = *src | 0xff000000;
1802 	    __m64 s = load8888 (&ssrc);
1803 	    __m64 d = load8888 (dst);
1804 
1805 	    store8888 (dst, in_over (s, srca, vmask, d));
1806 
1807 	    w--;
1808 	    dst++;
1809 	    src++;
1810 	}
1811     }
1812 
1813     _mm_empty ();
1814 }
1815 
1816 static void
1817 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1818                               pixman_composite_info_t *info)
1819 {
1820     PIXMAN_COMPOSITE_ARGS (info);
1821     uint32_t *dst_line, *dst;
1822     uint32_t *src_line, *src;
1823     uint32_t s;
1824     int dst_stride, src_stride;
1825     uint8_t a;
1826     int32_t w;
1827 
1828     CHECKPOINT ();
1829 
1830     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1831     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1832 
1833     while (height--)
1834     {
1835 	dst = dst_line;
1836 	dst_line += dst_stride;
1837 	src = src_line;
1838 	src_line += src_stride;
1839 	w = width;
1840 
1841 	while (w--)
1842 	{
1843 	    s = *src++;
1844 	    a = s >> 24;
1845 
1846 	    if (a == 0xff)
1847 	    {
1848 		*dst = s;
1849 	    }
1850 	    else if (s)
1851 	    {
1852 		__m64 ms, sa;
1853 		ms = load8888 (&s);
1854 		sa = expand_alpha (ms);
1855 		store8888 (dst, over (ms, sa, load8888 (dst)));
1856 	    }
1857 
1858 	    dst++;
1859 	}
1860     }
1861     _mm_empty ();
1862 }
1863 
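/* OVER for an a8r8g8b8 source on an r5g6b5 destination, no mask.
 * Destination pixels are expanded to 8888, blended with over () and
 * packed back to 565; the aligned main loop handles four pixels (one
 * 64-bit destination word) per iteration.
 */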
1864 static void
1865 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1866                               pixman_composite_info_t *info)
1867 {
1868     PIXMAN_COMPOSITE_ARGS (info);
1869     uint16_t    *dst_line, *dst;
1870     uint32_t    *src_line, *src;
1871     int dst_stride, src_stride;
1872     int32_t w;
1873 
1874     CHECKPOINT ();
1875 
1876     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1877     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1878 
1879 #if 0
1880     /* FIXME */
1881     assert (src_image->drawable == mask_image->drawable);
1882 #endif
1883 
1884     while (height--)
1885     {
1886 	dst = dst_line;
1887 	dst_line += dst_stride;
1888 	src = src_line;
1889 	src_line += src_stride;
1890 	w = width;
1891 
1892 	CHECKPOINT ();
1893 
1894 	while (w && (uintptr_t)dst & 7)
1895 	{
1896 	    __m64 vsrc = load8888 (src);
1897 	    uint64_t d = *dst;
1898 	    __m64 vdest = expand565 (to_m64 (d), 0);
1899 
1900 	    vdest = pack_565 (
1901 		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1902 
1903 	    *dst = to_uint64 (vdest);
1904 
1905 	    w--;
1906 	    dst++;
1907 	    src++;
1908 	}
1909 
1910 	CHECKPOINT ();
1911 
1912 	while (w >= 4)
1913 	{
1914 	    __m64 vdest = *(__m64 *)dst;
1915 	    __m64 v0, v1, v2, v3;
1916 	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1917 
1918 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1919 
1920 	    vsrc0 = load8888 ((src + 0));
1921 	    vsrc1 = load8888 ((src + 1));
1922 	    vsrc2 = load8888 ((src + 2));
1923 	    vsrc3 = load8888 ((src + 3));
1924 
1925 	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1926 	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1927 	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1928 	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1929 
1930 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1931 
1932 	    w -= 4;
1933 	    dst += 4;
1934 	    src += 4;
1935 	}
1936 
1937 	CHECKPOINT ();
1938 
1939 	while (w)
1940 	{
1941 	    __m64 vsrc = load8888 (src);
1942 	    uint64_t d = *dst;
1943 	    __m64 vdest = expand565 (to_m64 (d), 0);
1944 
1945 	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1946 
1947 	    *dst = to_uint64 (vdest);
1948 
1949 	    w--;
1950 	    dst++;
1951 	    src++;
1952 	}
1953     }
1954 
1955     _mm_empty ();
1956 }
1957 
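/* OVER for a solid source through an a8 mask onto an a8r8g8b8
 * destination.  The unaligned head and the tail do one pixel each;
 * the main loop does two pixels per iteration and, when the source is
 * opaque and both mask bytes are 0xff, stores the pre-replicated
 * solid color directly.
 */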
1958 static void
1959 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1960                              pixman_composite_info_t *info)
1961 {
1962     PIXMAN_COMPOSITE_ARGS (info);
1963     uint32_t src, srca;
1964     uint32_t *dst_line, *dst;
1965     uint8_t *mask_line, *mask;
1966     int dst_stride, mask_stride;
1967     int32_t w;
1968     __m64 vsrc, vsrca;
1969     uint64_t srcsrc;
1970 
1971     CHECKPOINT ();
1972 
1973     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1974 
1975     srca = src >> 24;
1976     if (src == 0)
1977 	return;
1978 
1979     srcsrc = (uint64_t)src << 32 | src;
1980 
1981     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1982     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1983 
1984     vsrc = load8888 (&src);
1985     vsrca = expand_alpha (vsrc);
1986 
1987     while (height--)
1988     {
1989 	dst = dst_line;
1990 	dst_line += dst_stride;
1991 	mask = mask_line;
1992 	mask_line += mask_stride;
1993 	w = width;
1994 
1995 	CHECKPOINT ();
1996 
1997 	while (w && (uintptr_t)dst & 7)
1998 	{
1999 	    uint64_t m = *mask;
2000 
2001 	    if (m)
2002 	    {
2003 		__m64 vdest = in_over (vsrc, vsrca,
2004 				       expand_alpha_rev (to_m64 (m)),
2005 				       load8888 (dst));
2006 
2007 		store8888 (dst, vdest);
2008 	    }
2009 
2010 	    w--;
2011 	    mask++;
2012 	    dst++;
2013 	}
2014 
2015 	CHECKPOINT ();
2016 
2017 	while (w >= 2)
2018 	{
2019 	    uint64_t m0, m1;
2020 
2021 	    m0 = *mask;
2022 	    m1 = *(mask + 1);
2023 
2024 	    if (srca == 0xff && (m0 & m1) == 0xff)
2025 	    {
2026 		*(uint64_t *)dst = srcsrc;
2027 	    }
2028 	    else if (m0 | m1)
2029 	    {
2030 		__m64 vdest;
2031 		__m64 dest0, dest1;
2032 
2033 		vdest = *(__m64 *)dst;
2034 
2035 		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2036 				 expand8888 (vdest, 0));
2037 		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2038 				 expand8888 (vdest, 1));
2039 
2040 		*(__m64 *)dst = pack8888 (dest0, dest1);
2041 	    }
2042 
2043 	    mask += 2;
2044 	    dst += 2;
2045 	    w -= 2;
2046 	}
2047 
2048 	CHECKPOINT ();
2049 
2050 	if (w)
2051 	{
2052 	    uint64_t m = *mask;
2053 
2054 	    if (m)
2055 	    {
2056 		__m64 vdest = load8888 (dst);
2057 
2058 		vdest = in_over (
2059 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2060 		store8888 (dst, vdest);
2061 	    }
2062 	}
2063     }
2064 
2065     _mm_empty ();
2066 }
2067 
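/* Solid fill for 8, 16 and 32 bpp destinations.  The filler is first
 * replicated to 32 bits, then to a 64-bit MMX value; each scanline is
 * brought up to 8-byte alignment with byte/word/dword stores and the
 * bulk is then written 64 bytes per iteration.
 */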
2068 static pixman_bool_t
2069 mmx_fill (pixman_implementation_t *imp,
2070           uint32_t *               bits,
2071           int                      stride,
2072           int                      bpp,
2073           int                      x,
2074           int                      y,
2075           int                      width,
2076           int                      height,
2077           uint32_t		   filler)
2078 {
2079     uint64_t fill;
2080     __m64 vfill;
2081     uint32_t byte_width;
2082     uint8_t     *byte_line;
2083 
2084 #if defined __GNUC__ && defined USE_X86_MMX
2085     __m64 v1, v2, v3, v4, v5, v6, v7;
2086 #endif
2087 
2088     if (bpp != 16 && bpp != 32 && bpp != 8)
2089 	return FALSE;
2090 
2091     if (bpp == 8)
2092     {
2093 	stride = stride * (int) sizeof (uint32_t) / 1;
2094 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2095 	byte_width = width;
2096 	stride *= 1;
2097         filler = (filler & 0xff) * 0x01010101;
2098     }
2099     else if (bpp == 16)
2100     {
2101 	stride = stride * (int) sizeof (uint32_t) / 2;
2102 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2103 	byte_width = 2 * width;
2104 	stride *= 2;
2105         filler = (filler & 0xffff) * 0x00010001;
2106     }
2107     else
2108     {
2109 	stride = stride * (int) sizeof (uint32_t) / 4;
2110 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2111 	byte_width = 4 * width;
2112 	stride *= 4;
2113     }
2114 
2115     fill = ((uint64_t)filler << 32) | filler;
2116     vfill = to_m64 (fill);
2117 
2118 #if defined __GNUC__ && defined USE_X86_MMX
2119     __asm__ (
2120         "movq		%7,	%0\n"
2121         "movq		%7,	%1\n"
2122         "movq		%7,	%2\n"
2123         "movq		%7,	%3\n"
2124         "movq		%7,	%4\n"
2125         "movq		%7,	%5\n"
2126         "movq		%7,	%6\n"
2127 	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
2128 	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2129 	: "y" (vfill));
2130 #endif
2131 
2132     while (height--)
2133     {
2134 	int w;
2135 	uint8_t *d = byte_line;
2136 
2137 	byte_line += stride;
2138 	w = byte_width;
2139 
2140 	if (w >= 1 && ((uintptr_t)d & 1))
2141 	{
2142 	    *(uint8_t *)d = (filler & 0xff);
2143 	    w--;
2144 	    d++;
2145 	}
2146 
2147 	if (w >= 2 && ((uintptr_t)d & 3))
2148 	{
2149 	    *(uint16_t *)d = filler;
2150 	    w -= 2;
2151 	    d += 2;
2152 	}
2153 
2154 	while (w >= 4 && ((uintptr_t)d & 7))
2155 	{
2156 	    *(uint32_t *)d = filler;
2157 
2158 	    w -= 4;
2159 	    d += 4;
2160 	}
2161 
2162 	while (w >= 64)
2163 	{
2164 #if defined __GNUC__ && defined USE_X86_MMX
2165 	    __asm__ (
2166 	        "movq	%1,	  (%0)\n"
2167 	        "movq	%2,	 8(%0)\n"
2168 	        "movq	%3,	16(%0)\n"
2169 	        "movq	%4,	24(%0)\n"
2170 	        "movq	%5,	32(%0)\n"
2171 	        "movq	%6,	40(%0)\n"
2172 	        "movq	%7,	48(%0)\n"
2173 	        "movq	%8,	56(%0)\n"
2174 		:
2175 		: "r" (d),
2176 		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2177 		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2178 		: "memory");
2179 #else
2180 	    *(__m64*) (d +  0) = vfill;
2181 	    *(__m64*) (d +  8) = vfill;
2182 	    *(__m64*) (d + 16) = vfill;
2183 	    *(__m64*) (d + 24) = vfill;
2184 	    *(__m64*) (d + 32) = vfill;
2185 	    *(__m64*) (d + 40) = vfill;
2186 	    *(__m64*) (d + 48) = vfill;
2187 	    *(__m64*) (d + 56) = vfill;
2188 #endif
2189 	    w -= 64;
2190 	    d += 64;
2191 	}
2192 
2193 	while (w >= 4)
2194 	{
2195 	    *(uint32_t *)d = filler;
2196 
2197 	    w -= 4;
2198 	    d += 4;
2199 	}
2200 	if (w >= 2)
2201 	{
2202 	    *(uint16_t *)d = filler;
2203 	    w -= 2;
2204 	    d += 2;
2205 	}
2206 	if (w >= 1)
2207 	{
2208 	    *(uint8_t *)d = (filler & 0xff);
2209 	    w--;
2210 	    d++;
2211 	}
2212 
2213     }
2214 
2215     _mm_empty ();
2216     return TRUE;
2217 }
2218 
2219 static void
2220 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2221                              pixman_composite_info_t *info)
2222 {
2223     PIXMAN_COMPOSITE_ARGS (info);
2224     uint16_t    *dst_line, *dst;
2225     uint32_t    *src_line, *src, s;
2226     int dst_stride, src_stride;
2227     int32_t w;
2228 
2229     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2230     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2231 
2232     while (height--)
2233     {
2234 	dst = dst_line;
2235 	dst_line += dst_stride;
2236 	src = src_line;
2237 	src_line += src_stride;
2238 	w = width;
2239 
2240 	while (w && (uintptr_t)dst & 7)
2241 	{
2242 	    s = *src++;
2243 	    *dst = convert_8888_to_0565 (s);
2244 	    dst++;
2245 	    w--;
2246 	}
2247 
2248 	while (w >= 4)
2249 	{
2250 	    __m64 vdest;
2251 	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2252 	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2253 
2254 	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
2255 
2256 	    *(__m64 *)dst = vdest;
2257 
2258 	    w -= 4;
2259 	    src += 4;
2260 	    dst += 4;
2261 	}
2262 
2263 	while (w)
2264 	{
2265 	    s = *src++;
2266 	    *dst = convert_8888_to_0565 (s);
2267 	    dst++;
2268 	    w--;
2269 	}
2270     }
2271 
2272     _mm_empty ();
2273 }
2274 
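/* SRC for a solid source through an a8 mask onto an a8r8g8b8
 * destination.  A zero source reduces to mmx_fill() with zero;
 * otherwise each destination pixel becomes src IN mask, and pixels
 * with a zero mask are cleared rather than left alone (SRC rather
 * than OVER semantics).
 */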
2275 static void
2276 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2277                             pixman_composite_info_t *info)
2278 {
2279     PIXMAN_COMPOSITE_ARGS (info);
2280     uint32_t src, srca;
2281     uint32_t    *dst_line, *dst;
2282     uint8_t     *mask_line, *mask;
2283     int dst_stride, mask_stride;
2284     int32_t w;
2285     __m64 vsrc;
2286     uint64_t srcsrc;
2287 
2288     CHECKPOINT ();
2289 
2290     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2291 
2292     srca = src >> 24;
2293     if (src == 0)
2294     {
2295 	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2296 		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
2297 		  dest_x, dest_y, width, height, 0);
2298 	return;
2299     }
2300 
2301     srcsrc = (uint64_t)src << 32 | src;
2302 
2303     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2304     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2305 
2306     vsrc = load8888 (&src);
2307 
2308     while (height--)
2309     {
2310 	dst = dst_line;
2311 	dst_line += dst_stride;
2312 	mask = mask_line;
2313 	mask_line += mask_stride;
2314 	w = width;
2315 
2316 	CHECKPOINT ();
2317 
2318 	while (w && (uintptr_t)dst & 7)
2319 	{
2320 	    uint64_t m = *mask;
2321 
2322 	    if (m)
2323 	    {
2324 		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2325 
2326 		store8888 (dst, vdest);
2327 	    }
2328 	    else
2329 	    {
2330 		*dst = 0;
2331 	    }
2332 
2333 	    w--;
2334 	    mask++;
2335 	    dst++;
2336 	}
2337 
2338 	CHECKPOINT ();
2339 
2340 	while (w >= 2)
2341 	{
2342 	    uint64_t m0, m1;
2343 	    m0 = *mask;
2344 	    m1 = *(mask + 1);
2345 
2346 	    if (srca == 0xff && (m0 & m1) == 0xff)
2347 	    {
2348 		*(uint64_t *)dst = srcsrc;
2349 	    }
2350 	    else if (m0 | m1)
2351 	    {
2352 		__m64 dest0, dest1;
2353 
2354 		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2355 		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2356 
2357 		*(__m64 *)dst = pack8888 (dest0, dest1);
2358 	    }
2359 	    else
2360 	    {
2361 		*(uint64_t *)dst = 0;
2362 	    }
2363 
2364 	    mask += 2;
2365 	    dst += 2;
2366 	    w -= 2;
2367 	}
2368 
2369 	CHECKPOINT ();
2370 
2371 	if (w)
2372 	{
2373 	    uint64_t m = *mask;
2374 
2375 	    if (m)
2376 	    {
2377 		__m64 vdest = load8888 (dst);
2378 
2379 		vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2380 		store8888 (dst, vdest);
2381 	    }
2382 	    else
2383 	    {
2384 		*dst = 0;
2385 	    }
2386 	}
2387     }
2388 
2389     _mm_empty ();
2390 }
2391 
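/* OVER for a solid source through an a8 mask onto an r5g6b5
 * destination.  The solid color is pre-packed to 565 and replicated
 * across a 64-bit word so that quads with an opaque source and an
 * all-0xff mask can be stored in one move; other quads are expanded,
 * blended with in_over() and packed back.
 */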
2392 static void
2393 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2394                              pixman_composite_info_t *info)
2395 {
2396     PIXMAN_COMPOSITE_ARGS (info);
2397     uint32_t src, srca;
2398     uint16_t *dst_line, *dst;
2399     uint8_t *mask_line, *mask;
2400     int dst_stride, mask_stride;
2401     int32_t w;
2402     __m64 vsrc, vsrca, tmp;
2403     __m64 srcsrcsrcsrc;
2404 
2405     CHECKPOINT ();
2406 
2407     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2408 
2409     srca = src >> 24;
2410     if (src == 0)
2411 	return;
2412 
2413     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2414     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2415 
2416     vsrc = load8888 (&src);
2417     vsrca = expand_alpha (vsrc);
2418 
2419     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2420     srcsrcsrcsrc = expand_alpha_rev (tmp);
2421 
2422     while (height--)
2423     {
2424 	dst = dst_line;
2425 	dst_line += dst_stride;
2426 	mask = mask_line;
2427 	mask_line += mask_stride;
2428 	w = width;
2429 
2430 	CHECKPOINT ();
2431 
2432 	while (w && (uintptr_t)dst & 7)
2433 	{
2434 	    uint64_t m = *mask;
2435 
2436 	    if (m)
2437 	    {
2438 		uint64_t d = *dst;
2439 		__m64 vd = to_m64 (d);
2440 		__m64 vdest = in_over (
2441 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2442 
2443 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2444 		*dst = to_uint64 (vd);
2445 	    }
2446 
2447 	    w--;
2448 	    mask++;
2449 	    dst++;
2450 	}
2451 
2452 	CHECKPOINT ();
2453 
2454 	while (w >= 4)
2455 	{
2456 	    uint64_t m0, m1, m2, m3;
2457 	    m0 = *mask;
2458 	    m1 = *(mask + 1);
2459 	    m2 = *(mask + 2);
2460 	    m3 = *(mask + 3);
2461 
2462 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2463 	    {
2464 		*(__m64 *)dst = srcsrcsrcsrc;
2465 	    }
2466 	    else if (m0 | m1 | m2 | m3)
2467 	    {
2468 		__m64 vdest = *(__m64 *)dst;
2469 		__m64 v0, v1, v2, v3;
2470 		__m64 vm0, vm1, vm2, vm3;
2471 
2472 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2473 
2474 		vm0 = to_m64 (m0);
2475 		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2476 
2477 		vm1 = to_m64 (m1);
2478 		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2479 
2480 		vm2 = to_m64 (m2);
2481 		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2482 
2483 		vm3 = to_m64 (m3);
2484 		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2485 
2486 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2487 	    }
2488 
2489 	    w -= 4;
2490 	    mask += 4;
2491 	    dst += 4;
2492 	}
2493 
2494 	CHECKPOINT ();
2495 
2496 	while (w)
2497 	{
2498 	    uint64_t m = *mask;
2499 
2500 	    if (m)
2501 	    {
2502 		uint64_t d = *dst;
2503 		__m64 vd = to_m64 (d);
2504 		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2505 				       expand565 (vd, 0));
2506 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2507 		*dst = to_uint64 (vd);
2508 	    }
2509 
2510 	    w--;
2511 	    mask++;
2512 	    dst++;
2513 	}
2514     }
2515 
2516     _mm_empty ();
2517 }
2518 
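/* OVER for a "pixbuf" style source (non-premultiplied, with channels
 * reversed by invert_colors()) onto an r5g6b5 destination.  Quads
 * whose four source pixels are all opaque are converted and packed
 * directly; otherwise each pixel goes through over_rev_non_pre().
 */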
2519 static void
2520 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2521                                 pixman_composite_info_t *info)
2522 {
2523     PIXMAN_COMPOSITE_ARGS (info);
2524     uint16_t    *dst_line, *dst;
2525     uint32_t    *src_line, *src;
2526     int dst_stride, src_stride;
2527     int32_t w;
2528 
2529     CHECKPOINT ();
2530 
2531     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2532     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2533 
2534 #if 0
2535     /* FIXME */
2536     assert (src_image->drawable == mask_image->drawable);
2537 #endif
2538 
2539     while (height--)
2540     {
2541 	dst = dst_line;
2542 	dst_line += dst_stride;
2543 	src = src_line;
2544 	src_line += src_stride;
2545 	w = width;
2546 
2547 	CHECKPOINT ();
2548 
2549 	while (w && (uintptr_t)dst & 7)
2550 	{
2551 	    __m64 vsrc = load8888 (src);
2552 	    uint64_t d = *dst;
2553 	    __m64 vdest = expand565 (to_m64 (d), 0);
2554 
2555 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2556 
2557 	    *dst = to_uint64 (vdest);
2558 
2559 	    w--;
2560 	    dst++;
2561 	    src++;
2562 	}
2563 
2564 	CHECKPOINT ();
2565 
2566 	while (w >= 4)
2567 	{
2568 	    uint32_t s0, s1, s2, s3;
2569 	    unsigned char a0, a1, a2, a3;
2570 
2571 	    s0 = *src;
2572 	    s1 = *(src + 1);
2573 	    s2 = *(src + 2);
2574 	    s3 = *(src + 3);
2575 
2576 	    a0 = (s0 >> 24);
2577 	    a1 = (s1 >> 24);
2578 	    a2 = (s2 >> 24);
2579 	    a3 = (s3 >> 24);
2580 
2581 	    if ((a0 & a1 & a2 & a3) == 0xFF)
2582 	    {
2583 		__m64 v0 = invert_colors (load8888 (&s0));
2584 		__m64 v1 = invert_colors (load8888 (&s1));
2585 		__m64 v2 = invert_colors (load8888 (&s2));
2586 		__m64 v3 = invert_colors (load8888 (&s3));
2587 
2588 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2589 	    }
2590 	    else if (s0 | s1 | s2 | s3)
2591 	    {
2592 		__m64 vdest = *(__m64 *)dst;
2593 		__m64 v0, v1, v2, v3;
2594 
2595 		__m64 vsrc0 = load8888 (&s0);
2596 		__m64 vsrc1 = load8888 (&s1);
2597 		__m64 vsrc2 = load8888 (&s2);
2598 		__m64 vsrc3 = load8888 (&s3);
2599 
2600 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2601 
2602 		v0 = over_rev_non_pre (vsrc0, v0);
2603 		v1 = over_rev_non_pre (vsrc1, v1);
2604 		v2 = over_rev_non_pre (vsrc2, v2);
2605 		v3 = over_rev_non_pre (vsrc3, v3);
2606 
2607 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2608 	    }
2609 
2610 	    w -= 4;
2611 	    dst += 4;
2612 	    src += 4;
2613 	}
2614 
2615 	CHECKPOINT ();
2616 
2617 	while (w)
2618 	{
2619 	    __m64 vsrc = load8888 (src);
2620 	    uint64_t d = *dst;
2621 	    __m64 vdest = expand565 (to_m64 (d), 0);
2622 
2623 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2624 
2625 	    *dst = to_uint64 (vdest);
2626 
2627 	    w--;
2628 	    dst++;
2629 	    src++;
2630 	}
2631     }
2632 
2633     _mm_empty ();
2634 }
2635 
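/* Same pixbuf-style OVER as above, but onto an a8r8g8b8 destination:
 * two pixels per iteration, with the all-opaque case reduced to a
 * channel swap and the general case handled by over_rev_non_pre().
 */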
2636 static void
2637 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2638                                 pixman_composite_info_t *info)
2639 {
2640     PIXMAN_COMPOSITE_ARGS (info);
2641     uint32_t    *dst_line, *dst;
2642     uint32_t    *src_line, *src;
2643     int dst_stride, src_stride;
2644     int32_t w;
2645 
2646     CHECKPOINT ();
2647 
2648     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2649     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2650 
2651 #if 0
2652     /* FIXME */
2653     assert (src_image->drawable == mask_image->drawable);
2654 #endif
2655 
2656     while (height--)
2657     {
2658 	dst = dst_line;
2659 	dst_line += dst_stride;
2660 	src = src_line;
2661 	src_line += src_stride;
2662 	w = width;
2663 
2664 	while (w && (uintptr_t)dst & 7)
2665 	{
2666 	    __m64 s = load8888 (src);
2667 	    __m64 d = load8888 (dst);
2668 
2669 	    store8888 (dst, over_rev_non_pre (s, d));
2670 
2671 	    w--;
2672 	    dst++;
2673 	    src++;
2674 	}
2675 
2676 	while (w >= 2)
2677 	{
2678 	    uint32_t s0, s1;
2679 	    unsigned char a0, a1;
2680 	    __m64 d0, d1;
2681 
2682 	    s0 = *src;
2683 	    s1 = *(src + 1);
2684 
2685 	    a0 = (s0 >> 24);
2686 	    a1 = (s1 >> 24);
2687 
2688 	    if ((a0 & a1) == 0xFF)
2689 	    {
2690 		d0 = invert_colors (load8888 (&s0));
2691 		d1 = invert_colors (load8888 (&s1));
2692 
2693 		*(__m64 *)dst = pack8888 (d0, d1);
2694 	    }
2695 	    else if (s0 | s1)
2696 	    {
2697 		__m64 vdest = *(__m64 *)dst;
2698 
2699 		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2700 		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2701 
2702 		*(__m64 *)dst = pack8888 (d0, d1);
2703 	    }
2704 
2705 	    w -= 2;
2706 	    dst += 2;
2707 	    src += 2;
2708 	}
2709 
2710 	if (w)
2711 	{
2712 	    __m64 s = load8888 (src);
2713 	    __m64 d = load8888 (dst);
2714 
2715 	    store8888 (dst, over_rev_non_pre (s, d));
2716 	}
2717     }
2718 
2719     _mm_empty ();
2720 }
2721 
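/* Component-alpha OVER of a solid source through an a8r8g8b8 mask
 * onto an r5g6b5 destination.  Each mask pixel provides per-channel
 * alpha, so in_over() is applied with the loaded mask rather than a
 * broadcast alpha byte; quads with an all-zero mask are skipped.
 */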
2722 static void
2723 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2724                                    pixman_composite_info_t *info)
2725 {
2726     PIXMAN_COMPOSITE_ARGS (info);
2727     uint32_t src;
2728     uint16_t    *dst_line;
2729     uint32_t    *mask_line;
2730     int dst_stride, mask_stride;
2731     __m64 vsrc, vsrca;
2732 
2733     CHECKPOINT ();
2734 
2735     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2736 
2737     if (src == 0)
2738 	return;
2739 
2740     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2741     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2742 
2743     vsrc = load8888 (&src);
2744     vsrca = expand_alpha (vsrc);
2745 
2746     while (height--)
2747     {
2748 	int twidth = width;
2749 	uint32_t *p = (uint32_t *)mask_line;
2750 	uint16_t *q = (uint16_t *)dst_line;
2751 
2752 	while (twidth && ((uintptr_t)q & 7))
2753 	{
2754 	    uint32_t m = *(uint32_t *)p;
2755 
2756 	    if (m)
2757 	    {
2758 		uint64_t d = *q;
2759 		__m64 vdest = expand565 (to_m64 (d), 0);
2760 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2761 		*q = to_uint64 (vdest);
2762 	    }
2763 
2764 	    twidth--;
2765 	    p++;
2766 	    q++;
2767 	}
2768 
2769 	while (twidth >= 4)
2770 	{
2771 	    uint32_t m0, m1, m2, m3;
2772 
2773 	    m0 = *p;
2774 	    m1 = *(p + 1);
2775 	    m2 = *(p + 2);
2776 	    m3 = *(p + 3);
2777 
2778 	    if ((m0 | m1 | m2 | m3))
2779 	    {
2780 		__m64 vdest = *(__m64 *)q;
2781 		__m64 v0, v1, v2, v3;
2782 
2783 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2784 
2785 		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2786 		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2787 		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2788 		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2789 
2790 		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2791 	    }
2792 	    twidth -= 4;
2793 	    p += 4;
2794 	    q += 4;
2795 	}
2796 
2797 	while (twidth)
2798 	{
2799 	    uint32_t m;
2800 
2801 	    m = *(uint32_t *)p;
2802 	    if (m)
2803 	    {
2804 		uint64_t d = *q;
2805 		__m64 vdest = expand565 (to_m64 (d), 0);
2806 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2807 		*q = to_uint64 (vdest);
2808 	    }
2809 
2810 	    twidth--;
2811 	    p++;
2812 	    q++;
2813 	}
2814 
2815 	mask_line += mask_stride;
2816 	dst_line += dst_stride;
2817     }
2818 
2819     _mm_empty ();
2820 }
2821 
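/* IN of a solid source and an a8 mask onto an a8 destination:
 * dst = dst * (src.a * mask).  The aligned loop treats four a8 values
 * as one packed 32-bit quantity and multiplies them with in().
 */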
2822 static void
2823 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2824                         pixman_composite_info_t *info)
2825 {
2826     PIXMAN_COMPOSITE_ARGS (info);
2827     uint8_t *dst_line, *dst;
2828     uint8_t *mask_line, *mask;
2829     int dst_stride, mask_stride;
2830     int32_t w;
2831     uint32_t src;
2832     uint8_t sa;
2833     __m64 vsrc, vsrca;
2834 
2835     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2836     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2837 
2838     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2839 
2840     sa = src >> 24;
2841 
2842     vsrc = load8888 (&src);
2843     vsrca = expand_alpha (vsrc);
2844 
2845     while (height--)
2846     {
2847 	dst = dst_line;
2848 	dst_line += dst_stride;
2849 	mask = mask_line;
2850 	mask_line += mask_stride;
2851 	w = width;
2852 
2853 	while (w && (uintptr_t)dst & 7)
2854 	{
2855 	    uint16_t tmp;
2856 	    uint8_t a;
2857 	    uint32_t m, d;
2858 
2859 	    a = *mask++;
2860 	    d = *dst;
2861 
2862 	    m = MUL_UN8 (sa, a, tmp);
2863 	    d = MUL_UN8 (m, d, tmp);
2864 
2865 	    *dst++ = d;
2866 	    w--;
2867 	}
2868 
2869 	while (w >= 4)
2870 	{
2871 	    __m64 vmask;
2872 	    __m64 vdest;
2873 
2874 	    vmask = load8888u ((uint32_t *)mask);
2875 	    vdest = load8888 ((uint32_t *)dst);
2876 
2877 	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2878 
2879 	    dst += 4;
2880 	    mask += 4;
2881 	    w -= 4;
2882 	}
2883 
2884 	while (w--)
2885 	{
2886 	    uint16_t tmp;
2887 	    uint8_t a;
2888 	    uint32_t m, d;
2889 
2890 	    a = *mask++;
2891 	    d = *dst;
2892 
2893 	    m = MUL_UN8 (sa, a, tmp);
2894 	    d = MUL_UN8 (m, d, tmp);
2895 
2896 	    *dst++ = d;
2897 	}
2898     }
2899 
2900     _mm_empty ();
2901 }
2902 
2903 static void
2904 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2905                       pixman_composite_info_t *info)
2906 {
2907     PIXMAN_COMPOSITE_ARGS (info);
2908     uint8_t     *dst_line, *dst;
2909     uint8_t     *src_line, *src;
2910     int src_stride, dst_stride;
2911     int32_t w;
2912 
2913     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2914     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2915 
2916     while (height--)
2917     {
2918 	dst = dst_line;
2919 	dst_line += dst_stride;
2920 	src = src_line;
2921 	src_line += src_stride;
2922 	w = width;
2923 
2924 	while (w && (uintptr_t)dst & 3)
2925 	{
2926 	    uint8_t s, d;
2927 	    uint16_t tmp;
2928 
2929 	    s = *src;
2930 	    d = *dst;
2931 
2932 	    *dst = MUL_UN8 (s, d, tmp);
2933 
2934 	    src++;
2935 	    dst++;
2936 	    w--;
2937 	}
2938 
2939 	while (w >= 4)
2940 	{
2941 	    uint32_t *s = (uint32_t *)src;
2942 	    uint32_t *d = (uint32_t *)dst;
2943 
2944 	    store8888 (d, in (load8888u (s), load8888 (d)));
2945 
2946 	    w -= 4;
2947 	    dst += 4;
2948 	    src += 4;
2949 	}
2950 
2951 	while (w--)
2952 	{
2953 	    uint8_t s, d;
2954 	    uint16_t tmp;
2955 
2956 	    s = *src;
2957 	    d = *dst;
2958 
2959 	    *dst = MUL_UN8 (s, d, tmp);
2960 
2961 	    src++;
2962 	    dst++;
2963 	}
2964     }
2965 
2966     _mm_empty ();
2967 }
2968 
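/* Saturating ADD of (solid source IN a8 mask) onto an a8 destination:
 * dst = dst |+| (src.a * mask).  The aligned loop uses _mm_adds_pu8
 * on four mask and destination bytes at a time.
 */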
2969 static void
2970 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2971 			 pixman_composite_info_t *info)
2972 {
2973     PIXMAN_COMPOSITE_ARGS (info);
2974     uint8_t     *dst_line, *dst;
2975     uint8_t     *mask_line, *mask;
2976     int dst_stride, mask_stride;
2977     int32_t w;
2978     uint32_t src;
2979     uint8_t sa;
2980     __m64 vsrc, vsrca;
2981 
2982     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2983     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2984 
2985     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2986 
2987     sa = src >> 24;
2988 
2989     if (src == 0)
2990 	return;
2991 
2992     vsrc = load8888 (&src);
2993     vsrca = expand_alpha (vsrc);
2994 
2995     while (height--)
2996     {
2997 	dst = dst_line;
2998 	dst_line += dst_stride;
2999 	mask = mask_line;
3000 	mask_line += mask_stride;
3001 	w = width;
3002 
3003 	while (w && (uintptr_t)dst & 3)
3004 	{
3005 	    uint16_t tmp;
3006 	    uint16_t a;
3007 	    uint32_t m, d;
3008 	    uint32_t r;
3009 
3010 	    a = *mask++;
3011 	    d = *dst;
3012 
3013 	    m = MUL_UN8 (sa, a, tmp);
3014 	    r = ADD_UN8 (m, d, tmp);
3015 
3016 	    *dst++ = r;
3017 	    w--;
3018 	}
3019 
3020 	while (w >= 4)
3021 	{
3022 	    __m64 vmask;
3023 	    __m64 vdest;
3024 
3025 	    vmask = load8888u ((uint32_t *)mask);
3026 	    vdest = load8888 ((uint32_t *)dst);
3027 
3028 	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3029 
3030 	    dst += 4;
3031 	    mask += 4;
3032 	    w -= 4;
3033 	}
3034 
3035 	while (w--)
3036 	{
3037 	    uint16_t tmp;
3038 	    uint16_t a;
3039 	    uint32_t m, d;
3040 	    uint32_t r;
3041 
3042 	    a = *mask++;
3043 	    d = *dst;
3044 
3045 	    m = MUL_UN8 (sa, a, tmp);
3046 	    r = ADD_UN8 (m, d, tmp);
3047 
3048 	    *dst++ = r;
3049 	}
3050     }
3051 
3052     _mm_empty ();
3053 }
3054 
3055 static void
3056 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3057 		       pixman_composite_info_t *info)
3058 {
3059     PIXMAN_COMPOSITE_ARGS (info);
3060     uint8_t *dst_line, *dst;
3061     uint8_t *src_line, *src;
3062     int dst_stride, src_stride;
3063     int32_t w;
3064     uint8_t s, d;
3065     uint16_t t;
3066 
3067     CHECKPOINT ();
3068 
3069     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3070     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3071 
3072     while (height--)
3073     {
3074 	dst = dst_line;
3075 	dst_line += dst_stride;
3076 	src = src_line;
3077 	src_line += src_stride;
3078 	w = width;
3079 
3080 	while (w && (uintptr_t)dst & 7)
3081 	{
3082 	    s = *src;
3083 	    d = *dst;
3084 	    t = d + s;
3085 	    s = t | (0 - (t >> 8));
3086 	    *dst = s;
3087 
3088 	    dst++;
3089 	    src++;
3090 	    w--;
3091 	}
3092 
3093 	while (w >= 8)
3094 	{
3095 	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3096 	    dst += 8;
3097 	    src += 8;
3098 	    w -= 8;
3099 	}
3100 
3101 	while (w)
3102 	{
3103 	    s = *src;
3104 	    d = *dst;
3105 	    t = d + s;
3106 	    s = t | (0 - (t >> 8));
3107 	    *dst = s;
3108 
3109 	    dst++;
3110 	    src++;
3111 	    w--;
3112 	}
3113     }
3114 
3115     _mm_empty ();
3116 }
3117 
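/* Saturating ADD of two r5g6b5 images.  The scalar paths widen both
 * pixels to 8888, add with UN8x4_ADD_UN8x4 and convert back; the
 * vector loop expands four packed 565 pixels per operand, adds them
 * with _mm_adds_pu8 and repacks the result.
 */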
3118 static void
3119 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3120                              pixman_composite_info_t *info)
3121 {
3122     PIXMAN_COMPOSITE_ARGS (info);
3123     uint16_t    *dst_line, *dst;
3124     uint32_t	d;
3125     uint16_t    *src_line, *src;
3126     uint32_t	s;
3127     int dst_stride, src_stride;
3128     int32_t w;
3129 
3130     CHECKPOINT ();
3131 
3132     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3133     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3134 
3135     while (height--)
3136     {
3137 	dst = dst_line;
3138 	dst_line += dst_stride;
3139 	src = src_line;
3140 	src_line += src_stride;
3141 	w = width;
3142 
3143 	while (w && (uintptr_t)dst & 7)
3144 	{
3145 	    s = *src++;
3146 	    if (s)
3147 	    {
3148 		d = *dst;
3149 		s = convert_0565_to_8888 (s);
3150 		if (d)
3151 		{
3152 		    d = convert_0565_to_8888 (d);
3153 		    UN8x4_ADD_UN8x4 (s, d);
3154 		}
3155 		*dst = convert_8888_to_0565 (s);
3156 	    }
3157 	    dst++;
3158 	    w--;
3159 	}
3160 
3161 	while (w >= 4)
3162 	{
3163 	    __m64 vdest = *(__m64 *)dst;
3164 	    __m64 vsrc = ldq_u ((__m64 *)src);
3165 	    __m64 vd0, vd1;
3166 	    __m64 vs0, vs1;
3167 
3168 	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3169 	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3170 
3171 	    vd0 = _mm_adds_pu8 (vd0, vs0);
3172 	    vd1 = _mm_adds_pu8 (vd1, vs1);
3173 
3174 	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3175 
3176 	    dst += 4;
3177 	    src += 4;
3178 	    w -= 4;
3179 	}
3180 
3181 	while (w--)
3182 	{
3183 	    s = *src++;
3184 	    if (s)
3185 	    {
3186 		d = *dst;
3187 		s = convert_0565_to_8888 (s);
3188 		if (d)
3189 		{
3190 		    d = convert_0565_to_8888 (d);
3191 		    UN8x4_ADD_UN8x4 (s, d);
3192 		}
3193 		*dst = convert_8888_to_0565 (s);
3194 	    }
3195 	    dst++;
3196 	}
3197     }
3198 
3199     _mm_empty ();
3200 }
3201 
3202 static void
3203 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3204                              pixman_composite_info_t *info)
3205 {
3206     PIXMAN_COMPOSITE_ARGS (info);
3207     uint32_t    *dst_line, *dst;
3208     uint32_t    *src_line, *src;
3209     int dst_stride, src_stride;
3210     int32_t w;
3211 
3212     CHECKPOINT ();
3213 
3214     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3215     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3216 
3217     while (height--)
3218     {
3219 	dst = dst_line;
3220 	dst_line += dst_stride;
3221 	src = src_line;
3222 	src_line += src_stride;
3223 	w = width;
3224 
3225 	while (w && (uintptr_t)dst & 7)
3226 	{
3227 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3228 	                              load ((const uint32_t *)dst)));
3229 	    dst++;
3230 	    src++;
3231 	    w--;
3232 	}
3233 
3234 	while (w >= 2)
3235 	{
3236 	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3237 	    dst += 2;
3238 	    src += 2;
3239 	    w -= 2;
3240 	}
3241 
3242 	if (w)
3243 	{
3244 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3245 	                              load ((const uint32_t *)dst)));
3246 
3247 	}
3248     }
3249 
3250     _mm_empty ();
3251 }
3252 
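/* Plain rectangle copy for 16 and 32 bpp images, used by the
 * copy-area fast path below.  Scanlines are aligned to 8 bytes with
 * small stores and the bulk is then moved 64 bytes per iteration,
 * either through the eight MMX registers (inline assembly) or with
 * ldq_u() loads and 64-bit stores.
 */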
3253 static pixman_bool_t
3254 mmx_blt (pixman_implementation_t *imp,
3255          uint32_t *               src_bits,
3256          uint32_t *               dst_bits,
3257          int                      src_stride,
3258          int                      dst_stride,
3259          int                      src_bpp,
3260          int                      dst_bpp,
3261          int                      src_x,
3262          int                      src_y,
3263          int                      dest_x,
3264          int                      dest_y,
3265          int                      width,
3266          int                      height)
3267 {
3268     uint8_t *   src_bytes;
3269     uint8_t *   dst_bytes;
3270     int byte_width;
3271 
3272     if (src_bpp != dst_bpp)
3273 	return FALSE;
3274 
3275     if (src_bpp == 16)
3276     {
3277 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3278 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3279 	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3280 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3281 	byte_width = 2 * width;
3282 	src_stride *= 2;
3283 	dst_stride *= 2;
3284     }
3285     else if (src_bpp == 32)
3286     {
3287 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3288 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3289 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3290 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3291 	byte_width = 4 * width;
3292 	src_stride *= 4;
3293 	dst_stride *= 4;
3294     }
3295     else
3296     {
3297 	return FALSE;
3298     }
3299 
3300     while (height--)
3301     {
3302 	int w;
3303 	uint8_t *s = src_bytes;
3304 	uint8_t *d = dst_bytes;
3305 	src_bytes += src_stride;
3306 	dst_bytes += dst_stride;
3307 	w = byte_width;
3308 
3309 	if (w >= 1 && ((uintptr_t)d & 1))
3310 	{
3311 	    *(uint8_t *)d = *(uint8_t *)s;
3312 	    w -= 1;
3313 	    s += 1;
3314 	    d += 1;
3315 	}
3316 
3317 	if (w >= 2 && ((uintptr_t)d & 3))
3318 	{
3319 	    *(uint16_t *)d = *(uint16_t *)s;
3320 	    w -= 2;
3321 	    s += 2;
3322 	    d += 2;
3323 	}
3324 
3325 	while (w >= 4 && ((uintptr_t)d & 7))
3326 	{
3327 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3328 
3329 	    w -= 4;
3330 	    s += 4;
3331 	    d += 4;
3332 	}
3333 
3334 	while (w >= 64)
3335 	{
3336 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3337 	    __asm__ (
3338 	        "movq	  (%1),	  %%mm0\n"
3339 	        "movq	 8(%1),	  %%mm1\n"
3340 	        "movq	16(%1),	  %%mm2\n"
3341 	        "movq	24(%1),	  %%mm3\n"
3342 	        "movq	32(%1),	  %%mm4\n"
3343 	        "movq	40(%1),	  %%mm5\n"
3344 	        "movq	48(%1),	  %%mm6\n"
3345 	        "movq	56(%1),	  %%mm7\n"
3346 
3347 	        "movq	%%mm0,	  (%0)\n"
3348 	        "movq	%%mm1,	 8(%0)\n"
3349 	        "movq	%%mm2,	16(%0)\n"
3350 	        "movq	%%mm3,	24(%0)\n"
3351 	        "movq	%%mm4,	32(%0)\n"
3352 	        "movq	%%mm5,	40(%0)\n"
3353 	        "movq	%%mm6,	48(%0)\n"
3354 	        "movq	%%mm7,	56(%0)\n"
3355 		:
3356 		: "r" (d), "r" (s)
3357 		: "memory",
3358 		  "%mm0", "%mm1", "%mm2", "%mm3",
3359 		  "%mm4", "%mm5", "%mm6", "%mm7");
3360 #else
3361 	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
3362 	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
3363 	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
3364 	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
3365 	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
3366 	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
3367 	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
3368 	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
3369 	    *(__m64 *)(d + 0)  = v0;
3370 	    *(__m64 *)(d + 8)  = v1;
3371 	    *(__m64 *)(d + 16) = v2;
3372 	    *(__m64 *)(d + 24) = v3;
3373 	    *(__m64 *)(d + 32) = v4;
3374 	    *(__m64 *)(d + 40) = v5;
3375 	    *(__m64 *)(d + 48) = v6;
3376 	    *(__m64 *)(d + 56) = v7;
3377 #endif
3378 
3379 	    w -= 64;
3380 	    s += 64;
3381 	    d += 64;
3382 	}
3383 	while (w >= 4)
3384 	{
3385 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3386 
3387 	    w -= 4;
3388 	    s += 4;
3389 	    d += 4;
3390 	}
3391 	if (w >= 2)
3392 	{
3393 	    *(uint16_t *)d = *(uint16_t *)s;
3394 	    w -= 2;
3395 	    s += 2;
3396 	    d += 2;
3397 	}
3398     }
3399 
3400     _mm_empty ();
3401 
3402     return TRUE;
3403 }
3404 
3405 static void
3406 mmx_composite_copy_area (pixman_implementation_t *imp,
3407                          pixman_composite_info_t *info)
3408 {
3409     PIXMAN_COMPOSITE_ARGS (info);
3410 
3411     mmx_blt (imp, src_image->bits.bits,
3412 	     dest_image->bits.bits,
3413 	     src_image->bits.rowstride,
3414 	     dest_image->bits.rowstride,
3415 	     PIXMAN_FORMAT_BPP (src_image->bits.format),
3416 	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
3417 	     src_x, src_y, dest_x, dest_y, width, height);
3418 }
3419 
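/* OVER for an x8r8g8b8 source through an a8 mask onto an a8r8g8b8
 * destination.  The source alpha byte is forced to 0xff, so a mask of
 * 0xff is a plain copy; any other non-zero mask is expanded and the
 * pixel is blended with in_over().
 */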
3420 static void
3421 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3422                                 pixman_composite_info_t *info)
3423 {
3424     PIXMAN_COMPOSITE_ARGS (info);
3425     uint32_t  *src, *src_line;
3426     uint32_t  *dst, *dst_line;
3427     uint8_t  *mask, *mask_line;
3428     int src_stride, mask_stride, dst_stride;
3429     int32_t w;
3430 
3431     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3432     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3433     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3434 
3435     while (height--)
3436     {
3437 	src = src_line;
3438 	src_line += src_stride;
3439 	dst = dst_line;
3440 	dst_line += dst_stride;
3441 	mask = mask_line;
3442 	mask_line += mask_stride;
3443 
3444 	w = width;
3445 
3446 	while (w--)
3447 	{
3448 	    uint64_t m = *mask;
3449 
3450 	    if (m)
3451 	    {
3452 		uint32_t ssrc = *src | 0xff000000;
3453 		__m64 s = load8888 (&ssrc);
3454 
3455 		if (m == 0xff)
3456 		{
3457 		    store8888 (dst, s);
3458 		}
3459 		else
3460 		{
3461 		    __m64 sa = expand_alpha (s);
3462 		    __m64 vm = expand_alpha_rev (to_m64 (m));
3463 		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3464 
3465 		    store8888 (dst, vdest);
3466 		}
3467 	    }
3468 
3469 	    mask++;
3470 	    dst++;
3471 	    src++;
3472 	}
3473     }
3474 
3475     _mm_empty ();
3476 }
3477 
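/* OVER-reverse of a solid source under an a8r8g8b8 destination:
 * dst = dst + src * (1 - dst.alpha), so the existing destination stays
 * on top and the solid color only shows through where the destination
 * is not opaque.
 */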
3478 static void
3479 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3480                                    pixman_composite_info_t *info)
3481 {
3482     PIXMAN_COMPOSITE_ARGS (info);
3483     uint32_t src;
3484     uint32_t    *dst_line, *dst;
3485     int32_t w;
3486     int dst_stride;
3487     __m64 vsrc;
3488 
3489     CHECKPOINT ();
3490 
3491     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3492 
3493     if (src == 0)
3494 	return;
3495 
3496     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3497 
3498     vsrc = load8888 (&src);
3499 
3500     while (height--)
3501     {
3502 	dst = dst_line;
3503 	dst_line += dst_stride;
3504 	w = width;
3505 
3506 	CHECKPOINT ();
3507 
3508 	while (w && (uintptr_t)dst & 7)
3509 	{
3510 	    __m64 vdest = load8888 (dst);
3511 
3512 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3513 
3514 	    w--;
3515 	    dst++;
3516 	}
3517 
3518 	while (w >= 2)
3519 	{
3520 	    __m64 vdest = *(__m64 *)dst;
3521 	    __m64 dest0 = expand8888 (vdest, 0);
3522 	    __m64 dest1 = expand8888 (vdest, 1);
3523 
3524 
3525 	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
3526 	    dest1 = over (dest1, expand_alpha (dest1), vsrc);
3527 
3528 	    *(__m64 *)dst = pack8888 (dest0, dest1);
3529 
3530 	    dst += 2;
3531 	    w -= 2;
3532 	}
3533 
3534 	CHECKPOINT ();
3535 
3536 	if (w)
3537 	{
3538 	    __m64 vdest = load8888 (dst);
3539 
3540 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3541 	}
3542     }
3543 
3544     _mm_empty ();
3545 }
3546 
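/* Scanline worker for nearest-neighbour scaled OVER, 8888 on 8888.
 * vx steps through the source in pixman fixed point; after each step
 * it is reduced by src_width_fixed until it goes negative again.  The
 * FAST_NEAREST_MAINLOOP macros below instantiate the COVER, NONE, PAD
 * and NORMAL repeat variants.
 */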
3547 static force_inline void
3548 scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t*       pd,
3549                                             const uint32_t* ps,
3550                                             int32_t         w,
3551                                             pixman_fixed_t  vx,
3552                                             pixman_fixed_t  unit_x,
3553                                             pixman_fixed_t  src_width_fixed,
3554                                             pixman_bool_t   fully_transparent_src)
3555 {
3556     if (fully_transparent_src)
3557 	return;
3558 
3559     while (w)
3560     {
3561 	__m64 d = load (pd);
3562 	__m64 s = load (ps + pixman_fixed_to_int (vx));
3563 	vx += unit_x;
3564 	while (vx >= 0)
3565 	    vx -= src_width_fixed;
3566 
3567 	store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
3568 	pd++;
3569 
3570 	w--;
3571     }
3572 
3573     _mm_empty ();
3574 }
3575 
3576 FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
3577 		       scaled_nearest_scanline_mmx_8888_8888_OVER,
3578 		       uint32_t, uint32_t, COVER)
3579 FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
3580 		       scaled_nearest_scanline_mmx_8888_8888_OVER,
3581 		       uint32_t, uint32_t, NONE)
3582 FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
3583 		       scaled_nearest_scanline_mmx_8888_8888_OVER,
3584 		       uint32_t, uint32_t, PAD)
3585 FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
3586 		       scaled_nearest_scanline_mmx_8888_8888_OVER,
3587 		       uint32_t, uint32_t, NORMAL)
3588 
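/* As above, but with a solid mask: the mask alpha is expanded once
 * outside the loop and applied to every fetched source pixel with
 * in_over().  The early return for a fully transparent mask still
 * clears the MMX state.
 */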
3589 static force_inline void
3590 scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
3591 					      uint32_t *       dst,
3592 					      const uint32_t * src,
3593 					      int32_t          w,
3594 					      pixman_fixed_t   vx,
3595 					      pixman_fixed_t   unit_x,
3596 					      pixman_fixed_t   src_width_fixed,
3597 					      pixman_bool_t    zero_src)
3598 {
3599     __m64 mm_mask;
3600 
3601     if (zero_src || (*mask >> 24) == 0)
3602     {
3603 	/* A workaround for https://gcc.gnu.org/PR47759 */
3604 	_mm_empty ();
3605 	return;
3606     }
3607 
3608     mm_mask = expand_alpha (load8888 (mask));
3609 
3610     while (w)
3611     {
3612 	uint32_t s = *(src + pixman_fixed_to_int (vx));
3613 	vx += unit_x;
3614 	while (vx >= 0)
3615 	    vx -= src_width_fixed;
3616 
3617 	if (s)
3618 	{
3619 	    __m64 ms = load8888 (&s);
3620 	    __m64 alpha = expand_alpha (ms);
3621 	    __m64 dest  = load8888 (dst);
3622 
3623 	    store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
3624 	}
3625 
3626 	dst++;
3627 	w--;
3628     }
3629 
3630     _mm_empty ();
3631 }
3632 
3633 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
3634 			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3635 			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
3636 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
3637 			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3638 			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
3639 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
3640 			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3641 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
3642 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
3643 			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3644 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
3645 
3646 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3647 #define BMSK (BSHIFT - 1)
3648 
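/* Bilinear scaling helpers.  The vertical weights wt/wb and the
 * horizontal phase mm_x are kept as packed 16-bit values; each output
 * pixel is produced by weighting a 2x2 source block vertically, then
 * horizontally with weights derived from mm_x, and shifting the sums
 * down by 2 * BILINEAR_INTERPOLATION_BITS before packing.
 */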
3649 #define BILINEAR_DECLARE_VARIABLES						\
3650     const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
3651     const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
3652     const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
3653     const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
3654     const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
3655     const __m64 mm_zero = _mm_setzero_si64 ();					\
3656     __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3657 
3658 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
3659 do {										\
3660     /* fetch 2x2 pixel block into 2 mmx registers */				\
3661     __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
3662     __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
3663     /* vertical interpolation */						\
3664     __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
3665     __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
3666     __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
3667     __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
3668     __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
3669     __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
3670     /* calculate horizontal weights */						\
3671     __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
3672 			  _mm_srli_pi16 (mm_x,					\
3673 					 16 - BILINEAR_INTERPOLATION_BITS)));	\
3674     /* horizontal interpolation */						\
3675     __m64 p = _mm_unpacklo_pi16 (lo, hi);					\
3676     __m64 q = _mm_unpackhi_pi16 (lo, hi);					\
3677     vx += unit_x;								\
3678     lo = _mm_madd_pi16 (p, mm_wh);						\
3679     hi = _mm_madd_pi16 (q, mm_wh);						\
3680     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3681     /* shift and pack the result */						\
3682     hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
3683     lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
3684     lo = _mm_packs_pi32 (lo, hi);						\
3685     lo = _mm_packs_pu16 (lo, lo);						\
3686     pix = lo;									\
3687 } while (0)
3688 
3689 #define BILINEAR_SKIP_ONE_PIXEL()						\
3690 do {										\
3691     vx += unit_x;								\
3692     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3693 } while(0)
3694 
3695 static force_inline void
3696 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
3697 					    const uint32_t * mask,
3698 					    const uint32_t * src_top,
3699 					    const uint32_t * src_bottom,
3700 					    int32_t          w,
3701 					    int              wt,
3702 					    int              wb,
3703 					    pixman_fixed_t   vx,
3704 					    pixman_fixed_t   unit_x,
3705 					    pixman_fixed_t   max_vx,
3706 					    pixman_bool_t    zero_src)
3707 {
3708     BILINEAR_DECLARE_VARIABLES;
3709     __m64 pix;
3710 
3711     while (w--)
3712     {
3713 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3714 	store (dst, pix);
3715 	dst++;
3716     }
3717 
3718     _mm_empty ();
3719 }
3720 
3721 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3722 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3723 			       uint32_t, uint32_t, uint32_t,
3724 			       COVER, FLAG_NONE)
3725 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3726 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3727 			       uint32_t, uint32_t, uint32_t,
3728 			       PAD, FLAG_NONE)
3729 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3730 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3731 			       uint32_t, uint32_t, uint32_t,
3732 			       NONE, FLAG_NONE)
3733 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3734 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3735 			       uint32_t, uint32_t, uint32_t,
3736 			       NORMAL, FLAG_NONE)
3737 
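/* Bilinear OVER scanline: interpolated pixels that come out fully
 * transparent are skipped, the rest are blended into the destination
 * with core_combine_over_u_pixel_mmx().
 */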
3738 static force_inline void
3739 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
3740 					     const uint32_t * mask,
3741 					     const uint32_t * src_top,
3742 					     const uint32_t * src_bottom,
3743 					     int32_t          w,
3744 					     int              wt,
3745 					     int              wb,
3746 					     pixman_fixed_t   vx,
3747 					     pixman_fixed_t   unit_x,
3748 					     pixman_fixed_t   max_vx,
3749 					     pixman_bool_t    zero_src)
3750 {
3751     BILINEAR_DECLARE_VARIABLES;
3752     __m64 pix1, pix2;
3753 
3754     while (w)
3755     {
3756 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3757 
3758 	if (!is_zero (pix1))
3759 	{
3760 	    pix2 = load (dst);
3761 	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3762 	}
3763 
3764 	w--;
3765 	dst++;
3766     }
3767 
3768     _mm_empty ();
3769 }
3770 
3771 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3772 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3773 			       uint32_t, uint32_t, uint32_t,
3774 			       COVER, FLAG_NONE)
3775 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3776 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3777 			       uint32_t, uint32_t, uint32_t,
3778 			       PAD, FLAG_NONE)
3779 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3780 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3781 			       uint32_t, uint32_t, uint32_t,
3782 			       NONE, FLAG_NONE)
3783 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3784 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3785 			       uint32_t, uint32_t, uint32_t,
3786 			       NORMAL, FLAG_NONE)
3787 
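/* Bilinear OVER scanline with an a8 mask.  A zero mask byte skips the
 * interpolation entirely; a 0xff mask with an opaque interpolated
 * pixel is stored directly; everything else is blended with
 * in_over().
 */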
3788 static force_inline void
3789 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
3790 					       const uint8_t  * mask,
3791 					       const uint32_t * src_top,
3792 					       const uint32_t * src_bottom,
3793 					       int32_t          w,
3794 					       int              wt,
3795 					       int              wb,
3796 					       pixman_fixed_t   vx,
3797 					       pixman_fixed_t   unit_x,
3798 					       pixman_fixed_t   max_vx,
3799 					       pixman_bool_t    zero_src)
3800 {
3801     BILINEAR_DECLARE_VARIABLES;
3802     __m64 pix1, pix2;
3803     uint32_t m;
3804 
3805     while (w)
3806     {
3807 	m = (uint32_t) *mask++;
3808 
3809 	if (m)
3810 	{
3811 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3812 
3813 	    if (m == 0xff && is_opaque (pix1))
3814 	    {
3815 		store (dst, pix1);
3816 	    }
3817 	    else
3818 	    {
3819 		__m64 ms, md, ma, msa;
3820 
3821 		pix2 = load (dst);
3822 		ma = expand_alpha_rev (to_m64 (m));
3823 		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3824 		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3825 
3826 		msa = expand_alpha (ms);
3827 
3828 		store8888 (dst, (in_over (ms, msa, ma, md)));
3829 	    }
3830 	}
3831 	else
3832 	{
3833 	    BILINEAR_SKIP_ONE_PIXEL ();
3834 	}
3835 
3836 	w--;
3837 	dst++;
3838     }
3839 
3840     _mm_empty ();
3841 }
3842 
3843 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3844 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3845 			       uint32_t, uint8_t, uint32_t,
3846 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
3847 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3848 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3849 			       uint32_t, uint8_t, uint32_t,
3850 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
3851 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3852 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3853 			       uint32_t, uint8_t, uint32_t,
3854 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
3855 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3856 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3857 			       uint32_t, uint8_t, uint32_t,
3858 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3859 
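/* Narrow iterator fetchers: each converts one scanline of the source
 * to a8r8g8b8 in iter->buffer and advances iter->bits by one stride.
 * For x8r8g8b8 that only means forcing the alpha byte to 0xff, eight
 * pixels per iteration in the aligned loop.
 */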
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 7)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    while (w >= 8)
    {
	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}

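/* Iterator fetcher: convert an r5g6b5 scanline to a8r8g8b8, four pixels at
 * a time via expand_4xpacked565 once the destination is aligned.
 */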
static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    while (w >= 4)
    {
	__m64 vsrc = ldq_u ((__m64 *)src);
	__m64 mm0, mm1;

	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

	*(__m64 *)(dst + 0) = mm0;
	*(__m64 *)(dst + 2) = mm1;

	dst += 4;
	src += 4;
	w -= 4;
    }

    while (w)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}

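/* Iterator fetcher: expand an a8 scanline to a8r8g8b8 with the color
 * channels cleared, i.e. each source byte becomes a << 24.
 */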
static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
	*dst++ = (uint32_t)*(src++) << 24;
	w--;
    }

    while (w >= 8)
    {
	__m64 mm0 = ldq_u ((__m64 *)src);

	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);

	*(__m64 *)(dst + 0) = mm3;
	*(__m64 *)(dst + 2) = mm4;
	*(__m64 *)(dst + 4) = mm5;
	*(__m64 *)(dst + 6) = mm6;

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = (uint32_t)*(src++) << 24;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}

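/* Iterators that read scanlines with the MMX fetchers above.  IMAGE_FLAGS
 * restricts them to untransformed bits images whose samples cover the clip.
 */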
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};

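/* (operator, source, mask, destination) combinations that have dedicated
 * MMX composite routines.  The table is handed to
 * _pixman_implementation_create below and is searched before compositing
 * falls back to the more general implementation.
 */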
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, x8r8g8b8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, x8b8g8r8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, a8r8g8b8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, a8b8g8r8, mmx_8888_8888                            ),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888                 ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),

    { PIXMAN_OP_NONE },
};

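/* Build the MMX implementation on top of the given fallback, registering
 * the fast paths, the MMX combiners, the blt/fill helpers and the
 * iterators defined above.
 */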
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->iter_info = mmx_iters;

    return imp;
}

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */