1 /*
2  * Copyright © 2004, 2005 Red Hat, Inc.
3  * Copyright © 2004 Nicholas Miell
4  * Copyright © 2005 Trolltech AS
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of Red Hat not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission.  Red Hat makes no representations about the
13  * suitability of this software for any purpose.  It is provided "as is"
14  * without express or implied warranty.
15  *
16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23  * SOFTWARE.
24  *
25  * Author:  Søren Sandmann (sandmann@redhat.com)
26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28  *
29  * Based on work by Owen Taylor
30  */
31 
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35 
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
37 
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
40 #else
41 #include <mmintrin.h>
42 #endif
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
46 
47 #define no_VERBOSE
48 
49 #ifdef VERBOSE
50 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
51 #else
52 #define CHECKPOINT()
53 #endif
54 
55 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
56 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
57 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
58 _mm_empty (void)
59 {
60 
61 }
62 #endif
63 
64 #ifdef USE_X86_MMX
65 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
66 #  include <xmmintrin.h>
67 # else
68 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
69  * instructions to be generated that we don't want. Just duplicate the
70  * functions we want to use.  */
71 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72 _mm_movemask_pi8 (__m64 __A)
73 {
74     int ret;
75 
76     asm ("pmovmskb %1, %0\n\t"
77 	: "=r" (ret)
78 	: "y" (__A)
79     );
80 
81     return ret;
82 }
83 
84 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
86 {
87     asm ("pmulhuw %1, %0\n\t"
88 	: "+y" (__A)
89 	: "y" (__B)
90     );
91     return __A;
92 }
93 
94 #  ifdef __OPTIMIZE__
95 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
97 {
98     __m64 ret;
99 
100     asm ("pshufw %2, %1, %0\n\t"
101 	: "=y" (ret)
102 	: "y" (__A), "K" (__N)
103     );
104 
105     return ret;
106 }
107 #  else
108 #   define _mm_shuffle_pi16(A, N)					\
109     ({									\
110 	__m64 ret;							\
111 									\
112 	asm ("pshufw %2, %1, %0\n\t"					\
113 	     : "=y" (ret)						\
114 	     : "y" (A), "K" ((const int8_t)N)				\
115 	);								\
116 									\
117 	ret;								\
118     })
119 #  endif
120 # endif
121 #endif
122 
123 #ifndef _MSC_VER
124 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
125  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
126 #endif
127 
128 /* Notes about writing mmx code
129  *
130  * Give memory operands as the second operand. If you give them as the
131  * first, gcc will first load them into a register and then use that
132  * register.
133  *
134  *   ie. use
135  *
136  *         _mm_mullo_pi16 (x, mmx_constant);
137  *
138  *   not
139  *
140  *         _mm_mullo_pi16 (mmx_constant, x);
141  *
142  * Also try to minimize dependencies, i.e. when you need a value, try
143  * to calculate it from a value that was calculated as early as
144  * possible.
145  */
146 
147 /* --------------- MMX primitives ------------------------------------- */
148 
149 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
150  * the name of the member used to access the data.
151  * If __m64 requires using mm_cvt* intrinsics functions to convert between
152  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
153  * If __m64 and uint64_t values can just be cast to each other directly,
154  * then define USE_M64_CASTS.
155  * If __m64 is a double datatype, then define USE_M64_DOUBLE.
156  */
157 #ifdef _MSC_VER
158 # define M64_MEMBER m64_u64
159 #elif defined(__ICC)
160 # define USE_CVT_INTRINSICS
161 #elif defined(USE_LOONGSON_MMI)
162 # define USE_M64_DOUBLE
163 #elif defined(__GNUC__)
164 # define USE_M64_CASTS
165 #elif defined(__SUNPRO_C)
166 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
167 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
168  * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
169  * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
170  */
171 #  define USE_CVT_INTRINSICS
172 # else
173 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
174  * disabled, __m64 is defined as a struct containing "unsigned long long l_".
175  */
176 #  define M64_MEMBER l_
177 # endif
178 #endif
179 
180 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
181 typedef uint64_t mmxdatafield;
182 #else
183 typedef __m64 mmxdatafield;
184 #endif
185 
186 typedef struct
187 {
188     mmxdatafield mmx_4x00ff;
189     mmxdatafield mmx_4x0080;
190     mmxdatafield mmx_565_rgb;
191     mmxdatafield mmx_565_unpack_multiplier;
192     mmxdatafield mmx_565_pack_multiplier;
193     mmxdatafield mmx_565_r;
194     mmxdatafield mmx_565_g;
195     mmxdatafield mmx_565_b;
196     mmxdatafield mmx_packed_565_rb;
197     mmxdatafield mmx_packed_565_g;
198     mmxdatafield mmx_expand_565_g;
199     mmxdatafield mmx_expand_565_b;
200     mmxdatafield mmx_expand_565_r;
201 #ifndef USE_LOONGSON_MMI
202     mmxdatafield mmx_mask_0;
203     mmxdatafield mmx_mask_1;
204     mmxdatafield mmx_mask_2;
205     mmxdatafield mmx_mask_3;
206 #endif
207     mmxdatafield mmx_full_alpha;
208     mmxdatafield mmx_4x0101;
209     mmxdatafield mmx_ff000000;
210 } mmx_data_t;
211 
212 #if defined(_MSC_VER)
213 # define MMXDATA_INIT(field, val) { val ## UI64 }
214 #elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
215 # define MMXDATA_INIT(field, val) field =   { val ## ULL }
216 #else                           /* mmxdatafield is an integral type */
217 # define MMXDATA_INIT(field, val) field =   val ## ULL
218 #endif
219 
220 static const mmx_data_t c =
221 {
222     MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
223     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
224     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
225     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
226     MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
227     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
228     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
229     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
230     MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
231     MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
232     MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
233     MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
234     MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
235 #ifndef USE_LOONGSON_MMI
236     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
237     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
238     MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
239     MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
240 #endif
241     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
242     MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
243     MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
244 };
245 
246 #ifdef USE_CVT_INTRINSICS
247 #    define MC(x) to_m64 (c.mmx_ ## x)
248 #elif defined(USE_M64_CASTS)
249 #    define MC(x) ((__m64)c.mmx_ ## x)
250 #elif defined(USE_M64_DOUBLE)
251 #    define MC(x) (*(__m64 *)&c.mmx_ ## x)
252 #else
253 #    define MC(x) c.mmx_ ## x
254 #endif
255 
256 static force_inline __m64
257 to_m64 (uint64_t x)
258 {
259 #ifdef USE_CVT_INTRINSICS
260     return _mm_cvtsi64_m64 (x);
261 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
262     __m64 res;
263 
264     res.M64_MEMBER = x;
265     return res;
266 #elif defined USE_M64_DOUBLE
267     return *(__m64 *)&x;
268 #else /* USE_M64_CASTS */
269     return (__m64)x;
270 #endif
271 }
272 
273 static force_inline uint64_t
274 to_uint64 (__m64 x)
275 {
276 #ifdef USE_CVT_INTRINSICS
277     return _mm_cvtm64_si64 (x);
278 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
279     uint64_t res = x.M64_MEMBER;
280     return res;
281 #elif defined USE_M64_DOUBLE
282     return *(uint64_t *)&x;
283 #else /* USE_M64_CASTS */
284     return (uint64_t)x;
285 #endif
286 }
287 
288 static force_inline __m64
289 shift (__m64 v,
290        int   s)
291 {
292     if (s > 0)
293 	return _mm_slli_si64 (v, s);
294     else if (s < 0)
295 	return _mm_srli_si64 (v, -s);
296     else
297 	return v;
298 }
299 
300 static force_inline __m64
301 negate (__m64 mask)
302 {
303     return _mm_xor_si64 (mask, MC (4x00ff));
304 }
305 
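/* pix_multiply() computes a rounded a*b/255 for each 16-bit channel lane.
 * A sketch of the arithmetic: with t = a*b + 0x80, taking the high 16 bits
 * of t * 0x0101 is equivalent to (t + (t >> 8)) >> 8, the usual exact
 * division-by-255 rounding trick.
 */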
306 static force_inline __m64
307 pix_multiply (__m64 a, __m64 b)
308 {
309     __m64 res;
310 
311     res = _mm_mullo_pi16 (a, b);
312     res = _mm_adds_pu16 (res, MC (4x0080));
313     res = _mm_mulhi_pu16 (res, MC (4x0101));
314 
315     return res;
316 }
317 
318 static force_inline __m64
319 pix_add (__m64 a, __m64 b)
320 {
321     return _mm_adds_pu8 (a, b);
322 }
323 
324 static force_inline __m64
325 expand_alpha (__m64 pixel)
326 {
327     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
328 }
329 
330 static force_inline __m64
331 expand_alpha_rev (__m64 pixel)
332 {
333     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
334 }
335 
336 static force_inline __m64
337 invert_colors (__m64 pixel)
338 {
339     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
340 }
341 
342 static force_inline __m64
343 over (__m64 src,
344       __m64 srca,
345       __m64 dest)
346 {
347     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
348 }
349 
350 static force_inline __m64
351 over_rev_non_pre (__m64 src, __m64 dest)
352 {
353     __m64 srca = expand_alpha (src);
354     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
355 
356     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
357 }
358 
359 static force_inline __m64
360 in (__m64 src, __m64 mask)
361 {
362     return pix_multiply (src, mask);
363 }
364 
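/* in_over() composites (src IN mask) over dest, using (srca IN mask) as the
 * effective source alpha.  All operands are pixels expanded to 16 bits per
 * channel.
 */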
365 #ifndef _MSC_VER
366 static force_inline __m64
367 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
368 {
369     return over (in (src, mask), pix_multiply (srca, mask), dest);
370 }
371 
372 #else
373 
374 #define in_over(src, srca, mask, dest)					\
375     over (in (src, mask), pix_multiply (srca, mask), dest)
376 
377 #endif
378 
379 /* Elemental unaligned loads */
380 
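/* ldq_u() reads a possibly misaligned __m64 and ldl_u() a possibly
 * misaligned uint32_t.  On iwMMXt two aligned loads are stitched together
 * with _mm_align_si64 (); the generic fallback goes through a packed struct
 * so the compiler emits whatever unaligned-access sequence the target
 * requires.
 */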
381 static force_inline __m64 ldq_u(__m64 *p)
382 {
383 #ifdef USE_X86_MMX
384     /* x86's alignment restrictions are very relaxed. */
385     return *(__m64 *)p;
386 #elif defined USE_ARM_IWMMXT
387     int align = (uintptr_t)p & 7;
388     __m64 *aligned_p;
389     if (align == 0)
390 	return *p;
391     aligned_p = (__m64 *)((uintptr_t)p & ~7);
392     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
393 #else
394     struct __una_u64 { __m64 x __attribute__((packed)); };
395     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
396     return (__m64) ptr->x;
397 #endif
398 }
399 
400 static force_inline uint32_t ldl_u(const uint32_t *p)
401 {
402 #ifdef USE_X86_MMX
403     /* x86's alignment restrictions are very relaxed. */
404     return *p;
405 #else
406     struct __una_u32 { uint32_t x __attribute__((packed)); };
407     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
408     return ptr->x;
409 #endif
410 }
411 
412 static force_inline __m64
413 load (const uint32_t *v)
414 {
415 #ifdef USE_LOONGSON_MMI
416     __m64 ret;
417     asm ("lwc1 %0, %1\n\t"
418 	: "=f" (ret)
419 	: "m" (*v)
420     );
421     return ret;
422 #else
423     return _mm_cvtsi32_si64 (*v);
424 #endif
425 }
426 
427 static force_inline __m64
428 load8888 (const uint32_t *v)
429 {
430 #ifdef USE_LOONGSON_MMI
431     return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
432 #else
433     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
434 #endif
435 }
436 
437 static force_inline __m64
438 load8888u (const uint32_t *v)
439 {
440     uint32_t l = ldl_u (v);
441     return load8888 (&l);
442 }
443 
444 static force_inline __m64
445 pack8888 (__m64 lo, __m64 hi)
446 {
447     return _mm_packs_pu16 (lo, hi);
448 }
449 
450 static force_inline void
451 store (uint32_t *dest, __m64 v)
452 {
453 #ifdef USE_LOONGSON_MMI
454     asm ("swc1 %1, %0\n\t"
455 	: "=m" (*dest)
456 	: "f" (v)
457 	: "memory"
458     );
459 #else
460     *dest = _mm_cvtsi64_si32 (v);
461 #endif
462 }
463 
464 static force_inline void
465 store8888 (uint32_t *dest, __m64 v)
466 {
467     v = pack8888 (v, _mm_setzero_si64 ());
468     store (dest, v);
469 }
470 
471 static force_inline pixman_bool_t
472 is_equal (__m64 a, __m64 b)
473 {
474 #ifdef USE_LOONGSON_MMI
475     /* __m64 is a double; we can compare directly. */
476     return a == b;
477 #else
478     return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
479 #endif
480 }
481 
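/* is_opaque() and is_zero() operate on pixels expanded to 16 bits per
 * channel (00AA 00RR 00GG 00BB).  In the non-Loongson path the alpha value
 * lives in byte 6 of the register, hence the test of bit 0x40 in the
 * movemask result.
 */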
482 static force_inline pixman_bool_t
483 is_opaque (__m64 v)
484 {
485 #ifdef USE_LOONGSON_MMI
486     return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
487 #else
488     __m64 ffs = _mm_cmpeq_pi8 (v, v);
489     return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
490 #endif
491 }
492 
493 static force_inline pixman_bool_t
494 is_zero (__m64 v)
495 {
496     return is_equal (v, _mm_setzero_si64 ());
497 }
498 
499 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
500  *
501  *    00RR00GG00BB
502  *
503  * --- Expanding 565 in the low word ---
504  *
505  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
506  * m = m & (01f0003f001f);
507  * m = m * (008404100840);
508  * m = m >> 8;
509  *
510  * Note the trick here - the top word is shifted by another nibble to
511  * avoid it bumping into the middle word
512  */
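/* Illustrative walk-through: for a pure red 565 pixel 0xf800 in the low
 * word, the shift/or/mask steps above leave 0x000001f000000000; multiplying
 * the R lane by 0x0084 gives 0xffc0, and the final shift right by 8 yields
 * 0x00ff, i.e. the 5-bit value 0x1f expands to the 8-bit value 0xff.
 */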
513 static force_inline __m64
514 expand565 (__m64 pixel, int pos)
515 {
516     __m64 p = pixel;
517     __m64 t1, t2;
518 
519     /* move pixel to low 16 bit and zero the rest */
520 #ifdef USE_LOONGSON_MMI
521     p = loongson_extract_pi16 (p, pos);
522 #else
523     p = shift (shift (p, (3 - pos) * 16), -48);
524 #endif
525 
526     t1 = shift (p, 36 - 11);
527     t2 = shift (p, 16 - 5);
528 
529     p = _mm_or_si64 (t1, p);
530     p = _mm_or_si64 (t2, p);
531     p = _mm_and_si64 (p, MC (565_rgb));
532 
533     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
534     return _mm_srli_pi16 (pixel, 8);
535 }
536 
537 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
538  *
539  *    AARRGGBBAARRGGBB
540  */
541 static force_inline void
542 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
543 {
544     __m64 t0, t1, alpha = _mm_setzero_si64 ();
545     __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
546     __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
547     __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
548     if (full_alpha)
549 	alpha = _mm_cmpeq_pi32 (alpha, alpha);
550 
551     /* Replicate high bits into empty low bits. */
552     r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
553     g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
554     b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
555 
556     r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
557     g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
558     b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
559 
560     t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
561     t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
562 
563     *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
564     *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
565 }
566 
567 static force_inline __m64
568 expand8888 (__m64 in, int pos)
569 {
570     if (pos == 0)
571 	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
572     else
573 	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
574 }
575 
576 static force_inline __m64
577 expandx888 (__m64 in, int pos)
578 {
579     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
580 }
581 
582 static force_inline void
583 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
584 {
585     __m64 v0, v1;
586     expand_4xpacked565 (vin, &v0, &v1, full_alpha);
587     *vout0 = expand8888 (v0, 0);
588     *vout1 = expand8888 (v0, 1);
589     *vout2 = expand8888 (v1, 0);
590     *vout3 = expand8888 (v1, 1);
591 }
592 
593 static force_inline __m64
594 pack_565 (__m64 pixel, __m64 target, int pos)
595 {
596     __m64 p = pixel;
597     __m64 t = target;
598     __m64 r, g, b;
599 
600     r = _mm_and_si64 (p, MC (565_r));
601     g = _mm_and_si64 (p, MC (565_g));
602     b = _mm_and_si64 (p, MC (565_b));
603 
604 #ifdef USE_LOONGSON_MMI
605     r = shift (r, -(32 - 8));
606     g = shift (g, -(16 - 3));
607     b = shift (b, -(0  + 3));
608 
609     p = _mm_or_si64 (r, g);
610     p = _mm_or_si64 (p, b);
611     return loongson_insert_pi16 (t, p, pos);
612 #else
613     r = shift (r, -(32 - 8) + pos * 16);
614     g = shift (g, -(16 - 3) + pos * 16);
615     b = shift (b, -(0  + 3) + pos * 16);
616 
617     if (pos == 0)
618 	t = _mm_and_si64 (t, MC (mask_0));
619     else if (pos == 1)
620 	t = _mm_and_si64 (t, MC (mask_1));
621     else if (pos == 2)
622 	t = _mm_and_si64 (t, MC (mask_2));
623     else if (pos == 3)
624 	t = _mm_and_si64 (t, MC (mask_3));
625 
626     p = _mm_or_si64 (r, t);
627     p = _mm_or_si64 (g, p);
628 
629     return _mm_or_si64 (b, p);
630 #endif
631 }
632 
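/* pack_4xpacked565() packs two __m64s holding four a8r8g8b8 pixels into one
 * __m64 of four r5g6b5 pixels.  Sketch of the pmaddwd trick: with r and b
 * masked to their top 5 bits, _mm_madd_pi16 with 565_pack_multiplier
 * computes (r & 0xf8) * 0x2000 + (b & 0xf8) * 0x0004 per pixel, placing r
 * and b in one step; g is or'ed in separately and the shift right by 5
 * lines everything up as rrrrrggggggbbbbb.
 */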
633 static force_inline __m64
634 pack_4xpacked565 (__m64 a, __m64 b)
635 {
636     __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
637     __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
638 
639     __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
640     __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
641 
642     __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
643     __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
644 
645     t0 = _mm_or_si64 (t0, g0);
646     t1 = _mm_or_si64 (t1, g1);
647 
648     t0 = shift(t0, -5);
649 #ifdef USE_ARM_IWMMXT
650     t1 = shift(t1, -5);
651     return _mm_packs_pu32 (t0, t1);
652 #else
653     t1 = shift(t1, -5 + 16);
654     return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
655 #endif
656 }
657 
658 #ifndef _MSC_VER
659 
660 static force_inline __m64
661 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
662 {
663     return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
664 }
665 
666 static force_inline __m64
667 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
668 {
669     x = pix_multiply (x, a);
670     y = pix_multiply (y, b);
671 
672     return pix_add (x, y);
673 }
674 
675 #else
676 
677 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
678 
679 #define pack_4x565(v0, v1, v2, v3) \
680     pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
681 
682 #define pix_add_mul(x, a, y, b)	 \
683     ( x = pix_multiply (x, a),	 \
684       y = pix_multiply (y, b),	 \
685       pix_add (x, y) )
686 
687 #endif
688 
689 /* --------------- MMX code patch for fbcompose.c --------------------- */
690 
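/* combine() implements the "source IN mask" step shared by the *_u
 * combiners below: the source pixel is expanded to 16 bits per channel and,
 * when a mask is present, multiplied by the expanded mask alpha.
 */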
691 static force_inline __m64
692 combine (const uint32_t *src, const uint32_t *mask)
693 {
694     __m64 vsrc = load8888 (src);
695 
696     if (mask)
697     {
698 	__m64 m = load8888 (mask);
699 
700 	m = expand_alpha (m);
701 	vsrc = pix_multiply (vsrc, m);
702     }
703 
704     return vsrc;
705 }
706 
707 static force_inline __m64
708 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
709 {
710     vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
711 
712     if (is_opaque (vsrc))
713     {
714 	return vsrc;
715     }
716     else if (!is_zero (vsrc))
717     {
718 	return over (vsrc, expand_alpha (vsrc),
719 		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
720     }
721 
722     return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
723 }
724 
725 static void
726 mmx_combine_over_u (pixman_implementation_t *imp,
727                     pixman_op_t              op,
728                     uint32_t *               dest,
729                     const uint32_t *         src,
730                     const uint32_t *         mask,
731                     int                      width)
732 {
733     const uint32_t *end = dest + width;
734 
735     while (dest < end)
736     {
737 	__m64 vsrc = combine (src, mask);
738 
739 	if (is_opaque (vsrc))
740 	{
741 	    store8888 (dest, vsrc);
742 	}
743 	else if (!is_zero (vsrc))
744 	{
745 	    __m64 sa = expand_alpha (vsrc);
746 	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
747 	}
748 
749 	++dest;
750 	++src;
751 	if (mask)
752 	    ++mask;
753     }
754     _mm_empty ();
755 }
756 
757 static void
758 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
759                             pixman_op_t              op,
760                             uint32_t *               dest,
761                             const uint32_t *         src,
762                             const uint32_t *         mask,
763                             int                      width)
764 {
765     const uint32_t *end = dest + width;
766 
767     while (dest < end)
768     {
769 	__m64 d, da;
770 	__m64 s = combine (src, mask);
771 
772 	d = load8888 (dest);
773 	da = expand_alpha (d);
774 	store8888 (dest, over (d, da, s));
775 
776 	++dest;
777 	++src;
778 	if (mask)
779 	    mask++;
780     }
781     _mm_empty ();
782 }
783 
784 static void
785 mmx_combine_in_u (pixman_implementation_t *imp,
786                   pixman_op_t              op,
787                   uint32_t *               dest,
788                   const uint32_t *         src,
789                   const uint32_t *         mask,
790                   int                      width)
791 {
792     const uint32_t *end = dest + width;
793 
794     while (dest < end)
795     {
796 	__m64 a;
797 	__m64 x = combine (src, mask);
798 
799 	a = load8888 (dest);
800 	a = expand_alpha (a);
801 	x = pix_multiply (x, a);
802 
803 	store8888 (dest, x);
804 
805 	++dest;
806 	++src;
807 	if (mask)
808 	    mask++;
809     }
810     _mm_empty ();
811 }
812 
813 static void
814 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
815                           pixman_op_t              op,
816                           uint32_t *               dest,
817                           const uint32_t *         src,
818                           const uint32_t *         mask,
819                           int                      width)
820 {
821     const uint32_t *end = dest + width;
822 
823     while (dest < end)
824     {
825 	__m64 a = combine (src, mask);
826 	__m64 x;
827 
828 	x = load8888 (dest);
829 	a = expand_alpha (a);
830 	x = pix_multiply (x, a);
831 	store8888 (dest, x);
832 
833 	++dest;
834 	++src;
835 	if (mask)
836 	    mask++;
837     }
838     _mm_empty ();
839 }
840 
841 static void
842 mmx_combine_out_u (pixman_implementation_t *imp,
843                    pixman_op_t              op,
844                    uint32_t *               dest,
845                    const uint32_t *         src,
846                    const uint32_t *         mask,
847                    int                      width)
848 {
849     const uint32_t *end = dest + width;
850 
851     while (dest < end)
852     {
853 	__m64 a;
854 	__m64 x = combine (src, mask);
855 
856 	a = load8888 (dest);
857 	a = expand_alpha (a);
858 	a = negate (a);
859 	x = pix_multiply (x, a);
860 	store8888 (dest, x);
861 
862 	++dest;
863 	++src;
864 	if (mask)
865 	    mask++;
866     }
867     _mm_empty ();
868 }
869 
870 static void
871 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
872                            pixman_op_t              op,
873                            uint32_t *               dest,
874                            const uint32_t *         src,
875                            const uint32_t *         mask,
876                            int                      width)
877 {
878     const uint32_t *end = dest + width;
879 
880     while (dest < end)
881     {
882 	__m64 a = combine (src, mask);
883 	__m64 x;
884 
885 	x = load8888 (dest);
886 	a = expand_alpha (a);
887 	a = negate (a);
888 	x = pix_multiply (x, a);
889 
890 	store8888 (dest, x);
891 
892 	++dest;
893 	++src;
894 	if (mask)
895 	    mask++;
896     }
897     _mm_empty ();
898 }
899 
900 static void
901 mmx_combine_atop_u (pixman_implementation_t *imp,
902                     pixman_op_t              op,
903                     uint32_t *               dest,
904                     const uint32_t *         src,
905                     const uint32_t *         mask,
906                     int                      width)
907 {
908     const uint32_t *end = dest + width;
909 
910     while (dest < end)
911     {
912 	__m64 da, d, sia;
913 	__m64 s = combine (src, mask);
914 
915 	d = load8888 (dest);
916 	sia = expand_alpha (s);
917 	sia = negate (sia);
918 	da = expand_alpha (d);
919 	s = pix_add_mul (s, da, d, sia);
920 	store8888 (dest, s);
921 
922 	++dest;
923 	++src;
924 	if (mask)
925 	    mask++;
926     }
927     _mm_empty ();
928 }
929 
930 static void
931 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
932                             pixman_op_t              op,
933                             uint32_t *               dest,
934                             const uint32_t *         src,
935                             const uint32_t *         mask,
936                             int                      width)
937 {
938     const uint32_t *end;
939 
940     end = dest + width;
941 
942     while (dest < end)
943     {
944 	__m64 dia, d, sa;
945 	__m64 s = combine (src, mask);
946 
947 	d = load8888 (dest);
948 	sa = expand_alpha (s);
949 	dia = expand_alpha (d);
950 	dia = negate (dia);
951 	s = pix_add_mul (s, dia, d, sa);
952 	store8888 (dest, s);
953 
954 	++dest;
955 	++src;
956 	if (mask)
957 	    mask++;
958     }
959     _mm_empty ();
960 }
961 
962 static void
963 mmx_combine_xor_u (pixman_implementation_t *imp,
964                    pixman_op_t              op,
965                    uint32_t *               dest,
966                    const uint32_t *         src,
967                    const uint32_t *         mask,
968                    int                      width)
969 {
970     const uint32_t *end = dest + width;
971 
972     while (dest < end)
973     {
974 	__m64 dia, d, sia;
975 	__m64 s = combine (src, mask);
976 
977 	d = load8888 (dest);
978 	sia = expand_alpha (s);
979 	dia = expand_alpha (d);
980 	sia = negate (sia);
981 	dia = negate (dia);
982 	s = pix_add_mul (s, dia, d, sia);
983 	store8888 (dest, s);
984 
985 	++dest;
986 	++src;
987 	if (mask)
988 	    mask++;
989     }
990     _mm_empty ();
991 }
992 
993 static void
994 mmx_combine_add_u (pixman_implementation_t *imp,
995                    pixman_op_t              op,
996                    uint32_t *               dest,
997                    const uint32_t *         src,
998                    const uint32_t *         mask,
999                    int                      width)
1000 {
1001     const uint32_t *end = dest + width;
1002 
1003     while (dest < end)
1004     {
1005 	__m64 d;
1006 	__m64 s = combine (src, mask);
1007 
1008 	d = load8888 (dest);
1009 	s = pix_add (s, d);
1010 	store8888 (dest, s);
1011 
1012 	++dest;
1013 	++src;
1014 	if (mask)
1015 	    mask++;
1016     }
1017     _mm_empty ();
1018 }
1019 
1020 static void
1021 mmx_combine_saturate_u (pixman_implementation_t *imp,
1022                         pixman_op_t              op,
1023                         uint32_t *               dest,
1024                         const uint32_t *         src,
1025                         const uint32_t *         mask,
1026                         int                      width)
1027 {
1028     const uint32_t *end = dest + width;
1029 
1030     while (dest < end)
1031     {
1032 	uint32_t s, sa, da;
1033 	uint32_t d = *dest;
1034 	__m64 ms = combine (src, mask);
1035 	__m64 md = load8888 (dest);
1036 
1037 	store8888(&s, ms);
1038 	da = ~d >> 24;
1039 	sa = s >> 24;
1040 
1041 	if (sa > da)
1042 	{
1043 	    uint32_t quot = DIV_UN8 (da, sa) << 24;
1044 	    __m64 msa = load8888 (&quot);
1045 	    msa = expand_alpha (msa);
1046 	    ms = pix_multiply (ms, msa);
1047 	}
1048 
1049 	md = pix_add (md, ms);
1050 	store8888 (dest, md);
1051 
1052 	++src;
1053 	++dest;
1054 	if (mask)
1055 	    mask++;
1056     }
1057     _mm_empty ();
1058 }
1059 
1060 static void
1061 mmx_combine_src_ca (pixman_implementation_t *imp,
1062                     pixman_op_t              op,
1063                     uint32_t *               dest,
1064                     const uint32_t *         src,
1065                     const uint32_t *         mask,
1066                     int                      width)
1067 {
1068     const uint32_t *end = src + width;
1069 
1070     while (src < end)
1071     {
1072 	__m64 a = load8888 (mask);
1073 	__m64 s = load8888 (src);
1074 
1075 	s = pix_multiply (s, a);
1076 	store8888 (dest, s);
1077 
1078 	++src;
1079 	++mask;
1080 	++dest;
1081     }
1082     _mm_empty ();
1083 }
1084 
1085 static void
1086 mmx_combine_over_ca (pixman_implementation_t *imp,
1087                      pixman_op_t              op,
1088                      uint32_t *               dest,
1089                      const uint32_t *         src,
1090                      const uint32_t *         mask,
1091                      int                      width)
1092 {
1093     const uint32_t *end = src + width;
1094 
1095     while (src < end)
1096     {
1097 	__m64 a = load8888 (mask);
1098 	__m64 s = load8888 (src);
1099 	__m64 d = load8888 (dest);
1100 	__m64 sa = expand_alpha (s);
1101 
1102 	store8888 (dest, in_over (s, sa, a, d));
1103 
1104 	++src;
1105 	++dest;
1106 	++mask;
1107     }
1108     _mm_empty ();
1109 }
1110 
1111 static void
1112 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1113                              pixman_op_t              op,
1114                              uint32_t *               dest,
1115                              const uint32_t *         src,
1116                              const uint32_t *         mask,
1117                              int                      width)
1118 {
1119     const uint32_t *end = src + width;
1120 
1121     while (src < end)
1122     {
1123 	__m64 a = load8888 (mask);
1124 	__m64 s = load8888 (src);
1125 	__m64 d = load8888 (dest);
1126 	__m64 da = expand_alpha (d);
1127 
1128 	store8888 (dest, over (d, da, in (s, a)));
1129 
1130 	++src;
1131 	++dest;
1132 	++mask;
1133     }
1134     _mm_empty ();
1135 }
1136 
1137 static void
1138 mmx_combine_in_ca (pixman_implementation_t *imp,
1139                    pixman_op_t              op,
1140                    uint32_t *               dest,
1141                    const uint32_t *         src,
1142                    const uint32_t *         mask,
1143                    int                      width)
1144 {
1145     const uint32_t *end = src + width;
1146 
1147     while (src < end)
1148     {
1149 	__m64 a = load8888 (mask);
1150 	__m64 s = load8888 (src);
1151 	__m64 d = load8888 (dest);
1152 	__m64 da = expand_alpha (d);
1153 
1154 	s = pix_multiply (s, a);
1155 	s = pix_multiply (s, da);
1156 	store8888 (dest, s);
1157 
1158 	++src;
1159 	++dest;
1160 	++mask;
1161     }
1162     _mm_empty ();
1163 }
1164 
1165 static void
1166 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1167                            pixman_op_t              op,
1168                            uint32_t *               dest,
1169                            const uint32_t *         src,
1170                            const uint32_t *         mask,
1171                            int                      width)
1172 {
1173     const uint32_t *end = src + width;
1174 
1175     while (src < end)
1176     {
1177 	__m64 a = load8888 (mask);
1178 	__m64 s = load8888 (src);
1179 	__m64 d = load8888 (dest);
1180 	__m64 sa = expand_alpha (s);
1181 
1182 	a = pix_multiply (a, sa);
1183 	d = pix_multiply (d, a);
1184 	store8888 (dest, d);
1185 
1186 	++src;
1187 	++dest;
1188 	++mask;
1189     }
1190     _mm_empty ();
1191 }
1192 
1193 static void
1194 mmx_combine_out_ca (pixman_implementation_t *imp,
1195                     pixman_op_t              op,
1196                     uint32_t *               dest,
1197                     const uint32_t *         src,
1198                     const uint32_t *         mask,
1199                     int                      width)
1200 {
1201     const uint32_t *end = src + width;
1202 
1203     while (src < end)
1204     {
1205 	__m64 a = load8888 (mask);
1206 	__m64 s = load8888 (src);
1207 	__m64 d = load8888 (dest);
1208 	__m64 da = expand_alpha (d);
1209 
1210 	da = negate (da);
1211 	s = pix_multiply (s, a);
1212 	s = pix_multiply (s, da);
1213 	store8888 (dest, s);
1214 
1215 	++src;
1216 	++dest;
1217 	++mask;
1218     }
1219     _mm_empty ();
1220 }
1221 
1222 static void
1223 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1224                             pixman_op_t              op,
1225                             uint32_t *               dest,
1226                             const uint32_t *         src,
1227                             const uint32_t *         mask,
1228                             int                      width)
1229 {
1230     const uint32_t *end = src + width;
1231 
1232     while (src < end)
1233     {
1234 	__m64 a = load8888 (mask);
1235 	__m64 s = load8888 (src);
1236 	__m64 d = load8888 (dest);
1237 	__m64 sa = expand_alpha (s);
1238 
1239 	a = pix_multiply (a, sa);
1240 	a = negate (a);
1241 	d = pix_multiply (d, a);
1242 	store8888 (dest, d);
1243 
1244 	++src;
1245 	++dest;
1246 	++mask;
1247     }
1248     _mm_empty ();
1249 }
1250 
1251 static void
1252 mmx_combine_atop_ca (pixman_implementation_t *imp,
1253                      pixman_op_t              op,
1254                      uint32_t *               dest,
1255                      const uint32_t *         src,
1256                      const uint32_t *         mask,
1257                      int                      width)
1258 {
1259     const uint32_t *end = src + width;
1260 
1261     while (src < end)
1262     {
1263 	__m64 a = load8888 (mask);
1264 	__m64 s = load8888 (src);
1265 	__m64 d = load8888 (dest);
1266 	__m64 da = expand_alpha (d);
1267 	__m64 sa = expand_alpha (s);
1268 
1269 	s = pix_multiply (s, a);
1270 	a = pix_multiply (a, sa);
1271 	a = negate (a);
1272 	d = pix_add_mul (d, a, s, da);
1273 	store8888 (dest, d);
1274 
1275 	++src;
1276 	++dest;
1277 	++mask;
1278     }
1279     _mm_empty ();
1280 }
1281 
1282 static void
1283 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1284                              pixman_op_t              op,
1285                              uint32_t *               dest,
1286                              const uint32_t *         src,
1287                              const uint32_t *         mask,
1288                              int                      width)
1289 {
1290     const uint32_t *end = src + width;
1291 
1292     while (src < end)
1293     {
1294 	__m64 a = load8888 (mask);
1295 	__m64 s = load8888 (src);
1296 	__m64 d = load8888 (dest);
1297 	__m64 da = expand_alpha (d);
1298 	__m64 sa = expand_alpha (s);
1299 
1300 	s = pix_multiply (s, a);
1301 	a = pix_multiply (a, sa);
1302 	da = negate (da);
1303 	d = pix_add_mul (d, a, s, da);
1304 	store8888 (dest, d);
1305 
1306 	++src;
1307 	++dest;
1308 	++mask;
1309     }
1310     _mm_empty ();
1311 }
1312 
1313 static void
1314 mmx_combine_xor_ca (pixman_implementation_t *imp,
1315                     pixman_op_t              op,
1316                     uint32_t *               dest,
1317                     const uint32_t *         src,
1318                     const uint32_t *         mask,
1319                     int                      width)
1320 {
1321     const uint32_t *end = src + width;
1322 
1323     while (src < end)
1324     {
1325 	__m64 a = load8888 (mask);
1326 	__m64 s = load8888 (src);
1327 	__m64 d = load8888 (dest);
1328 	__m64 da = expand_alpha (d);
1329 	__m64 sa = expand_alpha (s);
1330 
1331 	s = pix_multiply (s, a);
1332 	a = pix_multiply (a, sa);
1333 	da = negate (da);
1334 	a = negate (a);
1335 	d = pix_add_mul (d, a, s, da);
1336 	store8888 (dest, d);
1337 
1338 	++src;
1339 	++dest;
1340 	++mask;
1341     }
1342     _mm_empty ();
1343 }
1344 
1345 static void
1346 mmx_combine_add_ca (pixman_implementation_t *imp,
1347                     pixman_op_t              op,
1348                     uint32_t *               dest,
1349                     const uint32_t *         src,
1350                     const uint32_t *         mask,
1351                     int                      width)
1352 {
1353     const uint32_t *end = src + width;
1354 
1355     while (src < end)
1356     {
1357 	__m64 a = load8888 (mask);
1358 	__m64 s = load8888 (src);
1359 	__m64 d = load8888 (dest);
1360 
1361 	s = pix_multiply (s, a);
1362 	d = pix_add (s, d);
1363 	store8888 (dest, d);
1364 
1365 	++src;
1366 	++dest;
1367 	++mask;
1368     }
1369     _mm_empty ();
1370 }
1371 
1372 /* ------------- MMX code paths called from fbpict.c -------------------- */
1373 
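/* The composite fast paths below share a common structure: pixels are
 * handled one at a time until dst reaches 8-byte alignment, then in full
 * __m64-sized (or larger) batches, and any leftover pixels at the end of
 * each scanline are again handled singly.
 */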
1374 static void
1375 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1376                            pixman_composite_info_t *info)
1377 {
1378     PIXMAN_COMPOSITE_ARGS (info);
1379     uint32_t src;
1380     uint32_t    *dst_line, *dst;
1381     int32_t w;
1382     int dst_stride;
1383     __m64 vsrc, vsrca;
1384 
1385     CHECKPOINT ();
1386 
1387     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1388 
1389     if (src == 0)
1390 	return;
1391 
1392     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1393 
1394     vsrc = load8888 (&src);
1395     vsrca = expand_alpha (vsrc);
1396 
1397     while (height--)
1398     {
1399 	dst = dst_line;
1400 	dst_line += dst_stride;
1401 	w = width;
1402 
1403 	CHECKPOINT ();
1404 
1405 	while (w && (uintptr_t)dst & 7)
1406 	{
1407 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1408 
1409 	    w--;
1410 	    dst++;
1411 	}
1412 
1413 	while (w >= 2)
1414 	{
1415 	    __m64 vdest;
1416 	    __m64 dest0, dest1;
1417 
1418 	    vdest = *(__m64 *)dst;
1419 
1420 	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1421 	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1422 
1423 	    *(__m64 *)dst = pack8888 (dest0, dest1);
1424 
1425 	    dst += 2;
1426 	    w -= 2;
1427 	}
1428 
1429 	CHECKPOINT ();
1430 
1431 	if (w)
1432 	{
1433 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1434 	}
1435     }
1436 
1437     _mm_empty ();
1438 }
1439 
1440 static void
1441 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1442                            pixman_composite_info_t *info)
1443 {
1444     PIXMAN_COMPOSITE_ARGS (info);
1445     uint32_t src;
1446     uint16_t    *dst_line, *dst;
1447     int32_t w;
1448     int dst_stride;
1449     __m64 vsrc, vsrca;
1450 
1451     CHECKPOINT ();
1452 
1453     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1454 
1455     if (src == 0)
1456 	return;
1457 
1458     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1459 
1460     vsrc = load8888 (&src);
1461     vsrca = expand_alpha (vsrc);
1462 
1463     while (height--)
1464     {
1465 	dst = dst_line;
1466 	dst_line += dst_stride;
1467 	w = width;
1468 
1469 	CHECKPOINT ();
1470 
1471 	while (w && (uintptr_t)dst & 7)
1472 	{
1473 	    uint64_t d = *dst;
1474 	    __m64 vdest = expand565 (to_m64 (d), 0);
1475 
1476 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1477 	    *dst = to_uint64 (vdest);
1478 
1479 	    w--;
1480 	    dst++;
1481 	}
1482 
1483 	while (w >= 4)
1484 	{
1485 	    __m64 vdest = *(__m64 *)dst;
1486 	    __m64 v0, v1, v2, v3;
1487 
1488 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1489 
1490 	    v0 = over (vsrc, vsrca, v0);
1491 	    v1 = over (vsrc, vsrca, v1);
1492 	    v2 = over (vsrc, vsrca, v2);
1493 	    v3 = over (vsrc, vsrca, v3);
1494 
1495 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1496 
1497 	    dst += 4;
1498 	    w -= 4;
1499 	}
1500 
1501 	CHECKPOINT ();
1502 
1503 	while (w)
1504 	{
1505 	    uint64_t d = *dst;
1506 	    __m64 vdest = expand565 (to_m64 (d), 0);
1507 
1508 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1509 	    *dst = to_uint64 (vdest);
1510 
1511 	    w--;
1512 	    dst++;
1513 	}
1514     }
1515 
1516     _mm_empty ();
1517 }
1518 
1519 static void
1520 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1521                                    pixman_composite_info_t *info)
1522 {
1523     PIXMAN_COMPOSITE_ARGS (info);
1524     uint32_t src;
1525     uint32_t    *dst_line;
1526     uint32_t    *mask_line;
1527     int dst_stride, mask_stride;
1528     __m64 vsrc, vsrca;
1529 
1530     CHECKPOINT ();
1531 
1532     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1533 
1534     if (src == 0)
1535 	return;
1536 
1537     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1538     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1539 
1540     vsrc = load8888 (&src);
1541     vsrca = expand_alpha (vsrc);
1542 
1543     while (height--)
1544     {
1545 	int twidth = width;
1546 	uint32_t *p = (uint32_t *)mask_line;
1547 	uint32_t *q = (uint32_t *)dst_line;
1548 
1549 	while (twidth && (uintptr_t)q & 7)
1550 	{
1551 	    uint32_t m = *(uint32_t *)p;
1552 
1553 	    if (m)
1554 	    {
1555 		__m64 vdest = load8888 (q);
1556 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1557 		store8888 (q, vdest);
1558 	    }
1559 
1560 	    twidth--;
1561 	    p++;
1562 	    q++;
1563 	}
1564 
1565 	while (twidth >= 2)
1566 	{
1567 	    uint32_t m0, m1;
1568 	    m0 = *p;
1569 	    m1 = *(p + 1);
1570 
1571 	    if (m0 | m1)
1572 	    {
1573 		__m64 dest0, dest1;
1574 		__m64 vdest = *(__m64 *)q;
1575 
1576 		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1577 		                 expand8888 (vdest, 0));
1578 		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1579 		                 expand8888 (vdest, 1));
1580 
1581 		*(__m64 *)q = pack8888 (dest0, dest1);
1582 	    }
1583 
1584 	    p += 2;
1585 	    q += 2;
1586 	    twidth -= 2;
1587 	}
1588 
1589 	if (twidth)
1590 	{
1591 	    uint32_t m = *(uint32_t *)p;
1592 
1593 	    if (m)
1594 	    {
1595 		__m64 vdest = load8888 (q);
1596 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1597 		store8888 (q, vdest);
1598 	    }
1599 
1600 	    twidth--;
1601 	    p++;
1602 	    q++;
1603 	}
1604 
1605 	dst_line += dst_stride;
1606 	mask_line += mask_stride;
1607     }
1608 
1609     _mm_empty ();
1610 }
1611 
1612 static void
1613 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1614                                 pixman_composite_info_t *info)
1615 {
1616     PIXMAN_COMPOSITE_ARGS (info);
1617     uint32_t    *dst_line, *dst;
1618     uint32_t    *src_line, *src;
1619     uint32_t mask;
1620     __m64 vmask;
1621     int dst_stride, src_stride;
1622     int32_t w;
1623 
1624     CHECKPOINT ();
1625 
1626     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1627     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1628 
1629     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1630     vmask = expand_alpha (load8888 (&mask));
1631 
1632     while (height--)
1633     {
1634 	dst = dst_line;
1635 	dst_line += dst_stride;
1636 	src = src_line;
1637 	src_line += src_stride;
1638 	w = width;
1639 
1640 	while (w && (uintptr_t)dst & 7)
1641 	{
1642 	    __m64 s = load8888 (src);
1643 	    __m64 d = load8888 (dst);
1644 
1645 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1646 
1647 	    w--;
1648 	    dst++;
1649 	    src++;
1650 	}
1651 
1652 	while (w >= 2)
1653 	{
1654 	    __m64 vs = ldq_u ((__m64 *)src);
1655 	    __m64 vd = *(__m64 *)dst;
1656 	    __m64 vsrc0 = expand8888 (vs, 0);
1657 	    __m64 vsrc1 = expand8888 (vs, 1);
1658 
1659 	    *(__m64 *)dst = pack8888 (
1660 	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1661 	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1662 
1663 	    w -= 2;
1664 	    dst += 2;
1665 	    src += 2;
1666 	}
1667 
1668 	if (w)
1669 	{
1670 	    __m64 s = load8888 (src);
1671 	    __m64 d = load8888 (dst);
1672 
1673 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1674 	}
1675     }
1676 
1677     _mm_empty ();
1678 }
1679 
1680 static void
1681 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1682                                 pixman_composite_info_t *info)
1683 {
1684     PIXMAN_COMPOSITE_ARGS (info);
1685     uint32_t *dst_line, *dst;
1686     uint32_t *src_line, *src;
1687     uint32_t mask;
1688     __m64 vmask;
1689     int dst_stride, src_stride;
1690     int32_t w;
1691     __m64 srca;
1692 
1693     CHECKPOINT ();
1694 
1695     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1696     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1697     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1698 
1699     vmask = expand_alpha (load8888 (&mask));
1700     srca = MC (4x00ff);
1701 
1702     while (height--)
1703     {
1704 	dst = dst_line;
1705 	dst_line += dst_stride;
1706 	src = src_line;
1707 	src_line += src_stride;
1708 	w = width;
1709 
1710 	while (w && (uintptr_t)dst & 7)
1711 	{
1712 	    uint32_t ssrc = *src | 0xff000000;
1713 	    __m64 s = load8888 (&ssrc);
1714 	    __m64 d = load8888 (dst);
1715 
1716 	    store8888 (dst, in_over (s, srca, vmask, d));
1717 
1718 	    w--;
1719 	    dst++;
1720 	    src++;
1721 	}
1722 
1723 	while (w >= 16)
1724 	{
1725 	    __m64 vd0 = *(__m64 *)(dst + 0);
1726 	    __m64 vd1 = *(__m64 *)(dst + 2);
1727 	    __m64 vd2 = *(__m64 *)(dst + 4);
1728 	    __m64 vd3 = *(__m64 *)(dst + 6);
1729 	    __m64 vd4 = *(__m64 *)(dst + 8);
1730 	    __m64 vd5 = *(__m64 *)(dst + 10);
1731 	    __m64 vd6 = *(__m64 *)(dst + 12);
1732 	    __m64 vd7 = *(__m64 *)(dst + 14);
1733 
1734 	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1735 	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1736 	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1737 	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1738 	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1739 	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1740 	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1741 	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1742 
1743 	    vd0 = pack8888 (
1744 	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1745 	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1746 
1747 	    vd1 = pack8888 (
1748 	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1749 	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1750 
1751 	    vd2 = pack8888 (
1752 	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1753 	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1754 
1755 	    vd3 = pack8888 (
1756 	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1757 	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1758 
1759 	    vd4 = pack8888 (
1760 	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1761 	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1762 
1763 	    vd5 = pack8888 (
1764 	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1765 	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1766 
1767 	    vd6 = pack8888 (
1768 	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1769 	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1770 
1771 	    vd7 = pack8888 (
1772 	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1773 	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1774 
1775 	    *(__m64 *)(dst + 0) = vd0;
1776 	    *(__m64 *)(dst + 2) = vd1;
1777 	    *(__m64 *)(dst + 4) = vd2;
1778 	    *(__m64 *)(dst + 6) = vd3;
1779 	    *(__m64 *)(dst + 8) = vd4;
1780 	    *(__m64 *)(dst + 10) = vd5;
1781 	    *(__m64 *)(dst + 12) = vd6;
1782 	    *(__m64 *)(dst + 14) = vd7;
1783 
1784 	    w -= 16;
1785 	    dst += 16;
1786 	    src += 16;
1787 	}
1788 
1789 	while (w)
1790 	{
1791 	    uint32_t ssrc = *src | 0xff000000;
1792 	    __m64 s = load8888 (&ssrc);
1793 	    __m64 d = load8888 (dst);
1794 
1795 	    store8888 (dst, in_over (s, srca, vmask, d));
1796 
1797 	    w--;
1798 	    dst++;
1799 	    src++;
1800 	}
1801     }
1802 
1803     _mm_empty ();
1804 }
1805 
1806 static void
1807 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1808                               pixman_composite_info_t *info)
1809 {
1810     PIXMAN_COMPOSITE_ARGS (info);
1811     uint32_t *dst_line, *dst;
1812     uint32_t *src_line, *src;
1813     uint32_t s;
1814     int dst_stride, src_stride;
1815     uint8_t a;
1816     int32_t w;
1817 
1818     CHECKPOINT ();
1819 
1820     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1821     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1822 
1823     while (height--)
1824     {
1825 	dst = dst_line;
1826 	dst_line += dst_stride;
1827 	src = src_line;
1828 	src_line += src_stride;
1829 	w = width;
1830 
1831 	while (w--)
1832 	{
1833 	    s = *src++;
1834 	    a = s >> 24;
1835 
1836 	    if (a == 0xff)
1837 	    {
1838 		*dst = s;
1839 	    }
1840 	    else if (s)
1841 	    {
1842 		__m64 ms, sa;
1843 		ms = load8888 (&s);
1844 		sa = expand_alpha (ms);
1845 		store8888 (dst, over (ms, sa, load8888 (dst)));
1846 	    }
1847 
1848 	    dst++;
1849 	}
1850     }
1851     _mm_empty ();
1852 }
1853 
1854 static void
1855 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1856                               pixman_composite_info_t *info)
1857 {
1858     PIXMAN_COMPOSITE_ARGS (info);
1859     uint16_t    *dst_line, *dst;
1860     uint32_t    *src_line, *src;
1861     int dst_stride, src_stride;
1862     int32_t w;
1863 
1864     CHECKPOINT ();
1865 
1866     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1867     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1868 
1869 #if 0
1870     /* FIXME */
1871     assert (src_image->drawable == mask_image->drawable);
1872 #endif
1873 
1874     while (height--)
1875     {
1876 	dst = dst_line;
1877 	dst_line += dst_stride;
1878 	src = src_line;
1879 	src_line += src_stride;
1880 	w = width;
1881 
1882 	CHECKPOINT ();
1883 
1884 	while (w && (uintptr_t)dst & 7)
1885 	{
1886 	    __m64 vsrc = load8888 (src);
1887 	    uint64_t d = *dst;
1888 	    __m64 vdest = expand565 (to_m64 (d), 0);
1889 
1890 	    vdest = pack_565 (
1891 		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1892 
1893 	    *dst = to_uint64 (vdest);
1894 
1895 	    w--;
1896 	    dst++;
1897 	    src++;
1898 	}
1899 
1900 	CHECKPOINT ();
1901 
1902 	while (w >= 4)
1903 	{
1904 	    __m64 vdest = *(__m64 *)dst;
1905 	    __m64 v0, v1, v2, v3;
1906 	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1907 
1908 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1909 
1910 	    vsrc0 = load8888 ((src + 0));
1911 	    vsrc1 = load8888 ((src + 1));
1912 	    vsrc2 = load8888 ((src + 2));
1913 	    vsrc3 = load8888 ((src + 3));
1914 
1915 	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1916 	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1917 	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1918 	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1919 
1920 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1921 
1922 	    w -= 4;
1923 	    dst += 4;
1924 	    src += 4;
1925 	}
1926 
1927 	CHECKPOINT ();
1928 
1929 	while (w)
1930 	{
1931 	    __m64 vsrc = load8888 (src);
1932 	    uint64_t d = *dst;
1933 	    __m64 vdest = expand565 (to_m64 (d), 0);
1934 
1935 	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1936 
1937 	    *dst = to_uint64 (vdest);
1938 
1939 	    w--;
1940 	    dst++;
1941 	    src++;
1942 	}
1943     }
1944 
1945     _mm_empty ();
1946 }
1947 
1948 static void
1949 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1950                              pixman_composite_info_t *info)
1951 {
1952     PIXMAN_COMPOSITE_ARGS (info);
1953     uint32_t src, srca;
1954     uint32_t *dst_line, *dst;
1955     uint8_t *mask_line, *mask;
1956     int dst_stride, mask_stride;
1957     int32_t w;
1958     __m64 vsrc, vsrca;
1959     uint64_t srcsrc;
1960 
1961     CHECKPOINT ();
1962 
1963     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1964 
1965     srca = src >> 24;
1966     if (src == 0)
1967 	return;
1968 
1969     srcsrc = (uint64_t)src << 32 | src;
1970 
1971     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1972     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1973 
1974     vsrc = load8888 (&src);
1975     vsrca = expand_alpha (vsrc);
1976 
1977     while (height--)
1978     {
1979 	dst = dst_line;
1980 	dst_line += dst_stride;
1981 	mask = mask_line;
1982 	mask_line += mask_stride;
1983 	w = width;
1984 
1985 	CHECKPOINT ();
1986 
1987 	while (w && (uintptr_t)dst & 7)
1988 	{
1989 	    uint64_t m = *mask;
1990 
1991 	    if (m)
1992 	    {
1993 		__m64 vdest = in_over (vsrc, vsrca,
1994 				       expand_alpha_rev (to_m64 (m)),
1995 				       load8888 (dst));
1996 
1997 		store8888 (dst, vdest);
1998 	    }
1999 
2000 	    w--;
2001 	    mask++;
2002 	    dst++;
2003 	}
2004 
2005 	CHECKPOINT ();
2006 
2007 	while (w >= 2)
2008 	{
2009 	    uint64_t m0, m1;
2010 
2011 	    m0 = *mask;
2012 	    m1 = *(mask + 1);
2013 
2014 	    if (srca == 0xff && (m0 & m1) == 0xff)
2015 	    {
2016 		*(uint64_t *)dst = srcsrc;
2017 	    }
2018 	    else if (m0 | m1)
2019 	    {
2020 		__m64 vdest;
2021 		__m64 dest0, dest1;
2022 
2023 		vdest = *(__m64 *)dst;
2024 
2025 		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2026 				 expand8888 (vdest, 0));
2027 		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2028 				 expand8888 (vdest, 1));
2029 
2030 		*(__m64 *)dst = pack8888 (dest0, dest1);
2031 	    }
2032 
2033 	    mask += 2;
2034 	    dst += 2;
2035 	    w -= 2;
2036 	}
2037 
2038 	CHECKPOINT ();
2039 
2040 	if (w)
2041 	{
2042 	    uint64_t m = *mask;
2043 
2044 	    if (m)
2045 	    {
2046 		__m64 vdest = load8888 (dst);
2047 
2048 		vdest = in_over (
2049 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2050 		store8888 (dst, vdest);
2051 	    }
2052 	}
2053     }
2054 
2055     _mm_empty ();
2056 }
2057 
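/*
 * Solid fill for 8, 16 and 32 bpp destinations.  The filler value is first
 * replicated to a full 32-bit pattern and then to a 64-bit MMX value; each
 * scanline writes single bytes/words/dwords until the pointer is 8-byte
 * aligned, streams 64 bytes per iteration (eight 8-byte stores), and
 * finishes with narrower tail stores.
 */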
2058 static pixman_bool_t
2059 mmx_fill (pixman_implementation_t *imp,
2060           uint32_t *               bits,
2061           int                      stride,
2062           int                      bpp,
2063           int                      x,
2064           int                      y,
2065           int                      width,
2066           int                      height,
2067           uint32_t		   filler)
2068 {
2069     uint64_t fill;
2070     __m64 vfill;
2071     uint32_t byte_width;
2072     uint8_t     *byte_line;
2073 
2074 #if defined __GNUC__ && defined USE_X86_MMX
2075     __m64 v1, v2, v3, v4, v5, v6, v7;
2076 #endif
2077 
2078     if (bpp != 16 && bpp != 32 && bpp != 8)
2079 	return FALSE;
2080 
2081     if (bpp == 8)
2082     {
2083 	stride = stride * (int) sizeof (uint32_t) / 1;
2084 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2085 	byte_width = width;
2086 	stride *= 1;
2087         filler = (filler & 0xff) * 0x01010101;
2088     }
2089     else if (bpp == 16)
2090     {
2091 	stride = stride * (int) sizeof (uint32_t) / 2;
2092 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2093 	byte_width = 2 * width;
2094 	stride *= 2;
2095         filler = (filler & 0xffff) * 0x00010001;
2096     }
2097     else
2098     {
2099 	stride = stride * (int) sizeof (uint32_t) / 4;
2100 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2101 	byte_width = 4 * width;
2102 	stride *= 4;
2103     }
2104 
2105     fill = ((uint64_t)filler << 32) | filler;
2106     vfill = to_m64 (fill);
2107 
2108 #if defined __GNUC__ && defined USE_X86_MMX
2109     __asm__ (
2110         "movq		%7,	%0\n"
2111         "movq		%7,	%1\n"
2112         "movq		%7,	%2\n"
2113         "movq		%7,	%3\n"
2114         "movq		%7,	%4\n"
2115         "movq		%7,	%5\n"
2116         "movq		%7,	%6\n"
2117 	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
2118 	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2119 	: "y" (vfill));
2120 #endif
2121 
2122     while (height--)
2123     {
2124 	int w;
2125 	uint8_t *d = byte_line;
2126 
2127 	byte_line += stride;
2128 	w = byte_width;
2129 
2130 	if (w >= 1 && ((uintptr_t)d & 1))
2131 	{
2132 	    *(uint8_t *)d = (filler & 0xff);
2133 	    w--;
2134 	    d++;
2135 	}
2136 
2137 	if (w >= 2 && ((uintptr_t)d & 3))
2138 	{
2139 	    *(uint16_t *)d = filler;
2140 	    w -= 2;
2141 	    d += 2;
2142 	}
2143 
2144 	while (w >= 4 && ((uintptr_t)d & 7))
2145 	{
2146 	    *(uint32_t *)d = filler;
2147 
2148 	    w -= 4;
2149 	    d += 4;
2150 	}
2151 
2152 	while (w >= 64)
2153 	{
2154 #if defined __GNUC__ && defined USE_X86_MMX
2155 	    __asm__ (
2156 	        "movq	%1,	  (%0)\n"
2157 	        "movq	%2,	 8(%0)\n"
2158 	        "movq	%3,	16(%0)\n"
2159 	        "movq	%4,	24(%0)\n"
2160 	        "movq	%5,	32(%0)\n"
2161 	        "movq	%6,	40(%0)\n"
2162 	        "movq	%7,	48(%0)\n"
2163 	        "movq	%8,	56(%0)\n"
2164 		:
2165 		: "r" (d),
2166 		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2167 		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2168 		: "memory");
2169 #else
2170 	    *(__m64*) (d +  0) = vfill;
2171 	    *(__m64*) (d +  8) = vfill;
2172 	    *(__m64*) (d + 16) = vfill;
2173 	    *(__m64*) (d + 24) = vfill;
2174 	    *(__m64*) (d + 32) = vfill;
2175 	    *(__m64*) (d + 40) = vfill;
2176 	    *(__m64*) (d + 48) = vfill;
2177 	    *(__m64*) (d + 56) = vfill;
2178 #endif
2179 	    w -= 64;
2180 	    d += 64;
2181 	}
2182 
2183 	while (w >= 4)
2184 	{
2185 	    *(uint32_t *)d = filler;
2186 
2187 	    w -= 4;
2188 	    d += 4;
2189 	}
2190 	if (w >= 2)
2191 	{
2192 	    *(uint16_t *)d = filler;
2193 	    w -= 2;
2194 	    d += 2;
2195 	}
2196 	if (w >= 1)
2197 	{
2198 	    *(uint8_t *)d = (filler & 0xff);
2199 	    w--;
2200 	    d++;
2201 	}
2202 
2203     }
2204 
2205     _mm_empty ();
2206     return TRUE;
2207 }
2208 
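/*
 * SRC conversion from x8r8g8b8 to r5g6b5: no blending is involved, so each
 * group of four source pixels is simply repacked to 16 bpp with
 * pack_4xpacked565 (), with scalar conversion for the unaligned head and
 * the tail.
 */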
2209 static void
2210 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2211                              pixman_composite_info_t *info)
2212 {
2213     PIXMAN_COMPOSITE_ARGS (info);
2214     uint16_t    *dst_line, *dst;
2215     uint32_t    *src_line, *src, s;
2216     int dst_stride, src_stride;
2217     int32_t w;
2218 
2219     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2220     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2221 
2222     while (height--)
2223     {
2224 	dst = dst_line;
2225 	dst_line += dst_stride;
2226 	src = src_line;
2227 	src_line += src_stride;
2228 	w = width;
2229 
2230 	while (w && (uintptr_t)dst & 7)
2231 	{
2232 	    s = *src++;
2233 	    *dst = convert_8888_to_0565 (s);
2234 	    dst++;
2235 	    w--;
2236 	}
2237 
2238 	while (w >= 4)
2239 	{
2240 	    __m64 vdest;
2241 	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2242 	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2243 
2244 	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
2245 
2246 	    *(__m64 *)dst = vdest;
2247 
2248 	    w -= 4;
2249 	    src += 4;
2250 	    dst += 4;
2251 	}
2252 
2253 	while (w)
2254 	{
2255 	    s = *src++;
2256 	    *dst = convert_8888_to_0565 (s);
2257 	    dst++;
2258 	    w--;
2259 	}
2260     }
2261 
2262     _mm_empty ();
2263 }
2264 
2265 static void
2266 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2267                             pixman_composite_info_t *info)
2268 {
2269     PIXMAN_COMPOSITE_ARGS (info);
2270     uint32_t src, srca;
2271     uint32_t    *dst_line, *dst;
2272     uint8_t     *mask_line, *mask;
2273     int dst_stride, mask_stride;
2274     int32_t w;
2275     __m64 vsrc;
2276     uint64_t srcsrc;
2277 
2278     CHECKPOINT ();
2279 
2280     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2281 
2282     srca = src >> 24;
2283     if (src == 0)
2284     {
2285 	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2286 		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
2287 		  dest_x, dest_y, width, height, 0);
2288 	return;
2289     }
2290 
2291     srcsrc = (uint64_t)src << 32 | src;
2292 
2293     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2294     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2295 
2296     vsrc = load8888 (&src);
2297 
2298     while (height--)
2299     {
2300 	dst = dst_line;
2301 	dst_line += dst_stride;
2302 	mask = mask_line;
2303 	mask_line += mask_stride;
2304 	w = width;
2305 
2306 	CHECKPOINT ();
2307 
2308 	while (w && (uintptr_t)dst & 7)
2309 	{
2310 	    uint64_t m = *mask;
2311 
2312 	    if (m)
2313 	    {
2314 		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2315 
2316 		store8888 (dst, vdest);
2317 	    }
2318 	    else
2319 	    {
2320 		*dst = 0;
2321 	    }
2322 
2323 	    w--;
2324 	    mask++;
2325 	    dst++;
2326 	}
2327 
2328 	CHECKPOINT ();
2329 
2330 	while (w >= 2)
2331 	{
2332 	    uint64_t m0, m1;
2333 	    m0 = *mask;
2334 	    m1 = *(mask + 1);
2335 
2336 	    if (srca == 0xff && (m0 & m1) == 0xff)
2337 	    {
2338 		*(uint64_t *)dst = srcsrc;
2339 	    }
2340 	    else if (m0 | m1)
2341 	    {
2342 		__m64 dest0, dest1;
2343 
2344 		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2345 		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2346 
2347 		*(__m64 *)dst = pack8888 (dest0, dest1);
2348 	    }
2349 	    else
2350 	    {
2351 		*(uint64_t *)dst = 0;
2352 	    }
2353 
2354 	    mask += 2;
2355 	    dst += 2;
2356 	    w -= 2;
2357 	}
2358 
2359 	CHECKPOINT ();
2360 
2361 	if (w)
2362 	{
2363 	    uint64_t m = *mask;
2364 
2365 	    if (m)
2366 	    {
2367 		__m64 vdest = load8888 (dst);
2368 
2369 		vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2370 		store8888 (dst, vdest);
2371 	    }
2372 	    else
2373 	    {
2374 		*dst = 0;
2375 	    }
2376 	}
2377     }
2378 
2379     _mm_empty ();
2380 }
2381 
2382 static void
2383 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2384                              pixman_composite_info_t *info)
2385 {
2386     PIXMAN_COMPOSITE_ARGS (info);
2387     uint32_t src, srca;
2388     uint16_t *dst_line, *dst;
2389     uint8_t *mask_line, *mask;
2390     int dst_stride, mask_stride;
2391     int32_t w;
2392     __m64 vsrc, vsrca, tmp;
2393     __m64 srcsrcsrcsrc;
2394 
2395     CHECKPOINT ();
2396 
2397     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2398 
2399     srca = src >> 24;
2400     if (src == 0)
2401 	return;
2402 
2403     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2404     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2405 
2406     vsrc = load8888 (&src);
2407     vsrca = expand_alpha (vsrc);
2408 
2409     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2410     srcsrcsrcsrc = expand_alpha_rev (tmp);
2411 
2412     while (height--)
2413     {
2414 	dst = dst_line;
2415 	dst_line += dst_stride;
2416 	mask = mask_line;
2417 	mask_line += mask_stride;
2418 	w = width;
2419 
2420 	CHECKPOINT ();
2421 
2422 	while (w && (uintptr_t)dst & 7)
2423 	{
2424 	    uint64_t m = *mask;
2425 
2426 	    if (m)
2427 	    {
2428 		uint64_t d = *dst;
2429 		__m64 vd = to_m64 (d);
2430 		__m64 vdest = in_over (
2431 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2432 
2433 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2434 		*dst = to_uint64 (vd);
2435 	    }
2436 
2437 	    w--;
2438 	    mask++;
2439 	    dst++;
2440 	}
2441 
2442 	CHECKPOINT ();
2443 
2444 	while (w >= 4)
2445 	{
2446 	    uint64_t m0, m1, m2, m3;
2447 	    m0 = *mask;
2448 	    m1 = *(mask + 1);
2449 	    m2 = *(mask + 2);
2450 	    m3 = *(mask + 3);
2451 
2452 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2453 	    {
2454 		*(__m64 *)dst = srcsrcsrcsrc;
2455 	    }
2456 	    else if (m0 | m1 | m2 | m3)
2457 	    {
2458 		__m64 vdest = *(__m64 *)dst;
2459 		__m64 v0, v1, v2, v3;
2460 		__m64 vm0, vm1, vm2, vm3;
2461 
2462 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2463 
2464 		vm0 = to_m64 (m0);
2465 		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2466 
2467 		vm1 = to_m64 (m1);
2468 		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2469 
2470 		vm2 = to_m64 (m2);
2471 		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2472 
2473 		vm3 = to_m64 (m3);
2474 		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2475 
2476 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2477 	    }
2478 
2479 	    w -= 4;
2480 	    mask += 4;
2481 	    dst += 4;
2482 	}
2483 
2484 	CHECKPOINT ();
2485 
2486 	while (w)
2487 	{
2488 	    uint64_t m = *mask;
2489 
2490 	    if (m)
2491 	    {
2492 		uint64_t d = *dst;
2493 		__m64 vd = to_m64 (d);
2494 		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2495 				       expand565 (vd, 0));
2496 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2497 		*dst = to_uint64 (vd);
2498 	    }
2499 
2500 	    w--;
2501 	    mask++;
2502 	    dst++;
2503 	}
2504     }
2505 
2506     _mm_empty ();
2507 }
2508 
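/*
 * OVER from a "pixbuf"-style source onto r5g6b5.  The source is treated as
 * non-premultiplied data with R and B swapped relative to the destination:
 * invert_colors () performs the channel swap and over_rev_non_pre ()
 * multiplies the source by its own alpha before blending.  Fully opaque
 * 4-pixel groups skip the blend and are packed directly.
 */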
2509 static void
2510 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2511                                 pixman_composite_info_t *info)
2512 {
2513     PIXMAN_COMPOSITE_ARGS (info);
2514     uint16_t    *dst_line, *dst;
2515     uint32_t    *src_line, *src;
2516     int dst_stride, src_stride;
2517     int32_t w;
2518 
2519     CHECKPOINT ();
2520 
2521     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2522     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2523 
2524 #if 0
2525     /* FIXME */
2526     assert (src_image->drawable == mask_image->drawable);
2527 #endif
2528 
2529     while (height--)
2530     {
2531 	dst = dst_line;
2532 	dst_line += dst_stride;
2533 	src = src_line;
2534 	src_line += src_stride;
2535 	w = width;
2536 
2537 	CHECKPOINT ();
2538 
2539 	while (w && (uintptr_t)dst & 7)
2540 	{
2541 	    __m64 vsrc = load8888 (src);
2542 	    uint64_t d = *dst;
2543 	    __m64 vdest = expand565 (to_m64 (d), 0);
2544 
2545 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2546 
2547 	    *dst = to_uint64 (vdest);
2548 
2549 	    w--;
2550 	    dst++;
2551 	    src++;
2552 	}
2553 
2554 	CHECKPOINT ();
2555 
2556 	while (w >= 4)
2557 	{
2558 	    uint32_t s0, s1, s2, s3;
2559 	    unsigned char a0, a1, a2, a3;
2560 
2561 	    s0 = *src;
2562 	    s1 = *(src + 1);
2563 	    s2 = *(src + 2);
2564 	    s3 = *(src + 3);
2565 
2566 	    a0 = (s0 >> 24);
2567 	    a1 = (s1 >> 24);
2568 	    a2 = (s2 >> 24);
2569 	    a3 = (s3 >> 24);
2570 
2571 	    if ((a0 & a1 & a2 & a3) == 0xFF)
2572 	    {
2573 		__m64 v0 = invert_colors (load8888 (&s0));
2574 		__m64 v1 = invert_colors (load8888 (&s1));
2575 		__m64 v2 = invert_colors (load8888 (&s2));
2576 		__m64 v3 = invert_colors (load8888 (&s3));
2577 
2578 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2579 	    }
2580 	    else if (s0 | s1 | s2 | s3)
2581 	    {
2582 		__m64 vdest = *(__m64 *)dst;
2583 		__m64 v0, v1, v2, v3;
2584 
2585 		__m64 vsrc0 = load8888 (&s0);
2586 		__m64 vsrc1 = load8888 (&s1);
2587 		__m64 vsrc2 = load8888 (&s2);
2588 		__m64 vsrc3 = load8888 (&s3);
2589 
2590 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2591 
2592 		v0 = over_rev_non_pre (vsrc0, v0);
2593 		v1 = over_rev_non_pre (vsrc1, v1);
2594 		v2 = over_rev_non_pre (vsrc2, v2);
2595 		v3 = over_rev_non_pre (vsrc3, v3);
2596 
2597 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2598 	    }
2599 
2600 	    w -= 4;
2601 	    dst += 4;
2602 	    src += 4;
2603 	}
2604 
2605 	CHECKPOINT ();
2606 
2607 	while (w)
2608 	{
2609 	    __m64 vsrc = load8888 (src);
2610 	    uint64_t d = *dst;
2611 	    __m64 vdest = expand565 (to_m64 (d), 0);
2612 
2613 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2614 
2615 	    *dst = to_uint64 (vdest);
2616 
2617 	    w--;
2618 	    dst++;
2619 	    src++;
2620 	}
2621     }
2622 
2623     _mm_empty ();
2624 }
2625 
2626 static void
2627 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2628                                 pixman_composite_info_t *info)
2629 {
2630     PIXMAN_COMPOSITE_ARGS (info);
2631     uint32_t    *dst_line, *dst;
2632     uint32_t    *src_line, *src;
2633     int dst_stride, src_stride;
2634     int32_t w;
2635 
2636     CHECKPOINT ();
2637 
2638     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2639     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2640 
2641 #if 0
2642     /* FIXME */
2643     assert (src_image->drawable == mask_image->drawable);
2644 #endif
2645 
2646     while (height--)
2647     {
2648 	dst = dst_line;
2649 	dst_line += dst_stride;
2650 	src = src_line;
2651 	src_line += src_stride;
2652 	w = width;
2653 
2654 	while (w && (uintptr_t)dst & 7)
2655 	{
2656 	    __m64 s = load8888 (src);
2657 	    __m64 d = load8888 (dst);
2658 
2659 	    store8888 (dst, over_rev_non_pre (s, d));
2660 
2661 	    w--;
2662 	    dst++;
2663 	    src++;
2664 	}
2665 
2666 	while (w >= 2)
2667 	{
2668 	    uint32_t s0, s1;
2669 	    unsigned char a0, a1;
2670 	    __m64 d0, d1;
2671 
2672 	    s0 = *src;
2673 	    s1 = *(src + 1);
2674 
2675 	    a0 = (s0 >> 24);
2676 	    a1 = (s1 >> 24);
2677 
2678 	    if ((a0 & a1) == 0xFF)
2679 	    {
2680 		d0 = invert_colors (load8888 (&s0));
2681 		d1 = invert_colors (load8888 (&s1));
2682 
2683 		*(__m64 *)dst = pack8888 (d0, d1);
2684 	    }
2685 	    else if (s0 | s1)
2686 	    {
2687 		__m64 vdest = *(__m64 *)dst;
2688 
2689 		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2690 		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2691 
2692 		*(__m64 *)dst = pack8888 (d0, d1);
2693 	    }
2694 
2695 	    w -= 2;
2696 	    dst += 2;
2697 	    src += 2;
2698 	}
2699 
2700 	if (w)
2701 	{
2702 	    __m64 s = load8888 (src);
2703 	    __m64 d = load8888 (dst);
2704 
2705 	    store8888 (dst, over_rev_non_pre (s, d));
2706 	}
2707     }
2708 
2709     _mm_empty ();
2710 }
2711 
2712 static void
2713 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2714                                    pixman_composite_info_t *info)
2715 {
2716     PIXMAN_COMPOSITE_ARGS (info);
2717     uint32_t src;
2718     uint16_t    *dst_line;
2719     uint32_t    *mask_line;
2720     int dst_stride, mask_stride;
2721     __m64 vsrc, vsrca;
2722 
2723     CHECKPOINT ();
2724 
2725     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2726 
2727     if (src == 0)
2728 	return;
2729 
2730     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2731     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2732 
2733     vsrc = load8888 (&src);
2734     vsrca = expand_alpha (vsrc);
2735 
2736     while (height--)
2737     {
2738 	int twidth = width;
2739 	uint32_t *p = (uint32_t *)mask_line;
2740 	uint16_t *q = (uint16_t *)dst_line;
2741 
2742 	while (twidth && ((uintptr_t)q & 7))
2743 	{
2744 	    uint32_t m = *(uint32_t *)p;
2745 
2746 	    if (m)
2747 	    {
2748 		uint64_t d = *q;
2749 		__m64 vdest = expand565 (to_m64 (d), 0);
2750 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2751 		*q = to_uint64 (vdest);
2752 	    }
2753 
2754 	    twidth--;
2755 	    p++;
2756 	    q++;
2757 	}
2758 
2759 	while (twidth >= 4)
2760 	{
2761 	    uint32_t m0, m1, m2, m3;
2762 
2763 	    m0 = *p;
2764 	    m1 = *(p + 1);
2765 	    m2 = *(p + 2);
2766 	    m3 = *(p + 3);
2767 
2768 	    if ((m0 | m1 | m2 | m3))
2769 	    {
2770 		__m64 vdest = *(__m64 *)q;
2771 		__m64 v0, v1, v2, v3;
2772 
2773 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2774 
2775 		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2776 		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2777 		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2778 		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2779 
2780 		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2781 	    }
2782 	    twidth -= 4;
2783 	    p += 4;
2784 	    q += 4;
2785 	}
2786 
2787 	while (twidth)
2788 	{
2789 	    uint32_t m;
2790 
2791 	    m = *(uint32_t *)p;
2792 	    if (m)
2793 	    {
2794 		uint64_t d = *q;
2795 		__m64 vdest = expand565 (to_m64 (d), 0);
2796 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2797 		*q = to_uint64 (vdest);
2798 	    }
2799 
2800 	    twidth--;
2801 	    p++;
2802 	    q++;
2803 	}
2804 
2805 	mask_line += mask_stride;
2806 	dst_line += dst_stride;
2807     }
2808 
2809     _mm_empty ();
2810 }
2811 
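/*
 * IN for a solid source with an a8 mask onto an a8 destination, i.e.
 * dest = src.alpha * mask * dest.  The scalar head and tail apply MUL_UN8
 * twice per pixel; the vector loop handles four mask bytes and four
 * destination bytes at once by treating each group of four as a single
 * 8888 pixel.
 */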
2812 static void
2813 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2814                         pixman_composite_info_t *info)
2815 {
2816     PIXMAN_COMPOSITE_ARGS (info);
2817     uint8_t *dst_line, *dst;
2818     uint8_t *mask_line, *mask;
2819     int dst_stride, mask_stride;
2820     int32_t w;
2821     uint32_t src;
2822     uint8_t sa;
2823     __m64 vsrc, vsrca;
2824 
2825     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2826     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2827 
2828     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2829 
2830     sa = src >> 24;
2831 
2832     vsrc = load8888 (&src);
2833     vsrca = expand_alpha (vsrc);
2834 
2835     while (height--)
2836     {
2837 	dst = dst_line;
2838 	dst_line += dst_stride;
2839 	mask = mask_line;
2840 	mask_line += mask_stride;
2841 	w = width;
2842 
2843 	while (w && (uintptr_t)dst & 7)
2844 	{
2845 	    uint16_t tmp;
2846 	    uint8_t a;
2847 	    uint32_t m, d;
2848 
2849 	    a = *mask++;
2850 	    d = *dst;
2851 
2852 	    m = MUL_UN8 (sa, a, tmp);
2853 	    d = MUL_UN8 (m, d, tmp);
2854 
2855 	    *dst++ = d;
2856 	    w--;
2857 	}
2858 
2859 	while (w >= 4)
2860 	{
2861 	    __m64 vmask;
2862 	    __m64 vdest;
2863 
2864 	    vmask = load8888u ((uint32_t *)mask);
2865 	    vdest = load8888 ((uint32_t *)dst);
2866 
2867 	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2868 
2869 	    dst += 4;
2870 	    mask += 4;
2871 	    w -= 4;
2872 	}
2873 
2874 	while (w--)
2875 	{
2876 	    uint16_t tmp;
2877 	    uint8_t a;
2878 	    uint32_t m, d;
2879 
2880 	    a = *mask++;
2881 	    d = *dst;
2882 
2883 	    m = MUL_UN8 (sa, a, tmp);
2884 	    d = MUL_UN8 (m, d, tmp);
2885 
2886 	    *dst++ = d;
2887 	}
2888     }
2889 
2890     _mm_empty ();
2891 }
2892 
2893 static void
2894 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2895                       pixman_composite_info_t *info)
2896 {
2897     PIXMAN_COMPOSITE_ARGS (info);
2898     uint8_t     *dst_line, *dst;
2899     uint8_t     *src_line, *src;
2900     int src_stride, dst_stride;
2901     int32_t w;
2902 
2903     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2904     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2905 
2906     while (height--)
2907     {
2908 	dst = dst_line;
2909 	dst_line += dst_stride;
2910 	src = src_line;
2911 	src_line += src_stride;
2912 	w = width;
2913 
2914 	while (w && (uintptr_t)dst & 3)
2915 	{
2916 	    uint8_t s, d;
2917 	    uint16_t tmp;
2918 
2919 	    s = *src;
2920 	    d = *dst;
2921 
2922 	    *dst = MUL_UN8 (s, d, tmp);
2923 
2924 	    src++;
2925 	    dst++;
2926 	    w--;
2927 	}
2928 
2929 	while (w >= 4)
2930 	{
2931 	    uint32_t *s = (uint32_t *)src;
2932 	    uint32_t *d = (uint32_t *)dst;
2933 
2934 	    store8888 (d, in (load8888u (s), load8888 (d)));
2935 
2936 	    w -= 4;
2937 	    dst += 4;
2938 	    src += 4;
2939 	}
2940 
2941 	while (w--)
2942 	{
2943 	    uint8_t s, d;
2944 	    uint16_t tmp;
2945 
2946 	    s = *src;
2947 	    d = *dst;
2948 
2949 	    *dst = MUL_UN8 (s, d, tmp);
2950 
2951 	    src++;
2952 	    dst++;
2953 	}
2954     }
2955 
2956     _mm_empty ();
2957 }
2958 
2959 static void
2960 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2961 			 pixman_composite_info_t *info)
2962 {
2963     PIXMAN_COMPOSITE_ARGS (info);
2964     uint8_t     *dst_line, *dst;
2965     uint8_t     *mask_line, *mask;
2966     int dst_stride, mask_stride;
2967     int32_t w;
2968     uint32_t src;
2969     uint8_t sa;
2970     __m64 vsrc, vsrca;
2971 
2972     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2973     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2974 
2975     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2976 
2977     sa = src >> 24;
2978 
2979     if (src == 0)
2980 	return;
2981 
2982     vsrc = load8888 (&src);
2983     vsrca = expand_alpha (vsrc);
2984 
2985     while (height--)
2986     {
2987 	dst = dst_line;
2988 	dst_line += dst_stride;
2989 	mask = mask_line;
2990 	mask_line += mask_stride;
2991 	w = width;
2992 
2993 	while (w && (uintptr_t)dst & 3)
2994 	{
2995 	    uint16_t tmp;
2996 	    uint16_t a;
2997 	    uint32_t m, d;
2998 	    uint32_t r;
2999 
3000 	    a = *mask++;
3001 	    d = *dst;
3002 
3003 	    m = MUL_UN8 (sa, a, tmp);
3004 	    r = ADD_UN8 (m, d, tmp);
3005 
3006 	    *dst++ = r;
3007 	    w--;
3008 	}
3009 
3010 	while (w >= 4)
3011 	{
3012 	    __m64 vmask;
3013 	    __m64 vdest;
3014 
3015 	    vmask = load8888u ((uint32_t *)mask);
3016 	    vdest = load8888 ((uint32_t *)dst);
3017 
3018 	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3019 
3020 	    dst += 4;
3021 	    mask += 4;
3022 	    w -= 4;
3023 	}
3024 
3025 	while (w--)
3026 	{
3027 	    uint16_t tmp;
3028 	    uint16_t a;
3029 	    uint32_t m, d;
3030 	    uint32_t r;
3031 
3032 	    a = *mask++;
3033 	    d = *dst;
3034 
3035 	    m = MUL_UN8 (sa, a, tmp);
3036 	    r = ADD_UN8 (m, d, tmp);
3037 
3038 	    *dst++ = r;
3039 	}
3040     }
3041 
3042     _mm_empty ();
3043 }
3044 
3045 static void
3046 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3047 		       pixman_composite_info_t *info)
3048 {
3049     PIXMAN_COMPOSITE_ARGS (info);
3050     uint8_t *dst_line, *dst;
3051     uint8_t *src_line, *src;
3052     int dst_stride, src_stride;
3053     int32_t w;
3054     uint8_t s, d;
3055     uint16_t t;
3056 
3057     CHECKPOINT ();
3058 
3059     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3060     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3061 
3062     while (height--)
3063     {
3064 	dst = dst_line;
3065 	dst_line += dst_stride;
3066 	src = src_line;
3067 	src_line += src_stride;
3068 	w = width;
3069 
3070 	while (w && (uintptr_t)dst & 7)
3071 	{
3072 	    s = *src;
3073 	    d = *dst;
3074 	    t = d + s;
3075 	    s = t | (0 - (t >> 8));
3076 	    *dst = s;
3077 
3078 	    dst++;
3079 	    src++;
3080 	    w--;
3081 	}
3082 
3083 	while (w >= 8)
3084 	{
3085 	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3086 	    dst += 8;
3087 	    src += 8;
3088 	    w -= 8;
3089 	}
3090 
3091 	while (w)
3092 	{
3093 	    s = *src;
3094 	    d = *dst;
3095 	    t = d + s;
3096 	    s = t | (0 - (t >> 8));
3097 	    *dst = s;
3098 
3099 	    dst++;
3100 	    src++;
3101 	    w--;
3102 	}
3103     }
3104 
3105     _mm_empty ();
3106 }
3107 
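/*
 * Saturating ADD of two r5g6b5 images.  MMX has no saturating add that
 * respects 5:6:5 channel boundaries, so four pixels at a time are expanded
 * to 8888 with expand_4xpacked565 (), added with _mm_adds_pu8 () and packed
 * back down; the scalar head and tail do the same via
 * convert_0565_to_8888 ().
 */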
3108 static void
3109 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3110                              pixman_composite_info_t *info)
3111 {
3112     PIXMAN_COMPOSITE_ARGS (info);
3113     uint16_t    *dst_line, *dst;
3114     uint32_t	d;
3115     uint16_t    *src_line, *src;
3116     uint32_t	s;
3117     int dst_stride, src_stride;
3118     int32_t w;
3119 
3120     CHECKPOINT ();
3121 
3122     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3123     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3124 
3125     while (height--)
3126     {
3127 	dst = dst_line;
3128 	dst_line += dst_stride;
3129 	src = src_line;
3130 	src_line += src_stride;
3131 	w = width;
3132 
3133 	while (w && (uintptr_t)dst & 7)
3134 	{
3135 	    s = *src++;
3136 	    if (s)
3137 	    {
3138 		d = *dst;
3139 		s = convert_0565_to_8888 (s);
3140 		if (d)
3141 		{
3142 		    d = convert_0565_to_8888 (d);
3143 		    UN8x4_ADD_UN8x4 (s, d);
3144 		}
3145 		*dst = convert_8888_to_0565 (s);
3146 	    }
3147 	    dst++;
3148 	    w--;
3149 	}
3150 
3151 	while (w >= 4)
3152 	{
3153 	    __m64 vdest = *(__m64 *)dst;
3154 	    __m64 vsrc = ldq_u ((__m64 *)src);
3155 	    __m64 vd0, vd1;
3156 	    __m64 vs0, vs1;
3157 
3158 	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3159 	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3160 
3161 	    vd0 = _mm_adds_pu8 (vd0, vs0);
3162 	    vd1 = _mm_adds_pu8 (vd1, vs1);
3163 
3164 	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3165 
3166 	    dst += 4;
3167 	    src += 4;
3168 	    w -= 4;
3169 	}
3170 
3171 	while (w--)
3172 	{
3173 	    s = *src++;
3174 	    if (s)
3175 	    {
3176 		d = *dst;
3177 		s = convert_0565_to_8888 (s);
3178 		if (d)
3179 		{
3180 		    d = convert_0565_to_8888 (d);
3181 		    UN8x4_ADD_UN8x4 (s, d);
3182 		}
3183 		*dst = convert_8888_to_0565 (s);
3184 	    }
3185 	    dst++;
3186 	}
3187     }
3188 
3189     _mm_empty ();
3190 }
3191 
3192 static void
3193 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3194                              pixman_composite_info_t *info)
3195 {
3196     PIXMAN_COMPOSITE_ARGS (info);
3197     uint32_t    *dst_line, *dst;
3198     uint32_t    *src_line, *src;
3199     int dst_stride, src_stride;
3200     int32_t w;
3201 
3202     CHECKPOINT ();
3203 
3204     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3205     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3206 
3207     while (height--)
3208     {
3209 	dst = dst_line;
3210 	dst_line += dst_stride;
3211 	src = src_line;
3212 	src_line += src_stride;
3213 	w = width;
3214 
3215 	while (w && (uintptr_t)dst & 7)
3216 	{
3217 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3218 	                              load ((const uint32_t *)dst)));
3219 	    dst++;
3220 	    src++;
3221 	    w--;
3222 	}
3223 
3224 	while (w >= 2)
3225 	{
3226 	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3227 	    dst += 2;
3228 	    src += 2;
3229 	    w -= 2;
3230 	}
3231 
3232 	if (w)
3233 	{
3234 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3235 	                              load ((const uint32_t *)dst)));
3236 
3237 	}
3238     }
3239 
3240     _mm_empty ();
3241 }
3242 
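/*
 * Plain copy (blt) of 16 or 32 bpp rectangles.  Each scanline is first
 * aligned to an 8-byte destination boundary with byte/word/dword stores,
 * then copied 64 bytes at a time through the eight MMX registers (or via
 * ldq_u () loads when the inline assembly path is unavailable), with
 * narrower stores for the remainder.
 */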
3243 static pixman_bool_t
3244 mmx_blt (pixman_implementation_t *imp,
3245          uint32_t *               src_bits,
3246          uint32_t *               dst_bits,
3247          int                      src_stride,
3248          int                      dst_stride,
3249          int                      src_bpp,
3250          int                      dst_bpp,
3251          int                      src_x,
3252          int                      src_y,
3253          int                      dest_x,
3254          int                      dest_y,
3255          int                      width,
3256          int                      height)
3257 {
3258     uint8_t *   src_bytes;
3259     uint8_t *   dst_bytes;
3260     int byte_width;
3261 
3262     if (src_bpp != dst_bpp)
3263 	return FALSE;
3264 
3265     if (src_bpp == 16)
3266     {
3267 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3268 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3269 	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3270 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3271 	byte_width = 2 * width;
3272 	src_stride *= 2;
3273 	dst_stride *= 2;
3274     }
3275     else if (src_bpp == 32)
3276     {
3277 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3278 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3279 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3280 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3281 	byte_width = 4 * width;
3282 	src_stride *= 4;
3283 	dst_stride *= 4;
3284     }
3285     else
3286     {
3287 	return FALSE;
3288     }
3289 
3290     while (height--)
3291     {
3292 	int w;
3293 	uint8_t *s = src_bytes;
3294 	uint8_t *d = dst_bytes;
3295 	src_bytes += src_stride;
3296 	dst_bytes += dst_stride;
3297 	w = byte_width;
3298 
3299 	if (w >= 1 && ((uintptr_t)d & 1))
3300 	{
3301 	    *(uint8_t *)d = *(uint8_t *)s;
3302 	    w -= 1;
3303 	    s += 1;
3304 	    d += 1;
3305 	}
3306 
3307 	if (w >= 2 && ((uintptr_t)d & 3))
3308 	{
3309 	    *(uint16_t *)d = *(uint16_t *)s;
3310 	    w -= 2;
3311 	    s += 2;
3312 	    d += 2;
3313 	}
3314 
3315 	while (w >= 4 && ((uintptr_t)d & 7))
3316 	{
3317 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3318 
3319 	    w -= 4;
3320 	    s += 4;
3321 	    d += 4;
3322 	}
3323 
3324 	while (w >= 64)
3325 	{
3326 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3327 	    __asm__ (
3328 	        "movq	  (%1),	  %%mm0\n"
3329 	        "movq	 8(%1),	  %%mm1\n"
3330 	        "movq	16(%1),	  %%mm2\n"
3331 	        "movq	24(%1),	  %%mm3\n"
3332 	        "movq	32(%1),	  %%mm4\n"
3333 	        "movq	40(%1),	  %%mm5\n"
3334 	        "movq	48(%1),	  %%mm6\n"
3335 	        "movq	56(%1),	  %%mm7\n"
3336 
3337 	        "movq	%%mm0,	  (%0)\n"
3338 	        "movq	%%mm1,	 8(%0)\n"
3339 	        "movq	%%mm2,	16(%0)\n"
3340 	        "movq	%%mm3,	24(%0)\n"
3341 	        "movq	%%mm4,	32(%0)\n"
3342 	        "movq	%%mm5,	40(%0)\n"
3343 	        "movq	%%mm6,	48(%0)\n"
3344 	        "movq	%%mm7,	56(%0)\n"
3345 		:
3346 		: "r" (d), "r" (s)
3347 		: "memory",
3348 		  "%mm0", "%mm1", "%mm2", "%mm3",
3349 		  "%mm4", "%mm5", "%mm6", "%mm7");
3350 #else
3351 	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
3352 	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
3353 	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
3354 	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
3355 	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
3356 	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
3357 	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
3358 	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
3359 	    *(__m64 *)(d + 0)  = v0;
3360 	    *(__m64 *)(d + 8)  = v1;
3361 	    *(__m64 *)(d + 16) = v2;
3362 	    *(__m64 *)(d + 24) = v3;
3363 	    *(__m64 *)(d + 32) = v4;
3364 	    *(__m64 *)(d + 40) = v5;
3365 	    *(__m64 *)(d + 48) = v6;
3366 	    *(__m64 *)(d + 56) = v7;
3367 #endif
3368 
3369 	    w -= 64;
3370 	    s += 64;
3371 	    d += 64;
3372 	}
3373 	while (w >= 4)
3374 	{
3375 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3376 
3377 	    w -= 4;
3378 	    s += 4;
3379 	    d += 4;
3380 	}
3381 	if (w >= 2)
3382 	{
3383 	    *(uint16_t *)d = *(uint16_t *)s;
3384 	    w -= 2;
3385 	    s += 2;
3386 	    d += 2;
3387 	}
3388     }
3389 
3390     _mm_empty ();
3391 
3392     return TRUE;
3393 }
3394 
3395 static void
3396 mmx_composite_copy_area (pixman_implementation_t *imp,
3397                          pixman_composite_info_t *info)
3398 {
3399     PIXMAN_COMPOSITE_ARGS (info);
3400 
3401     mmx_blt (imp, src_image->bits.bits,
3402 	     dest_image->bits.bits,
3403 	     src_image->bits.rowstride,
3404 	     dest_image->bits.rowstride,
3405 	     PIXMAN_FORMAT_BPP (src_image->bits.format),
3406 	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
3407 	     src_x, src_y, dest_x, dest_y, width, height);
3408 }
3409 
3410 static void
3411 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3412                                 pixman_composite_info_t *info)
3413 {
3414     PIXMAN_COMPOSITE_ARGS (info);
3415     uint32_t  *src, *src_line;
3416     uint32_t  *dst, *dst_line;
3417     uint8_t  *mask, *mask_line;
3418     int src_stride, mask_stride, dst_stride;
3419     int32_t w;
3420 
3421     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3422     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3423     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3424 
3425     while (height--)
3426     {
3427 	src = src_line;
3428 	src_line += src_stride;
3429 	dst = dst_line;
3430 	dst_line += dst_stride;
3431 	mask = mask_line;
3432 	mask_line += mask_stride;
3433 
3434 	w = width;
3435 
3436 	while (w--)
3437 	{
3438 	    uint64_t m = *mask;
3439 
3440 	    if (m)
3441 	    {
3442 		uint32_t ssrc = *src | 0xff000000;
3443 		__m64 s = load8888 (&ssrc);
3444 
3445 		if (m == 0xff)
3446 		{
3447 		    store8888 (dst, s);
3448 		}
3449 		else
3450 		{
3451 		    __m64 sa = expand_alpha (s);
3452 		    __m64 vm = expand_alpha_rev (to_m64 (m));
3453 		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3454 
3455 		    store8888 (dst, vdest);
3456 		}
3457 	    }
3458 
3459 	    mask++;
3460 	    dst++;
3461 	    src++;
3462 	}
3463     }
3464 
3465     _mm_empty ();
3466 }
3467 
3468 static void
3469 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3470                                    pixman_composite_info_t *info)
3471 {
3472     PIXMAN_COMPOSITE_ARGS (info);
3473     uint32_t src;
3474     uint32_t    *dst_line, *dst;
3475     int32_t w;
3476     int dst_stride;
3477     __m64 vsrc;
3478 
3479     CHECKPOINT ();
3480 
3481     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3482 
3483     if (src == 0)
3484 	return;
3485 
3486     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3487 
3488     vsrc = load8888 (&src);
3489 
3490     while (height--)
3491     {
3492 	dst = dst_line;
3493 	dst_line += dst_stride;
3494 	w = width;
3495 
3496 	CHECKPOINT ();
3497 
3498 	while (w && (uintptr_t)dst & 7)
3499 	{
3500 	    __m64 vdest = load8888 (dst);
3501 
3502 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3503 
3504 	    w--;
3505 	    dst++;
3506 	}
3507 
3508 	while (w >= 2)
3509 	{
3510 	    __m64 vdest = *(__m64 *)dst;
3511 	    __m64 dest0 = expand8888 (vdest, 0);
3512 	    __m64 dest1 = expand8888 (vdest, 1);
3513 
3514 
3515 	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
3516 	    dest1 = over (dest1, expand_alpha (dest1), vsrc);
3517 
3518 	    *(__m64 *)dst = pack8888 (dest0, dest1);
3519 
3520 	    dst += 2;
3521 	    w -= 2;
3522 	}
3523 
3524 	CHECKPOINT ();
3525 
3526 	if (w)
3527 	{
3528 	    __m64 vdest = load8888 (dst);
3529 
3530 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3531 	}
3532     }
3533 
3534     _mm_empty ();
3535 }
3536 
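/*
 * Bilinear fetching.  Each output pixel blends a 2x2 block of source
 * pixels (tl/tr from src_top, bl/br from src_bottom): the rows are first
 * weighted by wt and wb (vertical step), then the left/right columns by
 * the horizontal weight wx derived from mm_x, giving per channel
 *
 *   pix = ((tl * (B - wx) + tr * wx) * wt +
 *          (bl * (B - wx) + br * wx) * wb) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * where B = 1 << BILINEAR_INTERPOLATION_BITS.  The two branches in the
 * macro below differ only in how the horizontal multiply-accumulate is
 * carried out for narrow vs. full interpolation precision.
 */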
3537 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3538 #define BMSK (BSHIFT - 1)
3539 
3540 #define BILINEAR_DECLARE_VARIABLES						\
3541     const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
3542     const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
3543     const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
3544     const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
3545     const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
3546     const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
3547     const __m64 mm_zero = _mm_setzero_si64 ();					\
3548     __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3549 
3550 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
3551 do {										\
3552     /* fetch 2x2 pixel block into 2 mmx registers */				\
3553     __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
3554     __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
3555     /* vertical interpolation */						\
3556     __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
3557     __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
3558     __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
3559     __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
3560     __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
3561     __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
3562     vx += unit_x;								\
3563     if (BILINEAR_INTERPOLATION_BITS < 8)					\
3564     {										\
3565 	/* calculate horizontal weights */					\
3566 	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
3567 			  _mm_srli_pi16 (mm_x,					\
3568 					 16 - BILINEAR_INTERPOLATION_BITS)));	\
3569 	/* horizontal interpolation */						\
3570 	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
3571 	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
3572 	lo = _mm_madd_pi16 (p, mm_wh);						\
3573 	hi = _mm_madd_pi16 (q, mm_wh);						\
3574     }										\
3575     else									\
3576     {										\
3577 	/* calculate horizontal weights */					\
3578 	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
3579 					16 - BILINEAR_INTERPOLATION_BITS));	\
3580 	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
3581 					16 - BILINEAR_INTERPOLATION_BITS);	\
3582 	/* horizontal interpolation */						\
3583 	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
3584 	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
3585 	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
3586 	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
3587 	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
3588 			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
3589 	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
3590 			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
3591     }										\
3592     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3593     /* shift and pack the result */						\
3594     hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
3595     lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
3596     lo = _mm_packs_pi32 (lo, hi);						\
3597     lo = _mm_packs_pu16 (lo, lo);						\
3598     pix = lo;									\
3599 } while (0)
3600 
3601 #define BILINEAR_SKIP_ONE_PIXEL()						\
3602 do {										\
3603     vx += unit_x;								\
3604     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3605 } while(0)
3606 
3607 static force_inline void
3608 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
3609 					    const uint32_t * mask,
3610 					    const uint32_t * src_top,
3611 					    const uint32_t * src_bottom,
3612 					    int32_t          w,
3613 					    int              wt,
3614 					    int              wb,
3615 					    pixman_fixed_t   vx,
3616 					    pixman_fixed_t   unit_x,
3617 					    pixman_fixed_t   max_vx,
3618 					    pixman_bool_t    zero_src)
3619 {
3620     BILINEAR_DECLARE_VARIABLES;
3621     __m64 pix;
3622 
3623     while (w--)
3624     {
3625 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3626 	store (dst, pix);
3627 	dst++;
3628     }
3629 
3630     _mm_empty ();
3631 }
3632 
3633 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3634 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3635 			       uint32_t, uint32_t, uint32_t,
3636 			       COVER, FLAG_NONE)
3637 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3638 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3639 			       uint32_t, uint32_t, uint32_t,
3640 			       PAD, FLAG_NONE)
3641 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3642 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3643 			       uint32_t, uint32_t, uint32_t,
3644 			       NONE, FLAG_NONE)
3645 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3646 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3647 			       uint32_t, uint32_t, uint32_t,
3648 			       NORMAL, FLAG_NONE)
3649 
3650 static force_inline void
3651 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
3652 					     const uint32_t * mask,
3653 					     const uint32_t * src_top,
3654 					     const uint32_t * src_bottom,
3655 					     int32_t          w,
3656 					     int              wt,
3657 					     int              wb,
3658 					     pixman_fixed_t   vx,
3659 					     pixman_fixed_t   unit_x,
3660 					     pixman_fixed_t   max_vx,
3661 					     pixman_bool_t    zero_src)
3662 {
3663     BILINEAR_DECLARE_VARIABLES;
3664     __m64 pix1, pix2;
3665 
3666     while (w)
3667     {
3668 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3669 
3670 	if (!is_zero (pix1))
3671 	{
3672 	    pix2 = load (dst);
3673 	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3674 	}
3675 
3676 	w--;
3677 	dst++;
3678     }
3679 
3680     _mm_empty ();
3681 }
3682 
3683 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3684 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3685 			       uint32_t, uint32_t, uint32_t,
3686 			       COVER, FLAG_NONE)
3687 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3688 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3689 			       uint32_t, uint32_t, uint32_t,
3690 			       PAD, FLAG_NONE)
3691 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3692 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3693 			       uint32_t, uint32_t, uint32_t,
3694 			       NONE, FLAG_NONE)
3695 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3696 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3697 			       uint32_t, uint32_t, uint32_t,
3698 			       NORMAL, FLAG_NONE)
3699 
3700 static force_inline void
3701 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
3702 					       const uint8_t  * mask,
3703 					       const uint32_t * src_top,
3704 					       const uint32_t * src_bottom,
3705 					       int32_t          w,
3706 					       int              wt,
3707 					       int              wb,
3708 					       pixman_fixed_t   vx,
3709 					       pixman_fixed_t   unit_x,
3710 					       pixman_fixed_t   max_vx,
3711 					       pixman_bool_t    zero_src)
3712 {
3713     BILINEAR_DECLARE_VARIABLES;
3714     __m64 pix1, pix2;
3715     uint32_t m;
3716 
3717     while (w)
3718     {
3719 	m = (uint32_t) *mask++;
3720 
3721 	if (m)
3722 	{
3723 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3724 
3725 	    if (m == 0xff && is_opaque (pix1))
3726 	    {
3727 		store (dst, pix1);
3728 	    }
3729 	    else
3730 	    {
3731 		__m64 ms, md, ma, msa;
3732 
3733 		pix2 = load (dst);
3734 		ma = expand_alpha_rev (to_m64 (m));
3735 		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3736 		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3737 
3738 		msa = expand_alpha (ms);
3739 
3740 		store8888 (dst, (in_over (ms, msa, ma, md)));
3741 	    }
3742 	}
3743 	else
3744 	{
3745 	    BILINEAR_SKIP_ONE_PIXEL ();
3746 	}
3747 
3748 	w--;
3749 	dst++;
3750     }
3751 
3752     _mm_empty ();
3753 }
3754 
3755 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3756 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3757 			       uint32_t, uint8_t, uint32_t,
3758 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
3759 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3760 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3761 			       uint32_t, uint8_t, uint32_t,
3762 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
3763 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3764 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3765 			       uint32_t, uint8_t, uint32_t,
3766 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
3767 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3768 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3769 			       uint32_t, uint8_t, uint32_t,
3770 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3771 
3772 static uint32_t *
3773 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3774 {
3775     int w = iter->width;
3776     uint32_t *dst = iter->buffer;
3777     uint32_t *src = (uint32_t *)iter->bits;
3778 
3779     iter->bits += iter->stride;
3780 
3781     while (w && ((uintptr_t)dst) & 7)
3782     {
3783 	*dst++ = (*src++) | 0xff000000;
3784 	w--;
3785     }
3786 
3787     while (w >= 8)
3788     {
3789 	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3790 	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3791 	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3792 	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3793 
3794 	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3795 	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3796 	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3797 	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3798 
3799 	dst += 8;
3800 	src += 8;
3801 	w -= 8;
3802     }
3803 
3804     while (w)
3805     {
3806 	*dst++ = (*src++) | 0xff000000;
3807 	w--;
3808     }
3809 
3810     _mm_empty ();
3811     return iter->buffer;
3812 }
3813 
3814 static uint32_t *
3815 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3816 {
3817     int w = iter->width;
3818     uint32_t *dst = iter->buffer;
3819     uint16_t *src = (uint16_t *)iter->bits;
3820 
3821     iter->bits += iter->stride;
3822 
3823     while (w && ((uintptr_t)dst) & 0x0f)
3824     {
3825 	uint16_t s = *src++;
3826 
3827 	*dst++ = convert_0565_to_8888 (s);
3828 	w--;
3829     }
3830 
3831     while (w >= 4)
3832     {
3833 	__m64 vsrc = ldq_u ((__m64 *)src);
3834 	__m64 mm0, mm1;
3835 
3836 	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3837 
3838 	*(__m64 *)(dst + 0) = mm0;
3839 	*(__m64 *)(dst + 2) = mm1;
3840 
3841 	dst += 4;
3842 	src += 4;
3843 	w -= 4;
3844     }
3845 
3846     while (w)
3847     {
3848 	uint16_t s = *src++;
3849 
3850 	*dst++ = convert_0565_to_8888 (s);
3851 	w--;
3852     }
3853 
3854     _mm_empty ();
3855     return iter->buffer;
3856 }
3857 
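/*
 * Scanline fetcher for a8 sources: each alpha byte becomes an a8r8g8b8
 * pixel with zero color channels.  Interleaving the source bytes with zero
 * twice (first pi8, then pi16) moves every byte into the top byte of its
 * own 32-bit lane, expanding eight pixels per iteration.
 */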
3858 static uint32_t *
3859 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3860 {
3861     int w = iter->width;
3862     uint32_t *dst = iter->buffer;
3863     uint8_t *src = iter->bits;
3864 
3865     iter->bits += iter->stride;
3866 
3867     while (w && (((uintptr_t)dst) & 15))
3868     {
3869         *dst++ = *(src++) << 24;
3870         w--;
3871     }
3872 
3873     while (w >= 8)
3874     {
3875 	__m64 mm0 = ldq_u ((__m64 *)src);
3876 
3877 	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
3878 	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
3879 	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3880 	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3881 	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3882 	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3883 
3884 	*(__m64 *)(dst + 0) = mm3;
3885 	*(__m64 *)(dst + 2) = mm4;
3886 	*(__m64 *)(dst + 4) = mm5;
3887 	*(__m64 *)(dst + 6) = mm6;
3888 
3889 	dst += 8;
3890 	src += 8;
3891 	w -= 8;
3892     }
3893 
3894     while (w)
3895     {
3896 	*dst++ = *(src++) << 24;
3897 	w--;
3898     }
3899 
3900     _mm_empty ();
3901     return iter->buffer;
3902 }
3903 
3904 typedef struct
3905 {
3906     pixman_format_code_t	format;
3907     pixman_iter_get_scanline_t	get_scanline;
3908 } fetcher_info_t;
3909 
3910 static const fetcher_info_t fetchers[] =
3911 {
3912     { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
3913     { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
3914     { PIXMAN_a8,		mmx_fetch_a8 },
3915     { PIXMAN_null }
3916 };
3917 
3918 static pixman_bool_t
3919 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
3920 {
3921     pixman_image_t *image = iter->image;
3922 
3923 #define FLAGS								\
3924     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
3925      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3926 
3927     if ((iter->iter_flags & ITER_NARROW)			&&
3928 	(iter->image_flags & FLAGS) == FLAGS)
3929     {
3930 	const fetcher_info_t *f;
3931 
3932 	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
3933 	{
3934 	    if (image->common.extended_format_code == f->format)
3935 	    {
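		/* rowstride is in 32-bit units; convert it to bytes before
		 * computing the address of the first scanline to fetch.
		 */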
3936 		uint8_t *b = (uint8_t *)image->bits.bits;
3937 		int s = image->bits.rowstride * 4;
3938 
3939 		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
3940 		iter->stride = s;
3941 
3942 		iter->get_scanline = f->get_scanline;
3943 		return TRUE;
3944 	    }
3945 	}
3946     }
3947 
3948     return FALSE;
3949 }
3950 
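/* Table mapping (operator, source, mask, destination) format combinations to
 * the specialized MMX composite routines defined above; the PIXMAN_OP_NONE
 * entry terminates it.
 */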
3951 static const pixman_fast_path_t mmx_fast_paths[] =
3952 {
3953     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
3954     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
3955     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
3956     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
3957     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
3958     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
3959     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3960     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3961     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
3962     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3963     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3964     PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
3965     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3966     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3967     PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
3968     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3969     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3970     PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
3971     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
3972     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
3973     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
3974     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
3975     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
3976     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
3977     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
3978     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
3979     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
3980     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
3981     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
3982     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
3983     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
3984     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
3985     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
3986     PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
3987     PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3988     PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3989 
3990     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
3991     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
3992     PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
3993     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
3994     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
3995     PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
3996 
3997     PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
3998     PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
3999 
4000     PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
4001     PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
4002     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
4003     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
4004     PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
4005     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
4006 
4007     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
4008     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
4009     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
4010     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
4011     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
4012     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
4013     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
4014     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
4015     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
4016     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
4017     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
4018     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
4019     PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
4020     PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
4021     PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
4022     PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
4023 
4024     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
4025     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
4026 
4027     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
4028     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4029     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
4030     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
4031     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4032     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
4033 
4034     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
4035     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
4036     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
4037     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
4038 
4039     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
4040     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
4041     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
4042     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
4043 
4044     { PIXMAN_OP_NONE },
4045 };
4046 
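/* Create the MMX implementation: delegate to `fallback' for anything not
 * handled here, register the fast-path table, and override the unified and
 * component-alpha combiners plus the blt, fill and source-iterator hooks.
 */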
4047 pixman_implementation_t *
4048 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4049 {
4050     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4051 
4052     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4053     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4054     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4055     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4056     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4057     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4058     imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4059     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4060     imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4061     imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4062     imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4063 
4064     imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4065     imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4066     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4067     imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4068     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4069     imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4070     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4071     imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4072     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4073     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4074     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4075 
4076     imp->blt = mmx_blt;
4077     imp->fill = mmx_fill;
4078 
4079     imp->src_iter_init = mmx_src_iter_init;
4080 
4081     return imp;
4082 }
4083 
4084 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
4085