1 /*
2 * Copyright © 2004, 2005 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
5 *
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. Red Hat makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
15 *
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * SOFTWARE.
24 *
25 * Author: Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28 *
29 * Based on work by Owen Taylor
30 */
31
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
37
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
40 #else
41 #include <mmintrin.h>
42 #endif
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
46
47 #define no_vERBOSE /* deliberately misspelled so VERBOSE stays undefined; rename to VERBOSE to enable CHECKPOINT tracing */
48
49 #ifdef VERBOSE
50 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
51 #else
52 #define CHECKPOINT()
53 #endif
54
55 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
56 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
57 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
58 _mm_empty (void)
59 {
60
61 }
62 #endif
63
64 #ifdef USE_X86_MMX
65 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
66 # include <xmmintrin.h>
67 # else
68 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
69 * instructions to be generated that we don't want. Just duplicate the
70 * functions we want to use. */
71 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72 _mm_movemask_pi8 (__m64 __A)
73 {
74 int ret;
75
76 asm ("pmovmskb %1, %0\n\t"
77 : "=r" (ret)
78 : "y" (__A)
79 );
80
81 return ret;
82 }
83
84 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
86 {
87 asm ("pmulhuw %1, %0\n\t"
88 : "+y" (__A)
89 : "y" (__B)
90 );
91 return __A;
92 }
93
94 # ifdef __OPTIMIZE__
95 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
97 {
98 __m64 ret;
99
100 asm ("pshufw %2, %1, %0\n\t"
101 : "=y" (ret)
102 : "y" (__A), "K" (__N)
103 );
104
105 return ret;
106 }
107 # else
108 # define _mm_shuffle_pi16(A, N) \
109 ({ \
110 __m64 ret; \
111 \
112 asm ("pshufw %2, %1, %0\n\t" \
113 : "=y" (ret) \
114 : "y" (A), "K" ((const int8_t)N) \
115 ); \
116 \
117 ret; \
118 })
119 # endif
120 # endif
121 #endif
122
123 #ifndef _MSC_VER
124 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
125 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
126 #endif
127
128 /* Notes about writing mmx code
129 *
130 * Give memory operands as the second operand. If you give one as the
131 * first operand, gcc will first load it into a register and then use
132 * that register.
133 *
134 * i.e. use
135 *
136 * _mm_mullo_pi16 (x, mmx_constant);
137 *
138 * not
139 *
140 * _mm_mullo_pi16 (mmx_constant, x);
141 *
142 * Also try to minimize dependencies; i.e. when you need a value, try
143 * to calculate it from a value that was calculated as early as
144 * possible.
145 */
146
147 /* --------------- MMX primitives ------------------------------------- */
148
149 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
150 * the name of the member used to access the data.
151 * If __m64 requires using mm_cvt* intrinsics functions to convert between
152 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
153 * If __m64 and uint64_t values can just be cast to each other directly,
154 * then define USE_M64_CASTS.
155 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
156 */
157 #ifdef _MSC_VER
158 # define M64_MEMBER m64_u64
159 #elif defined(__ICC)
160 # define USE_CVT_INTRINSICS
161 #elif defined(USE_LOONGSON_MMI)
162 # define USE_M64_DOUBLE
163 #elif defined(__GNUC__)
164 # define USE_M64_CASTS
165 #elif defined(__SUNPRO_C)
166 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
167 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
168 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
169 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
170 */
171 # define USE_CVT_INTRINSICS
172 # else
173 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
174 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
175 */
176 # define M64_MEMBER l_
177 # endif
178 #endif
179
180 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
181 typedef uint64_t mmxdatafield;
182 #else
183 typedef __m64 mmxdatafield;
184 #endif
185
186 typedef struct
187 {
188 mmxdatafield mmx_4x00ff;
189 mmxdatafield mmx_4x0080;
190 mmxdatafield mmx_565_rgb;
191 mmxdatafield mmx_565_unpack_multiplier;
192 mmxdatafield mmx_565_pack_multiplier;
193 mmxdatafield mmx_565_r;
194 mmxdatafield mmx_565_g;
195 mmxdatafield mmx_565_b;
196 mmxdatafield mmx_packed_565_rb;
197 mmxdatafield mmx_packed_565_g;
198 mmxdatafield mmx_expand_565_g;
199 mmxdatafield mmx_expand_565_b;
200 mmxdatafield mmx_expand_565_r;
201 #ifndef USE_LOONGSON_MMI
202 mmxdatafield mmx_mask_0;
203 mmxdatafield mmx_mask_1;
204 mmxdatafield mmx_mask_2;
205 mmxdatafield mmx_mask_3;
206 #endif
207 mmxdatafield mmx_full_alpha;
208 mmxdatafield mmx_4x0101;
209 mmxdatafield mmx_ff000000;
210 } mmx_data_t;
211
212 #if defined(_MSC_VER)
213 # define MMXDATA_INIT(field, val) { val ## UI64 }
214 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
215 # define MMXDATA_INIT(field, val) field = { val ## ULL }
216 #else /* mmxdatafield is an integral type */
217 # define MMXDATA_INIT(field, val) field = val ## ULL
218 #endif
219
220 static const mmx_data_t c =
221 {
222 MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
223 MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
224 MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
225 MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
226 MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
227 MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
228 MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
229 MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
230 MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
231 MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
232 MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
233 MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
234 MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
235 #ifndef USE_LOONGSON_MMI
236 MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
237 MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
238 MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
239 MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
240 #endif
241 MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
242 MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
243 MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
244 };
245
246 #ifdef USE_CVT_INTRINSICS
247 # define MC(x) to_m64 (c.mmx_ ## x)
248 #elif defined(USE_M64_CASTS)
249 # define MC(x) ((__m64)c.mmx_ ## x)
250 #elif defined(USE_M64_DOUBLE)
251 # define MC(x) (*(__m64 *)&c.mmx_ ## x)
252 #else
253 # define MC(x) c.mmx_ ## x
254 #endif
255
256 static force_inline __m64
257 to_m64 (uint64_t x)
258 {
259 #ifdef USE_CVT_INTRINSICS
260 return _mm_cvtsi64_m64 (x);
261 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
262 __m64 res;
263
264 res.M64_MEMBER = x;
265 return res;
266 #elif defined USE_M64_DOUBLE
267 return *(__m64 *)&x;
268 #else /* USE_M64_CASTS */
269 return (__m64)x;
270 #endif
271 }
272
273 static force_inline uint64_t
274 to_uint64 (__m64 x)
275 {
276 #ifdef USE_CVT_INTRINSICS
277 return _mm_cvtm64_si64 (x);
278 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
279 uint64_t res = x.M64_MEMBER;
280 return res;
281 #elif defined USE_M64_DOUBLE
282 return *(uint64_t *)&x;
283 #else /* USE_M64_CASTS */
284 return (uint64_t)x;
285 #endif
286 }
287
288 static force_inline __m64
289 shift (__m64 v,
290 int s)
291 {
292 if (s > 0)
293 return _mm_slli_si64 (v, s);
294 else if (s < 0)
295 return _mm_srli_si64 (v, -s);
296 else
297 return v;
298 }
299
300 static force_inline __m64
301 negate (__m64 mask)
302 {
303 return _mm_xor_si64 (mask, MC (4x00ff));
304 }
305
306 static force_inline __m64
307 pix_multiply (__m64 a, __m64 b)
308 {
309 __m64 res;
310
311 res = _mm_mullo_pi16 (a, b);
312 res = _mm_adds_pu16 (res, MC (4x0080));
313 res = _mm_mulhi_pu16 (res, MC (4x0101));
314
315 return res;
316 }
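/* A scalar reference for pix_multiply, illustrative only: the helper
 * name is ours and nothing else in this file uses it. pix_multiply
 * computes round (a * b / 255) in each 16-bit lane; adding 0x0080
 * supplies the rounding bias, and taking the high word of a multiply
 * by 0x0101 divides by 255, matching the MUL_UN8 macro in
 * pixman-combine32.h.
 */
static force_inline uint8_t
mul_un8_scalar (uint8_t a, uint8_t b)
{
    uint16_t t = a * b + 0x0080;             /* bias so truncation rounds */
    return (uint8_t) ((t + (t >> 8)) >> 8);  /* == (t * 0x0101) >> 16 */
}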
317
318 static force_inline __m64
319 pix_add (__m64 a, __m64 b)
320 {
321 return _mm_adds_pu8 (a, b);
322 }
323
324 static force_inline __m64
325 expand_alpha (__m64 pixel)
326 {
327 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
328 }
329
330 static force_inline __m64
331 expand_alpha_rev (__m64 pixel)
332 {
333 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
334 }
335
336 static force_inline __m64
337 invert_colors (__m64 pixel)
338 {
339 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
340 }
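/* The three shuffles above index the 16-bit lanes of an expanded pixel
 * 00AA 00RR 00GG 00BB (alpha in lane 3). _MM_SHUFFLE (3, 3, 3, 3)
 * replicates alpha into every lane; _MM_SHUFFLE (0, 0, 0, 0) replicates
 * lane 0, which is where expand_alpha_rev's callers place an 8-bit
 * mask; and _MM_SHUFFLE (3, 0, 1, 2) swaps R and B, turning ARGB into
 * ABGR.
 */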
341
342 static force_inline __m64
343 over (__m64 src,
344 __m64 srca,
345 __m64 dest)
346 {
347 return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
348 }
349
350 static force_inline __m64
351 over_rev_non_pre (__m64 src, __m64 dest)
352 {
353 __m64 srca = expand_alpha (src);
354 __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
355
356 return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
357 }
358
359 static force_inline __m64
360 in (__m64 src, __m64 mask)
361 {
362 return pix_multiply (src, mask);
363 }
364
365 #ifndef _MSC_VER
366 static force_inline __m64
367 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
368 {
369 return over (in (src, mask), pix_multiply (srca, mask), dest);
370 }
371
372 #else
373
374 #define in_over(src, srca, mask, dest) \
375 over (in (src, mask), pix_multiply (srca, mask), dest)
376
377 #endif
378
379 /* Elemental unaligned loads */
380
381 static force_inline __m64 ldq_u(__m64 *p)
382 {
383 #ifdef USE_X86_MMX
384 /* x86's alignment restrictions are very relaxed. */
385 return *(__m64 *)p;
386 #elif defined USE_ARM_IWMMXT
387 int align = (uintptr_t)p & 7;
388 __m64 *aligned_p;
389 if (align == 0)
390 return *p;
391 aligned_p = (__m64 *)((uintptr_t)p & ~7);
392 return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
393 #else
394 struct __una_u64 { __m64 x __attribute__((packed)); };
395 const struct __una_u64 *ptr = (const struct __una_u64 *) p;
396 return (__m64) ptr->x;
397 #endif
398 }
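/* The packed-struct fallback above is the usual GCC idiom for an
 * unaligned access: the __attribute__((packed)) member has alignment 1,
 * so the compiler emits whatever load sequence the target needs rather
 * than a trapping aligned load. The iwMMXt path instead loads the two
 * enclosing aligned quadwords and merges them with _mm_align_si64.
 */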
399
400 static force_inline uint32_t ldl_u(const uint32_t *p)
401 {
402 #ifdef USE_X86_MMX
403 /* x86's alignment restrictions are very relaxed. */
404 return *p;
405 #else
406 struct __una_u32 { uint32_t x __attribute__((packed)); };
407 const struct __una_u32 *ptr = (const struct __una_u32 *) p;
408 return ptr->x;
409 #endif
410 }
411
412 static force_inline __m64
413 load (const uint32_t *v)
414 {
415 #ifdef USE_LOONGSON_MMI
416 __m64 ret;
417 asm ("lwc1 %0, %1\n\t"
418 : "=f" (ret)
419 : "m" (*v)
420 );
421 return ret;
422 #else
423 return _mm_cvtsi32_si64 (*v);
424 #endif
425 }
426
427 static force_inline __m64
428 load8888 (const uint32_t *v)
429 {
430 #ifdef USE_LOONGSON_MMI
431 return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
432 #else
433 return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
434 #endif
435 }
436
437 static force_inline __m64
438 load8888u (const uint32_t *v)
439 {
440 uint32_t l = ldl_u (v);
441 return load8888 (&l);
442 }
443
444 static force_inline __m64
445 pack8888 (__m64 lo, __m64 hi)
446 {
447 return _mm_packs_pu16 (lo, hi);
448 }
449
450 static force_inline void
451 store (uint32_t *dest, __m64 v)
452 {
453 #ifdef USE_LOONGSON_MMI
454 asm ("swc1 %1, %0\n\t"
455 : "=m" (*dest)
456 : "f" (v)
457 : "memory"
458 );
459 #else
460 *dest = _mm_cvtsi64_si32 (v);
461 #endif
462 }
463
464 static force_inline void
465 store8888 (uint32_t *dest, __m64 v)
466 {
467 v = pack8888 (v, _mm_setzero_si64 ());
468 store (dest, v);
469 }
470
471 static force_inline pixman_bool_t
472 is_equal (__m64 a, __m64 b)
473 {
474 #ifdef USE_LOONGSON_MMI
475 /* __m64 is double, we can compare directly. */
476 return a == b;
477 #else
478 return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
479 #endif
480 }
481
482 static force_inline pixman_bool_t
483 is_opaque (__m64 v)
484 {
485 #ifdef USE_LOONGSON_MMI
486 return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
487 #else
488 __m64 ffs = _mm_cmpeq_pi8 (v, v);
489 return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
490 #endif
491 }
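/* In the generic is_opaque path above, byte 6 of the expanded pixel
 * 00AA 00RR 00GG 00BB holds the alpha value, so bit 6 of the pmovmskb
 * result is set exactly when the alpha byte compared equal to 0xff.
 */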
492
493 static force_inline pixman_bool_t
494 is_zero (__m64 v)
495 {
496 return is_equal (v, _mm_setzero_si64 ());
497 }
498
499 /* Expand 16 bits positioned at @pos (0-3) of an mmx register into
500 *
501 * 00RR00GG00BB
502 *
503 * --- Expanding 565 in the low word ---
504 *
505 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
506 * m = m & (01f0003f001f);
507 * m = m * (008404100840);
508 * m = m >> 8;
509 *
510 * Note the trick here - the top word is shifted by another nibble to
511 * avoid it bumping into the middle word
512 */
513 static force_inline __m64
514 expand565 (__m64 pixel, int pos)
515 {
516 __m64 p = pixel;
517 __m64 t1, t2;
518
519 /* move pixel to low 16 bit and zero the rest */
520 #ifdef USE_LOONGSON_MMI
521 p = loongson_extract_pi16 (p, pos);
522 #else
523 p = shift (shift (p, (3 - pos) * 16), -48);
524 #endif
525
526 t1 = shift (p, 36 - 11);
527 t2 = shift (p, 16 - 5);
528
529 p = _mm_or_si64 (t1, p);
530 p = _mm_or_si64 (t2, p);
531 p = _mm_and_si64 (p, MC (565_rgb));
532
533 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
534 return _mm_srli_pi16 (pixel, 8);
535 }
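/* Worked out per channel, the multiply-and-shift above is the usual
 * bit-replication for 565 -> 888: the 16-bit fields of
 * 565_unpack_multiplier are 0x0840 (blue), 0x0410 (green) and 0x0084
 * (red, which sits pre-shifted by a nibble). For blue, for example,
 * b * 0x0840 = (b << 11) | (b << 6), and shifting right by 8 leaves
 * (b << 3) | (b >> 2).
 */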
536
537 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
538 *
539 * AARRGGBBAARRGGBB
540 */
541 static force_inline void
542 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
543 {
544 __m64 t0, t1, alpha = _mm_setzero_si64 ();
545 __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
546 __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
547 __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
548 if (full_alpha)
549 alpha = _mm_cmpeq_pi32 (alpha, alpha);
550
551 /* Replicate high bits into empty low bits. */
552 r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
553 g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
554 b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
555
556 r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
557 g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
558 b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
559
560 t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
561 t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
562
563 *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
564 *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
565 }
566
567 static force_inline __m64
568 expand8888 (__m64 in, int pos)
569 {
570 if (pos == 0)
571 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
572 else
573 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
574 }
575
576 static force_inline __m64
577 expandx888 (__m64 in, int pos)
578 {
579 return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
580 }
581
582 static force_inline void
583 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
584 {
585 __m64 v0, v1;
586 expand_4xpacked565 (vin, &v0, &v1, full_alpha);
587 *vout0 = expand8888 (v0, 0);
588 *vout1 = expand8888 (v0, 1);
589 *vout2 = expand8888 (v1, 0);
590 *vout3 = expand8888 (v1, 1);
591 }
592
593 static force_inline __m64
594 pack_565 (__m64 pixel, __m64 target, int pos)
595 {
596 __m64 p = pixel;
597 __m64 t = target;
598 __m64 r, g, b;
599
600 r = _mm_and_si64 (p, MC (565_r));
601 g = _mm_and_si64 (p, MC (565_g));
602 b = _mm_and_si64 (p, MC (565_b));
603
604 #ifdef USE_LOONGSON_MMI
605 r = shift (r, -(32 - 8));
606 g = shift (g, -(16 - 3));
607 b = shift (b, -(0 + 3));
608
609 p = _mm_or_si64 (r, g);
610 p = _mm_or_si64 (p, b);
611 return loongson_insert_pi16 (t, p, pos);
612 #else
613 r = shift (r, -(32 - 8) + pos * 16);
614 g = shift (g, -(16 - 3) + pos * 16);
615 b = shift (b, -(0 + 3) + pos * 16);
616
617 if (pos == 0)
618 t = _mm_and_si64 (t, MC (mask_0));
619 else if (pos == 1)
620 t = _mm_and_si64 (t, MC (mask_1));
621 else if (pos == 2)
622 t = _mm_and_si64 (t, MC (mask_2));
623 else if (pos == 3)
624 t = _mm_and_si64 (t, MC (mask_3));
625
626 p = _mm_or_si64 (r, t);
627 p = _mm_or_si64 (g, p);
628
629 return _mm_or_si64 (b, p);
630 #endif
631 }
632
633 static force_inline __m64
634 pack_4xpacked565 (__m64 a, __m64 b)
635 {
636 __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
637 __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
638
639 __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
640 __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
641
642 __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
643 __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
644
645 t0 = _mm_or_si64 (t0, g0);
646 t1 = _mm_or_si64 (t1, g1);
647
648 t0 = shift(t0, -5);
649 #ifdef USE_ARM_IWMMXT
650 t1 = shift(t1, -5);
651 return _mm_packs_pu32 (t0, t1);
652 #else
653 t1 = shift(t1, -5 + 16);
654 return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
655 #endif
656 }
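/* The _mm_madd_pi16 above performs the 8888 -> 565 packing of red and
 * blue in one step: each 32-bit lane becomes (b & 0xf8) * 0x0004 +
 * (r & 0xf8) * 0x2000, which is the 565 red and blue fields shifted
 * left by 5. The masked green field already sits at that position, so
 * it is OR'd in and the final shift by 5 drops everything into place.
 */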
657
658 #ifndef _MSC_VER
659
660 static force_inline __m64
661 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
662 {
663 return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
664 }
665
666 static force_inline __m64
667 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
668 {
669 x = pix_multiply (x, a);
670 y = pix_multiply (y, b);
671
672 return pix_add (x, y);
673 }
674
675 #else
676
677 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
678
679 #define pack_4x565(v0, v1, v2, v3) \
680 pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
681
682 #define pix_add_mul(x, a, y, b) \
683 ( x = pix_multiply (x, a), \
684 y = pix_multiply (y, b), \
685 pix_add (x, y) )
686
687 #endif
688
689 /* --------------- MMX code patch for fbcompose.c --------------------- */
690
691 static force_inline __m64
692 combine (const uint32_t *src, const uint32_t *mask)
693 {
694 __m64 vsrc = load8888 (src);
695
696 if (mask)
697 {
698 __m64 m = load8888 (mask);
699
700 m = expand_alpha (m);
701 vsrc = pix_multiply (vsrc, m);
702 }
703
704 return vsrc;
705 }
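/* combine () above returns the source pixel expanded to 16-bit lanes,
 * multiplied by the mask's expanded alpha when a mask pointer is
 * supplied; all of the *_u combiners below are built on it.
 */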
706
707 static force_inline __m64
708 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
709 {
710 vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
711
712 if (is_opaque (vsrc))
713 {
714 return vsrc;
715 }
716 else if (!is_zero (vsrc))
717 {
718 return over (vsrc, expand_alpha (vsrc),
719 _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
720 }
721
722 return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
723 }
724
725 static void
726 mmx_combine_over_u (pixman_implementation_t *imp,
727 pixman_op_t op,
728 uint32_t * dest,
729 const uint32_t * src,
730 const uint32_t * mask,
731 int width)
732 {
733 const uint32_t *end = dest + width;
734
735 while (dest < end)
736 {
737 __m64 vsrc = combine (src, mask);
738
739 if (is_opaque (vsrc))
740 {
741 store8888 (dest, vsrc);
742 }
743 else if (!is_zero (vsrc))
744 {
745 __m64 sa = expand_alpha (vsrc);
746 store8888 (dest, over (vsrc, sa, load8888 (dest)));
747 }
748
749 ++dest;
750 ++src;
751 if (mask)
752 ++mask;
753 }
754 _mm_empty ();
755 }
756
757 static void
758 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
759 pixman_op_t op,
760 uint32_t * dest,
761 const uint32_t * src,
762 const uint32_t * mask,
763 int width)
764 {
765 const uint32_t *end = dest + width;
766
767 while (dest < end)
768 {
769 __m64 d, da;
770 __m64 s = combine (src, mask);
771
772 d = load8888 (dest);
773 da = expand_alpha (d);
774 store8888 (dest, over (d, da, s));
775
776 ++dest;
777 ++src;
778 if (mask)
779 mask++;
780 }
781 _mm_empty ();
782 }
783
784 static void
785 mmx_combine_in_u (pixman_implementation_t *imp,
786 pixman_op_t op,
787 uint32_t * dest,
788 const uint32_t * src,
789 const uint32_t * mask,
790 int width)
791 {
792 const uint32_t *end = dest + width;
793
794 while (dest < end)
795 {
796 __m64 a;
797 __m64 x = combine (src, mask);
798
799 a = load8888 (dest);
800 a = expand_alpha (a);
801 x = pix_multiply (x, a);
802
803 store8888 (dest, x);
804
805 ++dest;
806 ++src;
807 if (mask)
808 mask++;
809 }
810 _mm_empty ();
811 }
812
813 static void
814 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
815 pixman_op_t op,
816 uint32_t * dest,
817 const uint32_t * src,
818 const uint32_t * mask,
819 int width)
820 {
821 const uint32_t *end = dest + width;
822
823 while (dest < end)
824 {
825 __m64 a = combine (src, mask);
826 __m64 x;
827
828 x = load8888 (dest);
829 a = expand_alpha (a);
830 x = pix_multiply (x, a);
831 store8888 (dest, x);
832
833 ++dest;
834 ++src;
835 if (mask)
836 mask++;
837 }
838 _mm_empty ();
839 }
840
841 static void
842 mmx_combine_out_u (pixman_implementation_t *imp,
843 pixman_op_t op,
844 uint32_t * dest,
845 const uint32_t * src,
846 const uint32_t * mask,
847 int width)
848 {
849 const uint32_t *end = dest + width;
850
851 while (dest < end)
852 {
853 __m64 a;
854 __m64 x = combine (src, mask);
855
856 a = load8888 (dest);
857 a = expand_alpha (a);
858 a = negate (a);
859 x = pix_multiply (x, a);
860 store8888 (dest, x);
861
862 ++dest;
863 ++src;
864 if (mask)
865 mask++;
866 }
867 _mm_empty ();
868 }
869
870 static void
871 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
872 pixman_op_t op,
873 uint32_t * dest,
874 const uint32_t * src,
875 const uint32_t * mask,
876 int width)
877 {
878 const uint32_t *end = dest + width;
879
880 while (dest < end)
881 {
882 __m64 a = combine (src, mask);
883 __m64 x;
884
885 x = load8888 (dest);
886 a = expand_alpha (a);
887 a = negate (a);
888 x = pix_multiply (x, a);
889
890 store8888 (dest, x);
891
892 ++dest;
893 ++src;
894 if (mask)
895 mask++;
896 }
897 _mm_empty ();
898 }
899
900 static void
901 mmx_combine_atop_u (pixman_implementation_t *imp,
902 pixman_op_t op,
903 uint32_t * dest,
904 const uint32_t * src,
905 const uint32_t * mask,
906 int width)
907 {
908 const uint32_t *end = dest + width;
909
910 while (dest < end)
911 {
912 __m64 da, d, sia;
913 __m64 s = combine (src, mask);
914
915 d = load8888 (dest);
916 sia = expand_alpha (s);
917 sia = negate (sia);
918 da = expand_alpha (d);
919 s = pix_add_mul (s, da, d, sia);
920 store8888 (dest, s);
921
922 ++dest;
923 ++src;
924 if (mask)
925 mask++;
926 }
927 _mm_empty ();
928 }
929
930 static void
931 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
932 pixman_op_t op,
933 uint32_t * dest,
934 const uint32_t * src,
935 const uint32_t * mask,
936 int width)
937 {
938 const uint32_t *end;
939
940 end = dest + width;
941
942 while (dest < end)
943 {
944 __m64 dia, d, sa;
945 __m64 s = combine (src, mask);
946
947 d = load8888 (dest);
948 sa = expand_alpha (s);
949 dia = expand_alpha (d);
950 dia = negate (dia);
951 s = pix_add_mul (s, dia, d, sa);
952 store8888 (dest, s);
953
954 ++dest;
955 ++src;
956 if (mask)
957 mask++;
958 }
959 _mm_empty ();
960 }
961
962 static void
963 mmx_combine_xor_u (pixman_implementation_t *imp,
964 pixman_op_t op,
965 uint32_t * dest,
966 const uint32_t * src,
967 const uint32_t * mask,
968 int width)
969 {
970 const uint32_t *end = dest + width;
971
972 while (dest < end)
973 {
974 __m64 dia, d, sia;
975 __m64 s = combine (src, mask);
976
977 d = load8888 (dest);
978 sia = expand_alpha (s);
979 dia = expand_alpha (d);
980 sia = negate (sia);
981 dia = negate (dia);
982 s = pix_add_mul (s, dia, d, sia);
983 store8888 (dest, s);
984
985 ++dest;
986 ++src;
987 if (mask)
988 mask++;
989 }
990 _mm_empty ();
991 }
992
993 static void
994 mmx_combine_add_u (pixman_implementation_t *imp,
995 pixman_op_t op,
996 uint32_t * dest,
997 const uint32_t * src,
998 const uint32_t * mask,
999 int width)
1000 {
1001 const uint32_t *end = dest + width;
1002
1003 while (dest < end)
1004 {
1005 __m64 d;
1006 __m64 s = combine (src, mask);
1007
1008 d = load8888 (dest);
1009 s = pix_add (s, d);
1010 store8888 (dest, s);
1011
1012 ++dest;
1013 ++src;
1014 if (mask)
1015 mask++;
1016 }
1017 _mm_empty ();
1018 }
1019
1020 static void
1021 mmx_combine_saturate_u (pixman_implementation_t *imp,
1022 pixman_op_t op,
1023 uint32_t * dest,
1024 const uint32_t * src,
1025 const uint32_t * mask,
1026 int width)
1027 {
1028 const uint32_t *end = dest + width;
1029
1030 while (dest < end)
1031 {
1032 uint32_t s, sa, da;
1033 uint32_t d = *dest;
1034 __m64 ms = combine (src, mask);
1035 __m64 md = load8888 (dest);
1036
1037 store8888(&s, ms);
1038 da = ~d >> 24;
1039 sa = s >> 24;
1040
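 /* If source alpha exceeds the headroom left in the destination,
  * scale the source by da/sa so the saturating add below lands
  * exactly at opaque. */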
1041 if (sa > da)
1042 {
1043 uint32_t quot = DIV_UN8 (da, sa) << 24;
1044 __m64 msa = load8888 (&quot);
1045 msa = expand_alpha (msa);
1046 ms = pix_multiply (ms, msa);
1047 }
1048
1049 md = pix_add (md, ms);
1050 store8888 (dest, md);
1051
1052 ++src;
1053 ++dest;
1054 if (mask)
1055 mask++;
1056 }
1057 _mm_empty ();
1058 }
1059
1060 static void
1061 mmx_combine_src_ca (pixman_implementation_t *imp,
1062 pixman_op_t op,
1063 uint32_t * dest,
1064 const uint32_t * src,
1065 const uint32_t * mask,
1066 int width)
1067 {
1068 const uint32_t *end = src + width;
1069
1070 while (src < end)
1071 {
1072 __m64 a = load8888 (mask);
1073 __m64 s = load8888 (src);
1074
1075 s = pix_multiply (s, a);
1076 store8888 (dest, s);
1077
1078 ++src;
1079 ++mask;
1080 ++dest;
1081 }
1082 _mm_empty ();
1083 }
1084
1085 static void
1086 mmx_combine_over_ca (pixman_implementation_t *imp,
1087 pixman_op_t op,
1088 uint32_t * dest,
1089 const uint32_t * src,
1090 const uint32_t * mask,
1091 int width)
1092 {
1093 const uint32_t *end = src + width;
1094
1095 while (src < end)
1096 {
1097 __m64 a = load8888 (mask);
1098 __m64 s = load8888 (src);
1099 __m64 d = load8888 (dest);
1100 __m64 sa = expand_alpha (s);
1101
1102 store8888 (dest, in_over (s, sa, a, d));
1103
1104 ++src;
1105 ++dest;
1106 ++mask;
1107 }
1108 _mm_empty ();
1109 }
1110
1111 static void
1112 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1113 pixman_op_t op,
1114 uint32_t * dest,
1115 const uint32_t * src,
1116 const uint32_t * mask,
1117 int width)
1118 {
1119 const uint32_t *end = src + width;
1120
1121 while (src < end)
1122 {
1123 __m64 a = load8888 (mask);
1124 __m64 s = load8888 (src);
1125 __m64 d = load8888 (dest);
1126 __m64 da = expand_alpha (d);
1127
1128 store8888 (dest, over (d, da, in (s, a)));
1129
1130 ++src;
1131 ++dest;
1132 ++mask;
1133 }
1134 _mm_empty ();
1135 }
1136
1137 static void
1138 mmx_combine_in_ca (pixman_implementation_t *imp,
1139 pixman_op_t op,
1140 uint32_t * dest,
1141 const uint32_t * src,
1142 const uint32_t * mask,
1143 int width)
1144 {
1145 const uint32_t *end = src + width;
1146
1147 while (src < end)
1148 {
1149 __m64 a = load8888 (mask);
1150 __m64 s = load8888 (src);
1151 __m64 d = load8888 (dest);
1152 __m64 da = expand_alpha (d);
1153
1154 s = pix_multiply (s, a);
1155 s = pix_multiply (s, da);
1156 store8888 (dest, s);
1157
1158 ++src;
1159 ++dest;
1160 ++mask;
1161 }
1162 _mm_empty ();
1163 }
1164
1165 static void
1166 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1167 pixman_op_t op,
1168 uint32_t * dest,
1169 const uint32_t * src,
1170 const uint32_t * mask,
1171 int width)
1172 {
1173 const uint32_t *end = src + width;
1174
1175 while (src < end)
1176 {
1177 __m64 a = load8888 (mask);
1178 __m64 s = load8888 (src);
1179 __m64 d = load8888 (dest);
1180 __m64 sa = expand_alpha (s);
1181
1182 a = pix_multiply (a, sa);
1183 d = pix_multiply (d, a);
1184 store8888 (dest, d);
1185
1186 ++src;
1187 ++dest;
1188 ++mask;
1189 }
1190 _mm_empty ();
1191 }
1192
1193 static void
1194 mmx_combine_out_ca (pixman_implementation_t *imp,
1195 pixman_op_t op,
1196 uint32_t * dest,
1197 const uint32_t * src,
1198 const uint32_t * mask,
1199 int width)
1200 {
1201 const uint32_t *end = src + width;
1202
1203 while (src < end)
1204 {
1205 __m64 a = load8888 (mask);
1206 __m64 s = load8888 (src);
1207 __m64 d = load8888 (dest);
1208 __m64 da = expand_alpha (d);
1209
1210 da = negate (da);
1211 s = pix_multiply (s, a);
1212 s = pix_multiply (s, da);
1213 store8888 (dest, s);
1214
1215 ++src;
1216 ++dest;
1217 ++mask;
1218 }
1219 _mm_empty ();
1220 }
1221
1222 static void
1223 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1224 pixman_op_t op,
1225 uint32_t * dest,
1226 const uint32_t * src,
1227 const uint32_t * mask,
1228 int width)
1229 {
1230 const uint32_t *end = src + width;
1231
1232 while (src < end)
1233 {
1234 __m64 a = load8888 (mask);
1235 __m64 s = load8888 (src);
1236 __m64 d = load8888 (dest);
1237 __m64 sa = expand_alpha (s);
1238
1239 a = pix_multiply (a, sa);
1240 a = negate (a);
1241 d = pix_multiply (d, a);
1242 store8888 (dest, d);
1243
1244 ++src;
1245 ++dest;
1246 ++mask;
1247 }
1248 _mm_empty ();
1249 }
1250
1251 static void
1252 mmx_combine_atop_ca (pixman_implementation_t *imp,
1253 pixman_op_t op,
1254 uint32_t * dest,
1255 const uint32_t * src,
1256 const uint32_t * mask,
1257 int width)
1258 {
1259 const uint32_t *end = src + width;
1260
1261 while (src < end)
1262 {
1263 __m64 a = load8888 (mask);
1264 __m64 s = load8888 (src);
1265 __m64 d = load8888 (dest);
1266 __m64 da = expand_alpha (d);
1267 __m64 sa = expand_alpha (s);
1268
1269 s = pix_multiply (s, a);
1270 a = pix_multiply (a, sa);
1271 a = negate (a);
1272 d = pix_add_mul (d, a, s, da);
1273 store8888 (dest, d);
1274
1275 ++src;
1276 ++dest;
1277 ++mask;
1278 }
1279 _mm_empty ();
1280 }
1281
1282 static void
1283 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1284 pixman_op_t op,
1285 uint32_t * dest,
1286 const uint32_t * src,
1287 const uint32_t * mask,
1288 int width)
1289 {
1290 const uint32_t *end = src + width;
1291
1292 while (src < end)
1293 {
1294 __m64 a = load8888 (mask);
1295 __m64 s = load8888 (src);
1296 __m64 d = load8888 (dest);
1297 __m64 da = expand_alpha (d);
1298 __m64 sa = expand_alpha (s);
1299
1300 s = pix_multiply (s, a);
1301 a = pix_multiply (a, sa);
1302 da = negate (da);
1303 d = pix_add_mul (d, a, s, da);
1304 store8888 (dest, d);
1305
1306 ++src;
1307 ++dest;
1308 ++mask;
1309 }
1310 _mm_empty ();
1311 }
1312
1313 static void
1314 mmx_combine_xor_ca (pixman_implementation_t *imp,
1315 pixman_op_t op,
1316 uint32_t * dest,
1317 const uint32_t * src,
1318 const uint32_t * mask,
1319 int width)
1320 {
1321 const uint32_t *end = src + width;
1322
1323 while (src < end)
1324 {
1325 __m64 a = load8888 (mask);
1326 __m64 s = load8888 (src);
1327 __m64 d = load8888 (dest);
1328 __m64 da = expand_alpha (d);
1329 __m64 sa = expand_alpha (s);
1330
1331 s = pix_multiply (s, a);
1332 a = pix_multiply (a, sa);
1333 da = negate (da);
1334 a = negate (a);
1335 d = pix_add_mul (d, a, s, da);
1336 store8888 (dest, d);
1337
1338 ++src;
1339 ++dest;
1340 ++mask;
1341 }
1342 _mm_empty ();
1343 }
1344
1345 static void
1346 mmx_combine_add_ca (pixman_implementation_t *imp,
1347 pixman_op_t op,
1348 uint32_t * dest,
1349 const uint32_t * src,
1350 const uint32_t * mask,
1351 int width)
1352 {
1353 const uint32_t *end = src + width;
1354
1355 while (src < end)
1356 {
1357 __m64 a = load8888 (mask);
1358 __m64 s = load8888 (src);
1359 __m64 d = load8888 (dest);
1360
1361 s = pix_multiply (s, a);
1362 d = pix_add (s, d);
1363 store8888 (dest, d);
1364
1365 ++src;
1366 ++dest;
1367 ++mask;
1368 }
1369 _mm_empty ();
1370 }
1371
1372 /* ------------- MMX code paths called from fbpict.c -------------------- */
1373
1374 static void
1375 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1376 pixman_composite_info_t *info)
1377 {
1378 PIXMAN_COMPOSITE_ARGS (info);
1379 uint32_t src;
1380 uint32_t *dst_line, *dst;
1381 int32_t w;
1382 int dst_stride;
1383 __m64 vsrc, vsrca;
1384
1385 CHECKPOINT ();
1386
1387 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1388
1389 if (src == 0)
1390 return;
1391
1392 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1393
1394 vsrc = load8888 (&src);
1395 vsrca = expand_alpha (vsrc);
1396
1397 while (height--)
1398 {
1399 dst = dst_line;
1400 dst_line += dst_stride;
1401 w = width;
1402
1403 CHECKPOINT ();
1404
1405 while (w && (uintptr_t)dst & 7)
1406 {
1407 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1408
1409 w--;
1410 dst++;
1411 }
1412
1413 while (w >= 2)
1414 {
1415 __m64 vdest;
1416 __m64 dest0, dest1;
1417
1418 vdest = *(__m64 *)dst;
1419
1420 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1421 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1422
1423 *(__m64 *)dst = pack8888 (dest0, dest1);
1424
1425 dst += 2;
1426 w -= 2;
1427 }
1428
1429 CHECKPOINT ();
1430
1431 if (w)
1432 {
1433 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1434 }
1435 }
1436
1437 _mm_empty ();
1438 }
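/* The composite routines in this file share one loop shape: step single
 * pixels until dst is 8-byte aligned, process several pixels per
 * whole-register load/store in the unrolled inner loop, then finish the
 * remaining pixels one at a time.
 */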
1439
1440 static void
1441 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1442 pixman_composite_info_t *info)
1443 {
1444 PIXMAN_COMPOSITE_ARGS (info);
1445 uint32_t src;
1446 uint16_t *dst_line, *dst;
1447 int32_t w;
1448 int dst_stride;
1449 __m64 vsrc, vsrca;
1450
1451 CHECKPOINT ();
1452
1453 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1454
1455 if (src == 0)
1456 return;
1457
1458 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1459
1460 vsrc = load8888 (&src);
1461 vsrca = expand_alpha (vsrc);
1462
1463 while (height--)
1464 {
1465 dst = dst_line;
1466 dst_line += dst_stride;
1467 w = width;
1468
1469 CHECKPOINT ();
1470
1471 while (w && (uintptr_t)dst & 7)
1472 {
1473 uint64_t d = *dst;
1474 __m64 vdest = expand565 (to_m64 (d), 0);
1475
1476 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1477 *dst = to_uint64 (vdest);
1478
1479 w--;
1480 dst++;
1481 }
1482
1483 while (w >= 4)
1484 {
1485 __m64 vdest = *(__m64 *)dst;
1486 __m64 v0, v1, v2, v3;
1487
1488 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1489
1490 v0 = over (vsrc, vsrca, v0);
1491 v1 = over (vsrc, vsrca, v1);
1492 v2 = over (vsrc, vsrca, v2);
1493 v3 = over (vsrc, vsrca, v3);
1494
1495 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1496
1497 dst += 4;
1498 w -= 4;
1499 }
1500
1501 CHECKPOINT ();
1502
1503 while (w)
1504 {
1505 uint64_t d = *dst;
1506 __m64 vdest = expand565 (to_m64 (d), 0);
1507
1508 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1509 *dst = to_uint64 (vdest);
1510
1511 w--;
1512 dst++;
1513 }
1514 }
1515
1516 _mm_empty ();
1517 }
1518
1519 static void
1520 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1521 pixman_composite_info_t *info)
1522 {
1523 PIXMAN_COMPOSITE_ARGS (info);
1524 uint32_t src;
1525 uint32_t *dst_line;
1526 uint32_t *mask_line;
1527 int dst_stride, mask_stride;
1528 __m64 vsrc, vsrca;
1529
1530 CHECKPOINT ();
1531
1532 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1533
1534 if (src == 0)
1535 return;
1536
1537 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1538 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1539
1540 vsrc = load8888 (&src);
1541 vsrca = expand_alpha (vsrc);
1542
1543 while (height--)
1544 {
1545 int twidth = width;
1546 uint32_t *p = (uint32_t *)mask_line;
1547 uint32_t *q = (uint32_t *)dst_line;
1548
1549 while (twidth && (uintptr_t)q & 7)
1550 {
1551 uint32_t m = *(uint32_t *)p;
1552
1553 if (m)
1554 {
1555 __m64 vdest = load8888 (q);
1556 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1557 store8888 (q, vdest);
1558 }
1559
1560 twidth--;
1561 p++;
1562 q++;
1563 }
1564
1565 while (twidth >= 2)
1566 {
1567 uint32_t m0, m1;
1568 m0 = *p;
1569 m1 = *(p + 1);
1570
1571 if (m0 | m1)
1572 {
1573 __m64 dest0, dest1;
1574 __m64 vdest = *(__m64 *)q;
1575
1576 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1577 expand8888 (vdest, 0));
1578 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1579 expand8888 (vdest, 1));
1580
1581 *(__m64 *)q = pack8888 (dest0, dest1);
1582 }
1583
1584 p += 2;
1585 q += 2;
1586 twidth -= 2;
1587 }
1588
1589 if (twidth)
1590 {
1591 uint32_t m = *(uint32_t *)p;
1592
1593 if (m)
1594 {
1595 __m64 vdest = load8888 (q);
1596 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1597 store8888 (q, vdest);
1598 }
1599
1600 twidth--;
1601 p++;
1602 q++;
1603 }
1604
1605 dst_line += dst_stride;
1606 mask_line += mask_stride;
1607 }
1608
1609 _mm_empty ();
1610 }
1611
1612 static void
1613 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1614 pixman_composite_info_t *info)
1615 {
1616 PIXMAN_COMPOSITE_ARGS (info);
1617 uint32_t *dst_line, *dst;
1618 uint32_t *src_line, *src;
1619 uint32_t mask;
1620 __m64 vmask;
1621 int dst_stride, src_stride;
1622 int32_t w;
1623
1624 CHECKPOINT ();
1625
1626 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1627 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1628
1629 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1630 vmask = expand_alpha (load8888 (&mask));
1631
1632 while (height--)
1633 {
1634 dst = dst_line;
1635 dst_line += dst_stride;
1636 src = src_line;
1637 src_line += src_stride;
1638 w = width;
1639
1640 while (w && (uintptr_t)dst & 7)
1641 {
1642 __m64 s = load8888 (src);
1643 __m64 d = load8888 (dst);
1644
1645 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1646
1647 w--;
1648 dst++;
1649 src++;
1650 }
1651
1652 while (w >= 2)
1653 {
1654 __m64 vs = ldq_u ((__m64 *)src);
1655 __m64 vd = *(__m64 *)dst;
1656 __m64 vsrc0 = expand8888 (vs, 0);
1657 __m64 vsrc1 = expand8888 (vs, 1);
1658
1659 *(__m64 *)dst = pack8888 (
1660 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1661 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1662
1663 w -= 2;
1664 dst += 2;
1665 src += 2;
1666 }
1667
1668 if (w)
1669 {
1670 __m64 s = load8888 (src);
1671 __m64 d = load8888 (dst);
1672
1673 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1674 }
1675 }
1676
1677 _mm_empty ();
1678 }
1679
1680 static void
1681 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1682 pixman_composite_info_t *info)
1683 {
1684 PIXMAN_COMPOSITE_ARGS (info);
1685 uint32_t *dst_line, *dst;
1686 uint32_t *src_line, *src;
1687 uint32_t mask;
1688 __m64 vmask;
1689 int dst_stride, src_stride;
1690 int32_t w;
1691 __m64 srca;
1692
1693 CHECKPOINT ();
1694
1695 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1696 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1697 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1698
1699 vmask = expand_alpha (load8888 (&mask));
1700 srca = MC (4x00ff);
1701
1702 while (height--)
1703 {
1704 dst = dst_line;
1705 dst_line += dst_stride;
1706 src = src_line;
1707 src_line += src_stride;
1708 w = width;
1709
1710 while (w && (uintptr_t)dst & 7)
1711 {
1712 uint32_t ssrc = *src | 0xff000000;
1713 __m64 s = load8888 (&ssrc);
1714 __m64 d = load8888 (dst);
1715
1716 store8888 (dst, in_over (s, srca, vmask, d));
1717
1718 w--;
1719 dst++;
1720 src++;
1721 }
1722
1723 while (w >= 16)
1724 {
1725 __m64 vd0 = *(__m64 *)(dst + 0);
1726 __m64 vd1 = *(__m64 *)(dst + 2);
1727 __m64 vd2 = *(__m64 *)(dst + 4);
1728 __m64 vd3 = *(__m64 *)(dst + 6);
1729 __m64 vd4 = *(__m64 *)(dst + 8);
1730 __m64 vd5 = *(__m64 *)(dst + 10);
1731 __m64 vd6 = *(__m64 *)(dst + 12);
1732 __m64 vd7 = *(__m64 *)(dst + 14);
1733
1734 __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1735 __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1736 __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1737 __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1738 __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1739 __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1740 __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1741 __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1742
1743 vd0 = pack8888 (
1744 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1745 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1746
1747 vd1 = pack8888 (
1748 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1749 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1750
1751 vd2 = pack8888 (
1752 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1753 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1754
1755 vd3 = pack8888 (
1756 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1757 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1758
1759 vd4 = pack8888 (
1760 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1761 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1762
1763 vd5 = pack8888 (
1764 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1765 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1766
1767 vd6 = pack8888 (
1768 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1769 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1770
1771 vd7 = pack8888 (
1772 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1773 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1774
1775 *(__m64 *)(dst + 0) = vd0;
1776 *(__m64 *)(dst + 2) = vd1;
1777 *(__m64 *)(dst + 4) = vd2;
1778 *(__m64 *)(dst + 6) = vd3;
1779 *(__m64 *)(dst + 8) = vd4;
1780 *(__m64 *)(dst + 10) = vd5;
1781 *(__m64 *)(dst + 12) = vd6;
1782 *(__m64 *)(dst + 14) = vd7;
1783
1784 w -= 16;
1785 dst += 16;
1786 src += 16;
1787 }
1788
1789 while (w)
1790 {
1791 uint32_t ssrc = *src | 0xff000000;
1792 __m64 s = load8888 (&ssrc);
1793 __m64 d = load8888 (dst);
1794
1795 store8888 (dst, in_over (s, srca, vmask, d));
1796
1797 w--;
1798 dst++;
1799 src++;
1800 }
1801 }
1802
1803 _mm_empty ();
1804 }
1805
1806 static void
1807 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1808 pixman_composite_info_t *info)
1809 {
1810 PIXMAN_COMPOSITE_ARGS (info);
1811 uint32_t *dst_line, *dst;
1812 uint32_t *src_line, *src;
1813 uint32_t s;
1814 int dst_stride, src_stride;
1815 uint8_t a;
1816 int32_t w;
1817
1818 CHECKPOINT ();
1819
1820 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1821 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1822
1823 while (height--)
1824 {
1825 dst = dst_line;
1826 dst_line += dst_stride;
1827 src = src_line;
1828 src_line += src_stride;
1829 w = width;
1830
1831 while (w--)
1832 {
1833 s = *src++;
1834 a = s >> 24;
1835
1836 if (a == 0xff)
1837 {
1838 *dst = s;
1839 }
1840 else if (s)
1841 {
1842 __m64 ms, sa;
1843 ms = load8888 (&s);
1844 sa = expand_alpha (ms);
1845 store8888 (dst, over (ms, sa, load8888 (dst)));
1846 }
1847
1848 dst++;
1849 }
1850 }
1851 _mm_empty ();
1852 }
1853
1854 static void
1855 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1856 pixman_composite_info_t *info)
1857 {
1858 PIXMAN_COMPOSITE_ARGS (info);
1859 uint16_t *dst_line, *dst;
1860 uint32_t *src_line, *src;
1861 int dst_stride, src_stride;
1862 int32_t w;
1863
1864 CHECKPOINT ();
1865
1866 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1867 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1868
1869 #if 0
1870 /* FIXME */
1871 assert (src_image->drawable == mask_image->drawable);
1872 #endif
1873
1874 while (height--)
1875 {
1876 dst = dst_line;
1877 dst_line += dst_stride;
1878 src = src_line;
1879 src_line += src_stride;
1880 w = width;
1881
1882 CHECKPOINT ();
1883
1884 while (w && (uintptr_t)dst & 7)
1885 {
1886 __m64 vsrc = load8888 (src);
1887 uint64_t d = *dst;
1888 __m64 vdest = expand565 (to_m64 (d), 0);
1889
1890 vdest = pack_565 (
1891 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1892
1893 *dst = to_uint64 (vdest);
1894
1895 w--;
1896 dst++;
1897 src++;
1898 }
1899
1900 CHECKPOINT ();
1901
1902 while (w >= 4)
1903 {
1904 __m64 vdest = *(__m64 *)dst;
1905 __m64 v0, v1, v2, v3;
1906 __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1907
1908 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1909
1910 vsrc0 = load8888 ((src + 0));
1911 vsrc1 = load8888 ((src + 1));
1912 vsrc2 = load8888 ((src + 2));
1913 vsrc3 = load8888 ((src + 3));
1914
1915 v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1916 v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1917 v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1918 v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1919
1920 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1921
1922 w -= 4;
1923 dst += 4;
1924 src += 4;
1925 }
1926
1927 CHECKPOINT ();
1928
1929 while (w)
1930 {
1931 __m64 vsrc = load8888 (src);
1932 uint64_t d = *dst;
1933 __m64 vdest = expand565 (to_m64 (d), 0);
1934
1935 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1936
1937 *dst = to_uint64 (vdest);
1938
1939 w--;
1940 dst++;
1941 src++;
1942 }
1943 }
1944
1945 _mm_empty ();
1946 }
1947
1948 static void
1949 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1950 pixman_composite_info_t *info)
1951 {
1952 PIXMAN_COMPOSITE_ARGS (info);
1953 uint32_t src, srca;
1954 uint32_t *dst_line, *dst;
1955 uint8_t *mask_line, *mask;
1956 int dst_stride, mask_stride;
1957 int32_t w;
1958 __m64 vsrc, vsrca;
1959 uint64_t srcsrc;
1960
1961 CHECKPOINT ();
1962
1963 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1964
1965 srca = src >> 24;
1966 if (src == 0)
1967 return;
1968
1969 srcsrc = (uint64_t)src << 32 | src;
1970
1971 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1972 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1973
1974 vsrc = load8888 (&src);
1975 vsrca = expand_alpha (vsrc);
1976
1977 while (height--)
1978 {
1979 dst = dst_line;
1980 dst_line += dst_stride;
1981 mask = mask_line;
1982 mask_line += mask_stride;
1983 w = width;
1984
1985 CHECKPOINT ();
1986
1987 while (w && (uintptr_t)dst & 7)
1988 {
1989 uint64_t m = *mask;
1990
1991 if (m)
1992 {
1993 __m64 vdest = in_over (vsrc, vsrca,
1994 expand_alpha_rev (to_m64 (m)),
1995 load8888 (dst));
1996
1997 store8888 (dst, vdest);
1998 }
1999
2000 w--;
2001 mask++;
2002 dst++;
2003 }
2004
2005 CHECKPOINT ();
2006
2007 while (w >= 2)
2008 {
2009 uint64_t m0, m1;
2010
2011 m0 = *mask;
2012 m1 = *(mask + 1);
2013
2014 if (srca == 0xff && (m0 & m1) == 0xff)
2015 {
2016 *(uint64_t *)dst = srcsrc;
2017 }
2018 else if (m0 | m1)
2019 {
2020 __m64 vdest;
2021 __m64 dest0, dest1;
2022
2023 vdest = *(__m64 *)dst;
2024
2025 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2026 expand8888 (vdest, 0));
2027 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2028 expand8888 (vdest, 1));
2029
2030 *(__m64 *)dst = pack8888 (dest0, dest1);
2031 }
2032
2033 mask += 2;
2034 dst += 2;
2035 w -= 2;
2036 }
2037
2038 CHECKPOINT ();
2039
2040 if (w)
2041 {
2042 uint64_t m = *mask;
2043
2044 if (m)
2045 {
2046 __m64 vdest = load8888 (dst);
2047
2048 vdest = in_over (
2049 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2050 store8888 (dst, vdest);
2051 }
2052 }
2053 }
2054
2055 _mm_empty ();
2056 }
2057
2058 static pixman_bool_t
2059 mmx_fill (pixman_implementation_t *imp,
2060 uint32_t * bits,
2061 int stride,
2062 int bpp,
2063 int x,
2064 int y,
2065 int width,
2066 int height,
2067 uint32_t filler)
2068 {
2069 uint64_t fill;
2070 __m64 vfill;
2071 uint32_t byte_width;
2072 uint8_t *byte_line;
2073
2074 #if defined __GNUC__ && defined USE_X86_MMX
2075 __m64 v1, v2, v3, v4, v5, v6, v7;
2076 #endif
2077
2078 if (bpp != 16 && bpp != 32 && bpp != 8)
2079 return FALSE;
2080
2081 if (bpp == 8)
2082 {
2083 stride = stride * (int) sizeof (uint32_t) / 1;
2084 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2085 byte_width = width;
2086 stride *= 1;
2087 filler = (filler & 0xff) * 0x01010101;
2088 }
2089 else if (bpp == 16)
2090 {
2091 stride = stride * (int) sizeof (uint32_t) / 2;
2092 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2093 byte_width = 2 * width;
2094 stride *= 2;
2095 filler = (filler & 0xffff) * 0x00010001;
2096 }
2097 else
2098 {
2099 stride = stride * (int) sizeof (uint32_t) / 4;
2100 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2101 byte_width = 4 * width;
2102 stride *= 4;
2103 }
2104
2105 fill = ((uint64_t)filler << 32) | filler;
2106 vfill = to_m64 (fill);
2107
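/* Replicate the fill value into seven more MMX registers up front so
 * that the 64-byte inner loop below can issue eight movq stores from
 * distinct registers per iteration (GCC on x86 only).
 */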
2108 #if defined __GNUC__ && defined USE_X86_MMX
2109 __asm__ (
2110 "movq %7, %0\n"
2111 "movq %7, %1\n"
2112 "movq %7, %2\n"
2113 "movq %7, %3\n"
2114 "movq %7, %4\n"
2115 "movq %7, %5\n"
2116 "movq %7, %6\n"
2117 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2118 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2119 : "y" (vfill));
2120 #endif
2121
2122 while (height--)
2123 {
2124 int w;
2125 uint8_t *d = byte_line;
2126
2127 byte_line += stride;
2128 w = byte_width;
2129
2130 if (w >= 1 && ((uintptr_t)d & 1))
2131 {
2132 *(uint8_t *)d = (filler & 0xff);
2133 w--;
2134 d++;
2135 }
2136
2137 if (w >= 2 && ((uintptr_t)d & 3))
2138 {
2139 *(uint16_t *)d = filler;
2140 w -= 2;
2141 d += 2;
2142 }
2143
2144 while (w >= 4 && ((uintptr_t)d & 7))
2145 {
2146 *(uint32_t *)d = filler;
2147
2148 w -= 4;
2149 d += 4;
2150 }
2151
2152 while (w >= 64)
2153 {
2154 #if defined __GNUC__ && defined USE_X86_MMX
2155 __asm__ (
2156 "movq %1, (%0)\n"
2157 "movq %2, 8(%0)\n"
2158 "movq %3, 16(%0)\n"
2159 "movq %4, 24(%0)\n"
2160 "movq %5, 32(%0)\n"
2161 "movq %6, 40(%0)\n"
2162 "movq %7, 48(%0)\n"
2163 "movq %8, 56(%0)\n"
2164 :
2165 : "r" (d),
2166 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2167 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2168 : "memory");
2169 #else
2170 *(__m64*) (d + 0) = vfill;
2171 *(__m64*) (d + 8) = vfill;
2172 *(__m64*) (d + 16) = vfill;
2173 *(__m64*) (d + 24) = vfill;
2174 *(__m64*) (d + 32) = vfill;
2175 *(__m64*) (d + 40) = vfill;
2176 *(__m64*) (d + 48) = vfill;
2177 *(__m64*) (d + 56) = vfill;
2178 #endif
2179 w -= 64;
2180 d += 64;
2181 }
2182
2183 while (w >= 4)
2184 {
2185 *(uint32_t *)d = filler;
2186
2187 w -= 4;
2188 d += 4;
2189 }
2190 if (w >= 2)
2191 {
2192 *(uint16_t *)d = filler;
2193 w -= 2;
2194 d += 2;
2195 }
2196 if (w >= 1)
2197 {
2198 *(uint8_t *)d = (filler & 0xff);
2199 w--;
2200 d++;
2201 }
2202
2203 }
2204
2205 _mm_empty ();
2206 return TRUE;
2207 }
2208
2209 static void
2210 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2211 pixman_composite_info_t *info)
2212 {
2213 PIXMAN_COMPOSITE_ARGS (info);
2214 uint16_t *dst_line, *dst;
2215 uint32_t *src_line, *src, s;
2216 int dst_stride, src_stride;
2217 int32_t w;
2218
2219 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2220 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2221
2222 while (height--)
2223 {
2224 dst = dst_line;
2225 dst_line += dst_stride;
2226 src = src_line;
2227 src_line += src_stride;
2228 w = width;
2229
2230 while (w && (uintptr_t)dst & 7)
2231 {
2232 s = *src++;
2233 *dst = convert_8888_to_0565 (s);
2234 dst++;
2235 w--;
2236 }
2237
2238 while (w >= 4)
2239 {
2240 __m64 vdest;
2241 __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2242 __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2243
2244 vdest = pack_4xpacked565 (vsrc0, vsrc1);
2245
2246 *(__m64 *)dst = vdest;
2247
2248 w -= 4;
2249 src += 4;
2250 dst += 4;
2251 }
2252
2253 while (w)
2254 {
2255 s = *src++;
2256 *dst = convert_8888_to_0565 (s);
2257 dst++;
2258 w--;
2259 }
2260 }
2261
2262 _mm_empty ();
2263 }
2264
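/* SRC with a solid source and an a8 mask: every destination pixel becomes
 * src IN mask, so a zero mask byte writes transparent black rather than
 * leaving the pixel alone (which is what distinguishes this from OVER).
 * A fully transparent source degenerates to mmx_fill (..., 0).
 */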
2265 static void
2266 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2267 pixman_composite_info_t *info)
2268 {
2269 PIXMAN_COMPOSITE_ARGS (info);
2270 uint32_t src, srca;
2271 uint32_t *dst_line, *dst;
2272 uint8_t *mask_line, *mask;
2273 int dst_stride, mask_stride;
2274 int32_t w;
2275 __m64 vsrc;
2276 uint64_t srcsrc;
2277
2278 CHECKPOINT ();
2279
2280 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2281
2282 srca = src >> 24;
2283 if (src == 0)
2284 {
2285 mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2286 PIXMAN_FORMAT_BPP (dest_image->bits.format),
2287 dest_x, dest_y, width, height, 0);
2288 return;
2289 }
2290
2291 srcsrc = (uint64_t)src << 32 | src;
2292
2293 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2294 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2295
2296 vsrc = load8888 (&src);
2297
2298 while (height--)
2299 {
2300 dst = dst_line;
2301 dst_line += dst_stride;
2302 mask = mask_line;
2303 mask_line += mask_stride;
2304 w = width;
2305
2306 CHECKPOINT ();
2307
2308 while (w && (uintptr_t)dst & 7)
2309 {
2310 uint64_t m = *mask;
2311
2312 if (m)
2313 {
2314 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2315
2316 store8888 (dst, vdest);
2317 }
2318 else
2319 {
2320 *dst = 0;
2321 }
2322
2323 w--;
2324 mask++;
2325 dst++;
2326 }
2327
2328 CHECKPOINT ();
2329
2330 while (w >= 2)
2331 {
2332 uint64_t m0, m1;
2333 m0 = *mask;
2334 m1 = *(mask + 1);
2335
2336 if (srca == 0xff && (m0 & m1) == 0xff)
2337 {
2338 *(uint64_t *)dst = srcsrc;
2339 }
2340 else if (m0 | m1)
2341 {
2342 __m64 dest0, dest1;
2343
2344 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2345 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2346
2347 *(__m64 *)dst = pack8888 (dest0, dest1);
2348 }
2349 else
2350 {
2351 *(uint64_t *)dst = 0;
2352 }
2353
2354 mask += 2;
2355 dst += 2;
2356 w -= 2;
2357 }
2358
2359 CHECKPOINT ();
2360
2361 if (w)
2362 {
2363 uint64_t m = *mask;
2364
2365 if (m)
2366 {
2367 __m64 vdest; /* SRC: the previous destination value is not needed */
2368
2369 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2370 store8888 (dst, vdest);
2371 }
2372 else
2373 {
2374 *dst = 0;
2375 }
2376 }
2377 }
2378
2379 _mm_empty ();
2380 }
2381
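/* OVER with a solid source and an a8 mask onto r5g6b5.  Each destination
 * pixel is widened from 565 to 8-bit channels and combined per channel as
 *
 *     dst = src * m + (1 - src.a * m) * dst     (components in [0, 1])
 *
 * by in_over(), then packed back to 565.  srcsrcsrcsrc holds four copies
 * of the source already converted to 565 (pack_565 followed by
 * expand_alpha_rev replicates lane 0), so opaque runs under an all-0xff
 * mask can be stored directly.
 */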
2382 static void
2383 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2384 pixman_composite_info_t *info)
2385 {
2386 PIXMAN_COMPOSITE_ARGS (info);
2387 uint32_t src, srca;
2388 uint16_t *dst_line, *dst;
2389 uint8_t *mask_line, *mask;
2390 int dst_stride, mask_stride;
2391 int32_t w;
2392 __m64 vsrc, vsrca, tmp;
2393 __m64 srcsrcsrcsrc;
2394
2395 CHECKPOINT ();
2396
2397 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2398
2399 srca = src >> 24;
2400 if (src == 0)
2401 return;
2402
2403 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2404 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2405
2406 vsrc = load8888 (&src);
2407 vsrca = expand_alpha (vsrc);
2408
2409 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2410 srcsrcsrcsrc = expand_alpha_rev (tmp);
2411
2412 while (height--)
2413 {
2414 dst = dst_line;
2415 dst_line += dst_stride;
2416 mask = mask_line;
2417 mask_line += mask_stride;
2418 w = width;
2419
2420 CHECKPOINT ();
2421
2422 while (w && (uintptr_t)dst & 7)
2423 {
2424 uint64_t m = *mask;
2425
2426 if (m)
2427 {
2428 uint64_t d = *dst;
2429 __m64 vd = to_m64 (d);
2430 __m64 vdest = in_over (
2431 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2432
2433 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2434 *dst = to_uint64 (vd);
2435 }
2436
2437 w--;
2438 mask++;
2439 dst++;
2440 }
2441
2442 CHECKPOINT ();
2443
2444 while (w >= 4)
2445 {
2446 uint64_t m0, m1, m2, m3;
2447 m0 = *mask;
2448 m1 = *(mask + 1);
2449 m2 = *(mask + 2);
2450 m3 = *(mask + 3);
2451
2452 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2453 {
2454 *(__m64 *)dst = srcsrcsrcsrc;
2455 }
2456 else if (m0 | m1 | m2 | m3)
2457 {
2458 __m64 vdest = *(__m64 *)dst;
2459 __m64 v0, v1, v2, v3;
2460 __m64 vm0, vm1, vm2, vm3;
2461
2462 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2463
2464 vm0 = to_m64 (m0);
2465 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2466
2467 vm1 = to_m64 (m1);
2468 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2469
2470 vm2 = to_m64 (m2);
2471 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2472
2473 vm3 = to_m64 (m3);
2474 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2475
2476 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2477 }
2478
2479 w -= 4;
2480 mask += 4;
2481 dst += 4;
2482 }
2483
2484 CHECKPOINT ();
2485
2486 while (w)
2487 {
2488 uint64_t m = *mask;
2489
2490 if (m)
2491 {
2492 uint64_t d = *dst;
2493 __m64 vd = to_m64 (d);
2494 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2495 expand565 (vd, 0));
2496 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2497 *dst = to_uint64 (vd);
2498 }
2499
2500 w--;
2501 mask++;
2502 dst++;
2503 }
2504 }
2505
2506 _mm_empty ();
2507 }
2508
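/* "pixbuf" sources hold non-premultiplied a8b8g8r8 data (and, as the
 * disabled assert below hints, source and mask are the same drawable).
 * over_rev_non_pre() premultiplies on the fly and invert_colors() swaps
 * the R and B lanes to match the RGB-ordered destination; fully opaque
 * runs of four pixels skip the blend and only channel-swap before the
 * 565 repack.
 */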
2509 static void
2510 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2511 pixman_composite_info_t *info)
2512 {
2513 PIXMAN_COMPOSITE_ARGS (info);
2514 uint16_t *dst_line, *dst;
2515 uint32_t *src_line, *src;
2516 int dst_stride, src_stride;
2517 int32_t w;
2518
2519 CHECKPOINT ();
2520
2521 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2522 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2523
2524 #if 0
2525 /* FIXME */
2526 assert (src_image->drawable == mask_image->drawable);
2527 #endif
2528
2529 while (height--)
2530 {
2531 dst = dst_line;
2532 dst_line += dst_stride;
2533 src = src_line;
2534 src_line += src_stride;
2535 w = width;
2536
2537 CHECKPOINT ();
2538
2539 while (w && (uintptr_t)dst & 7)
2540 {
2541 __m64 vsrc = load8888 (src);
2542 uint64_t d = *dst;
2543 __m64 vdest = expand565 (to_m64 (d), 0);
2544
2545 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2546
2547 *dst = to_uint64 (vdest);
2548
2549 w--;
2550 dst++;
2551 src++;
2552 }
2553
2554 CHECKPOINT ();
2555
2556 while (w >= 4)
2557 {
2558 uint32_t s0, s1, s2, s3;
2559 unsigned char a0, a1, a2, a3;
2560
2561 s0 = *src;
2562 s1 = *(src + 1);
2563 s2 = *(src + 2);
2564 s3 = *(src + 3);
2565
2566 a0 = (s0 >> 24);
2567 a1 = (s1 >> 24);
2568 a2 = (s2 >> 24);
2569 a3 = (s3 >> 24);
2570
2571 if ((a0 & a1 & a2 & a3) == 0xFF)
2572 {
2573 __m64 v0 = invert_colors (load8888 (&s0));
2574 __m64 v1 = invert_colors (load8888 (&s1));
2575 __m64 v2 = invert_colors (load8888 (&s2));
2576 __m64 v3 = invert_colors (load8888 (&s3));
2577
2578 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2579 }
2580 else if (s0 | s1 | s2 | s3)
2581 {
2582 __m64 vdest = *(__m64 *)dst;
2583 __m64 v0, v1, v2, v3;
2584
2585 __m64 vsrc0 = load8888 (&s0);
2586 __m64 vsrc1 = load8888 (&s1);
2587 __m64 vsrc2 = load8888 (&s2);
2588 __m64 vsrc3 = load8888 (&s3);
2589
2590 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2591
2592 v0 = over_rev_non_pre (vsrc0, v0);
2593 v1 = over_rev_non_pre (vsrc1, v1);
2594 v2 = over_rev_non_pre (vsrc2, v2);
2595 v3 = over_rev_non_pre (vsrc3, v3);
2596
2597 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2598 }
2599
2600 w -= 4;
2601 dst += 4;
2602 src += 4;
2603 }
2604
2605 CHECKPOINT ();
2606
2607 while (w)
2608 {
2609 __m64 vsrc = load8888 (src);
2610 uint64_t d = *dst;
2611 __m64 vdest = expand565 (to_m64 (d), 0);
2612
2613 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2614
2615 *dst = to_uint64 (vdest);
2616
2617 w--;
2618 dst++;
2619 src++;
2620 }
2621 }
2622
2623 _mm_empty ();
2624 }
2625
2626 static void
2627 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2628 pixman_composite_info_t *info)
2629 {
2630 PIXMAN_COMPOSITE_ARGS (info);
2631 uint32_t *dst_line, *dst;
2632 uint32_t *src_line, *src;
2633 int dst_stride, src_stride;
2634 int32_t w;
2635
2636 CHECKPOINT ();
2637
2638 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2639 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2640
2641 #if 0
2642 /* FIXME */
2643 assert (src_image->drawable == mask_image->drawable);
2644 #endif
2645
2646 while (height--)
2647 {
2648 dst = dst_line;
2649 dst_line += dst_stride;
2650 src = src_line;
2651 src_line += src_stride;
2652 w = width;
2653
2654 while (w && (uintptr_t)dst & 7)
2655 {
2656 __m64 s = load8888 (src);
2657 __m64 d = load8888 (dst);
2658
2659 store8888 (dst, over_rev_non_pre (s, d));
2660
2661 w--;
2662 dst++;
2663 src++;
2664 }
2665
2666 while (w >= 2)
2667 {
2668 uint32_t s0, s1;
2669 unsigned char a0, a1;
2670 __m64 d0, d1;
2671
2672 s0 = *src;
2673 s1 = *(src + 1);
2674
2675 a0 = (s0 >> 24);
2676 a1 = (s1 >> 24);
2677
2678 if ((a0 & a1) == 0xFF)
2679 {
2680 d0 = invert_colors (load8888 (&s0));
2681 d1 = invert_colors (load8888 (&s1));
2682
2683 *(__m64 *)dst = pack8888 (d0, d1);
2684 }
2685 else if (s0 | s1)
2686 {
2687 __m64 vdest = *(__m64 *)dst;
2688
2689 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2690 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2691
2692 *(__m64 *)dst = pack8888 (d0, d1);
2693 }
2694
2695 w -= 2;
2696 dst += 2;
2697 src += 2;
2698 }
2699
2700 if (w)
2701 {
2702 __m64 s = load8888 (src);
2703 __m64 d = load8888 (dst);
2704
2705 store8888 (dst, over_rev_non_pre (s, d));
2706 }
2707 }
2708
2709 _mm_empty ();
2710 }
2711
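/* Component-alpha variant: the mask is a full a8r8g8b8 value carrying a
 * separate alpha per colour channel, so it is loaded with load8888 (&m)
 * instead of being expanded from a single byte, and in_over() applies it
 * channel-wise before packing back to 565.
 */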
2712 static void
2713 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2714 pixman_composite_info_t *info)
2715 {
2716 PIXMAN_COMPOSITE_ARGS (info);
2717 uint32_t src;
2718 uint16_t *dst_line;
2719 uint32_t *mask_line;
2720 int dst_stride, mask_stride;
2721 __m64 vsrc, vsrca;
2722
2723 CHECKPOINT ();
2724
2725 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2726
2727 if (src == 0)
2728 return;
2729
2730 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2731 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2732
2733 vsrc = load8888 (&src);
2734 vsrca = expand_alpha (vsrc);
2735
2736 while (height--)
2737 {
2738 int twidth = width;
2739 uint32_t *p = (uint32_t *)mask_line;
2740 uint16_t *q = (uint16_t *)dst_line;
2741
2742 while (twidth && ((uintptr_t)q & 7))
2743 {
2744 uint32_t m = *(uint32_t *)p;
2745
2746 if (m)
2747 {
2748 uint64_t d = *q;
2749 __m64 vdest = expand565 (to_m64 (d), 0);
2750 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2751 *q = to_uint64 (vdest);
2752 }
2753
2754 twidth--;
2755 p++;
2756 q++;
2757 }
2758
2759 while (twidth >= 4)
2760 {
2761 uint32_t m0, m1, m2, m3;
2762
2763 m0 = *p;
2764 m1 = *(p + 1);
2765 m2 = *(p + 2);
2766 m3 = *(p + 3);
2767
2768 if ((m0 | m1 | m2 | m3))
2769 {
2770 __m64 vdest = *(__m64 *)q;
2771 __m64 v0, v1, v2, v3;
2772
2773 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2774
2775 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2776 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2777 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2778 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2779
2780 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2781 }
2782 twidth -= 4;
2783 p += 4;
2784 q += 4;
2785 }
2786
2787 while (twidth)
2788 {
2789 uint32_t m;
2790
2791 m = *(uint32_t *)p;
2792 if (m)
2793 {
2794 uint64_t d = *q;
2795 __m64 vdest = expand565 (to_m64 (d), 0);
2796 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2797 *q = to_uint64 (vdest);
2798 }
2799
2800 twidth--;
2801 p++;
2802 q++;
2803 }
2804
2805 mask_line += mask_stride;
2806 dst_line += dst_stride;
2807 }
2808
2809 _mm_empty ();
2810 }
2811
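/* IN with a solid source and an a8 mask on an a8 destination:
 * dst = dst * src.a * mask / (255 * 255).  The scalar head/tail use
 * MUL_UN8, pixman's rounding byte multiply from pixman-combine32.h,
 * which computes a * b / 255 as roughly
 *
 *     t = a * b + 0x80;  result = (t + (t >> 8)) >> 8;
 *
 * so e.g. MUL_UN8 (0xff, d, t) == d exactly.  The vector body applies the
 * same operation to four pixels at once with two in() calls.
 */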
2812 static void
2813 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2814 pixman_composite_info_t *info)
2815 {
2816 PIXMAN_COMPOSITE_ARGS (info);
2817 uint8_t *dst_line, *dst;
2818 uint8_t *mask_line, *mask;
2819 int dst_stride, mask_stride;
2820 int32_t w;
2821 uint32_t src;
2822 uint8_t sa;
2823 __m64 vsrc, vsrca;
2824
2825 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2826 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2827
2828 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2829
2830 sa = src >> 24;
2831
2832 vsrc = load8888 (&src);
2833 vsrca = expand_alpha (vsrc);
2834
2835 while (height--)
2836 {
2837 dst = dst_line;
2838 dst_line += dst_stride;
2839 mask = mask_line;
2840 mask_line += mask_stride;
2841 w = width;
2842
2843 while (w && (uintptr_t)dst & 7)
2844 {
2845 uint16_t tmp;
2846 uint8_t a;
2847 uint32_t m, d;
2848
2849 a = *mask++;
2850 d = *dst;
2851
2852 m = MUL_UN8 (sa, a, tmp);
2853 d = MUL_UN8 (m, d, tmp);
2854
2855 *dst++ = d;
2856 w--;
2857 }
2858
2859 while (w >= 4)
2860 {
2861 __m64 vmask;
2862 __m64 vdest;
2863
2864 vmask = load8888u ((uint32_t *)mask);
2865 vdest = load8888 ((uint32_t *)dst);
2866
2867 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2868
2869 dst += 4;
2870 mask += 4;
2871 w -= 4;
2872 }
2873
2874 while (w--)
2875 {
2876 uint16_t tmp;
2877 uint8_t a;
2878 uint32_t m, d;
2879
2880 a = *mask++;
2881 d = *dst;
2882
2883 m = MUL_UN8 (sa, a, tmp);
2884 d = MUL_UN8 (m, d, tmp);
2885
2886 *dst++ = d;
2887 }
2888 }
2889
2890 _mm_empty ();
2891 }
2892
2893 static void
2894 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2895 pixman_composite_info_t *info)
2896 {
2897 PIXMAN_COMPOSITE_ARGS (info);
2898 uint8_t *dst_line, *dst;
2899 uint8_t *src_line, *src;
2900 int src_stride, dst_stride;
2901 int32_t w;
2902
2903 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2904 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2905
2906 while (height--)
2907 {
2908 dst = dst_line;
2909 dst_line += dst_stride;
2910 src = src_line;
2911 src_line += src_stride;
2912 w = width;
2913
2914 while (w && (uintptr_t)dst & 3)
2915 {
2916 uint8_t s, d;
2917 uint16_t tmp;
2918
2919 s = *src;
2920 d = *dst;
2921
2922 *dst = MUL_UN8 (s, d, tmp);
2923
2924 src++;
2925 dst++;
2926 w--;
2927 }
2928
2929 while (w >= 4)
2930 {
2931 uint32_t *s = (uint32_t *)src;
2932 uint32_t *d = (uint32_t *)dst;
2933
2934 store8888 (d, in (load8888u (s), load8888 (d)));
2935
2936 w -= 4;
2937 dst += 4;
2938 src += 4;
2939 }
2940
2941 while (w--)
2942 {
2943 uint8_t s, d;
2944 uint16_t tmp;
2945
2946 s = *src;
2947 d = *dst;
2948
2949 *dst = MUL_UN8 (s, d, tmp);
2950
2951 src++;
2952 dst++;
2953 }
2954 }
2955
2956 _mm_empty ();
2957 }
2958
2959 static void
2960 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2961 pixman_composite_info_t *info)
2962 {
2963 PIXMAN_COMPOSITE_ARGS (info);
2964 uint8_t *dst_line, *dst;
2965 uint8_t *mask_line, *mask;
2966 int dst_stride, mask_stride;
2967 int32_t w;
2968 uint32_t src;
2969 uint8_t sa;
2970 __m64 vsrc, vsrca;
2971
2972 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2973 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2974
2975 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2976
2977 sa = src >> 24;
2978
2979 if (src == 0)
2980 return;
2981
2982 vsrc = load8888 (&src);
2983 vsrca = expand_alpha (vsrc);
2984
2985 while (height--)
2986 {
2987 dst = dst_line;
2988 dst_line += dst_stride;
2989 mask = mask_line;
2990 mask_line += mask_stride;
2991 w = width;
2992
2993 while (w && (uintptr_t)dst & 3)
2994 {
2995 uint16_t tmp;
2996 uint16_t a;
2997 uint32_t m, d;
2998 uint32_t r;
2999
3000 a = *mask++;
3001 d = *dst;
3002
3003 m = MUL_UN8 (sa, a, tmp);
3004 r = ADD_UN8 (m, d, tmp);
3005
3006 *dst++ = r;
3007 w--;
3008 }
3009
3010 while (w >= 4)
3011 {
3012 __m64 vmask;
3013 __m64 vdest;
3014
3015 vmask = load8888u ((uint32_t *)mask);
3016 vdest = load8888 ((uint32_t *)dst);
3017
3018 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3019
3020 dst += 4;
3021 mask += 4;
3022 w -= 4;
3023 }
3024
3025 while (w--)
3026 {
3027 uint16_t tmp;
3028 uint16_t a;
3029 uint32_t m, d;
3030 uint32_t r;
3031
3032 a = *mask++;
3033 d = *dst;
3034
3035 m = MUL_UN8 (sa, a, tmp);
3036 r = ADD_UN8 (m, d, tmp);
3037
3038 *dst++ = r;
3039 }
3040 }
3041
3042 _mm_empty ();
3043 }
3044
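/* ADD on a8: bytes are summed with unsigned saturation.  The scalar loops
 * use a branch-free clamp: with t at most 510, t >> 8 is 1 exactly when
 * the sum overflowed, so OR-ing with 0 - (t >> 8) forces the byte to 0xff:
 *
 *     d = 200, s = 100:  t = 0x12c, t >> 8 == 1, t | -1 -> stored as 0xff
 *
 * The 8-pixel body is simply _mm_adds_pu8.
 */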
3045 static void
3046 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3047 pixman_composite_info_t *info)
3048 {
3049 PIXMAN_COMPOSITE_ARGS (info);
3050 uint8_t *dst_line, *dst;
3051 uint8_t *src_line, *src;
3052 int dst_stride, src_stride;
3053 int32_t w;
3054 uint8_t s, d;
3055 uint16_t t;
3056
3057 CHECKPOINT ();
3058
3059 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3060 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3061
3062 while (height--)
3063 {
3064 dst = dst_line;
3065 dst_line += dst_stride;
3066 src = src_line;
3067 src_line += src_stride;
3068 w = width;
3069
3070 while (w && (uintptr_t)dst & 7)
3071 {
3072 s = *src;
3073 d = *dst;
3074 t = d + s;
3075 s = t | (0 - (t >> 8));
3076 *dst = s;
3077
3078 dst++;
3079 src++;
3080 w--;
3081 }
3082
3083 while (w >= 8)
3084 {
3085 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3086 dst += 8;
3087 src += 8;
3088 w -= 8;
3089 }
3090
3091 while (w)
3092 {
3093 s = *src;
3094 d = *dst;
3095 t = d + s;
3096 s = t | (0 - (t >> 8));
3097 *dst = s;
3098
3099 dst++;
3100 src++;
3101 w--;
3102 }
3103 }
3104
3105 _mm_empty ();
3106 }
3107
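/* ADD on r5g6b5 cannot saturate in packed form (a carry out of one field
 * would spill into its neighbour), so each pixel is widened to 8-bit
 * channels, added with UN8x4_ADD_UN8x4 / _mm_adds_pu8, and packed back.
 */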
3108 static void
3109 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3110 pixman_composite_info_t *info)
3111 {
3112 PIXMAN_COMPOSITE_ARGS (info);
3113 uint16_t *dst_line, *dst;
3114 uint32_t d;
3115 uint16_t *src_line, *src;
3116 uint32_t s;
3117 int dst_stride, src_stride;
3118 int32_t w;
3119
3120 CHECKPOINT ();
3121
3122 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3123 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3124
3125 while (height--)
3126 {
3127 dst = dst_line;
3128 dst_line += dst_stride;
3129 src = src_line;
3130 src_line += src_stride;
3131 w = width;
3132
3133 while (w && (uintptr_t)dst & 7)
3134 {
3135 s = *src++;
3136 if (s)
3137 {
3138 d = *dst;
3139 s = convert_0565_to_8888 (s);
3140 if (d)
3141 {
3142 d = convert_0565_to_8888 (d);
3143 UN8x4_ADD_UN8x4 (s, d);
3144 }
3145 *dst = convert_8888_to_0565 (s);
3146 }
3147 dst++;
3148 w--;
3149 }
3150
3151 while (w >= 4)
3152 {
3153 __m64 vdest = *(__m64 *)dst;
3154 __m64 vsrc = ldq_u ((__m64 *)src);
3155 __m64 vd0, vd1;
3156 __m64 vs0, vs1;
3157
3158 expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3159 expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3160
3161 vd0 = _mm_adds_pu8 (vd0, vs0);
3162 vd1 = _mm_adds_pu8 (vd1, vs1);
3163
3164 *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3165
3166 dst += 4;
3167 src += 4;
3168 w -= 4;
3169 }
3170
3171 while (w--)
3172 {
3173 s = *src++;
3174 if (s)
3175 {
3176 d = *dst;
3177 s = convert_0565_to_8888 (s);
3178 if (d)
3179 {
3180 d = convert_0565_to_8888 (d);
3181 UN8x4_ADD_UN8x4 (s, d);
3182 }
3183 *dst = convert_8888_to_0565 (s);
3184 }
3185 dst++;
3186 }
3187 }
3188
3189 _mm_empty ();
3190 }
3191
3192 static void
3193 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3194 pixman_composite_info_t *info)
3195 {
3196 PIXMAN_COMPOSITE_ARGS (info);
3197 uint32_t *dst_line, *dst;
3198 uint32_t *src_line, *src;
3199 int dst_stride, src_stride;
3200 int32_t w;
3201
3202 CHECKPOINT ();
3203
3204 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3205 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3206
3207 while (height--)
3208 {
3209 dst = dst_line;
3210 dst_line += dst_stride;
3211 src = src_line;
3212 src_line += src_stride;
3213 w = width;
3214
3215 while (w && (uintptr_t)dst & 7)
3216 {
3217 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3218 load ((const uint32_t *)dst)));
3219 dst++;
3220 src++;
3221 w--;
3222 }
3223
3224 while (w >= 2)
3225 {
3226 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3227 dst += 2;
3228 src += 2;
3229 w -= 2;
3230 }
3231
3232 if (w)
3233 {
3234 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3235 load ((const uint32_t *)dst)));
3236
3237 }
3238 }
3239
3240 _mm_empty ();
3241 }
3242
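/* mmx_blt: rectangle copy for 16 and 32 bpp.  Like mmx_fill it aligns the
 * destination with 1/2/4-byte head stores, then moves 64 bytes per
 * iteration through the eight MMX registers, reading the possibly
 * unaligned source with ldq_u; the tail mirrors the head.  Only equal
 * source/destination depths are handled; anything else returns FALSE so
 * the caller can take a fallback path.
 */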
3243 static pixman_bool_t
3244 mmx_blt (pixman_implementation_t *imp,
3245 uint32_t * src_bits,
3246 uint32_t * dst_bits,
3247 int src_stride,
3248 int dst_stride,
3249 int src_bpp,
3250 int dst_bpp,
3251 int src_x,
3252 int src_y,
3253 int dest_x,
3254 int dest_y,
3255 int width,
3256 int height)
3257 {
3258 uint8_t * src_bytes;
3259 uint8_t * dst_bytes;
3260 int byte_width;
3261
3262 if (src_bpp != dst_bpp)
3263 return FALSE;
3264
3265 if (src_bpp == 16)
3266 {
3267 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3268 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3269 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3270 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3271 byte_width = 2 * width;
3272 src_stride *= 2;
3273 dst_stride *= 2;
3274 }
3275 else if (src_bpp == 32)
3276 {
3277 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3278 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3279 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3280 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3281 byte_width = 4 * width;
3282 src_stride *= 4;
3283 dst_stride *= 4;
3284 }
3285 else
3286 {
3287 return FALSE;
3288 }
3289
3290 while (height--)
3291 {
3292 int w;
3293 uint8_t *s = src_bytes;
3294 uint8_t *d = dst_bytes;
3295 src_bytes += src_stride;
3296 dst_bytes += dst_stride;
3297 w = byte_width;
3298
3299 if (w >= 1 && ((uintptr_t)d & 1))
3300 {
3301 *(uint8_t *)d = *(uint8_t *)s;
3302 w -= 1;
3303 s += 1;
3304 d += 1;
3305 }
3306
3307 if (w >= 2 && ((uintptr_t)d & 3))
3308 {
3309 *(uint16_t *)d = *(uint16_t *)s;
3310 w -= 2;
3311 s += 2;
3312 d += 2;
3313 }
3314
3315 while (w >= 4 && ((uintptr_t)d & 7))
3316 {
3317 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3318
3319 w -= 4;
3320 s += 4;
3321 d += 4;
3322 }
3323
3324 while (w >= 64)
3325 {
3326 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3327 __asm__ (
3328 "movq (%1), %%mm0\n"
3329 "movq 8(%1), %%mm1\n"
3330 "movq 16(%1), %%mm2\n"
3331 "movq 24(%1), %%mm3\n"
3332 "movq 32(%1), %%mm4\n"
3333 "movq 40(%1), %%mm5\n"
3334 "movq 48(%1), %%mm6\n"
3335 "movq 56(%1), %%mm7\n"
3336
3337 "movq %%mm0, (%0)\n"
3338 "movq %%mm1, 8(%0)\n"
3339 "movq %%mm2, 16(%0)\n"
3340 "movq %%mm3, 24(%0)\n"
3341 "movq %%mm4, 32(%0)\n"
3342 "movq %%mm5, 40(%0)\n"
3343 "movq %%mm6, 48(%0)\n"
3344 "movq %%mm7, 56(%0)\n"
3345 :
3346 : "r" (d), "r" (s)
3347 : "memory",
3348 "%mm0", "%mm1", "%mm2", "%mm3",
3349 "%mm4", "%mm5", "%mm6", "%mm7");
3350 #else
3351 __m64 v0 = ldq_u ((__m64 *)(s + 0));
3352 __m64 v1 = ldq_u ((__m64 *)(s + 8));
3353 __m64 v2 = ldq_u ((__m64 *)(s + 16));
3354 __m64 v3 = ldq_u ((__m64 *)(s + 24));
3355 __m64 v4 = ldq_u ((__m64 *)(s + 32));
3356 __m64 v5 = ldq_u ((__m64 *)(s + 40));
3357 __m64 v6 = ldq_u ((__m64 *)(s + 48));
3358 __m64 v7 = ldq_u ((__m64 *)(s + 56));
3359 *(__m64 *)(d + 0) = v0;
3360 *(__m64 *)(d + 8) = v1;
3361 *(__m64 *)(d + 16) = v2;
3362 *(__m64 *)(d + 24) = v3;
3363 *(__m64 *)(d + 32) = v4;
3364 *(__m64 *)(d + 40) = v5;
3365 *(__m64 *)(d + 48) = v6;
3366 *(__m64 *)(d + 56) = v7;
3367 #endif
3368
3369 w -= 64;
3370 s += 64;
3371 d += 64;
3372 }
3373 while (w >= 4)
3374 {
3375 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3376
3377 w -= 4;
3378 s += 4;
3379 d += 4;
3380 }
3381 if (w >= 2)
3382 {
3383 *(uint16_t *)d = *(uint16_t *)s;
3384 w -= 2;
3385 s += 2;
3386 d += 2;
3387 }
3388 }
3389
3390 _mm_empty ();
3391
3392 return TRUE;
3393 }
3394
3395 static void
3396 mmx_composite_copy_area (pixman_implementation_t *imp,
3397 pixman_composite_info_t *info)
3398 {
3399 PIXMAN_COMPOSITE_ARGS (info);
3400
3401 mmx_blt (imp, src_image->bits.bits,
3402 dest_image->bits.bits,
3403 src_image->bits.rowstride,
3404 dest_image->bits.rowstride,
3405 PIXMAN_FORMAT_BPP (src_image->bits.format),
3406 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3407 src_x, src_y, dest_x, dest_y, width, height);
3408 }
3409
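/* OVER of x8r8g8b8 under an a8 mask: the undefined alpha byte of the
 * source is forced to 0xff (*src | 0xff000000) so the pixel behaves as
 * opaque, which lets a 0xff mask byte turn into a plain store.
 */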
3410 static void
3411 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3412 pixman_composite_info_t *info)
3413 {
3414 PIXMAN_COMPOSITE_ARGS (info);
3415 uint32_t *src, *src_line;
3416 uint32_t *dst, *dst_line;
3417 uint8_t *mask, *mask_line;
3418 int src_stride, mask_stride, dst_stride;
3419 int32_t w;
3420
3421 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3422 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3423 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3424
3425 while (height--)
3426 {
3427 src = src_line;
3428 src_line += src_stride;
3429 dst = dst_line;
3430 dst_line += dst_stride;
3431 mask = mask_line;
3432 mask_line += mask_stride;
3433
3434 w = width;
3435
3436 while (w--)
3437 {
3438 uint64_t m = *mask;
3439
3440 if (m)
3441 {
3442 uint32_t ssrc = *src | 0xff000000;
3443 __m64 s = load8888 (&ssrc);
3444
3445 if (m == 0xff)
3446 {
3447 store8888 (dst, s);
3448 }
3449 else
3450 {
3451 __m64 sa = expand_alpha (s);
3452 __m64 vm = expand_alpha_rev (to_m64 (m));
3453 __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3454
3455 store8888 (dst, vdest);
3456 }
3457 }
3458
3459 mask++;
3460 dst++;
3461 src++;
3462 }
3463 }
3464
3465 _mm_empty ();
3466 }
3467
3468 static void
3469 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3470 pixman_composite_info_t *info)
3471 {
3472 PIXMAN_COMPOSITE_ARGS (info);
3473 uint32_t src;
3474 uint32_t *dst_line, *dst;
3475 int32_t w;
3476 int dst_stride;
3477 __m64 vsrc;
3478
3479 CHECKPOINT ();
3480
3481 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3482
3483 if (src == 0)
3484 return;
3485
3486 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3487
3488 vsrc = load8888 (&src);
3489
3490 while (height--)
3491 {
3492 dst = dst_line;
3493 dst_line += dst_stride;
3494 w = width;
3495
3496 CHECKPOINT ();
3497
3498 while (w && (uintptr_t)dst & 7)
3499 {
3500 __m64 vdest = load8888 (dst);
3501
3502 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3503
3504 w--;
3505 dst++;
3506 }
3507
3508 while (w >= 2)
3509 {
3510 __m64 vdest = *(__m64 *)dst;
3511 __m64 dest0 = expand8888 (vdest, 0);
3512 __m64 dest1 = expand8888 (vdest, 1);
3513
3514
3515 dest0 = over (dest0, expand_alpha (dest0), vsrc);
3516 dest1 = over (dest1, expand_alpha (dest1), vsrc);
3517
3518 *(__m64 *)dst = pack8888 (dest0, dest1);
3519
3520 dst += 2;
3521 w -= 2;
3522 }
3523
3524 CHECKPOINT ();
3525
3526 if (w)
3527 {
3528 __m64 vdest = load8888 (dst);
3529
3530 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3531 }
3532 }
3533
3534 _mm_empty ();
3535 }
3536
3537 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3538 #define BMSK (BSHIFT - 1)
3539
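/* Bilinear helpers.  With BSHIFT == 1 << BILINEAR_INTERPOLATION_BITS, the
 * vertical weights satisfy wt + wb == BSHIFT, and the horizontal weights
 * come from the top bits of the 16-bit fraction of vx:
 *
 *     wr  = (vx >> (16 - BILINEAR_INTERPOLATION_BITS)) & BMSK;
 *     wl  = BSHIFT - wr;
 *     pix = (tl*wt*wl + tr*wt*wr + bl*wb*wl + br*wb*wr)
 *               >> (2 * BILINEAR_INTERPOLATION_BITS);
 *
 * BILINEAR_INTERPOLATE_ONE_PIXEL has two horizontal paths: with fewer
 * than 8 weight bits the vertically filtered lanes still fit signed
 * 16-bit operands, so one pmaddwd (_mm_madd_pi16) per half accumulates
 * the left/right taps (the mm_addc7/mm_xorc7 trick yields the interleaved
 * wl/wr pair); at exactly 8 bits they no longer fit as signed values, so
 * 32-bit products are assembled from _mm_mullo_pi16/_mm_mulhi_pu16.
 */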
3540 #define BILINEAR_DECLARE_VARIABLES \
3541 const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
3542 const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
3543 const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \
3544 const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
3545 const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
3546 const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
3547 const __m64 mm_zero = _mm_setzero_si64 (); \
3548 __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3549
3550 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
3551 do { \
3552 /* fetch 2x2 pixel block into 2 mmx registers */ \
3553 __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
3554 __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
3555 /* vertical interpolation */ \
3556 __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
3557 __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
3558 __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
3559 __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
3560 __m64 hi = _mm_add_pi16 (t_hi, b_hi); \
3561 __m64 lo = _mm_add_pi16 (t_lo, b_lo); \
3562 vx += unit_x; \
3563 if (BILINEAR_INTERPOLATION_BITS < 8) \
3564 { \
3565 /* calculate horizontal weights */ \
3566 __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
3567 _mm_srli_pi16 (mm_x, \
3568 16 - BILINEAR_INTERPOLATION_BITS))); \
3569 /* horizontal interpolation */ \
3570 __m64 p = _mm_unpacklo_pi16 (lo, hi); \
3571 __m64 q = _mm_unpackhi_pi16 (lo, hi); \
3572 lo = _mm_madd_pi16 (p, mm_wh); \
3573 hi = _mm_madd_pi16 (q, mm_wh); \
3574 } \
3575 else \
3576 { \
3577 /* calculate horizontal weights */ \
3578 __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \
3579 16 - BILINEAR_INTERPOLATION_BITS)); \
3580 __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \
3581 16 - BILINEAR_INTERPOLATION_BITS); \
3582 /* horizontal interpolation */ \
3583 __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \
3584 __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \
3585 __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \
3586 __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \
3587 lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \
3588 _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \
3589 hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \
3590 _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \
3591 } \
3592 mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3593 /* shift and pack the result */ \
3594 hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
3595 lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
3596 lo = _mm_packs_pi32 (lo, hi); \
3597 lo = _mm_packs_pu16 (lo, lo); \
3598 pix = lo; \
3599 } while (0)
3600
3601 #define BILINEAR_SKIP_ONE_PIXEL() \
3602 do { \
3603 vx += unit_x; \
3604 mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3605 } while (0)
3606
3607 static force_inline void
3608 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst,
3609 const uint32_t * mask,
3610 const uint32_t * src_top,
3611 const uint32_t * src_bottom,
3612 int32_t w,
3613 int wt,
3614 int wb,
3615 pixman_fixed_t vx,
3616 pixman_fixed_t unit_x,
3617 pixman_fixed_t max_vx,
3618 pixman_bool_t zero_src)
3619 {
3620 BILINEAR_DECLARE_VARIABLES;
3621 __m64 pix;
3622
3623 while (w--)
3624 {
3625 BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3626 store (dst, pix);
3627 dst++;
3628 }
3629
3630 _mm_empty ();
3631 }
3632
3633 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3634 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3635 uint32_t, uint32_t, uint32_t,
3636 COVER, FLAG_NONE)
3637 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3638 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3639 uint32_t, uint32_t, uint32_t,
3640 PAD, FLAG_NONE)
3641 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3642 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3643 uint32_t, uint32_t, uint32_t,
3644 NONE, FLAG_NONE)
3645 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3646 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3647 uint32_t, uint32_t, uint32_t,
3648 NORMAL, FLAG_NONE)
3649
3650 static force_inline void
3651 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst,
3652 const uint32_t * mask,
3653 const uint32_t * src_top,
3654 const uint32_t * src_bottom,
3655 int32_t w,
3656 int wt,
3657 int wb,
3658 pixman_fixed_t vx,
3659 pixman_fixed_t unit_x,
3660 pixman_fixed_t max_vx,
3661 pixman_bool_t zero_src)
3662 {
3663 BILINEAR_DECLARE_VARIABLES;
3664 __m64 pix1, pix2;
3665
3666 while (w)
3667 {
3668 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3669
3670 if (!is_zero (pix1))
3671 {
3672 pix2 = load (dst);
3673 store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3674 }
3675
3676 w--;
3677 dst++;
3678 }
3679
3680 _mm_empty ();
3681 }
3682
3683 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3684 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3685 uint32_t, uint32_t, uint32_t,
3686 COVER, FLAG_NONE)
3687 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3688 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3689 uint32_t, uint32_t, uint32_t,
3690 PAD, FLAG_NONE)
3691 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3692 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3693 uint32_t, uint32_t, uint32_t,
3694 NONE, FLAG_NONE)
3695 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3696 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3697 uint32_t, uint32_t, uint32_t,
3698 NORMAL, FLAG_NONE)
3699
3700 static force_inline void
3701 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst,
3702 const uint8_t * mask,
3703 const uint32_t * src_top,
3704 const uint32_t * src_bottom,
3705 int32_t w,
3706 int wt,
3707 int wb,
3708 pixman_fixed_t vx,
3709 pixman_fixed_t unit_x,
3710 pixman_fixed_t max_vx,
3711 pixman_bool_t zero_src)
3712 {
3713 BILINEAR_DECLARE_VARIABLES;
3714 __m64 pix1, pix2;
3715 uint32_t m;
3716
3717 while (w)
3718 {
3719 m = (uint32_t) *mask++;
3720
3721 if (m)
3722 {
3723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3724
3725 if (m == 0xff && is_opaque (pix1))
3726 {
3727 store (dst, pix1);
3728 }
3729 else
3730 {
3731 __m64 ms, md, ma, msa;
3732
3733 pix2 = load (dst);
3734 ma = expand_alpha_rev (to_m64 (m));
3735 ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3736 md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3737
3738 msa = expand_alpha (ms);
3739
3740 store8888 (dst, (in_over (ms, msa, ma, md)));
3741 }
3742 }
3743 else
3744 {
3745 BILINEAR_SKIP_ONE_PIXEL ();
3746 }
3747
3748 w--;
3749 dst++;
3750 }
3751
3752 _mm_empty ();
3753 }
3754
3755 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3756 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3757 uint32_t, uint8_t, uint32_t,
3758 COVER, FLAG_HAVE_NON_SOLID_MASK)
3759 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3760 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3761 uint32_t, uint8_t, uint32_t,
3762 PAD, FLAG_HAVE_NON_SOLID_MASK)
3763 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3764 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3765 uint32_t, uint8_t, uint32_t,
3766 NONE, FLAG_HAVE_NON_SOLID_MASK)
3767 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3768 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3769 uint32_t, uint8_t, uint32_t,
3770 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3771
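/* Scanline fetchers: each converts one row of the source image into
 * a8r8g8b8 in iter->buffer and advances iter->bits by one stride (the
 * mask argument is unused here).  x8r8g8b8 only needs its alpha byte
 * forced to 0xff, r5g6b5 is widened via expand_4xpacked565, and a8 is
 * shifted up into the alpha byte with zero colour channels.
 */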
3772 static uint32_t *
3773 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3774 {
3775 int w = iter->width;
3776 uint32_t *dst = iter->buffer;
3777 uint32_t *src = (uint32_t *)iter->bits;
3778
3779 iter->bits += iter->stride;
3780
3781 while (w && ((uintptr_t)dst) & 7)
3782 {
3783 *dst++ = (*src++) | 0xff000000;
3784 w--;
3785 }
3786
3787 while (w >= 8)
3788 {
3789 __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3790 __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3791 __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3792 __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3793
3794 *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3795 *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3796 *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3797 *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3798
3799 dst += 8;
3800 src += 8;
3801 w -= 8;
3802 }
3803
3804 while (w)
3805 {
3806 *dst++ = (*src++) | 0xff000000;
3807 w--;
3808 }
3809
3810 _mm_empty ();
3811 return iter->buffer;
3812 }
3813
3814 static uint32_t *
3815 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3816 {
3817 int w = iter->width;
3818 uint32_t *dst = iter->buffer;
3819 uint16_t *src = (uint16_t *)iter->bits;
3820
3821 iter->bits += iter->stride;
3822
3823 while (w && ((uintptr_t)dst) & 0x0f)
3824 {
3825 uint16_t s = *src++;
3826
3827 *dst++ = convert_0565_to_8888 (s);
3828 w--;
3829 }
3830
3831 while (w >= 4)
3832 {
3833 __m64 vsrc = ldq_u ((__m64 *)src);
3834 __m64 mm0, mm1;
3835
3836 expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3837
3838 *(__m64 *)(dst + 0) = mm0;
3839 *(__m64 *)(dst + 2) = mm1;
3840
3841 dst += 4;
3842 src += 4;
3843 w -= 4;
3844 }
3845
3846 while (w)
3847 {
3848 uint16_t s = *src++;
3849
3850 *dst++ = convert_0565_to_8888 (s);
3851 w--;
3852 }
3853
3854 _mm_empty ();
3855 return iter->buffer;
3856 }
3857
3858 static uint32_t *
3859 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3860 {
3861 int w = iter->width;
3862 uint32_t *dst = iter->buffer;
3863 uint8_t *src = iter->bits;
3864
3865 iter->bits += iter->stride;
3866
3867 while (w && (((uintptr_t)dst) & 15))
3868 {
3869 *dst++ = *(src++) << 24;
3870 w--;
3871 }
3872
3873 while (w >= 8)
3874 {
3875 __m64 mm0 = ldq_u ((__m64 *)src);
3876
3877 __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
3878 __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
3879 __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3880 __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3881 __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3882 __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3883
3884 *(__m64 *)(dst + 0) = mm3;
3885 *(__m64 *)(dst + 2) = mm4;
3886 *(__m64 *)(dst + 4) = mm5;
3887 *(__m64 *)(dst + 6) = mm6;
3888
3889 dst += 8;
3890 src += 8;
3891 w -= 8;
3892 }
3893
3894 while (w)
3895 {
3896 *dst++ = *(src++) << 24;
3897 w--;
3898 }
3899
3900 _mm_empty ();
3901 return iter->buffer;
3902 }
3903
3904 typedef struct
3905 {
3906 pixman_format_code_t format;
3907 pixman_iter_get_scanline_t get_scanline;
3908 } fetcher_info_t;
3909
3910 static const fetcher_info_t fetchers[] =
3911 {
3912 { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 },
3913 { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 },
3914 { PIXMAN_a8, mmx_fetch_a8 },
3915 { PIXMAN_null }
3916 };
3917
3918 static pixman_bool_t
3919 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
3920 {
3921 pixman_image_t *image = iter->image;
3922
3923 #define FLAGS \
3924 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
3925 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3926
3927 if ((iter->iter_flags & ITER_NARROW) &&
3928 (iter->image_flags & FLAGS) == FLAGS)
3929 {
3930 const fetcher_info_t *f;
3931
3932 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
3933 {
3934 if (image->common.extended_format_code == f->format)
3935 {
3936 uint8_t *b = (uint8_t *)image->bits.bits;
3937 int s = image->bits.rowstride * 4;
3938
3939 iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
3940 iter->stride = s;
3941
3942 iter->get_scanline = f->get_scanline;
3943 return TRUE;
3944 }
3945 }
3946 }
3947
3948 return FALSE;
3949 }
3950
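/* The fast path table maps (operator, source, mask, destination) patterns
 * to the routines above: "solid" matches any constant source, "null"
 * means no mask, so e.g. the first entry reads "OVER of a solid colour
 * through an a8 mask onto r5g6b5 is handled by
 * mmx_composite_over_n_8_0565".  Entries are matched in order, so the
 * more specific ones come first.
 */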
3951 static const pixman_fast_path_t mmx_fast_paths[] =
3952 {
3953 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
3954 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
3955 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
3956 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
3957 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
3958 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
3959 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3960 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3961 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
3962 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3963 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3964 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
3965 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3966 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3967 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
3968 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3969 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3970 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
3971 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
3972 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
3973 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
3974 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
3975 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
3976 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
3977 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
3978 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
3979 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
3980 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
3981 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
3982 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
3983 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
3984 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
3985 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
3986 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
3987 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3988 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3989
3990 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
3991 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
3992 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
3993 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
3994 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
3995 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
3996
3997 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
3998 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
3999
4000 PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
4001 PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
4002 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
4003 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
4004 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
4005 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
4006
4007 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4008 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4009 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4010 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4011 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
4012 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
4013 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
4014 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
4015 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
4016 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
4017 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4018 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4019 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4020 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4021 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
4022 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),
4023
4024 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
4025 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
4026
4027 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4028 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4029 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4030 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4031 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4032 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4033
4034 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4035 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4036 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4037 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4038
4039 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
4040 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
4041 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
4042 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),
4043
4044 { PIXMAN_OP_NONE },
4045 };
4046
4047 pixman_implementation_t *
4048 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4049 {
4050 pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4051
4052 imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4053 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4054 imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4055 imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4056 imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4057 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4058 imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4059 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4060 imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4061 imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4062 imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4063
4064 imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4065 imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4066 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4067 imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4068 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4069 imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4070 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4071 imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4072 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4073 imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4074 imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4075
4076 imp->blt = mmx_blt;
4077 imp->fill = mmx_fill;
4078
4079 imp->src_iter_init = mmx_src_iter_init;
4080
4081 return imp;
4082 }
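/* A minimal sketch of how this implementation slots into pixman's
 * dispatch chain -- a hypothetical caller, roughly what the
 * per-architecture setup code (e.g. pixman-x86.c) does; have_mmx() here
 * stands in for the real CPU-feature check:
 *
 *     pixman_implementation_t *imp = _pixman_implementation_create_general ();
 *     imp = _pixman_implementation_create_fast_path (imp);
 *     if (have_mmx ())
 *         imp = _pixman_implementation_create_mmx (imp);
 *
 * Operations the MMX layer declines fall through to the fallback chain.
 */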
4083
4084 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
4085