/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif

#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
# include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
         : "=r" (ret)
         : "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
         : "+y" (__A)
         : "y" (__B)
    );
    return __A;
}

# define _mm_shuffle_pi16(A, N)                                 \
    ({                                                          \
        __m64 ret;                                              \
                                                                \
        asm ("pshufw %2, %1, %0\n\t"                            \
             : "=y" (ret)                                       \
             : "y" (A), "K" ((const int8_t)N)                   \
        );                                                      \
                                                                \
        ret;                                                    \
    })
# endif
#endif

#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0)                            \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif
/* Notes about writing mmx code
 *
 * Give memory operands as the second operand. If you give a memory
 * operand as the first, gcc will first load it into a register and
 * then use that register.
 *
 * i.e. use
 *
 *     _mm_mullo_pi16 (x, mmx_constant);
 *
 * not
 *
 *     _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.  If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif

static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
# define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
# define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
# define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
# define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v, int s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
 * and maps its result to the same range.
 *
 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
 * Notation, Notation, Notation", the first of which is
 *
 *   prod (a, b) = (a * b + 128) / 255.
 *
 * By approximating the division by 255 as 257/65536 it can be replaced by a
 * multiply and a right shift. This is the implementation that we use in
 * pix_multiply(), but we use _mm_mulhi_pu16() (part of SSE1 or Extended
 * 3DNow!, and unavailable at the time of the book's publication) to perform
 * the multiplication by 257 and the right shift in a single operation.
 *
 *   prod (a, b) = ((a * b + 128) * 257) >> 16.
 *
 * A third way also exists (it is how pix_multiply() was implemented prior
 * to 14208344) that performs the multiplication by 257 with adds and shifts.
 *
 * Where temp = a * b + 128
 *
 *   prod (a, b) = (temp + (temp >> 8)) >> 8.
 */
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
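
/* For reference, a plain-C sketch of the same product (a hypothetical
 * helper, not used by any code path in this file): all three formulations
 * from the comment above agree, e.g. mul_un8_scalar (0x80, 0x80) == 0x40.
 */
static force_inline uint32_t
mul_un8_scalar (uint32_t a, uint32_t b)
{
    uint32_t temp = a * b + 128;

    /* Exact form:        (a * b + 128) / 255
     * Add/shift variant: (temp + (temp >> 8)) >> 8
     * One multiply and shift, as pix_multiply() computes it:
     */
    return (temp * 257) >> 16;
}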

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

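/* Porter-Duff OVER for expanded, premultiplied pixels: per channel,
 * src + dest * (255 - srca); negate() supplies the 255 - srca term and
 * pix_multiply() the rounded product.
 */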
static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)                          \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed, but that's no excuse */
    __m64 r;
    memcpy(&r, p, sizeof(__m64));
    return r;
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
        return *p;
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    uint32_t r;
    memcpy(&r, p, sizeof(uint32_t));
    return r;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
         : "=f" (ret)
         : "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
         : "=m" (*dest)
         : "f" (v)
         : "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
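    /* In the expanded form used here the alpha value lives in byte 6 of
     * the __m64, so bit 6 of the movemask is set iff alpha == 0xff.
     */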
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
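
/* For reference, a plain-C sketch of what expand565() computes for a single
 * r5g6b5 pixel (a hypothetical helper, not used by any code path in this
 * file): the multiply by 565_unpack_multiplier replicates the top bits of
 * each field into its low bits, just like the shift-and-or here.
 */
static force_inline uint32_t
expand565_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);    /* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);    /* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);    /* 5 -> 8 bits */

    return (r << 16) | (g << 8) | b;
}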

/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBAARRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
        alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());    /* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());    /* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());    /* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);               /* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);                   /* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);            /* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);            /* A3 R3 G3 B3 A2 R2 G2 B2 */
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0 + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

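/* Pack four packed a8r8g8b8 pixels (two per __m64 argument) into four
 * r5g6b5 pixels.  Per 32-bit pixel, the _mm_madd_pi16 with
 * 565_pack_multiplier computes (r & 0xf8) << 13 | (b & 0xf8) << 2 in a
 * single multiply-add; or-ing in (g & 0xfc) << 8 and shifting right by 5
 * leaves the three fields in 565 order.
 */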
static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    return _mm_packs_pu32 (t0, t1);
#else
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}

#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3)                              \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)                                 \
    ( x = pix_multiply (x, a),                                  \
      y = pix_multiply (y, b),                                  \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
        __m64 m = load8888 (mask);

        m = expand_alpha (m);
        vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}

static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
        return vsrc;
    }
    else if (!is_zero (vsrc))
    {
        return over (vsrc, expand_alpha (vsrc),
                     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 vsrc = combine (src, mask);

        if (is_opaque (vsrc))
        {
            store8888 (dest, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            __m64 sa = expand_alpha (vsrc);
            store8888 (dest, over (vsrc, sa, load8888 (dest)));
        }

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t                *dest,
                            const uint32_t          *src,
                            const uint32_t          *mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d, da;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        da = expand_alpha (d);
        store8888 (dest, over (d, da, s));

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t                *dest,
                  const uint32_t          *src,
                  const uint32_t          *mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t                *dest,
                          const uint32_t          *src,
                          const uint32_t          *mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t                *dest,
                   const uint32_t          *src,
                   const uint32_t          *mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t                *dest,
                           const uint32_t          *src,
                           const uint32_t          *mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 da, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        sia = negate (sia);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t                *dest,
                            const uint32_t          *src,
                            const uint32_t          *mask,
                            int                      width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sa;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sa);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t                *dest,
                   const uint32_t          *src,
                   const uint32_t          *mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);
        sia = negate (sia);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t                *dest,
                   const uint32_t          *src,
                   const uint32_t          *mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        s = pix_add (s, d);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t                *dest,
                        const uint32_t          *src,
                        const uint32_t          *mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t s, sa, da;
        uint32_t d = *dest;
        __m64 ms = combine (src, mask);
        __m64 md = load8888 (dest);

        store8888(&s, ms);
        da = ~d >> 24;
        sa = s >> 24;

        if (sa > da)
        {
            uint32_t quot = DIV_UN8 (da, sa) << 24;
            __m64 msa = load8888 (&quot);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);
        }

        md = pix_add (md, ms);
        store8888 (dest, md);

        ++src;
        ++dest;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);

        s = pix_multiply (s, a);
        store8888 (dest, s);

        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty ();
}

static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t                *dest,
                     const uint32_t          *src,
                     const uint32_t          *mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        store8888 (dest, in_over (s, sa, a, d));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t                *dest,
                             const uint32_t          *src,
                             const uint32_t          *mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        store8888 (dest, over (d, da, in (s, a)));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t                *dest,
                   const uint32_t          *src,
                   const uint32_t          *mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t                *dest,
                           const uint32_t          *src,
                           const uint32_t          *mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        da = negate (da);
        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t                *dest,
                            const uint32_t          *src,
                            const uint32_t          *mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t                *dest,
                     const uint32_t          *src,
                     const uint32_t          *mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t                *dest,
                             const uint32_t          *src,
                             const uint32_t          *mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);

        s = pix_multiply (s, a);
        d = pix_add (s, d);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest;
            __m64 dest0, dest1;

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            v0 = over (vsrc, vsrca, v0);
            v1 = over (vsrc, vsrca, v1);
            v2 = over (vsrc, vsrca, v2);
            v3 = over (vsrc, vsrca, v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            dst += 4;
            w -= 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (uintptr_t)q & 7)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 2)
        {
            uint32_t m0, m1;
            m0 = *p;
            m1 = *(p + 1);

            if (m0 | m1)
            {
                __m64 dest0, dest1;
                __m64 vdest = *(__m64 *)q;

                dest0 = in_over (vsrc, vsrca, load8888 (&m0),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, load8888 (&m1),
                                 expand8888 (vdest, 1));

                *(__m64 *)q = pack8888 (dest0, dest1);
            }

            p += 2;
            q += 2;
            twidth -= 2;
        }

        if (twidth)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        dst_line += dst_stride;
        mask_line += mask_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            __m64 vs = ldq_u ((__m64 *)src);
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 16)
        {
            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = ldq_u ((__m64 *)(src + 0));
            __m64 vs1 = ldq_u ((__m64 *)(src + 2));
            __m64 vs2 = ldq_u ((__m64 *)(src + 4));
            __m64 vs3 = ldq_u ((__m64 *)(src + 6));
            __m64 vs4 = ldq_u ((__m64 *)(src + 8));
            __m64 vs5 = ldq_u ((__m64 *)(src + 10));
            __m64 vs6 = ldq_u ((__m64 *)(src + 12));
            __m64 vs7 = ldq_u ((__m64 *)(src + 14));

            vd0 = pack8888 (
                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

            vd1 = pack8888 (
                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

            vd2 = pack8888 (
                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

            vd3 = pack8888 (
                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

            vd4 = pack8888 (
                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

            vd5 = pack8888 (
                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

            vd6 = pack8888 (
                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

            vd7 = pack8888 (
                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            w -= 16;
            dst += 16;
            src += 16;
        }

        while (w)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    uint8_t a;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w--)
        {
            s = *src++;
            a = s >> 24;

            if (a == 0xff)
            {
                *dst = s;
            }
            else if (s)
            {
                __m64 ms, sa;
                ms = load8888 (&s);
                sa = expand_alpha (ms);
                store8888 (dst, over (ms, sa, load8888 (dst)));
            }

            dst++;
        }
    }
    _mm_empty ();
}

static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (
                over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;
            __m64 vsrc0, vsrc1, vsrc2, vsrc3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            vsrc0 = load8888 ((src + 0));
            vsrc1 = load8888 ((src + 1));
            vsrc2 = load8888 ((src + 2));
            vsrc3 = load8888 ((src + 3));

            v0 = over (vsrc0, expand_alpha (vsrc0), v0);
            v1 = over (vsrc1, expand_alpha (vsrc1), v1);
            v2 = over (vsrc2, expand_alpha (vsrc2), v2);
            v3 = over (vsrc3, expand_alpha (vsrc3), v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in_over (vsrc, vsrca,
                                       expand_alpha_rev (to_m64 (m)),
                                       load8888 (dst));

                store8888 (dst, vdest);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;

            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 vdest;
                __m64 dest0, dest1;

                vdest = *(__m64 *)dst;

                dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
                                 expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (dst);

                vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
                store8888 (dst, vdest);
            }
        }
    }

    _mm_empty ();
}

static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
        return FALSE;

    if (bpp == 8)
    {
        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;
        filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
        filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
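    /* Replicate the fill value into seven more MMX registers, so the
     * 64-byte loop below can issue eight independent movq stores.
     */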
2119 __asm__ (
2120 "movq %7, %0\n"
2121 "movq %7, %1\n"
2122 "movq %7, %2\n"
2123 "movq %7, %3\n"
2124 "movq %7, %4\n"
2125 "movq %7, %5\n"
2126 "movq %7, %6\n"
2127 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2128 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2129 : "y" (vfill));
2130 #endif
2131
2132 while (height--)
2133 {
2134 int w;
2135 uint8_t *d = byte_line;
2136
2137 byte_line += stride;
2138 w = byte_width;
2139
2140 if (w >= 1 && ((uintptr_t)d & 1))
2141 {
2142 *(uint8_t *)d = (filler & 0xff);
2143 w--;
2144 d++;
2145 }
2146
2147 if (w >= 2 && ((uintptr_t)d & 3))
2148 {
2149 *(uint16_t *)d = filler;
2150 w -= 2;
2151 d += 2;
2152 }
2153
2154 while (w >= 4 && ((uintptr_t)d & 7))
2155 {
2156 *(uint32_t *)d = filler;
2157
2158 w -= 4;
2159 d += 4;
2160 }
2161
2162 while (w >= 64)
2163 {
2164 #if defined __GNUC__ && defined USE_X86_MMX
2165 __asm__ (
2166 "movq %1, (%0)\n"
2167 "movq %2, 8(%0)\n"
2168 "movq %3, 16(%0)\n"
2169 "movq %4, 24(%0)\n"
2170 "movq %5, 32(%0)\n"
2171 "movq %6, 40(%0)\n"
2172 "movq %7, 48(%0)\n"
2173 "movq %8, 56(%0)\n"
2174 :
2175 : "r" (d),
2176 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2177 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2178 : "memory");
2179 #else
2180 *(__m64*) (d + 0) = vfill;
2181 *(__m64*) (d + 8) = vfill;
2182 *(__m64*) (d + 16) = vfill;
2183 *(__m64*) (d + 24) = vfill;
2184 *(__m64*) (d + 32) = vfill;
2185 *(__m64*) (d + 40) = vfill;
2186 *(__m64*) (d + 48) = vfill;
2187 *(__m64*) (d + 56) = vfill;
2188 #endif
2189 w -= 64;
2190 d += 64;
2191 }
2192
2193 while (w >= 4)
2194 {
2195 *(uint32_t *)d = filler;
2196
2197 w -= 4;
2198 d += 4;
2199 }
2200 if (w >= 2)
2201 {
2202 *(uint16_t *)d = filler;
2203 w -= 2;
2204 d += 2;
2205 }
2206 if (w >= 1)
2207 {
2208 *(uint8_t *)d = (filler & 0xff);
2209 w--;
2210 d++;
2211 }
2212
2213 }
2214
2215 _mm_empty ();
2216 return TRUE;
2217 }

static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }

        while (w >= 4)
        {
            __m64 vdest;
            __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
            __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

            vdest = pack_4xpacked565 (vsrc0, vsrc1);

            *(__m64 *)dst = vdest;

            w -= 4;
            src += 4;
            dst += 4;
        }

        while (w)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
        mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
                  PIXMAN_FORMAT_BPP (dest_image->bits.format),
                  dest_x, dest_y, width, height, 0);
        return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;
            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 dest0, dest1;

                dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
                dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }
            else
            {
                *(uint64_t *)dst = 0;
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    __m64 srcsrcsrcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

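    /* Pack the solid source down to r5g6b5 once and replicate it across
     * all four 16-bit lanes; this is the value stored for runs of four
     * fully opaque mask bytes.
     */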
    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint64_t m0, m1, m2, m3;
            m0 = *mask;
            m1 = *(mask + 1);
            m2 = *(mask + 2);
            m3 = *(mask + 3);

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                *(__m64 *)dst = srcsrcsrcsrc;
            }
            else if (m0 | m1 | m2 | m3)
            {
                __m64 vdest = *(__m64 *)dst;
                __m64 v0, v1, v2, v3;
                __m64 vm0, vm1, vm2, vm3;

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                vm0 = to_m64 (m0);
                v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);

                vm1 = to_m64 (m1);
                v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);

                vm2 = to_m64 (m2);
                v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);

                vm3 = to_m64 (m3);
                v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }

            w -= 4;
            mask += 4;
            dst += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
                                       expand565 (vd, 0));
                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint32_t s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            s0 = *src;
            s1 = *(src + 1);
            s2 = *(src + 2);
            s3 = *(src + 3);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);
            a2 = (s2 >> 24);
            a3 = (s3 >> 24);

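            /* over_rev_non_pre sources are non-premultiplied with R and B
             * swapped, so fully opaque pixels only need the channel swap
             * (invert_colors) before packing to 565.
             */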
            if ((a0 & a1 & a2 & a3) == 0xFF)
            {
                __m64 v0 = invert_colors (load8888 (&s0));
                __m64 v1 = invert_colors (load8888 (&s1));
                __m64 v2 = invert_colors (load8888 (&s2));
                __m64 v3 = invert_colors (load8888 (&s3));

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }
            else if (s0 | s1 | s2 | s3)
            {
                __m64 vdest = *(__m64 *)dst;
                __m64 v0, v1, v2, v3;

                __m64 vsrc0 = load8888 (&s0);
                __m64 vsrc1 = load8888 (&s1);
                __m64 vsrc2 = load8888 (&s2);
                __m64 vsrc3 = load8888 (&s3);

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                v0 = over_rev_non_pre (vsrc0, v0);
                v1 = over_rev_non_pre (vsrc1, v1);
                v2 = over_rev_non_pre (vsrc2, v2);
                v3 = over_rev_non_pre (vsrc3, v3);

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            uint32_t s0, s1;
            unsigned char a0, a1;
            __m64 d0, d1;

            s0 = *src;
            s1 = *(src + 1);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);

            if ((a0 & a1) == 0xFF)
            {
                d0 = invert_colors (load8888 (&s0));
                d1 = invert_colors (load8888 (&s1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }
            else if (s0 | s1)
            {
                __m64 vdest = *(__m64 *)dst;

                d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
                d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint16_t *q = (uint16_t *)dst_line;

        while (twidth && ((uintptr_t)q & 7))
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 4)
        {
            uint32_t m0, m1, m2, m3;

            m0 = *p;
            m1 = *(p + 1);
            m2 = *(p + 2);
            m3 = *(p + 3);

            if ((m0 | m1 | m2 | m3))
            {
                __m64 vdest = *(__m64 *)q;
                __m64 v0, v1, v2, v3;

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
                v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
                v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
                v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);

                *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
            }
            twidth -= 4;
            p += 4;
            q += 4;
        }

        while (twidth)
        {
            uint32_t m;

            m = *(uint32_t *)p;
            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        mask_line += mask_stride;
        dst_line += dst_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            uint16_t tmp;
            uint8_t a;
            uint32_t m, d;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            *dst++ = d;
            w--;
        }

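        /* Handle four a8 pixels at once by treating them as a single
         * 32-bit quantity: in () multiplies all four bytes in parallel.
         */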
        while (w >= 4)
        {
            __m64 vmask;
            __m64 vdest;

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w--)
        {
            uint16_t tmp;
            uint8_t a;
            uint32_t m, d;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            *dst++ = d;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 3)
        {
            uint8_t s, d;
            uint16_t tmp;

            s = *src;
            d = *dst;

            *dst = MUL_UN8 (s, d, tmp);

            src++;
            dst++;
            w--;
        }

        while (w >= 4)
        {
            uint32_t *s = (uint32_t *)src;
            uint32_t *d = (uint32_t *)dst;

            store8888 (d, in (load8888u (s), load8888 (d)));

            w -= 4;
            dst += 4;
            src += 4;
        }

        while (w--)
        {
            uint8_t s, d;
            uint16_t tmp;

            s = *src;
            d = *dst;

            *dst = MUL_UN8 (s, d, tmp);

            src++;
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    if (src == 0)
        return;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 3)
        {
            uint16_t tmp;
            uint16_t a;
            uint32_t m, d;
            uint32_t r;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            *dst++ = r;
            w--;
        }

        while (w >= 4)
        {
            __m64 vmask;
            __m64 vdest;

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w--)
        {
            uint16_t tmp;
            uint16_t a;
            uint32_t m, d;
            uint32_t r;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            *dst++ = r;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src;
            d = *dst;
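            /* Scalar saturating add: when the 16-bit sum exceeds 0xff,
             * (t >> 8) is 1, so (0 - (t >> 8)) is all ones and the OR
             * clamps the stored byte to 0xff.
             */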
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }

        while (w >= 8)
        {
            *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
            dst += 8;
            src += 8;
            w -= 8;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t d;
    uint16_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src++;
            if (s)
            {
                d = *dst;
                s = convert_0565_to_8888 (s);
                if (d)
                {
                    d = convert_0565_to_8888 (d);
                    UN8x4_ADD_UN8x4 (s, d);
                }
                *dst = convert_8888_to_0565 (s);
            }
            dst++;
            w--;
        }

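        /* Widen four 565 pixels into two registers of 8888 pixels,
         * saturating-add per channel, then pack back down to 565.
         */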
        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 vsrc = ldq_u ((__m64 *)src);
            __m64 vd0, vd1;
            __m64 vs0, vs1;

            expand_4xpacked565 (vdest, &vd0, &vd1, 0);
            expand_4xpacked565 (vsrc, &vs0, &vs1, 0);

            vd0 = _mm_adds_pu8 (vd0, vs0);
            vd1 = _mm_adds_pu8 (vd1, vs1);

            *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w--)
        {
            s = *src++;
            if (s)
            {
                d = *dst;
                s = convert_0565_to_8888 (s);
                if (d)
                {
                    d = convert_0565_to_8888 (d);
                    UN8x4_ADD_UN8x4 (s, d);
                }
                *dst = convert_8888_to_0565 (s);
            }
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));
            dst++;
            src++;
            w--;
        }

        while (w >= 2)
        {
            *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
            dst += 2;
            src += 2;
            w -= 2;
        }

        if (w)
        {
            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));
        }
    }

    _mm_empty ();
}

static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t                *src_bits,
         uint32_t                *dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    uint8_t *src_bytes;
    uint8_t *dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = *(uint8_t *)s;
            w -= 1;
            s += 1;
            d += 1;
        }

        if (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 7))
        {
            *(uint32_t *)d = ldl_u ((uint32_t *)s);

            w -= 4;
            s += 4;
            d += 4;
        }

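        /* Bulk copy: 64 bytes per iteration through all eight MMX
         * registers; the inline-asm variant names %mm0-%mm7 explicitly.
         */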
        while (w >= 64)
        {
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
            __asm__ (
                "movq (%1), %%mm0\n"
                "movq 8(%1), %%mm1\n"
                "movq 16(%1), %%mm2\n"
                "movq 24(%1), %%mm3\n"
                "movq 32(%1), %%mm4\n"
                "movq 40(%1), %%mm5\n"
                "movq 48(%1), %%mm6\n"
                "movq 56(%1), %%mm7\n"

                "movq %%mm0, (%0)\n"
                "movq %%mm1, 8(%0)\n"
                "movq %%mm2, 16(%0)\n"
                "movq %%mm3, 24(%0)\n"
                "movq %%mm4, 32(%0)\n"
                "movq %%mm5, 40(%0)\n"
                "movq %%mm6, 48(%0)\n"
                "movq %%mm7, 56(%0)\n"
                :
                : "r" (d), "r" (s)
                : "memory",
                  "%mm0", "%mm1", "%mm2", "%mm3",
                  "%mm4", "%mm5", "%mm6", "%mm7");
#else
            __m64 v0 = ldq_u ((__m64 *)(s + 0));
            __m64 v1 = ldq_u ((__m64 *)(s + 8));
            __m64 v2 = ldq_u ((__m64 *)(s + 16));
            __m64 v3 = ldq_u ((__m64 *)(s + 24));
            __m64 v4 = ldq_u ((__m64 *)(s + 32));
            __m64 v5 = ldq_u ((__m64 *)(s + 40));
            __m64 v6 = ldq_u ((__m64 *)(s + 48));
            __m64 v7 = ldq_u ((__m64 *)(s + 56));
            *(__m64 *)(d + 0) = v0;
            *(__m64 *)(d + 8) = v1;
            *(__m64 *)(d + 16) = v2;
            *(__m64 *)(d + 24) = v3;
            *(__m64 *)(d + 32) = v4;
            *(__m64 *)(d + 40) = v5;
            *(__m64 *)(d + 48) = v6;
            *(__m64 *)(d + 56) = v7;
#endif

            w -= 64;
            s += 64;
            d += 64;
        }
        while (w >= 4)
        {
            *(uint32_t *)d = ldl_u ((uint32_t *)s);

            w -= 4;
            s += 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    _mm_empty ();

    return TRUE;
}

static void
mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);

    mmx_blt (imp, src_image->bits.bits,
             dest_image->bits.bits,
             src_image->bits.rowstride,
             dest_image->bits.rowstride,
             PIXMAN_FORMAT_BPP (src_image->bits.format),
             PIXMAN_FORMAT_BPP (dest_image->bits.format),
             src_x, src_y, dest_x, dest_y, width, height);
}

static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w--)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint32_t ssrc = *src | 0xff000000;
                __m64 s = load8888 (&ssrc);

                if (m == 0xff)
                {
                    store8888 (dst, s);
                }
                else
                {
                    __m64 sa = expand_alpha (s);
                    __m64 vm = expand_alpha_rev (to_m64 (m));
                    __m64 vdest = in_over (s, sa, vm, load8888 (dst));

                    store8888 (dst, vdest);
                }
            }

            mask++;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vdest = load8888 (dst);

            store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 dest0 = expand8888 (vdest, 0);
            __m64 dest1 = expand8888 (vdest, 1);

            dest0 = over (dest0, expand_alpha (dest0), vsrc);
            dest1 = over (dest1, expand_alpha (dest1), vsrc);

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            __m64 vdest = load8888 (dst);

            store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
        }
    }

    _mm_empty ();
}
static force_inline void
scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t *       pd,
                                            const uint32_t * ps,
                                            int32_t          w,
                                            pixman_fixed_t   vx,
                                            pixman_fixed_t   unit_x,
                                            pixman_fixed_t   src_width_fixed,
                                            pixman_bool_t    fully_transparent_src)
{
    if (fully_transparent_src)
        return;

    while (w)
    {
        __m64 d = load (pd);
        __m64 s = load (ps + pixman_fixed_to_int (vx));
        vx += unit_x;
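        /* Wrap vx back into the source line whenever it runs off the
         * end.
         */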
        while (vx >= 0)
            vx -= src_width_fixed;

        store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
        pd++;

        w--;
    }

    _mm_empty ();
}

FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, NORMAL)

static force_inline void
scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
                                              uint32_t *       dst,
                                              const uint32_t * src,
                                              int32_t          w,
                                              pixman_fixed_t   vx,
                                              pixman_fixed_t   unit_x,
                                              pixman_fixed_t   src_width_fixed,
                                              pixman_bool_t    zero_src)
{
    __m64 mm_mask;

    if (zero_src || (*mask >> 24) == 0)
    {
        /* A workaround for https://gcc.gnu.org/PR47759 */
        _mm_empty ();
        return;
    }

    mm_mask = expand_alpha (load8888 (mask));

    while (w)
    {
        uint32_t s = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        if (s)
        {
            __m64 ms = load8888 (&s);
            __m64 alpha = expand_alpha (ms);
            __m64 dest = load8888 (dst);

            store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
        }

        dst++;
        w--;
    }

    _mm_empty ();
}

FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)

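/* Fixed-point bilinear weights: BSHIFT is the weight of a whole pixel
 * and BMSK masks the fractional bits of a coordinate.
 */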
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)

#define BILINEAR_DECLARE_VARIABLES                                      \
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);                  \
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);                  \
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);                   \
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);             \
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);  \
    const __m64 mm_zero = _mm_setzero_si64 ();                          \
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                              \
do {                                                                     \
    /* fetch 2x2 pixel block into 2 mmx registers */                     \
    __m64 t = ldq_u ((__m64 *)&src_top[pixman_fixed_to_int (vx)]);       \
    __m64 b = ldq_u ((__m64 *)&src_bottom[pixman_fixed_to_int (vx)]);    \
    /* vertical interpolation */                                         \
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);  \
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);  \
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);  \
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);  \
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);                                \
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);                                \
    /* calculate horizontal weights */                                   \
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,        \
                      _mm_srli_pi16 (mm_x,                               \
                                     16 - BILINEAR_INTERPOLATION_BITS))); \
    /* horizontal interpolation */                                       \
    __m64 p = _mm_unpacklo_pi16 (lo, hi);                                \
    __m64 q = _mm_unpackhi_pi16 (lo, hi);                                \
    vx += unit_x;                                                        \
    lo = _mm_madd_pi16 (p, mm_wh);                                       \
    hi = _mm_madd_pi16 (q, mm_wh);                                       \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                   \
    /* shift and pack the result */                                      \
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);            \
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);            \
    lo = _mm_packs_pi32 (lo, hi);                                        \
    lo = _mm_packs_pu16 (lo, lo);                                        \
    pix = lo;                                                            \
} while (0)

#define BILINEAR_SKIP_ONE_PIXEL()                                        \
do {                                                                     \
    vx += unit_x;                                                        \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                   \
} while (0)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
                                            const uint32_t * mask,
                                            const uint32_t * src_top,
                                            const uint32_t * src_bottom,
                                            int32_t          w,
                                            int              wt,
                                            int              wb,
                                            pixman_fixed_t   vx,
                                            pixman_fixed_t   unit_x,
                                            pixman_fixed_t   max_vx,
                                            pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
        store (dst, pix);
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
                                             const uint32_t * mask,
                                             const uint32_t * src_top,
                                             const uint32_t * src_bottom,
                                             int32_t          w,
                                             int              wt,
                                             int              wb,
                                             pixman_fixed_t   vx,
                                             pixman_fixed_t   unit_x,
                                             pixman_fixed_t   max_vx,
                                             pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

        if (!is_zero (pix1))
        {
            pix2 = load (dst);
            store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
        }

        w--;
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
                                               const uint8_t *  mask,
                                               const uint32_t * src_top,
                                               const uint32_t * src_bottom,
                                               int32_t          w,
                                               int              wt,
                                               int              wb,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
        m = (uint32_t) *mask++;

        if (m)
        {
            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

            if (m == 0xff && is_opaque (pix1))
            {
                store (dst, pix1);
            }
            else
            {
                __m64 ms, md, ma, msa;

                pix2 = load (dst);
                ma = expand_alpha_rev (to_m64 (m));
                ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
                md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

                msa = expand_alpha (ms);

                store8888 (dst, (in_over (ms, msa, ma, md)));
            }
        }
        else
        {
            BILINEAR_SKIP_ONE_PIXEL ();
        }

        w--;
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               NORMAL, FLAG_HAVE_NON_SOLID_MASK)

static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 7)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

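    /* x8r8g8b8 leaves the top byte undefined, so OR in opaque alpha,
     * eight pixels per iteration in the vector loop below.
     */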
    while (w >= 8)
    {
        __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
        __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
        __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
        __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

        *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
        *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
        *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
        *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    while (w >= 4)
    {
        __m64 vsrc = ldq_u ((__m64 *)src);
        __m64 mm0, mm1;

        expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

        *(__m64 *)(dst + 0) = mm0;
        *(__m64 *)(dst + 2) = mm1;

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = (uint32_t)*(src++) << 24;
        w--;
    }

    while (w >= 8)
    {
        __m64 mm0 = ldq_u ((__m64 *)src);

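        /* Two rounds of unpacking against zero move each alpha byte up
         * into bits 31-24 of its own 32-bit pixel.
         */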
        __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64 (), mm0);
        __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64 (), mm0);
        __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm1);
        __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm1);
        __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm2);
        __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm2);

        *(__m64 *)(dst + 0) = mm3;
        *(__m64 *)(dst + 2) = mm4;
        *(__m64 *)(dst + 4) = mm5;
        *(__m64 *)(dst + 6) = mm6;

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        *dst++ = (uint32_t)*(src++) << 24;
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

#define IMAGE_FLAGS                                                     \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};

static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),

    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),

    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),

    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888 ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),

    { PIXMAN_OP_NONE },
};

pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->iter_info = mmx_iters;

    return imp;
}

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */