1 /*
2  * Copyright © 2007 Luca Barbato
3  *
4  * Permission to use, copy, modify, distribute, and sell this software and its
5  * documentation for any purpose is hereby granted without fee, provided that
6  * the above copyright notice appear in all copies and that both that
7  * copyright notice and this permission notice appear in supporting
8  * documentation, and that the name of Luca Barbato not be used in advertising or
9  * publicity pertaining to distribution of the software without specific,
10  * written prior permission.  Luca Barbato makes no representations about the
11  * suitability of this software for any purpose.  It is provided "as is"
12  * without express or implied warranty.
13  *
14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21  * SOFTWARE.
22  *
23  * Author:  Luca Barbato (lu_zero@gentoo.org)
24  *
25  * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
26  */
27 
28 #ifdef HAVE_CONFIG_H
29 #include <config.h>
30 #endif
31 #include "pixman-private.h"
32 #include "pixman-combine32.h"
33 #include "pixman-inlines.h"
34 #include <altivec.h>
35 
36 #define AVV(x...) {x}
37 
38 static vector unsigned int mask_ff000000;
39 static vector unsigned int mask_red;
40 static vector unsigned int mask_green;
41 static vector unsigned int mask_blue;
42 static vector unsigned int mask_565_fix_rb;
43 static vector unsigned int mask_565_fix_g;
44 
45 static force_inline vector unsigned int
splat_alpha(vector unsigned int pix)46 splat_alpha (vector unsigned int pix)
47 {
48 #ifdef WORDS_BIGENDIAN
49     return vec_perm (pix, pix,
50 		     (vector unsigned char)AVV (
51 			 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
52 			 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
53 #else
54     return vec_perm (pix, pix,
55 		     (vector unsigned char)AVV (
56 			 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
57 			 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F));
58 #endif
59 }
60 
61 static force_inline vector unsigned int
splat_pixel(vector unsigned int pix)62 splat_pixel (vector unsigned int pix)
63 {
64     return vec_perm (pix, pix,
65 		     (vector unsigned char)AVV (
66 			 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
67 			 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03));
68 }
69 
70 static force_inline vector unsigned int
pix_multiply(vector unsigned int p,vector unsigned int a)71 pix_multiply (vector unsigned int p, vector unsigned int a)
72 {
73     vector unsigned short hi, lo, mod;
74 
75     /* unpack to short */
76     hi = (vector unsigned short)
77 #ifdef WORDS_BIGENDIAN
78 	vec_mergeh ((vector unsigned char)AVV (0),
79 		    (vector unsigned char)p);
80 #else
81 	vec_mergeh ((vector unsigned char) p,
82 		    (vector unsigned char) AVV (0));
83 #endif
84 
85     mod = (vector unsigned short)
86 #ifdef WORDS_BIGENDIAN
87 	vec_mergeh ((vector unsigned char)AVV (0),
88 		    (vector unsigned char)a);
89 #else
90 	vec_mergeh ((vector unsigned char) a,
91 		    (vector unsigned char) AVV (0));
92 #endif
93 
94     hi = vec_mladd (hi, mod, (vector unsigned short)
95                     AVV (0x0080, 0x0080, 0x0080, 0x0080,
96                          0x0080, 0x0080, 0x0080, 0x0080));
97 
98     hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
99 
100     hi = vec_sr (hi, vec_splat_u16 (8));
101 
102     /* unpack to short */
103     lo = (vector unsigned short)
104 #ifdef WORDS_BIGENDIAN
105 	vec_mergel ((vector unsigned char)AVV (0),
106 		    (vector unsigned char)p);
107 #else
108 	vec_mergel ((vector unsigned char) p,
109 		    (vector unsigned char) AVV (0));
110 #endif
111 
112     mod = (vector unsigned short)
113 #ifdef WORDS_BIGENDIAN
114 	vec_mergel ((vector unsigned char)AVV (0),
115 		    (vector unsigned char)a);
116 #else
117 	vec_mergel ((vector unsigned char) a,
118 		    (vector unsigned char) AVV (0));
119 #endif
120 
121     lo = vec_mladd (lo, mod, (vector unsigned short)
122                     AVV (0x0080, 0x0080, 0x0080, 0x0080,
123                          0x0080, 0x0080, 0x0080, 0x0080));
124 
125     lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
126 
127     lo = vec_sr (lo, vec_splat_u16 (8));
128 
129     return (vector unsigned int)vec_packsu (hi, lo);
130 }
131 
132 static force_inline vector unsigned int
pix_add(vector unsigned int a,vector unsigned int b)133 pix_add (vector unsigned int a, vector unsigned int b)
134 {
135     return (vector unsigned int)vec_adds ((vector unsigned char)a,
136                                           (vector unsigned char)b);
137 }
138 
139 static force_inline vector unsigned int
pix_add_mul(vector unsigned int x,vector unsigned int a,vector unsigned int y,vector unsigned int b)140 pix_add_mul (vector unsigned int x,
141              vector unsigned int a,
142              vector unsigned int y,
143              vector unsigned int b)
144 {
145     vector unsigned int t1, t2;
146 
147     t1 = pix_multiply (x, a);
148     t2 = pix_multiply (y, b);
149 
150     return pix_add (t1, t2);
151 }
152 
153 static force_inline vector unsigned int
negate(vector unsigned int src)154 negate (vector unsigned int src)
155 {
156     return vec_nor (src, src);
157 }
158 
159 /* dest*~srca + src */
160 static force_inline vector unsigned int
over(vector unsigned int src,vector unsigned int srca,vector unsigned int dest)161 over (vector unsigned int src,
162       vector unsigned int srca,
163       vector unsigned int dest)
164 {
165     vector unsigned char tmp = (vector unsigned char)
166 	pix_multiply (dest, negate (srca));
167 
168     tmp = vec_adds ((vector unsigned char)src, tmp);
169     return (vector unsigned int)tmp;
170 }
171 
172 /* in == pix_multiply */
173 #define in_over(src, srca, mask, dest)					\
174     over (pix_multiply (src, mask),					\
175           pix_multiply (srca, mask), dest)
176 
177 #ifdef WORDS_BIGENDIAN
178 
179 #define COMPUTE_SHIFT_MASK(source)					\
180     source ## _mask = vec_lvsl (0, source);
181 
182 #define COMPUTE_SHIFT_MASKS(dest, source)				\
183     source ## _mask = vec_lvsl (0, source);
184 
185 #define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
186     mask ## _mask = vec_lvsl (0, mask);					\
187     source ## _mask = vec_lvsl (0, source);
188 
189 #define LOAD_VECTOR(source)				  \
190 do							  \
191 {							  \
192     vector unsigned char tmp1, tmp2;			  \
193     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
194     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
195     v ## source = (typeof(v ## source)) 		  \
196 	vec_perm (tmp1, tmp2, source ## _mask);		  \
197 } while (0)
198 
199 #define LOAD_VECTORS(dest, source)			  \
200 do							  \
201 {							  \
202     LOAD_VECTOR(source);				  \
203     v ## dest = (typeof(v ## dest))vec_ld (0, dest);	  \
204 } while (0)
205 
206 #define LOAD_VECTORSC(dest, source, mask)		  \
207 do							  \
208 {							  \
209     LOAD_VECTORS(dest, source); 			  \
210     LOAD_VECTOR(mask);					  \
211 } while (0)
212 
213 #define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
214 #define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
215 
216 #else
217 
218 /* Now the COMPUTE_SHIFT_{MASK, MASKS, MASKC} below are just no-op.
219  * They are defined that way because little endian altivec can do unaligned
220  * reads natively and have no need for constructing the permutation pattern
221  * variables.
222  */
223 #define COMPUTE_SHIFT_MASK(source)
224 
225 #define COMPUTE_SHIFT_MASKS(dest, source)
226 
227 #define COMPUTE_SHIFT_MASKC(dest, source, mask)
228 
229 # define LOAD_VECTOR(source)				\
230     v ## source = (typeof(v ## source))vec_xl(0, source);
231 
232 # define LOAD_VECTORS(dest, source)			\
233     LOAD_VECTOR(source);				\
234     LOAD_VECTOR(dest);					\
235 
236 # define LOAD_VECTORSC(dest, source, mask)		\
237     LOAD_VECTORS(dest, source); 			\
238     LOAD_VECTOR(mask);					\
239 
240 #define DECLARE_SRC_MASK_VAR
241 #define DECLARE_MASK_MASK_VAR
242 
243 #endif /* WORDS_BIGENDIAN */
244 
245 #define LOAD_VECTORSM(dest, source, mask)				\
246     LOAD_VECTORSC (dest, source, mask); 				\
247     v ## source = pix_multiply (v ## source,				\
248                                 splat_alpha (v ## mask));
249 
250 #define STORE_VECTOR(dest)						\
251     vec_st ((vector unsigned int) v ## dest, 0, dest);
252 
253 /* load 4 pixels from a 16-byte boundary aligned address */
254 static force_inline vector unsigned int
load_128_aligned(const uint32_t * src)255 load_128_aligned (const uint32_t* src)
256 {
257     return *((vector unsigned int *) src);
258 }
259 
260 /* load 4 pixels from a unaligned address */
261 static force_inline vector unsigned int
load_128_unaligned(const uint32_t * src)262 load_128_unaligned (const uint32_t* src)
263 {
264     vector unsigned int vsrc;
265     DECLARE_SRC_MASK_VAR;
266 
267     COMPUTE_SHIFT_MASK (src);
268     LOAD_VECTOR (src);
269 
270     return vsrc;
271 }
272 
273 /* save 4 pixels on a 16-byte boundary aligned address */
274 static force_inline void
save_128_aligned(uint32_t * data,vector unsigned int vdata)275 save_128_aligned (uint32_t* data,
276 		  vector unsigned int vdata)
277 {
278     STORE_VECTOR(data)
279 }
280 
281 static force_inline vector unsigned int
create_mask_1x32_128(const uint32_t * src)282 create_mask_1x32_128 (const uint32_t *src)
283 {
284     vector unsigned int vsrc;
285     DECLARE_SRC_MASK_VAR;
286 
287     COMPUTE_SHIFT_MASK (src);
288     LOAD_VECTOR (src);
289     return vec_splat(vsrc, 0);
290 }
291 
292 static force_inline vector unsigned int
create_mask_32_128(uint32_t mask)293 create_mask_32_128 (uint32_t mask)
294 {
295     return create_mask_1x32_128(&mask);
296 }
297 
298 static force_inline vector unsigned int
unpacklo_128_16x8(vector unsigned int data1,vector unsigned int data2)299 unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
300 {
301     vector unsigned char lo;
302 
303     /* unpack to short */
304     lo = (vector unsigned char)
305 #ifdef WORDS_BIGENDIAN
306 	vec_mergel ((vector unsigned char) data2,
307 		    (vector unsigned char) data1);
308 #else
309 	vec_mergel ((vector unsigned char) data1,
310 		    (vector unsigned char) data2);
311 #endif
312 
313     return (vector unsigned int) lo;
314 }
315 
316 static force_inline vector unsigned int
unpackhi_128_16x8(vector unsigned int data1,vector unsigned int data2)317 unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
318 {
319     vector unsigned char hi;
320 
321     /* unpack to short */
322     hi = (vector unsigned char)
323 #ifdef WORDS_BIGENDIAN
324 	vec_mergeh ((vector unsigned char) data2,
325 		    (vector unsigned char) data1);
326 #else
327 	vec_mergeh ((vector unsigned char) data1,
328 		    (vector unsigned char) data2);
329 #endif
330 
331     return (vector unsigned int) hi;
332 }
333 
334 static force_inline vector unsigned int
unpacklo_128_8x16(vector unsigned int data1,vector unsigned int data2)335 unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
336 {
337     vector unsigned short lo;
338 
339     /* unpack to char */
340     lo = (vector unsigned short)
341 #ifdef WORDS_BIGENDIAN
342 	vec_mergel ((vector unsigned short) data2,
343 		    (vector unsigned short) data1);
344 #else
345 	vec_mergel ((vector unsigned short) data1,
346 		    (vector unsigned short) data2);
347 #endif
348 
349     return (vector unsigned int) lo;
350 }
351 
352 static force_inline vector unsigned int
unpackhi_128_8x16(vector unsigned int data1,vector unsigned int data2)353 unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
354 {
355     vector unsigned short hi;
356 
357     /* unpack to char */
358     hi = (vector unsigned short)
359 #ifdef WORDS_BIGENDIAN
360 	vec_mergeh ((vector unsigned short) data2,
361 		    (vector unsigned short) data1);
362 #else
363 	vec_mergeh ((vector unsigned short) data1,
364 		    (vector unsigned short) data2);
365 #endif
366 
367     return (vector unsigned int) hi;
368 }
369 
370 static force_inline void
unpack_128_2x128(vector unsigned int data1,vector unsigned int data2,vector unsigned int * data_lo,vector unsigned int * data_hi)371 unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
372 		    vector unsigned int* data_lo, vector unsigned int* data_hi)
373 {
374     *data_lo = unpacklo_128_16x8(data1, data2);
375     *data_hi = unpackhi_128_16x8(data1, data2);
376 }
377 
378 static force_inline void
unpack_128_2x128_16(vector unsigned int data1,vector unsigned int data2,vector unsigned int * data_lo,vector unsigned int * data_hi)379 unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
380 		    vector unsigned int* data_lo, vector unsigned int* data_hi)
381 {
382     *data_lo = unpacklo_128_8x16(data1, data2);
383     *data_hi = unpackhi_128_8x16(data1, data2);
384 }
385 
386 static force_inline vector unsigned int
unpack_565_to_8888(vector unsigned int lo)387 unpack_565_to_8888 (vector unsigned int lo)
388 {
389     vector unsigned int r, g, b, rb, t;
390 
391     r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
392     g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
393     b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);
394 
395     rb = vec_or (r, b);
396     t  = vec_and (rb, mask_565_fix_rb);
397     t  = vec_sr (t, create_mask_32_128(5));
398     rb = vec_or (rb, t);
399 
400     t  = vec_and (g, mask_565_fix_g);
401     t  = vec_sr (t, create_mask_32_128(6));
402     g  = vec_or (g, t);
403 
404     return vec_or (rb, g);
405 }
406 
407 static force_inline int
is_opaque(vector unsigned int x)408 is_opaque (vector unsigned int x)
409 {
410     uint32_t cmp_result;
411     vector bool int ffs = vec_cmpeq(x, x);
412 
413     cmp_result = vec_all_eq(x, ffs);
414 
415     return (cmp_result & 0x8888) == 0x8888;
416 }
417 
418 static force_inline int
is_zero(vector unsigned int x)419 is_zero (vector unsigned int x)
420 {
421     uint32_t cmp_result;
422 
423     cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
424 
425     return cmp_result == 0xffff;
426 }
427 
428 static force_inline int
is_transparent(vector unsigned int x)429 is_transparent (vector unsigned int x)
430 {
431     uint32_t cmp_result;
432 
433     cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
434     return (cmp_result & 0x8888) == 0x8888;
435 }
436 
437 static force_inline uint32_t
core_combine_over_u_pixel_vmx(uint32_t src,uint32_t dst)438 core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
439 {
440     uint32_t a;
441 
442     a = ALPHA_8(src);
443 
444     if (a == 0xff)
445     {
446 	return src;
447     }
448     else if (src)
449     {
450 	UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src);
451     }
452 
453     return dst;
454 }
455 
456 static force_inline uint32_t
combine1(const uint32_t * ps,const uint32_t * pm)457 combine1 (const uint32_t *ps, const uint32_t *pm)
458 {
459     uint32_t s = *ps;
460 
461     if (pm)
462 	UN8x4_MUL_UN8(s, ALPHA_8(*pm));
463 
464     return s;
465 }
466 
467 static force_inline vector unsigned int
combine4(const uint32_t * ps,const uint32_t * pm)468 combine4 (const uint32_t* ps, const uint32_t* pm)
469 {
470     vector unsigned int src, msk;
471 
472     if (pm)
473     {
474 	msk = load_128_unaligned(pm);
475 
476 	if (is_transparent(msk))
477 	    return (vector unsigned int) AVV(0);
478     }
479 
480     src = load_128_unaligned(ps);
481 
482     if (pm)
483 	src = pix_multiply(src, msk);
484 
485     return src;
486 }
487 
488 static void
vmx_combine_over_u_no_mask(uint32_t * dest,const uint32_t * src,int width)489 vmx_combine_over_u_no_mask (uint32_t *      dest,
490                             const uint32_t *src,
491                             int             width)
492 {
493     int i;
494     vector unsigned int vdest, vsrc;
495     DECLARE_SRC_MASK_VAR;
496 
497     while (width && ((uintptr_t)dest & 15))
498     {
499 	uint32_t s = *src++;
500 	uint32_t d = *dest;
501 	uint32_t ia = ALPHA_8 (~s);
502 
503 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
504 
505 	*dest++ = d;
506 	width--;
507     }
508 
509     COMPUTE_SHIFT_MASKS (dest, src);
510 
511     /* printf ("%s\n",__PRETTY_FUNCTION__); */
512     for (i = width / 4; i > 0; i--)
513     {
514 
515 	LOAD_VECTORS (dest, src);
516 
517 	vdest = over (vsrc, splat_alpha (vsrc), vdest);
518 
519 	STORE_VECTOR (dest);
520 
521 	src += 4;
522 	dest += 4;
523     }
524 
525     for (i = width % 4; --i >= 0;)
526     {
527 	uint32_t s = src[i];
528 	uint32_t d = dest[i];
529 	uint32_t ia = ALPHA_8 (~s);
530 
531 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
532 
533 	dest[i] = d;
534     }
535 }
536 
537 static void
vmx_combine_over_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)538 vmx_combine_over_u_mask (uint32_t *      dest,
539                          const uint32_t *src,
540                          const uint32_t *mask,
541                          int             width)
542 {
543     int i;
544     vector unsigned int vdest, vsrc, vmask;
545     DECLARE_SRC_MASK_VAR;
546     DECLARE_MASK_MASK_VAR;
547 
548     while (width && ((uintptr_t)dest & 15))
549     {
550 	uint32_t m = ALPHA_8 (*mask++);
551 	uint32_t s = *src++;
552 	uint32_t d = *dest;
553 	uint32_t ia;
554 
555 	UN8x4_MUL_UN8 (s, m);
556 
557 	ia = ALPHA_8 (~s);
558 
559 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
560 	*dest++ = d;
561 	width--;
562     }
563 
564     COMPUTE_SHIFT_MASKC (dest, src, mask);
565 
566     /* printf ("%s\n",__PRETTY_FUNCTION__); */
567     for (i = width / 4; i > 0; i--)
568     {
569 	LOAD_VECTORSM (dest, src, mask);
570 
571 	vdest = over (vsrc, splat_alpha (vsrc), vdest);
572 
573 	STORE_VECTOR (dest);
574 
575 	src += 4;
576 	dest += 4;
577 	mask += 4;
578     }
579 
580     for (i = width % 4; --i >= 0;)
581     {
582 	uint32_t m = ALPHA_8 (mask[i]);
583 	uint32_t s = src[i];
584 	uint32_t d = dest[i];
585 	uint32_t ia;
586 
587 	UN8x4_MUL_UN8 (s, m);
588 
589 	ia = ALPHA_8 (~s);
590 
591 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
592 	dest[i] = d;
593     }
594 }
595 
596 static void
vmx_combine_over_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)597 vmx_combine_over_u (pixman_implementation_t *imp,
598                     pixman_op_t              op,
599                     uint32_t *               dest,
600                     const uint32_t *         src,
601                     const uint32_t *         mask,
602                     int                      width)
603 {
604     if (mask)
605 	vmx_combine_over_u_mask (dest, src, mask, width);
606     else
607 	vmx_combine_over_u_no_mask (dest, src, width);
608 }
609 
610 static void
vmx_combine_over_reverse_u_no_mask(uint32_t * dest,const uint32_t * src,int width)611 vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
612                                     const uint32_t *src,
613                                     int             width)
614 {
615     int i;
616     vector unsigned int vdest, vsrc;
617     DECLARE_SRC_MASK_VAR;
618 
619     while (width && ((uintptr_t)dest & 15))
620     {
621 	uint32_t s = *src++;
622 	uint32_t d = *dest;
623 	uint32_t ia = ALPHA_8 (~d);
624 
625 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
626 	*dest++ = s;
627 	width--;
628     }
629 
630     COMPUTE_SHIFT_MASKS (dest, src);
631 
632     /* printf ("%s\n",__PRETTY_FUNCTION__); */
633     for (i = width / 4; i > 0; i--)
634     {
635 
636 	LOAD_VECTORS (dest, src);
637 
638 	vdest = over (vdest, splat_alpha (vdest), vsrc);
639 
640 	STORE_VECTOR (dest);
641 
642 	src += 4;
643 	dest += 4;
644     }
645 
646     for (i = width % 4; --i >= 0;)
647     {
648 	uint32_t s = src[i];
649 	uint32_t d = dest[i];
650 	uint32_t ia = ALPHA_8 (~dest[i]);
651 
652 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
653 	dest[i] = s;
654     }
655 }
656 
657 static void
vmx_combine_over_reverse_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)658 vmx_combine_over_reverse_u_mask (uint32_t *      dest,
659                                  const uint32_t *src,
660                                  const uint32_t *mask,
661                                  int             width)
662 {
663     int i;
664     vector unsigned int vdest, vsrc, vmask;
665     DECLARE_SRC_MASK_VAR;
666     DECLARE_MASK_MASK_VAR;
667 
668     while (width && ((uintptr_t)dest & 15))
669     {
670 	uint32_t m = ALPHA_8 (*mask++);
671 	uint32_t s = *src++;
672 	uint32_t d = *dest;
673 	uint32_t ia = ALPHA_8 (~d);
674 
675 	UN8x4_MUL_UN8 (s, m);
676 
677 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
678 	*dest++ = s;
679 	width--;
680     }
681 
682     COMPUTE_SHIFT_MASKC (dest, src, mask);
683 
684     /* printf ("%s\n",__PRETTY_FUNCTION__); */
685     for (i = width / 4; i > 0; i--)
686     {
687 
688 	LOAD_VECTORSM (dest, src, mask);
689 
690 	vdest = over (vdest, splat_alpha (vdest), vsrc);
691 
692 	STORE_VECTOR (dest);
693 
694 	src += 4;
695 	dest += 4;
696 	mask += 4;
697     }
698 
699     for (i = width % 4; --i >= 0;)
700     {
701 	uint32_t m = ALPHA_8 (mask[i]);
702 	uint32_t s = src[i];
703 	uint32_t d = dest[i];
704 	uint32_t ia = ALPHA_8 (~dest[i]);
705 
706 	UN8x4_MUL_UN8 (s, m);
707 
708 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
709 	dest[i] = s;
710     }
711 }
712 
713 static void
vmx_combine_over_reverse_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)714 vmx_combine_over_reverse_u (pixman_implementation_t *imp,
715                             pixman_op_t              op,
716                             uint32_t *               dest,
717                             const uint32_t *         src,
718                             const uint32_t *         mask,
719                             int                      width)
720 {
721     if (mask)
722 	vmx_combine_over_reverse_u_mask (dest, src, mask, width);
723     else
724 	vmx_combine_over_reverse_u_no_mask (dest, src, width);
725 }
726 
727 static void
vmx_combine_in_u_no_mask(uint32_t * dest,const uint32_t * src,int width)728 vmx_combine_in_u_no_mask (uint32_t *      dest,
729                           const uint32_t *src,
730                           int             width)
731 {
732     int i;
733     vector unsigned int vdest, vsrc;
734     DECLARE_SRC_MASK_VAR;
735 
736     while (width && ((uintptr_t)dest & 15))
737     {
738 	uint32_t s = *src++;
739 	uint32_t a = ALPHA_8 (*dest);
740 
741 	UN8x4_MUL_UN8 (s, a);
742 	*dest++ = s;
743 	width--;
744     }
745 
746     COMPUTE_SHIFT_MASKS (dest, src);
747 
748     /* printf ("%s\n",__PRETTY_FUNCTION__); */
749     for (i = width / 4; i > 0; i--)
750     {
751 	LOAD_VECTORS (dest, src);
752 
753 	vdest = pix_multiply (vsrc, splat_alpha (vdest));
754 
755 	STORE_VECTOR (dest);
756 
757 	src += 4;
758 	dest += 4;
759     }
760 
761     for (i = width % 4; --i >= 0;)
762     {
763 	uint32_t s = src[i];
764 	uint32_t a = ALPHA_8 (dest[i]);
765 
766 	UN8x4_MUL_UN8 (s, a);
767 	dest[i] = s;
768     }
769 }
770 
771 static void
vmx_combine_in_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)772 vmx_combine_in_u_mask (uint32_t *      dest,
773                        const uint32_t *src,
774                        const uint32_t *mask,
775                        int             width)
776 {
777     int i;
778     vector unsigned int vdest, vsrc, vmask;
779     DECLARE_SRC_MASK_VAR;
780     DECLARE_MASK_MASK_VAR;
781 
782     while (width && ((uintptr_t)dest & 15))
783     {
784 	uint32_t m = ALPHA_8 (*mask++);
785 	uint32_t s = *src++;
786 	uint32_t a = ALPHA_8 (*dest);
787 
788 	UN8x4_MUL_UN8 (s, m);
789 	UN8x4_MUL_UN8 (s, a);
790 
791 	*dest++ = s;
792 	width--;
793     }
794 
795     COMPUTE_SHIFT_MASKC (dest, src, mask);
796 
797     /* printf ("%s\n",__PRETTY_FUNCTION__); */
798     for (i = width / 4; i > 0; i--)
799     {
800 	LOAD_VECTORSM (dest, src, mask);
801 
802 	vdest = pix_multiply (vsrc, splat_alpha (vdest));
803 
804 	STORE_VECTOR (dest);
805 
806 	src += 4;
807 	dest += 4;
808 	mask += 4;
809     }
810 
811     for (i = width % 4; --i >= 0;)
812     {
813 	uint32_t m = ALPHA_8 (mask[i]);
814 	uint32_t s = src[i];
815 	uint32_t a = ALPHA_8 (dest[i]);
816 
817 	UN8x4_MUL_UN8 (s, m);
818 	UN8x4_MUL_UN8 (s, a);
819 
820 	dest[i] = s;
821     }
822 }
823 
824 static void
vmx_combine_in_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)825 vmx_combine_in_u (pixman_implementation_t *imp,
826                   pixman_op_t              op,
827                   uint32_t *               dest,
828                   const uint32_t *         src,
829                   const uint32_t *         mask,
830                   int                      width)
831 {
832     if (mask)
833 	vmx_combine_in_u_mask (dest, src, mask, width);
834     else
835 	vmx_combine_in_u_no_mask (dest, src, width);
836 }
837 
838 static void
vmx_combine_in_reverse_u_no_mask(uint32_t * dest,const uint32_t * src,int width)839 vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
840                                   const uint32_t *src,
841                                   int             width)
842 {
843     int i;
844     vector unsigned int vdest, vsrc;
845     DECLARE_SRC_MASK_VAR;
846 
847     while (width && ((uintptr_t)dest & 15))
848     {
849 	uint32_t d = *dest;
850 	uint32_t a = ALPHA_8 (*src++);
851 
852 	UN8x4_MUL_UN8 (d, a);
853 
854 	*dest++ = d;
855 	width--;
856     }
857 
858     COMPUTE_SHIFT_MASKS (dest, src);
859 
860     /* printf ("%s\n",__PRETTY_FUNCTION__); */
861     for (i = width / 4; i > 0; i--)
862     {
863 	LOAD_VECTORS (dest, src);
864 
865 	vdest = pix_multiply (vdest, splat_alpha (vsrc));
866 
867 	STORE_VECTOR (dest);
868 
869 	src += 4;
870 	dest += 4;
871     }
872 
873     for (i = width % 4; --i >= 0;)
874     {
875 	uint32_t d = dest[i];
876 	uint32_t a = ALPHA_8 (src[i]);
877 
878 	UN8x4_MUL_UN8 (d, a);
879 
880 	dest[i] = d;
881     }
882 }
883 
884 static void
vmx_combine_in_reverse_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)885 vmx_combine_in_reverse_u_mask (uint32_t *      dest,
886                                const uint32_t *src,
887                                const uint32_t *mask,
888                                int             width)
889 {
890     int i;
891     vector unsigned int vdest, vsrc, vmask;
892     DECLARE_SRC_MASK_VAR;
893     DECLARE_MASK_MASK_VAR;
894 
895     while (width && ((uintptr_t)dest & 15))
896     {
897 	uint32_t m = ALPHA_8 (*mask++);
898 	uint32_t d = *dest;
899 	uint32_t a = *src++;
900 
901 	UN8x4_MUL_UN8 (a, m);
902 	a = ALPHA_8 (a);
903 	UN8x4_MUL_UN8 (d, a);
904 
905 	*dest++ = d;
906 	width--;
907     }
908 
909     COMPUTE_SHIFT_MASKC (dest, src, mask);
910 
911     /* printf ("%s\n",__PRETTY_FUNCTION__); */
912     for (i = width / 4; i > 0; i--)
913     {
914 	LOAD_VECTORSM (dest, src, mask);
915 
916 	vdest = pix_multiply (vdest, splat_alpha (vsrc));
917 
918 	STORE_VECTOR (dest);
919 
920 	src += 4;
921 	dest += 4;
922 	mask += 4;
923     }
924 
925     for (i = width % 4; --i >= 0;)
926     {
927 	uint32_t m = ALPHA_8 (mask[i]);
928 	uint32_t d = dest[i];
929 	uint32_t a = src[i];
930 
931 	UN8x4_MUL_UN8 (a, m);
932 	a = ALPHA_8 (a);
933 	UN8x4_MUL_UN8 (d, a);
934 
935 	dest[i] = d;
936     }
937 }
938 
939 static void
vmx_combine_in_reverse_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)940 vmx_combine_in_reverse_u (pixman_implementation_t *imp,
941                           pixman_op_t              op,
942                           uint32_t *               dest,
943                           const uint32_t *         src,
944                           const uint32_t *         mask,
945                           int                      width)
946 {
947     if (mask)
948 	vmx_combine_in_reverse_u_mask (dest, src, mask, width);
949     else
950 	vmx_combine_in_reverse_u_no_mask (dest, src, width);
951 }
952 
953 static void
vmx_combine_out_u_no_mask(uint32_t * dest,const uint32_t * src,int width)954 vmx_combine_out_u_no_mask (uint32_t *      dest,
955                            const uint32_t *src,
956                            int             width)
957 {
958     int i;
959     vector unsigned int vdest, vsrc;
960     DECLARE_SRC_MASK_VAR;
961 
962     while (width && ((uintptr_t)dest & 15))
963     {
964 	uint32_t s = *src++;
965 	uint32_t a = ALPHA_8 (~(*dest));
966 
967 	UN8x4_MUL_UN8 (s, a);
968 
969 	*dest++ = s;
970 	width--;
971     }
972 
973     COMPUTE_SHIFT_MASKS (dest, src);
974 
975     /* printf ("%s\n",__PRETTY_FUNCTION__); */
976     for (i = width / 4; i > 0; i--)
977     {
978 	LOAD_VECTORS (dest, src);
979 
980 	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
981 
982 	STORE_VECTOR (dest);
983 
984 	src += 4;
985 	dest += 4;
986     }
987 
988     for (i = width % 4; --i >= 0;)
989     {
990 	uint32_t s = src[i];
991 	uint32_t a = ALPHA_8 (~dest[i]);
992 
993 	UN8x4_MUL_UN8 (s, a);
994 
995 	dest[i] = s;
996     }
997 }
998 
999 static void
vmx_combine_out_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1000 vmx_combine_out_u_mask (uint32_t *      dest,
1001                         const uint32_t *src,
1002                         const uint32_t *mask,
1003                         int             width)
1004 {
1005     int i;
1006     vector unsigned int vdest, vsrc, vmask;
1007     DECLARE_SRC_MASK_VAR;
1008     DECLARE_MASK_MASK_VAR;
1009 
1010     while (width && ((uintptr_t)dest & 15))
1011     {
1012 	uint32_t m = ALPHA_8 (*mask++);
1013 	uint32_t s = *src++;
1014 	uint32_t a = ALPHA_8 (~(*dest));
1015 
1016 	UN8x4_MUL_UN8 (s, m);
1017 	UN8x4_MUL_UN8 (s, a);
1018 
1019 	*dest++ = s;
1020 	width--;
1021     }
1022 
1023     COMPUTE_SHIFT_MASKC (dest, src, mask);
1024 
1025     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1026     for (i = width / 4; i > 0; i--)
1027     {
1028 	LOAD_VECTORSM (dest, src, mask);
1029 
1030 	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
1031 
1032 	STORE_VECTOR (dest);
1033 
1034 	src += 4;
1035 	dest += 4;
1036 	mask += 4;
1037     }
1038 
1039     for (i = width % 4; --i >= 0;)
1040     {
1041 	uint32_t m = ALPHA_8 (mask[i]);
1042 	uint32_t s = src[i];
1043 	uint32_t a = ALPHA_8 (~dest[i]);
1044 
1045 	UN8x4_MUL_UN8 (s, m);
1046 	UN8x4_MUL_UN8 (s, a);
1047 
1048 	dest[i] = s;
1049     }
1050 }
1051 
1052 static void
vmx_combine_out_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1053 vmx_combine_out_u (pixman_implementation_t *imp,
1054                    pixman_op_t              op,
1055                    uint32_t *               dest,
1056                    const uint32_t *         src,
1057                    const uint32_t *         mask,
1058                    int                      width)
1059 {
1060     if (mask)
1061 	vmx_combine_out_u_mask (dest, src, mask, width);
1062     else
1063 	vmx_combine_out_u_no_mask (dest, src, width);
1064 }
1065 
1066 static void
vmx_combine_out_reverse_u_no_mask(uint32_t * dest,const uint32_t * src,int width)1067 vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
1068                                    const uint32_t *src,
1069                                    int             width)
1070 {
1071     int i;
1072     vector unsigned int vdest, vsrc;
1073     DECLARE_SRC_MASK_VAR;
1074 
1075     while (width && ((uintptr_t)dest & 15))
1076     {
1077 	uint32_t d = *dest;
1078 	uint32_t a = ALPHA_8 (~(*src++));
1079 
1080 	UN8x4_MUL_UN8 (d, a);
1081 
1082 	*dest++ = d;
1083 	width--;
1084     }
1085 
1086     COMPUTE_SHIFT_MASKS (dest, src);
1087 
1088     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1089     for (i = width / 4; i > 0; i--)
1090     {
1091 
1092 	LOAD_VECTORS (dest, src);
1093 
1094 	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
1095 
1096 	STORE_VECTOR (dest);
1097 
1098 	src += 4;
1099 	dest += 4;
1100     }
1101 
1102     for (i = width % 4; --i >= 0;)
1103     {
1104 	uint32_t d = dest[i];
1105 	uint32_t a = ALPHA_8 (~src[i]);
1106 
1107 	UN8x4_MUL_UN8 (d, a);
1108 
1109 	dest[i] = d;
1110     }
1111 }
1112 
1113 static void
vmx_combine_out_reverse_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1114 vmx_combine_out_reverse_u_mask (uint32_t *      dest,
1115                                 const uint32_t *src,
1116                                 const uint32_t *mask,
1117                                 int             width)
1118 {
1119     int i;
1120     vector unsigned int vdest, vsrc, vmask;
1121     DECLARE_SRC_MASK_VAR;
1122     DECLARE_MASK_MASK_VAR;
1123 
1124     while (width && ((uintptr_t)dest & 15))
1125     {
1126 	uint32_t m = ALPHA_8 (*mask++);
1127 	uint32_t d = *dest;
1128 	uint32_t a = *src++;
1129 
1130 	UN8x4_MUL_UN8 (a, m);
1131 	a = ALPHA_8 (~a);
1132 	UN8x4_MUL_UN8 (d, a);
1133 
1134 	*dest++ = d;
1135 	width--;
1136     }
1137 
1138     COMPUTE_SHIFT_MASKC (dest, src, mask);
1139 
1140     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1141     for (i = width / 4; i > 0; i--)
1142     {
1143 	LOAD_VECTORSM (dest, src, mask);
1144 
1145 	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
1146 
1147 	STORE_VECTOR (dest);
1148 
1149 	src += 4;
1150 	dest += 4;
1151 	mask += 4;
1152     }
1153 
1154     for (i = width % 4; --i >= 0;)
1155     {
1156 	uint32_t m = ALPHA_8 (mask[i]);
1157 	uint32_t d = dest[i];
1158 	uint32_t a = src[i];
1159 
1160 	UN8x4_MUL_UN8 (a, m);
1161 	a = ALPHA_8 (~a);
1162 	UN8x4_MUL_UN8 (d, a);
1163 
1164 	dest[i] = d;
1165     }
1166 }
1167 
1168 static void
vmx_combine_out_reverse_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1169 vmx_combine_out_reverse_u (pixman_implementation_t *imp,
1170                            pixman_op_t              op,
1171                            uint32_t *               dest,
1172                            const uint32_t *         src,
1173                            const uint32_t *         mask,
1174                            int                      width)
1175 {
1176     if (mask)
1177 	vmx_combine_out_reverse_u_mask (dest, src, mask, width);
1178     else
1179 	vmx_combine_out_reverse_u_no_mask (dest, src, width);
1180 }
1181 
1182 static void
vmx_combine_atop_u_no_mask(uint32_t * dest,const uint32_t * src,int width)1183 vmx_combine_atop_u_no_mask (uint32_t *      dest,
1184                             const uint32_t *src,
1185                             int             width)
1186 {
1187     int i;
1188     vector unsigned int vdest, vsrc;
1189     DECLARE_SRC_MASK_VAR;
1190 
1191     while (width && ((uintptr_t)dest & 15))
1192     {
1193 	uint32_t s = *src++;
1194 	uint32_t d = *dest;
1195 	uint32_t dest_a = ALPHA_8 (d);
1196 	uint32_t src_ia = ALPHA_8 (~s);
1197 
1198 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1199 
1200 	*dest++ = s;
1201 	width--;
1202     }
1203 
1204     COMPUTE_SHIFT_MASKS (dest, src);
1205 
1206     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1207     for (i = width / 4; i > 0; i--)
1208     {
1209 	LOAD_VECTORS (dest, src);
1210 
1211 	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
1212 			     vdest, splat_alpha (negate (vsrc)));
1213 
1214 	STORE_VECTOR (dest);
1215 
1216 	src += 4;
1217 	dest += 4;
1218     }
1219 
1220     for (i = width % 4; --i >= 0;)
1221     {
1222 	uint32_t s = src[i];
1223 	uint32_t d = dest[i];
1224 	uint32_t dest_a = ALPHA_8 (d);
1225 	uint32_t src_ia = ALPHA_8 (~s);
1226 
1227 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1228 
1229 	dest[i] = s;
1230     }
1231 }
1232 
1233 static void
vmx_combine_atop_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1234 vmx_combine_atop_u_mask (uint32_t *      dest,
1235                          const uint32_t *src,
1236                          const uint32_t *mask,
1237                          int             width)
1238 {
1239     int i;
1240     vector unsigned int vdest, vsrc, vmask;
1241     DECLARE_SRC_MASK_VAR;
1242     DECLARE_MASK_MASK_VAR;
1243 
1244     while (width && ((uintptr_t)dest & 15))
1245     {
1246 	uint32_t m = ALPHA_8 (*mask++);
1247 	uint32_t s = *src++;
1248 	uint32_t d = *dest;
1249 	uint32_t dest_a = ALPHA_8 (d);
1250 	uint32_t src_ia;
1251 
1252 	UN8x4_MUL_UN8 (s, m);
1253 
1254 	src_ia = ALPHA_8 (~s);
1255 
1256 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1257 
1258 	*dest++ = s;
1259 	width--;
1260     }
1261 
1262     COMPUTE_SHIFT_MASKC (dest, src, mask);
1263 
1264     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1265     for (i = width / 4; i > 0; i--)
1266     {
1267 	LOAD_VECTORSM (dest, src, mask);
1268 
1269 	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
1270 			     vdest, splat_alpha (negate (vsrc)));
1271 
1272 	STORE_VECTOR (dest);
1273 
1274 	src += 4;
1275 	dest += 4;
1276 	mask += 4;
1277     }
1278 
1279     for (i = width % 4; --i >= 0;)
1280     {
1281 	uint32_t m = ALPHA_8 (mask[i]);
1282 	uint32_t s = src[i];
1283 	uint32_t d = dest[i];
1284 	uint32_t dest_a = ALPHA_8 (d);
1285 	uint32_t src_ia;
1286 
1287 	UN8x4_MUL_UN8 (s, m);
1288 
1289 	src_ia = ALPHA_8 (~s);
1290 
1291 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1292 
1293 	dest[i] = s;
1294     }
1295 }
1296 
1297 static void
vmx_combine_atop_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1298 vmx_combine_atop_u (pixman_implementation_t *imp,
1299                     pixman_op_t              op,
1300                     uint32_t *               dest,
1301                     const uint32_t *         src,
1302                     const uint32_t *         mask,
1303                     int                      width)
1304 {
1305     if (mask)
1306 	vmx_combine_atop_u_mask (dest, src, mask, width);
1307     else
1308 	vmx_combine_atop_u_no_mask (dest, src, width);
1309 }
1310 
1311 static void
vmx_combine_atop_reverse_u_no_mask(uint32_t * dest,const uint32_t * src,int width)1312 vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
1313                                     const uint32_t *src,
1314                                     int             width)
1315 {
1316     int i;
1317     vector unsigned int vdest, vsrc;
1318     DECLARE_SRC_MASK_VAR;
1319 
1320     while (width && ((uintptr_t)dest & 15))
1321     {
1322 	uint32_t s = *src++;
1323 	uint32_t d = *dest;
1324 	uint32_t src_a = ALPHA_8 (s);
1325 	uint32_t dest_ia = ALPHA_8 (~d);
1326 
1327 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1328 
1329 	*dest++ = s;
1330 	width--;
1331     }
1332 
1333     COMPUTE_SHIFT_MASKS (dest, src);
1334 
1335     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1336     for (i = width / 4; i > 0; i--)
1337     {
1338 	LOAD_VECTORS (dest, src);
1339 
1340 	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
1341 			     vsrc, splat_alpha (negate (vdest)));
1342 
1343 	STORE_VECTOR (dest);
1344 
1345 	src += 4;
1346 	dest += 4;
1347     }
1348 
1349     for (i = width % 4; --i >= 0;)
1350     {
1351 	uint32_t s = src[i];
1352 	uint32_t d = dest[i];
1353 	uint32_t src_a = ALPHA_8 (s);
1354 	uint32_t dest_ia = ALPHA_8 (~d);
1355 
1356 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1357 
1358 	dest[i] = s;
1359     }
1360 }
1361 
1362 static void
vmx_combine_atop_reverse_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1363 vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
1364                                  const uint32_t *src,
1365                                  const uint32_t *mask,
1366                                  int             width)
1367 {
1368     int i;
1369     vector unsigned int vdest, vsrc, vmask;
1370     DECLARE_SRC_MASK_VAR;
1371     DECLARE_MASK_MASK_VAR;
1372 
1373     while (width && ((uintptr_t)dest & 15))
1374     {
1375 	uint32_t m = ALPHA_8 (*mask++);
1376 	uint32_t s = *src++;
1377 	uint32_t d = *dest;
1378 	uint32_t src_a;
1379 	uint32_t dest_ia = ALPHA_8 (~d);
1380 
1381 	UN8x4_MUL_UN8 (s, m);
1382 
1383 	src_a = ALPHA_8 (s);
1384 
1385 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1386 
1387 	*dest++ = s;
1388 	width--;
1389     }
1390 
1391     COMPUTE_SHIFT_MASKC (dest, src, mask);
1392 
1393     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1394     for (i = width / 4; i > 0; i--)
1395     {
1396 	LOAD_VECTORSM (dest, src, mask);
1397 
1398 	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
1399 			     vsrc, splat_alpha (negate (vdest)));
1400 
1401 	STORE_VECTOR (dest);
1402 
1403 	src += 4;
1404 	dest += 4;
1405 	mask += 4;
1406     }
1407 
1408     for (i = width % 4; --i >= 0;)
1409     {
1410 	uint32_t m = ALPHA_8 (mask[i]);
1411 	uint32_t s = src[i];
1412 	uint32_t d = dest[i];
1413 	uint32_t src_a;
1414 	uint32_t dest_ia = ALPHA_8 (~d);
1415 
1416 	UN8x4_MUL_UN8 (s, m);
1417 
1418 	src_a = ALPHA_8 (s);
1419 
1420 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1421 
1422 	dest[i] = s;
1423     }
1424 }
1425 
1426 static void
vmx_combine_atop_reverse_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1427 vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
1428                             pixman_op_t              op,
1429                             uint32_t *               dest,
1430                             const uint32_t *         src,
1431                             const uint32_t *         mask,
1432                             int                      width)
1433 {
1434     if (mask)
1435 	vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
1436     else
1437 	vmx_combine_atop_reverse_u_no_mask (dest, src, width);
1438 }
1439 
1440 static void
vmx_combine_xor_u_no_mask(uint32_t * dest,const uint32_t * src,int width)1441 vmx_combine_xor_u_no_mask (uint32_t *      dest,
1442                            const uint32_t *src,
1443                            int             width)
1444 {
1445     int i;
1446     vector unsigned int vdest, vsrc;
1447     DECLARE_SRC_MASK_VAR;
1448 
1449     while (width && ((uintptr_t)dest & 15))
1450     {
1451 	uint32_t s = *src++;
1452 	uint32_t d = *dest;
1453 	uint32_t src_ia = ALPHA_8 (~s);
1454 	uint32_t dest_ia = ALPHA_8 (~d);
1455 
1456 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1457 
1458 	*dest++ = s;
1459 	width--;
1460     }
1461 
1462     COMPUTE_SHIFT_MASKS (dest, src);
1463 
1464     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1465     for (i = width / 4; i > 0; i--)
1466     {
1467 	LOAD_VECTORS (dest, src);
1468 
1469 	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
1470 			     vdest, splat_alpha (negate (vsrc)));
1471 
1472 	STORE_VECTOR (dest);
1473 
1474 	src += 4;
1475 	dest += 4;
1476     }
1477 
1478     for (i = width % 4; --i >= 0;)
1479     {
1480 	uint32_t s = src[i];
1481 	uint32_t d = dest[i];
1482 	uint32_t src_ia = ALPHA_8 (~s);
1483 	uint32_t dest_ia = ALPHA_8 (~d);
1484 
1485 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1486 
1487 	dest[i] = s;
1488     }
1489 }
1490 
1491 static void
vmx_combine_xor_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1492 vmx_combine_xor_u_mask (uint32_t *      dest,
1493                         const uint32_t *src,
1494                         const uint32_t *mask,
1495                         int             width)
1496 {
1497     int i;
1498     vector unsigned int vdest, vsrc, vmask;
1499     DECLARE_SRC_MASK_VAR;
1500     DECLARE_MASK_MASK_VAR;
1501 
1502     while (width && ((uintptr_t)dest & 15))
1503     {
1504 	uint32_t m = ALPHA_8 (*mask++);
1505 	uint32_t s = *src++;
1506 	uint32_t d = *dest;
1507 	uint32_t src_ia;
1508 	uint32_t dest_ia = ALPHA_8 (~d);
1509 
1510 	UN8x4_MUL_UN8 (s, m);
1511 
1512 	src_ia = ALPHA_8 (~s);
1513 
1514 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1515 
1516 	*dest++ = s;
1517 	width--;
1518     }
1519 
1520     COMPUTE_SHIFT_MASKC (dest, src, mask);
1521 
1522     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1523     for (i = width / 4; i > 0; i--)
1524     {
1525 	LOAD_VECTORSM (dest, src, mask);
1526 
1527 	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
1528 			     vdest, splat_alpha (negate (vsrc)));
1529 
1530 	STORE_VECTOR (dest);
1531 
1532 	src += 4;
1533 	dest += 4;
1534 	mask += 4;
1535     }
1536 
1537     for (i = width % 4; --i >= 0;)
1538     {
1539 	uint32_t m = ALPHA_8 (mask[i]);
1540 	uint32_t s = src[i];
1541 	uint32_t d = dest[i];
1542 	uint32_t src_ia;
1543 	uint32_t dest_ia = ALPHA_8 (~d);
1544 
1545 	UN8x4_MUL_UN8 (s, m);
1546 
1547 	src_ia = ALPHA_8 (~s);
1548 
1549 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1550 
1551 	dest[i] = s;
1552     }
1553 }
1554 
1555 static void
vmx_combine_xor_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1556 vmx_combine_xor_u (pixman_implementation_t *imp,
1557                    pixman_op_t              op,
1558                    uint32_t *               dest,
1559                    const uint32_t *         src,
1560                    const uint32_t *         mask,
1561                    int                      width)
1562 {
1563     if (mask)
1564 	vmx_combine_xor_u_mask (dest, src, mask, width);
1565     else
1566 	vmx_combine_xor_u_no_mask (dest, src, width);
1567 }
1568 
1569 static void
vmx_combine_add_u_no_mask(uint32_t * dest,const uint32_t * src,int width)1570 vmx_combine_add_u_no_mask (uint32_t *      dest,
1571                            const uint32_t *src,
1572                            int             width)
1573 {
1574     int i;
1575     vector unsigned int vdest, vsrc;
1576     DECLARE_SRC_MASK_VAR;
1577 
1578     while (width && ((uintptr_t)dest & 15))
1579     {
1580 	uint32_t s = *src++;
1581 	uint32_t d = *dest;
1582 
1583 	UN8x4_ADD_UN8x4 (d, s);
1584 
1585 	*dest++ = d;
1586 	width--;
1587     }
1588 
1589     COMPUTE_SHIFT_MASKS (dest, src);
1590     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1591     for (i = width / 4; i > 0; i--)
1592     {
1593 	LOAD_VECTORS (dest, src);
1594 
1595 	vdest = pix_add (vsrc, vdest);
1596 
1597 	STORE_VECTOR (dest);
1598 
1599 	src += 4;
1600 	dest += 4;
1601     }
1602 
1603     for (i = width % 4; --i >= 0;)
1604     {
1605 	uint32_t s = src[i];
1606 	uint32_t d = dest[i];
1607 
1608 	UN8x4_ADD_UN8x4 (d, s);
1609 
1610 	dest[i] = d;
1611     }
1612 }
1613 
1614 static void
vmx_combine_add_u_mask(uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1615 vmx_combine_add_u_mask (uint32_t *      dest,
1616                         const uint32_t *src,
1617                         const uint32_t *mask,
1618                         int             width)
1619 {
1620     int i;
1621     vector unsigned int vdest, vsrc, vmask;
1622     DECLARE_SRC_MASK_VAR;
1623     DECLARE_MASK_MASK_VAR;
1624 
1625     while (width && ((uintptr_t)dest & 15))
1626     {
1627 	uint32_t m = ALPHA_8 (*mask++);
1628 	uint32_t s = *src++;
1629 	uint32_t d = *dest;
1630 
1631 	UN8x4_MUL_UN8 (s, m);
1632 	UN8x4_ADD_UN8x4 (d, s);
1633 
1634 	*dest++ = d;
1635 	width--;
1636     }
1637 
1638     COMPUTE_SHIFT_MASKC (dest, src, mask);
1639 
1640     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1641     for (i = width / 4; i > 0; i--)
1642     {
1643 	LOAD_VECTORSM (dest, src, mask);
1644 
1645 	vdest = pix_add (vsrc, vdest);
1646 
1647 	STORE_VECTOR (dest);
1648 
1649 	src += 4;
1650 	dest += 4;
1651 	mask += 4;
1652     }
1653 
1654     for (i = width % 4; --i >= 0;)
1655     {
1656 	uint32_t m = ALPHA_8 (mask[i]);
1657 	uint32_t s = src[i];
1658 	uint32_t d = dest[i];
1659 
1660 	UN8x4_MUL_UN8 (s, m);
1661 	UN8x4_ADD_UN8x4 (d, s);
1662 
1663 	dest[i] = d;
1664     }
1665 }
1666 
1667 static void
vmx_combine_add_u(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1668 vmx_combine_add_u (pixman_implementation_t *imp,
1669                    pixman_op_t              op,
1670                    uint32_t *               dest,
1671                    const uint32_t *         src,
1672                    const uint32_t *         mask,
1673                    int                      width)
1674 {
1675     if (mask)
1676 	vmx_combine_add_u_mask (dest, src, mask, width);
1677     else
1678 	vmx_combine_add_u_no_mask (dest, src, width);
1679 }
1680 
1681 static void
vmx_combine_src_ca(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1682 vmx_combine_src_ca (pixman_implementation_t *imp,
1683                     pixman_op_t              op,
1684                     uint32_t *               dest,
1685                     const uint32_t *         src,
1686                     const uint32_t *         mask,
1687                     int                      width)
1688 {
1689     int i;
1690     vector unsigned int vdest, vsrc, vmask;
1691     DECLARE_SRC_MASK_VAR;
1692     DECLARE_MASK_MASK_VAR;
1693 
1694     while (width && ((uintptr_t)dest & 15))
1695     {
1696 	uint32_t a = *mask++;
1697 	uint32_t s = *src++;
1698 
1699 	UN8x4_MUL_UN8x4 (s, a);
1700 
1701 	*dest++ = s;
1702 	width--;
1703     }
1704 
1705     COMPUTE_SHIFT_MASKC (dest, src, mask);
1706 
1707     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1708     for (i = width / 4; i > 0; i--)
1709     {
1710 	LOAD_VECTORSC (dest, src, mask);
1711 
1712 	vdest = pix_multiply (vsrc, vmask);
1713 
1714 	STORE_VECTOR (dest);
1715 
1716 	mask += 4;
1717 	src += 4;
1718 	dest += 4;
1719     }
1720 
1721     for (i = width % 4; --i >= 0;)
1722     {
1723 	uint32_t a = mask[i];
1724 	uint32_t s = src[i];
1725 
1726 	UN8x4_MUL_UN8x4 (s, a);
1727 
1728 	dest[i] = s;
1729     }
1730 }
1731 
1732 static void
vmx_combine_over_ca(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1733 vmx_combine_over_ca (pixman_implementation_t *imp,
1734                      pixman_op_t              op,
1735                      uint32_t *               dest,
1736                      const uint32_t *         src,
1737                      const uint32_t *         mask,
1738                      int                      width)
1739 {
1740     int i;
1741     vector unsigned int vdest, vsrc, vmask;
1742     DECLARE_SRC_MASK_VAR;
1743     DECLARE_MASK_MASK_VAR;
1744 
1745     while (width && ((uintptr_t)dest & 15))
1746     {
1747 	uint32_t a = *mask++;
1748 	uint32_t s = *src++;
1749 	uint32_t d = *dest;
1750 	uint32_t sa = ALPHA_8 (s);
1751 
1752 	UN8x4_MUL_UN8x4 (s, a);
1753 	UN8x4_MUL_UN8 (a, sa);
1754 	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
1755 
1756 	*dest++ = d;
1757 	width--;
1758     }
1759 
1760     COMPUTE_SHIFT_MASKC (dest, src, mask);
1761 
1762     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1763     for (i = width / 4; i > 0; i--)
1764     {
1765 	LOAD_VECTORSC (dest, src, mask);
1766 
1767 	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
1768 
1769 	STORE_VECTOR (dest);
1770 
1771 	mask += 4;
1772 	src += 4;
1773 	dest += 4;
1774     }
1775 
1776     for (i = width % 4; --i >= 0;)
1777     {
1778 	uint32_t a = mask[i];
1779 	uint32_t s = src[i];
1780 	uint32_t d = dest[i];
1781 	uint32_t sa = ALPHA_8 (s);
1782 
1783 	UN8x4_MUL_UN8x4 (s, a);
1784 	UN8x4_MUL_UN8 (a, sa);
1785 	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
1786 
1787 	dest[i] = d;
1788     }
1789 }
1790 
1791 static void
vmx_combine_over_reverse_ca(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1792 vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1793                              pixman_op_t              op,
1794                              uint32_t *               dest,
1795                              const uint32_t *         src,
1796                              const uint32_t *         mask,
1797                              int                      width)
1798 {
1799     int i;
1800     vector unsigned int vdest, vsrc, vmask;
1801     DECLARE_SRC_MASK_VAR;
1802     DECLARE_MASK_MASK_VAR;
1803 
1804     while (width && ((uintptr_t)dest & 15))
1805     {
1806 	uint32_t a = *mask++;
1807 	uint32_t s = *src++;
1808 	uint32_t d = *dest;
1809 	uint32_t ida = ALPHA_8 (~d);
1810 
1811 	UN8x4_MUL_UN8x4 (s, a);
1812 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
1813 
1814 	*dest++ = s;
1815 	width--;
1816     }
1817 
1818     COMPUTE_SHIFT_MASKC (dest, src, mask);
1819 
1820     /* printf("%s\n",__PRETTY_FUNCTION__); */
1821     for (i = width / 4; i > 0; i--)
1822     {
1823 	LOAD_VECTORSC (dest, src, mask);
1824 
1825 	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
1826 
1827 	STORE_VECTOR (dest);
1828 
1829 	mask += 4;
1830 	src += 4;
1831 	dest += 4;
1832     }
1833 
1834     for (i = width % 4; --i >= 0;)
1835     {
1836 	uint32_t a = mask[i];
1837 	uint32_t s = src[i];
1838 	uint32_t d = dest[i];
1839 	uint32_t ida = ALPHA_8 (~d);
1840 
1841 	UN8x4_MUL_UN8x4 (s, a);
1842 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
1843 
1844 	dest[i] = s;
1845     }
1846 }
1847 
1848 static void
vmx_combine_in_ca(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1849 vmx_combine_in_ca (pixman_implementation_t *imp,
1850                    pixman_op_t              op,
1851                    uint32_t *               dest,
1852                    const uint32_t *         src,
1853                    const uint32_t *         mask,
1854                    int                      width)
1855 {
1856     int i;
1857     vector unsigned int vdest, vsrc, vmask;
1858     DECLARE_SRC_MASK_VAR;
1859     DECLARE_MASK_MASK_VAR;
1860 
1861     while (width && ((uintptr_t)dest & 15))
1862     {
1863 	uint32_t a = *mask++;
1864 	uint32_t s = *src++;
1865 	uint32_t da = ALPHA_8 (*dest);
1866 
1867 	UN8x4_MUL_UN8x4 (s, a);
1868 	UN8x4_MUL_UN8 (s, da);
1869 
1870 	*dest++ = s;
1871 	width--;
1872     }
1873 
1874     COMPUTE_SHIFT_MASKC (dest, src, mask);
1875 
1876     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1877     for (i = width / 4; i > 0; i--)
1878     {
1879 	LOAD_VECTORSC (dest, src, mask);
1880 
1881 	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
1882 
1883 	STORE_VECTOR (dest);
1884 
1885 	src += 4;
1886 	dest += 4;
1887 	mask += 4;
1888     }
1889 
1890     for (i = width % 4; --i >= 0;)
1891     {
1892 	uint32_t a = mask[i];
1893 	uint32_t s = src[i];
1894 	uint32_t da = ALPHA_8 (dest[i]);
1895 
1896 	UN8x4_MUL_UN8x4 (s, a);
1897 	UN8x4_MUL_UN8 (s, da);
1898 
1899 	dest[i] = s;
1900     }
1901 }
1902 
1903 static void
vmx_combine_in_reverse_ca(pixman_implementation_t * imp,pixman_op_t op,uint32_t * dest,const uint32_t * src,const uint32_t * mask,int width)1904 vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1905                            pixman_op_t              op,
1906                            uint32_t *               dest,
1907                            const uint32_t *         src,
1908                            const uint32_t *         mask,
1909                            int                      width)
1910 {
1911     int i;
1912     vector unsigned int vdest, vsrc, vmask;
1913     DECLARE_SRC_MASK_VAR;
1914     DECLARE_MASK_MASK_VAR;
1915 
1916     while (width && ((uintptr_t)dest & 15))
1917     {
1918 	uint32_t a = *mask++;
1919 	uint32_t d = *dest;
1920 	uint32_t sa = ALPHA_8 (*src++);
1921 
1922 	UN8x4_MUL_UN8 (a, sa);
1923 	UN8x4_MUL_UN8x4 (d, a);
1924 
1925 	*dest++ = d;
1926 	width--;
1927     }
1928 
1929     COMPUTE_SHIFT_MASKC (dest, src, mask);
1930 
1931     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1932     for (i = width / 4; i > 0; i--)
1933     {
1934 
1935 	LOAD_VECTORSC (dest, src, mask);
1936 
1937 	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
1938 
1939 	STORE_VECTOR (dest);
1940 
1941 	src += 4;
1942 	dest += 4;
1943 	mask += 4;
1944     }
1945 
1946     for (i = width % 4; --i >= 0;)
1947     {
1948 	uint32_t a = mask[i];
1949 	uint32_t d = dest[i];
1950 	uint32_t sa = ALPHA_8 (src[i]);
1951 
1952 	UN8x4_MUL_UN8 (a, sa);
1953 	UN8x4_MUL_UN8x4 (d, a);
1954 
1955 	dest[i] = d;
1956     }
1957 }
1958 
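/* Component-alpha OUT: per channel,
 *
 *     dest = (src * mask) * (1 - ALPHA (dest))
 */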
1959 static void
1960 vmx_combine_out_ca (pixman_implementation_t *imp,
1961                     pixman_op_t              op,
1962                     uint32_t *               dest,
1963                     const uint32_t *         src,
1964                     const uint32_t *         mask,
1965                     int                      width)
1966 {
1967     int i;
1968     vector unsigned int vdest, vsrc, vmask;
1969     DECLARE_SRC_MASK_VAR;
1970     DECLARE_MASK_MASK_VAR;
1971 
1972     while (width && ((uintptr_t)dest & 15))
1973     {
1974 	uint32_t a = *mask++;
1975 	uint32_t s = *src++;
1976 	uint32_t d = *dest;
1977 	uint32_t da = ALPHA_8 (~d);
1978 
1979 	UN8x4_MUL_UN8x4 (s, a);
1980 	UN8x4_MUL_UN8 (s, da);
1981 
1982 	*dest++ = s;
1983 	width--;
1984     }
1985 
1986     COMPUTE_SHIFT_MASKC (dest, src, mask);
1987 
1988     /* printf ("%s\n",__PRETTY_FUNCTION__); */
1989     for (i = width / 4; i > 0; i--)
1990     {
1991 	LOAD_VECTORSC (dest, src, mask);
1992 
1993 	vdest = pix_multiply (
1994 	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
1995 
1996 	STORE_VECTOR (dest);
1997 
1998 	src += 4;
1999 	dest += 4;
2000 	mask += 4;
2001     }
2002 
2003     for (i = width % 4; --i >= 0;)
2004     {
2005 	uint32_t a = mask[i];
2006 	uint32_t s = src[i];
2007 	uint32_t d = dest[i];
2008 	uint32_t da = ALPHA_8 (~d);
2009 
2010 	UN8x4_MUL_UN8x4 (s, a);
2011 	UN8x4_MUL_UN8 (s, da);
2012 
2013 	dest[i] = s;
2014     }
2015 }
2016 
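/* Component-alpha OUT_REVERSE: per channel,
 *
 *     dest = dest * (1 - mask * ALPHA (src))
 */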
2017 static void
2018 vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
2019                             pixman_op_t              op,
2020                             uint32_t *               dest,
2021                             const uint32_t *         src,
2022                             const uint32_t *         mask,
2023                             int                      width)
2024 {
2025     int i;
2026     vector unsigned int vdest, vsrc, vmask;
2027     DECLARE_SRC_MASK_VAR;
2028     DECLARE_MASK_MASK_VAR;
2029 
2030     while (width && ((uintptr_t)dest & 15))
2031     {
2032 	uint32_t a = *mask++;
2033 	uint32_t s = *src++;
2034 	uint32_t d = *dest;
2035 	uint32_t sa = ALPHA_8 (s);
2036 
2037 	UN8x4_MUL_UN8 (a, sa);
2038 	UN8x4_MUL_UN8x4 (d, ~a);
2039 
2040 	*dest++ = d;
2041 	width--;
2042     }
2043 
2044     COMPUTE_SHIFT_MASKC (dest, src, mask);
2045 
2046     /* printf ("%s\n",__PRETTY_FUNCTION__); */
2047     for (i = width / 4; i > 0; i--)
2048     {
2049 	LOAD_VECTORSC (dest, src, mask);
2050 
2051 	vdest = pix_multiply (
2052 	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
2053 
2054 	STORE_VECTOR (dest);
2055 
2056 	src += 4;
2057 	dest += 4;
2058 	mask += 4;
2059     }
2060 
2061     for (i = width % 4; --i >= 0;)
2062     {
2063 	uint32_t a = mask[i];
2064 	uint32_t s = src[i];
2065 	uint32_t d = dest[i];
2066 	uint32_t sa = ALPHA_8 (s);
2067 
2068 	UN8x4_MUL_UN8 (a, sa);
2069 	UN8x4_MUL_UN8x4 (d, ~a);
2070 
2071 	dest[i] = d;
2072     }
2073 }
2074 
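/* Component-alpha ATOP: per channel,
 *
 *     dest = (src * mask) * ALPHA (dest)
 *          + dest * (1 - mask * ALPHA (src))
 */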
2075 static void
2076 vmx_combine_atop_ca (pixman_implementation_t *imp,
2077                      pixman_op_t              op,
2078                      uint32_t *               dest,
2079                      const uint32_t *         src,
2080                      const uint32_t *         mask,
2081                      int                      width)
2082 {
2083     int i;
2084     vector unsigned int vdest, vsrc, vmask, vsrca;
2085     DECLARE_SRC_MASK_VAR;
2086     DECLARE_MASK_MASK_VAR;
2087 
2088     while (width && ((uintptr_t)dest & 15))
2089     {
2090 	uint32_t a = *mask++;
2091 	uint32_t s = *src++;
2092 	uint32_t d = *dest;
2093 	uint32_t sa = ALPHA_8 (s);
2094 	uint32_t da = ALPHA_8 (d);
2095 
2096 	UN8x4_MUL_UN8x4 (s, a);
2097 	UN8x4_MUL_UN8 (a, sa);
2098 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2099 
2100 	*dest++ = d;
2101 	width--;
2102     }
2103 
2104     COMPUTE_SHIFT_MASKC (dest, src, mask);
2105 
2106     /* printf ("%s\n",__PRETTY_FUNCTION__); */
2107     for (i = width / 4; i > 0; i--)
2108     {
2109 	LOAD_VECTORSC (dest, src, mask);
2110 
2111 	vsrca = splat_alpha (vsrc);
2112 
2113 	vsrc = pix_multiply (vsrc, vmask);
2114 	vmask = pix_multiply (vmask, vsrca);
2115 
2116 	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
2117 			     negate (vmask), vdest);
2118 
2119 	STORE_VECTOR (dest);
2120 
2121 	src += 4;
2122 	dest += 4;
2123 	mask += 4;
2124     }
2125 
2126     for (i = width % 4; --i >= 0;)
2127     {
2128 	uint32_t a = mask[i];
2129 	uint32_t s = src[i];
2130 	uint32_t d = dest[i];
2131 	uint32_t sa = ALPHA_8 (s);
2132 	uint32_t da = ALPHA_8 (d);
2133 
2134 	UN8x4_MUL_UN8x4 (s, a);
2135 	UN8x4_MUL_UN8 (a, sa);
2136 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2137 
2138 	dest[i] = d;
2139     }
2140 }
2141 
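/* Component-alpha ATOP_REVERSE: per channel,
 *
 *     dest = dest * (mask * ALPHA (src))
 *          + (src * mask) * (1 - ALPHA (dest))
 */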
2142 static void
2143 vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
2144                              pixman_op_t              op,
2145                              uint32_t *               dest,
2146                              const uint32_t *         src,
2147                              const uint32_t *         mask,
2148                              int                      width)
2149 {
2150     int i;
2151     vector unsigned int vdest, vsrc, vmask;
2152     DECLARE_SRC_MASK_VAR;
2153     DECLARE_MASK_MASK_VAR;
2154 
2155     while (width && ((uintptr_t)dest & 15))
2156     {
2157 	uint32_t a = *mask++;
2158 	uint32_t s = *src++;
2159 	uint32_t d = *dest;
2160 	uint32_t sa = ALPHA_8 (s);
2161 	uint32_t da = ALPHA_8 (~d);
2162 
2163 	UN8x4_MUL_UN8x4 (s, a);
2164 	UN8x4_MUL_UN8 (a, sa);
2165 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
2166 
2167 	*dest++ = d;
2168 	width--;
2169     }
2170 
2171     COMPUTE_SHIFT_MASKC (dest, src, mask);
2172 
2173     /* printf ("%s\n",__PRETTY_FUNCTION__); */
2174     for (i = width / 4; i > 0; i--)
2175     {
2176 	LOAD_VECTORSC (dest, src, mask);
2177 
2178 	vdest = pix_add_mul (vdest,
2179 			     pix_multiply (vmask, splat_alpha (vsrc)),
2180 			     pix_multiply (vsrc, vmask),
2181 			     negate (splat_alpha (vdest)));
2182 
2183 	STORE_VECTOR (dest);
2184 
2185 	src += 4;
2186 	dest += 4;
2187 	mask += 4;
2188     }
2189 
2190     for (i = width % 4; --i >= 0;)
2191     {
2192 	uint32_t a = mask[i];
2193 	uint32_t s = src[i];
2194 	uint32_t d = dest[i];
2195 	uint32_t sa = ALPHA_8 (s);
2196 	uint32_t da = ALPHA_8 (~d);
2197 
2198 	UN8x4_MUL_UN8x4 (s, a);
2199 	UN8x4_MUL_UN8 (a, sa);
2200 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
2201 
2202 	dest[i] = d;
2203     }
2204 }
2205 
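/* Component-alpha XOR: per channel,
 *
 *     dest = (src * mask) * (1 - ALPHA (dest))
 *          + dest * (1 - mask * ALPHA (src))
 */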
2206 static void
2207 vmx_combine_xor_ca (pixman_implementation_t *imp,
2208                     pixman_op_t              op,
2209                     uint32_t *               dest,
2210                     const uint32_t *         src,
2211                     const uint32_t *         mask,
2212                     int                      width)
2213 {
2214     int i;
2215     vector unsigned int vdest, vsrc, vmask;
2216     DECLARE_SRC_MASK_VAR;
2217     DECLARE_MASK_MASK_VAR;
2218 
2219     while (width && ((uintptr_t)dest & 15))
2220     {
2221 	uint32_t a = *mask++;
2222 	uint32_t s = *src++;
2223 	uint32_t d = *dest;
2224 	uint32_t sa = ALPHA_8 (s);
2225 	uint32_t da = ALPHA_8 (~d);
2226 
2227 	UN8x4_MUL_UN8x4 (s, a);
2228 	UN8x4_MUL_UN8 (a, sa);
2229 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2230 
2231 	*dest++ = d;
2232 	width--;
2233     }
2234 
2235     COMPUTE_SHIFT_MASKC (dest, src, mask);
2236 
2237     /* printf ("%s\n",__PRETTY_FUNCTION__); */
2238     for (i = width / 4; i > 0; i--)
2239     {
2240 	LOAD_VECTORSC (dest, src, mask);
2241 
2242 	vdest = pix_add_mul (vdest,
2243 			     negate (pix_multiply (vmask, splat_alpha (vsrc))),
2244 			     pix_multiply (vsrc, vmask),
2245 			     negate (splat_alpha (vdest)));
2246 
2247 	STORE_VECTOR (dest);
2248 
2249 	src += 4;
2250 	dest += 4;
2251 	mask += 4;
2252     }
2253 
2254     for (i = width % 4; --i >= 0;)
2255     {
2256 	uint32_t a = mask[i];
2257 	uint32_t s = src[i];
2258 	uint32_t d = dest[i];
2259 	uint32_t sa = ALPHA_8 (s);
2260 	uint32_t da = ALPHA_8 (~d);
2261 
2262 	UN8x4_MUL_UN8x4 (s, a);
2263 	UN8x4_MUL_UN8 (a, sa);
2264 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2265 
2266 	dest[i] = d;
2267     }
2268 }
2269 
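/* Component-alpha ADD: per channel,
 *
 *     dest = saturate (src * mask + dest)
 */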
2270 static void
2271 vmx_combine_add_ca (pixman_implementation_t *imp,
2272                     pixman_op_t              op,
2273                     uint32_t *               dest,
2274                     const uint32_t *         src,
2275                     const uint32_t *         mask,
2276                     int                      width)
2277 {
2278     int i;
2279     vector unsigned int vdest, vsrc, vmask;
2280     DECLARE_SRC_MASK_VAR;
2281     DECLARE_MASK_MASK_VAR;
2282 
2283     while (width && ((uintptr_t)dest & 15))
2284     {
2285 	uint32_t a = *mask++;
2286 	uint32_t s = *src++;
2287 	uint32_t d = *dest;
2288 
2289 	UN8x4_MUL_UN8x4 (s, a);
2290 	UN8x4_ADD_UN8x4 (s, d);
2291 
2292 	*dest++ = s;
2293 	width--;
2294     }
2295 
2296     COMPUTE_SHIFT_MASKC (dest, src, mask);
2297 
2298     /* printf ("%s\n",__PRETTY_FUNCTION__); */
2299     for (i = width / 4; i > 0; i--)
2300     {
2301 	LOAD_VECTORSC (dest, src, mask);
2302 
2303 	vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
2304 
2305 	STORE_VECTOR (dest);
2306 
2307 	src += 4;
2308 	dest += 4;
2309 	mask += 4;
2310     }
2311 
2312     for (i = width % 4; --i >= 0;)
2313     {
2314 	uint32_t a = mask[i];
2315 	uint32_t s = src[i];
2316 	uint32_t d = dest[i];
2317 
2318 	UN8x4_MUL_UN8x4 (s, a);
2319 	UN8x4_ADD_UN8x4 (s, d);
2320 
2321 	dest[i] = s;
2322     }
2323 }
2324 
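/* Composite a solid source OVER an 8888 destination through an a8 mask:
 *
 *     dest = (src * mask) + dest * (1 - ALPHA (src * mask))
 *
 * In the vector loop, mask words that are entirely zero are skipped,
 * and an opaque source under an all-0xff mask word is stored directly.
 */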
2325 static void
2326 vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
2327                               pixman_composite_info_t *info)
2328 {
2329     PIXMAN_COMPOSITE_ARGS (info);
2330     uint32_t src, srca;
2331     uint32_t *dst_line, *dst;
2332     uint8_t *mask_line;
2333     int dst_stride, mask_stride;
2334     int32_t w;
2335     uint32_t m, d, s, ia;
2336 
2337     vector unsigned int vsrc, valpha, vmask, vdst;
2338 
2339     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2340 
2341     srca = ALPHA_8(src);
2342     if (src == 0)
2343 	return;
2344 
2345     PIXMAN_IMAGE_GET_LINE (
2346 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2347     PIXMAN_IMAGE_GET_LINE (
2348 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2349 
2350     vsrc = (vector unsigned int) {src, src, src, src};
2351     valpha = splat_alpha(vsrc);
2352 
2353     while (height--)
2354     {
2355 	const uint8_t *pm = mask_line;
2356 	dst = dst_line;
2357 	dst_line += dst_stride;
2358 	mask_line += mask_stride;
2359 	w = width;
2360 
2361 	while (w && (uintptr_t)dst & 15)
2362 	{
2363 	    s = src;
2364 	    m = *pm++;
2365 
2366 	    if (m)
2367 	    {
2368 		d = *dst;
2369 		UN8x4_MUL_UN8 (s, m);
2370 		ia = ALPHA_8 (~s);
2371 		UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
2372 		*dst = d;
2373 	    }
2374 
2375 	    w--;
2376 	    dst++;
2377 	}
2378 
2379 	while (w >= 4)
2380 	{
2381 	    m = *((uint32_t*)pm);
2382 
2383 	    if (srca == 0xff && m == 0xffffffff)
2384 	    {
2385 		save_128_aligned(dst, vsrc);
2386 	    }
2387 	    else if (m)
2388 	    {
2389 		vmask = splat_pixel((vector unsigned int) {m, m, m, m});
2390 
2391 		/* dst is 16-byte aligned */
2392 		vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));
2393 
2394 		save_128_aligned(dst, vdst);
2395 	    }
2396 
2397 	    w -= 4;
2398 	    dst += 4;
2399 	    pm += 4;
2400 	}
2401 
2402 	while (w)
2403 	{
2404 	    s = src;
2405 	    m = *pm++;
2406 
2407 	    if (m)
2408 	    {
2409 		d = *dst;
2410 		UN8x4_MUL_UN8 (s, m);
2411 		ia = ALPHA_8 (~s);
2412 		UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
2413 		*dst = d;
2414 	    }
2415 
2416 	    w--;
2417 	    dst++;
2418 	}
2419     }
2420 
2421 }
2422 
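/* Fill a rectangle with a solid value at 8, 16 or 32 bpp.  The filler
 * is replicated to 32 bits, the destination is brought to 16-byte
 * alignment with scalar stores, the bulk is written with vec_st in
 * 128/64/32/16-byte blocks, and a scalar tail finishes the row.
 */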
2423 static pixman_bool_t
2424 vmx_fill (pixman_implementation_t *imp,
2425            uint32_t *               bits,
2426            int                      stride,
2427            int                      bpp,
2428            int                      x,
2429            int                      y,
2430            int                      width,
2431            int                      height,
2432            uint32_t		    filler)
2433 {
2434     uint32_t byte_width;
2435     uint8_t *byte_line;
2436 
2437     vector unsigned int vfiller;
2438 
2439     if (bpp == 8)
2440     {
2441 	uint8_t b;
2442 	uint16_t w;
2443 
2444 	stride = stride * (int) sizeof (uint32_t) / 1;
2445 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2446 	byte_width = width;
2447 	stride *= 1;
2448 
2449 	b = filler & 0xff;
2450 	w = (b << 8) | b;
2451 	filler = (w << 16) | w;
2452     }
2453     else if (bpp == 16)
2454     {
2455 	stride = stride * (int) sizeof (uint32_t) / 2;
2456 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2457 	byte_width = 2 * width;
2458 	stride *= 2;
2459 
2460         filler = (filler & 0xffff) * 0x00010001;
2461     }
2462     else if (bpp == 32)
2463     {
2464 	stride = stride * (int) sizeof (uint32_t) / 4;
2465 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2466 	byte_width = 4 * width;
2467 	stride *= 4;
2468     }
2469     else
2470     {
2471 	return FALSE;
2472     }
2473 
2474     vfiller = create_mask_1x32_128(&filler);
2475 
2476     while (height--)
2477     {
2478 	int w;
2479 	uint8_t *d = byte_line;
2480 	byte_line += stride;
2481 	w = byte_width;
2482 
2483 	if (w >= 1 && ((uintptr_t)d & 1))
2484 	{
2485 	    *(uint8_t *)d = filler;
2486 	    w -= 1;
2487 	    d += 1;
2488 	}
2489 
2490 	while (w >= 2 && ((uintptr_t)d & 3))
2491 	{
2492 	    *(uint16_t *)d = filler;
2493 	    w -= 2;
2494 	    d += 2;
2495 	}
2496 
2497 	while (w >= 4 && ((uintptr_t)d & 15))
2498 	{
2499 	    *(uint32_t *)d = filler;
2500 
2501 	    w -= 4;
2502 	    d += 4;
2503 	}
2504 
2505 	while (w >= 128)
2506 	{
2507 	    vec_st(vfiller, 0, (uint32_t *) d);
2508 	    vec_st(vfiller, 0, (uint32_t *) d + 4);
2509 	    vec_st(vfiller, 0, (uint32_t *) d + 8);
2510 	    vec_st(vfiller, 0, (uint32_t *) d + 12);
2511 	    vec_st(vfiller, 0, (uint32_t *) d + 16);
2512 	    vec_st(vfiller, 0, (uint32_t *) d + 20);
2513 	    vec_st(vfiller, 0, (uint32_t *) d + 24);
2514 	    vec_st(vfiller, 0, (uint32_t *) d + 28);
2515 
2516 	    d += 128;
2517 	    w -= 128;
2518 	}
2519 
2520 	if (w >= 64)
2521 	{
2522 	    vec_st(vfiller, 0, (uint32_t *) d);
2523 	    vec_st(vfiller, 0, (uint32_t *) d + 4);
2524 	    vec_st(vfiller, 0, (uint32_t *) d + 8);
2525 	    vec_st(vfiller, 0, (uint32_t *) d + 12);
2526 
2527 	    d += 64;
2528 	    w -= 64;
2529 	}
2530 
2531 	if (w >= 32)
2532 	{
2533 	    vec_st(vfiller, 0, (uint32_t *) d);
2534 	    vec_st(vfiller, 0, (uint32_t *) d + 4);
2535 
2536 	    d += 32;
2537 	    w -= 32;
2538 	}
2539 
2540 	if (w >= 16)
2541 	{
2542 	    vec_st(vfiller, 0, (uint32_t *) d);
2543 
2544 	    d += 16;
2545 	    w -= 16;
2546 	}
2547 
2548 	while (w >= 4)
2549 	{
2550 	    *(uint32_t *)d = filler;
2551 
2552 	    w -= 4;
2553 	    d += 4;
2554 	}
2555 
2556 	if (w >= 2)
2557 	{
2558 	    *(uint16_t *)d = filler;
2559 	    w -= 2;
2560 	    d += 2;
2561 	}
2562 
2563 	if (w >= 1)
2564 	{
2565 	    *(uint8_t *)d = filler;
2566 	    w -= 1;
2567 	    d += 1;
2568 	}
2569     }
2570 
2571     return TRUE;
2572 }
2573 
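/* SRC copy from x888 to 8888: copy the pixels and force the alpha
 * byte to 0xff, four vectors (16 pixels) per iteration.
 */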
2574 static void
2575 vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
2576 			      pixman_composite_info_t *info)
2577 {
2578     PIXMAN_COMPOSITE_ARGS (info);
2579     uint32_t    *dst_line, *dst;
2580     uint32_t    *src_line, *src;
2581     int32_t w;
2582     int dst_stride, src_stride;
2583 
2584     PIXMAN_IMAGE_GET_LINE (
2585 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2586     PIXMAN_IMAGE_GET_LINE (
2587 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2588 
2589     while (height--)
2590     {
2591 	dst = dst_line;
2592 	dst_line += dst_stride;
2593 	src = src_line;
2594 	src_line += src_stride;
2595 	w = width;
2596 
2597 	while (w && (uintptr_t)dst & 15)
2598 	{
2599 	    *dst++ = *src++ | 0xff000000;
2600 	    w--;
2601 	}
2602 
2603 	while (w >= 16)
2604 	{
2605 	    vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;
2606 
2607 	    vmx_src1 = load_128_unaligned (src);
2608 	    vmx_src2 = load_128_unaligned (src + 4);
2609 	    vmx_src3 = load_128_unaligned (src + 8);
2610 	    vmx_src4 = load_128_unaligned (src + 12);
2611 
2612 	    save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
2613 	    save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
2614 	    save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
2615 	    save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
2616 
2617 	    dst += 16;
2618 	    src += 16;
2619 	    w -= 16;
2620 	}
2621 
2622 	while (w)
2623 	{
2624 	    *dst++ = *src++ | 0xff000000;
2625 	    w--;
2626 	}
2627     }
2628 }
2629 
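/* Solid source OVER an 8888 destination:
 *
 *     dest = src + dest * (1 - ALPHA (src))
 */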
2630 static void
2631 vmx_composite_over_n_8888 (pixman_implementation_t *imp,
2632                            pixman_composite_info_t *info)
2633 {
2634     PIXMAN_COMPOSITE_ARGS (info);
2635     uint32_t *dst_line, *dst;
2636     uint32_t src, ia;
2637     int      i, w, dst_stride;
2638     vector unsigned int vdst, vsrc, via;
2639 
2640     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2641 
2642     if (src == 0)
2643 	return;
2644 
2645     PIXMAN_IMAGE_GET_LINE (
2646 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2647 
2648     vsrc = (vector unsigned int){src, src, src, src};
2649     via = negate (splat_alpha (vsrc));
2650     ia = ALPHA_8 (~src);
2651 
2652     while (height--)
2653     {
2654 	dst = dst_line;
2655 	dst_line += dst_stride;
2656 	w = width;
2657 
2658 	while (w && ((uintptr_t)dst & 15))
2659 	{
2660 	    uint32_t d = *dst;
2661 	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
2662 	    *dst++ = d;
2663 	    w--;
2664 	}
2665 
2666 	for (i = w / 4; i > 0; i--)
2667 	{
2668 	    vdst = pix_multiply (load_128_aligned (dst), via);
2669 	    save_128_aligned (dst, pix_add (vsrc, vdst));
2670 	    dst += 4;
2671 	}
2672 
2673 	for (i = w % 4; --i >= 0;)
2674 	{
2675 	    uint32_t d = dst[i];
2676 	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
2677 	    dst[i] = d;
2678 	}
2679     }
2680 }
2681 
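/* 8888 OVER 8888: each scanline is handed to vmx_combine_over_u. */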
2682 static void
2683 vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
2684                                pixman_composite_info_t *info)
2685 {
2686     PIXMAN_COMPOSITE_ARGS (info);
2687     int dst_stride, src_stride;
2688     uint32_t    *dst_line, *dst;
2689     uint32_t    *src_line, *src;
2690 
2691     PIXMAN_IMAGE_GET_LINE (
2692     dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2693     PIXMAN_IMAGE_GET_LINE (
2694     src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2695 
2696     dst = dst_line;
2697     src = src_line;
2698 
2699     while (height--)
2700     {
2701         vmx_combine_over_u (imp, op, dst, src, NULL, width);
2702 
2703         dst += dst_stride;
2704         src += src_stride;
2705     }
2706 }
2707 
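/* Solid source OVER an 8888 destination through an 8888 component-alpha
 * mask:
 *
 *     dest = (src * mask) + dest * (1 - mask * ALPHA (src))
 *
 * Vector groups whose mask is entirely zero are skipped.
 */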
2708 static void
2709 vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2710                                     pixman_composite_info_t *info)
2711 {
2712     PIXMAN_COMPOSITE_ARGS (info);
2713     uint32_t src, ia;
2714     uint32_t    *dst_line, d;
2715     uint32_t    *mask_line, m;
2716     uint32_t pack_cmp;
2717     int dst_stride, mask_stride;
2718 
2719     vector unsigned int vsrc, valpha, vmask, vdest;
2720 
2721     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2722 
2723     if (src == 0)
2724 	return;
2725 
2726     PIXMAN_IMAGE_GET_LINE (
2727 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2728     PIXMAN_IMAGE_GET_LINE (
2729 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2730 
2731     vsrc = (vector unsigned int) {src, src, src, src};
2732     valpha = splat_alpha(vsrc);
2733     ia = ALPHA_8 (src);
2734 
2735     while (height--)
2736     {
2737 	int w = width;
2738 	const uint32_t *pm = (uint32_t *)mask_line;
2739 	uint32_t *pd = (uint32_t *)dst_line;
2740 	uint32_t s;
2741 
2742 	dst_line += dst_stride;
2743 	mask_line += mask_stride;
2744 
2745 	while (w && (uintptr_t)pd & 15)
2746 	{
2747 	    s = src;
2748 	    m = *pm++;
2749 
2750 	    if (m)
2751 	    {
2752 		d = *pd;
2753 		UN8x4_MUL_UN8x4 (s, m);
2754 		UN8x4_MUL_UN8 (m, ia);
2755 		m = ~m;
2756 		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
2757 		*pd = d;
2758 	    }
2759 
2760 	    pd++;
2761 	    w--;
2762 	}
2763 
2764 	while (w >= 4)
2765 	{
2766 	    /* pm is NOT necessarily 16-byte aligned */
2767 	    vmask = load_128_unaligned (pm);
2768 
2769 	    pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0));
2770 
2771 	    /* if all bits in mask are zero, pack_cmp is not 0 */
2772 	    if (pack_cmp == 0)
2773 	    {
2774 		/* pd is 16-byte aligned */
2775 		vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));
2776 
2777 		save_128_aligned(pd, vdest);
2778 	    }
2779 
2780 	    pd += 4;
2781 	    pm += 4;
2782 	    w -= 4;
2783 	}
2784 
2785 	while (w)
2786 	{
2787 	    s = src;
2788 	    m = *pm++;
2789 
2790 	    if (m)
2791 	    {
2792 		d = *pd;
2793 		UN8x4_MUL_UN8x4 (s, m);
2794 		UN8x4_MUL_UN8 (m, ia);
2795 		m = ~m;
2796 		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
2797 		*pd = d;
2798 	    }
2799 
2800 	    pd++;
2801 	    w--;
2802 	}
2803     }
2804 }
2805 
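/* Saturating ADD of two a8 images.  Unaligned head and tail bytes are
 * added with scalar saturation; the aligned middle is handed to
 * vmx_combine_add_u, which treats each group of four bytes as one
 * 32-bit pixel.
 */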
2806 static void
2807 vmx_composite_add_8_8 (pixman_implementation_t *imp,
2808             pixman_composite_info_t *info)
2809 {
2810     PIXMAN_COMPOSITE_ARGS (info);
2811     uint8_t     *dst_line, *dst;
2812     uint8_t     *src_line, *src;
2813     int dst_stride, src_stride;
2814     int32_t w;
2815     uint16_t t;
2816 
2817     PIXMAN_IMAGE_GET_LINE (
2818     src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2819     PIXMAN_IMAGE_GET_LINE (
2820     dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2821 
2822     while (height--)
2823     {
2824 	dst = dst_line;
2825 	src = src_line;
2826 
2827 	dst_line += dst_stride;
2828 	src_line += src_stride;
2829 	w = width;
2830 
2831 	/* Small head */
2832 	while (w && (uintptr_t)dst & 3)
2833 	{
2834 	    t = (*dst) + (*src++);
2835 	    *dst++ = t | (0 - (t >> 8));
2836 	    w--;
2837 	}
2838 
2839 	vmx_combine_add_u (imp, op,
2840 		    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
2841 
2842 	/* Small tail */
2843 	dst += w & 0xfffc;
2844 	src += w & 0xfffc;
2845 
2846 	w &= 3;
2847 
2848 	while (w)
2849 	{
2850 	    t = (*dst) + (*src++);
2851 	    *dst++ = t | (0 - (t >> 8));
2852 	    w--;
2853 	}
2854     }
2855 }
2856 
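/* Saturating ADD of two 8888 images, one scanline at a time via
 * vmx_combine_add_u.
 */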
2857 static void
2858 vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
2859                               pixman_composite_info_t *info)
2860 {
2861     PIXMAN_COMPOSITE_ARGS (info);
2862     uint32_t    *dst_line, *dst;
2863     uint32_t    *src_line, *src;
2864     int dst_stride, src_stride;
2865 
2866     PIXMAN_IMAGE_GET_LINE (
2867 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2868     PIXMAN_IMAGE_GET_LINE (
2869 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2870 
2871     while (height--)
2872     {
2873 	dst = dst_line;
2874 	dst_line += dst_stride;
2875 	src = src_line;
2876 	src_line += src_stride;
2877 
2878 	vmx_combine_add_u (imp, op, dst, src, NULL, width);
2879     }
2880 }
2881 
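/* One scanline of nearest-neighbour scaled OVER: vx advances by unit_x
 * in pixman fixed point and is wrapped by src_width_fixed.  Four source
 * pixels are gathered per iteration; fully opaque groups are stored
 * directly and fully transparent groups are skipped.
 */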
2882 static force_inline void
2883 scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd,
2884                                             const uint32_t* ps,
2885                                             int32_t         w,
2886                                             pixman_fixed_t  vx,
2887                                             pixman_fixed_t  unit_x,
2888                                             pixman_fixed_t  src_width_fixed,
2889                                             pixman_bool_t   fully_transparent_src)
2890 {
2891     uint32_t s, d;
2892     const uint32_t* pm = NULL;
2893 
2894     vector unsigned int vsrc, vdst;
2895 
2896     if (fully_transparent_src)
2897 	return;
2898 
2899     /* Align dst on a 16-byte boundary */
2900     while (w && ((uintptr_t)pd & 15))
2901     {
2902 	d = *pd;
2903 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
2904 	vx += unit_x;
2905 	while (vx >= 0)
2906 	    vx -= src_width_fixed;
2907 
2908 	*pd++ = core_combine_over_u_pixel_vmx (s, d);
2909 	if (pm)
2910 	    pm++;
2911 	w--;
2912     }
2913 
2914     while (w >= 4)
2915     {
2916 	vector unsigned int tmp;
2917 	uint32_t tmp1, tmp2, tmp3, tmp4;
2918 
2919 	tmp1 = *(ps + pixman_fixed_to_int (vx));
2920 	vx += unit_x;
2921 	while (vx >= 0)
2922 	    vx -= src_width_fixed;
2923 	tmp2 = *(ps + pixman_fixed_to_int (vx));
2924 	vx += unit_x;
2925 	while (vx >= 0)
2926 	    vx -= src_width_fixed;
2927 	tmp3 = *(ps + pixman_fixed_to_int (vx));
2928 	vx += unit_x;
2929 	while (vx >= 0)
2930 	    vx -= src_width_fixed;
2931 	tmp4 = *(ps + pixman_fixed_to_int (vx));
2932 	vx += unit_x;
2933 	while (vx >= 0)
2934 	    vx -= src_width_fixed;
2935 
2936 	tmp[0] = tmp1;
2937 	tmp[1] = tmp2;
2938 	tmp[2] = tmp3;
2939 	tmp[3] = tmp4;
2940 
2941 	vsrc = combine4 ((const uint32_t *) &tmp, pm);
2942 
2943 	if (is_opaque (vsrc))
2944 	{
2945 	    save_128_aligned (pd, vsrc);
2946 	}
2947 	else if (!is_zero (vsrc))
2948 	{
2949 	    vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd));
2950 
2951 	    save_128_aligned (pd, vdst);
2952 	}
2953 
2954 	w -= 4;
2955 	pd += 4;
2956 	if (pm)
2957 	    pm += 4;
2958     }
2959 
2960     while (w)
2961     {
2962 	d = *pd;
2963 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
2964 	vx += unit_x;
2965 	while (vx >= 0)
2966 	    vx -= src_width_fixed;
2967 
2968 	*pd++ = core_combine_over_u_pixel_vmx (s, d);
2969 	if (pm)
2970 	    pm++;
2971 
2972 	w--;
2973     }
2974 }
2975 
2976 FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
2977 		       scaled_nearest_scanline_vmx_8888_8888_OVER,
2978 		       uint32_t, uint32_t, COVER)
2979 FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
2980 		       scaled_nearest_scanline_vmx_8888_8888_OVER,
2981 		       uint32_t, uint32_t, NONE)
2982 FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
2983 		       scaled_nearest_scanline_vmx_8888_8888_OVER,
2984 		       uint32_t, uint32_t, PAD)
2985 FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
2986 		       scaled_nearest_scanline_vmx_8888_8888_OVER,
2987 		       uint32_t, uint32_t, NORMAL)
2988 
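/* Operator/format combinations handled by the VMX composite functions
 * above; this table is registered in _pixman_implementation_create_vmx.
 */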
2989 static const pixman_fast_path_t vmx_fast_paths[] =
2990 {
2991     PIXMAN_STD_FAST_PATH (OVER, solid,    null, a8r8g8b8, vmx_composite_over_n_8888),
2992     PIXMAN_STD_FAST_PATH (OVER, solid,    null, x8r8g8b8, vmx_composite_over_n_8888),
2993     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
2994     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
2995     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
2996     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
2997     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
2998     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
2999     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
3000     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
3001     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
3002     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
3003     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
3004     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
3005 
3006     /* PIXMAN_OP_ADD */
3007     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
3008     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
3009     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
3010 
3011     /* PIXMAN_OP_SRC */
3012     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
3013     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
3014 
3015     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
3016     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
3017     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
3018     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
3019 
3020     {   PIXMAN_OP_NONE	},
3021 };
3022 
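/* Iterator fetcher: expand x8r8g8b8 to a8r8g8b8 by ORing in an opaque
 * alpha byte.
 */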
3023 static uint32_t *
3024 vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3025 {
3026     int w = iter->width;
3027     vector unsigned int ff000000 = mask_ff000000;
3028     uint32_t *dst = iter->buffer;
3029     uint32_t *src = (uint32_t *)iter->bits;
3030 
3031     iter->bits += iter->stride;
3032 
3033     while (w && ((uintptr_t)dst) & 0x0f)
3034     {
3035 	*dst++ = (*src++) | 0xff000000;
3036 	w--;
3037     }
3038 
3039     while (w >= 4)
3040     {
3041 	save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));
3042 
3043 	dst += 4;
3044 	src += 4;
3045 	w -= 4;
3046     }
3047 
3048     while (w)
3049     {
3050 	*dst++ = (*src++) | 0xff000000;
3051 	w--;
3052     }
3053 
3054     return iter->buffer;
3055 }
3056 
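/* Iterator fetcher: expand a8 to 32-bit pixels with the alpha in the
 * top byte and the color channels zero, 16 pixels per vector iteration.
 */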
3057 static uint32_t *
3058 vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3059 {
3060     int w = iter->width;
3061     uint32_t *dst = iter->buffer;
3062     uint8_t *src = iter->bits;
3063     vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
3064 
3065     iter->bits += iter->stride;
3066 
3067     while (w && (((uintptr_t)dst) & 15))
3068     {
3069         *dst++ = *(src++) << 24;
3070         w--;
3071     }
3072 
3073     while (w >= 16)
3074     {
3075 	vmx0 = load_128_unaligned((uint32_t *) src);
3076 
3077 	unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
3078 	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
3079 	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
3080 
3081 	save_128_aligned(dst, vmx6);
3082 	save_128_aligned((dst +  4), vmx5);
3083 	save_128_aligned((dst +  8), vmx4);
3084 	save_128_aligned((dst + 12), vmx3);
3085 
3086 	dst += 16;
3087 	src += 16;
3088 	w -= 16;
3089     }
3090 
3091     while (w)
3092     {
3093 	*dst++ = *(src++) << 24;
3094 	w--;
3095     }
3096 
3097     return iter->buffer;
3098 }
3099 
3100 #define IMAGE_FLAGS							\
3101     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
3102      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3103 
3104 static const pixman_iter_info_t vmx_iters[] =
3105 {
3106     { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
3107       _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
3108     },
3109     { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
3110       _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
3111     },
3112     { PIXMAN_null },
3113 };
3114 
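/* Create the VMX implementation: initialize the vector constants, hook
 * up the unified and component-alpha combiners, the fill routine, the
 * fast-path table and the iterator table; everything else falls back
 * to `fallback'.
 */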
3115 pixman_implementation_t *
3116 _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
3117 {
3118     pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
3119 
3120     /* VMX constants */
3121     mask_ff000000 = create_mask_32_128 (0xff000000);
3122     mask_red   = create_mask_32_128 (0x00f80000);
3123     mask_green = create_mask_32_128 (0x0000fc00);
3124     mask_blue  = create_mask_32_128 (0x000000f8);
3125     mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
3126     mask_565_fix_g = create_mask_32_128  (0x0000c000);
3127 
3128     /* Set up function pointers */
3129 
3130     imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
3131     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
3132     imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
3133     imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
3134     imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
3135     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
3136     imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
3137     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
3138     imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
3139 
3140     imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;
3141 
3142     imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
3143     imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
3144     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
3145     imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
3146     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
3147     imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
3148     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
3149     imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
3150     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
3151     imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
3152     imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
3153 
3154     imp->fill = vmx_fill;
3155 
3156     imp->iter_info = vmx_iters;
3157 
3158     return imp;
3159 }
3160