1 /*
2  * Copyright © 2004 Red Hat, Inc.
3  * Copyright © 2004 Nicholas Miell
4  * Copyright © 2005 Trolltech AS
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of Red Hat not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission.  Red Hat makes no representations about the
13  * suitability of this software for any purpose.  It is provided "as is"
14  * without express or implied warranty.
15  *
16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23  * SOFTWARE.
24  *
25  * Author:  Søren Sandmann (sandmann@redhat.com)
26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28  *
29  * Based on work by Owen Taylor
30  */
31 
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35 
36 #include <liboil/liboil.h>
37 #include <liboil/liboilfunction.h>
38 
39 #include <mmintrin.h>
40 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
41 
42 typedef uint32_t CARD32;
43 typedef uint16_t CARD16;
44 typedef int16_t INT16;
45 typedef uint8_t CARD8;
46 typedef uint64_t ullong;
47 typedef CARD32* PicturePtr;
48 typedef CARD32* FbBits;
49 typedef int FbStride;
50 
51 
52 #include "fbmmx.h"
53 #include "fbpict.h"
54 
55 #define CHECKPOINT()
56 
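/* Each OIL_DECLARE_CLASS() below pulls in a compositing function class that
 * liboil defines elsewhere; the OIL_DEFINE_IMPL_FULL() lines later in this
 * file register the MMX routines here as candidate implementations of those
 * classes, tagged with the CPU flags (MMX, MMXEXT, SSE) they need so the
 * runtime dispatcher only selects them on capable processors.
 */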
57 OIL_DECLARE_CLASS (composite_in_argb);
58 OIL_DECLARE_CLASS (composite_in_argb_const_src);
59 OIL_DECLARE_CLASS (composite_in_argb_const_mask);
60 OIL_DECLARE_CLASS (composite_over_argb);
61 OIL_DECLARE_CLASS (composite_over_argb_const_src);
62 OIL_DECLARE_CLASS (composite_add_argb);
63 OIL_DECLARE_CLASS (composite_add_argb_const_src);
64 OIL_DECLARE_CLASS (composite_in_over_argb);
65 OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
66 OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
67 OIL_DECLARE_CLASS (composite_over_u8);
68 OIL_DECLARE_CLASS (composite_add_u8);
69 
70 
71 /* --------------- MMX code paths for fbcompose.c --------------------- */
72 
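/* Most of the combine functions below are carried over from the X server's
 * fbcompose.c but are compiled out, presumably because they still rely on
 * the MmxTo/MmxFrom/MmxAlpha/MmxMul helper macros and the FbComposeFunctions
 * dispatch table from that tree.  In this section only mmxCombineOverU
 * (guarded by ENABLE_BROKEN_IMPLS) and mmxCombineAddU are registered with
 * liboil; the rest are kept under "#if 0" for reference.
 */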
73 #if 0
74 static void
75 mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
76 {
77     const __m64 mmx_0 = _mm_setzero_si64();
78     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
79 
80     const uint32_t *end = mask + width;
81     while (mask < end) {
82         __m64 a = MmxTo(*mask);
83         __m64 s = MmxTo(*src);
84         a = MmxAlpha(a);
85         MmxMul(s, a);
86         *dest = MmxFrom(s);
87         ++src;
88         ++dest;
89         ++mask;
90     }
91     _mm_empty();
92 }
93 #endif
94 
95 #ifdef ENABLE_BROKEN_IMPLS
96 static void
97 mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
98 {
99     const __m64 mmx_0 = _mm_setzero_si64();
100     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
101     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
102 
103     const uint32_t *end = dest + width;
104 
105     while (dest < end) {
106         __m64 x, y, a;
107         x = MmxTo(*src);
108         y = MmxTo(*dest);
109         a = MmxAlpha(x);
110         a = MmxNegate(a);
111         MmxMulAdd(y, a, x);
112         *dest = MmxFrom(y);
113         ++dest;
114         ++src;
115     }
116     _mm_empty();
117 }
118 OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX);
119 #endif
120 
121 #if 0
122 static FASTCALL void
123 mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
124 {
125     const __m64 mmx_0 = _mm_setzero_si64();
126     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
127     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
128 
129     const CARD32 *end = dest + width;
130 
131     while (dest < end) {
132         __m64 x, y, a;
133         x = MmxTo(*dest);
134         y = MmxTo(*src);
135         a = MmxAlpha(x);
136         a = MmxNegate(a);
137         MmxMulAdd(y, a, x);
138         *dest = MmxFrom(y);
139         ++dest;
140         ++src;
141     }
142     _mm_empty();
143 }
144 #endif
145 
146 #if 0
147 static void
148 mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
149 {
150     const __m64 mmx_0 = _mm_setzero_si64();
151     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
152 
153     const CARD32 *end = dest + width;
154 
155     while (dest < end) {
156         __m64 x, a;
157         x = MmxTo(*src);
158         a = MmxTo(*dest);
159         a = MmxAlpha(a);
160         MmxMul(x, a);
161         *dest = MmxFrom(x);
162         ++dest;
163         ++src;
164     }
165     _mm_empty();
166 }
167 #endif
168 
169 #if 0
170 static FASTCALL void
171 mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
172 {
173     const __m64 mmx_0 = _mm_setzero_si64();
174     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
175 
176     const CARD32 *end = dest + width;
177 
178     while (dest < end) {
179         __m64 x, a;
180         x = MmxTo(*dest);
181         a = MmxTo(*src);
182         a = MmxAlpha(a);
183         MmxMul(x, a);
184         *dest = MmxFrom(x);
185         ++dest;
186         ++src;
187     }
188     _mm_empty();
189 }
190 #endif
191 
192 #if 0
193 static FASTCALL void
194 mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
195 {
196     const __m64 mmx_0 = _mm_setzero_si64();
197     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
198     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
199 
200     const CARD32 *end = dest + width;
201 
202     while (dest < end) {
203         __m64 x, a;
204         x = MmxTo(*src);
205         a = MmxTo(*dest);
206         a = MmxAlpha(a);
207         a = MmxNegate(a);
208         MmxMul(x, a);
209         *dest = MmxFrom(x);
210         ++dest;
211         ++src;
212     }
213     _mm_empty();
214 }
215 #endif
216 
217 #if 0
218 static FASTCALL void
219 mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
220 {
221     const __m64 mmx_0 = _mm_setzero_si64();
222     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
223     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
224 
225     const CARD32 *end = dest + width;
226 
227     while (dest < end) {
228         __m64 x, a;
229         x = MmxTo(*dest);
230         a = MmxTo(*src);
231         a = MmxAlpha(a);
232         a = MmxNegate(a);
233         MmxMul(x, a);
234         *dest = MmxFrom(x);
235         ++dest;
236         ++src;
237     }
238     _mm_empty();
239 }
240 
241 static FASTCALL void
242 mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
243 {
244     const __m64 mmx_0 = _mm_setzero_si64();
245     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
246     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
247 
248     const CARD32 *end = dest + width;
249 
250     while (dest < end) {
251         __m64 s, da, d, sia;
252         s = MmxTo(*src);
253         d = MmxTo(*dest);
254         sia = MmxAlpha(s);
255         sia = MmxNegate(sia);
256         da = MmxAlpha(d);
257         MmxAddMul(s, da, d, sia);
258         *dest = MmxFrom(s);
259         ++dest;
260         ++src;
261     }
262     _mm_empty();
263 }
264 
265 static FASTCALL void
266 mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
267 {
268     const __m64 mmx_0 = _mm_setzero_si64();
269     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
270     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
271 
272     const CARD32 *end;
273 
274     end = dest + width;
275 
276     while (dest < end) {
277         __m64 s, dia, d, sa;
278         s = MmxTo(*src);
279         d = MmxTo(*dest);
280         sa = MmxAlpha(s);
281         dia = MmxAlpha(d);
282         dia = MmxNegate(dia);
283         MmxAddMul(s, dia, d, sa);
284         *dest = MmxFrom(s);
285         ++dest;
286         ++src;
287     }
288     _mm_empty();
289 }
290 
291 static FASTCALL void
292 mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
293 {
294     const __m64 mmx_0 = _mm_setzero_si64();
295     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
296     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
297 
298     const CARD32 *end = dest + width;
299 
300     while (dest < end) {
301         __m64 s, dia, d, sia;
302         s = MmxTo(*src);
303         d = MmxTo(*dest);
304         sia = MmxAlpha(s);
305         dia = MmxAlpha(d);
306         sia = MmxNegate(sia);
307         dia = MmxNegate(dia);
308         MmxAddMul(s, dia, d, sia);
309         *dest = MmxFrom(s);
310         ++dest;
311         ++src;
312     }
313     _mm_empty();
314 }
315 #endif
316 
317 static void
318 mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
319 {
320     const __m64 mmx_0 = _mm_setzero_si64();
321 
322     const uint32_t *end = dest + width;
323     while (dest < end) {
324         __m64 s, d;
325         s = MmxTo(*src);
326         d = MmxTo(*dest);
327         s = MmxAdd(s, d);
328         *dest = MmxFrom(s);
329         ++dest;
330         ++src;
331     }
332     _mm_empty();
333 }
334 OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
335 
336 #if 0
337 static FASTCALL void
338 mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
339 {
340     const __m64 mmx_0 = _mm_setzero_si64();
341     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
342 
343     const CARD32 *end = dest + width;
344     while (dest < end) {
345         CARD32 s = *src;
346         CARD32 d = *dest;
347         __m64 ms = MmxTo(s);
348         __m64 md = MmxTo(d);
349         CARD32 sa = s >> 24;
350         CARD32 da = ~d >> 24;
351 
352         if (sa > da) {
353             __m64 msa = MmxTo(FbIntDiv(da, sa));
354             msa = MmxAlpha(msa);
355             MmxMul(ms, msa);
356         }
357         MmxAdd(md, ms);
358         *dest = MmxFrom(md);
359         ++src;
360         ++dest;
361     }
362     _mm_empty();
363 }
364 
365 
366 static FASTCALL void
367 mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
368 {
369     const __m64 mmx_0 = _mm_setzero_si64();
370     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
371 
372     const CARD32 *end = src + width;
373     while (src < end) {
374         __m64 a = MmxTo(*mask);
375         __m64 s = MmxTo(*src);
376         MmxMul(s, a);
377         *dest = MmxFrom(s);
378         ++src;
379         ++mask;
380         ++dest;
381     }
382     _mm_empty();
383 }
384 
385 static FASTCALL void
386 mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
387 {
388     const __m64 mmx_0 = _mm_setzero_si64();
389     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
390     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
391 
392     const CARD32 *end = src + width;
393     while (src < end) {
394         __m64 a = MmxTo(*mask);
395         __m64 s = MmxTo(*src);
396         __m64 d = MmxTo(*dest);
397         __m64 sa = MmxAlpha(s);
398         MmxMul(s, a);
399         MmxMul(a, sa);
400         a = MmxNegate(a);
401         MmxMulAdd(d, a, s);
402         *dest = MmxFrom(d);
403         ++src;
404         ++dest;
405         ++mask;
406     }
407     _mm_empty();
408 }
409 
410 static FASTCALL void
411 mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
412 {
413     const __m64 mmx_0 = _mm_setzero_si64();
414     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
415     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
416 
417     const CARD32 *end = src + width;
418     while (src < end) {
419         __m64 a = MmxTo(*mask);
420         __m64 s = MmxTo(*src);
421         __m64 d = MmxTo(*dest);
422         __m64 da = MmxAlpha(d);
423         da = MmxNegate(da);
424         MmxMul(s, a);
425         MmxMulAdd(s, da, d);
426         *dest = MmxFrom(s);
427         ++src;
428         ++dest;
429         ++mask;
430     }
431     _mm_empty();
432 }
433 
434 
435 static FASTCALL void
436 mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
437 {
438     const __m64 mmx_0 = _mm_setzero_si64();
439     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
440 
441     const CARD32 *end = src + width;
442     while (src < end) {
443         __m64 a = MmxTo(*mask);
444         __m64 s = MmxTo(*src);
445         __m64 d = MmxTo(*dest);
446         __m64 da = MmxAlpha(d);
447         MmxMul(s, a);
448         MmxMul(s, da);
449         *dest = MmxFrom(s);
450         ++src;
451         ++dest;
452         ++mask;
453     }
454     _mm_empty();
455 }
456 
457 static FASTCALL void
458 mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
459 {
460     const __m64 mmx_0 = _mm_setzero_si64();
461     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
462 
463     const CARD32 *end = src + width;
464     while (src < end) {
465         __m64 a = MmxTo(*mask);
466         __m64 s = MmxTo(*src);
467         __m64 d = MmxTo(*dest);
468         __m64 sa = MmxAlpha(s);
469         MmxMul(a, sa);
470         MmxMul(d, a);
471         *dest = MmxFrom(d);
472         ++src;
473         ++dest;
474         ++mask;
475     }
476     _mm_empty();
477 }
478 
479 static FASTCALL void
480 mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
481 {
482     const __m64 mmx_0 = _mm_setzero_si64();
483     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
484     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
485 
486     const CARD32 *end = src + width;
487     while (src < end) {
488         __m64 a = MmxTo(*mask);
489         __m64 s = MmxTo(*src);
490         __m64 d = MmxTo(*dest);
491         __m64 da = MmxAlpha(d);
492         da = MmxNegate(da);
493         MmxMul(s, a);
494         MmxMul(s, da);
495         *dest = MmxFrom(s);
496         ++src;
497         ++dest;
498         ++mask;
499     }
500     _mm_empty();
501 }
502 
503 static FASTCALL void
504 mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
505 {
506     const __m64 mmx_0 = _mm_setzero_si64();
507     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
508     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
509 
510     const CARD32 *end = src + width;
511     while (src < end) {
512         __m64 a = MmxTo(*mask);
513         __m64 s = MmxTo(*src);
514         __m64 d = MmxTo(*dest);
515         __m64 sa = MmxAlpha(s);
516         MmxMul(a, sa);
517         a = MmxNegate(a);
518         MmxMul(d, a);
519         *dest = MmxFrom(d);
520         ++src;
521         ++dest;
522         ++mask;
523     }
524     _mm_empty();
525 }
526 
527 static FASTCALL void
528 mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
529 {
530     const __m64 mmx_0 = _mm_setzero_si64();
531     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
532     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
533 
534     const CARD32 *end = src + width;
535     while (src < end) {
536         __m64 a = MmxTo(*mask);
537         __m64 s = MmxTo(*src);
538         __m64 d = MmxTo(*dest);
539         __m64 da = MmxAlpha(d);
540         __m64 sa = MmxAlpha(s);
541         MmxMul(s, a);
542         MmxMul(a, sa);
543         a = MmxNegate(a);
544         MmxAddMul(d, a, s, da);
545         *dest = MmxFrom(d);
546         ++src;
547         ++dest;
548         ++mask;
549     }
550     _mm_empty();
551 }
552 
553 static FASTCALL void
554 mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
555 {
556     const __m64 mmx_0 = _mm_setzero_si64();
557     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
558     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
559 
560     const CARD32 *end = src + width;
561     while (src < end) {
562         __m64 a = MmxTo(*mask);
563         __m64 s = MmxTo(*src);
564         __m64 d = MmxTo(*dest);
565         __m64 da = MmxAlpha(d);
566         __m64 sa = MmxAlpha(s);
567         MmxMul(s, a);
568         MmxMul(a, sa);
569         da = MmxNegate(da);
570         MmxAddMul(d, a, s, da);
571         *dest = MmxFrom(d);
572         ++src;
573         ++dest;
574         ++mask;
575     }
576     _mm_empty();
577 }
578 
579 static FASTCALL void
580 mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
581 {
582     const __m64 mmx_0 = _mm_setzero_si64();
583     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
584     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
585 
586     const CARD32 *end = src + width;
587     while (src < end) {
588         __m64 a = MmxTo(*mask);
589         __m64 s = MmxTo(*src);
590         __m64 d = MmxTo(*dest);
591         __m64 da = MmxAlpha(d);
592         __m64 sa = MmxAlpha(s);
593         MmxMul(s, a);
594         MmxMul(a, sa);
595         da = MmxNegate(da);
596         a = MmxNegate(a);
597         MmxAddMul(d, a, s, da);
598         *dest = MmxFrom(d);
599         ++src;
600         ++dest;
601         ++mask;
602     }
603     _mm_empty();
604 }
605 
606 static FASTCALL void
607 mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
608 {
609     const __m64 mmx_0 = _mm_setzero_si64();
610     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
611 
612     const CARD32 *end = src + width;
613     while (src < end) {
614         __m64 a = MmxTo(*mask);
615         __m64 s = MmxTo(*src);
616         __m64 d = MmxTo(*dest);
617         MmxMul(s, a);
618         d = MmxAdd(s, d);
619         *dest = MmxFrom(d);
620         ++src;
621         ++dest;
622         ++mask;
623     }
624     _mm_empty();
625 }
626 
627 extern FbComposeFunctions composeFunctions;
628 
629 void fbComposeSetupMMX(void)
630 {
631     /* check if we have MMX support and initialize accordingly */
632     if (fbHaveMMX()) {
633         composeFunctions.combineU[PictOpOver] = mmxCombineOverU;
634         composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
635         composeFunctions.combineU[PictOpIn] = mmxCombineInU;
636         composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
637         composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
638         composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
639         composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
640         composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
641         composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
642         composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
643         composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;
644 
645         composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC;
646         composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
647         composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
648         composeFunctions.combineC[PictOpIn] = mmxCombineInC;
649         composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
650         composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
651         composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
652         composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
653         composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
654         composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
655         composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;
656 
657         composeFunctions.combineMaskU = mmxCombineMaskU;
658     }
659 }
660 #endif
661 
662 
663 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
664 
665 typedef union {
666   __m64 m64;
667   uint64_t ull;
668 } m64_ull;
669 
670 typedef struct
671 {
672     m64_ull mmx_4x00ff;
673     m64_ull mmx_4x0080;
674     m64_ull mmx_565_rgb;
675     m64_ull mmx_565_unpack_multiplier;
676     m64_ull mmx_565_r;
677     m64_ull mmx_565_g;
678     m64_ull mmx_565_b;
679     m64_ull mmx_mask_0;
680     m64_ull mmx_mask_1;
681     m64_ull mmx_mask_2;
682     m64_ull mmx_mask_3;
683     m64_ull mmx_full_alpha;
684     m64_ull mmx_ffff0000ffff0000;
685     m64_ull mmx_0000ffff00000000;
686     m64_ull mmx_000000000000ffff;
687 } MMXData;
688 
689 static const MMXData c =
690 {
691     .mmx_4x00ff.ull =			0x00ff00ff00ff00ffULL,
692     .mmx_4x0080.ull =			0x0080008000800080ULL,
693     .mmx_565_rgb.ull =			0x000001f0003f001fULL,
694     .mmx_565_r.ull =			0x000000f800000000ULL,
695     .mmx_565_g.ull =			0x0000000000fc0000ULL,
696     .mmx_565_b.ull =			0x00000000000000f8ULL,
697     .mmx_mask_0.ull =			0xffffffffffff0000ULL,
698     .mmx_mask_1.ull =			0xffffffff0000ffffULL,
699     .mmx_mask_2.ull =			0xffff0000ffffffffULL,
700     .mmx_mask_3.ull =			0x0000ffffffffffffULL,
701     .mmx_full_alpha.ull =			0x00ff000000000000ULL,
702     .mmx_565_unpack_multiplier.ull =	0x0000008404100840ULL,
703     .mmx_ffff0000ffff0000.ull =		0xffff0000ffff0000ULL,
704     .mmx_0000ffff00000000.ull =		0x0000ffff00000000ULL,
705     .mmx_000000000000ffff.ull =		0x000000000000ffffULL,
706 };
707 
708 #define MC(x) ((__m64) c.mmx_##x.m64)
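/* The constants are kept in a static table and read through MC() instead of
 * being written as literal "(__m64) 0x...ULL" casts at each use; the m64_ull
 * union gives every entry both a uint64_t view for the initializers and an
 * __m64 view for the intrinsics, avoiding compiler-specific conversions
 * between the two types.
 */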
709 
710 static __inline__ __m64
711 shift (__m64 v, int s)
712 {
713     if (s > 0)
714 	return _mm_slli_si64 (v, s);
715     else if (s < 0)
716 	return _mm_srli_si64 (v, -s);
717     else
718 	return v;
719 }
720 
721 static __inline__ __m64
722 negate (__m64 mask)
723 {
724     return _mm_xor_si64 (mask, MC(4x00ff));
725 }
726 
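/* pix_multiply() below multiplies four 8-bit channels held as 16-bit words
 * and divides by 255 with rounding.  A scalar sketch of the same computation
 * for one channel, assuming a and b are in [0, 255]:
 *
 *     unsigned t = a * b + 0x80;
 *     unsigned r = (t + (t >> 8)) >> 8;    (equals (a * b) / 255, rounded)
 *
 * The MMX version performs these steps on all four words at once;
 * _mm_mullo_pi16 keeps only the low 16 bits of each product, which is
 * enough because 255 * 255 fits in 16 bits.
 */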
727 static __inline__ __m64
728 pix_multiply (__m64 a, __m64 b)
729 {
730     __m64 res;
731 
732     res = _mm_mullo_pi16 (a, b);
733     res = _mm_adds_pu16 (res, MC(4x0080));
734     res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
735     res = _mm_srli_pi16 (res, 8);
736 
737     return res;
738 }
739 
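/* expand_alpha(), expand_alpha_rev() and invert_colors() are word shuffles
 * on an unpacked 00AA00RR00GG00BB pixel: the first broadcasts the alpha
 * word into all four words, the second broadcasts the low word (used when
 * an 8-bit mask value sits in word 0), and the third swaps the R and B
 * words.  Note that _mm_shuffle_pi16 (pshufw) is not a baseline MMX
 * instruction -- it arrived with the extended MMX/SSE instruction set,
 * which is why code built on these helpers is registered with the MMXEXT
 * flag.
 */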
740 static __inline__ __m64
741 expand_alpha (__m64 pixel)
742 {
743     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
744 }
745 
746 static __inline__ __m64
747 expand_alpha_rev (__m64 pixel)
748 {
749     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
750 }
751 
752 static __inline__ __m64
753 invert_colors (__m64 pixel)
754 {
755     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
756 }
757 
758 /* Notes about writing mmx code
759  *
760  * Give memory operands as the second operand. If you pass a memory
761  * operand as the first, gcc will first load it into a register and
762  * then use that register,
763  *
764  *   i.e. use
765  *
766  *         _mm_mullo_pi16 (x, mmx_constant);
767  *
768  *   not
769  *
770  *         _mm_mullo_pi16 (mmx_constant, x);
771  *
772  * Also try to minimize dependencies: when you need a value, try
773  * to calculate it from a value that was computed as early as
774  * possible.
775  */
776 
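/* over() is the Porter-Duff OVER operator on premultiplied pixels,
 * dest' = src + dest * (1 - alpha(src)); the caller passes the source both
 * as src and, with its alpha already broadcast by expand_alpha(), as srca.
 */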
777 static __inline__ __m64
778 over (__m64 src, __m64 srca, __m64 dest)
779 {
780     return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
781 }
782 
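/* over_rev_non_pre() handles sources whose colour channels are reversed and
 * not premultiplied (the GdkPixbuf-style "8888RevNP" format mentioned
 * further down): it swaps R and B, multiplies the colours by the source
 * alpha while forcing the alpha word itself to 0xff via mmx_full_alpha, and
 * then applies the ordinary OVER operator.
 */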
783 static __inline__ __m64
784 over_rev_non_pre (__m64 src, __m64 dest)
785 {
786     __m64 srca = expand_alpha (src);
787     __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
788 
789     return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
790 }
791 
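/* in() is the Porter-Duff IN operator, a plain per-channel multiply, and
 * in_over() fuses the common "source IN mask, composited OVER dest" case:
 * the mask scales both the source colour and the source alpha before the
 * OVER step.
 */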
792 static __inline__ __m64
793 in (__m64 src,
794     __m64 mask)
795 {
796     return pix_multiply (src, mask);
797 }
798 
799 static __inline__ __m64
800 in_over (__m64 src,
801 	 __m64 srca,
802 	 __m64 mask,
803 	 __m64 dest)
804 {
805     return over(in(src, mask), pix_multiply(srca, mask), dest);
806 }
807 
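/* load8888() and store8888() convert between a packed 32-bit ARGB pixel and
 * the expanded 00AA00RR00GG00BB form used by the helpers above; pack8888()
 * repacks two expanded pixels into one __m64 using unsigned saturation, so
 * channel values above 0xff clamp instead of wrapping.
 */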
808 static __inline__ __m64
809 load8888 (CARD32 v)
810 {
811     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
812 }
813 
814 static __inline__ __m64
815 pack8888 (__m64 lo, __m64 hi)
816 {
817     __m64 r;
818     r = _mm_packs_pu16 (lo, hi);
819     return r;
820 }
821 
822 static __inline__ CARD32
823 store8888 (__m64 v)
824 {
825     return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
826 }
827 
828 /* Expand 16 bits positioned at @pos (0-3) of an MMX register into
829  *
830  *    00RR00GG00BB
831  *
832  * --- Expanding 565 in the low word ---
833  *
834  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
835  * m = m & (01f0003f001f);
836  * m = m * (008404100840);
837  * m = m >> 8;
838  *
839  * Note the trick here - the top word is shifted by another nibble to
840  * avoid it bumping into the middle word
841  */
842 static __inline__ __m64
843 expand565 (__m64 pixel, int pos)
844 {
845     __m64 p = pixel;
846     __m64 t1, t2;
847 
848     /* move pixel to low 16 bit and zero the rest */
849     p = shift (shift (p, (3 - pos) * 16), -48);
850 
851     t1 = shift (p, 36 - 11);
852     t2 = shift (p, 16 - 5);
853 
854     p = _mm_or_si64 (t1, p);
855     p = _mm_or_si64 (t2, p);
856     p = _mm_and_si64 (p, MC(565_rgb));
857 
858     pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
859     return _mm_srli_pi16 (pixel, 8);
860 }
861 
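/* expand8888() is the two-pixel counterpart of load8888(): an __m64 loaded
 * from the destination holds two packed ARGB pixels, and pos selects whether
 * the low (0) or the high (1) one is unpacked.
 */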
862 static __inline__ __m64
863 expand8888 (__m64 in, int pos)
864 {
865     if (pos == 0)
866 	return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
867     else
868 	return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
869 }
870 
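/* pack565() is the inverse of expand565(): it rebuilds a 16-bit 565 value
 * from the top bits of each expanded channel and merges it into word pos of
 * target, first clearing that word with the mmx_mask_* constants so the
 * other three pixels in the register are preserved.
 */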
871 static __inline__ __m64
872 pack565 (__m64 pixel, __m64 target, int pos)
873 {
874     __m64 p = pixel;
875     __m64 t = target;
876     __m64 r, g, b;
877 
878     r = _mm_and_si64 (p, MC(565_r));
879     g = _mm_and_si64 (p, MC(565_g));
880     b = _mm_and_si64 (p, MC(565_b));
881 
882     r = shift (r, - (32 - 8) + pos * 16);
883     g = shift (g, - (16 - 3) + pos * 16);
884     b = shift (b, - (0  + 3) + pos * 16);
885 
886     if (pos == 0)
887 	t = _mm_and_si64 (t, MC(mask_0));
888     else if (pos == 1)
889 	t = _mm_and_si64 (t, MC(mask_1));
890     else if (pos == 2)
891 	t = _mm_and_si64 (t, MC(mask_2));
892     else if (pos == 3)
893 	t = _mm_and_si64 (t, MC(mask_3));
894 
895     p = _mm_or_si64 (r, t);
896     p = _mm_or_si64 (g, p);
897 
898     return _mm_or_si64 (b, p);
899 }
900 
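/* The composite routines from here on share one loop structure: a scalar
 * head runs until dst is 8-byte aligned, an MMX body then handles two 32-bit
 * (or four 16-bit) pixels per iteration, and a scalar tail finishes the row,
 * with _mm_empty() at the end to restore the FPU state.  The first routine
 * stays behind ENABLE_BROKEN_IMPLS because of the defect noted below
 * (Debian bug #340932).
 */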
901 #ifdef ENABLE_BROKEN_IMPLS
902 /* broken.  See Debian bug #340932 */
903 static void
904 fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
905 {
906     __m64	vsrc, vsrca;
907 
908     vsrc = load8888 (*src);
909     vsrca = expand_alpha (vsrc);
910 
911     while (w && (unsigned long)dst & 7)
912     {
913         *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
914 
915         w--;
916         dst++;
917     }
918 
919     while (w >= 2)
920     {
921         __m64 vdest;
922         __m64 dest0, dest1;
923 
924         vdest = *(__m64 *)dst;
925 
926         dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
927         dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
928 
929         *(__m64 *)dst = pack8888(dest0, dest1);
930 
931         dst += 2;
932         w -= 2;
933     }
934 
935     while (w)
936     {
937         *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
938 
939         w--;
940         dst++;
941     }
942 
943     _mm_empty();
944 }
945 OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
946     OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT);
947 #endif
948 
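/* The functions in the following "#if 0" block still use the X server entry
 * points and types (PicturePtr, FbStride, fbComposeGetStart,
 * fbComposeGetSolid), or appear to be only partially converted to liboil's
 * flat array-plus-width convention, so they are kept for reference and are
 * not compiled.
 */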
949 #if 0
950 void
951 fbCompositeSolid_nx0565mmx (CARD8	op,
952 			    PicturePtr pSrc,
953 			    PicturePtr pMask,
954 			    PicturePtr pDst,
955 			    INT16	xSrc,
956 			    INT16	ySrc,
957 			    INT16	xMask,
958 			    INT16	yMask,
959 			    INT16	xDst,
960 			    INT16	yDst,
961 			    CARD16	width,
962 			    CARD16	height)
963 {
964     CARD32	src;
965     CARD16	*dstLine, *dst;
966     CARD16	w;
967     FbStride	dstStride;
968     __m64	vsrc, vsrca;
969 
970     CHECKPOINT();
971 
972     fbComposeGetSolid(pSrc, src, pDst->format);
973 
974     if (src >> 24 == 0)
975 	return;
976 
977     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
978 
979     vsrc = load8888 (src);
980     vsrca = expand_alpha (vsrc);
981 
982     while (height--)
983     {
984 	dst = dstLine;
985 	dstLine += dstStride;
986 	w = width;
987 
988 	CHECKPOINT();
989 
990 	while (w && (unsigned long)dst & 7)
991 	{
992 	    ullong d = *dst;
993 	    __m64 vdest = expand565 ((__m64)d, 0);
994 	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
995 	    *dst = (ullong)vdest;
996 
997 	    w--;
998 	    dst++;
999 	}
1000 
1001 	while (w >= 4)
1002 	{
1003 	    __m64 vdest;
1004 
1005 	    vdest = *(__m64 *)dst;
1006 
1007 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
1008 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
1009 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
1010 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
1011 
1012 	    *(__m64 *)dst = vdest;
1013 
1014 	    dst += 4;
1015 	    w -= 4;
1016 	}
1017 
1018 	CHECKPOINT();
1019 
1020 	while (w)
1021 	{
1022 	    ullong d = *dst;
1023 	    __m64 vdest = expand565 ((__m64)d, 0);
1024 	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
1025 	    *dst = (ullong)vdest;
1026 
1027 	    w--;
1028 	    dst++;
1029 	}
1030     }
1031 
1032     _mm_empty();
1033 }
1034 #endif
1035 
1036 #if 0
1037 static void
1038 fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
1039 {
1040     CARD32	src, srca;
1041     CARD32	*dstLine;
1042     CARD32	*maskLine;
1043     FbStride	dstStride, maskStride;
1044     __m64	vsrc, vsrca;
1045 
1046 
1047     while (twidth && (unsigned long)q & 7)
1048     {
1049         CARD32 m = *(CARD32 *)p;
1050 
1051         if (m)
1052         {
1053             __m64 vdest = load8888(*q);
1054             vdest = in_over(vsrc, vsrca, load8888(m), vdest);
1055             *q = (ullong)pack8888(vdest, _mm_setzero_si64());
1056         }
1057 
1058         twidth--;
1059         p++;
1060         q++;
1061     }
1062 
1063     while (twidth >= 2)
1064     {
1065         CARD32 m0, m1;
1066         m0 = *p;
1067         m1 = *(p + 1);
1068 
1069         if (m0 | m1)
1070         {
1071             __m64 dest0, dest1;
1072             __m64 vdest = *(__m64 *)q;
1073 
1074             dest0 = in_over(vsrc, vsrca, load8888(m0),
1075                             expand8888 (vdest, 0));
1076             dest1 = in_over(vsrc, vsrca, load8888(m1),
1077                             expand8888 (vdest, 1));
1078 
1079             *(__m64 *)q = pack8888(dest0, dest1);
1080         }
1081 
1082         p += 2;
1083         q += 2;
1084         twidth -= 2;
1085     }
1086 
1087     while (twidth)
1088     {
1089         CARD32 m = *(CARD32 *)p;
1090 
1091         if (m)
1092         {
1093             __m64 vdest = load8888(*q);
1094             vdest = in_over(vsrc, vsrca, load8888(m), vdest);
1095             *q = (ullong)pack8888(vdest, _mm_setzero_si64());
1096         }
1097 
1098         twidth--;
1099         p++;
1100         q++;
1101     }
1102 
1103     _mm_empty();
1104 }
1105 #endif
1106 
1107 #if 0
1108 static void
1109 fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
1110     int width)
1111 {
1112 
1113     mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
1114     vmask = load8888 (mask);
1115     srca = MC(4x00ff);
1116 
1117     while (height--)
1118     {
1119 	dst = dstLine;
1120 	dstLine += dstStride;
1121 	src = srcLine;
1122 	srcLine += srcStride;
1123 	w = width;
1124 
1125 	while (w && (unsigned long)dst & 7)
1126 	{
1127 	    __m64 s = load8888 (*src);
1128 	    __m64 d = load8888 (*dst);
1129 
1130 	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
1131 
1132 	    w--;
1133 	    dst++;
1134 	    src++;
1135 	}
1136 
1137 	while (w >= 16)
1138 	{
1139 	    __m64 vd0 = *(__m64 *)(dst + 0);
1140 	    __m64 vd1 = *(__m64 *)(dst + 2);
1141 	    __m64 vd2 = *(__m64 *)(dst + 4);
1142 	    __m64 vd3 = *(__m64 *)(dst + 6);
1143 	    __m64 vd4 = *(__m64 *)(dst + 8);
1144 	    __m64 vd5 = *(__m64 *)(dst + 10);
1145 	    __m64 vd6 = *(__m64 *)(dst + 12);
1146 	    __m64 vd7 = *(__m64 *)(dst + 14);
1147 
1148 	    __m64 vs0 = *(__m64 *)(src + 0);
1149 	    __m64 vs1 = *(__m64 *)(src + 2);
1150 	    __m64 vs2 = *(__m64 *)(src + 4);
1151 	    __m64 vs3 = *(__m64 *)(src + 6);
1152 	    __m64 vs4 = *(__m64 *)(src + 8);
1153 	    __m64 vs5 = *(__m64 *)(src + 10);
1154 	    __m64 vs6 = *(__m64 *)(src + 12);
1155 	    __m64 vs7 = *(__m64 *)(src + 14);
1156 
1157 	    vd0 = (__m64)pack8888 (
1158 		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1159 		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1160 
1161 	    vd1 = (__m64)pack8888 (
1162 		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1163 		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1164 
1165 	    vd2 = (__m64)pack8888 (
1166 		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1167 		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1168 
1169 	    vd3 = (__m64)pack8888 (
1170 		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1171 		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1172 
1173 	    vd4 = (__m64)pack8888 (
1174 		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1175 		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1176 
1177 	    vd5 = (__m64)pack8888 (
1178 		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1179 		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1180 
1181 	    vd6 = (__m64)pack8888 (
1182 		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1183 		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1184 
1185 	    vd7 = (__m64)pack8888 (
1186 		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1187 		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1188 
1189     	    *(__m64 *)(dst + 0) = vd0;
1190 	    *(__m64 *)(dst + 2) = vd1;
1191 	    *(__m64 *)(dst + 4) = vd2;
1192 	    *(__m64 *)(dst + 6) = vd3;
1193 	    *(__m64 *)(dst + 8) = vd4;
1194 	    *(__m64 *)(dst + 10) = vd5;
1195 	    *(__m64 *)(dst + 12) = vd6;
1196 	    *(__m64 *)(dst + 14) = vd7;
1197 
1198 	    w -= 16;
1199 	    dst += 16;
1200 	    src += 16;
1201 	}
1202 
1203 	while (w)
1204 	{
1205 	    __m64 s = load8888 (*src);
1206 	    __m64 d = load8888 (*dst);
1207 
1208 	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
1209 
1210 	    w--;
1211 	    dst++;
1212 	    src++;
1213 	}
1214     }
1215 
1216     _mm_empty();
1217 }
1218 
1219 void
1220 fbCompositeSrc_8888x8888mmx (CARD8	op,
1221 			     PicturePtr pSrc,
1222 			     PicturePtr pMask,
1223 			     PicturePtr pDst,
1224 			     INT16	xSrc,
1225 			     INT16	ySrc,
1226 			     INT16      xMask,
1227 			     INT16      yMask,
1228 			     INT16      xDst,
1229 			     INT16      yDst,
1230 			     CARD16     width,
1231 			     CARD16     height)
1232 {
1233     CARD32	*dstLine, *dst;
1234     CARD32	*srcLine, *src;
1235     FbStride	dstStride, srcStride;
1236     CARD16	w;
1237     __m64  srca;
1238 
1239     CHECKPOINT();
1240 
1241     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1242     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1243 
1244     srca = MC (4x00ff);
1245 
1246     while (height--)
1247     {
1248 	dst = dstLine;
1249 	dstLine += dstStride;
1250 	src = srcLine;
1251 	srcLine += srcStride;
1252 	w = width;
1253 
1254 	while (w && (unsigned long)dst & 7)
1255 	{
1256 	    __m64 s = load8888 (*src);
1257 	    __m64 d = load8888 (*dst);
1258 
1259 	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());
1260 
1261 	    w--;
1262 	    dst++;
1263 	    src++;
1264 	}
1265 
1266 	while (w >= 2)
1267 	{
1268 	    __m64 vd = *(__m64 *)(dst + 0);
1269 	    __m64 vs = *(__m64 *)(src + 0);
1270 	    __m64 vs0 = expand8888 (vs, 0);
1271 	    __m64 vs1 = expand8888 (vs, 1);
1272 
1273 	    *(__m64 *)dst = (__m64)pack8888 (
1274 		over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
1275 		over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
1276 
1277 	    w -= 2;
1278 	    dst += 2;
1279 	    src += 2;
1280 	}
1281 
1282 	while (w)
1283 	{
1284 	    __m64 s = load8888 (*src);
1285 	    __m64 d = load8888 (*dst);
1286 
1287 	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
1288 				     (__m64)_mm_setzero_si64());
1289 
1290 	    w--;
1291 	    dst++;
1292 	    src++;
1293 	}
1294     }
1295 
1296     _mm_empty();
1297 }
1298 
1299 void
1300 fbCompositeSolidMask_nx8x8888mmx (CARD8      op,
1301 				  PicturePtr pSrc,
1302 				  PicturePtr pMask,
1303 				  PicturePtr pDst,
1304 				  INT16      xSrc,
1305 				  INT16      ySrc,
1306 				  INT16      xMask,
1307 				  INT16      yMask,
1308 				  INT16      xDst,
1309 				  INT16      yDst,
1310 				  CARD16     width,
1311 				  CARD16     height)
1312 {
1313     CARD32	src, srca;
1314     CARD32	*dstLine, *dst;
1315     CARD8	*maskLine, *mask;
1316     FbStride	dstStride, maskStride;
1317     CARD16	w;
1318     __m64	vsrc, vsrca;
1319     ullong	srcsrc;
1320 
1321     CHECKPOINT();
1322 
1323     fbComposeGetSolid(pSrc, src, pDst->format);
1324 
1325     srca = src >> 24;
1326     if (srca == 0)
1327 	return;
1328 
1329     srcsrc = (unsigned long long)src << 32 | src;
1330 
1331     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1332     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
1333 
1334     vsrc = load8888 (src);
1335     vsrca = expand_alpha (vsrc);
1336 
1337     while (height--)
1338     {
1339 	dst = dstLine;
1340 	dstLine += dstStride;
1341 	mask = maskLine;
1342 	maskLine += maskStride;
1343 	w = width;
1344 
1345 	CHECKPOINT();
1346 
1347 	while (w && (unsigned long)dst & 7)
1348 	{
1349 	    ullong m = *mask;
1350 
1351 	    if (m)
1352 	    {
1353 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
1354 		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
1355 	    }
1356 
1357 	    w--;
1358 	    mask++;
1359 	    dst++;
1360 	}
1361 
1362 	CHECKPOINT();
1363 
1364 	while (w >= 2)
1365 	{
1366 	    ullong m0, m1;
1367 	    m0 = *mask;
1368 	    m1 = *(mask + 1);
1369 
1370 	    if (srca == 0xff && (m0 & m1) == 0xff)
1371 	    {
1372 		*(unsigned long long *)dst = srcsrc;
1373 	    }
1374 	    else if (m0 | m1)
1375 	    {
1376 		__m64 vdest;
1377 		__m64 dest0, dest1;
1378 
1379 		vdest = *(__m64 *)dst;
1380 
1381 		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
1382 		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
1383 
1384 		*(__m64 *)dst = pack8888(dest0, dest1);
1385 	    }
1386 
1387 	    mask += 2;
1388 	    dst += 2;
1389 	    w -= 2;
1390 	}
1391 
1392 	CHECKPOINT();
1393 
1394 	while (w)
1395 	{
1396 	    ullong m = *mask;
1397 
1398 	    if (m)
1399 	    {
1400 		__m64 vdest = load8888(*dst);
1401 		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
1402 		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
1403 	    }
1404 
1405 	    w--;
1406 	    mask++;
1407 	    dst++;
1408 	}
1409     }
1410 
1411     _mm_empty();
1412 }
1413 
1414 
1415 void
1416 fbCompositeSolidMask_nx8x0565mmx (CARD8      op,
1417 				  PicturePtr pSrc,
1418 				  PicturePtr pMask,
1419 				  PicturePtr pDst,
1420 				  INT16      xSrc,
1421 				  INT16      ySrc,
1422 				  INT16      xMask,
1423 				  INT16      yMask,
1424 				  INT16      xDst,
1425 				  INT16      yDst,
1426 				  CARD16     width,
1427 				  CARD16     height)
1428 {
1429     CARD32	src, srca;
1430     CARD16	*dstLine, *dst;
1431     CARD8	*maskLine, *mask;
1432     FbStride	dstStride, maskStride;
1433     CARD16	w;
1434     __m64	vsrc, vsrca;
1435     unsigned long long srcsrcsrcsrc, src16;
1436 
1437     CHECKPOINT();
1438 
1439     fbComposeGetSolid(pSrc, src, pDst->format);
1440 
1441     srca = src >> 24;
1442     if (srca == 0)
1443 	return;
1444 
1445     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1446     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
1447 
1448     vsrc = load8888 (src);
1449     vsrca = expand_alpha (vsrc);
1450 
1451     src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
1452 
1453     srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
1454 	(ullong)src16 << 16 | (ullong)src16;
1455 
1456     while (height--)
1457     {
1458 	dst = dstLine;
1459 	dstLine += dstStride;
1460 	mask = maskLine;
1461 	maskLine += maskStride;
1462 	w = width;
1463 
1464 	CHECKPOINT();
1465 
1466 	while (w && (unsigned long)dst & 7)
1467 	{
1468 	    ullong m = *mask;
1469 
1470 	    if (m)
1471 	    {
1472 		ullong d = *dst;
1473 		__m64 vd = (__m64)d;
1474 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
1475 		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
1476 	    }
1477 
1478 	    w--;
1479 	    mask++;
1480 	    dst++;
1481 	}
1482 
1483 	CHECKPOINT();
1484 
1485 	while (w >= 4)
1486 	{
1487 	    ullong m0, m1, m2, m3;
1488 	    m0 = *mask;
1489 	    m1 = *(mask + 1);
1490 	    m2 = *(mask + 2);
1491 	    m3 = *(mask + 3);
1492 
1493 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
1494 	    {
1495 		*(unsigned long long *)dst = srcsrcsrcsrc;
1496 	    }
1497 	    else if (m0 | m1 | m2 | m3)
1498 	    {
1499 		__m64 vdest;
1500 		__m64 vm0, vm1, vm2, vm3;
1501 
1502 		vdest = *(__m64 *)dst;
1503 
1504 		vm0 = (__m64)m0;
1505 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
1506 		vm1 = (__m64)m1;
1507 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
1508 		vm2 = (__m64)m2;
1509 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
1510 		vm3 = (__m64)m3;
1511 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
1512 
1513 		*(__m64 *)dst = vdest;
1514 	    }
1515 
1516 	    w -= 4;
1517 	    mask += 4;
1518 	    dst += 4;
1519 	}
1520 
1521 	CHECKPOINT();
1522 
1523 	while (w)
1524 	{
1525 	    ullong m = *mask;
1526 
1527 	    if (m)
1528 	    {
1529 		ullong d = *dst;
1530 		__m64 vd = (__m64)d;
1531 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
1532 		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
1533 	    }
1534 
1535 	    w--;
1536 	    mask++;
1537 	    dst++;
1538 	}
1539     }
1540 
1541     _mm_empty();
1542 }
1543 
1544 void
1545 fbCompositeSrc_8888RevNPx0565mmx (CARD8      op,
1546 				  PicturePtr pSrc,
1547 				  PicturePtr pMask,
1548 				  PicturePtr pDst,
1549 				  INT16      xSrc,
1550 				  INT16      ySrc,
1551 				  INT16      xMask,
1552 				  INT16      yMask,
1553 				  INT16      xDst,
1554 				  INT16      yDst,
1555 				  CARD16     width,
1556 				  CARD16     height)
1557 {
1558     CARD16	*dstLine, *dst;
1559     CARD32	*srcLine, *src;
1560     FbStride	dstStride, srcStride;
1561     CARD16	w;
1562 
1563     CHECKPOINT();
1564 
1565     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1566     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1567 
1568     assert (pSrc->pDrawable == pMask->pDrawable);
1569 
1570     while (height--)
1571     {
1572 	dst = dstLine;
1573 	dstLine += dstStride;
1574 	src = srcLine;
1575 	srcLine += srcStride;
1576 	w = width;
1577 
1578 	CHECKPOINT();
1579 
1580 	while (w && (unsigned long)dst & 7)
1581 	{
1582 	    __m64 vsrc = load8888 (*src);
1583 	    ullong d = *dst;
1584 	    __m64 vdest = expand565 ((__m64)d, 0);
1585 
1586 	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1587 
1588 	    *dst = (ullong)vdest;
1589 
1590 	    w--;
1591 	    dst++;
1592 	    src++;
1593 	}
1594 
1595 	CHECKPOINT();
1596 
1597 	while (w >= 4)
1598 	{
1599 	    CARD32 s0, s1, s2, s3;
1600 	    unsigned char a0, a1, a2, a3;
1601 
1602 	    s0 = *src;
1603 	    s1 = *(src + 1);
1604 	    s2 = *(src + 2);
1605 	    s3 = *(src + 3);
1606 
1607 	    a0 = (s0 >> 24);
1608 	    a1 = (s1 >> 24);
1609 	    a2 = (s2 >> 24);
1610 	    a3 = (s3 >> 24);
1611 
1612 	    if ((a0 & a1 & a2 & a3) == 0xFF)
1613 	    {
1614 		__m64 vdest;
1615 		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
1616 		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
1617 		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
1618 		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
1619 
1620 		*(__m64 *)dst = vdest;
1621 	    }
1622 	    else if (a0 | a1 | a2 | a3)
1623 	    {
1624 		__m64 vdest = *(__m64 *)dst;
1625 
1626 		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
1627 	        vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
1628 		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
1629 		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
1630 
1631 		*(__m64 *)dst = vdest;
1632 	    }
1633 
1634 	    w -= 4;
1635 	    dst += 4;
1636 	    src += 4;
1637 	}
1638 
1639 	CHECKPOINT();
1640 
1641 	while (w)
1642 	{
1643 	    __m64 vsrc = load8888 (*src);
1644 	    ullong d = *dst;
1645 	    __m64 vdest = expand565 ((__m64)d, 0);
1646 
1647 	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1648 
1649 	    *dst = (ullong)vdest;
1650 
1651 	    w--;
1652 	    dst++;
1653 	    src++;
1654 	}
1655     }
1656 
1657     _mm_empty();
1658 }
1659 
1660 /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
1661 
1662 void
1663 fbCompositeSrc_8888RevNPx8888mmx (CARD8      op,
1664 				  PicturePtr pSrc,
1665 				  PicturePtr pMask,
1666 				  PicturePtr pDst,
1667 				  INT16      xSrc,
1668 				  INT16      ySrc,
1669 				  INT16      xMask,
1670 				  INT16      yMask,
1671 				  INT16      xDst,
1672 				  INT16      yDst,
1673 				  CARD16     width,
1674 				  CARD16     height)
1675 {
1676     CARD32	*dstLine, *dst;
1677     CARD32	*srcLine, *src;
1678     FbStride	dstStride, srcStride;
1679     CARD16	w;
1680 
1681     CHECKPOINT();
1682 
1683     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1684     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1685 
1686     assert (pSrc->pDrawable == pMask->pDrawable);
1687 
1688     while (height--)
1689     {
1690 	dst = dstLine;
1691 	dstLine += dstStride;
1692 	src = srcLine;
1693 	srcLine += srcStride;
1694 	w = width;
1695 
1696 	while (w && (unsigned long)dst & 7)
1697 	{
1698 	    __m64 s = load8888 (*src);
1699 	    __m64 d = load8888 (*dst);
1700 
1701 	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1702 
1703 	    w--;
1704 	    dst++;
1705 	    src++;
1706 	}
1707 
1708 	while (w >= 2)
1709 	{
1710 	    ullong s0, s1;
1711 	    unsigned char a0, a1;
1712 	    __m64 d0, d1;
1713 
1714 	    s0 = *src;
1715 	    s1 = *(src + 1);
1716 
1717 	    a0 = (s0 >> 24);
1718 	    a1 = (s1 >> 24);
1719 
1720 	    if ((a0 & a1) == 0xFF)
1721 	    {
1722 		d0 = invert_colors(load8888(s0));
1723 		d1 = invert_colors(load8888(s1));
1724 
1725 		*(__m64 *)dst = pack8888 (d0, d1);
1726 	    }
1727 	    else if (a0 | a1)
1728 	    {
1729 		__m64 vdest = *(__m64 *)dst;
1730 
1731 		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
1732 		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
1733 
1734 		*(__m64 *)dst = pack8888 (d0, d1);
1735 	    }
1736 
1737 	    w -= 2;
1738 	    dst += 2;
1739 	    src += 2;
1740 	}
1741 
1742 	while (w)
1743 	{
1744 	    __m64 s = load8888 (*src);
1745 	    __m64 d = load8888 (*dst);
1746 
1747 	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1748 
1749 	    w--;
1750 	    dst++;
1751 	    src++;
1752 	}
1753     }
1754 
1755     _mm_empty();
1756 }
1757 
1758 void
1759 fbCompositeSolidMask_nx8888x0565Cmmx (CARD8      op,
1760 				      PicturePtr pSrc,
1761 				      PicturePtr pMask,
1762 				      PicturePtr pDst,
1763 				      INT16      xSrc,
1764 				      INT16      ySrc,
1765 				      INT16      xMask,
1766 				      INT16      yMask,
1767 				      INT16      xDst,
1768 				      INT16      yDst,
1769 				      CARD16     width,
1770 				      CARD16     height)
1771 {
1772     CARD32	src, srca;
1773     CARD16	*dstLine;
1774     CARD32	*maskLine;
1775     FbStride	dstStride, maskStride;
1776     __m64  vsrc, vsrca;
1777 
1778     CHECKPOINT();
1779 
1780     fbComposeGetSolid(pSrc, src, pDst->format);
1781 
1782     srca = src >> 24;
1783     if (srca == 0)
1784 	return;
1785 
1786     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1787     fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
1788 
1789     vsrc = load8888 (src);
1790     vsrca = expand_alpha (vsrc);
1791 
1792     while (height--)
1793     {
1794 	int twidth = width;
1795 	CARD32 *p = (CARD32 *)maskLine;
1796 	CARD16 *q = (CARD16 *)dstLine;
1797 
1798 	while (twidth && ((unsigned long)q & 7))
1799 	{
1800 	    CARD32 m = *(CARD32 *)p;
1801 
1802 	    if (m)
1803 	    {
1804 		ullong d = *q;
1805 		__m64 vdest = expand565 ((__m64)d, 0);
1806 		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
1807 		*q = (ullong)vdest;
1808 	    }
1809 
1810 	    twidth--;
1811 	    p++;
1812 	    q++;
1813 	}
1814 
1815 	while (twidth >= 4)
1816 	{
1817 	    CARD32 m0, m1, m2, m3;
1818 
1819 	    m0 = *p;
1820 	    m1 = *(p + 1);
1821 	    m2 = *(p + 2);
1822 	    m3 = *(p + 3);
1823 
1824 	    if ((m0 | m1 | m2 | m3))
1825 	    {
1826 		__m64 vdest = *(__m64 *)q;
1827 
1828 		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
1829 		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
1830 		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
1831 		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
1832 
1833 		*(__m64 *)q = vdest;
1834 	    }
1835 	    twidth -= 4;
1836 	    p += 4;
1837 	    q += 4;
1838 	}
1839 
1840 	while (twidth)
1841 	{
1842 	    CARD32 m;
1843 
1844 	    m = *(CARD32 *)p;
1845 	    if (m)
1846 	    {
1847 		ullong d = *q;
1848 		__m64 vdest = expand565((__m64)d, 0);
1849 		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
1850 		*q = (ullong)vdest;
1851 	    }
1852 
1853 	    twidth--;
1854 	    p++;
1855 	    q++;
1856 	}
1857 
1858 	maskLine += maskStride;
1859 	dstLine += dstStride;
1860     }
1861 
1862     _mm_empty ();
1863 }
1864 #endif
1865 
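/* fbCompositeSrcAdd_8000x8000mmx() is a saturating add of 8-bit planes.
 * The scalar head and tail use a branch-free clamp: for t = s + d in
 * [0, 510], (t >> 8) is 1 exactly when t overflowed a byte, so
 *
 *     s = t | (0 - (t >> 8));     (all ones when t > 255, t otherwise)
 *
 * stores 0xff after truncation to 8 bits.  The aligned middle loop does the
 * same thing eight bytes at a time with _mm_adds_pu8.
 */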
1866 static void
1867 fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w)
1868 {
1869     int s;
1870     int d;
1871     int t;
1872 
1873     while (w && (unsigned long)dst & 7)
1874     {
1875         s = *src;
1876         d = *dst;
1877         t = d + s;
1878         s = t | (0 - (t >> 8));
1879         *dst = s;
1880 
1881         dst++;
1882         src++;
1883         w--;
1884     }
1885 
1886     while (w >= 8)
1887     {
1888         *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
1889         dst += 8;
1890         src += 8;
1891         w -= 8;
1892     }
1893 
1894     while (w)
1895     {
1896         s = *src;
1897         d = *dst;
1898         t = d + s;
1899         s = t | (0 - (t >> 8));
1900         *dst = s;
1901 
1902         dst++;
1903         src++;
1904         w--;
1905     }
1906 
1907     _mm_empty();
1908 }
1909 OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX);
1910 
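/* fbCompositeSrcAdd_8888x8888mmx() is the ARGB variant of the saturating
 * add: unaligned leading pixels go through _mm_cvtsi32_si64/_mm_cvtsi64_si32
 * one at a time, the aligned body adds two pixels per _mm_adds_pu8, and at
 * most one trailing pixel can remain, which is why the tail is an "if"
 * rather than a loop.
 */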
1911 static void
1912 fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w)
1913 {
1914     while (w && (unsigned long)dst & 7)
1915     {
1916         *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
1917                                              _mm_cvtsi32_si64(*dst)));
1918         dst++;
1919         src++;
1920         w--;
1921     }
1922 
1923     while (w >= 2)
1924     {
1925         *(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
1926         dst += 2;
1927         src += 2;
1928         w -= 2;
1929     }
1930 
1931     if (w)
1932     {
1933         *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
1934                                              _mm_cvtsi32_si64(*dst)));
1935 
1936     }
1937 
1938     _mm_empty();
1939 }
1940 OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
1941 
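/* The remaining "#if 0" block keeps the original solid-fill and copy-area
 * helpers plus the hand-rolled cpuid-based fbHaveMMX() detection from
 * fbmmx.c.  They are not wired up here, presumably because liboil performs
 * its own CPU-feature detection when choosing among registered
 * implementations.
 */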
1942 #if 0
1943 #define GetStart(drw,x,y,type,stride,line,bpp) {\
1944     FbBits	*__bits__;									\
1945     FbStride	__stride__;									\
1946     int		__xoff__,__yoff__;								\
1947 												\
1948     fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__);				\
1949     (stride) = __stride__ * sizeof (FbBits) / sizeof (type);					\
1950     (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__);		\
1951 }
1952 
1953 Bool
1954 fbSolidFillmmx (DrawablePtr	pDraw,
1955 		int		x,
1956 		int		y,
1957 		int		width,
1958 		int		height,
1959 		FbBits		xor)
1960 {
1961     FbStride	stride;
1962     int		bpp;
1963     ullong	fill;
1964     __m64	vfill;
1965     CARD32	byte_width;
1966     CARD8	*byte_line;
1967     FbBits      *bits;
1968     int		xoff, yoff;
1969 
1970     CHECKPOINT();
1971 
1972     fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
1973 
1974     if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
1975 	return FALSE;
1976 
1977     if (bpp != 16 && bpp != 32)
1978 	return FALSE;
1979 
1980     if (bpp == 16)
1981     {
1982 	stride = stride * sizeof (FbBits) / 2;
1983 	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
1984 	byte_width = 2 * width;
1985 	stride *= 2;
1986     }
1987     else
1988     {
1989 	stride = stride * sizeof (FbBits) / 4;
1990 	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
1991 	byte_width = 4 * width;
1992 	stride *= 4;
1993     }
1994 
1995     fill = ((ullong)xor << 32) | xor;
1996     vfill = (__m64)fill;
1997 
1998     while (height--)
1999     {
2000 	int w;
2001 	CARD8 *d = byte_line;
2002 	byte_line += stride;
2003 	w = byte_width;
2004 
2005 	while (w >= 2 && ((unsigned long)d & 3))
2006 	{
2007 	    *(CARD16 *)d = xor;
2008 	    w -= 2;
2009 	    d += 2;
2010 	}
2011 
2012 	while (w >= 4 && ((unsigned long)d & 7))
2013 	{
2014 	    *(CARD32 *)d = xor;
2015 
2016 	    w -= 4;
2017 	    d += 4;
2018 	}
2019 
2020 	while (w >= 64)
2021 	{
2022 	    *(__m64*) (d +  0) = vfill;
2023 	    *(__m64*) (d +  8) = vfill;
2024 	    *(__m64*) (d + 16) = vfill;
2025 	    *(__m64*) (d + 24) = vfill;
2026 	    *(__m64*) (d + 32) = vfill;
2027 	    *(__m64*) (d + 40) = vfill;
2028 	    *(__m64*) (d + 48) = vfill;
2029 	    *(__m64*) (d + 56) = vfill;
2030 
2031 	    w -= 64;
2032 	    d += 64;
2033 	}
2034 	while (w >= 4)
2035 	{
2036 	    *(CARD32 *)d = xor;
2037 
2038 	    w -= 4;
2039 	    d += 4;
2040 	}
2041 	if (w >= 2)
2042 	{
2043 	    *(CARD16 *)d = xor;
2044 	    w -= 2;
2045 	    d += 2;
2046 	}
2047     }
2048 
2049     _mm_empty();
2050     return TRUE;
2051 }
2052 
2053 Bool
2054 fbCopyAreammx (DrawablePtr	pSrc,
2055 	       DrawablePtr	pDst,
2056 	       int		src_x,
2057 	       int		src_y,
2058 	       int		dst_x,
2059 	       int		dst_y,
2060 	       int		width,
2061 	       int		height)
2062 {
2063     FbBits *	src_bits;
2064     FbStride	src_stride;
2065     int		src_bpp;
2066     int		src_xoff;
2067     int		src_yoff;
2068 
2069     FbBits *	dst_bits;
2070     FbStride	dst_stride;
2071     int		dst_bpp;
2072     int		dst_xoff;
2073     int		dst_yoff;
2074 
2075     CARD8 *	src_bytes;
2076     CARD8 *	dst_bytes;
2077     int		byte_width;
2078 
2079     fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
2080     fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
2081 
2082     if (src_bpp != 16 && src_bpp != 32)
2083 	return FALSE;
2084 
2085     if (dst_bpp != 16 && dst_bpp != 32)
2086 	return FALSE;
2087 
2088     if (src_bpp != dst_bpp)
2089     {
2090 	return FALSE;
2091     }
2092 
2093     if (src_bpp == 16)
2094     {
2095 	src_stride = src_stride * sizeof (FbBits) / 2;
2096 	dst_stride = dst_stride * sizeof (FbBits) / 2;
2097 	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
2098 	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
2099 	byte_width = 2 * width;
2100 	src_stride *= 2;
2101 	dst_stride *= 2;
2102     }
2103     else
2104     {
2105 	src_stride = src_stride * sizeof (FbBits) / 4;
2106 	dst_stride = dst_stride * sizeof (FbBits) / 4;
2107 	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
2108 	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
2109 	byte_width = 4 * width;
2110 	src_stride *= 4;
2111 	dst_stride *= 4;
2112     }
2113 
2114     while (height--)
2115     {
2116 	int w;
2117 	CARD8 *s = src_bytes;
2118 	CARD8 *d = dst_bytes;
2119 	src_bytes += src_stride;
2120 	dst_bytes += dst_stride;
2121 	w = byte_width;
2122 
2123 	while (w >= 2 && ((unsigned long)d & 3))
2124 	{
2125 	    *(CARD16 *)d = *(CARD16 *)s;
2126 	    w -= 2;
2127 	    s += 2;
2128 	    d += 2;
2129 	}
2130 
2131 	while (w >= 4 && ((unsigned long)d & 7))
2132 	{
2133 	    *(CARD32 *)d = *(CARD32 *)s;
2134 
2135 	    w -= 4;
2136 	    s += 4;
2137 	    d += 4;
2138 	}
2139 
2140 	while (w >= 64)
2141 	{
2142 	    *(__m64 *)(d + 0)  = *(__m64 *)(s + 0);
2143 	    *(__m64 *)(d + 8)  = *(__m64 *)(s + 8);
2144 	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
2145 	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
2146 	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
2147 	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
2148 	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
2149 	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
2150 	    w -= 64;
2151 	    s += 64;
2152 	    d += 64;
2153 	}
2154 	while (w >= 4)
2155 	{
2156 	    *(CARD32 *)d = *(CARD32 *)s;
2157 
2158 	    w -= 4;
2159 	    s += 4;
2160 	    d += 4;
2161 	}
2162 	if (w >= 2)
2163 	{
2164 	    *(CARD16 *)d = *(CARD16 *)s;
2165 	    w -= 2;
2166 	    s += 2;
2167 	    d += 2;
2168 	}
2169     }
2170 
2171     _mm_empty();
2172     return TRUE;
2173 }
2174 
2175 void
2176 fbCompositeCopyAreammx (CARD8		op,
2177 			PicturePtr	pSrc,
2178 			PicturePtr	pMask,
2179 			PicturePtr	pDst,
2180 			INT16		xSrc,
2181 			INT16		ySrc,
2182 			INT16		xMask,
2183 			INT16		yMask,
2184 			INT16		xDst,
2185 			INT16		yDst,
2186 			CARD16		width,
2187 			CARD16		height)
2188 {
2189     fbCopyAreammx (pSrc->pDrawable,
2190 		   pDst->pDrawable,
2191 		   xSrc, ySrc,
2192 		   xDst, yDst,
2193 		   width, height);
2194 }
2195 
2196 #if !defined(__amd64__) && !defined(__x86_64__)
2197 
2198 enum CPUFeatures {
2199     NoFeatures = 0,
2200     MMX = 0x1,
2201     MMX_Extensions = 0x2,
2202     SSE = 0x6,
2203     SSE2 = 0x8,
2204     CMOV = 0x10
2205 };
2206 
2207 static unsigned int detectCPUFeatures(void) {
2208     unsigned int result;
2209     char vendor[13];
2210     vendor[0] = 0;
2211     vendor[12] = 0;
2212     /* see p. 118 of amd64 instruction set manual Vol3 */
2213     __asm__ ("push %%ebx\n"
2214              "pushf\n"
2215              "pop %%eax\n"
2216              "mov %%eax, %%ebx\n"
2217              "xor $0x00200000, %%eax\n"
2218              "push %%eax\n"
2219              "popf\n"
2220              "pushf\n"
2221              "pop %%eax\n"
2222              "mov $0x0, %%edx\n"
2223              "xor %%ebx, %%eax\n"
2224              "jz skip\n"
2225 
2226              "mov $0x00000000, %%eax\n"
2227              "cpuid\n"
2228              "mov %%ebx, %1\n"
2229              "mov %%edx, %2\n"
2230              "mov %%ecx, %3\n"
2231              "mov $0x00000001, %%eax\n"
2232              "cpuid\n"
2233              "skip:\n"
2234              "pop %%ebx\n"
2235              "mov %%edx, %0\n"
2236              : "=r" (result),
2237                "=m" (vendor[0]),
2238                "=m" (vendor[4]),
2239                "=m" (vendor[8])
2240              :
2241              : "%eax", "%ecx", "%edx"
2242         );
2243 
2244     unsigned int features = 0;
2245     if (result) {
2246         /* result now contains the standard feature bits */
2247         if (result & (1 << 15))
2248             features |= CMOV;
2249         if (result & (1 << 23))
2250             features |= MMX;
2251         if (result & (1 << 25))
2252             features |= SSE;
2253         if (result & (1 << 26))
2254             features |= SSE2;
2255         if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) {
2256             /* check for AMD MMX extensions */
2257 
2258             unsigned int result;
2259             __asm__("push %%ebx\n"
2260                     "mov $0x80000000, %%eax\n"
2261                     "cpuid\n"
2262                     "xor %%edx, %%edx\n"
2263                     "cmp $0x1, %%eax\n"
2264                     "jge skip2\n"
2265                     "mov $0x80000001, %%eax\n"
2266                     "cpuid\n"
2267                     "skip2:\n"
2268                     "mov %%edx, %0\n"
2269                     "pop %%ebx\n"
2270                     : "=r" (result)
2271                     :
2272                     : "%eax", "%ecx", "%edx"
2273                 );
2274             if (result & (1<<22))
2275                 features |= MMX_Extensions;
2276         }
2277     }
2278     return features;
2279 }
2280 
2281 Bool
2282 fbHaveMMX (void)
2283 {
2284     static Bool initialized = FALSE;
2285     static Bool mmx_present;
2286 
2287     if (!initialized)
2288     {
2289         unsigned int features = detectCPUFeatures();
2290 	mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions);
2291         initialized = TRUE;
2292     }
2293 
2294     return mmx_present;
2295 }
2296 #endif /* __amd64__ */
2297 
2298 
2299 #endif
2300