1 /*
2 * Copyright © 2004 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
5 *
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. Red Hat makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
15 *
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * SOFTWARE.
24 *
25 * Author: Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28 *
29 * Based on work by Owen Taylor
30 */
31
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35
36 #include <liboil/liboil.h>
37 #include <liboil/liboilfunction.h>
38
39 #include <mmintrin.h>
40 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
41
/* Local aliases for the X server types used by the original fbmmx.c /
 * fbpict.c code, so the copied code paths compile outside the server
 * tree. */
typedef uint32_t CARD32;
typedef uint16_t CARD16;
typedef int16_t INT16;
typedef uint8_t CARD8;
typedef uint64_t ullong;
/* NOTE(review): in the X server PicturePtr/FbBits are distinct types;
 * here they are only placeholders so the disabled code below still
 * parses — confirm before enabling any of the #if 0 sections. */
typedef CARD32* PicturePtr;
typedef CARD32* FbBits;
typedef int FbStride;
50
51
52 #include "fbmmx.h"
53 #include "fbpict.h"
54
/* Debugging hook from the original fbpict.c; compiled out here. */
#define CHECKPOINT()

/* liboil function classes that the MMX code paths in this file
 * implement (or could implement once the disabled sections are
 * ported). */
OIL_DECLARE_CLASS (composite_in_argb);
OIL_DECLARE_CLASS (composite_in_argb_const_src);
OIL_DECLARE_CLASS (composite_in_argb_const_mask);
OIL_DECLARE_CLASS (composite_over_argb);
OIL_DECLARE_CLASS (composite_over_argb_const_src);
OIL_DECLARE_CLASS (composite_add_argb);
OIL_DECLARE_CLASS (composite_add_argb_const_src);
OIL_DECLARE_CLASS (composite_in_over_argb);
OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
OIL_DECLARE_CLASS (composite_over_u8);
OIL_DECLARE_CLASS (composite_add_u8);
69
70
71 /* --------------- MMX code patch for fbcompose.c --------------------- */
72
#if 0
/* dest = src scaled by the mask's alpha (unified-alpha mask apply).
 * Disabled: not yet ported to a liboil class. */
static void
mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    /* NOTE(review): mask is uint8_t* but end is computed as uint32_t* —
     * the element size looks wrong; confirm before enabling. */
    const uint32_t *end = mask + width;
    while (mask < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        a = MmxAlpha(a);
        MmxMul(s, a);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}
#endif

#ifdef ENABLE_BROKEN_IMPLS
/* Porter-Duff OVER, unified alpha: dest = src + dest*(1 - src.alpha). */
static void
mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
{
    /* These constants are referenced by name from the Mmx* helper macros. */
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const uint32_t *end = dest + width;

    while (dest < end) {
        __m64 x, y, a;
        x = MmxTo(*src);
        y = MmxTo(*dest);
        a = MmxAlpha(x);
        a = MmxNegate(a);
        MmxMulAdd(y, a, x);
        *dest = MmxFrom(y);
        ++dest;
        ++src;
    }
    _mm_empty();
}
OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX);
#endif

#if 0
/* OVER with operands reversed: dest = dest + src*(1 - dest.alpha). */
static FASTCALL void
mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, y, a;
        x = MmxTo(*dest);
        y = MmxTo(*src);
        a = MmxAlpha(x);
        a = MmxNegate(a);
        MmxMulAdd(y, a, x);
        *dest = MmxFrom(y);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
/* IN: dest = src * dest.alpha. */
static void
mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*src);
        a = MmxTo(*dest);
        a = MmxAlpha(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
/* IN reversed: dest = dest * src.alpha. */
static FASTCALL void
mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*dest);
        a = MmxTo(*src);
        a = MmxAlpha(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
/* OUT: dest = src * (1 - dest.alpha). */
static FASTCALL void
mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*src);
        a = MmxTo(*dest);
        a = MmxAlpha(a);
        a = MmxNegate(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
/* OUT reversed: dest = dest * (1 - src.alpha). */
static FASTCALL void
mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*dest);
        a = MmxTo(*src);
        a = MmxAlpha(a);
        a = MmxNegate(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}

/* ATOP: dest = src*dest.alpha + dest*(1 - src.alpha). */
static FASTCALL void
mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 s, da, d, sia;
        s = MmxTo(*src);
        d = MmxTo(*dest);
        sia = MmxAlpha(s);
        sia = MmxNegate(sia);
        da = MmxAlpha(d);
        MmxAddMul(s, da, d, sia);
        *dest = MmxFrom(s);
        ++dest;
        ++src;
    }
    _mm_empty();
}

/* ATOP reversed: dest = src*(1 - dest.alpha) + dest*src.alpha. */
static FASTCALL void
mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end;

    end = dest + width;

    while (dest < end) {
        __m64 s, dia, d, sa;
        s = MmxTo(*src);
        d = MmxTo(*dest);
        sa = MmxAlpha(s);
        dia = MmxAlpha(d);
        dia = MmxNegate(dia);
        MmxAddMul(s, dia, d, sa);
        *dest = MmxFrom(s);
        ++dest;
        ++src;
    }
    _mm_empty();
}

/* XOR: dest = src*(1 - dest.alpha) + dest*(1 - src.alpha). */
static FASTCALL void
mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 s, dia, d, sia;
        s = MmxTo(*src);
        d = MmxTo(*dest);
        sia = MmxAlpha(s);
        dia = MmxAlpha(d);
        sia = MmxNegate(sia);
        dia = MmxNegate(dia);
        MmxAddMul(s, dia, d, sia);
        *dest = MmxFrom(s);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif
316
317 static void
mmxCombineAddU(uint32_t * dest,const uint32_t * src,int width)318 mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
319 {
320 const __m64 mmx_0 = _mm_setzero_si64();
321
322 const uint32_t *end = dest + width;
323 while (dest < end) {
324 __m64 s, d;
325 s = MmxTo(*src);
326 d = MmxTo(*dest);
327 s = MmxAdd(s, d);
328 *dest = MmxFrom(s);
329 ++dest;
330 ++src;
331 }
332 _mm_empty();
333 }
334 OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
335
#if 0
/* SATURATE: add src to dest, scaling src down when its alpha exceeds
 * the remaining headroom in dest's alpha. */
static FASTCALL void
mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = dest + width;
    while (dest < end) {
        CARD32 s = *src;
        CARD32 d = *dest;
        __m64 ms = MmxTo(s);
        __m64 md = MmxTo(d);
        CARD32 sa = s >> 24;
        CARD32 da = ~d >> 24;

        if (sa > da) {
            __m64 msa = MmxTo(FbIntDiv(da, sa));
            msa = MmxAlpha(msa);
            MmxMul(ms, msa);
        }
        MmxAdd(md, ms);
        *dest = MmxFrom(md);
        ++src;
        ++dest;
    }
    _mm_empty();
}


/* Component-alpha SRC: dest = src * mask, per channel. */
static FASTCALL void
mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        MmxMul(s, a);
        *dest = MmxFrom(s);
        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty();
}

/* Component-alpha OVER: dest = src*mask + dest*(1 - mask*src.alpha). */
static FASTCALL void
mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 sa = MmxAlpha(s);
        MmxMul(s, a);
        MmxMul(a, sa);
        a = MmxNegate(a);
        MmxMulAdd(d, a, s);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

/* Component-alpha OVER reversed: dest = dest + src*mask*(1 - dest.alpha). */
static FASTCALL void
mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        da = MmxNegate(da);
        MmxMul(s, a);
        MmxMulAdd(s, da, d);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}


/* Component-alpha IN: dest = src * mask * dest.alpha. */
static FASTCALL void
mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        MmxMul(s, a);
        MmxMul(s, da);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

/* Component-alpha IN reversed: dest = dest * mask * src.alpha. */
static FASTCALL void
mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 sa = MmxAlpha(s);
        MmxMul(a, sa);
        MmxMul(d, a);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

/* Component-alpha OUT: dest = src * mask * (1 - dest.alpha). */
static FASTCALL void
mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        da = MmxNegate(da);
        MmxMul(s, a);
        MmxMul(s, da);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

/* Component-alpha OUT reversed: dest = dest * (1 - mask*src.alpha). */
static FASTCALL void
mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 sa = MmxAlpha(s);
        MmxMul(a, sa);
        a = MmxNegate(a);
        MmxMul(d, a);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

/* Component-alpha ATOP:
 * dest = src*mask*dest.alpha + dest*(1 - mask*src.alpha). */
static FASTCALL void
mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        __m64 sa = MmxAlpha(s);
        MmxMul(s, a);
        MmxMul(a, sa);
        a = MmxNegate(a);
        MmxAddMul(d, a, s, da);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}
552
553 static FASTCALL void
554 mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
555 {
556 const __m64 mmx_0 = _mm_setzero_si64();
557 const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
558 const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
559
560 const CARD32 *end = src + width;
561 while (src < end) {
562 __m64 a = MmxTo(*mask);
563 __m64 s = MmxTo(*src);
564 __m64 d = MmxTo(*dest);
565 __m64 da = MmxAlpha(d);
566 __m64 sa = MmxAlpha(s)
567 MmxMul(s, a);
568 MmxMul(a, sa);
569 da = MmxNegate(da);
570 MmxAddMul(d, a, s, da);
571 *dest = MmxFrom(d);
572 ++src;
573 ++dest;
574 ++mask;
575 }
576 _mm_empty();
577 }
578
/* Component-alpha XOR:
 * dest = src*mask*(1 - dest.alpha) + dest*(1 - mask*src.alpha). */
static FASTCALL void
mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        __m64 sa = MmxAlpha(s);
        MmxMul(s, a);
        MmxMul(a, sa);
        da = MmxNegate(da);
        a = MmxNegate(a);
        MmxAddMul(d, a, s, da);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

/* Component-alpha ADD: dest = saturate(src*mask + dest). */
static FASTCALL void
mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        MmxMul(s, a);
        d = MmxAdd(s, d);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

extern FbComposeFunctions composeFunctions;

/* Install the MMX combiners into the fbcompose dispatch table.
 * NOTE(review): still wired for the X-server style dispatch; several
 * referenced combiners live in other disabled sections above. */
void fbComposeSetupMMX(void)
{
    /* check if we have MMX support and initialize accordingly */
    if (fbHaveMMX()) {
        composeFunctions.combineU[PictOpOver] = mmxCombineOverU;
        composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
        composeFunctions.combineU[PictOpIn] = mmxCombineInU;
        composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
        composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
        composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
        composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
        composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
        composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
        composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
        composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;

        composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC;
        composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
        composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
        composeFunctions.combineC[PictOpIn] = mmxCombineInC;
        composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
        composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
        composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
        composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
        composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
        composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
        composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;

        composeFunctions.combineMaskU = mmxCombineMaskU;
    }
}
#endif
661
662
663 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
664
/* Union so the 64-bit constants below can be written as integers but
 * loaded as MMX registers without type-punning casts. */
typedef union {
    __m64 m64;
    uint64_t ull;
} m64_ull;

/* All MMX constants used by the code paths below, gathered in one
 * struct so MC() can name them. */
typedef struct
{
    m64_ull mmx_4x00ff;                 /* 0x00ff in each 16-bit lane */
    m64_ull mmx_4x0080;                 /* rounding bias for pix_multiply */
    m64_ull mmx_565_rgb;                /* masks for expand565 */
    m64_ull mmx_565_unpack_multiplier;
    m64_ull mmx_565_r;                  /* channel masks for pack565 */
    m64_ull mmx_565_g;
    m64_ull mmx_565_b;
    m64_ull mmx_mask_0;                 /* keep-all-but-word-N masks */
    m64_ull mmx_mask_1;
    m64_ull mmx_mask_2;
    m64_ull mmx_mask_3;
    m64_ull mmx_full_alpha;             /* alpha word forced to 0xff */
    m64_ull mmx_ffff0000ffff0000;
    m64_ull mmx_0000ffff00000000;
    m64_ull mmx_000000000000ffff;
} MMXData;

static const MMXData c =
{
    .mmx_4x00ff.ull = 0x00ff00ff00ff00ffULL,
    .mmx_4x0080.ull = 0x0080008000800080ULL,
    .mmx_565_rgb.ull = 0x000001f0003f001fULL,
    .mmx_565_r.ull = 0x000000f800000000ULL,
    .mmx_565_g.ull = 0x0000000000fc0000ULL,
    .mmx_565_b.ull = 0x00000000000000f8ULL,
    .mmx_mask_0.ull = 0xffffffffffff0000ULL,
    .mmx_mask_1.ull = 0xffffffff0000ffffULL,
    .mmx_mask_2.ull = 0xffff0000ffffffffULL,
    .mmx_mask_3.ull = 0x0000ffffffffffffULL,
    .mmx_full_alpha.ull = 0x00ff000000000000ULL,
    .mmx_565_unpack_multiplier.ull = 0x0000008404100840ULL,
    .mmx_ffff0000ffff0000.ull = 0xffff0000ffff0000ULL,
    .mmx_0000ffff00000000.ull = 0x0000ffff00000000ULL,
    .mmx_000000000000ffff.ull = 0x000000000000ffffULL,
};

/* Fetch one of the constants above as an __m64 (memory operand). */
#define MC(x) ((__m64) c.mmx_##x.m64)
709
/* Shift the whole 64-bit register left by s bits; a negative s shifts
 * right instead, and zero returns the value untouched. */
static __inline__ __m64
shift (__m64 v, int s)
{
    if (s == 0)
        return v;

    return (s > 0) ? _mm_slli_si64 (v, s)
                   : _mm_srli_si64 (v, -s);
}
720
/* Per-channel complement: 0xff - x in each 16-bit lane (i.e. 1 - x for
 * 8-bit color values). */
static __inline__ __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC(4x00ff));
}
726
/* Per-channel multiply of two pixels with 8-bit values in 16-bit
 * lanes: (a*b + 0x80 + ((a*b + 0x80) >> 8)) >> 8, the standard
 * correctly-rounded x*y/255 approximation. */
static __inline__ __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC(4x0080));
    /* fold the high byte back in to divide by 255 rather than 256 */
    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
    res = _mm_srli_pi16 (res, 8);

    return res;
}
739
/* Broadcast the alpha word (lane 3) into all four channel lanes. */
static __inline__ __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
}

/* Broadcast lane 0 into all four lanes — for pixels whose alpha sits
 * in the low word (e.g. an expanded 8-bit mask). */
static __inline__ __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
}

/* Swap the R and B lanes, keeping alpha in place (ABGR <-> ARGB). */
static __inline__ __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
}
757
758 /* Notes about writing mmx code
759 *
760 * give memory operands as the second operand. If you give it as the
761 * first, gcc will first load it into a register, then use that
762 * register
763 *
764 * ie. use
765 *
766 * _mm_mullo_pi16 (x, mmx_constant);
767 *
768 * not
769 *
770 * _mm_mullo_pi16 (mmx_constant, x);
771 *
772 * Also try to minimize dependencies. i.e. when you need a value, try
773 * to calculate it from a value that was calculated as early as
774 * possible.
775 */
776
/* Porter-Duff OVER: src + dest*(1 - srca), with srca being src's alpha
 * already replicated into all four lanes; the add saturates. */
static __inline__ __m64
over (__m64 src, __m64 srca, __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
}
782
/* OVER for a non-premultiplied source with reversed channel order:
 * swap R/B, premultiply by alpha (with the alpha lane forced to 0xff
 * so it survives the multiply), then do a normal OVER. */
static __inline__ __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));

    return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
}
791
/* Porter-Duff IN: src * mask, per channel. */
static __inline__ __m64
in (__m64 src,
    __m64 mask)
{
    return pix_multiply (src, mask);
}

/* (src IN mask) OVER dest; the effective alpha is srca*mask. */
static __inline__ __m64
in_over (__m64 src,
	 __m64 srca,
	 __m64 mask,
	 __m64 dest)
{
    return over(in(src, mask), pix_multiply(srca, mask), dest);
}
807
/* Load one packed ARGB8888 pixel and widen each byte to a 16-bit lane. */
static __inline__ __m64
load8888 (CARD32 v)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
}
813
/* Pack two wide (16-bit-per-channel) pixels back into a pair of packed
 * ARGB8888 pixels, saturating each lane to [0, 255]. */
static __inline__ __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}
821
/* Narrow a wide pixel back to packed ARGB8888 and return the low
 * 32 bits. */
static __inline__ CARD32
store8888 (__m64 v)
{
    return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
}
827
828 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
829 *
830 * 00RR00GG00BB
831 *
832 * --- Expanding 565 in the low word ---
833 *
834 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
835 * m = m & (01f0003f001f);
836 * m = m * (008404100840);
837 * m = m >> 8;
838 *
839 * Note the trick here - the top word is shifted by another nibble to
840 * avoid it bumping into the middle word
841 */
/* Expand the 565 pixel in 16-bit lane @pos (0-3) to 00RR00GG00BB,
 * using the shift/mask/multiply scheme described in the comment
 * above. */
static __inline__ __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
    p = shift (shift (p, (3 - pos) * 16), -48);

    /* red to bits 36+, green to bits 16+ (see note about the extra
     * nibble on the top word) */
    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC(565_rgb));

    /* scale each 5/6-bit field up to 8 bits in one multiply */
    pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
861
862 static __inline__ __m64
expand8888(__m64 in,int pos)863 expand8888 (__m64 in, int pos)
864 {
865 if (pos == 0)
866 return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
867 else
868 return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
869 }
870
/* Convert the 00RR00GG00BB pixel in @pixel to 565 and merge it into
 * 16-bit lane @pos (0-3) of @target, preserving the other lanes. */
static __inline__ __m64
pack565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    /* isolate the top 5/6/5 bits of each channel */
    r = _mm_and_si64 (p, MC(565_r));
    g = _mm_and_si64 (p, MC(565_g));
    b = _mm_and_si64 (p, MC(565_b));

    /* move each field to its 565 position within lane @pos */
    r = shift (r, - (32 - 8) + pos * 16);
    g = shift (g, - (16 - 3) + pos * 16);
    b = shift (b, - (0 + 3) + pos * 16);

    /* clear the destination lane in target */
    if (pos == 0)
	t = _mm_and_si64 (t, MC(mask_0));
    else if (pos == 1)
	t = _mm_and_si64 (t, MC(mask_1));
    else if (pos == 2)
	t = _mm_and_si64 (t, MC(mask_2));
    else if (pos == 3)
	t = _mm_and_si64 (t, MC(mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
}
900
#ifdef ENABLE_BROKEN_IMPLS
/* broken. See Debian bug #340932 */
/* Composite a constant ARGB source OVER an ARGB8888 destination:
 * scalar head until dst is 8-byte aligned, then two pixels per MMX
 * register, then a scalar tail. */
static void
fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
{
    __m64 vsrc, vsrca;

    /* hoist the constant source and its broadcast alpha out of the loop */
    vsrc = load8888 (*src);
    vsrca = expand_alpha (vsrc);

    while (w && (unsigned long)dst & 7)
    {
	*dst = store8888(over(vsrc, vsrca, load8888(*dst)));

	w--;
	dst++;
    }

    while (w >= 2)
    {
	__m64 vdest;
	__m64 dest0, dest1;

	vdest = *(__m64 *)dst;

	dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
	dest1 = over(vsrc, vsrca, expand8888(vdest, 1));

	*(__m64 *)dst = pack8888(dest0, dest1);

	dst += 2;
	w -= 2;
    }

    while (w)
    {
	*dst = store8888(over(vsrc, vsrca, load8888(*dst)));

	w--;
	dst++;
    }

    _mm_empty();
}
OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
    OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT);
#endif
948
#if 0
/* Constant ARGB source OVER an r5g6b5 destination; still carries the
 * original X-server signature and helpers (fbComposeGetSolid etc.). */
void
fbCompositeSolid_nx0565mmx (CARD8 op,
			    PicturePtr pSrc,
			    PicturePtr pMask,
			    PicturePtr pDst,
			    INT16 xSrc,
			    INT16 ySrc,
			    INT16 xMask,
			    INT16 yMask,
			    INT16 xDst,
			    INT16 yDst,
			    CARD16 width,
			    CARD16 height)
{
    CARD32 src;
    CARD16 *dstLine, *dst;
    CARD16 w;
    FbStride dstStride;
    __m64 vsrc, vsrca;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    /* fully transparent source: OVER is a no-op */
    if (src >> 24 == 0)
	return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dstLine;
	dstLine += dstStride;
	w = width;

	CHECKPOINT();

	/* scalar head until dst is 8-byte aligned */
	while (w && (unsigned long)dst & 7)
	{
	    ullong d = *dst;
	    __m64 vdest = expand565 ((__m64)d, 0);
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
	    *dst = (ullong)vdest;

	    w--;
	    dst++;
	}

	/* four 565 pixels per MMX register */
	while (w >= 4)
	{
	    __m64 vdest;

	    vdest = *(__m64 *)dst;

	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);

	    *(__m64 *)dst = vdest;

	    dst += 4;
	    w -= 4;
	}

	CHECKPOINT();

	while (w)
	{
	    ullong d = *dst;
	    __m64 vdest = expand565 ((__m64)d, 0);
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
	    *dst = (ullong)vdest;

	    w--;
	    dst++;
	}
    }

    _mm_empty();
}
#endif

#if 0
/* Constant source IN a component-alpha mask OVER an ARGB8888 dest.
 * NOTE(review): half-converted — the body uses twidth/p/q/vsrc/vsrca
 * which are never declared; would not compile if enabled. */
static void
fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
{
    CARD32 src, srca;
    CARD32 *dstLine;
    CARD32 *maskLine;
    FbStride dstStride, maskStride;
    __m64 vsrc, vsrca;


    while (twidth && (unsigned long)q & 7)
    {
	CARD32 m = *(CARD32 *)p;

	if (m)
	{
	    __m64 vdest = load8888(*q);
	    vdest = in_over(vsrc, vsrca, load8888(m), vdest);
	    *q = (ullong)pack8888(vdest, _mm_setzero_si64());
	}

	twidth--;
	p++;
	q++;
    }

    while (twidth >= 2)
    {
	CARD32 m0, m1;
	m0 = *p;
	m1 = *(p + 1);

	if (m0 | m1)
	{
	    __m64 dest0, dest1;
	    __m64 vdest = *(__m64 *)q;

	    dest0 = in_over(vsrc, vsrca, load8888(m0),
			    expand8888 (vdest, 0));
	    dest1 = in_over(vsrc, vsrca, load8888(m1),
			    expand8888 (vdest, 1));

	    *(__m64 *)q = pack8888(dest0, dest1);
	}

	p += 2;
	q += 2;
	twidth -= 2;
    }

    while (twidth)
    {
	CARD32 m = *(CARD32 *)p;

	if (m)
	{
	    __m64 vdest = load8888(*q);
	    vdest = in_over(vsrc, vsrca, load8888(m), vdest);
	    *q = (ullong)pack8888(vdest, _mm_setzero_si64());
	}

	twidth--;
	p++;
	q++;
    }

    _mm_empty();
}
#endif
1106
#if 0
/* ARGB source IN a constant 8-bit mask OVER an ARGB8888 dest.
 * NOTE(review): half-converted — the body still uses the X-server
 * locals (maskLine, vmask, srca, height, dstLine, srcLine, w, ...)
 * which are never declared with this signature; would not compile if
 * enabled. */
static void
fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
			       int width)
{

    mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
    vmask = load8888 (mask);
    srca = MC(4x00ff);

    while (height--)
    {
	dst = dstLine;
	dstLine += dstStride;
	src = srcLine;
	srcLine += srcStride;
	w = width;

	/* scalar head until dst is 8-byte aligned */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());

	    w--;
	    dst++;
	    src++;
	}

	/* unrolled: 16 pixels (eight __m64 loads) per iteration */
	while (w >= 16)
	{
	    __m64 vd0 = *(__m64 *)(dst + 0);
	    __m64 vd1 = *(__m64 *)(dst + 2);
	    __m64 vd2 = *(__m64 *)(dst + 4);
	    __m64 vd3 = *(__m64 *)(dst + 6);
	    __m64 vd4 = *(__m64 *)(dst + 8);
	    __m64 vd5 = *(__m64 *)(dst + 10);
	    __m64 vd6 = *(__m64 *)(dst + 12);
	    __m64 vd7 = *(__m64 *)(dst + 14);

	    __m64 vs0 = *(__m64 *)(src + 0);
	    __m64 vs1 = *(__m64 *)(src + 2);
	    __m64 vs2 = *(__m64 *)(src + 4);
	    __m64 vs3 = *(__m64 *)(src + 6);
	    __m64 vs4 = *(__m64 *)(src + 8);
	    __m64 vs5 = *(__m64 *)(src + 10);
	    __m64 vs6 = *(__m64 *)(src + 12);
	    __m64 vs7 = *(__m64 *)(src + 14);

	    vd0 = (__m64)pack8888 (
		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

	    vd1 = (__m64)pack8888 (
		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

	    vd2 = (__m64)pack8888 (
		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

	    vd3 = (__m64)pack8888 (
		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

	    vd4 = (__m64)pack8888 (
		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

	    vd5 = (__m64)pack8888 (
		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

	    vd6 = (__m64)pack8888 (
		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

	    vd7 = (__m64)pack8888 (
		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

	    *(__m64 *)(dst + 0) = vd0;
	    *(__m64 *)(dst + 2) = vd1;
	    *(__m64 *)(dst + 4) = vd2;
	    *(__m64 *)(dst + 6) = vd3;
	    *(__m64 *)(dst + 8) = vd4;
	    *(__m64 *)(dst + 10) = vd5;
	    *(__m64 *)(dst + 12) = vd6;
	    *(__m64 *)(dst + 14) = vd7;

	    w -= 16;
	    dst += 16;
	    src += 16;
	}

	while (w)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty();
}
1218
/* ARGB8888 source OVER ARGB8888 destination (no mask), X-server style
 * signature; inside the same #if 0 region as the function above.
 * NOTE(review): srca is set to all-0xff, so over() multiplies dest by
 * negate(full alpha) = 0 — i.e. this path behaves like SRC, not OVER;
 * confirm intent before enabling. */
void
fbCompositeSrc_8888x8888mmx (CARD8 op,
			     PicturePtr pSrc,
			     PicturePtr pMask,
			     PicturePtr pDst,
			     INT16 xSrc,
			     INT16 ySrc,
			     INT16 xMask,
			     INT16 yMask,
			     INT16 xDst,
			     INT16 yDst,
			     CARD16 width,
			     CARD16 height)
{
    CARD32 *dstLine, *dst;
    CARD32 *srcLine, *src;
    FbStride dstStride, srcStride;
    CARD16 w;
    __m64 srca;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);

    srca = MC (4x00ff);

    while (height--)
    {
	dst = dstLine;
	dstLine += dstStride;
	src = srcLine;
	srcLine += srcStride;
	w = width;

	/* scalar head until dst is 8-byte aligned */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());

	    w--;
	    dst++;
	    src++;
	}

	/* two pixels per MMX register */
	while (w >= 2)
	{
	    __m64 vd = *(__m64 *)(dst + 0);
	    __m64 vs = *(__m64 *)(src + 0);
	    __m64 vs0 = expand8888 (vs, 0);
	    __m64 vs1 = expand8888 (vs, 1);

	    *(__m64 *)dst = (__m64)pack8888 (
		over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
		over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	while (w)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
				     (__m64)_mm_setzero_si64());

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty();
}
1298
/*
 * Composite a solid (single-colour) source through an 8-bit alpha mask
 * onto an ARGB8888 destination: dst = (src IN mask) OVER dst.
 *
 * op is part of the generic composite-function signature; this path
 * implements a fixed operator and does not inspect it.
 *
 * NOTE(review): this block appears to be compiled out (PicturePtr is
 * typedef'd to CARD32* in this file, so pDst->format could not
 * compile) -- confirm the enclosing #if before enabling.
 */
void
fbCompositeSolidMask_nx8x8888mmx (CARD8 op,
				  PicturePtr pSrc,
				  PicturePtr pMask,
				  PicturePtr pDst,
				  INT16 xSrc,
				  INT16 ySrc,
				  INT16 xMask,
				  INT16 yMask,
				  INT16 xDst,
				  INT16 yDst,
				  CARD16 width,
				  CARD16 height)
{
    CARD32 src, srca;
    CARD32 *dstLine, *dst;
    CARD8 *maskLine, *mask;
    FbStride dstStride, maskStride;
    CARD16 w;
    __m64 vsrc, vsrca;
    ullong srcsrc;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    /* A fully transparent source makes OVER a no-op. */
    srca = src >> 24;
    if (srca == 0)
	return;

    /* Two copies of the source pixel packed into 64 bits, for the fast
     * path below that stores two opaque pixels with one write. */
    srcsrc = (unsigned long long)src << 32 | src;

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dstLine;
	dstLine += dstStride;
	mask = maskLine;
	maskLine += maskStride;
	w = width;

	CHECKPOINT();

	/* Scalar prologue until dst is 8-byte aligned.  A zero mask
	 * byte leaves the destination pixel untouched. */
	while (w && (unsigned long)dst & 7)
	{
	    ullong m = *mask;

	    if (m)
	    {
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT();

	/* Two pixels per iteration.  When the source is opaque and both
	 * mask bytes are 0xff, the result is just the source pair. */
	while (w >= 2)
	{
	    ullong m0, m1;
	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(unsigned long long *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 vdest;
		__m64 dest0, dest1;

		vdest = *(__m64 *)dst;

		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));

		*(__m64 *)dst = pack8888(dest0, dest1);
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT();

	/* Epilogue: trailing pixel, if any. */
	while (w)
	{
	    ullong m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888(*dst);
		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
}
1413
1414
/*
 * Composite a solid source through an 8-bit alpha mask onto an RGB565
 * destination: dst = (src IN mask) OVER dst, packed back to 565.
 *
 * op is part of the generic composite-function signature and is not
 * inspected by this path.
 *
 * NOTE(review): this block appears to be compiled out (PicturePtr is
 * typedef'd to CARD32* in this file) -- confirm the enclosing #if.
 */
void
fbCompositeSolidMask_nx8x0565mmx (CARD8 op,
				  PicturePtr pSrc,
				  PicturePtr pMask,
				  PicturePtr pDst,
				  INT16 xSrc,
				  INT16 ySrc,
				  INT16 xMask,
				  INT16 yMask,
				  INT16 xDst,
				  INT16 yDst,
				  CARD16 width,
				  CARD16 height)
{
    CARD32 src, srca;
    CARD16 *dstLine, *dst;
    CARD8 *maskLine, *mask;
    FbStride dstStride, maskStride;
    CARD16 w;
    __m64 vsrc, vsrca;
    unsigned long long srcsrcsrcsrc, src16;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    /* Fully transparent source: OVER leaves the destination unchanged. */
    srca = src >> 24;
    if (srca == 0)
	return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    /* The source colour packed down to a single 565 pixel... */
    src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);

    /* ...replicated four times, for the all-opaque fast path that fills
     * four destination pixels with one 64-bit store. */
    srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
	(ullong)src16 << 16 | (ullong)src16;

    while (height--)
    {
	dst = dstLine;
	dstLine += dstStride;
	mask = maskLine;
	maskLine += maskStride;
	w = width;

	CHECKPOINT();

	/* Scalar prologue until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    ullong m = *mask;

	    if (m)
	    {
		ullong d = *dst;
		__m64 vd = (__m64)d;
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT();

	/* Four pixels per iteration; each 565 lane is expanded,
	 * composited and repacked in place. */
	while (w >= 4)
	{
	    ullong m0, m1, m2, m3;
	    m0 = *mask;
	    m1 = *(mask + 1);
	    m2 = *(mask + 2);
	    m3 = *(mask + 3);

	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
	    {
		*(unsigned long long *)dst = srcsrcsrcsrc;
	    }
	    else if (m0 | m1 | m2 | m3)
	    {
		__m64 vdest;
		__m64 vm0, vm1, vm2, vm3;

		vdest = *(__m64 *)dst;

		vm0 = (__m64)m0;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
		vm1 = (__m64)m1;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
		vm2 = (__m64)m2;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
		vm3 = (__m64)m3;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }

	    w -= 4;
	    mask += 4;
	    dst += 4;
	}

	CHECKPOINT();

	/* Epilogue: remaining 0-3 pixels, one at a time. */
	while (w)
	{
	    ullong m = *mask;

	    if (m)
	    {
		ullong d = *dst;
		__m64 vd = (__m64)d;
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
}
1543
/*
 * Composite a non-premultiplied, channel-reversed 8888 source (ABGR,
 * GdkPixbuf layout -- see the comment after this function) over an
 * RGB565 destination, via over_rev_non_pre().
 *
 * The assert below documents the precondition that source and mask are
 * the same picture (the mask is the source's own alpha channel).
 *
 * NOTE(review): this block appears to be compiled out (PicturePtr is
 * typedef'd to CARD32* in this file) -- confirm the enclosing #if.
 */
void
fbCompositeSrc_8888RevNPx0565mmx (CARD8 op,
				  PicturePtr pSrc,
				  PicturePtr pMask,
				  PicturePtr pDst,
				  INT16 xSrc,
				  INT16 ySrc,
				  INT16 xMask,
				  INT16 yMask,
				  INT16 xDst,
				  INT16 yDst,
				  CARD16 width,
				  CARD16 height)
{
    CARD16 *dstLine, *dst;
    CARD32 *srcLine, *src;
    FbStride dstStride, srcStride;
    CARD16 w;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);

    assert (pSrc->pDrawable == pMask->pDrawable);

    while (height--)
    {
	dst = dstLine;
	dstLine += dstStride;
	src = srcLine;
	srcLine += srcStride;
	w = width;

	CHECKPOINT();

	/* Scalar prologue until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 vsrc = load8888 (*src);
	    ullong d = *dst;
	    __m64 vdest = expand565 ((__m64)d, 0);

	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

	    *dst = (ullong)vdest;

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT();

	/* Four pixels per iteration, keyed on the four source alphas. */
	while (w >= 4)
	{
	    CARD32 s0, s1, s2, s3;
	    unsigned char a0, a1, a2, a3;

	    s0 = *src;
	    s1 = *(src + 1);
	    s2 = *(src + 2);
	    s3 = *(src + 3);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);
	    a2 = (s2 >> 24);
	    a3 = (s3 >> 24);

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
		/* All four source pixels opaque: just swap channels and
		 * pack -- no blend with the destination needed. */
		__m64 vdest;
		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }
	    else if (a0 | a1 | a2 | a3)
	    {
		/* Mixed alpha: full per-lane over_rev_non_pre blend.
		 * (All-zero alpha skips the store entirely.) */
		__m64 vdest = *(__m64 *)dst;

		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
		vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT();

	/* Epilogue: remaining 0-3 pixels. */
	while (w)
	{
	    __m64 vsrc = load8888 (*src);
	    ullong d = *dst;
	    __m64 vdest = expand565 ((__m64)d, 0);

	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

	    *dst = (ullong)vdest;

	    w--;
	    dst++;
	    src++;
	}
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
}
1659
1660 /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
1661
/*
 * Composite a non-premultiplied ABGR (GdkPixbuf) source over an
 * ARGB8888 destination, via over_rev_non_pre().
 *
 * The assert documents the precondition that source and mask are the
 * same picture (the mask is the source's own alpha channel).
 *
 * NOTE(review): this block appears to be compiled out (PicturePtr is
 * typedef'd to CARD32* in this file) -- confirm the enclosing #if.
 */
void
fbCompositeSrc_8888RevNPx8888mmx (CARD8 op,
				  PicturePtr pSrc,
				  PicturePtr pMask,
				  PicturePtr pDst,
				  INT16 xSrc,
				  INT16 ySrc,
				  INT16 xMask,
				  INT16 yMask,
				  INT16 xDst,
				  INT16 yDst,
				  CARD16 width,
				  CARD16 height)
{
    CARD32 *dstLine, *dst;
    CARD32 *srcLine, *src;
    FbStride dstStride, srcStride;
    CARD16 w;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);

    assert (pSrc->pDrawable == pMask->pDrawable);

    while (height--)
    {
	dst = dstLine;
	dstLine += dstStride;
	src = srcLine;
	srcLine += srcStride;
	w = width;

	/* Scalar prologue until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

	    w--;
	    dst++;
	    src++;
	}

	/* Two pixels per iteration, keyed on the two source alphas. */
	while (w >= 2)
	{
	    ullong s0, s1;
	    unsigned char a0, a1;
	    __m64 d0, d1;

	    s0 = *src;
	    s1 = *(src + 1);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);

	    if ((a0 & a1) == 0xFF)
	    {
		/* Both opaque: channel-swap only, no blend. */
		d0 = invert_colors(load8888(s0));
		d1 = invert_colors(load8888(s1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    else if (a0 | a1)
	    {
		/* Mixed alpha: per-pixel blend.  All-zero alpha skips
		 * the store entirely. */
		__m64 vdest = *(__m64 *)dst;

		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	/* Epilogue: at most one trailing pixel. */
	while (w)
	{
	    __m64 s = load8888 (*src);
	    __m64 d = load8888 (*dst);

	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

	    w--;
	    dst++;
	    src++;
	}
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
}
1757
/*
 * Composite a solid source through an ARGB8888 mask onto an RGB565
 * destination, with the whole mask pixel passed to in_over() (the
 * trailing 'C' presumably denotes component/per-channel alpha --
 * confirm against the dispatch table that selects this path).
 *
 * op is part of the generic composite-function signature and is not
 * inspected.
 *
 * NOTE(review): this block appears to be compiled out (PicturePtr is
 * typedef'd to CARD32* in this file) -- confirm the enclosing #if.
 */
void
fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op,
				      PicturePtr pSrc,
				      PicturePtr pMask,
				      PicturePtr pDst,
				      INT16 xSrc,
				      INT16 ySrc,
				      INT16 xMask,
				      INT16 yMask,
				      INT16 xDst,
				      INT16 yDst,
				      CARD16 width,
				      CARD16 height)
{
    CARD32 src, srca;
    CARD16 *dstLine;
    CARD32 *maskLine;
    FbStride dstStride, maskStride;
    __m64 vsrc, vsrca;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    /* Fully transparent source: nothing to do. */
    srca = src >> 24;
    if (srca == 0)
	return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	CARD32 *p = (CARD32 *)maskLine;
	CARD16 *q = (CARD16 *)dstLine;

	/* Scalar prologue until the destination is 8-byte aligned.
	 * A zero mask pixel leaves the destination untouched. */
	while (twidth && ((unsigned long)q & 7))
	{
	    CARD32 m = *(CARD32 *)p;

	    if (m)
	    {
		ullong d = *q;
		__m64 vdest = expand565 ((__m64)d, 0);
		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
		*q = (ullong)vdest;
	    }

	    twidth--;
	    p++;
	    q++;
	}

	/* Four 565 pixels per iteration, each lane expanded,
	 * composited and repacked in place. */
	while (twidth >= 4)
	{
	    CARD32 m0, m1, m2, m3;

	    m0 = *p;
	    m1 = *(p + 1);
	    m2 = *(p + 2);
	    m3 = *(p + 3);

	    if ((m0 | m1 | m2 | m3))
	    {
		__m64 vdest = *(__m64 *)q;

		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);

		*(__m64 *)q = vdest;
	    }
	    twidth -= 4;
	    p += 4;
	    q += 4;
	}

	/* Epilogue: remaining 0-3 pixels. */
	while (twidth)
	{
	    CARD32 m;

	    m = *(CARD32 *)p;
	    if (m)
	    {
		ullong d = *q;
		__m64 vdest = expand565((__m64)d, 0);
		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
		*q = (ullong)vdest;
	    }

	    twidth--;
	    p++;
	    q++;
	}

	maskLine += maskStride;
	dstLine += dstStride;
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty ();
}
1864 #endif
1865
/*
 * composite_add_u8: dst[i] = saturate(dst[i] + src[i]) for w bytes.
 * The bulk of the row goes through the MMX saturating byte add
 * (PADDUSB), eight bytes per store; ragged head and tail bytes use a
 * scalar saturating add so the 64-bit stores stay 8-byte aligned.
 */
static void
fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w)
{
    int sum;

    /* Scalar head: advance until dst sits on an 8-byte boundary. */
    while (w && (unsigned long)dst & 7)
    {
	sum = *dst + *src;
	/* sum <= 510, so (sum >> 8) is 0 or 1; OR-ing with 0 or ~0
	 * clamps the stored byte to 0xff on overflow. */
	*dst = sum | (0 - (sum >> 8));

	dst++;
	src++;
	w--;
    }

    /* Aligned middle: eight bytes at a time. */
    for (; w >= 8; w -= 8, dst += 8, src += 8)
	*(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);

    /* Scalar tail: remaining 0-7 bytes. */
    while (w)
    {
	sum = *dst + *src;
	*dst = sum | (0 - (sum >> 8));

	dst++;
	src++;
	w--;
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
}
/* Register this function with liboil as an MMX implementation of composite_add_u8. */
OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX);
1910
/*
 * composite_add_argb: per-byte saturating add of w ARGB8888 pixels,
 * src into dst.  Pairs of pixels are handled with one 64-bit MMX add
 * once dst is 8-byte aligned; lone pixels are routed through the
 * 32-bit <-> MMX register moves.
 */
static void
fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w)
{
    /* Head: single pixels until dst is 8-byte aligned. */
    while (w && (unsigned long)dst & 7)
    {
	__m64 a = _mm_cvtsi32_si64(*src);
	__m64 b = _mm_cvtsi32_si64(*dst);
	*dst = _mm_cvtsi64_si32(_mm_adds_pu8(a, b));

	dst++;
	src++;
	w--;
    }

    /* Middle: two pixels per 64-bit saturating add. */
    for (; w >= 2; w -= 2, dst += 2, src += 2)
	*(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);

    /* Tail: at most one pixel can remain here. */
    if (w)
    {
	__m64 a = _mm_cvtsi32_si64(*src);
	__m64 b = _mm_cvtsi32_si64(*dst);
	*dst = _mm_cvtsi64_si32(_mm_adds_pu8(a, b));
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
}
/* Register this function with liboil as an MMX/SSE implementation of composite_add_argb. */
OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
1941
1942 #if 0
/* (Compiled out: inside #if 0.)
 * Compute, for drawable 'drw', the stride in units of 'type' and a
 * pointer 'line' to pixel (x, y), using fbGetDrawable's bits/stride/
 * offset outputs. */
#define GetStart(drw,x,y,type,stride,line,bpp) {\
    FbBits *__bits__; \
    FbStride __stride__; \
    int __xoff__,__yoff__; \
							\
    fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__); \
    (stride) = __stride__ * sizeof (FbBits) / sizeof (type); \
    (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \
}
1952
/* (Compiled out: inside #if 0.)
 * Fill a width x height rectangle of pDraw at (x, y) with the solid
 * value 'xor', using 64-byte unrolled MMX stores for the bulk of each
 * row.  Returns FALSE (caller must fall back) for depths other than
 * 16/32 bpp, or for 16 bpp when the two 16-bit halves of 'xor' differ
 * (the fill replicates xor as a full 32/64-bit pattern). */
Bool
fbSolidFillmmx (DrawablePtr pDraw,
		int x,
		int y,
		int width,
		int height,
		FbBits xor)
{
    FbStride stride;
    int bpp;
    ullong fill;
    __m64 vfill;
    CARD32 byte_width;
    CARD8 *byte_line;
    FbBits *bits;
    int xoff, yoff;

    CHECKPOINT();

    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);

    /* 16 bpp only works when xor is the same 16-bit value repeated. */
    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
	return FALSE;

    if (bpp != 16 && bpp != 32)
	return FALSE;

    /* Convert stride/start to byte units so one loop serves both depths. */
    if (bpp == 16)
    {
	stride = stride * sizeof (FbBits) / 2;
	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
	byte_width = 2 * width;
	stride *= 2;
    }
    else
    {
	stride = stride * sizeof (FbBits) / 4;
	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
	byte_width = 4 * width;
	stride *= 4;
    }

    /* xor replicated to 64 bits for the MMX stores. */
    fill = ((ullong)xor << 32) | xor;
    vfill = (__m64)fill;

    while (height--)
    {
	int w;
	CARD8 *d = byte_line;
	byte_line += stride;
	w = byte_width;

	/* Prologue: 2- then 4-byte stores until d is 8-byte aligned. */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(CARD16 *)d = xor;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(CARD32 *)d = xor;

	    w -= 4;
	    d += 4;
	}

	/* Main loop: 64 bytes per iteration, unrolled 8x. */
	while (w >= 64)
	{
	    *(__m64*) (d + 0) = vfill;
	    *(__m64*) (d + 8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;

	    w -= 64;
	    d += 64;
	}
	/* Epilogue: 4-byte then a final 2-byte store. */
	while (w >= 4)
	{
	    *(CARD32 *)d = xor;

	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(CARD16 *)d = xor;
	    w -= 2;
	    d += 2;
	}
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
    return TRUE;
}
2052
/* (Compiled out: inside #if 0.)
 * Copy a width x height rectangle from pSrc at (src_x, src_y) to pDst
 * at (dst_x, dst_y) with 64-byte unrolled MMX moves.  Returns FALSE
 * for unsupported depths (only 16 and 32 bpp, and only when they
 * match).
 *
 * NOTE(review): rows are always copied top-to-bottom, left-to-right;
 * overlapping copies within one drawable would need direction handling
 * -- confirm callers never pass overlapping regions. */
Bool
fbCopyAreammx (DrawablePtr pSrc,
	       DrawablePtr pDst,
	       int src_x,
	       int src_y,
	       int dst_x,
	       int dst_y,
	       int width,
	       int height)
{
    FbBits * src_bits;
    FbStride src_stride;
    int src_bpp;
    int src_xoff;
    int src_yoff;

    FbBits * dst_bits;
    FbStride dst_stride;
    int dst_bpp;
    int dst_xoff;
    int dst_yoff;

    CARD8 * src_bytes;
    CARD8 * dst_bytes;
    int byte_width;

    fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
    fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);

    if (src_bpp != 16 && src_bpp != 32)
	return FALSE;

    if (dst_bpp != 16 && dst_bpp != 32)
	return FALSE;

    if (src_bpp != dst_bpp)
    {
	return FALSE;
    }

    /* Convert strides/starts to byte units so one loop serves both depths. */
    if (src_bpp == 16)
    {
	src_stride = src_stride * sizeof (FbBits) / 2;
	dst_stride = dst_stride * sizeof (FbBits) / 2;
	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else
    {
	src_stride = src_stride * sizeof (FbBits) / 4;
	dst_stride = dst_stride * sizeof (FbBits) / 4;
	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }

    while (height--)
    {
	int w;
	CARD8 *s = src_bytes;
	CARD8 *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* Prologue: 2- then 4-byte copies until d is 8-byte aligned
	 * (alignment is checked on the destination only). */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(CARD16 *)d = *(CARD16 *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(CARD32 *)d = *(CARD32 *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Main loop: 64 bytes per iteration, unrolled 8x. */
	while (w >= 64)
	{
	    *(__m64 *)(d + 0) = *(__m64 *)(s + 0);
	    *(__m64 *)(d + 8) = *(__m64 *)(s + 8);
	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
	    w -= 64;
	    s += 64;
	    d += 64;
	}
	/* Epilogue: 4-byte then a final 2-byte copy. */
	while (w >= 4)
	{
	    *(CARD32 *)d = *(CARD32 *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(CARD16 *)d = *(CARD16 *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    /* Leave MMX state so subsequent x87 FPU code is safe. */
    _mm_empty();
    return TRUE;
}
2174
/* (Compiled out: inside #if 0.)
 * Composite by straight copy: forwards to fbCopyAreammx on the two
 * drawables.  op, pMask, xMask and yMask are ignored, and the Bool
 * result of fbCopyAreammx (FALSE for unsupported depths) is discarded. */
void
fbCompositeCopyAreammx (CARD8 op,
			PicturePtr pSrc,
			PicturePtr pMask,
			PicturePtr pDst,
			INT16 xSrc,
			INT16 ySrc,
			INT16 xMask,
			INT16 yMask,
			INT16 xDst,
			INT16 yDst,
			CARD16 width,
			CARD16 height)
{
    fbCopyAreammx (pSrc->pDrawable,
		   pDst->pDrawable,
		   xSrc, ySrc,
		   xDst, yDst,
		   width, height);
}
2195
2196 #if !defined(__amd64__) && !defined(__x86_64__)
2197
/* Decoded CPU feature flags returned by detectCPUFeatures().
 * NOTE(review): SSE = 0x6 is not a single bit -- it includes the
 * MMX_Extensions bit (0x2) plus 0x4, presumably because SSE implies
 * the extended MMX instructions; confirm before treating these as
 * independent flags. */
enum CPUFeatures {
    NoFeatures = 0,
    MMX = 0x1,
    MMX_Extensions = 0x2,
    SSE = 0x6,
    SSE2 = 0x8,
    CMOV = 0x10
};
2206
2207 static unsigned int detectCPUFeatures(void) {
2208 unsigned int result;
2209 char vendor[13];
2210 vendor[0] = 0;
2211 vendor[12] = 0;
2212 /* see p. 118 of amd64 instruction set manual Vol3 */
2213 __asm__ ("push %%ebx\n"
2214 "pushf\n"
2215 "pop %%eax\n"
2216 "mov %%eax, %%ebx\n"
2217 "xor $0x00200000, %%eax\n"
2218 "push %%eax\n"
2219 "popf\n"
2220 "pushf\n"
2221 "pop %%eax\n"
2222 "mov $0x0, %%edx\n"
2223 "xor %%ebx, %%eax\n"
2224 "jz skip\n"
2225
2226 "mov $0x00000000, %%eax\n"
2227 "cpuid\n"
2228 "mov %%ebx, %1\n"
2229 "mov %%edx, %2\n"
2230 "mov %%ecx, %3\n"
2231 "mov $0x00000001, %%eax\n"
2232 "cpuid\n"
2233 "skip:\n"
2234 "pop %%ebx\n"
2235 "mov %%edx, %0\n"
2236 : "=r" (result),
2237 "=m" (vendor[0]),
2238 "=m" (vendor[4]),
2239 "=m" (vendor[8])
2240 :
2241 : "%eax", "%ecx", "%edx"
2242 );
2243
2244 unsigned int features = 0;
2245 if (result) {
2246 /* result now contains the standard feature bits */
2247 if (result & (1 << 15))
2248 features |= CMOV;
2249 if (result & (1 << 23))
2250 features |= MMX;
2251 if (result & (1 << 25))
2252 features |= SSE;
2253 if (result & (1 << 26))
2254 features |= SSE2;
2255 if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) {
2256 /* check for AMD MMX extensions */
2257
2258 unsigned int result;
2259 __asm__("push %%ebx\n"
2260 "mov $0x80000000, %%eax\n"
2261 "cpuid\n"
2262 "xor %%edx, %%edx\n"
2263 "cmp $0x1, %%eax\n"
2264 "jge skip2\n"
2265 "mov $0x80000001, %%eax\n"
2266 "cpuid\n"
2267 "skip2:\n"
2268 "mov %%edx, %0\n"
2269 "pop %%ebx\n"
2270 : "=r" (result)
2271 :
2272 : "%eax", "%ecx", "%edx"
2273 );
2274 if (result & (1<<22))
2275 features |= MMX_Extensions;
2276 }
2277 }
2278 return features;
2279 }
2280
/* (Compiled out: inside #if 0.)
 * Cached check used to gate the MMX code paths.  Requires both base MMX
 * and the MMX extensions (this file uses _mm_shuffle_pi16, which -- per
 * the xmmintrin.h include comment above -- needs more than base MMX).
 * The result is computed once and cached; the first call is not
 * thread-safe. */
Bool
fbHaveMMX (void)
{
    static Bool initialized = FALSE;
    static Bool mmx_present;

    if (!initialized)
    {
	unsigned int features = detectCPUFeatures();
	mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions);
	initialized = TRUE;
    }

    return mmx_present;
}
2296 #endif /* __amd64__ */
2297
2298
2299 #endif
2300