/*
 * Copyright (C) 2005 Alex Volkov (codepro@usa.net)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#ifndef SCALEMMX_H_
#define SCALEMMX_H_

#if !defined(SCALE_)
#	error Please define SCALE_(name) before including scalemmx.h
#endif

#if !defined(MSVC_ASM) && !defined(GCC_ASM)
#	error Please define target assembler (MSVC_ASM, GCC_ASM) before including scalemmx.h
#endif
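
/*
 * Typical usage (an illustrative sketch, not lifted from the scaler
 * sources -- the instantiating file may use different macro names):
 *
 *     #define SCALE_(name) name##_MMX
 *     #define GCC_ASM
 *     #include "scalemmx.h"
 *
 * The SCALE_() macro namespaces every function defined here, so the same
 * implementation can be instantiated once per target instruction set.
 */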

// MMX defaults (no Format param)
#undef SCALE_CMPRGB
#define SCALE_CMPRGB(p1, p2) \
		SCALE_(GetRGBDelta) (p1, p2)

#undef SCALE_TOYUV
#define SCALE_TOYUV(p) \
		SCALE_(RGBtoYUV) (p)

#undef SCALE_CMPYUV
#define SCALE_CMPYUV(p1, p2, toler) \
		SCALE_(CmpYUV) (p1, p2, toler)

#undef SCALE_GETY
#define SCALE_GETY(p) \
		SCALE_(GetPixY) (p)

// MMX transformation multipliers
extern Uint64 mmx_888to555_mult;
extern Uint64 mmx_Y_mult;
extern Uint64 mmx_U_mult;
extern Uint64 mmx_V_mult;
extern Uint64 mmx_YUV_threshold;
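// Note: the multipliers are packed 4x16-bit word vectors (one word per
// channel) consumed by pmaddwd below; they are expected to be defined in
// the accompanying platform .c file.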

#define USE_YUV_LOOKUP
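// With USE_YUV_LOOKUP, RGBtoYUV() and CmpYUV() reduce each pixel to
// RGB555 and fetch a precomputed YUV vector from the RGB15_to_YUV table
// (one entry per 15-bit color) instead of multiplying out the YUV
// transform for every pixel.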

#if defined(MSVC_ASM)
// MSVC inline assembly versions

#if defined(USE_MOVNTQ)
#	define MOVNTQ(addr, val)  movntq [addr], val
#else
#	define MOVNTQ(addr, val)  movq [addr], val
#endif

#if USE_PREFETCH == INTEL_PREFETCH
// using Intel SSE non-temporal prefetch
#	define PREFETCH(addr)  prefetchnta [addr]
#	define HAVE_PREFETCH
#elif USE_PREFETCH == AMD_PREFETCH
// using AMD 3DNOW! prefetch
#	define PREFETCH(addr)  prefetch [addr]
#	define HAVE_PREFETCH
#else
// no prefetch -- too bad for poor MMX-only souls
#	define PREFETCH(addr)
#	undef HAVE_PREFETCH
#endif

#if defined(_MSC_VER) && (_MSC_VER >= 1300)
// silence C4799 ("function has no EMMS instruction"); MMX state is
// released explicitly in SCALE_(PlatDone)
#	pragma warning( disable : 4799 )
#endif

static inline void
SCALE_(PlatInit) (void)
{
	__asm
	{
		// mm0 will be kept == 0 throughout
		// 0 is needed for bytes->words unpack instructions
		pxor      mm0, mm0
	}
}

static inline void
SCALE_(PlatDone) (void)
{
	// finish with MMX registers and yield them to FPU
	__asm
	{
		emms
	}
}

#if defined(HAVE_PREFETCH)
static inline void
SCALE_(Prefetch) (const void* p)
{
	__asm
	{
		mov       eax, p
		PREFETCH (eax)
	}
}

#else /* Not HAVE_PREFETCH */

static inline void
SCALE_(Prefetch) (const void* p)
{
	(void)p; // silence compiler
	/* no-op */
}

#endif /* HAVE_PREFETCH */

// compute the RGB distance squared between 2 pixels
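// i.e. the sum over all four channels (alpha included) of (c1 - c2)^2,
// computed as a pmaddwd dot product of the difference vector with itself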
static inline int
SCALE_(GetRGBDelta) (Uint32 pix1, Uint32 pix2)
{
	__asm
	{
		// load pixels
		movd      mm1, pix1
		punpcklbw mm1, mm0
		movd      mm2, pix2
		punpcklbw mm2, mm0
		// get the difference between RGBA components
		psubw     mm1, mm2
		// squared and summed
		pmaddwd   mm1, mm1
		// finish summing the squares
		movq      mm2, mm1
		punpckhdq mm2, mm0
		paddd     mm1, mm2
		// store result
		movd      eax, mm1
	}
}

// retrieve the Y (intensity) component of pixel's YUV
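// a fixed-point dot product: Y = (pixel . Yvec) >> 14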
static inline int
SCALE_(GetPixY) (Uint32 pix)
{
	__asm
	{
		// load pixel
		movd      mm1, pix
		punpcklbw mm1, mm0
		// process
		pmaddwd   mm1, mmx_Y_mult // RGB * Yvec
		movq      mm2, mm1        // finish summing
		punpckhdq mm2, mm0        // ditto
		paddd     mm1, mm2        // ditto
		// store result
		movd      eax, mm1
		shr       eax, 14
	}
}

#ifdef USE_YUV_LOOKUP

// convert pixel RGB vector into YUV representation vector
static inline YUV_VECTOR
SCALE_(RGBtoYUV) (Uint32 pix)
{
	__asm
	{
		// convert RGB888 to 555
		movd      mm1, pix
		punpcklbw mm1, mm0
		psrlw     mm1, 3   // 8->5 bit
		pmaddwd   mm1, mmx_888to555_mult // shuffle into the right channel order
		movq      mm2, mm1 // finish shuffling
		punpckhdq mm2, mm0 // ditto
		por       mm1, mm2 // ditto

		// lookup the YUV vector
		movd      eax, mm1
		mov       eax, [RGB15_to_YUV + eax * 4]
	}
}

// compare 2 pixels with respect to their YUV representations
// tolerance set by toler arg
// returns true: close; false: distant (> toler)
static inline bool
SCALE_(CmpYUV) (Uint32 pix1, Uint32 pix2, int toler)
{
	__asm
	{
		// convert RGB888 to 555
		movd      mm1, pix1
		punpcklbw mm1, mm0
		psrlw     mm1, 3   // 8->5 bit
		movd      mm3, pix2
		punpcklbw mm3, mm0
		psrlw     mm3, 3   // 8->5 bit
		pmaddwd   mm1, mmx_888to555_mult // shuffle into the right channel order
		movq      mm2, mm1 // finish shuffling
		pmaddwd   mm3, mmx_888to555_mult // shuffle into the right channel order
		movq      mm4, mm3 // finish shuffling
		punpckhdq mm2, mm0 // ditto
		por       mm1, mm2 // ditto
		punpckhdq mm4, mm0 // ditto
		por       mm3, mm4 // ditto

		// lookup the YUV vector
		movd      eax, mm1
		movd      edx, mm3
		movd      mm1, [RGB15_to_YUV + eax * 4]
		movq      mm4, mm1
		movd      mm2, [RGB15_to_YUV + edx * 4]

		// get abs difference between YUV components
#ifdef USE_PSADBW
		// we can use PSADBW and save us some grief
		psadbw    mm1, mm2
		movd      edx, mm1
#else
		// no PSADBW -- have to do it the hard way
		psubusb   mm1, mm2
		psubusb   mm2, mm4
		por       mm1, mm2

		// sum the differences
		// XXX: technically, this produces a MAX diff of 510
		//  but we do not need anything bigger, currently
		movq      mm2, mm1
		psrlq     mm2, 8
		paddusb   mm1, mm2
		psrlq     mm2, 8
		paddusb   mm1, mm2
		movd      edx, mm1
		and       edx, 0xff
#endif /* USE_PSADBW */
		xor       eax, eax
		shl       edx, 1
		cmp       edx, toler
		// store result
		setle     al
	}
}

#else /* Not USE_YUV_LOOKUP */

// convert pixel RGB vector into YUV representation vector
static inline YUV_VECTOR
SCALE_(RGBtoYUV) (Uint32 pix)
{
	__asm
	{
		movd      mm1, pix
		punpcklbw mm1, mm0

		movq      mm2, mm1

		// Y vector multiply
		pmaddwd   mm1, mmx_Y_mult
		movq      mm4, mm1
		punpckhdq mm4, mm0
		punpckldq mm1, mm0 // clear out the high dword
		paddd     mm1, mm4
		psrad     mm1, 15

		movq      mm3, mm2

		// U vector multiply
		pmaddwd   mm2, mmx_U_mult
		psrad     mm2, 10

		// V vector multiply
		pmaddwd   mm3, mmx_V_mult
		psrad     mm3, 10

		// load (1|1|1|1) into mm4
		pcmpeqw   mm4, mm4
		psrlw     mm4, 15

		packssdw  mm3, mm2
		pmaddwd   mm3, mm4
		psrad     mm3, 5

		// load (64|64) into mm4
		punpcklwd mm4, mm0
		pslld     mm4, 6
		paddd     mm3, mm4

		packssdw  mm3, mm1
		packuswb  mm3, mm0

		movd      eax, mm3
	}
}

// compare 2 pixels with respect to their YUV representations
// tolerance set by toler arg
// returns true: close; false: distant (> toler)
static inline bool
SCALE_(CmpYUV) (Uint32 pix1, Uint32 pix2, int toler)
{
	__asm
	{
		movd      mm1, pix1
		punpcklbw mm1, mm0
		movd      mm2, pix2
		punpcklbw mm2, mm0

		psubw     mm1, mm2
		movq      mm2, mm1

		// Y vector multiply
		pmaddwd   mm1, mmx_Y_mult
		movq      mm4, mm1
		punpckhdq mm4, mm0
		paddd     mm1, mm4
		// abs(): x = (x ^ (x >> 31)) - (x >> 31)
		movq      mm4, mm1
		psrad     mm4, 31
		pxor      mm1, mm4
		psubd     mm1, mm4

		movq      mm3, mm2

		// U vector multiply
		pmaddwd   mm2, mmx_U_mult
		movq      mm4, mm2
		punpckhdq mm4, mm0
		paddd     mm2, mm4
		// abs()
		movq      mm4, mm2
		psrad     mm4, 31
		pxor      mm2, mm4
		psubd     mm2, mm4

		paddd     mm1, mm2

		// V vector multiply
		pmaddwd   mm3, mmx_V_mult
		movq      mm4, mm3
		punpckhdq mm3, mm0
		paddd     mm3, mm4
		// abs()
		movq      mm4, mm3
		psrad     mm4, 31
		pxor      mm3, mm4
		psubd     mm3, mm4

		paddd     mm1, mm3

		movd      edx, mm1
		xor       eax, eax
		shr       edx, 14
		cmp       edx, toler
		// store result
		setle     al
	}
}

#endif /* USE_YUV_LOOKUP */

// Check if 2 pixels are different with respect to their
// YUV representations
// returns 0: close; ~0: distant
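// Each channel is checked against its own byte of mmx_YUV_threshold:
// psubusb saturates at 0, so any nonzero byte left over means some
// channel exceeded its threshold.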
static inline int
SCALE_(DiffYUV) (Uint32 yuv1, Uint32 yuv2)
{
	__asm
	{
		// load YUV pixels
		movd      mm1, yuv1
		movq      mm4, mm1
		movd      mm2, yuv2
		// abs difference between channels
		psubusb   mm1, mm2
		psubusb   mm2, mm4
		por       mm1, mm2
		// compare to threshold
		psubusb   mm1, mmx_YUV_threshold

		movd      edx, mm1
		// transform eax to 0 or ~0
		xor       eax, eax
		or        edx, edx
		setz      al
		dec       eax
	}
}

// bilinear weighted blend of four pixels
// MSVC asm version
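// Each output pixel is a weighted average of the 2x2 input block, with
// weights (out of 16) of 9 for the nearest pixel, 3 for each of the two
// adjacent ones, and 1 for the diagonal, e.g.
//   dst[0] = (9*p[0] + 3*p[1] + 3*p[2] + 1*p[3]) / 16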
static inline void
SCALE_(Blend_bilinear) (const Uint32* row0, const Uint32* row1,
		Uint32* dst_p, Uint32 dlen)
{
	__asm
	{
		// EL0: setup vars
		mov       ebx, row0 // EL0

		// EL0: load pixels
		movq      mm1, [ebx] // EL0
		movq      mm2, mm1  // EL0: p[1] -> mm2
		PREFETCH (ebx + 0x80)
		punpckhbw mm2, mm0  // EL0: p[1] -> mm2
		mov       ebx, row1
		punpcklbw mm1, mm0  // EL0: p[0] -> mm1
		movq      mm3, [ebx]
		movq      mm4, mm3  // EL0: p[3] -> mm4
		movq      mm6, mm2  // EL1.1: p[1] -> mm6
		PREFETCH (ebx + 0x80)
		punpcklbw mm3, mm0  // EL0: p[2] -> mm3
		movq      mm5, mm1  // EL1.1: p[0] -> mm5
		punpckhbw mm4, mm0  // EL0: p[3] -> mm4

		mov       edi, dst_p // EL0

		// EL1: cache p[0] + 3*(p[1] + p[2]) + p[3] in mm6
		paddw     mm6, mm3  // EL1.2: p[1] + p[2] -> mm6
		// EL1: cache p[0] + p[1] + p[2] + p[3] in mm7
		movq      mm7, mm6  // EL1.3: p[1] + p[2] -> mm7
		// EL1: cache p[1] + 3*(p[0] + p[3]) + p[2] in mm5
		paddw     mm5, mm4  // EL1.2: p[0] + p[3] -> mm5
		psllw     mm6, 1    // EL1.4: 2*(p[1] + p[2]) -> mm6
		paddw     mm7, mm5  // EL1.4: sum(p[]) -> mm7
		psllw     mm5, 1    // EL1.5: 2*(p[0] + p[3]) -> mm5
		paddw     mm6, mm7  // EL1.5: p[0] + 3*(p[1] + p[2]) + p[3] -> mm6
		paddw     mm5, mm7  // EL1.6: p[1] + 3*(p[0] + p[3]) + p[2] -> mm5

		// EL2: pixel 0 math -- (9*p[0] + 3*(p[1] + p[2]) + p[3]) / 16
		psllw     mm1, 3    // EL2.1: 8*p[0] -> mm1
		paddw     mm1, mm6  // EL2.2: 9*p[0] + 3*(p[1] + p[2]) + p[3] -> mm1
		psrlw     mm1, 4    // EL2.3: sum[0]/16 -> mm1

		mov       edx, dlen // EL0

		// EL3: pixel 1 math -- (9*p[1] + 3*(p[0] + p[3]) + p[2]) / 16
		psllw     mm2, 3    // EL3.1: 8*p[1] -> mm2
		paddw     mm2, mm5  // EL3.2: 9*p[1] + 3*(p[0] + p[3]) + p[2] -> mm2
		psrlw     mm2, 4    // EL3.3: sum[1]/16 -> mm2

		// EL2/3: store pixels 0 & 1
		packuswb  mm1, mm2  // EL2/3: pack into bytes
		MOVNTQ (edi, mm1)   // EL2/3: store 2 pixels

		// EL4: pixel 2 math -- (9*p[2] + 3*(p[0] + p[3]) + p[1]) / 16
		psllw     mm3, 3    // EL4.1: 8*p[2] -> mm3
		paddw     mm3, mm5  // EL4.2: 9*p[2] + 3*(p[0] + p[3]) + p[1] -> mm3
		psrlw     mm3, 4    // EL4.3: sum[2]/16 -> mm3

		// EL5: pixel 3 math -- (9*p[3] + 3*(p[1] + p[2]) + p[0]) / 16
		psllw     mm4, 3    // EL5.1: 8*p[3] -> mm4
		paddw     mm4, mm6  // EL5.2: 9*p[3] + 3*(p[1] + p[2]) + p[0] -> mm4
		psrlw     mm4, 4    // EL5.3: sum[3]/16 -> mm4

		// EL4/5: store pixels 2 & 3
		packuswb  mm3, mm4  // EL4/5: pack into bytes
		MOVNTQ (edi + edx*4, mm3) // EL4/5: store 2 pixels
	}
}
// End MSVC_ASM

#elif defined(GCC_ASM)
// GCC inline assembly versions

#if defined(USE_MOVNTQ)
#	define MOVNTQ(val, addr)  "movntq " #val "," #addr
#else
#	define MOVNTQ(val, addr)  "movq " #val "," #addr
#endif

#if USE_PREFETCH == INTEL_PREFETCH
// using Intel SSE non-temporal prefetch
#	define PREFETCH(addr)  "prefetchnta " #addr
#elif USE_PREFETCH == AMD_PREFETCH
// using AMD 3DNOW! prefetch
#	define PREFETCH(addr)  "prefetch " #addr
#else
// no prefetch -- too bad for poor MMX-only souls
#	define PREFETCH(addr)
#endif

#if defined(__x86_64__)
#	define A_REG          "rax"
#	define D_REG          "rdx"
#	define CLR_UPPER32(r) "xor " "%%" r "," "%%" r
#else
#	define A_REG          "eax"
#	define D_REG          "edx"
#	define CLR_UPPER32(r)
#endif

static inline void
SCALE_(PlatInit) (void)
{
	__asm__ (
		// mm0 will be kept == 0 throughout
		// 0 is needed for bytes->words unpack instructions
		"pxor      %%mm0, %%mm0 \n\t"

		: /* nothing */
		: /* nothing */
	);
}

static inline void
SCALE_(PlatDone) (void)
{
	// finish with MMX registers and yield them to FPU
	__asm__ (
		"emms \n\t"
		: /* nothing */ : /* nothing */
	);
}

static inline void
SCALE_(Prefetch) (const void* p)
{
	// prefetch the data at p, not the pointer variable itself
	// (the MSVC version above does `mov eax, p` / `PREFETCH (eax)`)
	__asm__ __volatile__ ("" PREFETCH (%0) : /*nothing*/ : "m" (*(const char *)p) );
}

// compute the RGB distance squared between 2 pixels
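// i.e. the sum over all four channels (alpha included) of (c1 - c2)^2,
// computed as a pmaddwd dot product of the difference vector with itself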
static inline int
SCALE_(GetRGBDelta) (Uint32 pix1, Uint32 pix2)
{
	int res;

	__asm__ (
		// load pixels
		"movd      %1, %%mm1 \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		"movd      %2, %%mm2 \n\t"
		"punpcklbw %%mm0, %%mm2 \n\t"
		// get the difference between RGBA components
		"psubw     %%mm2, %%mm1 \n\t"
		// squared and summed
		"pmaddwd   %%mm1, %%mm1 \n\t"
		// finish summing the squares
		"movq      %%mm1, %%mm2 \n\t"
		"punpckhdq %%mm0, %%mm2 \n\t"
		"paddd     %%mm2, %%mm1 \n\t"
		// store result
		"movd      %%mm1, %0 \n\t"

		: /*0*/"=rm" (res)
		: /*1*/"rm" (pix1), /*2*/"rm" (pix2)
	);

	return res;
}

// retrieve the Y (intensity) component of pixel's YUV
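// a fixed-point dot product: Y = (pixel . Yvec) >> 14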
static inline int
SCALE_(GetPixY) (Uint32 pix)
{
	int ret;

	__asm__ (
		// load pixel
		"movd      %1, %%mm1 \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		// process
		"pmaddwd   %2, %%mm1 \n\t" // R,G,B * Yvec
		"movq      %%mm1, %%mm2 \n\t" // finish summing
		"punpckhdq %%mm0, %%mm2 \n\t" // ditto
		"paddd     %%mm2, %%mm1 \n\t" // ditto
		// store result
		"movd      %%mm1, %0 \n\t"

		: /*0*/"=r" (ret)
		: /*1*/"rm" (pix), /*2*/"m" (mmx_Y_mult)
	);
	return ret >> 14;
}

#ifdef USE_YUV_LOOKUP

// convert pixel RGB vector into YUV representation vector
static inline YUV_VECTOR
SCALE_(RGBtoYUV) (Uint32 pix)
{
	int i;

	__asm__ (
		// convert RGB888 to 555
		"movd      %1, %%mm1 \n\t"
		"punpcklbw %%mm0, %%mm1 \n\t"
		"psrlw     $3, %%mm1 \n\t" // 8->5 bit
		"pmaddwd   %2, %%mm1 \n\t" // shuffle into the right channel order
		"movq      %%mm1, %%mm2 \n\t" // finish shuffling
		"punpckhdq %%mm0, %%mm2 \n\t" // ditto
		"por       %%mm2, %%mm1 \n\t" // ditto
		"movd      %%mm1, %0 \n\t"

		: /*0*/"=rm" (i)
		: /*1*/"rm" (pix), /*2*/"m" (mmx_888to555_mult)
	);
	return RGB15_to_YUV[i];
}

// compare 2 pixels with respect to their YUV representations
// tolerance set by toler arg
// returns true: close; false: distant (> toler)
static inline bool
SCALE_(CmpYUV) (Uint32 pix1, Uint32 pix2, int toler)
{
	int delta;

	__asm__ (
		"movd      %1, %%mm1 \n\t"
		"movd      %2, %%mm3 \n\t"

		// convert RGB888 to 555
		// this is somewhat parallelized
		"punpcklbw %%mm0, %%mm1 \n\t"
		CLR_UPPER32 (A_REG) "\n\t"
		"psrlw     $3, %%mm1 \n\t" // 8->5 bit
		"punpcklbw %%mm0, %%mm3 \n\t"
		"psrlw     $3, %%mm3 \n\t" // 8->5 bit
		"pmaddwd   %4, %%mm1 \n\t" // shuffle into the right channel order
		"movq      %%mm1, %%mm2 \n\t" // finish shuffling
		"pmaddwd   %4, %%mm3 \n\t" // shuffle into the right channel order
		CLR_UPPER32 (D_REG) "\n\t"
		"movq      %%mm3, %%mm4 \n\t" // finish shuffling
		"punpckhdq %%mm0, %%mm2 \n\t" // ditto
		"por       %%mm2, %%mm1 \n\t" // ditto
		"punpckhdq %%mm0, %%mm4 \n\t" // ditto
		"por       %%mm4, %%mm3 \n\t" // ditto

		// lookup the YUV vector
		"movd      %%mm1, %%eax \n\t"
		"movd      %%mm3, %%edx \n\t"
		"movd      (%3, %%" A_REG ", 4), %%mm1 \n\t"
		"movq      %%mm1, %%mm4 \n\t"
		"movd      (%3, %%" D_REG ", 4), %%mm2 \n\t"

		// get abs difference between YUV components
#ifdef USE_PSADBW
		// we can use PSADBW and save us some grief
		"psadbw    %%mm2, %%mm1 \n\t"
		"movd      %%mm1, %0 \n\t"
#else
		// no PSADBW -- have to do it the hard way
		"psubusb   %%mm2, %%mm1 \n\t"
		"psubusb   %%mm4, %%mm2 \n\t"
		"por       %%mm2, %%mm1 \n\t"

		// sum the differences
		// technically, this produces a MAX diff of 510
		//  but we do not need anything bigger, currently
		"movq      %%mm1, %%mm2 \n\t"
		"psrlq     $8, %%mm2 \n\t"
		"paddusb   %%mm2, %%mm1 \n\t"
		"psrlq     $8, %%mm2 \n\t"
		"paddusb   %%mm2, %%mm1 \n\t"
		// store intermediate delta
		"movd      %%mm1, %0 \n\t"
		"andl      $0xff, %0 \n\t"
#endif /* USE_PSADBW */
		: /*0*/"=rm" (delta)
		: /*1*/"rm" (pix1), /*2*/"rm" (pix2),
		  /*3*/"r" (RGB15_to_YUV),
		  /*4*/"m" (mmx_888to555_mult)
		: "%" A_REG, "%" D_REG, "cc"
	);

	return (delta << 1) <= toler;
}

#endif /* USE_YUV_LOOKUP */

// Check if 2 pixels are different with respect to their
// YUV representations
// returns 0: close; ~0: distant
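// As in the MSVC version above: psubusb against mmx_YUV_threshold leaves
// a nonzero byte wherever a channel exceeds its per-channel threshold.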
static inline int
SCALE_(DiffYUV) (Uint32 yuv1, Uint32 yuv2)
{
	sint32 ret;

	__asm__ (
		// load YUV pixels
		"movd      %1, %%mm1 \n\t"
		"movq      %%mm1, %%mm4 \n\t"
		"movd      %2, %%mm2 \n\t"
		// abs difference between channels
		"psubusb   %%mm2, %%mm1 \n\t"
		"psubusb   %%mm4, %%mm2 \n\t"
		CLR_UPPER32 (D_REG) "\n\t"
		"por       %%mm2, %%mm1 \n\t"
		// compare to threshold
		"psubusb   %3, %%mm1 \n\t"

		"movd      %%mm1, %%edx \n\t"
		// transform eax to 0 or ~0
		"xor       %%" A_REG ", %%" A_REG "\n\t"
		"or        %%" D_REG ", %%" D_REG "\n\t"
		"setz      %%al \n\t"
		"dec       %%" A_REG " \n\t"

		: /*0*/"=a" (ret)
		: /*1*/"rm" (yuv1), /*2*/"rm" (yuv2),
		  /*3*/"m" (mmx_YUV_threshold)
		: "%" D_REG, "cc"
	);
	return ret;
}

// Bilinear weighted blend of four pixels
// Function produces 4 blended pixels (in a 2x2 matrix) and writes them
// out to the surface
// GCC asm version
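// Uses the same 9/3/3/1 (out of 16) weighting as the MSVC version above.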
static inline void
SCALE_(Blend_bilinear) (const Uint32* row0, const Uint32* row1,
		Uint32* dst_p, Uint32 dlen)
{
	__asm__ (
		// EL0: load pixels
		"movq      %0, %%mm1 \n\t" // EL0
		"movq      %%mm1, %%mm2 \n\t" // EL0: p[1] -> mm2
		PREFETCH (0x80%0) "\n\t"
		"punpckhbw %%mm0, %%mm2 \n\t" // EL0: p[1] -> mm2
		"punpcklbw %%mm0, %%mm1 \n\t" // EL0: p[0] -> mm1
		"movq      %1, %%mm3 \n\t"
		"movq      %%mm3, %%mm4 \n\t" // EL0: p[3] -> mm4
		"movq      %%mm2, %%mm6 \n\t" // EL1.1: p[1] -> mm6
		PREFETCH (0x80%1) "\n\t"
		"punpcklbw %%mm0, %%mm3 \n\t" // EL0: p[2] -> mm3
		"movq      %%mm1, %%mm5 \n\t" // EL1.1: p[0] -> mm5
		"punpckhbw %%mm0, %%mm4 \n\t" // EL0: p[3] -> mm4

		// EL1: cache p[0] + 3*(p[1] + p[2]) + p[3] in mm6
		"paddw     %%mm3, %%mm6 \n\t" // EL1.2: p[1] + p[2] -> mm6
		// EL1: cache p[0] + p[1] + p[2] + p[3] in mm7
		"movq      %%mm6, %%mm7 \n\t" // EL1.3: p[1] + p[2] -> mm7
		// EL1: cache p[1] + 3*(p[0] + p[3]) + p[2] in mm5
		"paddw     %%mm4, %%mm5 \n\t" // EL1.2: p[0] + p[3] -> mm5
		"psllw     $1, %%mm6 \n\t" // EL1.4: 2*(p[1] + p[2]) -> mm6
		"paddw     %%mm5, %%mm7 \n\t" // EL1.4: sum(p[]) -> mm7
		"psllw     $1, %%mm5 \n\t" // EL1.5: 2*(p[0] + p[3]) -> mm5
		"paddw     %%mm7, %%mm6 \n\t" // EL1.5: p[0] + 3*(p[1] + p[2]) + p[3] -> mm6
		"paddw     %%mm7, %%mm5 \n\t" // EL1.6: p[1] + 3*(p[0] + p[3]) + p[2] -> mm5

		// EL2: pixel 0 math -- (9*p[0] + 3*(p[1] + p[2]) + p[3]) / 16
		"psllw     $3, %%mm1 \n\t" // EL2.1: 8*p[0] -> mm1
		"paddw     %%mm6, %%mm1 \n\t" // EL2.2: 9*p[0] + 3*(p[1] + p[2]) + p[3] -> mm1
		"psrlw     $4, %%mm1 \n\t" // EL2.3: sum[0]/16 -> mm1

		// EL3: pixel 1 math -- (9*p[1] + 3*(p[0] + p[3]) + p[2]) / 16
		"psllw     $3, %%mm2 \n\t" // EL3.1: 8*p[1] -> mm2
		"paddw     %%mm5, %%mm2 \n\t" // EL3.2: 9*p[1] + 3*(p[0] + p[3]) + p[2] -> mm2
		"psrlw     $4, %%mm2 \n\t" // EL3.3: sum[1]/16 -> mm2

		// EL2/3: store pixels 0 & 1
		"packuswb  %%mm2, %%mm1 \n\t" // EL2/3: pack into bytes
		MOVNTQ (%%mm1, (%2)) "\n\t" // EL2/3: store 2 pixels

		// EL4: pixel 2 math -- (9*p[2] + 3*(p[0] + p[3]) + p[1]) / 16
		"psllw     $3, %%mm3 \n\t" // EL4.1: 8*p[2] -> mm3
		"paddw     %%mm5, %%mm3 \n\t" // EL4.2: 9*p[2] + 3*(p[0] + p[3]) + p[1] -> mm3
		"psrlw     $4, %%mm3 \n\t" // EL4.3: sum[2]/16 -> mm3

		// EL5: pixel 3 math -- (9*p[3] + 3*(p[1] + p[2]) + p[0]) / 16
		"psllw     $3, %%mm4 \n\t" // EL5.1: 8*p[3] -> mm4
		"paddw     %%mm6, %%mm4 \n\t" // EL5.2: 9*p[3] + 3*(p[1] + p[2]) + p[0] -> mm4
		"psrlw     $4, %%mm4 \n\t" // EL5.3: sum[3]/16 -> mm4

		// EL4/5: store pixels 2 & 3
		"packuswb  %%mm4, %%mm3 \n\t" // EL4/5: pack into bytes
		MOVNTQ (%%mm3, (%2,%3,4)) "\n\t" // EL4/5: store 2 pixels

		: /* nothing */
		: /*0*/"m" (*row0), /*1*/"m" (*row1), /*2*/"r" (dst_p),
		  /*3*/"r" ((unsigned long)dlen) /* 'long' is for proper reg alloc on amd64 */
		: "memory"
	);
}

#undef A_REG
#undef D_REG
#undef CLR_UPPER32

#endif // GCC_ASM

#endif /* SCALEMMX_H_ */
