/*
 * Copyright (C) 2005  Alex Volkov (codepro@usa.net)
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#ifndef SCALEMMX_H_
#define SCALEMMX_H_

#if !defined(SCALE_)
#	error Please define SCALE_(name) before including scalemmx.h
#endif

#if !defined(MSVC_ASM) && !defined(GCC_ASM)
#	error Please define target assembler (MSVC_ASM, GCC_ASM) before including scalemmx.h
#endif

// MMX defaults (no Format param)
#undef  SCALE_CMPRGB
#define SCALE_CMPRGB(p1, p2) \
			SCALE_(GetRGBDelta) (p1, p2)

#undef  SCALE_TOYUV
#define SCALE_TOYUV(p) \
			SCALE_(RGBtoYUV) (p)

#undef  SCALE_CMPYUV
#define SCALE_CMPYUV(p1, p2, toler) \
			SCALE_(CmpYUV) (p1, p2, toler)

#undef  SCALE_GETY
#define SCALE_GETY(p) \
			SCALE_(GetPixY) (p)
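
// Example (hypothetical usage sketch -- the real call sites live in the
// generic scaler templates that include this header):
//
//   if (SCALE_CMPYUV (pix[1], pix[2], some_tolerance))
//       /* close enough -- blend the two pixels */;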

// MMX transformation multipliers
extern Uint64 mmx_888to555_mult;
extern Uint64 mmx_Y_mult;
extern Uint64 mmx_U_mult;
extern Uint64 mmx_V_mult;
extern Uint64 mmx_YUV_threshold;

#define USE_YUV_LOOKUP

#if defined(MSVC_ASM)
//	MSVC inline assembly versions

#if defined(USE_MOVNTQ)
#	define MOVNTQ(addr, val)   movntq      [addr], val
#else
#	define MOVNTQ(addr, val)   movq        [addr], val
#endif

#if USE_PREFETCH == INTEL_PREFETCH
//	using Intel SSE non-temporal prefetch
#	define PREFETCH(addr)      prefetchnta [addr]
#	define HAVE_PREFETCH
#elif USE_PREFETCH == AMD_PREFETCH
//	using AMD 3DNOW! prefetch
#	define PREFETCH(addr)      prefetch    [addr]
#	define HAVE_PREFETCH
#else
//	no prefetch -- too bad for poor MMX-only souls
#	define PREFETCH(addr)
#	undef  HAVE_PREFETCH
#endif

#if defined(_MSC_VER) && (_MSC_VER >= 1300)
#	pragma warning( disable : 4799 )
#endif

static inline void
SCALE_(PlatInit) (void)
{
	__asm
	{
		// mm0 will be kept == 0 throughout
		// 0 is needed for bytes->words unpack instructions
		pxor       mm0, mm0
	}
}

static inline void
SCALE_(PlatDone) (void)
{
	// finish with MMX registers and yield them to FPU
	__asm
	{
		emms
	}
}

#if defined(HAVE_PREFETCH)
static inline void
SCALE_(Prefetch) (const void* p)
{
	__asm
	{
		mov       eax, p
		PREFETCH  (eax)
	}
}

#else /* Not HAVE_PREFETCH */

static inline void
SCALE_(Prefetch) (const void* p)
{
	(void)p; // silence compiler
	/* no-op */
}

#endif /* HAVE_PREFETCH */

// compute the RGB distance squared between 2 pixels
static inline int
SCALE_(GetRGBDelta) (Uint32 pix1, Uint32 pix2)
{
	__asm
	{
		// load pixels
		movd       mm1, pix1
		punpcklbw  mm1, mm0
		movd       mm2, pix2
		punpcklbw  mm2, mm0
		// get the difference between RGBA components
		psubw      mm1, mm2
		// squared and summed
		pmaddwd    mm1, mm1
		// finish summing the squares
		movq       mm2, mm1
		punpckhdq  mm2, mm0
		paddd      mm1, mm2
		// store result
		movd       eax, mm1
	}
}
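
// For illustration only: a scalar sketch of the computation above (kept
// out of the build; assumes four 8-bit channels packed into a Uint32,
// alpha included, exactly as the unpack/psubw/pmaddwd sequence sees them):
#if 0
static int
GetRGBDelta_scalar (Uint32 pix1, Uint32 pix2)
{
	int sum = 0;
	int i;
	for (i = 0; i < 32; i += 8)
	{	// squared difference of one 8-bit channel
		int d = (int)((pix1 >> i) & 0xff) - (int)((pix2 >> i) & 0xff);
		sum += d * d;
	}
	return sum;
}
#endif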

// retrieve the Y (intensity) component of the pixel's YUV
static inline int
SCALE_(GetPixY) (Uint32 pix)
{
	__asm
	{
		// load pixel
		movd       mm1, pix
		punpcklbw  mm1, mm0
		// process
		pmaddwd    mm1, mmx_Y_mult // RGB * Yvec
		movq       mm2, mm1   // finish summing
		punpckhdq  mm2, mm0   //   ditto
		paddd      mm1, mm2   //   ditto
		// store result
		movd       eax, mm1
		shr        eax, 14
	}
}
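
// For illustration only: a scalar sketch of the Y extraction above (kept
// out of the build).  The shr by 14 implies mmx_Y_mult holds per-channel
// luma weights in 2.14 fixed point; the weight values and channel order
// below are hypothetical BT.601-style stand-ins, not the real table:
#if 0
static int
GetPixY_scalar (Uint32 pix)
{
	// ~0.114, 0.587, 0.299 scaled by 2^14 -- placeholder values
	static const int Yw[4] = { 1868, 9617, 4899, 0 };
	int sum = 0;
	int i;
	for (i = 0; i < 4; ++i)
		sum += (int)((pix >> (i * 8)) & 0xff) * Yw[i];
	return sum >> 14;
}
#endif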

#ifdef USE_YUV_LOOKUP

// convert pixel RGB vector into YUV representation vector
static inline YUV_VECTOR
SCALE_(RGBtoYUV) (Uint32 pix)
{
	__asm
	{
		// convert RGB888 to 555
		movd       mm1, pix
		punpcklbw  mm1, mm0
		psrlw      mm1, 3    // 8->5 bit
		pmaddwd    mm1, mmx_888to555_mult // shuffle into the right channel order
		movq       mm2, mm1   // finish shuffling
		punpckhdq  mm2, mm0   //   ditto
		por        mm1, mm2   //   ditto

		// lookup the YUV vector
		movd       eax, mm1
		mov        eax, [RGB15_to_YUV + eax * 4]
	}
}
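
// For illustration only: a scalar sketch of the lookup path above (kept
// out of the build).  Each channel is reduced to 5 bits and packed into
// a 15-bit index; the channel order within the index is determined by
// mmx_888to555_mult and is an assumption here:
#if 0
static YUV_VECTOR
RGBtoYUV_scalar (Uint32 pix)
{
	Uint32 i5 = ((pix >> 9) & 0x7c00)  // top channel, 8->5 bits
			| ((pix >> 6) & 0x03e0)    // middle channel
			| ((pix >> 3) & 0x001f);   // bottom channel
	return RGB15_to_YUV[i5];
}
#endif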

// compare 2 pixels with respect to their YUV representations
// tolerance set by toler arg
// returns true: close; false: distant (difference > toler)
static inline bool
SCALE_(CmpYUV) (Uint32 pix1, Uint32 pix2, int toler)
{
	__asm
	{
		// convert RGB888 to 555
		movd       mm1, pix1
		punpcklbw  mm1, mm0
		psrlw      mm1, 3    // 8->5 bit
		movd       mm3, pix2
		punpcklbw  mm3, mm0
		psrlw      mm3, 3    // 8->5 bit
		pmaddwd    mm1, mmx_888to555_mult  // shuffle into the right channel order
		movq       mm2, mm1   // finish shuffling
		pmaddwd    mm3, mmx_888to555_mult  // shuffle into the right channel order
		movq       mm4, mm3   // finish shuffling
		punpckhdq  mm2, mm0   //   ditto
		por        mm1, mm2   //   ditto
		punpckhdq  mm4, mm0   //   ditto
		por        mm3, mm4   //   ditto

		// lookup the YUV vector
		movd       eax, mm1
		movd       edx, mm3
		movd       mm1, [RGB15_to_YUV + eax * 4]
		movq       mm4, mm1
		movd       mm2, [RGB15_to_YUV + edx * 4]

		// get abs difference between YUV components
#ifdef USE_PSADBW
		// we can use PSADBW and save ourselves some grief
		psadbw     mm1, mm2
		movd       edx, mm1
#else
		// no PSADBW -- have to do it the hard way
		psubusb    mm1, mm2
		psubusb    mm2, mm4
		por        mm1, mm2

		// sum the differences
		// XXX: technically, this produces a MAX diff of 510
		//  but we do not need anything bigger, currently
		movq       mm2, mm1
		psrlq      mm2, 8
		paddusb    mm1, mm2
		psrlq      mm2, 8
		paddusb    mm1, mm2
		movd       edx, mm1
		and        edx, 0xff
#endif /* USE_PSADBW */
		xor        eax, eax
		shl        edx, 1
		cmp        edx, toler
		// store result
		setle      al
	}
}
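
// For illustration only: a scalar sketch of the comparison above (kept
// out of the build).  It treats each YUV_VECTOR as four packed bytes,
// sums the per-channel absolute differences, doubles the sum (the shl
// above), and tests it against the tolerance:
#if 0
static bool
CmpYUV_scalar (Uint32 pix1, Uint32 pix2, int toler)
{
	Uint32 v1 = SCALE_(RGBtoYUV) (pix1);
	Uint32 v2 = SCALE_(RGBtoYUV) (pix2);
	int delta = 0;
	int i;
	for (i = 0; i < 32; i += 8)
	{
		int d = (int)((v1 >> i) & 0xff) - (int)((v2 >> i) & 0xff);
		delta += (d < 0) ? -d : d;
	}
	return (delta << 1) <= toler;
}
#endif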

#else /* Not USE_YUV_LOOKUP */

// convert pixel RGB vector into YUV representation vector
static inline YUV_VECTOR
SCALE_(RGBtoYUV) (Uint32 pix)
{
	__asm
	{
		movd       mm1, pix
		punpcklbw  mm1, mm0

		movq       mm2, mm1

		// Y vector multiply
		pmaddwd    mm1, mmx_Y_mult
		movq       mm4, mm1
		punpckhdq  mm4, mm0
		punpckldq  mm1, mm0 // clear out the high dword
		paddd      mm1, mm4
		psrad      mm1, 15

		movq       mm3, mm2

		// U vector multiply
		pmaddwd    mm2, mmx_U_mult
		psrad      mm2, 10

		// V vector multiply
		pmaddwd    mm3, mmx_V_mult
		psrad      mm3, 10

		// load (1|1|1|1) into mm4
		pcmpeqw    mm4, mm4
		psrlw      mm4, 15

		packssdw   mm3, mm2
		pmaddwd    mm3, mm4
		psrad      mm3, 5

		// load (64|64) into mm4
		punpcklwd  mm4, mm0
		pslld      mm4, 6
		paddd      mm3, mm4

		packssdw   mm3, mm1
		packuswb   mm3, mm0

		movd       eax, mm3
	}
}
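
// For illustration only: a scalar sketch of the computed path above (kept
// out of the build).  Y, U, and V are dot products of the unpacked
// channels with fixed-point weight vectors (the asm applies the U/V
// shift in two steps, 10 then 5, and biases U/V by 64).  Yw/Uw/Vw are
// placeholders for the words inside mmx_Y_mult/mmx_U_mult/mmx_V_mult,
// and the V,U,Y byte order is read off the packssdw/packuswb sequence:
#if 0
extern const short Yw[4], Uw[4], Vw[4]; // placeholder weight tables

static YUV_VECTOR
RGBtoYUV_scalar (Uint32 pix)
{
	int c[4], y, u, v, i;
	for (i = 0; i < 4; ++i)
		c[i] = (int)((pix >> (i * 8)) & 0xff);
	y = (c[0]*Yw[0] + c[1]*Yw[1] + c[2]*Yw[2] + c[3]*Yw[3]) >> 15;
	u = ((c[0]*Uw[0] + c[1]*Uw[1] + c[2]*Uw[2] + c[3]*Uw[3]) >> 15) + 64;
	v = ((c[0]*Vw[0] + c[1]*Vw[1] + c[2]*Vw[2] + c[3]*Vw[3]) >> 15) + 64;
	// the real code saturates each value to 0..255 when packing
	return ((Uint32)y << 16) | ((Uint32)u << 8) | (Uint32)v;
}
#endif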

// compare 2 pixels with respect to their YUV representations
// tolerance set by toler arg
// returns true: close; false: distant (difference > toler)
static inline bool
SCALE_(CmpYUV) (Uint32 pix1, Uint32 pix2, int toler)
{
	__asm
	{
		movd       mm1, pix1
		punpcklbw  mm1, mm0
		movd       mm2, pix2
		punpcklbw  mm2, mm0

		psubw      mm1, mm2
		movq       mm2, mm1

		// Y vector multiply
		pmaddwd    mm1, mmx_Y_mult
		movq       mm4, mm1
		punpckhdq  mm4, mm0
		paddd      mm1, mm4
		// abs()
		movq       mm4, mm1
		psrad      mm4, 31
		pxor       mm1, mm4
		psubd      mm1, mm4

		movq       mm3, mm2

		// U vector multiply
		pmaddwd    mm2, mmx_U_mult
		movq       mm4, mm2
		punpckhdq  mm4, mm0
		paddd      mm2, mm4
		// abs()
		movq       mm4, mm2
		psrad      mm4, 31
		pxor       mm2, mm4
		psubd      mm2, mm4

		paddd      mm1, mm2

		// V vector multiply
		pmaddwd    mm3, mmx_V_mult
		movq       mm4, mm3
		punpckhdq  mm3, mm0
		paddd      mm3, mm4
		// abs()
		movq       mm4, mm3
		psrad      mm4, 31
		pxor       mm3, mm4
		psubd      mm3, mm4

		paddd      mm1, mm3

		movd       edx, mm1
		xor        eax, eax
		shr        edx, 14
		cmp        edx, toler
		// store result
		setle      al
	}
}
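
// For illustration only: a scalar sketch of the comparison above (kept
// out of the build).  The per-channel deltas are pushed through the same
// Y/U/V weight vectors, the absolute weighted differences are summed,
// scaled down by 14 bits, and tested against the tolerance (Yw/Uw/Vw are
// placeholders again):
#if 0
static bool
CmpYUV_scalar (Uint32 pix1, Uint32 pix2, int toler)
{
	int d[4], dy, du, dv, i;
	for (i = 0; i < 4; ++i)
		d[i] = (int)((pix1 >> (i * 8)) & 0xff)
				- (int)((pix2 >> (i * 8)) & 0xff);
	dy = d[0]*Yw[0] + d[1]*Yw[1] + d[2]*Yw[2] + d[3]*Yw[3];
	du = d[0]*Uw[0] + d[1]*Uw[1] + d[2]*Uw[2] + d[3]*Uw[3];
	dv = d[0]*Vw[0] + d[1]*Vw[1] + d[2]*Vw[2] + d[3]*Vw[3];
	dy = (dy < 0) ? -dy : dy;
	du = (du < 0) ? -du : du;
	dv = (dv < 0) ? -dv : dv;
	return ((dy + du + dv) >> 14) <= toler;
}
#endif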

#endif /* USE_YUV_LOOKUP */

// Check if 2 pixels are different with respect to their
// YUV representations
// returns 0: close; ~0: distant
static inline int
SCALE_(DiffYUV) (Uint32 yuv1, Uint32 yuv2)
{
	__asm
	{
		// load YUV pixels
		movd       mm1, yuv1
		movq       mm4, mm1
		movd       mm2, yuv2
		// abs difference between channels
		psubusb    mm1, mm2
		psubusb    mm2, mm4
		por        mm1, mm2
		// compare to threshold
		psubusb    mm1, mmx_YUV_threshold

		movd       edx, mm1
		// transform eax to 0 or ~0
		xor        eax, eax
		or         edx, edx
		setz       al
		dec        eax
	}
}
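
// For illustration only: a scalar sketch of the threshold test above
// (kept out of the build).  Each byte of mmx_YUV_threshold holds a
// per-channel limit (the actual values live in the .c file); a
// saturating subtract leaves a nonzero byte exactly when some channel
// differs by more than its limit:
#if 0
static int
DiffYUV_scalar (Uint32 yuv1, Uint32 yuv2, Uint32 thres)
{
	int i;
	for (i = 0; i < 32; i += 8)
	{
		int d = (int)((yuv1 >> i) & 0xff) - (int)((yuv2 >> i) & 0xff);
		if ((d < 0 ? -d : d) > (int)((thres >> i) & 0xff))
			return ~0; // distant
	}
	return 0; // close
}
#endif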

// bilinear weighted blend of four pixels
// MSVC asm version
static inline void
SCALE_(Blend_bilinear) (const Uint32* row0, const Uint32* row1,
				Uint32* dst_p, Uint32 dlen)
{
	__asm
	{
		// EL0: setup vars
		mov        ebx, row0 // EL0

		// EL0: load pixels
		movq       mm1, [ebx] // EL0
		movq       mm2, mm1   // EL0: p[1] -> mm2
		PREFETCH   (ebx + 0x80)
		punpckhbw  mm2, mm0   // EL0: p[1] -> mm2
		mov        ebx, row1
		punpcklbw  mm1, mm0   // EL0: p[0] -> mm1
		movq       mm3, [ebx]
		movq       mm4, mm3   // EL0: p[3] -> mm4
		movq       mm6, mm2   // EL1.1: p[1] -> mm6
		PREFETCH   (ebx + 0x80)
		punpcklbw  mm3, mm0   // EL0: p[2] -> mm3
		movq       mm5, mm1   // EL1.1: p[0] -> mm5
		punpckhbw  mm4, mm0   // EL0: p[3] -> mm4

		mov        edi, dst_p // EL0

		// EL1: cache p[0] + 3*(p[1] + p[2]) + p[3] in mm6
		paddw      mm6, mm3   // EL1.2: p[1] + p[2] -> mm6
		// EL1: cache p[0] + p[1] + p[2] + p[3] in mm7
		movq       mm7, mm6   // EL1.3: p[1] + p[2] -> mm7
		// EL1: cache p[1] + 3*(p[0] + p[3]) + p[2] in mm5
		paddw      mm5, mm4   // EL1.2: p[0] + p[3] -> mm5
		psllw      mm6, 1     // EL1.4: 2*(p[1] + p[2]) -> mm6
		paddw      mm7, mm5   // EL1.4: sum(p[]) -> mm7
		psllw      mm5, 1     // EL1.5: 2*(p[0] + p[3]) -> mm5
		paddw      mm6, mm7   // EL1.5: p[0] + 3*(p[1] + p[2]) + p[3] -> mm6
		paddw      mm5, mm7   // EL1.6: p[1] + 3*(p[0] + p[3]) + p[2] -> mm5

		// EL2: pixel 0 math -- (9*p[0] + 3*(p[1] + p[2]) + p[3]) / 16
		psllw      mm1, 3     // EL2.1: 8*p[0] -> mm1
		paddw      mm1, mm6   // EL2.2: 9*p[0] + 3*(p[1] + p[2]) + p[3] -> mm1
		psrlw      mm1, 4     // EL2.3: sum[0]/16 -> mm1

		mov        edx, dlen  // EL0

		// EL3: pixel 1 math -- (9*p[1] + 3*(p[0] + p[3]) + p[2]) / 16
		psllw      mm2, 3     // EL3.1: 8*p[1] -> mm2
		paddw      mm2, mm5   // EL3.2: 9*p[1] + 3*(p[0] + p[3]) + p[2] -> mm2
		psrlw      mm2, 4     // EL3.3: sum[1]/16 -> mm2

		// EL2/3: store pixels 0 & 1
		packuswb   mm1, mm2   // EL2/3: pack into bytes
		MOVNTQ     (edi, mm1) // EL2/3: store 2 pixels

		// EL4: pixel 2 math -- (9*p[2] + 3*(p[0] + p[3]) + p[1]) / 16
		psllw      mm3, 3     // EL4.1: 8*p[2] -> mm3
		paddw      mm3, mm5   // EL4.2: 9*p[2] + 3*(p[0] + p[3]) + p[1] -> mm3
		psrlw      mm3, 4     // EL4.3: sum[2]/16 -> mm3

		// EL5: pixel 3 math -- (9*p[3] + 3*(p[1] + p[2]) + p[0]) / 16
		psllw      mm4, 3     // EL5.1: 8*p[3] -> mm4
		paddw      mm4, mm6   // EL5.2: 9*p[3] + 3*(p[1] + p[2]) + p[0] -> mm4
		psrlw      mm4, 4     // EL5.3: sum[3]/16 -> mm4

		// EL4/5: store pixels 2 & 3
		packuswb   mm3, mm4   // EL4/5: pack into bytes
		MOVNTQ     (edi + edx*4, mm3) // EL4/5: store 2 pixels
	}
}
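
// For illustration only: a scalar sketch of the blend above (kept out of
// the build).  The four source pixels p0,p1 (from row0) and p2,p3 (from
// row1) produce a 2x2 output block; each output pixel weights its nearest
// source 9/16, the two edge-adjacent sources 3/16 each, and the diagonal
// source 1/16, independently per 8-bit channel:
#if 0
static Uint32
Blend_one (Uint32 nearest, Uint32 edge1, Uint32 edge2, Uint32 diag)
{
	Uint32 out = 0;
	int i;
	for (i = 0; i < 32; i += 8)
	{
		Uint32 c = (9 * ((nearest >> i) & 0xff)
				+ 3 * (((edge1 >> i) & 0xff) + ((edge2 >> i) & 0xff))
				+ ((diag >> i) & 0xff)) >> 4;
		out |= c << i;
	}
	return out;
}
// dst_p[0]        = Blend_one (p0, p1, p2, p3);
// dst_p[1]        = Blend_one (p1, p0, p3, p2);
// dst_p[dlen]     = Blend_one (p2, p3, p0, p1);
// dst_p[dlen + 1] = Blend_one (p3, p2, p1, p0);
#endif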

// End MSVC_ASM

#elif defined(GCC_ASM)
//	GCC inline assembly versions

#if defined(USE_MOVNTQ)
#	define MOVNTQ(val, addr)   "movntq "   #val "," #addr
#else
#	define MOVNTQ(val, addr)   "movq "     #val "," #addr
#endif

#if USE_PREFETCH == INTEL_PREFETCH
//	using Intel SSE non-temporal prefetch
#	define PREFETCH(addr)      "prefetchnta " #addr
#elif USE_PREFETCH == AMD_PREFETCH
//	using AMD 3DNOW! prefetch
#	define PREFETCH(addr)      "prefetch "    #addr
#else
//	no prefetch -- too bad for poor MMX-only souls
#	define PREFETCH(addr)
#endif

#if defined(__x86_64__)
#	define A_REG   "rax"
#	define D_REG   "rdx"
#	define CLR_UPPER32(r)      "xor "  "%%" r "," "%%" r
#else
#	define A_REG   "eax"
#	define D_REG   "edx"
#	define CLR_UPPER32(r)
#endif

static inline void
SCALE_(PlatInit) (void)
{
	__asm__ (
		// mm0 will be kept == 0 throughout
		// 0 is needed for bytes->words unpack instructions
		"pxor       %%mm0, %%mm0 \n\t"

	: /* nothing */
	: /* nothing */
	);
}

static inline void
SCALE_(PlatDone) (void)
{
	// finish with MMX registers and yield them to FPU
	__asm__ (
		"emms \n\t"
	: /* nothing */ : /* nothing */
	);
}

static inline void
SCALE_(Prefetch) (const void* p)
{
	// the "m" operand must be the pointed-to data, not the pointer
	// variable itself, for the prefetch to hit the right cache line
	__asm__ __volatile__ ("" PREFETCH (%0) : /*nothing*/ : "m" (*(const char *)p) );
}

// compute the RGB distance squared between 2 pixels
static inline int
SCALE_(GetRGBDelta) (Uint32 pix1, Uint32 pix2)
{
	int res;

	__asm__ (
		// load pixels
		"movd       %1, %%mm1    \n\t"
		"punpcklbw  %%mm0, %%mm1 \n\t"
		"movd       %2, %%mm2    \n\t"
		"punpcklbw  %%mm0, %%mm2 \n\t"
		// get the difference between RGBA components
		"psubw      %%mm2, %%mm1 \n\t"
		// squared and summed
		"pmaddwd    %%mm1, %%mm1 \n\t"
		// finish summing the squares
		"movq       %%mm1, %%mm2 \n\t"
		"punpckhdq  %%mm0, %%mm2 \n\t"
		"paddd      %%mm2, %%mm1 \n\t"
		// store result
		"movd       %%mm1, %0    \n\t"

	: /*0*/"=rm" (res)
	: /*1*/"rm" (pix1), /*2*/"rm" (pix2)
	);

	return res;
}

// retrieve the Y (intensity) component of the pixel's YUV
static inline int
SCALE_(GetPixY) (Uint32 pix)
{
	int ret;

	__asm__ (
		// load pixel
		"movd       %1, %%mm1    \n\t"
		"punpcklbw  %%mm0, %%mm1 \n\t"
		// process
		"pmaddwd    %2, %%mm1    \n\t" // R,G,B * Yvec
		"movq       %%mm1, %%mm2 \n\t" // finish summing
		"punpckhdq  %%mm0, %%mm2 \n\t" //   ditto
		"paddd      %%mm2, %%mm1 \n\t" //   ditto
		// store index
		"movd       %%mm1, %0    \n\t"

	: /*0*/"=r" (ret)
	: /*1*/"rm" (pix), /*2*/"m" (mmx_Y_mult)
	);
	return ret >> 14;
}

#ifdef USE_YUV_LOOKUP

// convert pixel RGB vector into YUV representation vector
static inline YUV_VECTOR
SCALE_(RGBtoYUV) (Uint32 pix)
{
	int i;

	__asm__ (
		// convert RGB888 to 555
		"movd       %1, %%mm1 \n\t"
		"punpcklbw  %%mm0, %%mm1 \n\t"
		"psrlw      $3, %%mm1    \n\t"  // 8->5 bit
		"pmaddwd    %2, %%mm1    \n\t"  // shuffle into the right channel order
		"movq       %%mm1, %%mm2 \n\t"  // finish shuffling
		"punpckhdq  %%mm0, %%mm2 \n\t"  //   ditto
		"por        %%mm2, %%mm1 \n\t"  //   ditto
		"movd       %%mm1, %0    \n\t"

	: /*0*/"=rm" (i)
	: /*1*/"rm" (pix), /*2*/"m" (mmx_888to555_mult)
	);
	return RGB15_to_YUV[i];
}

// compare 2 pixels with respect to their YUV representations
// tolerance set by toler arg
// returns true: close; false: distant (difference > toler)
static inline bool
SCALE_(CmpYUV) (Uint32 pix1, Uint32 pix2, int toler)
{
	int delta;

	__asm__ (
		"movd       %1, %%mm1 \n\t"
		"movd       %2, %%mm3 \n\t"

		// convert RGB888 to 555
		// this is somewhat parallelized
		"punpcklbw  %%mm0, %%mm1 \n\t"
		CLR_UPPER32 (A_REG)     "\n\t"
		"psrlw      $3, %%mm1    \n\t" // 8->5 bit
		"punpcklbw  %%mm0, %%mm3 \n\t"
		"psrlw      $3, %%mm3    \n\t" // 8->5 bit
		"pmaddwd    %4, %%mm1    \n\t" // shuffle into the right channel order
		"movq       %%mm1, %%mm2 \n\t" // finish shuffling
		"pmaddwd    %4, %%mm3    \n\t" // shuffle into the right channel order
		CLR_UPPER32 (D_REG)     "\n\t"
		"movq       %%mm3, %%mm4 \n\t" // finish shuffling
		"punpckhdq  %%mm0, %%mm2 \n\t" //   ditto
		"por        %%mm2, %%mm1 \n\t" //   ditto
		"punpckhdq  %%mm0, %%mm4 \n\t" //   ditto
		"por        %%mm4, %%mm3 \n\t" //   ditto

		// lookup the YUV vector
		"movd       %%mm1, %%eax \n\t"
		"movd       %%mm3, %%edx \n\t"
		"movd       (%3, %%" A_REG ", 4), %%mm1  \n\t"
		"movq       %%mm1, %%mm4 \n\t"
		"movd       (%3, %%" D_REG ", 4), %%mm2  \n\t"

		// get abs difference between YUV components
#ifdef USE_PSADBW
		// we can use PSADBW and save ourselves some grief
		"psadbw     %%mm2, %%mm1 \n\t"
		"movd       %%mm1, %0    \n\t"
#else
		// no PSADBW -- have to do it the hard way
		"psubusb    %%mm2, %%mm1 \n\t"
		"psubusb    %%mm4, %%mm2 \n\t"
		"por        %%mm2, %%mm1 \n\t"

		// sum the differences
		//  technically, this produces a MAX diff of 510
		//  but we do not need anything bigger, currently
		"movq       %%mm1, %%mm2 \n\t"
		"psrlq      $8, %%mm2    \n\t"
		"paddusb    %%mm2, %%mm1 \n\t"
		"psrlq      $8, %%mm2    \n\t"
		"paddusb    %%mm2, %%mm1 \n\t"
		// store intermediate delta
		"movd       %%mm1, %0    \n\t"
		"andl       $0xff, %0    \n\t"
#endif /* USE_PSADBW */
	: /*0*/"=rm" (delta)
	: /*1*/"rm" (pix1), /*2*/"rm" (pix2),
		/*3*/"r" (RGB15_to_YUV),
		/*4*/"m" (mmx_888to555_mult)
	: "%" A_REG, "%" D_REG, "cc"
	);

	return (delta << 1) <= toler;
}

#endif /* USE_YUV_LOOKUP */

// Check if 2 pixels are different with respect to their
// YUV representations
// returns 0: close; ~0: distant
static inline int
SCALE_(DiffYUV) (Uint32 yuv1, Uint32 yuv2)
{
	sint32 ret;

	__asm__ (
		// load YUV pixels
		"movd       %1, %%mm1    \n\t"
		"movq       %%mm1, %%mm4 \n\t"
		"movd       %2, %%mm2    \n\t"
		// abs difference between channels
		"psubusb    %%mm2, %%mm1 \n\t"
		"psubusb    %%mm4, %%mm2 \n\t"
		CLR_UPPER32 (D_REG)     "\n\t"
		"por        %%mm2, %%mm1 \n\t"
		// compare to threshold
		"psubusb    %3, %%mm1    \n\t"

		"movd       %%mm1, %%edx \n\t"
		// transform eax to 0 or ~0
		"xor        %%" A_REG ", %%" A_REG "\n\t"
		"or         %%" D_REG ", %%" D_REG "\n\t"
		"setz       %%al         \n\t"
		"dec        %%" A_REG "  \n\t"

	: /*0*/"=a" (ret)
	: /*1*/"rm" (yuv1), /*2*/"rm" (yuv2),
		/*3*/"m" (mmx_YUV_threshold)
	: "%" D_REG, "cc"
	);
	return ret;
}

// Bilinear weighted blend of four pixels
// Function produces 4 blended pixels (in a 2x2 matrix) and writes them
// out to the surface
// GCC asm version
static inline void
SCALE_(Blend_bilinear) (const Uint32* row0, const Uint32* row1,
				Uint32* dst_p, Uint32 dlen)
{
	__asm__ (
		// EL0: load pixels
		"movq       %0, %%mm1      \n\t" // EL0
		"movq       %%mm1, %%mm2   \n\t" // EL0: p[1] -> mm2
		 PREFETCH   (0x80%0)      "\n\t"
		"punpckhbw  %%mm0, %%mm2   \n\t" // EL0: p[1] -> mm2
		"punpcklbw  %%mm0, %%mm1   \n\t" // EL0: p[0] -> mm1
		"movq       %1, %%mm3      \n\t"
		"movq       %%mm3, %%mm4   \n\t" // EL0: p[3] -> mm4
		"movq       %%mm2, %%mm6   \n\t" // EL1.1: p[1] -> mm6
		 PREFETCH   (0x80%1)      "\n\t"
		"punpcklbw  %%mm0, %%mm3   \n\t" // EL0: p[2] -> mm3
		"movq       %%mm1, %%mm5   \n\t" // EL1.1: p[0] -> mm5
		"punpckhbw  %%mm0, %%mm4   \n\t" // EL0: p[3] -> mm4

		// EL1: cache p[0] + 3*(p[1] + p[2]) + p[3] in mm6
		"paddw      %%mm3, %%mm6   \n\t" // EL1.2: p[1] + p[2] -> mm6
		// EL1: cache p[0] + p[1] + p[2] + p[3] in mm7
		"movq       %%mm6, %%mm7   \n\t" // EL1.3: p[1] + p[2] -> mm7
		// EL1: cache p[1] + 3*(p[0] + p[3]) + p[2] in mm5
		"paddw      %%mm4, %%mm5   \n\t" // EL1.2: p[0] + p[3] -> mm5
		"psllw      $1, %%mm6      \n\t" // EL1.4: 2*(p[1] + p[2]) -> mm6
		"paddw      %%mm5, %%mm7   \n\t" // EL1.4: sum(p[]) -> mm7
		"psllw      $1, %%mm5      \n\t" // EL1.5: 2*(p[0] + p[3]) -> mm5
		"paddw      %%mm7, %%mm6   \n\t" // EL1.5: p[0] + 3*(p[1] + p[2]) + p[3] -> mm6
		"paddw      %%mm7, %%mm5   \n\t" // EL1.6: p[1] + 3*(p[0] + p[3]) + p[2] -> mm5

		// EL2: pixel 0 math -- (9*p[0] + 3*(p[1] + p[2]) + p[3]) / 16
		"psllw      $3, %%mm1      \n\t" // EL2.1: 8*p[0] -> mm1
		"paddw      %%mm6, %%mm1   \n\t" // EL2.2: 9*p[0] + 3*(p[1] + p[2]) + p[3] -> mm1
		"psrlw      $4, %%mm1      \n\t" // EL2.3: sum[0]/16 -> mm1

		// EL3: pixel 1 math -- (9*p[1] + 3*(p[0] + p[3]) + p[2]) / 16
		"psllw      $3, %%mm2      \n\t" // EL3.1: 8*p[1] -> mm2
		"paddw      %%mm5, %%mm2   \n\t" // EL3.2: 9*p[1] + 3*(p[0] + p[3]) + p[2] -> mm2
		"psrlw      $4, %%mm2      \n\t" // EL3.3: sum[1]/16 -> mm2

		// EL2/3: store pixels 0 & 1
		"packuswb   %%mm2, %%mm1   \n\t" // EL2/3: pack into bytes
		 MOVNTQ     (%%mm1, (%2)) "\n\t" // EL2/3: store 2 pixels

		// EL4: pixel 2 math -- (9*p[2] + 3*(p[0] + p[3]) + p[1]) / 16
		"psllw      $3, %%mm3      \n\t" // EL4.1: 8*p[2] -> mm3
		"paddw      %%mm5, %%mm3   \n\t" // EL4.2: 9*p[2] + 3*(p[0] + p[3]) + p[1] -> mm3
		"psrlw      $4, %%mm3      \n\t" // EL4.3: sum[2]/16 -> mm3

		// EL5: pixel 3 math -- (9*p[3] + 3*(p[1] + p[2]) + p[0]) / 16
		"psllw      $3, %%mm4      \n\t" // EL5.1: 8*p[3] -> mm4
		"paddw      %%mm6, %%mm4   \n\t" // EL5.2: 9*p[3] + 3*(p[1] + p[2]) + p[0] -> mm4
		"psrlw      $4, %%mm4      \n\t" // EL5.3: sum[3]/16 -> mm4

		// EL4/5: store pixels 2 & 3
		"packuswb   %%mm4, %%mm3   \n\t" // EL4/5: pack into bytes
		 MOVNTQ     (%%mm3, (%2,%3,4)) "\n\t" // EL4/5: store 2 pixels

	: /* nothing */
	: /*0*/"m" (*row0), /*1*/"m" (*row1), /*2*/"r" (dst_p),
			/*3*/"r" ((unsigned long)dlen) /* 'long' is for proper reg alloc on amd64 */
	: "memory"
	);
}

#undef A_REG
#undef D_REG
#undef CLR_UPPER32

#endif // GCC_ASM

#endif /* SCALEMMX_H_ */