1 /*
2  * This file is part of the Advance project.
3  *
4  * Copyright (C) 1999-2002 Andrea Mazzoleni
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20 
21 /*
 * This file contains a C and MMX implementation of the Scale2x effect.
23  *
 * You can find a high-level description of the effect at:
25  *
26  * http://scale2x.sourceforge.net/scale2x.html
27  *
28  * Alternatively at the previous license terms, you are allowed to use this
29  * code in your program with these conditions:
30  * - the program is not used in commercial activities.
31  * - the whole source code of the program is released with the binary.
32  * - derivative works of the program are allowed.
33  */
34 
35 /*
36  * Code adapted To OpenBOR by SX
37  * scale2x.c - Trying to scale 2x.
38  *
39  * Updated: 5/05/08 - SX
40  *
41  */
42 
43 
44 #include "gfx.h"
45 #include "gfxtypes.h"
46 
/*
 * Assembler directive placed in front of the loop label of the GCC inline
 * assembly below. ".p2align 4" pads to a 2^4 = 16 byte boundary, as suggested
 * in "Intel Optimization" for Pentium II.
 */
#define ASM_JUMP_ALIGN ".p2align 4\n"
49 
/*
 * Scale one row of 16-bit pixels to two output rows with the Scale2x rule
 * (portable C implementation).
 *
 * dst0  - upper output row, 2 * count pixels are written
 * dst1  - lower output row, 2 * count pixels are written
 * src0  - source row above the current one
 * src1  - current source row
 * src2  - source row below the current one
 * count - number of pixels in each source row
 *
 * An output pixel takes the value of the upper (dst0) or lower (dst1)
 * neighbour when that neighbour equals the horizontal neighbour on the same
 * side and differs from the opposite vertical and horizontal neighbours;
 * otherwise it is a copy of the current pixel. The outer half of the first
 * and of the last pixel is always a plain copy, so nothing outside the rows
 * is read.
 *
 * count == 0 writes nothing and count == 1 produces a plain 2x2 copy; without
 * these guards "count -= 2" would wrap around on the unsigned counter.
 */
static void internal_scale2x_16_def(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
{
	if (count == 0)
		return;

	if (count == 1)
	{
		/* a lone pixel has no horizontal neighbours to compare with */
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
		return;
	}

	/* first pixel: the left half is copied, only the right half is tested */
	dst0[0] = src1[0];
	dst1[0] = src1[0];
	dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	++src0;
	++src1;
	++src2;
	dst0 += 2;
	dst1 += 2;

	/* central pixels: both horizontal neighbours exist */
	count -= 2;
	while (count)
	{
		dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0]) ? src0[0] : src1[0];
		dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0]) ? src0[0] : src1[0];

		dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0] && src1[1] != src2[0]) ? src2[0] : src1[0];
		dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0] && src1[-1] != src2[0]) ? src2[0] : src1[0];

		++src0;
		++src1;
		++src2;
		dst0 += 2;
		dst1 += 2;
		--count;
	}

	/* last pixel: only the left half is tested, the right half is copied */
	dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	dst0[1] = src1[0];
	dst1[1] = src1[0];
}
111 
/*
 * Scale one row of 32-bit pixels to two output rows with the Scale2x rule
 * (portable C implementation).
 *
 * dst0  - upper output row, 2 * count pixels are written
 * dst1  - lower output row, 2 * count pixels are written
 * src0  - source row above the current one
 * src1  - current source row
 * src2  - source row below the current one
 * count - number of pixels in each source row
 *
 * An output pixel takes the value of the upper (dst0) or lower (dst1)
 * neighbour when that neighbour equals the horizontal neighbour on the same
 * side and differs from the opposite vertical and horizontal neighbours;
 * otherwise it is a copy of the current pixel. The outer half of the first
 * and of the last pixel is always a plain copy, so nothing outside the rows
 * is read.
 *
 * count == 0 writes nothing and count == 1 produces a plain 2x2 copy; without
 * these guards "count -= 2" would wrap around on the unsigned counter.
 */
static void internal_scale2x_32_def(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
{
	if (count == 0)
		return;

	if (count == 1)
	{
		/* a lone pixel has no horizontal neighbours to compare with */
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
		return;
	}

	/* first pixel: the left half is copied, only the right half is tested */
	dst0[0] = src1[0];
	dst1[0] = src1[0];
	dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	++src0;
	++src1;
	++src2;
	dst0 += 2;
	dst1 += 2;

	/* central pixels: both horizontal neighbours exist */
	count -= 2;
	while (count)
	{
		dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0]) ? src0[0] : src1[0];
		dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0]) ? src0[0] : src1[0];

		dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0] && src1[1] != src2[0]) ? src2[0] : src1[0];
		dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0] && src1[-1] != src2[0]) ? src2[0] : src1[0];

		++src0;
		++src1;
		++src2;
		dst0 += 2;
		dst1 += 2;
		--count;
	}

	/* last pixel: only the left half is tested, the right half is copied */
	dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	dst0[1] = src1[0];
	dst1[1] = src1[0];
}
173 
174 #ifdef MMX
/*
 * Produce ONE output row (2 * count 16-bit pixels) of the Scale2x effect with
 * MMX, four source pixels per run.
 *
 * dst   - output row, 2 * count pixels are written
 * src0  - source row whose pixels may replace the current ones (the caller
 *         passes the row above for the upper output row and the row below
 *         for the lower output row)
 * src1  - current source row
 * src2  - source row on the opposite side of src0
 * count - number of pixels in each source row
 *
 * Preconditions: count >= 8 and count a multiple of 4. The code always
 * executes a first and a final run and (count - 8) / 4 central runs; a
 * smaller count wraps the unsigned run counter.
 *
 * The pixel left of the row start and the pixel right of the row end are
 * taken as 0 ("fake black"), they are not read from memory.
 *
 * Per run: mm7 = current pixels, mm0 = left neighbours, mm1 = right
 * neighbours, mm6 = src0 pixels. A left output pixel becomes the src0 pixel
 * where left == src0 && left != src2 && left != right, and the mirrored test
 * is used for the right output pixel; all other outputs copy the current
 * pixel. emms is executed before returning.
 *
 * The GCC variant declares a "memory" clobber because the assembly reads the
 * source rows and writes the destination row through pointer operands only.
 */
static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
{
	/* always do the first and last run */
	count -= 2*4;

#ifdef __GNUC__
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"pxor %%mm0,%%mm0\n" /* use a fake black out of screen */
		"movq 0(%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $48,%%mm0\n"
		"psllq $48,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $16,%%mm2\n"
		"psrlq $16,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqw %%mm6,%%mm2\n"
		"pcmpeqw %%mm6,%%mm4\n"
		"pcmpeqw (%2),%%mm3\n"
		"pcmpeqw (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqw %%mm1,%%mm2\n"
		"pcmpeqw %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0: interleave left and right output pixels */
		"movq %%mm2,%%mm3\n"
		"punpcklwd %%mm4,%%mm2\n"
		"punpckhwd %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		/* central runs */
		"shr $2,%4\n"
		"jz 1f\n"
		ASM_JUMP_ALIGN
		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $48,%%mm0\n"
		"psllq $48,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $16,%%mm2\n"
		"psrlq $16,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqw %%mm6,%%mm2\n"
		"pcmpeqw %%mm6,%%mm4\n"
		"pcmpeqw (%2),%%mm3\n"
		"pcmpeqw (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqw %%mm1,%%mm2\n"
		"pcmpeqw %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpcklwd %%mm4,%%mm2\n"
		"punpckhwd %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"pxor %%mm1,%%mm1\n" /* use a fake black out of screen */
		"psrlq $48,%%mm0\n"
		"psllq $48,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $16,%%mm2\n"
		"psrlq $16,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqw %%mm6,%%mm2\n"
		"pcmpeqw %%mm6,%%mm4\n"
		"pcmpeqw (%2),%%mm3\n"
		"pcmpeqw (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqw %%mm1,%%mm2\n"
		"pcmpeqw %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpcklwd %%mm4,%%mm2\n"
		"punpckhwd %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"
		"emms\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
#else
	__asm {
	mov eax, src0;
	mov ebx, src1;
	mov ecx, src2;
	mov edx, dst;
	mov esi, count;

	/* first run */
	/* set the current, current_pre, current_next registers */
	pxor mm0,mm0; /* use a fake black out of screen */
	movq mm7, qword ptr [ebx];
	movq mm1, qword ptr [ebx + 8];
	psrlq mm0, 48;
	psllq mm1, 48;
	movq mm2, mm7;
	movq mm3, mm7;
	psllq mm2, 16;
	psrlq mm3, 16;
	por mm0, mm2;
	por mm1, mm3;

	/* current_upper */
	movq mm6, qword ptr [eax];

	/* compute the upper-left pixel for dst0 on mm2 */
	/* compute the upper-right pixel for dst0 on mm4 */
	movq mm2, mm0;
	movq mm4, mm1;
	movq mm3, mm0;
	movq mm5, mm1;
	pcmpeqw mm2, mm6;
	pcmpeqw mm4, mm6;
	pcmpeqw mm3, qword ptr [ecx];
	pcmpeqw mm5, qword ptr [ecx];
	pandn mm3,mm2;
	pandn mm5,mm4;
	movq mm2,mm0;
	movq mm4,mm1;
	pcmpeqw mm2,mm1;
	pcmpeqw mm4,mm0;
	pandn mm2,mm3;
	pandn mm4,mm5;
	movq mm3,mm2;
	movq mm5,mm4;
	pand mm2,mm6;
	pand mm4,mm6;
	pandn mm3,mm7;
	pandn mm5,mm7;
	por mm2,mm3;
	por mm4,mm5;

	/* set *dst0 */
	movq mm3,mm2;
	punpcklwd mm2,mm4;
	punpckhwd mm3,mm4;
	movq qword ptr [edx], mm2;
	movq qword ptr [edx + 8], mm3;

	/* next */
	add eax, 8;
	add ebx, 8;
	add ecx, 8;
	add edx, 16;

	/* central runs */
	shr esi, 2;
	jz label1;
	align 4;
  label0:

	/* set the current, current_pre, current_next registers */
	movq mm0, qword ptr [ebx-8];
	movq mm7, qword ptr [ebx];
	movq mm1, qword ptr [ebx+8];
	psrlq mm0,48;
	psllq mm1,48;
	movq mm2,mm7;
	movq mm3,mm7;
	psllq mm2,16;
	psrlq mm3,16;
	por mm0,mm2;
	por mm1,mm3;

	/* current_upper */
	movq mm6, qword ptr [eax];

	/* compute the upper-left pixel for dst0 on mm2 */
	/* compute the upper-right pixel for dst0 on mm4 */
	movq mm2,mm0;
	movq mm4,mm1;
	movq mm3,mm0;
	movq mm5,mm1;
	pcmpeqw mm2,mm6;
	pcmpeqw mm4,mm6;
	pcmpeqw mm3, qword ptr [ecx];
	pcmpeqw mm5, qword ptr [ecx];
	pandn mm3,mm2;
	pandn mm5,mm4;
	movq mm2,mm0;
	movq mm4,mm1;
	pcmpeqw mm2,mm1;
	pcmpeqw mm4,mm0;
	pandn mm2,mm3;
	pandn mm4,mm5;
	movq mm3,mm2;
	movq mm5,mm4;
	pand mm2,mm6;
	pand mm4,mm6;
	pandn mm3,mm7;
	pandn mm5,mm7;
	por mm2,mm3;
	por mm4,mm5;

	/* set *dst0 */
	movq mm3,mm2;
	punpcklwd mm2,mm4;
	punpckhwd mm3,mm4;
	movq qword ptr [edx], mm2;
	movq qword ptr [edx+8], mm3;

	/* next */
	add eax,8;
	add ebx,8;
	add ecx,8;
	add edx,16;

	dec esi;
	jnz label0;
  label1:

	/* final run */
	/* set the current, current_pre, current_next registers */
	movq mm0, qword ptr [ebx-8];
	movq mm7, qword ptr [ebx];
	pxor mm1,mm1; /* use a fake black out of screen */
	psrlq mm0,48;
	psllq mm1,48;
	movq mm2,mm7;
	movq mm3,mm7;
	psllq mm2,16;
	psrlq mm3,16;
	por mm0,mm2;
	por mm1,mm3;

	/* current_upper */
	movq mm6, qword ptr [eax];

	/* compute the upper-left pixel for dst0 on mm2 */
	/* compute the upper-right pixel for dst0 on mm4 */
	movq mm2,mm0;
	movq mm4,mm1;
	movq mm3,mm0;
	movq mm5,mm1;
	pcmpeqw mm2,mm6;
	pcmpeqw mm4,mm6;
	pcmpeqw mm3, qword ptr [ecx];
	pcmpeqw mm5, qword ptr [ecx];
	pandn mm3,mm2;
	pandn mm5,mm4;
	movq mm2,mm0;
	movq mm4,mm1;
	pcmpeqw mm2,mm1;
	pcmpeqw mm4,mm0;
	pandn mm2,mm3;
	pandn mm4,mm5;
	movq mm3,mm2;
	movq mm5,mm4;
	pand mm2,mm6;
	pand mm4,mm6;
	pandn mm3,mm7;
	pandn mm5,mm7;
	por mm2,mm3;
	por mm4,mm5;

	/* set *dst0 */
	movq mm3,mm2;
	punpcklwd mm2,mm4;
	punpckhwd mm3,mm4;
	movq qword ptr [edx], mm2;
	movq qword ptr [edx+8], mm3;

	mov src0, eax;
	mov src1, ebx;
	mov src2, ecx;
	mov dst, edx;
	mov count, esi;

	emms;
	}
#endif
}
553 
/*
 * Produce ONE output row (2 * count 32-bit pixels) of the Scale2x effect with
 * MMX, two source pixels per run.
 *
 * dst   - output row, 2 * count pixels are written
 * src0  - source row whose pixels may replace the current ones (the caller
 *         passes the row above for the upper output row and the row below
 *         for the lower output row)
 * src1  - current source row
 * src2  - source row on the opposite side of src0
 * count - number of pixels in each source row
 *
 * Preconditions: count >= 4 and count even. The code always executes a first
 * and a final run and (count - 4) / 2 central runs; a smaller count wraps the
 * unsigned run counter.
 *
 * The pixel left of the row start and the pixel right of the row end are
 * taken as 0 ("fake black"), they are not read from memory.
 *
 * Per run: mm7 = current pixels, mm0 = left neighbours, mm1 = right
 * neighbours, mm6 = src0 pixels. A left output pixel becomes the src0 pixel
 * where left == src0 && left != src2 && left != right, and the mirrored test
 * is used for the right output pixel; all other outputs copy the current
 * pixel. emms is executed before returning.
 *
 * The GCC variant declares a "memory" clobber because the assembly reads the
 * source rows and writes the destination row through pointer operands only.
 */
static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
{
	/* always do the first and last run */
	count -= 2*2;

#ifdef __GNUC__
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"pxor %%mm0,%%mm0\n" /* use a fake black out of screen */
		"movq 0(%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $32,%%mm0\n"
		"psllq $32,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $32,%%mm2\n"
		"psrlq $32,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqd %%mm6,%%mm2\n"
		"pcmpeqd %%mm6,%%mm4\n"
		"pcmpeqd (%2),%%mm3\n"
		"pcmpeqd (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqd %%mm1,%%mm2\n"
		"pcmpeqd %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0: interleave left and right output pixels */
		"movq %%mm2,%%mm3\n"
		"punpckldq %%mm4,%%mm2\n"
		"punpckhdq %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		/* central runs */
		"shr $1,%4\n"
		"jz 1f\n"
		ASM_JUMP_ALIGN
		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $32,%%mm0\n"
		"psllq $32,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $32,%%mm2\n"
		"psrlq $32,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqd %%mm6,%%mm2\n"
		"pcmpeqd %%mm6,%%mm4\n"
		"pcmpeqd (%2),%%mm3\n"
		"pcmpeqd (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqd %%mm1,%%mm2\n"
		"pcmpeqd %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpckldq %%mm4,%%mm2\n"
		"punpckhdq %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"pxor %%mm1,%%mm1\n" /* use a fake black out of screen */
		"psrlq $32,%%mm0\n"
		"psllq $32,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $32,%%mm2\n"
		"psrlq $32,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqd %%mm6,%%mm2\n"
		"pcmpeqd %%mm6,%%mm4\n"
		"pcmpeqd (%2),%%mm3\n"
		"pcmpeqd (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqd %%mm1,%%mm2\n"
		"pcmpeqd %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpckldq %%mm4,%%mm2\n"
		"punpckhdq %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"
		"emms\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
#else
	__asm {
	mov eax, src0;
	mov ebx, src1;
	mov ecx, src2;
	mov edx, dst;
	mov esi, count;

	/* first run */
	/* set the current, current_pre, current_next registers */
	pxor mm0,mm0;
	movq mm7,qword ptr [ebx];
	movq mm1,qword ptr [ebx + 8];
	psrlq mm0,32;
	psllq mm1,32;
	movq mm2,mm7;
	movq mm3,mm7;
	psllq mm2,32;
	psrlq mm3,32;
	por mm0,mm2;
	por mm1,mm3;

	/* current_upper */
	movq mm6,qword ptr [eax];

	/* compute the upper-left pixel for dst0 on mm2 */
	/* compute the upper-right pixel for dst0 on mm4 */
	movq mm2,mm0;
	movq mm4,mm1;
	movq mm3,mm0;
	movq mm5,mm1;
	pcmpeqd mm2,mm6;
	pcmpeqd mm4,mm6;
	pcmpeqd mm3,qword ptr [ecx];
	pcmpeqd mm5,qword ptr [ecx];
	pandn mm3,mm2;
	pandn mm5,mm4;
	movq mm2,mm0;
	movq mm4,mm1;
	pcmpeqd mm2,mm1;
	pcmpeqd mm4,mm0;
	pandn mm2,mm3;
	pandn mm4,mm5;
	movq mm3,mm2;
	movq mm5,mm4;
	pand mm2,mm6;
	pand mm4,mm6;
	pandn mm3,mm7;
	pandn mm5,mm7;
	por mm2,mm3;
	por mm4,mm5;

	/* set *dst0 */
	movq mm3,mm2;
	punpckldq mm2,mm4;
	punpckhdq mm3,mm4;
	movq qword ptr [edx],mm2;
	movq qword ptr [edx+8],mm3;

	/* next */
	add eax,8;
	add ebx,8;
	add ecx,8;
	add edx,16;

	/* central runs */
	shr esi,1;
	jz label1;
label0:

	/* set the current, current_pre, current_next registers */
	movq mm0,qword ptr [ebx-8];
	movq mm7,qword ptr [ebx];
	movq mm1,qword ptr [ebx+8];
	psrlq mm0,32;
	psllq mm1,32;
	movq mm2,mm7;
	movq mm3,mm7;
	psllq mm2,32;
	psrlq mm3,32;
	por mm0,mm2;
	por mm1,mm3;

	/* current_upper */
	movq mm6,qword ptr[eax];

	/* compute the upper-left pixel for dst0 on mm2 */
	/* compute the upper-right pixel for dst0 on mm4 */
	movq mm2,mm0;
	movq mm4,mm1;
	movq mm3,mm0;
	movq mm5,mm1;
	pcmpeqd mm2,mm6;
	pcmpeqd mm4,mm6;
	pcmpeqd mm3,qword ptr[ecx];
	pcmpeqd mm5,qword ptr[ecx];
	pandn mm3,mm2;
	pandn mm5,mm4;
	movq mm2,mm0;
	movq mm4,mm1;
	pcmpeqd mm2,mm1;
	pcmpeqd mm4,mm0;
	pandn mm2,mm3;
	pandn mm4,mm5;
	movq mm3,mm2;
	movq mm5,mm4;
	pand mm2,mm6;
	pand mm4,mm6;
	pandn mm3,mm7;
	pandn mm5,mm7;
	por mm2,mm3;
	por mm4,mm5;

	/* set *dst0 */
	movq mm3,mm2;
	punpckldq mm2,mm4;
	punpckhdq mm3,mm4;
	movq qword ptr [edx],mm2;
	movq qword ptr [edx+8],mm3;

	/* next */
	add eax,8;
	add ebx,8;
	add ecx,8;
	add edx,16;

	dec esi;
	jnz label0;
label1:

	/* final run */
	/* set the current, current_pre, current_next registers */
	movq mm0,qword ptr [ebx-8];
	movq mm7,qword ptr [ebx];
	pxor mm1,mm1;
	psrlq mm0,32;
	psllq mm1,32;
	movq mm2,mm7;
	movq mm3,mm7;
	psllq mm2,32;
	psrlq mm3,32;
	por mm0,mm2;
	por mm1,mm3;

	/* current_upper */
	movq mm6,qword ptr [eax];

	/* compute the upper-left pixel for dst0 on mm2 */
	/* compute the upper-right pixel for dst0 on mm4 */
	movq mm2,mm0;
	movq mm4,mm1;
	movq mm3,mm0;
	movq mm5,mm1;
	pcmpeqd mm2,mm6;
	pcmpeqd mm4,mm6;
	pcmpeqd mm3,qword ptr [ecx];
	pcmpeqd mm5,qword ptr [ecx];
	pandn mm3,mm2;
	pandn mm5,mm4;
	movq mm2,mm0;
	movq mm4,mm1;
	pcmpeqd mm2,mm1;
	pcmpeqd mm4,mm0;
	pandn mm2,mm3;
	pandn mm4,mm5;
	movq mm3,mm2;
	movq mm5,mm4;
	pand mm2,mm6;
	pand mm4,mm6;
	pandn mm3,mm7;
	pandn mm5,mm7;
	por mm2,mm3;
	por mm4,mm5;

	/* set *dst0 */
	movq mm3,mm2;
	punpckldq mm2,mm4;
	punpckhdq mm3,mm4;
	movq qword ptr [edx],mm2;
	movq qword ptr [edx+8],mm3;

	mov src0, eax;
	mov src1, ebx;
	mov src2, ecx;
	mov dst, edx;
	mov count, esi;

	emms;
	}
#endif
}
931 
/*
 * Scale one row of 16-bit pixels to two output rows using the MMX row
 * routine.
 *
 * dst0/dst1 - upper and lower output rows, 2 * count pixels each
 * src0      - source row above the current one
 * src1      - current source row
 * src2      - source row below the current one
 * count     - number of pixels in each source row
 *
 * The MMX routine handles four pixels per run and always executes a first
 * and a final run, so it needs count >= 8 and count a multiple of 4. Any
 * other count is handed to the portable C routine instead of letting the
 * run counter wrap. Note that the two paths treat the row ends differently:
 * the C routine copies the outer halves, the MMX routine compares against 0.
 *
 * The lower output row is obtained by calling the same routine with the
 * upper and lower source rows exchanged.
 */
static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
{
	if (count < 2*4 || (count % 4) != 0)
	{
		internal_scale2x_16_def(dst0, dst1, src0, src1, src2, count);
		return;
	}

	internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
	internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
}
938 
/*
 * Scale one row of 32-bit pixels to two output rows using the MMX row
 * routine.
 *
 * dst0/dst1 - upper and lower output rows, 2 * count pixels each
 * src0      - source row above the current one
 * src1      - current source row
 * src2      - source row below the current one
 * count     - number of pixels in each source row
 *
 * The MMX routine handles two pixels per run and always executes a first
 * and a final run, so it needs count >= 4 and an even count. Any other count
 * is handed to the portable C routine instead of letting the run counter
 * wrap. Note that the two paths treat the row ends differently: the C
 * routine copies the outer halves, the MMX routine compares against 0.
 *
 * The lower output row is obtained by calling the same routine with the
 * upper and lower source rows exchanged.
 */
static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
{
	if (count < 2*2 || (count % 2) != 0)
	{
		internal_scale2x_32_def(dst0, dst1, src0, src1, src2, count);
		return;
	}

	internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
	internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
}
945 #endif
946 
/*
 * Scale a 16-bit image to twice its width and height with the Scale2x rule.
 *
 * srcPtr   - first pixel of the source image
 * srcPitch - distance between source rows; rows are stepped by srcPitch/2
 *            u16 elements, i.e. the pitch is in bytes if u16 is 2 bytes wide
 * deltaPtr - not used by this function
 * dstPtr   - first pixel of the destination image; every source row yields
 *            two destination rows of 2 * width pixels
 * dstPitch - distance between destination rows, same unit as srcPitch
 * width    - source width in pixels
 * height   - source height in rows
 *
 * The first source row uses itself as the row above and the last source row
 * uses itself as the row below. When built with MMX and GetMMX() returns
 * non-zero (presumably a run-time MMX capability check - confirm against its
 * definition), the MMX row routine is used, otherwise the portable C one.
 *
 * Nothing is written for width <= 0 or height <= 0. A single-row image is
 * scaled with the row acting as its own upper and lower neighbour; the
 * original loop counter "height - 2" went negative in that case.
 */
void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 *deltaPtr, u8 *dstPtr, u32 dstPitch, int width, int height)
{
	void (*scale_row)(u16 *, u16 *, const u16 *, const u16 *, const u16 *, unsigned) = internal_scale2x_16_def;
	const u32 src_step = srcPitch/2;	/* one source row, in u16 elements */
	u16 *dst0 = (u16 *)dstPtr;
	u16 *dst1 = dst0 + (dstPitch/2);
	const u16 *src0 = (const u16 *)srcPtr;
	const u16 *src1;
	const u16 *src2;
	int count;

	(void)deltaPtr;

	if (width <= 0 || height <= 0)
		return;

#ifdef MMX
	if (GetMMX())
		scale_row = internal_scale2x_16_mmx;
#endif

	if (height == 1)
	{
		scale_row(dst0, dst1, src0, src0, src0, (unsigned)width);
		return;
	}

	src1 = src0 + src_step;
	src2 = src1 + src_step;

	/* first row: there is no row above, use the row itself */
	scale_row(dst0, dst1, src0, src0, src1, (unsigned)width);

	/* middle rows; dstPitch u16 elements skip the two rows just written */
	for (count = height - 2; count > 0; --count)
	{
		dst0 += dstPitch;
		dst1 += dstPitch;
		scale_row(dst0, dst1, src0, src1, src2, (unsigned)width);
		src0 = src1;
		src1 = src2;
		src2 += src_step;
	}

	/* last row: there is no row below, use the row itself */
	dst0 += dstPitch;
	dst1 += dstPitch;
	scale_row(dst0, dst1, src0, src1, src1, (unsigned)width);
}
999 
/*
 * Scale a 32-bit image to twice its width and height with the Scale2x rule.
 *
 * srcPtr   - first pixel of the source image
 * srcPitch - distance between source rows; rows are stepped by srcPitch/4
 *            u32 elements, i.e. the pitch is in bytes if u32 is 4 bytes wide
 * deltaPtr - not used by this function
 * dstPtr   - first pixel of the destination image; every source row yields
 *            two destination rows of 2 * width pixels
 * dstPitch - distance between destination rows, same unit as srcPitch
 * width    - source width in pixels
 * height   - source height in rows
 *
 * The first source row uses itself as the row above and the last source row
 * uses itself as the row below. When built with MMX and GetMMX() returns
 * non-zero (presumably a run-time MMX capability check - confirm against its
 * definition), the MMX row routine is used, otherwise the portable C one.
 *
 * Nothing is written for width <= 0 or height <= 0. A single-row image is
 * scaled with the row acting as its own upper and lower neighbour; the
 * original loop counter "height - 2" went negative in that case.
 */
void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 *deltaPtr, u8 *dstPtr, u32 dstPitch, int width, int height)
{
	void (*scale_row)(u32 *, u32 *, const u32 *, const u32 *, const u32 *, unsigned) = internal_scale2x_32_def;
	const u32 src_step = srcPitch/4;	/* one source row, in u32 elements */
	const u32 dst_step = dstPitch/2;	/* two destination rows, in u32 elements */
	u32 *dst0 = (u32 *)dstPtr;
	u32 *dst1 = dst0 + (dstPitch/4);
	const u32 *src0 = (const u32 *)srcPtr;
	const u32 *src1;
	const u32 *src2;
	int count;

	(void)deltaPtr;

	if (width <= 0 || height <= 0)
		return;

#ifdef MMX
	if (GetMMX())
		scale_row = internal_scale2x_32_mmx;
#endif

	if (height == 1)
	{
		scale_row(dst0, dst1, src0, src0, src0, (unsigned)width);
		return;
	}

	src1 = src0 + src_step;
	src2 = src1 + src_step;

	/* first row: there is no row above, use the row itself */
	scale_row(dst0, dst1, src0, src0, src1, (unsigned)width);

	/* middle rows */
	for (count = height - 2; count > 0; --count)
	{
		dst0 += dst_step;
		dst1 += dst_step;
		scale_row(dst0, dst1, src0, src1, src2, (unsigned)width);
		src0 = src1;
		src1 = src2;
		src2 += src_step;
	}

	/* last row: there is no row below, use the row itself */
	dst0 += dst_step;
	dst1 += dst_step;
	scale_row(dst0, dst1, src0, src1, src1, (unsigned)width);
}
1052