1 /*
2 * This file is part of the Advance project.
3 *
4 * Copyright (C) 1999-2002 Andrea Mazzoleni
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 /*
22 * This file contains a C and MMX implementation of the Scale2x effect.
23 *
24 * You can find a high level description of the effect at:
25 *
26 * http://scale2x.sourceforge.net/scale2x.html
27 *
28 * Alternatively at the previous license terms, you are allowed to use this
29 * code in your program with these conditions:
30 * - the program is not used in commercial activities.
31 * - the whole source code of the program is released with the binary.
32 * - derivative works of the program are allowed.
33 */
34
35 /*
36 * Code adapted To OpenBOR by SX
37 * scale2x.c - Trying to scale 2x.
38 *
39 * Updated: 5/05/08 - SX
40 *
41 */
42
43
44 #include "gfx.h"
45 #include "gfxtypes.h"
46
47 /* Suggested in "Intel Optimization" for Pentium II */
48 #define ASM_JUMP_ALIGN ".p2align 4\n"
49
/*
 * Scale one row of 16-bit pixels into two output rows with the Scale2x rule
 * (plain C implementation).
 *
 * Every pixel of the current row (src1) becomes a 2x2 block: the two upper
 * pixels go to dst0, the two lower pixels go to dst1. An output pixel takes
 * the colour of the vertical neighbour (src0 for dst0, src2 for dst1) when
 * that neighbour equals the horizontal neighbour on the same side and differs
 * from both the opposite vertical and the opposite horizontal neighbour;
 * otherwise it keeps the colour of the current pixel.
 *
 * dst0  - first output row, 2 * count pixels are written
 * dst1  - second output row, 2 * count pixels are written
 * src0  - source row above the current one
 * src1  - current source row
 * src2  - source row below the current one
 * count - number of pixels in each source row
 */
static void internal_scale2x_16_def(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
{
	/*
	 * The general path needs a distinct first and last pixel. Without this
	 * guard "count -= 2" wraps around (count is unsigned) and the central
	 * loop runs far beyond the end of the rows.
	 */
	if (count < 2)
	{
		if (count == 1)
		{
			/* a lone pixel has no horizontal neighbour: replicate it */
			dst0[0] = src1[0];
			dst0[1] = src1[0];
			dst1[0] = src1[0];
			dst1[1] = src1[0];
		}
		return;
	}

	/* first pixel: there is no left neighbour, the left half is a copy */
	dst0[0] = src1[0];
	dst1[0] = src1[0];
	dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	++src0;
	++src1;
	++src2;
	dst0 += 2;
	dst1 += 2;

	/* central pixels: both horizontal neighbours exist */
	for (count -= 2; count != 0; --count)
	{
		dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0]) ? src0[0] : src1[0];
		dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0]) ? src0[0] : src1[0];

		dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0] && src1[1] != src2[0]) ? src2[0] : src1[0];
		dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0] && src1[-1] != src2[0]) ? src2[0] : src1[0];

		++src0;
		++src1;
		++src2;
		dst0 += 2;
		dst1 += 2;
	}

	/* last pixel: there is no right neighbour, the right half is a copy */
	dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	dst0[1] = src1[0];
	dst1[1] = src1[0];
}
111
/*
 * Scale one row of 32-bit pixels into two output rows with the Scale2x rule
 * (plain C implementation).
 *
 * Every pixel of the current row (src1) becomes a 2x2 block: the two upper
 * pixels go to dst0, the two lower pixels go to dst1. An output pixel takes
 * the colour of the vertical neighbour (src0 for dst0, src2 for dst1) when
 * that neighbour equals the horizontal neighbour on the same side and differs
 * from both the opposite vertical and the opposite horizontal neighbour;
 * otherwise it keeps the colour of the current pixel.
 *
 * dst0  - first output row, 2 * count pixels are written
 * dst1  - second output row, 2 * count pixels are written
 * src0  - source row above the current one
 * src1  - current source row
 * src2  - source row below the current one
 * count - number of pixels in each source row
 */
static void internal_scale2x_32_def(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
{
	/*
	 * The general path needs a distinct first and last pixel. Without this
	 * guard "count -= 2" wraps around (count is unsigned) and the central
	 * loop runs far beyond the end of the rows.
	 */
	if (count < 2)
	{
		if (count == 1)
		{
			/* a lone pixel has no horizontal neighbour: replicate it */
			dst0[0] = src1[0];
			dst0[1] = src1[0];
			dst1[0] = src1[0];
			dst1[1] = src1[0];
		}
		return;
	}

	/* first pixel: there is no left neighbour, the left half is a copy */
	dst0[0] = src1[0];
	dst1[0] = src1[0];
	dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	++src0;
	++src1;
	++src2;
	dst0 += 2;
	dst1 += 2;

	/* central pixels: both horizontal neighbours exist */
	for (count -= 2; count != 0; --count)
	{
		dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0]) ? src0[0] : src1[0];
		dst0[1] = (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0]) ? src0[0] : src1[0];

		dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0] && src1[1] != src2[0]) ? src2[0] : src1[0];
		dst1[1] = (src1[1] == src2[0] && src0[0] != src2[0] && src1[-1] != src2[0]) ? src2[0] : src1[0];

		++src0;
		++src1;
		++src2;
		dst0 += 2;
		dst1 += 2;
	}

	/* last pixel: there is no right neighbour, the right half is a copy */
	dst0[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
	dst1[0] = (src1[-1] == src2[0] && src0[0] != src2[0]) ? src2[0] : src1[0];
	dst0[1] = src1[0];
	dst1[1] = src1[0];
}
173
174 #ifdef MMX
/*
 * MMX implementation producing ONE output row (2 * count pixels at dst) from
 * three rows of 16-bit pixels.
 *
 * src1 is the row being scaled, src0 is the row whose colour may replace a
 * pixel and src2 is the opposite row; the caller swaps src0 and src2 to get
 * the second output row.
 *
 * Every run handles 4 pixels (one 64-bit MMX register):
 *   mm7 = current pixels
 *   mm0 = "current_pre",  each lane holds the pixel to the left
 *   mm1 = "current_next", each lane holds the pixel to the right
 *   mm6 = "current_upper", the pixels of src0
 * The left output pixel of a lane is the src0 pixel when
 *   pre == src0 && pre != src2 && pre != next
 * and the right output pixel is the src0 pixel when
 *   next == src0 && next != src2 && next != pre
 * otherwise the current pixel is kept. Left and right results are then
 * interleaved and stored as 8 output pixels.
 *
 * The first and the last run use a zero ("fake black") pixel as the
 * neighbour that lies outside the row.
 *
 * count must be a multiple of 4 and at least 2*4: the first and the last run
 * are always executed, with (count - 2*4) / 4 central runs in between.
 *
 * The GNU asm statement lists "memory" as clobbered because it reads the
 * source rows and writes the destination row through pointers that are only
 * passed as register operands.
 */
static void internal_scale2x_16_mmx_single(u16* dst, const u16* src0, const u16* src1, const u16* src2, unsigned count)
{
	/* always do the first and last run */
	count -= 2*4;

#ifdef __GNUC__
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"pxor %%mm0,%%mm0\n" /* use a fake black out of screen */
		"movq 0(%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $48,%%mm0\n"
		"psllq $48,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $16,%%mm2\n"
		"psrlq $16,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqw %%mm6,%%mm2\n"
		"pcmpeqw %%mm6,%%mm4\n"
		"pcmpeqw (%2),%%mm3\n"
		"pcmpeqw (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqw %%mm1,%%mm2\n"
		"pcmpeqw %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpcklwd %%mm4,%%mm2\n"
		"punpckhwd %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		/* central runs */
		"shr $2,%4\n"
		"jz 1f\n"
		ASM_JUMP_ALIGN
		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $48,%%mm0\n"
		"psllq $48,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $16,%%mm2\n"
		"psrlq $16,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqw %%mm6,%%mm2\n"
		"pcmpeqw %%mm6,%%mm4\n"
		"pcmpeqw (%2),%%mm3\n"
		"pcmpeqw (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqw %%mm1,%%mm2\n"
		"pcmpeqw %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpcklwd %%mm4,%%mm2\n"
		"punpckhwd %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"pxor %%mm1,%%mm1\n" /* use a fake black out of screen */
		"psrlq $48,%%mm0\n"
		"psllq $48,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $16,%%mm2\n"
		"psrlq $16,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqw %%mm6,%%mm2\n"
		"pcmpeqw %%mm6,%%mm4\n"
		"pcmpeqw (%2),%%mm3\n"
		"pcmpeqw (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqw %%mm1,%%mm2\n"
		"pcmpeqw %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpcklwd %%mm4,%%mm2\n"
		"punpckhwd %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"
		"emms\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
#else
	__asm {
		mov eax, src0;
		mov ebx, src1;
		mov ecx, src2;
		mov edx, dst;
		mov esi, count;

		/* first run */
		/* set the current, current_pre, current_next registers */
		pxor mm0,mm0; /* use a fake black out of screen */
		movq mm7, qword ptr [ebx];
		movq mm1, qword ptr [ebx + 8];
		psrlq mm0, 48;
		psllq mm1, 48;
		movq mm2, mm7;
		movq mm3, mm7;
		psllq mm2, 16;
		psrlq mm3, 16;
		por mm0, mm2;
		por mm1, mm3;

		/* current_upper */
		movq mm6, qword ptr [eax];

		/* compute the upper-left pixel for dst0 on mm2 */
		/* compute the upper-right pixel for dst0 on mm4 */
		movq mm2, mm0;
		movq mm4, mm1;
		movq mm3, mm0;
		movq mm5, mm1;
		pcmpeqw mm2, mm6;
		pcmpeqw mm4, mm6;
		pcmpeqw mm3, qword ptr [ecx];
		pcmpeqw mm5, qword ptr [ecx];
		pandn mm3,mm2;
		pandn mm5,mm4;
		movq mm2,mm0;
		movq mm4,mm1;
		pcmpeqw mm2,mm1;
		pcmpeqw mm4,mm0;
		pandn mm2,mm3;
		pandn mm4,mm5;
		movq mm3,mm2;
		movq mm5,mm4;
		pand mm2,mm6;
		pand mm4,mm6;
		pandn mm3,mm7;
		pandn mm5,mm7;
		por mm2,mm3;
		por mm4,mm5;

		/* set *dst0 */
		movq mm3,mm2;
		punpcklwd mm2,mm4;
		punpckhwd mm3,mm4;
		movq qword ptr [edx], mm2;
		movq qword ptr [edx + 8], mm3;

		/* next */
		add eax, 8;
		add ebx, 8;
		add ecx, 8;
		add edx, 16;

		/* central runs */
		shr esi, 2;
		jz label1;
		align 4;
	label0:

		/* set the current, current_pre, current_next registers */
		movq mm0, qword ptr [ebx-8];
		movq mm7, qword ptr [ebx];
		movq mm1, qword ptr [ebx+8];
		psrlq mm0,48;
		psllq mm1,48;
		movq mm2,mm7;
		movq mm3,mm7;
		psllq mm2,16;
		psrlq mm3,16;
		por mm0,mm2;
		por mm1,mm3;

		/* current_upper */
		movq mm6, qword ptr [eax];

		/* compute the upper-left pixel for dst0 on mm2 */
		/* compute the upper-right pixel for dst0 on mm4 */
		movq mm2,mm0;
		movq mm4,mm1;
		movq mm3,mm0;
		movq mm5,mm1;
		pcmpeqw mm2,mm6;
		pcmpeqw mm4,mm6;
		pcmpeqw mm3, qword ptr [ecx];
		pcmpeqw mm5, qword ptr [ecx];
		pandn mm3,mm2;
		pandn mm5,mm4;
		movq mm2,mm0;
		movq mm4,mm1;
		pcmpeqw mm2,mm1;
		pcmpeqw mm4,mm0;
		pandn mm2,mm3;
		pandn mm4,mm5;
		movq mm3,mm2;
		movq mm5,mm4;
		pand mm2,mm6;
		pand mm4,mm6;
		pandn mm3,mm7;
		pandn mm5,mm7;
		por mm2,mm3;
		por mm4,mm5;

		/* set *dst0 */
		movq mm3,mm2;
		punpcklwd mm2,mm4;
		punpckhwd mm3,mm4;
		movq qword ptr [edx], mm2;
		movq qword ptr [edx+8], mm3;

		/* next */
		add eax,8;
		add ebx,8;
		add ecx,8;
		add edx,16;

		dec esi;
		jnz label0;
	label1:

		/* final run */
		/* set the current, current_pre, current_next registers */
		movq mm0, qword ptr [ebx-8];
		movq mm7, qword ptr [ebx];
		pxor mm1,mm1; /* use a fake black out of screen */
		psrlq mm0,48;
		psllq mm1,48;
		movq mm2,mm7;
		movq mm3,mm7;
		psllq mm2,16;
		psrlq mm3,16;
		por mm0,mm2;
		por mm1,mm3;

		/* current_upper */
		movq mm6, qword ptr [eax];

		/* compute the upper-left pixel for dst0 on mm2 */
		/* compute the upper-right pixel for dst0 on mm4 */
		movq mm2,mm0;
		movq mm4,mm1;
		movq mm3,mm0;
		movq mm5,mm1;
		pcmpeqw mm2,mm6;
		pcmpeqw mm4,mm6;
		pcmpeqw mm3, qword ptr [ecx];
		pcmpeqw mm5, qword ptr [ecx];
		pandn mm3,mm2;
		pandn mm5,mm4;
		movq mm2,mm0;
		movq mm4,mm1;
		pcmpeqw mm2,mm1;
		pcmpeqw mm4,mm0;
		pandn mm2,mm3;
		pandn mm4,mm5;
		movq mm3,mm2;
		movq mm5,mm4;
		pand mm2,mm6;
		pand mm4,mm6;
		pandn mm3,mm7;
		pandn mm5,mm7;
		por mm2,mm3;
		por mm4,mm5;

		/* set *dst0 */
		movq mm3,mm2;
		punpcklwd mm2,mm4;
		punpckhwd mm3,mm4;
		movq qword ptr [edx], mm2;
		movq qword ptr [edx+8], mm3;

		mov src0, eax;
		mov src1, ebx;
		mov src2, ecx;
		mov dst, edx;
		mov count, esi;

		emms;
	}
#endif
}
553
/*
 * MMX implementation producing ONE output row (2 * count pixels at dst) from
 * three rows of 32-bit pixels.
 *
 * src1 is the row being scaled, src0 is the row whose colour may replace a
 * pixel and src2 is the opposite row; the caller swaps src0 and src2 to get
 * the second output row.
 *
 * Every run handles 2 pixels (one 64-bit MMX register):
 *   mm7 = current pixels
 *   mm0 = "current_pre",  each lane holds the pixel to the left
 *   mm1 = "current_next", each lane holds the pixel to the right
 *   mm6 = "current_upper", the pixels of src0
 * The left output pixel of a lane is the src0 pixel when
 *   pre == src0 && pre != src2 && pre != next
 * and the right output pixel is the src0 pixel when
 *   next == src0 && next != src2 && next != pre
 * otherwise the current pixel is kept. Left and right results are then
 * interleaved and stored as 4 output pixels.
 *
 * The first and the last run use a zero ("fake black") pixel as the
 * neighbour that lies outside the row.
 *
 * count must be a multiple of 2 and at least 2*2: the first and the last run
 * are always executed, with (count - 2*2) / 2 central runs in between.
 *
 * The GNU asm statement lists "memory" as clobbered because it reads the
 * source rows and writes the destination row through pointers that are only
 * passed as register operands.
 */
static void internal_scale2x_32_mmx_single(u32* dst, const u32* src0, const u32* src1, const u32* src2, unsigned count)
{
	/* always do the first and last run */
	count -= 2*2;

#ifdef __GNUC__
	__asm__ __volatile__(
		/* first run */
		/* set the current, current_pre, current_next registers */
		"pxor %%mm0,%%mm0\n" /* use a fake black out of screen */
		"movq 0(%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $32,%%mm0\n"
		"psllq $32,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $32,%%mm2\n"
		"psrlq $32,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqd %%mm6,%%mm2\n"
		"pcmpeqd %%mm6,%%mm4\n"
		"pcmpeqd (%2),%%mm3\n"
		"pcmpeqd (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqd %%mm1,%%mm2\n"
		"pcmpeqd %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpckldq %%mm4,%%mm2\n"
		"punpckhdq %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		/* central runs */
		"shr $1,%4\n"
		"jz 1f\n"
		ASM_JUMP_ALIGN
		"0:\n"

		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"movq 8(%1),%%mm1\n"
		"psrlq $32,%%mm0\n"
		"psllq $32,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $32,%%mm2\n"
		"psrlq $32,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqd %%mm6,%%mm2\n"
		"pcmpeqd %%mm6,%%mm4\n"
		"pcmpeqd (%2),%%mm3\n"
		"pcmpeqd (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqd %%mm1,%%mm2\n"
		"pcmpeqd %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpckldq %%mm4,%%mm2\n"
		"punpckhdq %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"

		/* next */
		"add $8,%0\n"
		"add $8,%1\n"
		"add $8,%2\n"
		"add $16,%3\n"

		"decl %4\n"
		"jnz 0b\n"
		"1:\n"

		/* final run */
		/* set the current, current_pre, current_next registers */
		"movq -8(%1),%%mm0\n"
		"movq (%1),%%mm7\n"
		"pxor %%mm1,%%mm1\n" /* use a fake black out of screen */
		"psrlq $32,%%mm0\n"
		"psllq $32,%%mm1\n"
		"movq %%mm7,%%mm2\n"
		"movq %%mm7,%%mm3\n"
		"psllq $32,%%mm2\n"
		"psrlq $32,%%mm3\n"
		"por %%mm2,%%mm0\n"
		"por %%mm3,%%mm1\n"

		/* current_upper */
		"movq (%0),%%mm6\n"

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"movq %%mm0,%%mm3\n"
		"movq %%mm1,%%mm5\n"
		"pcmpeqd %%mm6,%%mm2\n"
		"pcmpeqd %%mm6,%%mm4\n"
		"pcmpeqd (%2),%%mm3\n"
		"pcmpeqd (%2),%%mm5\n"
		"pandn %%mm2,%%mm3\n"
		"pandn %%mm4,%%mm5\n"
		"movq %%mm0,%%mm2\n"
		"movq %%mm1,%%mm4\n"
		"pcmpeqd %%mm1,%%mm2\n"
		"pcmpeqd %%mm0,%%mm4\n"
		"pandn %%mm3,%%mm2\n"
		"pandn %%mm5,%%mm4\n"
		"movq %%mm2,%%mm3\n"
		"movq %%mm4,%%mm5\n"
		"pand %%mm6,%%mm2\n"
		"pand %%mm6,%%mm4\n"
		"pandn %%mm7,%%mm3\n"
		"pandn %%mm7,%%mm5\n"
		"por %%mm3,%%mm2\n"
		"por %%mm5,%%mm4\n"

		/* set *dst0 */
		"movq %%mm2,%%mm3\n"
		"punpckldq %%mm4,%%mm2\n"
		"punpckhdq %%mm4,%%mm3\n"
		"movq %%mm2,(%3)\n"
		"movq %%mm3,8(%3)\n"
		"emms\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		: "cc", "memory"
	);
#else
	__asm {
		mov eax, src0;
		mov ebx, src1;
		mov ecx, src2;
		mov edx, dst;
		mov esi, count;

		/* first run */
		/* set the current, current_pre, current_next registers */
		pxor mm0,mm0;
		movq mm7,qword ptr [ebx];
		movq mm1,qword ptr [ebx + 8];
		psrlq mm0,32;
		psllq mm1,32;
		movq mm2,mm7;
		movq mm3,mm7;
		psllq mm2,32;
		psrlq mm3,32;
		por mm0,mm2;
		por mm1,mm3;

		/* current_upper */
		movq mm6,qword ptr [eax];

		/* compute the upper-left pixel for dst0 on mm2 */
		/* compute the upper-right pixel for dst0 on mm4 */
		movq mm2,mm0;
		movq mm4,mm1;
		movq mm3,mm0;
		movq mm5,mm1;
		pcmpeqd mm2,mm6;
		pcmpeqd mm4,mm6;
		pcmpeqd mm3,qword ptr [ecx];
		pcmpeqd mm5,qword ptr [ecx];
		pandn mm3,mm2;
		pandn mm5,mm4;
		movq mm2,mm0;
		movq mm4,mm1;
		pcmpeqd mm2,mm1;
		pcmpeqd mm4,mm0;
		pandn mm2,mm3;
		pandn mm4,mm5;
		movq mm3,mm2;
		movq mm5,mm4;
		pand mm2,mm6;
		pand mm4,mm6;
		pandn mm3,mm7;
		pandn mm5,mm7;
		por mm2,mm3;
		por mm4,mm5;

		/* set *dst0 */
		movq mm3,mm2;
		punpckldq mm2,mm4;
		punpckhdq mm3,mm4;
		movq qword ptr [edx],mm2;
		movq qword ptr [edx+8],mm3;

		/* next */
		add eax,8;
		add ebx,8;
		add ecx,8;
		add edx,16;

		/* central runs */
		shr esi,1;
		jz label1;
	label0:

		/* set the current, current_pre, current_next registers */
		movq mm0,qword ptr [ebx-8];
		movq mm7,qword ptr [ebx];
		movq mm1,qword ptr [ebx+8];
		psrlq mm0,32;
		psllq mm1,32;
		movq mm2,mm7;
		movq mm3,mm7;
		psllq mm2,32;
		psrlq mm3,32;
		por mm0,mm2;
		por mm1,mm3;

		/* current_upper */
		movq mm6,qword ptr[eax];

		/* compute the upper-left pixel for dst0 on mm2 */
		/* compute the upper-right pixel for dst0 on mm4 */
		movq mm2,mm0;
		movq mm4,mm1;
		movq mm3,mm0;
		movq mm5,mm1;
		pcmpeqd mm2,mm6;
		pcmpeqd mm4,mm6;
		pcmpeqd mm3,qword ptr[ecx];
		pcmpeqd mm5,qword ptr[ecx];
		pandn mm3,mm2;
		pandn mm5,mm4;
		movq mm2,mm0;
		movq mm4,mm1;
		pcmpeqd mm2,mm1;
		pcmpeqd mm4,mm0;
		pandn mm2,mm3;
		pandn mm4,mm5;
		movq mm3,mm2;
		movq mm5,mm4;
		pand mm2,mm6;
		pand mm4,mm6;
		pandn mm3,mm7;
		pandn mm5,mm7;
		por mm2,mm3;
		por mm4,mm5;

		/* set *dst0 */
		movq mm3,mm2;
		punpckldq mm2,mm4;
		punpckhdq mm3,mm4;
		movq qword ptr [edx],mm2;
		movq qword ptr [edx+8],mm3;

		/* next */
		add eax,8;
		add ebx,8;
		add ecx,8;
		add edx,16;

		dec esi;
		jnz label0;
	label1:

		/* final run */
		/* set the current, current_pre, current_next registers */
		movq mm0,qword ptr [ebx-8];
		movq mm7,qword ptr [ebx];
		pxor mm1,mm1;
		psrlq mm0,32;
		psllq mm1,32;
		movq mm2,mm7;
		movq mm3,mm7;
		psllq mm2,32;
		psrlq mm3,32;
		por mm0,mm2;
		por mm1,mm3;

		/* current_upper */
		movq mm6,qword ptr [eax];

		/* compute the upper-left pixel for dst0 on mm2 */
		/* compute the upper-right pixel for dst0 on mm4 */
		movq mm2,mm0;
		movq mm4,mm1;
		movq mm3,mm0;
		movq mm5,mm1;
		pcmpeqd mm2,mm6;
		pcmpeqd mm4,mm6;
		pcmpeqd mm3,qword ptr [ecx];
		pcmpeqd mm5,qword ptr [ecx];
		pandn mm3,mm2;
		pandn mm5,mm4;
		movq mm2,mm0;
		movq mm4,mm1;
		pcmpeqd mm2,mm1;
		pcmpeqd mm4,mm0;
		pandn mm2,mm3;
		pandn mm4,mm5;
		movq mm3,mm2;
		movq mm5,mm4;
		pand mm2,mm6;
		pand mm4,mm6;
		pandn mm3,mm7;
		pandn mm5,mm7;
		por mm2,mm3;
		por mm4,mm5;

		/* set *dst0 */
		movq mm3,mm2;
		punpckldq mm2,mm4;
		punpckhdq mm3,mm4;
		movq qword ptr [edx],mm2;
		movq qword ptr [edx+8],mm3;

		mov src0, eax;
		mov src1, ebx;
		mov src2, ecx;
		mov dst, edx;
		mov count, esi;

		emms;
	}
#endif
}
931
/*
 * Scale one row of 16-bit pixels into two output rows, using the MMX routine
 * when the row width allows it.
 *
 * dst0 is produced with src0 as the upper row and src2 as the lower row;
 * dst1 is produced by the same routine with the two rows swapped.
 *
 * The MMX routine works in runs of 4 pixels and always performs a first and
 * a last run. A width below 2*4 would make its run counter wrap around, and
 * a width that is not a multiple of 4 would leave the trailing pixels
 * unwritten, so such rows are handed to the plain C implementation instead.
 *
 * dst0, dst1 - output rows, 2 * count pixels each
 * src0       - source row above the current one
 * src1       - current source row
 * src2       - source row below the current one
 * count      - number of pixels in each source row
 */
static void internal_scale2x_16_mmx(u16* dst0, u16* dst1, const u16* src0, const u16* src1, const u16* src2, unsigned count)
{
	if (count < 2*4 || (count % 4) != 0)
	{
		internal_scale2x_16_def(dst0, dst1, src0, src1, src2, count);
		return;
	}

	internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
	internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
}
938
/*
 * Scale one row of 32-bit pixels into two output rows, using the MMX routine
 * when the row width allows it.
 *
 * dst0 is produced with src0 as the upper row and src2 as the lower row;
 * dst1 is produced by the same routine with the two rows swapped.
 *
 * The MMX routine works in runs of 2 pixels and always performs a first and
 * a last run. A width below 2*2 would make its run counter wrap around, and
 * an odd width would leave the trailing pixel unwritten, so such rows are
 * handed to the plain C implementation instead.
 *
 * dst0, dst1 - output rows, 2 * count pixels each
 * src0       - source row above the current one
 * src1       - current source row
 * src2       - source row below the current one
 * count      - number of pixels in each source row
 */
static void internal_scale2x_32_mmx(u32* dst0, u32* dst1, const u32* src0, const u32* src1, const u32* src2, unsigned count)
{
	if (count < 2*2 || (count % 2) != 0)
	{
		internal_scale2x_32_def(dst0, dst1, src0, src1, src2, count);
		return;
	}

	internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
	internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
}
945 #endif
946
/*
 * Scale a 16-bit image to twice its width and height with the Scale2x effect.
 *
 * Every source row produces two destination rows. The first source row uses
 * itself as its upper neighbour and the last source row uses itself as its
 * lower neighbour.
 *
 * srcPtr   - start of the source image
 * srcPitch - distance between source rows; it is divided by 2 to step u16
 *            pointers, so presumably expressed in bytes
 * deltaPtr - not used by this function
 * dstPtr   - start of the destination image
 * dstPitch - distance between destination rows, same unit as srcPitch
 * width    - source width in pixels
 * height   - source height in rows
 *
 * Nothing is drawn when width or height is not positive.
 */
void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 *deltaPtr, u8 *dstPtr, u32 dstPitch, int width, int height)
{
	/* row scaler: plain C by default, MMX when available at run time */
	void (*scale_row)(u16 *, u16 *, const u16 *, const u16 *, const u16 *, unsigned) = internal_scale2x_16_def;
	const u32 src_step = srcPitch/2;
	u16 *dst0 = (u16 *)dstPtr;
	u16 *dst1 = dst0 + (dstPitch/2);
	const u16 *src0 = (const u16 *)srcPtr;
	const u16 *src1;
	const u16 *src2;
	int count;

	(void)deltaPtr;

	/*
	 * A non-positive height used to turn "count = height - 2; while(count)"
	 * into a loop that walks far outside both images.
	 */
	if (width <= 0 || height <= 0)
		return;

#ifdef MMX
	if(GetMMX())
		scale_row = internal_scale2x_16_mmx;
#endif

	if (height == 1)
	{
		/* a single row is its own upper and lower neighbour */
		scale_row(dst0, dst1, src0, src0, src0, (unsigned)width);
		return;
	}

	src1 = src0 + src_step;
	src2 = src1 + src_step;

	/* first row: no row above, use the row itself */
	scale_row(dst0, dst1, src0, src0, src1, (unsigned)width);

	/* central rows */
	for (count = height - 2; count > 0; --count)
	{
		/* dstPitch u16 elements are two destination rows */
		dst0 += dstPitch;
		dst1 += dstPitch;
		scale_row(dst0, dst1, src0, src1, src2, (unsigned)width);
		src0 = src1;
		src1 = src2;
		src2 += src_step;
	}

	/* last row: no row below, use the row itself */
	dst0 += dstPitch;
	dst1 += dstPitch;
	scale_row(dst0, dst1, src0, src1, src1, (unsigned)width);
}
999
/*
 * Scale a 32-bit image to twice its width and height with the Scale2x effect.
 *
 * Every source row produces two destination rows. The first source row uses
 * itself as its upper neighbour and the last source row uses itself as its
 * lower neighbour.
 *
 * srcPtr   - start of the source image
 * srcPitch - distance between source rows; it is divided by 4 to step u32
 *            pointers, so presumably expressed in bytes
 * deltaPtr - not used by this function
 * dstPtr   - start of the destination image
 * dstPitch - distance between destination rows, same unit as srcPitch
 * width    - source width in pixels
 * height   - source height in rows
 *
 * Nothing is drawn when width or height is not positive.
 */
void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 *deltaPtr, u8 *dstPtr, u32 dstPitch, int width, int height)
{
	/* row scaler: plain C by default, MMX when available at run time */
	void (*scale_row)(u32 *, u32 *, const u32 *, const u32 *, const u32 *, unsigned) = internal_scale2x_32_def;
	const u32 src_step = srcPitch/4;
	const u32 dst_step = dstPitch/2; /* in u32 elements: two destination rows */
	u32 *dst0 = (u32 *)dstPtr;
	u32 *dst1 = dst0 + (dstPitch/4);
	const u32 *src0 = (const u32 *)srcPtr;
	const u32 *src1;
	const u32 *src2;
	int count;

	(void)deltaPtr;

	/*
	 * A non-positive height used to turn "count = height - 2; while(count)"
	 * into a loop that walks far outside both images.
	 */
	if (width <= 0 || height <= 0)
		return;

#ifdef MMX
	if(GetMMX())
		scale_row = internal_scale2x_32_mmx;
#endif

	if (height == 1)
	{
		/* a single row is its own upper and lower neighbour */
		scale_row(dst0, dst1, src0, src0, src0, (unsigned)width);
		return;
	}

	src1 = src0 + src_step;
	src2 = src1 + src_step;

	/* first row: no row above, use the row itself */
	scale_row(dst0, dst1, src0, src0, src1, (unsigned)width);

	/* central rows */
	for (count = height - 2; count > 0; --count)
	{
		dst0 += dst_step;
		dst1 += dst_step;
		scale_row(dst0, dst1, src0, src1, src2, (unsigned)width);
		src0 = src1;
		src1 = src2;
		src2 += src_step;
	}

	/* last row: no row below, use the row itself */
	dst0 += dst_step;
	dst1 += dst_step;
	scale_row(dst0, dst1, src0, src1, src1, (unsigned)width);
}
1052