1 /*
2  * This file is part of the Advance project.
3  *
4  * Copyright (C) 1999-2002 Andrea Mazzoleni
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  *
20  * In addition, as a special exception, Andrea Mazzoleni
21  * gives permission to link the code of this program with
22  * the MAME library (or with modified versions of MAME that use the
23  * same license as MAME), and distribute linked combinations including
24  * the two.  You must obey the GNU General Public License in all
25  * respects for all of the code used other than MAME.  If you modify
26  * this file, you may extend this exception to your version of the
27  * file, but you are not obligated to do so.  If you do not wish to
28  * do so, delete this exception statement from your version.
29  */
30 
31 /*
32  * Alternatively at the previous license terms, you are allowed to use this
33  * code in your program with these conditions:
34  * - the program is not used in commercial activities.
35  * - the whole source code of the program is released with the binary.
36  */
37 
38 #include "../System.h"
39 
40 #ifdef MMX
41 extern "C" bool cpu_mmx;
42 #endif
43 
/*
 * Portable Scale2x row kernel for 16-bit pixels.
 *
 * Expands one source row (src1) into one double-width output row.
 *
 *   dst   - output row; 2 * count pixels are written
 *   src0  - neighbouring source row on the side of the output row being
 *           produced (callers pass the row above for the upper output row
 *           and the row below for the lower output row)
 *   src1  - source row being scaled
 *   src2  - neighbouring source row on the opposite side
 *   count - number of pixels in each source row
 *
 * An output pixel takes the colour of src0 when the horizontal neighbour on
 * its side matches src0 and the opposite neighbours differ; otherwise it
 * keeps the colour of the source pixel.
 *
 * Rows of zero or one pixel are handled explicitly: the general path below
 * needs at least two pixels (it reads src1[1] and does `count - 2` on an
 * unsigned value, which would wrap).
 */
static void internal_scale2x_16_def(u16 *dst, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
  if (count == 0)
    return;

  if (count == 1) {
    /* a lone pixel has no horizontal neighbours: plain duplication */
    dst[0] = src1[0];
    dst[1] = src1[0];
    return;
  }

  /* first pixel: there is no left neighbour, so the left half is a copy */
  dst[0] = src1[0];
  if (src1[1] == src0[0] && src2[0] != src0[0])
    dst[1] = src0[0];
  else
    dst[1] = src1[0];
  ++src0;
  ++src1;
  ++src2;
  dst += 2;

  /* central pixels: both horizontal neighbours exist */
  for (count -= 2; count != 0; --count) {
    if (src0[0] != src2[0] && src1[-1] != src1[1]) {
      dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
      dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
    } else {
      dst[0] = src1[0];
      dst[1] = src1[0];
    }

    ++src0;
    ++src1;
    ++src2;
    dst += 2;
  }

  /* last pixel: there is no right neighbour, so the right half is a copy */
  if (src1[-1] == src0[0] && src2[0] != src0[0])
    dst[0] = src0[0];
  else
    dst[0] = src1[0];
  dst[1] = src1[0];
}
81 
/*
 * Portable Scale2x row kernel for 32-bit pixels.
 *
 * Expands one source row (src1) into one double-width output row.
 *
 *   dst   - output row; 2 * count pixels are written
 *   src0  - neighbouring source row on the side of the output row being
 *           produced (callers pass the row above for the upper output row
 *           and the row below for the lower output row)
 *   src1  - source row being scaled
 *   src2  - neighbouring source row on the opposite side
 *   count - number of pixels in each source row
 *
 * An output pixel takes the colour of src0 when the horizontal neighbour on
 * its side matches src0 and the opposite neighbours differ; otherwise it
 * keeps the colour of the source pixel.
 *
 * Rows of zero or one pixel are handled explicitly: the general path below
 * needs at least two pixels (it reads src1[1] and does `count - 2` on an
 * unsigned value, which would wrap).
 */
static void internal_scale2x_32_def(u32* dst,
                                    const u32* src0,
                                    const u32* src1,
                                    const u32* src2,
                                    unsigned count)
{
  if (count == 0)
    return;

  if (count == 1) {
    /* a lone pixel has no horizontal neighbours: plain duplication */
    dst[0] = src1[0];
    dst[1] = src1[0];
    return;
  }

  /* first pixel: there is no left neighbour, so the left half is a copy */
  dst[0] = src1[0];
  if (src1[1] == src0[0] && src2[0] != src0[0])
    dst[1] = src0[0];
  else
    dst[1] = src1[0];
  ++src0;
  ++src1;
  ++src2;
  dst += 2;

  /* central pixels: both horizontal neighbours exist */
  for (count -= 2; count != 0; --count) {
    if (src0[0] != src2[0] && src1[-1] != src1[1]) {
      dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
      dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
    } else {
      dst[0] = src1[0];
      dst[1] = src1[0];
    }

    ++src0;
    ++src1;
    ++src2;
    dst += 2;
  }

  /* last pixel: there is no right neighbour, so the right half is a copy */
  if (src1[-1] == src0[0] && src2[0] != src0[0])
    dst[0] = src0[0];
  else
    dst[0] = src1[0];
  dst[1] = src1[0];
}
124 
125 #ifdef MMX
internal_scale2x_16_mmx_single(u16 * dst,const u16 * src0,const u16 * src1,const u16 * src2,unsigned count)126 static void internal_scale2x_16_mmx_single(u16* dst, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
127   /* always do the first and last run */
128   count -= 2*4;
129 
130 #ifdef __GNUC__
131   __asm__ __volatile__(
132                        /* first run */
133                        /* set the current, current_pre, current_next registers */
134                        "movq 0(%1), %%mm0\n"
135                        "movq 0(%1),%%mm7\n"
136                        "movq 8(%1),%%mm1\n"
137                        "psllq $48,%%mm0\n"
138                        "psllq $48,%%mm1\n"
139                        "psrlq $48, %%mm0\n"
140                        "movq %%mm7,%%mm2\n"
141                        "movq %%mm7,%%mm3\n"
142                        "psllq $16,%%mm2\n"
143                        "psrlq $16,%%mm3\n"
144                        "por %%mm2,%%mm0\n"
145                        "por %%mm3,%%mm1\n"
146 
147                        /* current_upper */
148                        "movq (%0),%%mm6\n"
149 
150                        /* compute the upper-left pixel for dst on %%mm2 */
151                        /* compute the upper-right pixel for dst on %%mm4 */
152                        "movq %%mm0,%%mm2\n"
153                        "movq %%mm1,%%mm4\n"
154                        "movq %%mm0,%%mm3\n"
155                        "movq %%mm1,%%mm5\n"
156                        "pcmpeqw %%mm6,%%mm2\n"
157                        "pcmpeqw %%mm6,%%mm4\n"
158                        "pcmpeqw (%2),%%mm3\n"
159                        "pcmpeqw (%2),%%mm5\n"
160                        "pandn %%mm2,%%mm3\n"
161                        "pandn %%mm4,%%mm5\n"
162                        "movq %%mm0,%%mm2\n"
163                        "movq %%mm1,%%mm4\n"
164                        "pcmpeqw %%mm1,%%mm2\n"
165                        "pcmpeqw %%mm0,%%mm4\n"
166                        "pandn %%mm3,%%mm2\n"
167                        "pandn %%mm5,%%mm4\n"
168                        "movq %%mm2,%%mm3\n"
169                        "movq %%mm4,%%mm5\n"
170                        "pand %%mm6,%%mm2\n"
171                        "pand %%mm6,%%mm4\n"
172                        "pandn %%mm7,%%mm3\n"
173                        "pandn %%mm7,%%mm5\n"
174                        "por %%mm3,%%mm2\n"
175                        "por %%mm5,%%mm4\n"
176 
177                        /* set *dst */
178                        "movq %%mm2,%%mm3\n"
179                        "punpcklwd %%mm4,%%mm2\n"
180                        "punpckhwd %%mm4,%%mm3\n"
181                        "movq %%mm2,(%3)\n"
182                        "movq %%mm3,8(%3)\n"
183 
184                        /* next */
185                        "addl $8,%0\n"
186                        "addl $8,%1\n"
187                        "addl $8,%2\n"
188                        "addl $16,%3\n"
189 
190                        /* central runs */
191                        "shrl $2,%4\n"
192                        "jz 1f\n"
193 
194                        "0:\n"
195 
196                        /* set the current, current_pre, current_next registers */
197                        "movq -8(%1),%%mm0\n"
198                        "movq (%1),%%mm7\n"
199                        "movq 8(%1),%%mm1\n"
200                        "psrlq $48,%%mm0\n"
201                        "psllq $48,%%mm1\n"
202                        "movq %%mm7,%%mm2\n"
203                        "movq %%mm7,%%mm3\n"
204                        "psllq $16,%%mm2\n"
205                        "psrlq $16,%%mm3\n"
206                        "por %%mm2,%%mm0\n"
207                        "por %%mm3,%%mm1\n"
208 
209                        /* current_upper */
210                        "movq (%0),%%mm6\n"
211 
212                        /* compute the upper-left pixel for dst on %%mm2 */
213                        /* compute the upper-right pixel for dst on %%mm4 */
214                        "movq %%mm0,%%mm2\n"
215                        "movq %%mm1,%%mm4\n"
216                        "movq %%mm0,%%mm3\n"
217                        "movq %%mm1,%%mm5\n"
218                        "pcmpeqw %%mm6,%%mm2\n"
219                        "pcmpeqw %%mm6,%%mm4\n"
220                        "pcmpeqw (%2),%%mm3\n"
221                        "pcmpeqw (%2),%%mm5\n"
222                        "pandn %%mm2,%%mm3\n"
223                        "pandn %%mm4,%%mm5\n"
224                        "movq %%mm0,%%mm2\n"
225                        "movq %%mm1,%%mm4\n"
226                        "pcmpeqw %%mm1,%%mm2\n"
227                        "pcmpeqw %%mm0,%%mm4\n"
228                        "pandn %%mm3,%%mm2\n"
229                        "pandn %%mm5,%%mm4\n"
230                        "movq %%mm2,%%mm3\n"
231                        "movq %%mm4,%%mm5\n"
232                        "pand %%mm6,%%mm2\n"
233                        "pand %%mm6,%%mm4\n"
234                        "pandn %%mm7,%%mm3\n"
235                        "pandn %%mm7,%%mm5\n"
236                        "por %%mm3,%%mm2\n"
237                        "por %%mm5,%%mm4\n"
238 
239                        /* set *dst */
240                        "movq %%mm2,%%mm3\n"
241                        "punpcklwd %%mm4,%%mm2\n"
242                        "punpckhwd %%mm4,%%mm3\n"
243                        "movq %%mm2,(%3)\n"
244                        "movq %%mm3,8(%3)\n"
245 
246                        /* next */
247                        "addl $8,%0\n"
248                        "addl $8,%1\n"
249                        "addl $8,%2\n"
250                        "addl $16,%3\n"
251 
252                        "decl %4\n"
253                        "jnz 0b\n"
254                        "1:\n"
255 
256                        /* final run */
257                        /* set the current, current_pre, current_next registers */
258                        "movq (%1),%%mm1\n"
259                        "movq (%1),%%mm7\n"
260                        "movq -8(%1),%%mm0\n"
261                        "psrlq $48,%%mm1\n"
262                        "psrlq $48,%%mm0\n"
263                        "psllq $48,%%mm1\n"
264                        "movq %%mm7,%%mm2\n"
265                        "movq %%mm7,%%mm3\n"
266                        "psllq $16,%%mm2\n"
267                        "psrlq $16,%%mm3\n"
268                        "por %%mm2,%%mm0\n"
269                        "por %%mm3,%%mm1\n"
270 
271                        /* current_upper */
272                        "movq (%0),%%mm6\n"
273 
274                        /* compute the upper-left pixel for dst on %%mm2 */
275                        /* compute the upper-right pixel for dst on %%mm4 */
276                        "movq %%mm0,%%mm2\n"
277                        "movq %%mm1,%%mm4\n"
278                        "movq %%mm0,%%mm3\n"
279                        "movq %%mm1,%%mm5\n"
280                        "pcmpeqw %%mm6,%%mm2\n"
281                        "pcmpeqw %%mm6,%%mm4\n"
282                        "pcmpeqw (%2),%%mm3\n"
283                        "pcmpeqw (%2),%%mm5\n"
284                        "pandn %%mm2,%%mm3\n"
285                        "pandn %%mm4,%%mm5\n"
286                        "movq %%mm0,%%mm2\n"
287                        "movq %%mm1,%%mm4\n"
288                        "pcmpeqw %%mm1,%%mm2\n"
289                        "pcmpeqw %%mm0,%%mm4\n"
290                        "pandn %%mm3,%%mm2\n"
291                        "pandn %%mm5,%%mm4\n"
292                        "movq %%mm2,%%mm3\n"
293                        "movq %%mm4,%%mm5\n"
294                        "pand %%mm6,%%mm2\n"
295                        "pand %%mm6,%%mm4\n"
296                        "pandn %%mm7,%%mm3\n"
297                        "pandn %%mm7,%%mm5\n"
298                        "por %%mm3,%%mm2\n"
299                        "por %%mm5,%%mm4\n"
300 
301                        /* set *dst */
302                        "movq %%mm2,%%mm3\n"
303                        "punpcklwd %%mm4,%%mm2\n"
304                        "punpckhwd %%mm4,%%mm3\n"
305                        "movq %%mm2,(%3)\n"
306                        "movq %%mm3,8(%3)\n"
307                        "emms\n"
308 
309                        : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
310                        :
311                        : "cc"
312                        );
313 #else
314   __asm {
315     mov eax, src0;
316     mov ebx, src1;
317     mov ecx, src2;
318     mov edx, dst;
319     mov esi, count;
320 
321     /* first run */
322     /* set the current, current_pre, current_next registers */
323     movq mm0, qword ptr [ebx];
324     movq mm7, qword ptr [ebx];
325     movq mm1, qword ptr [ebx + 8];
326     psllq mm0, 48;
327     psllq mm1, 48;
328     psrlq mm0, 48;
329     movq mm2, mm7;
330     movq mm3, mm7;
331     psllq mm2, 16;
332     psrlq mm3, 16;
333     por mm0, mm2;
334     por mm1, mm3;
335 
336     /* current_upper */
337     movq mm6, qword ptr [eax];
338 
339     /* compute the upper-left pixel for dst on %%mm2 */
340     /* compute the upper-right pixel for dst on %%mm4 */
341     movq mm2, mm0;
342     movq mm4, mm1;
343     movq mm3, mm0;
344     movq mm5, mm1;
345     pcmpeqw mm2, mm6;
346     pcmpeqw mm4, mm6;
347     pcmpeqw mm3, qword ptr [ecx];
348     pcmpeqw mm5, qword ptr [ecx];
349     pandn mm3,mm2;
350     pandn mm5,mm4;
351     movq mm2,mm0;
352     movq mm4,mm1;
353     pcmpeqw mm2,mm1;
354     pcmpeqw mm4,mm0;
355     pandn mm2,mm3;
356     pandn mm4,mm5;
357     movq mm3,mm2;
358     movq mm5,mm4;
359     pand mm2,mm6;
360     pand mm4,mm6;
361     pandn mm3,mm7;
362     pandn mm5,mm7;
363     por mm2,mm3;
364     por mm4,mm5;
365 
366     /* set *dst0 */
367     movq mm3,mm2;
368     punpcklwd mm2,mm4;
369     punpckhwd mm3,mm4;
370     movq qword ptr [edx], mm2;
371     movq qword ptr [edx + 8], mm3;
372 
373     /* next */
374     add eax, 8;
375     add ebx, 8;
376     add ecx, 8;
377     add edx, 16;
378 
379     /* central runs */
380     shr esi, 2;
381     jz label1;
382     align 4;
383   label0:
384 
385     /* set the current, current_pre, current_next registers */
386     movq mm0, qword ptr [ebx-8];
387     movq mm7, qword ptr [ebx];
388     movq mm1, qword ptr [ebx+8];
389     psrlq mm0,48;
390     psllq mm1,48;
391     movq mm2,mm7;
392     movq mm3,mm7;
393     psllq mm2,16;
394     psrlq mm3,16;
395     por mm0,mm2;
396     por mm1,mm3;
397 
398     /* current_upper */
399     movq mm6, qword ptr [eax];
400 
401     /* compute the upper-left pixel for dst on %%mm2 */
402     /* compute the upper-right pixel for dst on %%mm4 */
403     movq mm2,mm0;
404     movq mm4,mm1;
405     movq mm3,mm0;
406     movq mm5,mm1;
407     pcmpeqw mm2,mm6;
408     pcmpeqw mm4,mm6;
409     pcmpeqw mm3, qword ptr [ecx];
410     pcmpeqw mm5, qword ptr [ecx];
411     pandn mm3,mm2;
412     pandn mm5,mm4;
413     movq mm2,mm0;
414     movq mm4,mm1;
415     pcmpeqw mm2,mm1;
416     pcmpeqw mm4,mm0;
417     pandn mm2,mm3;
418     pandn mm4,mm5;
419     movq mm3,mm2;
420     movq mm5,mm4;
421     pand mm2,mm6;
422     pand mm4,mm6;
423     pandn mm3,mm7;
424     pandn mm5,mm7;
425     por mm2,mm3;
426     por mm4,mm5;
427 
428     /* set *dst */
429     movq mm3,mm2;
430     punpcklwd mm2,mm4;
431     punpckhwd mm3,mm4;
432     movq qword ptr [edx], mm2;
433     movq qword ptr [edx+8], mm3;
434 
435     /* next */
436     add eax,8;
437     add ebx,8;
438     add ecx,8;
439     add edx,16;
440 
441     dec esi;
442     jnz label0;
443   label1:
444 
445     /* final run */
446     /* set the current, current_pre, current_next registers */
447     movq mm1, qword ptr [ebx];
448     movq mm7, qword ptr [ebx];
449     movq mm0, qword ptr [ebx-8];
450     psrlq mm1,48;
451     psrlq mm0,48;
452     psllq mm1,48;
453     movq mm2,mm7;
454     movq mm3,mm7;
455     psllq mm2,16;
456     psrlq mm3,16;
457     por mm0,mm2;
458     por mm1,mm3;
459 
460     /* current_upper */
461     movq mm6, qword ptr [eax];
462 
463     /* compute the upper-left pixel for dst on %%mm2 */
464     /* compute the upper-right pixel for dst on %%mm4 */
465     movq mm2,mm0;
466     movq mm4,mm1;
467     movq mm3,mm0;
468     movq mm5,mm1;
469     pcmpeqw mm2,mm6;
470     pcmpeqw mm4,mm6;
471     pcmpeqw mm3, qword ptr [ecx];
472     pcmpeqw mm5, qword ptr [ecx];
473     pandn mm3,mm2;
474     pandn mm5,mm4;
475     movq mm2,mm0;
476     movq mm4,mm1;
477     pcmpeqw mm2,mm1;
478     pcmpeqw mm4,mm0;
479     pandn mm2,mm3;
480     pandn mm4,mm5;
481     movq mm3,mm2;
482     movq mm5,mm4;
483     pand mm2,mm6;
484     pand mm4,mm6;
485     pandn mm3,mm7;
486     pandn mm5,mm7;
487     por mm2,mm3;
488     por mm4,mm5;
489 
490     /* set *dst */
491     movq mm3,mm2;
492     punpcklwd mm2,mm4;
493     punpckhwd mm3,mm4;
494     movq qword ptr [edx], mm2;
495     movq qword ptr [edx+8], mm3;
496 
497     mov src0, eax;
498     mov src1, ebx;
499     mov src2, ecx;
500     mov dst, edx;
501     mov count, esi;
502 
503     emms;
504   }
505 #endif
506 }
507 
internal_scale2x_32_mmx_single(u32 * dst,const u32 * src0,const u32 * src1,const u32 * src2,unsigned count)508 static void internal_scale2x_32_mmx_single(u32* dst, const u32* src0, const u32* src1, const u32* src2, unsigned count) {
509   /* always do the first and last run */
510   count -= 2*2;
511 
512 #ifdef __GNUC__
513   __asm__ __volatile__(
514                        /* first run */
515                        /* set the current, current_pre, current_next registers */
516                        "movq 0(%1),%%mm0\n"
517                        "movq 0(%1),%%mm7\n"
518                        "movq 8(%1),%%mm1\n"
519                        "psllq $32,%%mm0\n"
520                        "psllq $32,%%mm1\n"
521                        "psrlq $32,%%mm0\n"
522                        "movq %%mm7,%%mm2\n"
523                        "movq %%mm7,%%mm3\n"
524                        "psllq $32,%%mm2\n"
525                        "psrlq $32,%%mm3\n"
526                        "por %%mm2,%%mm0\n"
527                        "por %%mm3,%%mm1\n"
528 
529                        /* current_upper */
530                        "movq (%0),%%mm6\n"
531 
532                        /* compute the upper-left pixel for dst on %%mm2 */
533                        /* compute the upper-right pixel for dst on %%mm4 */
534                        "movq %%mm0,%%mm2\n"
535                        "movq %%mm1,%%mm4\n"
536                        "movq %%mm0,%%mm3\n"
537                        "movq %%mm1,%%mm5\n"
538                        "pcmpeqd %%mm6,%%mm2\n"
539                        "pcmpeqd %%mm6,%%mm4\n"
540                        "pcmpeqd (%2),%%mm3\n"
541                        "pcmpeqd (%2),%%mm5\n"
542                        "pandn %%mm2,%%mm3\n"
543                        "pandn %%mm4,%%mm5\n"
544                        "movq %%mm0,%%mm2\n"
545                        "movq %%mm1,%%mm4\n"
546                        "pcmpeqd %%mm1,%%mm2\n"
547                        "pcmpeqd %%mm0,%%mm4\n"
548                        "pandn %%mm3,%%mm2\n"
549                        "pandn %%mm5,%%mm4\n"
550                        "movq %%mm2,%%mm3\n"
551                        "movq %%mm4,%%mm5\n"
552                        "pand %%mm6,%%mm2\n"
553                        "pand %%mm6,%%mm4\n"
554                        "pandn %%mm7,%%mm3\n"
555                        "pandn %%mm7,%%mm5\n"
556                        "por %%mm3,%%mm2\n"
557                        "por %%mm5,%%mm4\n"
558 
559                        /* set *dst */
560                        "movq %%mm2,%%mm3\n"
561                        "punpckldq %%mm4,%%mm2\n"
562                        "punpckhdq %%mm4,%%mm3\n"
563                        "movq %%mm2,(%3)\n"
564                        "movq %%mm3, 8(%3)\n"
565 
566                        /* next */
567                        "addl $8,%0\n"
568                        "addl $8,%1\n"
569                        "addl $8,%2\n"
570                        "addl $16,%3\n"
571 
572                        /* central runs */
573                        "shrl $1,%4\n"
574                        "jz 1f\n"
575 
576                        "0:\n"
577 
578                        /* set the current, current_pre, current_next registers */
579                        "movq -8(%1),%%mm0\n"
580                        "movq (%1),%%mm7\n"
581                        "movq 8(%1),%%mm1\n"
582                        "psrlq $32,%%mm0\n"
583                        "psllq $32,%%mm1\n"
584                        "movq %%mm7,%%mm2\n"
585                        "movq %%mm7,%%mm3\n"
586                        "psllq $32,%%mm2\n"
587                        "psrlq $32,%%mm3\n"
588                        "por %%mm2,%%mm0\n"
589                        "por %%mm3,%%mm1\n"
590 
591                        /* current_upper */
592                        "movq (%0),%%mm6\n"
593 
594                        /* compute the upper-left pixel for dst on %%mm2 */
595                        /* compute the upper-right pixel for dst on %%mm4 */
596                        "movq %%mm0,%%mm2\n"
597                        "movq %%mm1,%%mm4\n"
598                        "movq %%mm0,%%mm3\n"
599                        "movq %%mm1,%%mm5\n"
600                        "pcmpeqd %%mm6,%%mm2\n"
601                        "pcmpeqd %%mm6,%%mm4\n"
602                        "pcmpeqd (%2),%%mm3\n"
603                        "pcmpeqd (%2),%%mm5\n"
604                        "pandn %%mm2,%%mm3\n"
605                        "pandn %%mm4,%%mm5\n"
606                        "movq %%mm0,%%mm2\n"
607                        "movq %%mm1,%%mm4\n"
608                        "pcmpeqd %%mm1,%%mm2\n"
609                        "pcmpeqd %%mm0,%%mm4\n"
610                        "pandn %%mm3,%%mm2\n"
611                        "pandn %%mm5,%%mm4\n"
612                        "movq %%mm2,%%mm3\n"
613                        "movq %%mm4,%%mm5\n"
614                        "pand %%mm6,%%mm2\n"
615                        "pand %%mm6,%%mm4\n"
616                        "pandn %%mm7,%%mm3\n"
617                        "pandn %%mm7,%%mm5\n"
618                        "por %%mm3,%%mm2\n"
619                        "por %%mm5,%%mm4\n"
620 
621                        /* set *dst */
622                        "movq %%mm2,%%mm3\n"
623                        "punpckldq %%mm4,%%mm2\n"
624                        "punpckhdq %%mm4,%%mm3\n"
625                        "movq %%mm2,(%3)\n"
626                        "movq %%mm3,8(%3)\n"
627 
628                        /* next */
629                        "addl $8,%0\n"
630                        "addl $8,%1\n"
631                        "addl $8,%2\n"
632                        "addl $16,%3\n"
633 
634                        "decl %4\n"
635                        "jnz 0b\n"
636                        "1:\n"
637 
638                        /* final run */
639                        /* set the current, current_pre, current_next registers */
640                        "movq (%1),%%mm1\n"
641                        "movq (%1),%%mm7\n"
642                        "movq -8(%1), %%mm0\n"
643                        "psrlq $32,%%mm1\n"
644                        "psrlq $32,%%mm0\n"
645                        "psllq $32,%%mm1\n"
646                        "movq %%mm7,%%mm2\n"
647                        "movq %%mm7,%%mm3\n"
648                        "psllq $32,%%mm2\n"
649                        "psrlq $32,%%mm3\n"
650                        "por %%mm2,%%mm0\n"
651                        "por %%mm3,%%mm1\n"
652 
653                        /* current_upper */
654                        "movq (%0),%%mm6\n"
655 
656                        /* compute the upper-left pixel for dst on %%mm2 */
657                        /* compute the upper-right pixel for dst on %%mm4 */
658                        "movq %%mm0,%%mm2\n"
659                        "movq %%mm1,%%mm4\n"
660                        "movq %%mm0,%%mm3\n"
661                        "movq %%mm1,%%mm5\n"
662                        "pcmpeqd %%mm6,%%mm2\n"
663                        "pcmpeqd %%mm6,%%mm4\n"
664                        "pcmpeqd (%2),%%mm3\n"
665                        "pcmpeqd (%2),%%mm5\n"
666                        "pandn %%mm2,%%mm3\n"
667                        "pandn %%mm4,%%mm5\n"
668                        "movq %%mm0,%%mm2\n"
669                        "movq %%mm1,%%mm4\n"
670                        "pcmpeqd %%mm1,%%mm2\n"
671                        "pcmpeqd %%mm0,%%mm4\n"
672                        "pandn %%mm3,%%mm2\n"
673                        "pandn %%mm5,%%mm4\n"
674                        "movq %%mm2,%%mm3\n"
675                        "movq %%mm4,%%mm5\n"
676                        "pand %%mm6,%%mm2\n"
677                        "pand %%mm6,%%mm4\n"
678                        "pandn %%mm7,%%mm3\n"
679                        "pandn %%mm7,%%mm5\n"
680                        "por %%mm3,%%mm2\n"
681                        "por %%mm5,%%mm4\n"
682 
683                        /* set *dst */
684                        "movq %%mm2,%%mm3\n"
685                        "punpckldq %%mm4,%%mm2\n"
686                        "punpckhdq %%mm4,%%mm3\n"
687                        "movq %%mm2,(%3)\n"
688                        "movq %%mm3,8(%3)\n"
689                        "emms\n"
690 
691                        : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
692                        :
693                        : "cc"
694                        );
695 #else
696   __asm {
697     mov eax, src0;
698     mov ebx, src1;
699     mov ecx, src2;
700     mov edx, dst;
701     mov esi, count;
702 
703     /* first run */
704     /* set the current, current_pre, current_next registers */
705     movq mm0,qword ptr [ebx];
706     movq mm7,qword ptr [ebx];
707     movq mm1,qword ptr [ebx + 8];
708     psllq mm0,32;
709     psllq mm1,32;
710     psrlq mm0,32;
711     movq mm2,mm7;
712     movq mm3,mm7;
713     psllq mm2,32;
714     psrlq mm3,32;
715     por mm0,mm2;
716     por mm1,mm3;
717 
718     /* current_upper */
719     movq mm6,qword ptr [eax];
720 
721     /* compute the upper-left pixel for dst on %%mm2 */
722     /* compute the upper-right pixel for dst on %%mm4 */
723     movq mm2,mm0;
724     movq mm4,mm1;
725     movq mm3,mm0;
726     movq mm5,mm1;
727     pcmpeqd mm2,mm6;
728     pcmpeqd mm4,mm6;
729     pcmpeqd mm3,qword ptr [ecx];
730     pcmpeqd mm5,qword ptr [ecx];
731     pandn mm3,mm2;
732     pandn mm5,mm4;
733     movq mm2,mm0;
734     movq mm4,mm1;
735     pcmpeqd mm2,mm1;
736     pcmpeqd mm4,mm0;
737     pandn mm2,mm3;
738     pandn mm4,mm5;
739     movq mm3,mm2;
740     movq mm5,mm4;
741     pand mm2,mm6;
742     pand mm4,mm6;
743     pandn mm3,mm7;
744     pandn mm5,mm7;
745     por mm2,mm3;
746     por mm4,mm5;
747 
748     /* set *dst */
749     movq mm3,mm2;
750     punpckldq mm2,mm4;
751     punpckhdq mm3,mm4;
752     movq qword ptr [edx],mm2;
753     movq qword ptr [edx+8],mm3;
754 
755     /* next */
756     add eax,8;
757     add ebx,8;
758     add ecx,8;
759     add edx,16;
760 
761     /* central runs */
762     shr esi,1;
763     jz label1;
764 label0:
765 
766   /* set the current, current_pre, current_next registers */
767     movq mm0,qword ptr [ebx-8];
768     movq mm7,qword ptr [ebx];
769     movq mm1,qword ptr [ebx+8];
770     psrlq mm0,32;
771     psllq mm1,32;
772     movq mm2,mm7;
773     movq mm3,mm7;
774     psllq mm2,32;
775     psrlq mm3,32;
776     por mm0,mm2;
777     por mm1,mm3;
778 
779     /* current_upper */
780     movq mm6,qword ptr[eax];
781 
782     /* compute the upper-left pixel for dst on %%mm2 */
783     /* compute the upper-right pixel for dst on %%mm4 */
784     movq mm2,mm0;
785     movq mm4,mm1;
786     movq mm3,mm0;
787     movq mm5,mm1;
788     pcmpeqd mm2,mm6;
789     pcmpeqd mm4,mm6;
790     pcmpeqd mm3,qword ptr[ecx];
791     pcmpeqd mm5,qword ptr[ecx];
792     pandn mm3,mm2;
793     pandn mm5,mm4;
794     movq mm2,mm0;
795     movq mm4,mm1;
796     pcmpeqd mm2,mm1;
797     pcmpeqd mm4,mm0;
798     pandn mm2,mm3;
799     pandn mm4,mm5;
800     movq mm3,mm2;
801     movq mm5,mm4;
802     pand mm2,mm6;
803     pand mm4,mm6;
804     pandn mm3,mm7;
805     pandn mm5,mm7;
806     por mm2,mm3;
807     por mm4,mm5;
808 
809     /* set *dst */
810     movq mm3,mm2;
811     punpckldq mm2,mm4;
812     punpckhdq mm3,mm4;
813     movq qword ptr [edx],mm2;
814     movq qword ptr [edx+8],mm3;
815 
816     /* next */
817     add eax,8;
818     add ebx,8;
819     add ecx,8;
820     add edx,16;
821 
822     dec esi;
823     jnz label0;
824 label1:
825 
826     /* final run */
827     /* set the current, current_pre, current_next registers */
828     movq mm1,qword ptr [ebx];
829     movq mm7,qword ptr [ebx];
830     movq mm0,qword ptr [ebx-8];
831     psrlq mm1,32;
832     psrlq mm0,32;
833     psllq mm1,32;
834     movq mm2,mm7;
835     movq mm3,mm7;
836     psllq mm2,32;
837     psrlq mm3,32;
838     por mm0,mm2;
839     por mm1,mm3;
840 
841     /* current_upper */
842     movq mm6,qword ptr [eax];
843 
844     /* compute the upper-left pixel for dst on %%mm2 */
845     /* compute the upper-right pixel for dst on %%mm4 */
846     movq mm2,mm0;
847     movq mm4,mm1;
848     movq mm3,mm0;
849     movq mm5,mm1;
850     pcmpeqd mm2,mm6;
851     pcmpeqd mm4,mm6;
852     pcmpeqd mm3,qword ptr [ecx];
853     pcmpeqd mm5,qword ptr [ecx];
854     pandn mm3,mm2;
855     pandn mm5,mm4;
856     movq mm2,mm0;
857     movq mm4,mm1;
858     pcmpeqd mm2,mm1;
859     pcmpeqd mm4,mm0;
860     pandn mm2,mm3;
861     pandn mm4,mm5;
862     movq mm3,mm2;
863     movq mm5,mm4;
864     pand mm2,mm6;
865     pand mm4,mm6;
866     pandn mm3,mm7;
867     pandn mm5,mm7;
868     por mm2,mm3;
869     por mm4,mm5;
870 
871     /* set *dst */
872     movq mm3,mm2;
873     punpckldq mm2,mm4;
874     punpckhdq mm3,mm4;
875     movq qword ptr [edx],mm2;
876     movq qword ptr [edx+8],mm3;
877 
878     mov src0, eax;
879     mov src1, ebx;
880     mov src2, ecx;
881     mov dst, edx;
882     mov count, esi;
883 
884     emms;
885   }
886 #endif
887 }
888 
internal_scale2x_16_mmx(u16 * dst0,u16 * dst1,const u16 * src0,const u16 * src1,const u16 * src2,unsigned count)889 static void internal_scale2x_16_mmx(u16* dst0, u16* dst1, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
890   //	assert( count >= 2*4 );
891   internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
892   internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
893 }
894 
internal_scale2x_32_mmx(u32 * dst0,u32 * dst1,const u32 * src0,const u32 * src1,const u32 * src2,unsigned count)895 static void internal_scale2x_32_mmx(u32* dst0, u32* dst1, const u32* src0, const u32* src1, const u32* src2, unsigned count) {
896   //	assert( count >= 2*2 );
897   internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
898   internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
899 }
900 #endif
901 
AdMame2x(u8 * srcPtr,u32 srcPitch,u8 *,u8 * dstPtr,u32 dstPitch,int width,int height)902 void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
903               u8 *dstPtr, u32 dstPitch, int width, int height)
904 {
905   u16 *dst0 = (u16 *)dstPtr;
906   u16 *dst1 = dst0 + (dstPitch >> 1);
907 
908   u16 *src0 = (u16 *)srcPtr;
909   u16 *src1 = src0 + (srcPitch >> 1);
910   u16 *src2 = src1 + (srcPitch >> 1);
911 #ifdef MMX
912   if(cpu_mmx) {
913     internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);
914 
915     int count = height;
916 
917     count -= 2;
918     while(count) {
919       dst0 += dstPitch;
920       dst1 += dstPitch;
921       internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);
922       src0 = src1;
923       src1 = src2;
924       src2 += srcPitch >> 1;
925       --count;
926     }
927     dst0 += dstPitch;
928     dst1 += dstPitch;
929     internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);
930   } else {
931 #endif
932     internal_scale2x_16_def(dst0, src0, src0, src1, width);
933     internal_scale2x_16_def(dst1, src1, src0, src0, width);
934 
935     int count = height;
936 
937     count -= 2;
938     while(count) {
939       dst0 += dstPitch;
940       dst1 += dstPitch;
941       internal_scale2x_16_def(dst0, src0, src1, src2, width);
942       internal_scale2x_16_def(dst1, src2, src1, src0, width);
943       src0 = src1;
944       src1 = src2;
945       src2 += srcPitch >> 1;
946       --count;
947     }
948     dst0 += dstPitch;
949     dst1 += dstPitch;
950     internal_scale2x_16_def(dst0, src0, src1, src1, width);
951     internal_scale2x_16_def(dst1, src1, src1, src0, width);
952 #ifdef MMX
953   }
954 #endif
955 }
956 
AdMame2x32(u8 * srcPtr,u32 srcPitch,u8 *,u8 * dstPtr,u32 dstPitch,int width,int height)957 void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
958                 u8 *dstPtr, u32 dstPitch, int width, int height)
959 {
960   u32 *dst0 = (u32 *)dstPtr;
961   u32 *dst1 = dst0 + (dstPitch >> 2);
962 
963   u32 *src0 = (u32 *)srcPtr;
964   u32 *src1 = src0 + (srcPitch >> 2);
965   u32 *src2 = src1 + (srcPitch >> 2);
966 #ifdef MMX
967   if(cpu_mmx) {
968     internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width);
969 
970     int count = height;
971 
972     count -= 2;
973     while(count) {
974       dst0 += dstPitch >> 1;
975       dst1 += dstPitch >> 1;
976       internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width);
977       src0 = src1;
978       src1 = src2;
979       src2 += srcPitch >> 2;
980       --count;
981     }
982     dst0 += dstPitch >> 1;
983     dst1 += dstPitch >> 1;
984     internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width);
985   } else {
986 #endif
987     internal_scale2x_32_def(dst0, src0, src0, src1, width);
988     internal_scale2x_32_def(dst1, src1, src0, src0, width);
989 
990     int count = height;
991 
992     count -= 2;
993     while(count) {
994       dst0 += dstPitch >> 1;
995       dst1 += dstPitch >> 1;
996       internal_scale2x_32_def(dst0, src0, src1, src2, width);
997       internal_scale2x_32_def(dst1, src2, src1, src0, width);
998       src0 = src1;
999       src1 = src2;
1000       src2 += srcPitch >> 2;
1001       --count;
1002     }
1003     dst0 += dstPitch >> 1;
1004     dst1 += dstPitch >> 1;
1005     internal_scale2x_32_def(dst0, src0, src1, src1, width);
1006     internal_scale2x_32_def(dst1, src1, src1, src0, width);
1007 #ifdef MMX
1008   }
1009 #endif
1010 }
1011