1 /*
2  * OpenBOR - http://www.chronocrash.com
3  * -----------------------------------------------------------------------
4  * All rights reserved, see LICENSE in OpenBOR root for details.
5  *
6  * Copyright (c) 2004 - 2012 OpenBOR Team
7  */
8 
9 /**
10  * This is an implementation of the Scale2x algorithm, also known as
11  * AdvanceMAME2x.  Before October 2012, OpenBOR contained a version of the
12  * Scale2x filter licensed under the GPL.  Both implementations in this version
13  * (C and MMX) were written from scratch by Plombo based on the description of
14  * the algorithm on the Scale2x website at:
15  *     http://scale2x.sourceforge.net/algorithm.html
16  */
17 
18 #include "gfx.h"
19 #include "types.h"
20 
/**
 * Expand one 16-bit source pixel into a 2x2 block of output pixels using the
 * Scale2x rule.
 *
 * src0v points at the pixel being scaled (its left and right neighbours are
 * read from src0v[-1] and src0v[+1]), src1v at the pixel used as the upper
 * neighbour and src2v at the pixel used as the lower neighbour.  The two
 * upper output pixels are stored at dst0v, the two lower ones at dst1v.
 */
static inline void scale2x_16_pixel_c(void *src0v, void *src1v, void *src2v, void *dst0v, void *dst1v)
{
    const u16 *row = src0v;
    const u16 *above = src1v;
    const u16 *below = src2v;
    u16 *out_upper = dst0v;
    u16 *out_lower = dst1v;

    // Gather the centre pixel and its four direct neighbours up front.
    const u16 left = row[-1];
    const u16 centre = row[0];
    const u16 right = row[1];
    const u16 up = above[0];
    const u16 down = below[0];

    // Every output pixel defaults to a plain copy of the centre pixel.
    u16 upper_left = centre;
    u16 upper_right = centre;
    u16 lower_left = centre;
    u16 lower_right = centre;

    // A corner takes a neighbour's colour only when the two neighbours that
    // touch that corner agree, and opposite neighbours differ on both axes.
    if (up != down && left != right)
    {
        if (left == up)
        {
            upper_left = left;
        }
        if (right == up)
        {
            upper_right = right;
        }
        if (left == down)
        {
            lower_left = left;
        }
        if (right == down)
        {
            lower_right = right;
        }
    }

    out_upper[0] = upper_left;
    out_upper[1] = upper_right;
    out_lower[0] = lower_left;
    out_lower[1] = lower_right;
}
43 
/**
 * Expand one 32-bit source pixel into a 2x2 block of output pixels using the
 * Scale2x rule.
 *
 * src0v points at the pixel being scaled (its left and right neighbours are
 * read from src0v[-1] and src0v[+1]), src1v at the pixel used as the upper
 * neighbour and src2v at the pixel used as the lower neighbour.  The two
 * upper output pixels are stored at dst0v, the two lower ones at dst1v.
 */
static inline void scale2x_32_pixel_c(void *src0v, void *src1v, void *src2v, void *dst0v, void *dst1v)
{
    const u32 *row = src0v;
    const u32 *above = src1v;
    const u32 *below = src2v;
    u32 *out_upper = dst0v;
    u32 *out_lower = dst1v;

    // Gather the centre pixel and its four direct neighbours up front.
    const u32 left = row[-1];
    const u32 centre = row[0];
    const u32 right = row[1];
    const u32 up = above[0];
    const u32 down = below[0];

    // Every output pixel defaults to a plain copy of the centre pixel.
    u32 upper_left = centre;
    u32 upper_right = centre;
    u32 lower_left = centre;
    u32 lower_right = centre;

    // A corner takes a neighbour's colour only when the two neighbours that
    // touch that corner agree, and opposite neighbours differ on both axes.
    if (up != down && left != right)
    {
        if (left == up)
        {
            upper_left = left;
        }
        if (right == up)
        {
            upper_right = right;
        }
        if (left == down)
        {
            lower_left = left;
        }
        if (right == down)
        {
            lower_right = right;
        }
    }

    out_upper[0] = upper_left;
    out_upper[1] = upper_right;
    out_lower[0] = lower_left;
    out_lower[1] = lower_right;
}
66 
67 #if MMX
/**
 * MMX version of the 16-bit Scale2x kernel.  Each call handles four adjacent
 * 16-bit source pixels at once: every load is an 8-byte movq and the
 * comparisons are done per 16-bit word (pcmpeqw).
 *
 * src0 points at the first of the four pixels being scaled; the left and
 * right neighbours are loaded from src0 - 2 bytes and src0 + 2 bytes.  src1
 * and src2 supply the B and H neighbours (named as in the assembly comments).
 * For each source pixel the two upper output pixels go to dst0 and the two
 * lower ones to dst1, interleaved, so 16 bytes are written through each
 * destination pointer.
 *
 * The assembly comments call the four results R0..R3; they correspond to
 * R1..R4 in the C implementation.
 *
 * This function does not execute emms; the AdMame2x entry point does so after
 * its loops.
 *
 * NOTE(review): addresses are formed from 32-bit registers (eax, ecx, edx,
 * esi, edi), so this presumably targets 32-bit x86 builds only -- confirm.
 */
static inline void scale2x_16_pixel_mmx(void *src0, void *src1, void *src2, void *dst0, void *dst1)
{
    __asm__ __volatile__ (
        "# load pixels surrounding input pixel\n"
        "movq -2(%%eax),%%mm0                      # mm0 := D\n"
        "movq 2(%%eax),%%mm1                       # mm1 := F\n"
        "movq (%%edi),%%mm2                        # mm2 := B\n"
        "movq (%%ecx),%%mm3                        # mm3 := H\n"
        "\n"
        "# mm4 := ~((B==H)|(D==F))\n"
        "movq %%mm2,%%mm4\n"
        "pcmpeqw %%mm3,%%mm4\n"
        "movq %%mm0,%%mm5\n"
        "pcmpeqw %%mm1,%%mm5\n"
        "por %%mm5,%%mm4\n"
        "pxor %%mm7,%%mm7\n"
        "pcmpeqw %%mm7,%%mm4\n"
        "\n"
        "# calculate boolean conditions\n"
        "movq %%mm0,%%mm5\n"
        "pcmpeqw %%mm2,%%mm5\n"
        "pand %%mm4,%%mm5                          # mm5 := (D == B) & mm4\n"
        "movq %%mm1,%%mm6\n"
        "pcmpeqw %%mm2,%%mm6\n"
        "pand %%mm4,%%mm6                          # mm6 := (F == B) & mm4\n"
        "movq %%mm0,%%mm7\n"
        "pcmpeqw %%mm3,%%mm7\n"
        "pand %%mm4,%%mm7                          # mm7 := (D == H) & mm4\n"
        "pcmpeqw %%mm1,%%mm3\n"
        "pand %%mm4,%%mm3                          # mm3 := (F == H) & mm4\n"
        "\n"
        "# fetch input pixel E\n"
        "movq (%%eax),%%mm2                        # mm2 := E\n"
        "\n"
        "# calculate output pixel values\n"
        "movq %%mm5,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm0,%%mm5\n"
        "por %%mm4,%%mm5                           # mm5 := R0\n"
        "movq %%mm6,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm1,%%mm6\n"
        "por %%mm4,%%mm6                           # mm6 := R1\n"
        "movq %%mm7,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm0,%%mm7\n"
        "por %%mm4,%%mm7                           # mm7 := R2\n"
        "movq %%mm3,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm1,%%mm3\n"
        "por %%mm4,%%mm3                           # mm3 := R3\n"
        "\n"
        "# write the R0 pixels to memory\n"
        "movd %%mm5,%%eax\n"
        "movw %%ax,(%%edx)                         # far left pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,4(%%edx)                        # middle left pixel\n"
        "psrlq $32,%%mm5\n"
        "movd %%mm5,%%eax\n"
        "movw %%ax,8(%%edx)                        # middle right pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,12(%%edx)                       # far right pixel\n"
        "\n"
        "# write the R1 pixels to memory\n"
        "movd %%mm6,%%eax\n"
        "movw %%ax,2(%%edx)                        # far left pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,6(%%edx)                        # middle left pixel\n"
        "psrlq $32,%%mm6\n"
        "movd %%mm6,%%eax\n"
        "movw %%ax,10(%%edx)                       # middle right pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,14(%%edx)                       # far right pixel\n"
        "\n"
        "# write the R2 pixels to memory\n"
        "movd %%mm7,%%eax\n"
        "movw %%ax,(%%esi)                         # far left pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,4(%%esi)                        # middle left pixel\n"
        "psrlq $32,%%mm7\n"
        "movd %%mm7,%%eax\n"
        "movw %%ax,8(%%esi)                        # middle right pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,12(%%esi)                       # far right pixel\n"
        "\n"
        "# write the R3 pixels to memory\n"
        "movd %%mm3,%%eax\n"
        "movw %%ax,2(%%esi)                        # far left pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,6(%%esi)                        # middle left pixel\n"
        "psrlq $32,%%mm3\n"
        "movd %%mm3,%%eax\n"
        "movw %%ax,10(%%esi)                       # middle right pixel\n"
        "shrl $16,%%eax\n"
        "movw %%ax,14(%%esi)                       # far right pixel\n"
        // eax is reused as scratch for the stores above, so it is declared as
        // an output to tell the compiler that its value has changed.
        : "=a" (src0)
        : "a" (src0), "D" (src1), "c" (src2), "d" (dst0), "S" (dst1)
        // NOTE(review): mm0-mm7 are modified but not listed as clobbers.
        : "cc", "memory"
    );
}
168 
/**
 * MMX version of the 32-bit Scale2x kernel.  Each call handles two adjacent
 * 32-bit source pixels at once: every load is an 8-byte movq and the
 * comparisons are done per 32-bit doubleword (pcmpeqd).
 *
 * src0 points at the first of the two pixels being scaled; the left and
 * right neighbours are loaded from src0 - 4 bytes and src0 + 4 bytes.  src1
 * and src2 supply the B and H neighbours (named as in the assembly comments).
 * For each source pixel the two upper output pixels go to dst0 and the two
 * lower ones to dst1, interleaved, so 16 bytes are written through each
 * destination pointer.
 *
 * The assembly comments call the four results R0..R3; they correspond to
 * R1..R4 in the C implementation.
 *
 * This function does not execute emms; the AdMame2x32 entry point does so
 * after its loops.
 *
 * NOTE(review): addresses are formed from 32-bit registers (eax, ecx, edx,
 * esi, edi), so this presumably targets 32-bit x86 builds only -- confirm.
 */
static inline void scale2x_32_pixel_mmx(void *src0, void *src1, void *src2, void *dst0, void *dst1)
{
    __asm__ __volatile__ (
        "# load pixels surrounding input pixel\n"
        "movq -4(%%eax),%%mm0                      # mm0 := D\n"
        "movq 4(%%eax),%%mm1                       # mm1 := F\n"
        "movq (%%edi),%%mm2                        # mm2 := B\n"
        "movq (%%ecx),%%mm3                        # mm3 := H\n"
        "\n"
        "# mm4 := ~((B==H)|(D==F))\n"
        "movq %%mm2,%%mm4\n"
        "pcmpeqd %%mm3,%%mm4\n"
        "movq %%mm0,%%mm5\n"
        "pcmpeqd %%mm1,%%mm5\n"
        "por %%mm5,%%mm4\n"
        "pxor %%mm7,%%mm7\n"
        "pcmpeqd %%mm7,%%mm4\n"
        "\n"
        "# calculate boolean conditions\n"
        "movq %%mm0,%%mm5\n"
        "pcmpeqd %%mm2,%%mm5\n"
        "pand %%mm4,%%mm5                          # mm5 := (D == B) & mm4\n"
        "movq %%mm1,%%mm6\n"
        "pcmpeqd %%mm2,%%mm6\n"
        "pand %%mm4,%%mm6                          # mm6 := (F == B) & mm4\n"
        "movq %%mm0,%%mm7\n"
        "pcmpeqd %%mm3,%%mm7\n"
        "pand %%mm4,%%mm7                          # mm7 := (D == H) & mm4\n"
        "pcmpeqd %%mm1,%%mm3\n"
        "pand %%mm4,%%mm3                          # mm3 := (F == H) & mm4\n"
        "\n"
        "# fetch input pixel E\n"
        "movq (%%eax),%%mm2                        # mm2 := E\n"
        "\n"
        "# calculate output pixel values\n"
        "movq %%mm5,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm0,%%mm5\n"
        "por %%mm4,%%mm5                           # mm5 := R0\n"
        "movq %%mm6,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm1,%%mm6\n"
        "por %%mm4,%%mm6                           # mm6 := R1\n"
        "movq %%mm7,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm0,%%mm7\n"
        "por %%mm4,%%mm7                           # mm7 := R2\n"
        "movq %%mm3,%%mm4\n"
        "pandn %%mm2,%%mm4\n"
        "pand %%mm1,%%mm3\n"
        "por %%mm4,%%mm3                           # mm3 := R3\n"
        "\n"
        "# write the R0 pixels to memory\n"
        "movd %%mm5,%%eax\n"
        "movl %%eax,(%%edx)                        # left pixel\n"
        "psrlq $32,%%mm5\n"
        "movd %%mm5,%%eax\n"
        "movl %%eax,8(%%edx)                       # right pixel\n"
        "\n"
        "# write the R1 pixels to memory\n"
        "movd %%mm6,%%eax\n"
        "movl %%eax,4(%%edx)                       # left pixel\n"
        "psrlq $32,%%mm6\n"
        "movd %%mm6,%%eax\n"
        "movl %%eax,12(%%edx)                      # right pixel\n"
        "\n"
        "# write the R2 pixels to memory\n"
        "movd %%mm7,%%eax\n"
        "movl %%eax,(%%esi)                        # left pixel\n"
        "psrlq $32,%%mm7\n"
        "movd %%mm7,%%eax\n"
        "movl %%eax,8(%%esi)                       # right pixel\n"
        "\n"
        "# write the R3 pixels to memory\n"
        "movd %%mm3,%%eax\n"
        "movl %%eax,4(%%esi)                       # left pixel\n"
        "psrlq $32,%%mm3\n"
        "movd %%mm3,%%eax\n"
        "movl %%eax,12(%%esi)                      # right pixel\n"
        // eax is reused as scratch for the stores above, so it is declared as
        // an output to tell the compiler that its value has changed.
        : "=a" (src0)
        : "a" (src0), "D" (src1), "c" (src2), "d" (dst0), "S" (dst1)
        // NOTE(review): mm0-mm7 are modified but not listed as clobbers.
        : "cc", "memory"
    );
}
253 #endif
254 
// Pick the per-pixel kernels used by the AdMame2x entry points, together with
// the number of source pixels each kernel call consumes (the x loop step).
#if MMX
#define scale2x_16_pixel scale2x_16_pixel_mmx
#define scale2x_32_pixel scale2x_32_pixel_mmx
// The MMX kernels work on 8 bytes of source per call: four 16-bit pixels or
// two 32-bit pixels.
#define increment16 4
#define increment32 2
#else
#define scale2x_16_pixel scale2x_16_pixel_c
#define scale2x_32_pixel scale2x_32_pixel_c
// The C kernels scale a single pixel per call.
#define increment16 1
#define increment32 1
#endif
266 
/**
 * Scale a 16-bit-per-pixel image to twice its width and height using the
 * Scale2x (AdvanceMAME2x) algorithm.
 *
 * @param srcPtr   first byte of the source image
 * @param srcPitch distance in bytes between the starts of two source rows
 * @param deltaPtr unused by this filter
 * @param dstPtr   first byte of the destination image; 2 * height rows of
 *                 2 * width pixels are written
 * @param dstPitch distance in bytes between the starts of two destination rows
 * @param width    source width in pixels
 * @param height   source height in pixels
 *
 * The rows above and below are clamped to the current row at the top and
 * bottom edges of the image.  No clamping is done horizontally: the per-pixel
 * kernel reads the pixels immediately left and right of every source pixel.
 * NOTE(review): presumably callers provide source rows with readable padding
 * on both sides (and, for the MMX build, a width that suits the 4-pixel
 * step) -- confirm against callers.
 */
void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 *deltaPtr, u8 *dstPtr, u32 dstPitch, int width, int height)
{
    int x, y;

    (void)deltaPtr;

    for (y = 0; y < height; y++)
    {
        u8 *src0 = srcPtr + srcPitch * y;
        // row above; the first row has none, so it stands in for itself
        u8 *src1 = y == 0 ? src0 : src0 - srcPitch;
        // row below; the last row has none, so it stands in for itself.
        // This must be derived from src0: src1 + srcPitch is src0 itself for
        // every y > 0, which would disable interpolation of the lower output row.
        u8 *src2 = (y == height - 1) ? src0 : src0 + srcPitch;
        // each source row produces two destination rows
        u8 *dst0 = dstPtr + dstPitch * y * 2;
        u8 *dst1 = dst0 + dstPitch;

        for (x = 0; x < width; x += increment16)
        {
            // 2 bytes per source pixel, 4 bytes (two pixels) per output pair
            scale2x_16_pixel(src0 + 2 * x, src1 + 2 * x, src2 + 2 * x, dst0 + 4 * x, dst1 + 4 * x);
        }
    }

#if MMX
    // done with MMX instructions, so tell the processor to restore floating-point state
    __asm__ __volatile__ ("emms");
#endif
}
289 
/**
 * Scale a 32-bit-per-pixel image to twice its width and height using the
 * Scale2x (AdvanceMAME2x) algorithm.
 *
 * @param srcPtr   first byte of the source image
 * @param srcPitch distance in bytes between the starts of two source rows
 * @param deltaPtr unused by this filter
 * @param dstPtr   first byte of the destination image; 2 * height rows of
 *                 2 * width pixels are written
 * @param dstPitch distance in bytes between the starts of two destination rows
 * @param width    source width in pixels
 * @param height   source height in pixels
 *
 * The rows above and below are clamped to the current row at the top and
 * bottom edges of the image.  No clamping is done horizontally: the per-pixel
 * kernel reads the pixels immediately left and right of every source pixel.
 * NOTE(review): presumably callers provide source rows with readable padding
 * on both sides (and, for the MMX build, a width that suits the 2-pixel
 * step) -- confirm against callers.
 */
void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 *deltaPtr, u8 *dstPtr, u32 dstPitch, int width, int height)
{
    int x, y;

    (void)deltaPtr;

    for (y = 0; y < height; y++)
    {
        u8 *src0 = srcPtr + srcPitch * y;
        // row above; the first row has none, so it stands in for itself
        u8 *src1 = y == 0 ? src0 : src0 - srcPitch;
        // row below; the last row has none, so it stands in for itself.
        // This must be derived from src0: src1 + srcPitch is src0 itself for
        // every y > 0, which would disable interpolation of the lower output row.
        u8 *src2 = (y == height - 1) ? src0 : src0 + srcPitch;
        // each source row produces two destination rows
        u8 *dst0 = dstPtr + dstPitch * y * 2;
        u8 *dst1 = dst0 + dstPitch;

        for (x = 0; x < width; x += increment32)
        {
            // 4 bytes per source pixel, 8 bytes (two pixels) per output pair
            scale2x_32_pixel(src0 + 4 * x, src1 + 4 * x, src2 + 4 * x, dst0 + 8 * x, dst1 + 8 * x);
        }
    }
#if MMX
    // done with MMX instructions, so tell the processor to restore floating-point state
    __asm__ __volatile__ ("emms");
#endif
}
311 
312