1 /*	xmmx.c
2 
3 	eXtended MultiMedia eXtensions GCC interface library for IA32.
4 
5 	To use this library, simply include this header file
6 	and compile with GCC.  You MUST have inlining enabled
7 	in order for xmmx_ok() to work; this can be done by
8 	simply using -O on the GCC command line.
9 
10 	Compiling with -DXMMX_TRACE will cause detailed trace
11 	output to be sent to stderr for each mmx operation.
12 	This adds lots of code, and obviously slows execution to
13 	a crawl, but can be very useful for debugging.
14 
15 	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
16 	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
17 	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
18 	AND FITNESS FOR ANY PARTICULAR PURPOSE.
19 
20 	1999 by R. Fisher
21 	Based on libmmx, 1997-99 by H. Dietz and R. Fisher
22 
23  Notes:
24 	It appears that the latest gas has the pand problem fixed, therefore
25 	  I'll undefine BROKEN_PAND by default.
26 */
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30 
31 #include "goom_config.h"
32 
33 #ifdef HAVE_MMX
34 
/* Define STRICT_COMPAT to get exactly the same result as the C version of
 * the zoom filter (a tiny bit slower, but the visual difference is barely
 * noticeable).
 */
/* #define STRICT_COMPAT */

#define BUFFPOINTNB 16
#define BUFFPOINTMASK 0xffff
#define BUFFINCR 0xff

/* Positions carry PERTEDEC fractional bits, i.e. sqrtperte sub-steps. */
#define sqrtperte 16
/* a % sqrtperte  <=>  a & PERTEMASK */
#define PERTEMASK 0xf
/* a / sqrtperte  <=>  a >> PERTEDEC */
#define PERTEDEC 4
49 
50 
51 /*#define MMX_TRACE*/
52 #include "mmx.h"
53 /*#include "xmmx.h"*/
54 #include "goom_graphic.h"
55 
/*
 * Tell the caller whether the routines in this file may be used.
 *
 * Returns 1 when bit 3 (mask 0x8) of the capability mask returned by
 * mm_support() is set, 0 otherwise.
 *
 * NOTE(review): bit 3 presumably flags the extended MMX instruction set
 * this file is named after -- confirm against mm_support() in mmx.h.
 */
int
xmmx_supported (void)
{
  return (mm_support () & 0x8) ? 1 : 0;
}
61 
62 void
zoom_filter_xmmx(int prevX,int prevY,Pixel * expix1,Pixel * expix2,int * lbruS,int * lbruD,int buffratio,int precalCoef[16][16])63 zoom_filter_xmmx (int prevX, int prevY,
64     Pixel * expix1, Pixel * expix2,
65     int *lbruS, int *lbruD, int buffratio, int precalCoef[16][16])
66 {
67   int bufsize = prevX * prevY;  /* taille du buffer */
68   volatile int loop;            /* variable de boucle */
69 
70   mmx_t *brutS = (mmx_t *) lbruS;       /* buffer de transformation source */
71   mmx_t *brutD = (mmx_t *) lbruD;       /* buffer de transformation dest */
72 
73   volatile mmx_t prevXY;
74   volatile mmx_t ratiox;
75 
76   /*      volatile mmx_t interpix; */
77 
78   expix1[0].val = expix1[prevX - 1].val = expix1[prevX * prevY - 1].val =
79       expix1[prevX * prevY - prevX].val = 0;
80 
81   prevXY.ud[0] = (prevX - 1) << PERTEDEC;
82   prevXY.ud[1] = (prevY - 1) << PERTEDEC;
83 
84   ratiox.d[0] = buffratio;
85   ratiox.d[1] = buffratio;
86 
87   asm volatile ("\n\t movq  %[ratio], %%mm6" "\n\t pslld $16,      %%mm6"       /* mm6 = [rat16=buffratio<<16 | rat16=buffratio<<16] */
88       "\n\t pxor  %%mm7,    %%mm7"      /* mm7 = 0 */
89       ::[ratio] "m" (ratiox));
90 
91   loop = 0;
92 
93   /*
94    * NOTE : mm6 et mm7 ne sont pas modifies dans la boucle.
95    */
96   while (loop < bufsize) {
97     /* Thread #1
98      * pre :  mm6 = [rat16|rat16]
99      * post : mm0 = S + ((D-S)*rat16 format [X|Y]
100      * modified = mm0,mm1,mm2
101      */
102 
103     asm volatile ("#1 \n\t movq 0(%[brutS]), %%mm0" "#1 \n\t movq 0(%[brutD]), %%mm1" "#1 \n\t psubd   %%mm0, %%mm1"    /* mm1 = D - S */
104         "#1 \n\t movq    %%mm1, %%mm2"  /* mm2 = D - S */
105         "#1 \n\t pslld     $16, %%mm1" "#1 \n\t pmullw  %%mm6, %%mm2" "#1 \n\t pmulhuw %%mm6, %%mm1" "#1 \n\t pslld   $16,   %%mm0" "#1 \n\t paddd   %%mm2, %%mm1"      /* mm1 = (D - S) * buffratio >> 16 */
106         "#1 \n\t paddd   %%mm1, %%mm0"  /* mm0 = S + mm1 */
107         "#1 \n\t psrld   $16,   %%mm0"::[brutS] "r" (&brutS[loop]),
108         [brutD] "r" (&brutD[loop])
109         );                      /* mm0 = S */
110 
111     /*
112      * pre : mm0 : position vector on screen
113      *       prevXY : coordinate of the lower-right point on screen
114      * post : clipped mm0
115      * modified : mm0,mm1,mm2
116      */
117     asm volatile
118         ("#1 \n\t movq %[prevXY], %%mm1" "#1 \n\t pcmpgtd %%mm0,  %%mm1"
119         /* mm0 en X contient (idem pour Y) :
120          *   1111 si prevXY > px
121          *   0000 si prevXY <= px */
122 #ifdef STRICT_COMPAT
123         "#1 \n\t movq      %%mm1, %%mm2"
124         "#1 \n\t punpckhdq %%mm2, %%mm2"
125         "#1 \n\t punpckldq %%mm1, %%mm1" "#1 \n\t pand      %%mm2, %%mm0"
126 #endif
127         "#1 \n\t pand %%mm1, %%mm0"     /* on met a zero la partie qui deborde */
128         ::[prevXY] "m" (prevXY));
129 
130     /* Thread #2
131      * pre :  mm0 : clipped position on screen
132      *
133      * post : mm3 : coefs for this position
134      *        mm1 : X vector [0|X]
135      *
136      * modif : eax,esi
137      */
138     __asm__ __volatile__ ("#2 \n\t movd %%mm0,%%esi"
139         "#2 \n\t movq %%mm0,%%mm1"
140         "#2 \n\t andl $15,%%esi"
141         "#2 \n\t psrlq $32,%%mm1"
142         "#2 \n\t shll $6,%%esi"
143         "#2 \n\t movd %%mm1,%%eax"
144         "#2 \n\t addl %[precalCoef],%%esi"
145         "#2 \n\t andl $15,%%eax"
146         "#2 \n\t movd (%%esi,%%eax,4),%%mm3"::[precalCoef]
147         "g" (precalCoef):"eax", "esi");
148 
149     /*
150      * extraction des coefficients... (Thread #3)
151      *
152      * pre : coef dans mm3
153      *
154      * post : coef extraits dans mm3 (c1 & c2)
155      *                        et mm4 (c3 & c4)
156      *
157      * modif : mm5
158      */
159 
160     /* (Thread #4)
161      * pre : mm0 : Y pos [*|Y]
162      *       mm1 : X pos [*|X]
163      *
164      * post : mm0 : expix1[position]
165      *        mm2 : expix1[position+largeur]
166      *
167      * modif : eax, esi
168      */
169     __asm__ __volatile__ ("#2 \n\t psrld $4, %%mm0" "#2 \n\t psrld $4, %%mm1"   /* PERTEDEC = $4 */
170         "#4 \n\t movd %%mm1,%%eax"
171         "#3 \n\t movq %%mm3,%%mm5"
172         "#4 \n\t mull %[prevX]"
173         "#4 \n\t movd %%mm0,%%esi"
174         "#3 \n\t punpcklbw %%mm5, %%mm3"
175         "#4 \n\t addl %%esi, %%eax"
176         "#3 \n\t movq %%mm3, %%mm4"
177         "#3 \n\t movq %%mm3, %%mm5"
178         "#4 \n\t movl %[expix1], %%esi"
179         "#3 \n\t punpcklbw %%mm5, %%mm3"
180         "#4 \n\t movq (%%esi,%%eax,4),%%mm0"
181         "#3 \n\t punpckhbw %%mm5, %%mm4"
182         "#4 \n\t addl %[prevX],%%eax"
183         "#4 \n\t movq (%%esi,%%eax,4),%%mm2"::[expix1] "g" (expix1)
184         ,[prevX] "g" (prevX)
185         :"eax", "esi", "edx");
186 
187     /*
188      * pre :       mm0 : expix1[position]
189      *             mm2 : expix1[position+largeur]
190      *       mm3 & mm4 : coefs
191      */
192 
193     /* recopie des deux premiers pixels dans mm0 et mm1 */
194     movq_r2r (mm0, mm1);        /* b1-v1-r1-a1-b2-v2-r2-a2 */
195 
196     /* depackage du premier pixel */
197     punpcklbw_r2r (mm7, mm0);   /* 00-b2-00-v2-00-r2-00-a2 */
198 
199     /* extraction des coefficients... */
200 
201     movq_r2r (mm3, mm5);        /* c2-c2-c2-c2-c1-c1-c1-c1 */
202 
203     /*^en parrallele^ *//* depackage du 2ieme pixel */
204     /*^ */ punpckhbw_r2r (mm7, mm1);
205     /* 00-b1-00-v1-00-r1-00-a1 */
206 
207     punpcklbw_r2r (mm7, mm5);   /* 00-c1-00-c1-00-c1-00-c1 */
208     punpckhbw_r2r (mm7, mm3);   /* 00-c2-00-c2-00-c2-00-c2 */
209 
210     /* multiplication des pixels par les coefficients */
211     pmullw_r2r (mm5, mm0);      /* c1*b2-c1*v2-c1*r2-c1*a2 */
212     pmullw_r2r (mm3, mm1);      /* c2*b1-c2*v1-c2*r1-c2*a1 */
213     paddw_r2r (mm1, mm0);
214 
215     /* ...extraction des 2 derniers coefficients */
216     movq_r2r (mm4, mm5);        /* c4-c4-c4-c4-c3-c3-c3-c3 */
217     punpcklbw_r2r (mm7, mm4);   /* 00-c3-00-c3-00-c3-00-c3 */
218     punpckhbw_r2r (mm7, mm5);   /* 00-c4-00-c4-00-c4-00-c4 */
219 
220     /* recuperation des 2 derniers pixels */
221     movq_r2r (mm2, mm1);
222 
223     /* depackage des pixels */
224     punpcklbw_r2r (mm7, mm1);
225     punpckhbw_r2r (mm7, mm2);
226 
227     /* multiplication pas les coeffs */
228     pmullw_r2r (mm4, mm1);
229     pmullw_r2r (mm5, mm2);
230 
231     /* ajout des valeurs obtenues � la valeur finale */
232     paddw_r2r (mm1, mm0);
233     paddw_r2r (mm2, mm0);
234 
235     /* division par 256 = 16+16+16+16, puis repackage du pixel final */
236     psrlw_i2r (8, mm0);
237     packuswb_r2r (mm7, mm0);
238 
239     movd_r2m (mm0, expix2[loop]);
240 
241     ++loop;
242   }
243   /* this was femms, which is AMD 3dnow */
244   __asm__ __volatile__ ("emms\n");
245 }
246 
/*
 * Additive blend of a single pixel: load _backbuf into mm0, add _col to it
 * with paddusb_m2r and store the result in _out.  Clobbers mm0.
 *
 * Wrapped in do { } while (0) so that the expansion is one statement and
 * "if (...) DRAWMETHOD; else ..." parses as intended.
 *
 * NOTE(review): paddusb_m2r presumably wraps the MMX PADDUSB instruction
 * (byte-wise add with unsigned saturation) and reads 64 bits from _col,
 * while the only call site passes a 32-bit int.  Only the low 32 bits are
 * stored back by movd_r2m, so the result is unaffected -- confirm against
 * mmx.h.
 */
#define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
do { \
	movd_m2r(_backbuf, mm0); \
	paddusb_m2r(_col, mm0); \
	movd_r2m(mm0, _out); \
} while (0)

/* Pixel operation used by draw_line_xmmx: blend `col` into `*p` in place. */
#define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)
255 
256 void
draw_line_xmmx(Pixel * data,int x1,int y1,int x2,int y2,int col,int screenx,int screeny)257 draw_line_xmmx (Pixel * data, int x1, int y1, int x2, int y2, int col,
258     int screenx, int screeny)
259 {
260   int x, y, dx, dy, yy, xx;
261   Pixel *p;
262 
263   if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0) || (y1 >= screeny)
264       || (y2 >= screeny) || (x1 >= screenx) || (x2 >= screenx))
265     goto end_of_line;
266 
267   dx = x2 - x1;
268   dy = y2 - y1;
269   if (x1 >= x2) {
270     int tmp;
271 
272     tmp = x1;
273     x1 = x2;
274     x2 = tmp;
275     tmp = y1;
276     y1 = y2;
277     y2 = tmp;
278     dx = x2 - x1;
279     dy = y2 - y1;
280   }
281 
282   /* vertical line */
283   if (dx == 0) {
284     if (y1 < y2) {
285       p = &(data[(screenx * y1) + x1]);
286       for (y = y1; y <= y2; y++) {
287         DRAWMETHOD;
288         p += screenx;
289       }
290     } else {
291       p = &(data[(screenx * y2) + x1]);
292       for (y = y2; y <= y1; y++) {
293         DRAWMETHOD;
294         p += screenx;
295       }
296     }
297     goto end_of_line;
298   }
299   /* horizontal line */
300   if (dy == 0) {
301     if (x1 < x2) {
302       p = &(data[(screenx * y1) + x1]);
303       for (x = x1; x <= x2; x++) {
304         DRAWMETHOD;
305         p++;
306       }
307       goto end_of_line;
308     } else {
309       p = &(data[(screenx * y1) + x2]);
310       for (x = x2; x <= x1; x++) {
311         DRAWMETHOD;
312         p++;
313       }
314       goto end_of_line;
315     }
316   }
317   /* 1    */
318   /*  \   */
319   /*   \  */
320   /*    2 */
321   if (y2 > y1) {
322     /* steep */
323     if (dy > dx) {
324       dx = ((dx << 16) / dy);
325       x = x1 << 16;
326       for (y = y1; y <= y2; y++) {
327         xx = x >> 16;
328         p = &(data[(screenx * y) + xx]);
329         DRAWMETHOD;
330         if (xx < (screenx - 1)) {
331           p++;
332           /* DRAWMETHOD; */
333         }
334         x += dx;
335       }
336       goto end_of_line;
337     }
338     /* shallow */
339     else {
340       dy = ((dy << 16) / dx);
341       y = y1 << 16;
342       for (x = x1; x <= x2; x++) {
343         yy = y >> 16;
344         p = &(data[(screenx * yy) + x]);
345         DRAWMETHOD;
346         if (yy < (screeny - 1)) {
347           p += screeny;
348           /* DRAWMETHOD; */
349         }
350         y += dy;
351       }
352     }
353   }
354   /*    2 */
355   /*   /  */
356   /*  /   */
357   /* 1    */
358   else {
359     /* steep */
360     if (-dy > dx) {
361       dx = ((dx << 16) / -dy);
362       x = (x1 + 1) << 16;
363       for (y = y1; y >= y2; y--) {
364         xx = x >> 16;
365         p = &(data[(screenx * y) + xx]);
366         DRAWMETHOD;
367         if (xx < (screenx - 1)) {
368           p--;
369           /* DRAWMETHOD; */
370         }
371         x += dx;
372       }
373       goto end_of_line;
374     }
375     /* shallow */
376     else {
377       dy = ((dy << 16) / dx);
378       y = y1 << 16;
379       for (x = x1; x <= x2; x++) {
380         yy = y >> 16;
381         p = &(data[(screenx * yy) + x]);
382         DRAWMETHOD;
383         if (yy < (screeny - 1)) {
384           p += screeny;
385           /* DRAWMETHOD; */
386         }
387         y += dy;
388       }
389       goto end_of_line;
390     }
391   }
392 end_of_line:
393   /* this was femms, which is AMD 3dnow */
394   __asm__ __volatile__ ("emms\n");
395 }
396 #else
/*
 * Fallback used when the file is built without HAVE_MMX: the extended-MMX
 * routines are not compiled in, so always report them as unavailable.
 *
 * Returns 0.
 */
int
xmmx_supported (void)
{
  return 0;
}
402 #endif
403