1/*
2 * Copyright (C) 1997-2009, Michael Jennings
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to
6 * deal in the Software without restriction, including without limitation the
7 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 * sell copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies of the Software, its documentation and marketing & publicity
13 * materials, and acknowledgment shall be given in the documentation, materials
14 * and software packages that this Software was used.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
20 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24#include "config.h"
25
26/* MMX routines for tinting XImages written by Willem Monsuwe <willem@stack.nl> */
27
28/* Function calling conventions:
29 *   shade_ximage_xx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
30 */
31
32#ifdef HAVE_MMX
33
/*
 * Stack arguments of the shade_ximage_*_mmx routines, addressed through the
 * frame pointer set up by ENTER (0(%ebp) = saved %ebp, 4(%ebp) = return
 * address, arguments follow in declaration order, four bytes each).
 *
 * These are preprocessor macros: the bare words data, bpl, w, h, rm, gm and
 * bm must not be used as identifiers anywhere else in this file.
 */
#define data    8(%ebp)         /* void *data: first pixel of the image     */
#define bpl     12(%ebp)        /* int bpl:    byte distance between rows   */
#define w       16(%ebp)        /* int w:      pixels per row               */
#define h       20(%ebp)        /* int h:      number of rows               */
#define rm      24(%ebp)        /* int rm:     red multiplier               */
#define gm      28(%ebp)        /* int gm:     green multiplier             */
#define bm      32(%ebp)        /* int bm:     blue multiplier              */
41
/* Exported entry points, typed as functions in the ELF symbol table. */
.global shade_ximage_15_mmx
        .type shade_ximage_15_mmx,@function
.global shade_ximage_16_mmx
        .type shade_ximage_16_mmx,@function
.global shade_ximage_32_mmx
        .type shade_ximage_32_mmx,@function

/*
 * Nothing in this file executes code from the stack.  Emit the empty
 * .note.GNU-stack marker so that the GNU linker does not have to assume an
 * executable stack for every program this object is linked into.
 */
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif

.text
.align 8
52
/*
 * Shared prologue.  Builds an %ebp frame, saves the integer registers that
 * LEAVE restores (in the reverse order), and preloads the loop state used by
 * every routine:
 *      %esi = data     %ebx = w     %edx = h
 */
#define ENTER                                                    \
        pushl %ebp                      /* frame pointer */     ;\
        movl %esp, %ebp                                         ;\
        pushl %ebx                      /* saved registers */   ;\
        pushl %ecx                                              ;\
        pushl %edx                                              ;\
        pushl %edi                                              ;\
        pushl %esi                                              ;\
        movl h, %edx                    /* row count */         ;\
        movl w, %ebx                    /* pixels per row */    ;\
        movl data, %esi                 /* image base */
64
/*
 * Shared epilogue.  Leaves MMX state (emms) so the FPU is usable again,
 * restores the registers pushed by ENTER in reverse order, tears down the
 * frame and returns.
 *
 * The numeric label 4 at the top is part of the macro; no branch in the
 * visible code targets it after expansion.
 */
#define LEAVE                                                    \
4:                                                              ;\
        emms                            /* release MMX state */ ;\
        popl %esi                                               ;\
        popl %edi                                               ;\
        popl %edx                                               ;\
        popl %ecx                                               ;\
        popl %ebx                                               ;\
        leave                           /* %esp = %ebp, pop %ebp */ ;\
        ret
76
77
/*
 * void shade_ximage_15_mmx(void *data, int bpl, int w, int h,
 *                          int rm, int gm, int bm);
 *
 * Tints an image of 16-bit words holding 5-bit red (bits 14..10), green
 * (bits 9..5) and blue (bits 4..0) in place.  Every channel c becomes
 * (c * m) >> 8, where m is the low 16 bits of the matching multiplier, so
 * 256 leaves a channel unchanged.  Rows are bpl bytes apart.
 *
 * If any multiplier is greater than 256 control transfers to
 * shade_ximage_15_mmx_saturate, which clamps each channel to 31.
 *
 * An image with w <= 0 or h <= 0 is left untouched.
 *
 * Register roles inside the loops:
 *      %esi    data + 2*w - 6 for the current row, so that (%esi,%ecx,2)
 *              with %ecx = 3 - w addresses the first pixel of the row
 *      %ebx    -w
 *      %ecx    column counter, runs from 3 - w up to 3
 *      %edx    rows remaining
 *      %mm5-7  red / green / blue multiplier replicated into four words
 *      %eax    scratch for the single-pixel loop
 */
shade_ximage_15_mmx:
        ENTER

        /*
         * Reject empty and negative sizes up front: the row loop below ends
         * on "decl; jnz" and would otherwise wrap around for h <= 0.
         */
        testl %ebx, %ebx
        jle .Lshade15_done
        testl %edx, %edx
        jle .Lshade15_done

        leal -6(%esi, %ebx, 2), %esi
        negl %ebx

        /* Replicate the low word of each multiplier into all four lanes. */
        movd rm, %mm5
        movd gm, %mm6
        movd bm, %mm7
        punpcklwd %mm5, %mm5
        punpcklwd %mm6, %mm6
        punpcklwd %mm7, %mm7
        punpckldq %mm5, %mm5
        punpckldq %mm6, %mm6
        punpckldq %mm7, %mm7

        /* A multiplier above 256 can push a channel past 31: clamp there. */
        cmpl $256, rm
        jg shade_ximage_15_mmx_saturate
        cmpl $256, gm
        jg shade_ximage_15_mmx_saturate
        cmpl $256, bm
        jg shade_ximage_15_mmx_saturate

.Lshade15_row:
        movl %ebx, %ecx
        addl $3, %ecx                   /* %ecx = 3 - w */
        jns .Lshade15_single            /* w <= 3: no group of four fits */

.Lshade15_quad:
        /* Four pixels per iteration while at least four remain. */
        movq (%esi, %ecx, 2), %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        /* red: move to the low bits, then to bits 12..8 (r << 8) */
        psrlw $10, %mm0
        psllw $8, %mm0
        /* green: drop blue, shift red out of the top, land on bits 12..8 */
        psrlw $5, %mm1
        psllw $11, %mm1
        psrlw $3, %mm1
        /* blue: shift everything above it out, land on bits 12..8 */
        psllw $11, %mm2
        psrlw $3, %mm2

        /* high half of (c << 8) * m is (c * m) >> 8 */
        pmulhw %mm5, %mm0
        pmulhw %mm6, %mm1
        pmulhw %mm7, %mm2

        /* reassemble: red to bits 14..10, green to bits 9..5, blue stays */
        psllw $10, %mm0
        psllw $5, %mm1
        por %mm2, %mm0
        por %mm1, %mm0

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js .Lshade15_quad
        jmp .Lshade15_tail

.Lshade15_single:
        /* Same arithmetic on one pixel; only the low word is stored back. */
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        psrlw $10, %mm0
        psllw $8, %mm0
        psrlw $5, %mm1
        psllw $11, %mm1
        psrlw $3, %mm1
        psllw $11, %mm2
        psrlw $3, %mm2

        pmulhw %mm5, %mm0
        pmulhw %mm6, %mm1
        pmulhw %mm7, %mm2

        psllw $10, %mm0
        psllw $5, %mm1
        por %mm2, %mm0
        por %mm1, %mm0

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
.Lshade15_tail:
        cmpl $2, %ecx                   /* pixels left in the row while %ecx <= 2 */
        jle .Lshade15_single

        addl bpl, %esi                  /* next row */
        decl %edx
        jnz .Lshade15_row
.Lshade15_done:
        LEAVE
169
170
/*
 * Clamping row loop for 15-bit pixels.  This is not a separate entry point:
 * shade_ximage_15_mmx jumps here when a multiplier is greater than 256, and
 * the code relies on the state prepared there (the ENTER frame, %esi, %ebx,
 * %edx and the replicated multipliers in %mm5-%mm7).
 *
 * Each scaled channel is clamped to 31 with a saturating add of 0xffe0
 * followed by a wrapping subtract of the same value.
 */
shade_ximage_15_mmx_saturate:

        /* %mm3 = 0xffe0 in every word */
        pcmpeqw %mm3, %mm3
        psllw $5, %mm3

.Lsat15_row:
        movl %ebx, %ecx
        addl $3, %ecx                   /* %ecx = 3 - width */
        jns .Lsat15_single              /* fewer than four pixels in the row */

.Lsat15_quad:
        /* four pixels at a time */
        movq (%esi, %ecx, 2), %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        /* red */
        psrlw $10, %mm0                 /* isolate, low bits */
        psllw $8, %mm0                  /* r << 8 */
        pmulhw %mm5, %mm0               /* scaled, may exceed 31 */
        paddusw %mm3, %mm0              /* pins at 0xffff when >= 31 */
        psubw %mm3, %mm0                /* back to 0..31 */
        psllw $10, %mm0                 /* to bits 14..10 */

        /* green */
        psrlw $5, %mm1                  /* drop blue */
        psllw $11, %mm1                 /* shift red out */
        psrlw $3, %mm1                  /* g << 8 */
        pmulhw %mm6, %mm1
        paddusw %mm3, %mm1
        psubw %mm3, %mm1
        psllw $5, %mm1                  /* to bits 9..5 */

        /* blue */
        psllw $11, %mm2                 /* shift red and green out */
        psrlw $3, %mm2                  /* b << 8 */
        pmulhw %mm7, %mm2
        paddusw %mm3, %mm2
        psubw %mm3, %mm2                /* stays in bits 4..0 */

        por %mm2, %mm0
        por %mm1, %mm0
        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js .Lsat15_quad
        jmp .Lsat15_tail

.Lsat15_single:
        /* one pixel: same arithmetic, only the low word is written back */
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        /* red */
        psrlw $10, %mm0
        psllw $8, %mm0
        pmulhw %mm5, %mm0
        paddusw %mm3, %mm0
        psubw %mm3, %mm0
        psllw $10, %mm0

        /* green */
        psrlw $5, %mm1
        psllw $11, %mm1
        psrlw $3, %mm1
        pmulhw %mm6, %mm1
        paddusw %mm3, %mm1
        psubw %mm3, %mm1
        psllw $5, %mm1

        /* blue */
        psllw $11, %mm2
        psrlw $3, %mm2
        pmulhw %mm7, %mm2
        paddusw %mm3, %mm2
        psubw %mm3, %mm2

        por %mm2, %mm0
        por %mm1, %mm0
        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
.Lsat15_tail:
        cmpl $3, %ecx                   /* row finished once %ecx reaches 3 */
        jl .Lsat15_single

        addl bpl, %esi                  /* advance to the next row */
        decl %edx
        jnz .Lsat15_row

        LEAVE
260
261
/*
 * void shade_ximage_16_mmx(void *data, int bpl, int w, int h,
 *                          int rm, int gm, int bm);
 *
 * Tints an image of 16-bit words holding 5-bit red (bits 15..11), 6-bit
 * green (bits 10..5) and 5-bit blue (bits 4..0) in place.  Every channel c
 * becomes (c * m) >> 8, where m is the low 16 bits of the matching
 * multiplier, so 256 leaves a channel unchanged.  Rows are bpl bytes apart.
 *
 * If any multiplier is greater than 256 control transfers to
 * shade_ximage_16_mmx_saturate, which clamps each channel to its maximum.
 *
 * An image with w <= 0 or h <= 0 is left untouched.
 *
 * Register roles inside the loops:
 *      %esi    data + 2*w - 6 for the current row, so that (%esi,%ecx,2)
 *              with %ecx = 3 - w addresses the first pixel of the row
 *      %ebx    -w
 *      %ecx    column counter, runs from 3 - w up to 3
 *      %edx    rows remaining
 *      %mm5-7  red / green / blue multiplier replicated into four words
 *      %eax    scratch for the single-pixel loop
 */
shade_ximage_16_mmx:
        ENTER

        /*
         * Reject empty and negative sizes up front: the row loop below ends
         * on "decl; jnz" and would otherwise wrap around for h <= 0.
         */
        testl %ebx, %ebx
        jle .Lshade16_done
        testl %edx, %edx
        jle .Lshade16_done

        leal -6(%esi, %ebx, 2), %esi
        negl %ebx

        /* Replicate the low word of each multiplier into all four lanes. */
        movd rm, %mm5
        movd gm, %mm6
        movd bm, %mm7
        punpcklwd %mm5, %mm5
        punpcklwd %mm6, %mm6
        punpcklwd %mm7, %mm7
        punpckldq %mm5, %mm5
        punpckldq %mm6, %mm6
        punpckldq %mm7, %mm7

        /* A multiplier above 256 can overflow a channel: clamp there. */
        cmpl $256, rm
        jg shade_ximage_16_mmx_saturate
        cmpl $256, gm
        jg shade_ximage_16_mmx_saturate
        cmpl $256, bm
        jg shade_ximage_16_mmx_saturate

.Lshade16_row:
        movl %ebx, %ecx
        addl $3, %ecx                   /* %ecx = 3 - w */
        jns .Lshade16_single            /* w <= 3: no group of four fits */

.Lshade16_quad:
        /* Four pixels per iteration while at least four remain. */
        movq (%esi, %ecx, 2), %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        /* red: move to bits 4..0, then to bits 12..8 (r << 8) */
        psrlw $11, %mm0
        psllw $8, %mm0
        /* green: drop blue, shift red out of the top, land on bits 13..8 */
        psrlw $5, %mm1
        psllw $10, %mm1
        psrlw $2, %mm1
        /* blue: shift everything above it out, land on bits 12..8 */
        psllw $11, %mm2
        psrlw $3, %mm2

        /* high half of (c << 8) * m is (c * m) >> 8 */
        pmulhw %mm5, %mm0
        pmulhw %mm6, %mm1
        pmulhw %mm7, %mm2

        /* reassemble: red to bits 15..11, green to bits 10..5, blue stays */
        psllw $11, %mm0
        psllw $5, %mm1
        por %mm2, %mm0
        por %mm1, %mm0

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js .Lshade16_quad
        jmp .Lshade16_tail

.Lshade16_single:
        /* Same arithmetic on one pixel; only the low word is stored back. */
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        psrlw $11, %mm0
        psllw $8, %mm0
        psrlw $5, %mm1
        psllw $10, %mm1
        psrlw $2, %mm1
        psllw $11, %mm2
        psrlw $3, %mm2

        pmulhw %mm5, %mm0
        pmulhw %mm6, %mm1
        pmulhw %mm7, %mm2

        psllw $11, %mm0
        psllw $5, %mm1
        por %mm2, %mm0
        por %mm1, %mm0

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
.Lshade16_tail:
        cmpl $2, %ecx                   /* pixels left in the row while %ecx <= 2 */
        jle .Lshade16_single

        addl bpl, %esi                  /* next row */
        decl %edx
        jnz .Lshade16_row
.Lshade16_done:
        LEAVE
353
354
/*
 * Clamping row loop for 16-bit pixels.  This is not a separate entry point:
 * shade_ximage_16_mmx jumps here when a multiplier is greater than 256, and
 * the code relies on the state prepared there (the ENTER frame, %esi, %ebx,
 * %edx and the replicated multipliers in %mm5-%mm7).
 *
 * Scaled channels are clamped with a saturating add of a bias (0xffe0 for
 * the 5-bit channels, 0xffc0 for 6-bit green).  Green and blue then have the
 * bias subtracted again; red does not need that step because the following
 * shift by 11 pushes every bias bit out of the word.
 */
shade_ximage_16_mmx_saturate:

        pcmpeqw %mm3, %mm3
        movq %mm3, %mm4
        psllw $5, %mm3                  /* %mm3 = 0xffe0 in every word */
        psllw $6, %mm4                  /* %mm4 = 0xffc0 in every word */

.Lsat16_row:
        movl %ebx, %ecx
        addl $3, %ecx                   /* %ecx = 3 - width */
        jns .Lsat16_single              /* fewer than four pixels in the row */

.Lsat16_quad:
        /* four pixels at a time */
        movq (%esi, %ecx, 2), %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        /* red */
        psrlw $11, %mm0                 /* isolate, bits 4..0 */
        psllw $8, %mm0                  /* r << 8 */
        pmulhw %mm5, %mm0               /* scaled, may exceed 31 */
        paddusw %mm3, %mm0              /* low five bits now hold min(r, 31) */
        psllw $11, %mm0                 /* to bits 15..11, bias bits fall off */

        /* green */
        psrlw $5, %mm1                  /* drop blue */
        psllw $10, %mm1                 /* shift red out */
        psrlw $2, %mm1                  /* g << 8 */
        pmulhw %mm6, %mm1               /* scaled, may exceed 63 */
        paddusw %mm4, %mm1              /* pins at 0xffff when >= 63 */
        psubw %mm4, %mm1                /* back to 0..63 */
        psllw $5, %mm1                  /* to bits 10..5 */

        /* blue */
        psllw $11, %mm2                 /* shift red and green out */
        psrlw $3, %mm2                  /* b << 8 */
        pmulhw %mm7, %mm2
        paddusw %mm3, %mm2
        psubw %mm3, %mm2                /* 0..31, stays in bits 4..0 */

        por %mm2, %mm0
        por %mm1, %mm0
        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js .Lsat16_quad
        jmp .Lsat16_tail

.Lsat16_single:
        /* one pixel: same arithmetic, only the low word is written back */
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0
        movq %mm0, %mm1
        movq %mm0, %mm2

        /* red */
        psrlw $11, %mm0
        psllw $8, %mm0
        pmulhw %mm5, %mm0
        paddusw %mm3, %mm0
        psllw $11, %mm0

        /* green */
        psrlw $5, %mm1
        psllw $10, %mm1
        psrlw $2, %mm1
        pmulhw %mm6, %mm1
        paddusw %mm4, %mm1
        psubw %mm4, %mm1
        psllw $5, %mm1

        /* blue */
        psllw $11, %mm2
        psrlw $3, %mm2
        pmulhw %mm7, %mm2
        paddusw %mm3, %mm2
        psubw %mm3, %mm2

        por %mm2, %mm0
        por %mm1, %mm0
        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
.Lsat16_tail:
        cmpl $3, %ecx                   /* row finished once %ecx reaches 3 */
        jl .Lsat16_single

        addl bpl, %esi                  /* advance to the next row */
        decl %edx
        jnz .Lsat16_row

        LEAVE
444
445
/*
 * void shade_ximage_32_mmx(void *data, int bpl, int w, int h,
 *                          int rm, int gm, int bm);
 *
 * Tints an image of 32-bit pixels in place.  Bytes 0, 1 and 2 of each pixel
 * are scaled by bm, gm and rm respectively: c becomes (c * m) >> 8, clamped
 * to 0..255 by the final pack, so multipliers above 256 are handled here
 * without a separate variant.  Rows are bpl bytes apart.
 *
 * The multipliers are combined into one register by shifting, so each one is
 * assumed to fit in 16 bits (NOTE(review): confirm against callers).  Under
 * that assumption the fourth multiplier lane is zero and byte 3 of every
 * pixel is written back as 0.
 *
 * An image with w <= 0 or h <= 0 is left untouched.
 *
 * Register roles inside the loops:
 *      %esi    one past the last pixel of the current row
 *      %ebx    -w
 *      %ecx    column counter, runs from -w up to 0
 *      %edx    rows remaining
 *      %mm4    multipliers, one per word lane (blue in lane 0)
 *      %mm5    per-lane correction term, see below
 *      %mm6    0x8000 in every word
 */
shade_ximage_32_mmx:
        ENTER

        /*
         * Reject empty and negative sizes up front: both loops below end on
         * a "count; jnz" pair and would otherwise wrap around and run far
         * outside the image.
         */
        testl %ebx, %ebx
        jle .Lshade32_done
        testl %edx, %edx
        jle .Lshade32_done

        leal (%esi, %ebx, 4), %esi
        negl %ebx

        /* %mm4 = rm << 32 | gm << 16 | bm */
        movd rm, %mm4
        movd gm, %mm5
        movd bm, %mm6
        psllq $32, %mm4
        psllq $16, %mm5
        por %mm6, %mm4
        por %mm5, %mm4

        /*
         * pmulhw is a signed multiply, but a channel expanded to c << 8 does
         * not fit a signed word.  Each value is therefore biased by 0x8000
         * before the multiply, which makes the product come out low by m/2.
         * %mm5 holds the high half of 0x8000 * m, i.e. -(m/2), and is
         * subtracted afterwards to undo the bias.
         */
        pcmpeqw %mm6, %mm6
        psllw $15, %mm6                 /* 0x8000 in every word */
        movq %mm6, %mm5
        pmulhw %mm4, %mm5

.Lshade32_row:
        movl %ebx, %ecx
.Lshade32_pixel:
        movd (%esi, %ecx, 4), %mm1
        pxor %mm0, %mm0
        punpcklbw %mm1, %mm0            /* each byte c becomes the word c << 8 */
        pxor %mm6, %mm0                 /* bias: (c << 8) - 0x8000 as signed */

        pmulhw %mm4, %mm0               /* (c * m) >> 8, minus m/2 */
        psubw %mm5, %mm0                /* add m/2 back */
        packuswb %mm0, %mm0             /* clamp each word to 0..255 */

        movd %mm0, (%esi, %ecx, 4)

        incl %ecx
        jnz .Lshade32_pixel

        addl bpl, %esi                  /* next row */
        decl %edx
        jnz .Lshade32_row
.Lshade32_done:
        LEAVE
487
488#endif /* HAVE_MMX */
489