/*
 * Copyright (C) 1997-2009, Michael Jennings
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies of the Software, its documentation and marketing & publicity
 * materials, and acknowledgment shall be given in the documentation, materials
 * and software packages that this Software was used.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "config.h"

/* MMX routines for tinting XImages written by Willem Monsuwe <willem@stack.nl> */

/* Function calling conventions:
 *   shade_ximage_xx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 *
 * Syntax: GNU as, AT&T operand order, 32-bit x86.  All arguments are read
 * from the stack relative to %ebp:
 *   data        first pixel of the first row; pixels are modified in place
 *   bpl         added to the row pointer after every row
 *   w, h        pixels per row and number of rows; the routines return
 *               without touching memory unless both are positive
 *   rm, gm, bm  per-channel multipliers.  Each channel c is replaced by
 *               (c * m) >> 8, so 256 leaves a channel unchanged.  pmulhw
 *               treats them as signed 16-bit lanes.
 *
 * Register roles inside every routine:
 *   %esi  row pointer (biased, see each routine)
 *   %ebx  -w
 *   %ecx  pixel index, counts upwards
 *   %edx  rows remaining
 *   %eax  scratch for single-pixel loads and stores (16 bpp routines)
 */

#ifdef HAVE_MMX

#define data 8(%ebp)
#define bpl 12(%ebp)
#define w 16(%ebp)
#define h 20(%ebp)
#define rm 24(%ebp)
#define gm 28(%ebp)
#define bm 32(%ebp)

.global shade_ximage_15_mmx
	.type shade_ximage_15_mmx,@function
.global shade_ximage_16_mmx
	.type shade_ximage_16_mmx,@function
.global shade_ximage_32_mmx
	.type shade_ximage_32_mmx,@function

.bss
.text
.align 8

/* Prologue: set up the frame, save registers, load data, w and h. */
#define ENTER \
	pushl	%ebp ;\
	movl	%esp, %ebp ;\
	pushl	%ebx ;\
	pushl	%ecx ;\
	pushl	%edx ;\
	pushl	%edi ;\
	pushl	%esi ;\
	movl	data, %esi ;\
	movl	w, %ebx ;\
	movl	h, %edx

/* Epilogue: leave MMX state, restore registers, return. */
#define LEAVE \
	emms ;\
	popl	%esi ;\
	popl	%edi ;\
	popl	%edx ;\
	popl	%ecx ;\
	popl	%ebx ;\
	movl	%ebp, %esp ;\
	popl	%ebp ;\
	ret

/* Return early (to numeric label `lbl`) unless w > 0 and h > 0.
 * Without the height check a zero row count would wrap in the
 * decl/jnz row loop and run far past the end of the image. */
#define SKIP_IF_EMPTY(lbl) \
	testl	%ebx, %ebx ;\
	jle	lbl ;\
	testl	%edx, %edx ;\
	jle	lbl

/* Broadcast the low word of rm/gm/bm into all four lanes of
 * %mm5/%mm6/%mm7 respectively. */
#define LOAD_MULTIPLIERS \
	movd	rm, %mm5 ;\
	movd	gm, %mm6 ;\
	movd	bm, %mm7 ;\
	punpcklwd %mm5, %mm5 ;\
	punpcklwd %mm6, %mm6 ;\
	punpcklwd %mm7, %mm7 ;\
	punpckldq %mm5, %mm5 ;\
	punpckldq %mm6, %mm6 ;\
	punpckldq %mm7, %mm7

/* Split 5-5-5 pixels in %mm0 into per-lane channels shifted left by 8:
 * %mm0 = r << 8, %mm1 = g << 8, %mm2 = b << 8. */
#define UNPACK_555 \
	movq	%mm0, %mm1 ;\
	movq	%mm0, %mm2 ;\
	psrlw	$5, %mm1 ;\
	psrlw	$10, %mm0 ;\
	psllw	$11, %mm2 ;\
	psllw	$11, %mm1 ;\
	psllw	$8, %mm0 ;\
	psrlw	$3, %mm1 ;\
	psrlw	$3, %mm2

/* Same for 5-6-5 pixels (green is six bits wide). */
#define UNPACK_565 \
	movq	%mm0, %mm1 ;\
	movq	%mm0, %mm2 ;\
	psrlw	$5, %mm1 ;\
	psrlw	$11, %mm0 ;\
	psllw	$11, %mm2 ;\
	psllw	$10, %mm1 ;\
	psllw	$8, %mm0 ;\
	psrlw	$2, %mm1 ;\
	psrlw	$3, %mm2

/* High half of (c << 8) * m, i.e. (c * m) >> 8, for each channel. */
#define SCALE_RGB \
	pmulhw	%mm5, %mm0 ;\
	pmulhw	%mm6, %mm1 ;\
	pmulhw	%mm7, %mm2

/* Clamp each channel to five bits.  %mm3 holds 0xffe0 per lane: the
 * unsigned-saturating add pins anything above 0x1f at 0xffff, and the
 * wrapping subtract brings the value back down to at most 0x1f. */
#define CLAMP_555 \
	paddusw	%mm3, %mm0 ;\
	paddusw	%mm3, %mm1 ;\
	paddusw	%mm3, %mm2 ;\
	psubw	%mm3, %mm0 ;\
	psubw	%mm3, %mm1 ;\
	psubw	%mm3, %mm2

/* Clamp red/blue to five bits (%mm3 = 0xffe0) and green to six bits
 * (%mm4 = 0xffc0).  Red needs no subtract: the following shift by 11
 * discards everything above its low five bits. */
#define CLAMP_565 \
	paddusw	%mm3, %mm0 ;\
	paddusw	%mm4, %mm1 ;\
	paddusw	%mm3, %mm2 ;\
	psubw	%mm4, %mm1 ;\
	psubw	%mm3, %mm2

/* Reassemble 5-5-5 pixels into %mm0. */
#define PACK_555 \
	psllw	$10, %mm0 ;\
	psllw	$5, %mm1 ;\
	por	%mm2, %mm0 ;\
	por	%mm1, %mm0

/* Reassemble 5-6-5 pixels into %mm0. */
#define PACK_565 \
	psllw	$11, %mm0 ;\
	psllw	$5, %mm1 ;\
	por	%mm2, %mm0 ;\
	por	%mm1, %mm0


/*
 * shade_ximage_15_mmx: tint 16-bit pixels laid out as 5-5-5.
 *
 * %esi is biased to three pixels before the end of the row and %ecx
 * starts at 3 - w.  While %ecx is negative at least four pixels remain
 * and are handled with one 64-bit load/store; the rest of the row is
 * finished one pixel at a time until %ecx exceeds 2.
 * If any multiplier is above 256 the clamping variant below is used.
 */
shade_ximage_15_mmx:
	ENTER
	SKIP_IF_EMPTY(5f)

	leal	-6(%esi, %ebx, 2), %esi
	negl	%ebx

	LOAD_MULTIPLIERS

	cmpl	$256, rm
	jg	shade_ximage_15_mmx_saturate
	cmpl	$256, gm
	jg	shade_ximage_15_mmx_saturate
	cmpl	$256, bm
	jg	shade_ximage_15_mmx_saturate

1:	movl	%ebx, %ecx
	addl	$3, %ecx		/* %ecx = 3 - w */
	jns	3f			/* fewer than four pixels in a row */
2:	movq	(%esi, %ecx, 2), %mm0	/* four pixels */
	UNPACK_555
	SCALE_RGB
	PACK_555
	movq	%mm0, (%esi, %ecx, 2)

	addl	$4, %ecx
	js	2b
	jmp	4f
3:	movw	(%esi, %ecx, 2), %ax	/* one pixel; only lane 0 matters */
	movd	%eax, %mm0
	UNPACK_555
	SCALE_RGB
	PACK_555
	movd	%mm0, %eax
	movw	%ax, (%esi, %ecx, 2)

	incl	%ecx
4:	cmpl	$2, %ecx
	jng	3b

	addl	bpl, %esi		/* next row */
	decl	%edx
	jnz	1b
5:	LEAVE


/* Continuation of shade_ximage_15_mmx for multipliers above 256:
 * identical loop structure, with every channel clamped to 0x1f. */
shade_ximage_15_mmx_saturate:

	pcmpeqw	%mm3, %mm3
	psllw	$5, %mm3		/* 0xffe0 in every lane */

1:	movl	%ebx, %ecx
	addl	$3, %ecx
	jns	3f
2:	movq	(%esi, %ecx, 2), %mm0
	UNPACK_555
	SCALE_RGB
	CLAMP_555
	PACK_555
	movq	%mm0, (%esi, %ecx, 2)

	addl	$4, %ecx
	js	2b
	jmp	4f
3:	movw	(%esi, %ecx, 2), %ax
	movd	%eax, %mm0
	UNPACK_555
	SCALE_RGB
	CLAMP_555
	PACK_555
	movd	%mm0, %eax
	movw	%ax, (%esi, %ecx, 2)

	incl	%ecx
4:	cmpl	$2, %ecx
	jng	3b

	addl	bpl, %esi
	decl	%edx
	jnz	1b
5:	LEAVE
	.size shade_ximage_15_mmx, . - shade_ximage_15_mmx


/*
 * shade_ximage_16_mmx: tint 16-bit pixels laid out as 5-6-5.
 * Loop structure and register use are the same as in the 5-5-5 routine.
 */
shade_ximage_16_mmx:
	ENTER
	SKIP_IF_EMPTY(5f)

	leal	-6(%esi, %ebx, 2), %esi
	negl	%ebx

	LOAD_MULTIPLIERS

	cmpl	$256, rm
	jg	shade_ximage_16_mmx_saturate
	cmpl	$256, gm
	jg	shade_ximage_16_mmx_saturate
	cmpl	$256, bm
	jg	shade_ximage_16_mmx_saturate

1:	movl	%ebx, %ecx
	addl	$3, %ecx		/* %ecx = 3 - w */
	jns	3f			/* fewer than four pixels in a row */
2:	movq	(%esi, %ecx, 2), %mm0	/* four pixels */
	UNPACK_565
	SCALE_RGB
	PACK_565
	movq	%mm0, (%esi, %ecx, 2)

	addl	$4, %ecx
	js	2b
	jmp	4f
3:	movw	(%esi, %ecx, 2), %ax	/* one pixel; only lane 0 matters */
	movd	%eax, %mm0
	UNPACK_565
	SCALE_RGB
	PACK_565
	movd	%mm0, %eax
	movw	%ax, (%esi, %ecx, 2)

	incl	%ecx
4:	cmpl	$2, %ecx
	jng	3b

	addl	bpl, %esi		/* next row */
	decl	%edx
	jnz	1b
5:	LEAVE


/* Continuation of shade_ximage_16_mmx for multipliers above 256:
 * red/blue are clamped to 0x1f and green to 0x3f. */
shade_ximage_16_mmx_saturate:

	pcmpeqw	%mm3, %mm3
	movq	%mm3, %mm4
	psllw	$5, %mm3		/* 0xffe0 in every lane */
	psllw	$6, %mm4		/* 0xffc0 in every lane */

1:	movl	%ebx, %ecx
	addl	$3, %ecx
	jns	3f
2:	movq	(%esi, %ecx, 2), %mm0
	UNPACK_565
	SCALE_RGB
	CLAMP_565
	PACK_565
	movq	%mm0, (%esi, %ecx, 2)

	addl	$4, %ecx
	js	2b
	jmp	4f
3:	movw	(%esi, %ecx, 2), %ax
	movd	%eax, %mm0
	UNPACK_565
	SCALE_RGB
	CLAMP_565
	PACK_565
	movd	%mm0, %eax
	movw	%ax, (%esi, %ecx, 2)

	incl	%ecx
4:	cmpl	$2, %ecx
	jng	3b

	addl	bpl, %esi
	decl	%edx
	jnz	1b
5:	LEAVE
	.size shade_ximage_16_mmx, . - shade_ximage_16_mmx


/*
 * shade_ximage_32_mmx: tint 32-bit pixels, one byte per channel.
 *
 * %esi points at the end of the current row and %ecx runs from -w up
 * to zero.  Each byte is widened to the high half of a 16-bit lane.
 * pmulhw is a signed multiply, so the sign bit is flipped first
 * (turning c << 8 into (c << 8) - 0x8000) and the resulting offset of
 * about -m/2 is removed again afterwards.  packuswb saturates every
 * channel to 0..255.
 * The lane above red has a zero multiplier as long as rm fits in 16
 * bits, so the fourth byte of each pixel is then written as zero.
 */
shade_ximage_32_mmx:
	ENTER
	SKIP_IF_EMPTY(3f)

	leal	(%esi, %ebx, 4), %esi
	negl	%ebx

	movd	rm, %mm4
	movd	gm, %mm5
	movd	bm, %mm6
	psllq	$32, %mm4
	psllq	$16, %mm5
	por	%mm6, %mm4
	por	%mm5, %mm4		/* lanes, high to low: 0 rm gm bm */

	pcmpeqw	%mm6, %mm6
	psllw	$15, %mm6		/* 0x8000 in every lane */
	movq	%mm6, %mm5
	pmulhw	%mm4, %mm5		/* per-lane correction for the sign flip */
1:
	movl	%ebx, %ecx
2:
	movd	(%esi, %ecx, 4), %mm1
	pxor	%mm0, %mm0
	punpcklbw %mm1, %mm0		/* every byte becomes byte << 8 */
	pxor	%mm6, %mm0		/* flip sign bit */

	pmulhw	%mm4, %mm0
	psubw	%mm5, %mm0		/* undo the sign-flip offset */
	packuswb %mm0, %mm0

	movd	%mm0, (%esi, %ecx, 4)

	incl	%ecx
	jnz	2b

	addl	bpl, %esi		/* next row */
	decl	%edx
	jnz	1b
3:
	LEAVE
	.size shade_ximage_32_mmx, . - shade_ximage_32_mmx

#endif /* HAVE_MMX */

/* Tell the GNU linker this object does not need an executable stack. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",@progbits
#endif