1/* 2 * (C) Copyright IBM Corporation 2004 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 */ 24 25/** 26 * \file read_rgba_span_x86.S 27 * Optimized routines to transfer pixel data from the framebuffer to a 28 * buffer in main memory. 29 * 30 * \author Ian Romanick <idr@us.ibm.com> 31 */ 32/* Control flow enforcement support */ 33#ifdef HAVE_CET_H 34#include <cet.h> 35#else 36#define _CET_ENDBR 37#endif 38 39 .file "read_rgba_span_x86.S" 40#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */ 41/* Kevin F. Quinn 2nd July 2006 42 * Replaced data segment constants with text-segment instructions. 43 */ 44#define LOAD_MASK(mvins,m1,m2) \ 45 pushl $0xff00ff00 ;\ 46 pushl $0xff00ff00 ;\ 47 pushl $0xff00ff00 ;\ 48 pushl $0xff00ff00 ;\ 49 mvins (%esp), m1 ;\ 50 pushl $0x00ff0000 ;\ 51 pushl $0x00ff0000 ;\ 52 pushl $0x00ff0000 ;\ 53 pushl $0x00ff0000 ;\ 54 mvins (%esp), m2 ;\ 55 addl $32, %esp 56 57/* I implemented these as macros because they appear in several places, 58 * and I've tweaked them a number of times. I got tired of changing every 59 * place they appear. :) 60 */ 61 62#define DO_ONE_PIXEL() \ 63 movl (%ebx), %eax ; \ 64 addl $4, %ebx ; \ 65 bswap %eax /* ARGB -> BGRA */ ; \ 66 rorl $8, %eax /* BGRA -> ABGR */ ; \ 67 movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ 68 addl $4, %ecx 69 70#define DO_ONE_LAST_PIXEL() \ 71 movl (%ebx), %eax ; \ 72 bswap %eax /* ARGB -> BGRA */ ; \ 73 rorl $8, %eax /* BGRA -> ABGR */ ; \ 74 movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; 75 76 77/** 78 * MMX optimized version of the BGRA8888_REV to RGBA copy routine. 79 * 80 * \warning 81 * This function assumes that the caller will issue the EMMS instruction 82 * at the correct places. 83 */ 84 85.globl _generic_read_RGBA_span_BGRA8888_REV_MMX 86.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX 87 .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function 88_generic_read_RGBA_span_BGRA8888_REV_MMX: 89 _CET_ENDBR 90 pushl %ebx 91 92#ifdef USE_INNER_EMMS 93 emms 94#endif 95 LOAD_MASK(movq,%mm1,%mm2) 96 97 movl 8(%esp), %ebx /* source pointer */ 98 movl 16(%esp), %edx /* number of pixels to copy */ 99 movl 12(%esp), %ecx /* destination pointer */ 100 101 testl %edx, %edx 102 jle .L20 /* Bail if there's nothing to do. */ 103 104 movl %ebx, %eax 105 106 negl %eax 107 sarl $2, %eax 108 andl $1, %eax 109 je .L17 110 111 subl %eax, %edx 112 DO_ONE_PIXEL() 113.L17: 114 115 /* Would it be faster to unroll this loop once and process 4 pixels 116 * per pass, instead of just two? 117 */ 118 119 movl %edx, %eax 120 shrl %eax 121 jmp .L18 122.L19: 123 movq (%ebx), %mm0 124 addl $8, %ebx 125 126 /* These 9 instructions do what PSHUFB (if there were such an 127 * instruction) could do in 1. :( 128 */ 129 130 movq %mm0, %mm3 131 movq %mm0, %mm4 132 133 pand %mm2, %mm3 134 psllq $16, %mm4 135 psrlq $16, %mm3 136 pand %mm2, %mm4 137 138 pand %mm1, %mm0 139 por %mm4, %mm3 140 por %mm3, %mm0 141 142 movq %mm0, (%ecx) 143 addl $8, %ecx 144 subl $1, %eax 145.L18: 146 jne .L19 147 148#ifdef USE_INNER_EMMS 149 emms 150#endif 151 152 /* At this point there are either 1 or 0 pixels remaining to be 153 * converted. Convert the last pixel, if needed. 154 */ 155 156 testl $1, %edx 157 je .L20 158 159 DO_ONE_LAST_PIXEL() 160 161.L20: 162 popl %ebx 163 ret 164 .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX 165 166 167/** 168 * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE 169 * instructions are only actually used to read data from the framebuffer. 170 * In practice, the speed-up is pretty small. 171 * 172 * \todo 173 * Do some more testing and determine if there's any reason to have this 174 * function in addition to the MMX version. 175 * 176 * \warning 177 * This function assumes that the caller will issue the EMMS instruction 178 * at the correct places. 179 */ 180 181.globl _generic_read_RGBA_span_BGRA8888_REV_SSE 182.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE 183 .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function 184_generic_read_RGBA_span_BGRA8888_REV_SSE: 185 _CET_ENDBR 186 pushl %esi 187 pushl %ebx 188 pushl %ebp 189 190#ifdef USE_INNER_EMMS 191 emms 192#endif 193 194 LOAD_MASK(movq,%mm1,%mm2) 195 196 movl 16(%esp), %ebx /* source pointer */ 197 movl 24(%esp), %edx /* number of pixels to copy */ 198 movl 20(%esp), %ecx /* destination pointer */ 199 200 testl %edx, %edx 201 jle .L35 /* Bail if there's nothing to do. */ 202 203 movl %esp, %ebp 204 subl $16, %esp 205 andl $0xfffffff0, %esp 206 207 movl %ebx, %eax 208 movl %edx, %esi 209 210 negl %eax 211 andl $15, %eax 212 sarl $2, %eax 213 cmpl %edx, %eax 214 cmovle %eax, %esi 215 216 subl %esi, %edx 217 218 testl $1, %esi 219 je .L32 220 221 DO_ONE_PIXEL() 222.L32: 223 224 testl $2, %esi 225 je .L31 226 227 movq (%ebx), %mm0 228 addl $8, %ebx 229 230 movq %mm0, %mm3 231 movq %mm0, %mm4 232 233 pand %mm2, %mm3 234 psllq $16, %mm4 235 psrlq $16, %mm3 236 pand %mm2, %mm4 237 238 pand %mm1, %mm0 239 por %mm4, %mm3 240 por %mm3, %mm0 241 242 movq %mm0, (%ecx) 243 addl $8, %ecx 244.L31: 245 246 movl %edx, %eax 247 shrl $2, %eax 248 jmp .L33 249.L34: 250 movaps (%ebx), %xmm0 251 addl $16, %ebx 252 253 /* This would be so much better if we could just move directly from 254 * an SSE register to an MMX register. Unfortunately, that 255 * functionality wasn't introduced until SSE2 with the MOVDQ2Q 256 * instruction. 257 */ 258 259 movaps %xmm0, (%esp) 260 movq (%esp), %mm0 261 movq 8(%esp), %mm5 262 263 movq %mm0, %mm3 264 movq %mm0, %mm4 265 movq %mm5, %mm6 266 movq %mm5, %mm7 267 268 pand %mm2, %mm3 269 pand %mm2, %mm6 270 271 psllq $16, %mm4 272 psllq $16, %mm7 273 274 psrlq $16, %mm3 275 psrlq $16, %mm6 276 277 pand %mm2, %mm4 278 pand %mm2, %mm7 279 280 pand %mm1, %mm0 281 pand %mm1, %mm5 282 283 por %mm4, %mm3 284 por %mm7, %mm6 285 286 por %mm3, %mm0 287 por %mm6, %mm5 288 289 movq %mm0, (%ecx) 290 movq %mm5, 8(%ecx) 291 addl $16, %ecx 292 293 subl $1, %eax 294.L33: 295 jne .L34 296 297#ifdef USE_INNER_EMMS 298 emms 299#endif 300 movl %ebp, %esp 301 302 /* At this point there are either [0, 3] pixels remaining to be 303 * converted. 304 */ 305 306 testl $2, %edx 307 je .L36 308 309 movq (%ebx), %mm0 310 addl $8, %ebx 311 312 movq %mm0, %mm3 313 movq %mm0, %mm4 314 315 pand %mm2, %mm3 316 psllq $16, %mm4 317 psrlq $16, %mm3 318 pand %mm2, %mm4 319 320 pand %mm1, %mm0 321 por %mm4, %mm3 322 por %mm3, %mm0 323 324 movq %mm0, (%ecx) 325 addl $8, %ecx 326.L36: 327 328 testl $1, %edx 329 je .L35 330 331 DO_ONE_LAST_PIXEL() 332.L35: 333 popl %ebp 334 popl %ebx 335 popl %esi 336 ret 337 .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE 338 339 340/** 341 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine. 342 */ 343 344 .text 345.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2 346.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2 347 .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function 348_generic_read_RGBA_span_BGRA8888_REV_SSE2: 349 _CET_ENDBR 350 pushl %esi 351 pushl %ebx 352 353 LOAD_MASK(movdqu,%xmm1,%xmm2) 354 355 movl 12(%esp), %ebx /* source pointer */ 356 movl 20(%esp), %edx /* number of pixels to copy */ 357 movl 16(%esp), %ecx /* destination pointer */ 358 359 movl %ebx, %eax 360 movl %edx, %esi 361 362 testl %edx, %edx 363 jle .L46 /* Bail if there's nothing to do. */ 364 365 /* If the source pointer isn't a multiple of 16 we have to process 366 * a few pixels the "slow" way to get the address aligned for 367 * the SSE fetch intsructions. 368 */ 369 370 negl %eax 371 andl $15, %eax 372 sarl $2, %eax 373 374 cmpl %edx, %eax 375 cmovbe %eax, %esi 376 subl %esi, %edx 377 378 testl $1, %esi 379 je .L41 380 381 DO_ONE_PIXEL() 382.L41: 383 testl $2, %esi 384 je .L40 385 386 movq (%ebx), %xmm0 387 addl $8, %ebx 388 389 movdqa %xmm0, %xmm3 390 movdqa %xmm0, %xmm4 391 andps %xmm1, %xmm0 392 393 andps %xmm2, %xmm3 394 pslldq $2, %xmm4 395 psrldq $2, %xmm3 396 andps %xmm2, %xmm4 397 398 orps %xmm4, %xmm3 399 orps %xmm3, %xmm0 400 401 movq %xmm0, (%ecx) 402 addl $8, %ecx 403.L40: 404 405 /* Would it be worth having a specialized version of this loop for 406 * the case where the destination is 16-byte aligned? That version 407 * would be identical except that it could use movedqa instead of 408 * movdqu. 409 */ 410 411 movl %edx, %eax 412 shrl $2, %eax 413 jmp .L42 414.L43: 415 movdqa (%ebx), %xmm0 416 addl $16, %ebx 417 418 movdqa %xmm0, %xmm3 419 movdqa %xmm0, %xmm4 420 andps %xmm1, %xmm0 421 422 andps %xmm2, %xmm3 423 pslldq $2, %xmm4 424 psrldq $2, %xmm3 425 andps %xmm2, %xmm4 426 427 orps %xmm4, %xmm3 428 orps %xmm3, %xmm0 429 430 movdqu %xmm0, (%ecx) 431 addl $16, %ecx 432 subl $1, %eax 433.L42: 434 jne .L43 435 436 437 /* There may be upto 3 pixels remaining to be copied. Take care 438 * of them now. We do the 2 pixel case first because the data 439 * will be aligned. 440 */ 441 442 testl $2, %edx 443 je .L47 444 445 movq (%ebx), %xmm0 446 addl $8, %ebx 447 448 movdqa %xmm0, %xmm3 449 movdqa %xmm0, %xmm4 450 andps %xmm1, %xmm0 451 452 andps %xmm2, %xmm3 453 pslldq $2, %xmm4 454 psrldq $2, %xmm3 455 andps %xmm2, %xmm4 456 457 orps %xmm4, %xmm3 458 orps %xmm3, %xmm0 459 460 movq %xmm0, (%ecx) 461 addl $8, %ecx 462.L47: 463 464 testl $1, %edx 465 je .L46 466 467 DO_ONE_LAST_PIXEL() 468.L46: 469 470 popl %ebx 471 popl %esi 472 ret 473 .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 474 475 476 477#define MASK_565_L 0x07e0f800 478#define MASK_565_H 0x0000001f 479/* Setting SCALE_ADJUST to 5 gives a perfect match with the 480 * classic C implementation in Mesa. Setting SCALE_ADJUST 481 * to 0 is slightly faster but at a small cost to accuracy. 482 */ 483#define SCALE_ADJUST 5 484#if SCALE_ADJUST == 5 485#define PRESCALE_L 0x00100001 486#define PRESCALE_H 0x00000200 487#define SCALE_L 0x40C620E8 488#define SCALE_H 0x0000839d 489#elif SCALE_ADJUST == 0 490#define PRESCALE_L 0x00200001 491#define PRESCALE_H 0x00000800 492#define SCALE_L 0x01040108 493#define SCALE_H 0x00000108 494#else 495#error SCALE_ADJUST must either be 5 or 0. 496#endif 497#define ALPHA_L 0x00000000 498#define ALPHA_H 0x00ff0000 499 500/** 501 * MMX optimized version of the RGB565 to RGBA copy routine. 502 */ 503 504 .text 505 .globl _generic_read_RGBA_span_RGB565_MMX 506 .hidden _generic_read_RGBA_span_RGB565_MMX 507 .type _generic_read_RGBA_span_RGB565_MMX, @function 508 509_generic_read_RGBA_span_RGB565_MMX: 510 _CET_ENDBR 511#ifdef USE_INNER_EMMS 512 emms 513#endif 514 515 movl 4(%esp), %eax /* source pointer */ 516 movl 8(%esp), %edx /* destination pointer */ 517 movl 12(%esp), %ecx /* number of pixels to copy */ 518 519 pushl $MASK_565_H 520 pushl $MASK_565_L 521 movq (%esp), %mm5 522 pushl $PRESCALE_H 523 pushl $PRESCALE_L 524 movq (%esp), %mm6 525 pushl $SCALE_H 526 pushl $SCALE_L 527 movq (%esp), %mm7 528 pushl $ALPHA_H 529 pushl $ALPHA_L 530 movq (%esp), %mm3 531 addl $32,%esp 532 533 sarl $2, %ecx 534 jl .L01 /* Bail early if the count is negative. */ 535 jmp .L02 536 537.L03: 538 /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and 539 * second pixels into the four words of %mm0 and %mm2. 540 */ 541 542 movq (%eax), %mm4 543 addl $8, %eax 544 545 pshufw $0x00, %mm4, %mm0 546 pshufw $0x55, %mm4, %mm2 547 548 549 /* Mask the pixels so that each word of each register contains only 550 * one color component. 551 */ 552 553 pand %mm5, %mm0 554 pand %mm5, %mm2 555 556 557 /* Adjust the component values so that they are as small as possible, 558 * but large enough so that we can multiply them by an unsigned 16-bit 559 * number and get a value as large as 0x00ff0000. 560 */ 561 562 pmullw %mm6, %mm0 563 pmullw %mm6, %mm2 564#if SCALE_ADJUST > 0 565 psrlw $SCALE_ADJUST, %mm0 566 psrlw $SCALE_ADJUST, %mm2 567#endif 568 569 /* Scale the input component values to be on the range 570 * [0, 0x00ff0000]. This it the real magic of the whole routine. 571 */ 572 573 pmulhuw %mm7, %mm0 574 pmulhuw %mm7, %mm2 575 576 577 /* Always set the alpha value to 0xff. 578 */ 579 580 por %mm3, %mm0 581 por %mm3, %mm2 582 583 584 /* Pack the 16-bit values to 8-bit values and store the converted 585 * pixel data. 586 */ 587 588 packuswb %mm2, %mm0 589 movq %mm0, (%edx) 590 addl $8, %edx 591 592 pshufw $0xaa, %mm4, %mm0 593 pshufw $0xff, %mm4, %mm2 594 595 pand %mm5, %mm0 596 pand %mm5, %mm2 597 pmullw %mm6, %mm0 598 pmullw %mm6, %mm2 599#if SCALE_ADJUST > 0 600 psrlw $SCALE_ADJUST, %mm0 601 psrlw $SCALE_ADJUST, %mm2 602#endif 603 pmulhuw %mm7, %mm0 604 pmulhuw %mm7, %mm2 605 606 por %mm3, %mm0 607 por %mm3, %mm2 608 609 packuswb %mm2, %mm0 610 611 movq %mm0, (%edx) 612 addl $8, %edx 613 614 subl $1, %ecx 615.L02: 616 jne .L03 617 618 619 /* At this point there can be at most 3 pixels left to process. If 620 * there is either 2 or 3 left, process 2. 621 */ 622 623 movl 12(%esp), %ecx 624 testl $0x02, %ecx 625 je .L04 626 627 movd (%eax), %mm4 628 addl $4, %eax 629 630 pshufw $0x00, %mm4, %mm0 631 pshufw $0x55, %mm4, %mm2 632 633 pand %mm5, %mm0 634 pand %mm5, %mm2 635 pmullw %mm6, %mm0 636 pmullw %mm6, %mm2 637#if SCALE_ADJUST > 0 638 psrlw $SCALE_ADJUST, %mm0 639 psrlw $SCALE_ADJUST, %mm2 640#endif 641 pmulhuw %mm7, %mm0 642 pmulhuw %mm7, %mm2 643 644 por %mm3, %mm0 645 por %mm3, %mm2 646 647 packuswb %mm2, %mm0 648 649 movq %mm0, (%edx) 650 addl $8, %edx 651 652.L04: 653 /* At this point there can be at most 1 pixel left to process. 654 * Process it if needed. 655 */ 656 657 testl $0x01, %ecx 658 je .L01 659 660 movzwl (%eax), %ecx 661 movd %ecx, %mm4 662 663 pshufw $0x00, %mm4, %mm0 664 665 pand %mm5, %mm0 666 pmullw %mm6, %mm0 667#if SCALE_ADJUST > 0 668 psrlw $SCALE_ADJUST, %mm0 669#endif 670 pmulhuw %mm7, %mm0 671 672 por %mm3, %mm0 673 674 packuswb %mm0, %mm0 675 676 movd %mm0, (%edx) 677 678.L01: 679#ifdef USE_INNER_EMMS 680 emms 681#endif 682 ret 683#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */ 684 685#if defined (__ELF__) && defined (__linux__) 686 .section .note.GNU-stack,"",%progbits 687#endif 688