;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Fiona Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; shuffle mask: broadcast byte 3 into every word, high byte forced to 0x80
tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
; pmaddubsw coefficients for the 16x16 plane H-gradient (left/right halves)
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
; pmaddubsw coefficients for the 8x8 plane H-gradient
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmx
cglobal pred16x16_vertical_8, 2,3
    ; r0 = src, r1 = stride; copy the row above the block into all 16 rows
    sub   r0, r1
    mov   r2, 8
    movq  mm0, [r0+0]
    movq  mm1, [r0+8]
.loop:
    ; two rows per iteration, 8 iterations
    movq  [r0+r1*1+0], mm0
    movq  [r0+r1*1+8], mm1
    movq  [r0+r1*2+0], mm0
    movq  [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg    .loop
    REP_RET

INIT_XMM sse
cglobal pred16x16_vertical_8, 2,3
    ; same as above but one 16-byte store per row, 4 rows per iteration
    sub   r0, r1
    mov   r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg    .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
    ; broadcast the byte left of each row (src[-1]) across that row
    mov  r2, 8
%if cpuflag(ssse3)
    mova m2, [pb_3]                 ; pshufb mask: splat byte 3 of the movd load
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]

%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
    SPLATW m0, m0, 3
    SPLATW m1, m1, 3
    ; mmsize == 8 here, so the right halves need separate stores
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea  r0, [r0+r1*2]
    dec  r2
    jg   .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_H
INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov   r4, r0                    ; keep original src for the store pass
    sub   r0, r1
    pxor  mm0, mm0
    pxor  mm1, mm1
    psadbw mm0, [r0+0]              ; sum of top row, left 8 bytes
    psadbw mm1, [r0+8]              ; sum of top row, right 8 bytes
    dec   r0
    movzx r5d, byte [r0+r1*1]       ; start summing the left column
    paddw mm0, mm1
    movd  r6d, mm0
    lea   r0, [r0+r1*2]
%rep 7
    ; r5d/r6d hold two interleaved partial sums of the 16 left-column bytes
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add   r5d, r2d
    add   r6d, r3d
    lea   r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add   r5d, r6d
    lea   r2d, [r2+r5+16]           ; dc = (top sum + left sum + 16) >> 5
    shr   r2d, 5
%if cpuflag(ssse3)
    pxor  m1, m1
%endif
    SPLATB_REG m0, r2, m1           ; broadcast the dc byte into m0

%if mmsize==8
    mov   r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov   r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8, 2,5
    ; TM prediction: row[y][x] = clip(top[x] + left[y] - topleft)
    sub    r0, r1
    pxor   mm7, mm7
    movq   mm0, [r0+0]              ; top row, unpacked to words in mm0..mm3
    movq   mm2, [r0+8]
    movq   mm1, mm0
    movq   mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx  r3d, byte [r0-1]         ; topleft
    mov    r4d, 16
.loop:
    movzx  r2d, byte [r0+r1-1]      ; left[y]
    sub    r2d, r3d                 ; delta = left[y] - topleft
    movd   mm4, r2d
    SPLATW mm4, mm4, 0
    movq   mm5, mm4
    movq   mm6, mm4
    movq   mm7, mm4
    paddw  mm4, mm0
    paddw  mm5, mm1
    paddw  mm6, mm2
    paddw  mm7, mm3
    packuswb mm4, mm5               ; packuswb also performs the clip to [0,255]
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add    r0, r1
    dec    r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmxext
PRED16x16_TM

INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
    ; same TM scheme as the MMX version, two rows per iteration
    sub    r0, r1
    pxor   xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx  r4d, byte [r0-1]         ; topleft
    mov    r5d, 8
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   xmm2, r2d
    movd   xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2           ; broadcast delta to all 8 words
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw  xmm2, xmm0
    paddw  xmm3, xmm1
    paddw  xmm4, xmm0
    paddw  xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea    r0, [r0+r1*2]
    dec    r5d
    jg .loop
    REP_RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
    ; top row minus topleft is precomputed in m0; four rows per iteration
    sub dstq, strideq
    pmovzxbw m0, [dstq]
    vpbroadcastb xm1, [r0-1]
    pmovzxbw m1, xm1
    psubw  m0, m1                   ; m0 = top[x] - topleft (words)
    mov iterationd, 4
    lea stride3q, [strideq*3]
.loop:
    vpbroadcastb xm1, [dstq+strideq*1-1]
    vpbroadcastb xm2, [dstq+strideq*2-1]
    vpbroadcastb xm3, [dstq+stride3q-1]
    vpbroadcastb xm4, [dstq+strideq*4-1]
    pmovzxbw m1, xm1
    pmovzxbw m2, xm2
    pmovzxbw m3, xm3
    pmovzxbw m4, xm4
    paddw  m1, m0
    paddw  m2, m0
    paddw  m3, m0
    paddw  m4, m0
    vpackuswb m1, m1, m2
    vpackuswb m3, m3, m4
    vpermq m1, m1, q3120            ; undo packuswb's in-lane interleave
    vpermq m3, m3, q3120
    movdqa       [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m1, 1
    movdqa       [dstq+stride3q*1], xm3
    vextracti128 [dstq+strideq*4], m3, 1
    lea  dstq, [dstq+strideq*4]
    dec iterationd
    jg .loop
    REP_RET
%endif

;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov    r2, r1           ; +stride
    neg    r1               ; -stride

    movh   m0, [r0+r1  -1]
%if mmsize == 8
    pxor   m4, m4
    movh   m1, [r0+r1  +3 ]
    movh   m2, [r0+r1  +8 ]
    movh   m3, [r0+r1  +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1  ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8    ]
    pmullw m3, [pw_1to8  +8]
    paddw  m0, m2
    paddw  m1, m3
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1  +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%else ; sse2
    pxor   m2, m2
    movh   m1, [r0+r1  +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw  m0, m1
%endif
    movhlps m1, m0
%endif
    paddw  m0, m1
%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova   m1, m0
    psrlq  m1, 32
%endif
    paddw  m0, m1
%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova   m1, m0
    psrlq  m1, 16
%endif
    paddw  m0, m1           ; sum of H coefficients

    ; V gradient: weighted differences of left-column bytes mirrored around
    ; the block centre, accumulated in r5 (scalar; e_reg is scratch).
    lea    r4, [r0+r2*8-1]
    lea    r3, [r0+r2*4-1]
    add    r4, r2

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0        ; x86-32: reuse r0 (src), reloaded from r0m afterwards
%endif

    movzx  e_reg, byte [r3+r2*2   ]
    movzx  r5, byte [r4+r1     ]
    sub    r5, e_reg

    movzx  e_reg, byte [r3+r2     ]
    movzx  r6, byte [r4        ]
    sub    r6, e_reg
    lea    r5, [r5+r6*2]

    movzx  e_reg, byte [r3+r1     ]
    movzx  r6, byte [r4+r2*2   ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]

    movzx  e_reg, byte [r3        ]
%if ARCH_X86_64
    movzx  r7, byte [r4+r2     ]
    sub    r7, e_reg
%else
    movzx  r6, byte [r4+r2     ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]
    sub    r5, r6
%endif

    lea    e_reg, [r3+r1*4]
    lea    r3, [r4+r2*4]

    movzx  r4, byte [e_reg+r2 ]
    movzx  r6, byte [r3       ]
    sub    r6, r4
%if ARCH_X86_64
    lea    r6, [r7+r6*2]
    lea    r5, [r5+r6*2]
    add    r5, r6
%else
    lea    r5, [r5+r6*4]
    lea    r5, [r5+r6*2]
%endif

    movzx  r4, byte [e_reg    ]
%if ARCH_X86_64
    movzx  r7, byte [r3   +r2 ]
    sub    r7, r4
    sub    r5, r7
%else
    movzx  r6, byte [r3   +r2 ]
    sub    r6, r4
    lea    r5, [r5+r6*8]
    sub    r5, r6
%endif

    movzx  r4, byte [e_reg+r1 ]
    movzx  r6, byte [r3   +r2*2]
    sub    r6, r4
%if ARCH_X86_64
    add    r6, r7
%endif
    lea    r5, [r5+r6*8]

    movzx  r4, byte [e_reg+r2*2]
    movzx  r6, byte [r3   +r1 ]
    sub    r6, r4
    lea    r5, [r5+r6*4]
    add    r5, r6           ; sum of V coefficients

%if ARCH_X86_64 == 0
    mov    r0, r0m          ; restore src pointer (was used as e_reg scratch)
%endif

    ; scale V per codec variant: h264/rv40 round differently, svq3 swaps
    ; rounding into two truncating shifts (and later swaps H/V roles)
%ifidn %1, h264
    lea    r5, [r5*5+32]
    sar    r5, 6
%elifidn %1, rv40
    lea    r5, [r5*5]
    sar    r5, 6
%elifidn %1, svq3
    test   r5, r5
    lea    r6, [r5+3]
    cmovs  r5, r6
    sar    r5, 2            ; V/4
    lea    r5, [r5*5]       ; 5*(V/4)
    test   r5, r5
    lea    r6, [r5+15]
    cmovs  r5, r6
    sar    r5, 4            ; (5*(V/4))/16
%endif

    movzx  r4, byte [r0+r1  +15]
    movzx  r3, byte [r3+r2*2   ]
    lea    r3, [r3+r4+1]
    shl    r3, 4            ; 16 * (src[-1][15] + src[15][-1] + 1)

    movd   r1d, m0
    movsx  r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
    lea    r1d, [r1d*5+32]
%else ; rv40
    lea    r1d, [r1d*5]
%endif
    sar    r1d, 6
%else ; svq3
    test   r1d, r1d
    lea    r4d, [r1d+3]
    cmovs  r1d, r4d
    sar    r1d, 2           ; H/4
    lea    r1d, [r1d*5]     ; 5*(H/4)
    test   r1d, r1d
    lea    r4d, [r1d+15]
    cmovs  r1d, r4d
    sar    r1d, 4           ; (5*(H/4))/16
%endif
    movd   m0, r1d

    add    r1d, r5d
    add    r3d, r1d
    shl    r1d, 3
    sub    r3d, r1d          ; a

    movd   m1, r5d
    movd   m3, r3d
    SPLATW m0, m0, 0        ; H
    SPLATW m1, m1, 0        ; V
    SPLATW m3, m3, 0        ; a
%ifidn %1, svq3
    SWAP 0, 1               ; svq3 transposes the plane: swap H and V
%endif
    mova   m2, m0
%if mmsize == 8
    mova   m5, m0
%endif
    pmullw m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw  m2, 3
%else
    psllw  m5, 3
    psllw  m2, 2
    mova   m6, m5
    paddw  m6, m2
%endif
    paddw  m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw  m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw  m5, m0           ; a + {8,9,10,11}*H
    paddw  m6, m0           ; a + {12,13,14,15}*H
%endif

    mov    r4, 8
.loop:
    ; two rows per iteration; each row is (running sums) >> 5, packed/clipped
    mova   m3, m0           ; b[0..7]
    mova   m4, m2           ; b[8..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova   m3, m5           ; b[8..11]
    mova   m4, m6           ; b[12..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw  m0, m1
    paddw  m2, m1
%if mmsize == 8
    paddw  m5, m1
    paddw  m6, m1
%endif

    mova   m3, m0           ; b[0..7]
    mova   m4, m2           ; b[8..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova   m3, m5           ; b[8..11]
    mova   m4, m6           ; b[12..15]
    psraw  m3, 5
    psraw  m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw  m0, m1
    paddw  m2, m1
%if mmsize == 8
    paddw  m5, m1
    paddw  m6, m1
%endif

    lea    r0, [r0+r2*2]
    dec    r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmxext
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov    r2, r1           ; +stride
    neg    r1               ; -stride

    movd   m0, [r0+r1  -1]
%if mmsize == 8
    pxor   m2, m2
    movh   m1, [r0+r1  +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%else ; sse2
    pxor   m2, m2
    movd   m1, [r0+r1  +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%endif
    movhlps m1, m0
%endif
    paddw  m0, m1

%if notcpuflag(ssse3)
%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova   m1, m0
    psrlq  m1, 32
%endif
    paddw  m0, m1
%endif ; !ssse3

%if cpuflag(mmxext)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova   m1, m0
    psrlq  m1, 16
%endif
    paddw  m0, m1           ; sum of H coefficients

    ; V gradient from the left column, accumulated in r5 (scalar)
    lea    r4, [r0+r2*4-1]
    lea    r3, [r0     -1]
    add    r4, r2

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0        ; x86-32: reuse r0 (src), reloaded from r0m afterwards
%endif

    movzx  e_reg, byte [r3+r2*2   ]
    movzx  r5, byte [r4+r1     ]
    sub    r5, e_reg

    movzx  e_reg, byte [r3        ]
%if ARCH_X86_64
    movzx  r7, byte [r4+r2     ]
    sub    r7, e_reg
    sub    r5, r7
%else
    movzx  r6, byte [r4+r2     ]
    sub    r6, e_reg
    lea    r5, [r5+r6*4]
    sub    r5, r6
%endif

    movzx  e_reg, byte [r3+r1     ]
    movzx  r6, byte [r4+r2*2   ]
    sub    r6, e_reg
%if ARCH_X86_64
    add    r6, r7
%endif
    lea    r5, [r5+r6*4]

    movzx  e_reg, byte [r3+r2     ]
    movzx  r6, byte [r4        ]
    sub    r6, e_reg
    lea    r6, [r5+r6*2]

    lea    r5, [r6*9+16]    ; V = (17*sum + 16) >> 5, done as 9x + 8x
    lea    r5, [r5+r6*8]
    sar    r5, 5

%if ARCH_X86_64 == 0
    mov    r0, r0m          ; restore src pointer
%endif

    movzx  r3, byte [r4+r2*2  ]
    movzx  r4, byte [r0+r1  +7]
    lea    r3, [r3+r4+1]
    shl    r3, 4            ; 16 * (src[-1][7] + src[7][-1] + 1)
    movd   r1d, m0
    movsx  r1d, r1w
    imul   r1d, 17
    add    r1d, 16
    sar    r1d, 5           ; H = (17*sum + 16) >> 5
    movd   m0, r1d
    add    r1d, r5d
    sub    r3d, r1d
    add    r1d, r1d
    sub    r3d, r1d         ; a

    movd   m1, r5d
    movd   m3, r3d
    SPLATW m0, m0, 0        ; H
    SPLATW m1, m1, 0        ; V
    SPLATW m3, m3, 0        ; a
%if mmsize == 8
    mova   m2, m0
%endif
    pmullw m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H (words)
    paddw  m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw  m2, 2
    paddw  m2, m0           ; a + {4,5,6,7}*H
%endif

    mov    r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova   m3, m0           ; b[0..7]
    paddw  m0, m1
    psraw  m3, 5
    mova   m4, m0           ; V+b[0..7]
    paddw  m0, m1
    psraw  m4, 5
    packuswb m3, m4
    movh   [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova   m3, m0           ; b[0..3]
    mova   m4, m2           ; b[4..7]
    paddw  m0, m1
    paddw  m2, m1
    psraw  m3, 5
    psraw  m4, 5
    mova   m5, m0           ; V+b[0..3]
    mova   m6, m2           ; V+b[4..7]
    paddw  m0, m1
    paddw  m2, m1
    psraw  m5, 5
    psraw  m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif

    lea    r0, [r0+r2*2]
    dec    r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmxext
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE

;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmx
cglobal pred8x8_vertical_8, 2,2
    ; copy the row above the block into all 8 rows
    sub    r0, r1
    movq   mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
    mov  r2, 4
%if cpuflag(ssse3)
    mova m2, [pb_3]                 ; pshufb mask for SPLATB_LOAD
%endif
.loop:
    SPLATB_LOAD m0, r0+r1*0-1, m2   ; broadcast src[y][-1] across the row
    SPLATB_LOAD m1, r0+r1*1-1, m2
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea  r0, [r0+r1*2]
    dec  r2
    jg   .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_H
INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_top_dc_8, 2,5
    ; two 4x8 DCs from the two halves of the top row; lea-scheduled row ptrs
    sub         r0, r1
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0
    punpcklbw  mm0, mm2
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2        ; dc = (sum + 2) >> 2 via shift+average
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmxext
cglobal pred8x8_dc_8, 2,5
    ; four 4x4 DCs: s0/s1 from top halves, s2/s3 from left halves
    sub    r0, r1
    pxor   m7, m7
    movd   m0, [r0+0]
    movd   m1, [r0+4]
    psadbw m0, m7            ; s0
    mov    r4, r0
    psadbw m1, m7            ; s1

    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    lea    r0, [r0+r1*2]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*1-1]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*2-1]
    add    r2d, r3d
    lea    r0, [r0+r1*2]
    movd   m2, r2d            ; s2
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    lea    r0, [r0+r1*2]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*1-1]
    add    r2d, r3d
    movzx  r3d, byte [r0+r1*2-1]
    add    r2d, r3d
    movd   m3, r2d            ; s3

    punpcklwd m0, m1
    mov    r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b    ; s2, s1, s3, s3
    lea    r2, [r0+r1*2]
    pshufw m0, m0, 01110100b    ; s0, s1, s3, s1
    paddw  m0, m3
    lea    r3, [r2+r1*2]
    psrlw  m0, 2
    pavgw  m0, m7            ; s0+s2, s1, s3, s1+s3
    lea    r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq   m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmxext
cglobal pred8x8_dc_rv40_8, 2,7
    ; single DC over the whole top row + left column (RV40 variant)
    mov   r4, r0
    sub   r0, r1
    pxor  mm0, mm0
    psadbw mm0, [r0]
    dec   r0
    movzx r5d, byte [r0+r1*1]
    movd  r6d, mm0
    lea   r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add   r5d, r2d
    add   r6d, r3d
    lea   r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add   r5d, r6d
    lea   r2d, [r2+r5+8]     ; dc = (sum + 8) >> 4
    shr   r2d, 4
    movd  mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov   r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED8x8_TM 0
cglobal pred8x8_tm_vp8_8, 2,6
    ; TM prediction: row[y][x] = clip(top[x] + left[y] - topleft)
    sub    r0, r1
    pxor   mm7, mm7
    movq   mm0, [r0]
    movq   mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx  r4d, byte [r0-1]         ; topleft
    mov    r5d, 4
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   mm2, r2d
    movd   mm4, r3d
    SPLATW mm2, mm2, 0
    SPLATW mm4, mm4, 0
    movq   mm3, mm2
    movq   mm5, mm4
    paddw  mm2, mm0
    paddw  mm3, mm1
    paddw  mm4, mm0
    paddw  mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea    r0, [r0+r1*2]
    dec    r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmxext
PRED8x8_TM

INIT_XMM sse2
cglobal pred8x8_tm_vp8_8, 2,6,4
    sub    r0, r1
    pxor   xmm1, xmm1
    movq   xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx  r4d, byte [r0-1]         ; topleft
    mov    r5d, 4
.loop:
    movzx  r2d, byte [r0+r1*1-1]
    movzx  r3d, byte [r0+r1*2-1]
    sub    r2d, r4d
    sub    r3d, r4d
    movd   xmm2, r2d
    movd   xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw  xmm2, xmm0
    paddw  xmm3, xmm0
    packuswb xmm2, xmm3             ; two rows packed into one xmm
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea    r0, [r0+r1*2]
    dec    r5d
    jg .loop
    REP_RET

INIT_XMM ssse3
cglobal pred8x8_tm_vp8_8, 2,3,6
    sub    r0, r1
    movdqa xmm4, [tm_shuf]
    pxor   xmm1, xmm1
    movq   xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd   xmm5, [r0-4]
    pshufb xmm5, xmm4               ; topleft broadcast to words
    mov    r2d, 4
.loop:
    movd   xmm2, [r0+r1*1-4]
    movd   xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw  xmm2, xmm5
    psubw  xmm3, xmm5
    paddw  xmm2, xmm0
    paddw  xmm3, xmm0
    packuswb xmm2, xmm3
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
;                           ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_8, 4,4
    sub          r0, r3
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d ; top_left
    jz .fix_lt_2
    test        r2d, r2d ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; top-left unavailable: duplicate top[0] into the missing byte
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d ; top_right
    jnz .body
.fix_tr_1:
    ; top-right unavailable: duplicate top[7] into the missing byte
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw      mm7, mm0
    paddw       mm7, [pw_4]
    psrlw       mm7, 3       ; dc = (sum of filtered top + 4) >> 3
    pshufw      mm7, mm7, 0
    packuswb    mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_TOP_DC
INIT_MMX ssse3
PRED8x8L_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
;                       ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_8, 4,5
    ; gather the 8 left-column bytes into one mmx register (mm3) by
    ; successive punpckhbw/punpckhwd/punpckhdq merges
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1d, r1d
    jnz .do_left
.fix_lt_1:
    ; top-left unavailable: patch the byte next to the left column
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .body
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    ; low-pass filter the left column (result kept in mm7), then load and
    ; fix up the top row before falling through to .body
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7
    psadbw      mm1, mm6
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4       ; dc = (left sum + top sum + 8) >> 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_DC
INIT_MMX ssse3
PRED8x8L_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
;                               int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_8, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test        r1d, r1d
    lea          r1, [r0+r3]
    cmovnz       r1, r0              ; r1 = row holding the top-left byte
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = gathered left column
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm3, mm7
    lea          r1, [r0+r3*2]
    movq        mm7, mm3
    punpckhbw   mm3, mm3
    punpcklbw   mm7, mm7
    ; broadcast each filtered left byte across its own row
    pshufw      mm0, mm3, 0xff
    pshufw      mm1, mm3, 0xaa
    lea          r2, [r1+r3*2]
    pshufw      mm2, mm3, 0x55
    pshufw      mm3, mm3, 0x00
    pshufw      mm4, mm7, 0xff
    pshufw      mm5, mm7, 0xaa
    pshufw      mm6, mm7, 0x55
    pshufw      mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea          r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
PRED8x8L_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
;                             ptrdiff_t stride)
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_8, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d ; top_left
    jz .fix_lt_2
    test        r2d, r2d ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; top-left unavailable: duplicate top[0] into the missing byte
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d ; top_right
    jnz .body
.fix_tr_1:
    ; top-right unavailable: duplicate top[7] into the missing byte
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
;                              int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmxext
cglobal pred8x8l_down_left_8, 4,5
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; no real top-right: extend top[7] across all 8 top-right positions
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm7, mm4
    test        r2d, r2d
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; mm7:mm1 hold the 16 filtered top samples; each output row is the
    ; low-passed sequence shifted right by one byte per row
    lea          r1, [r0+r3*2]
    movq        mm6, mm1
    psrlq       mm1, 56
    movq        mm4, mm1
    lea          r2, [r1+r3*2]
    movq        mm2, mm6
    PALIGNR     mm2, mm7, 1, mm0
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm4, mm6, 1, mm0
    movq        mm5, mm7
    movq        mm1, mm7
    movq        mm7, mm6
    lea          r4, [r2+r3*2]
    psllq       mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq  [r4+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2            ; shift the 16-byte window left by one
    movq  [r4+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r0+r3*2], mm1
    psllq       mm1, 8
    psrlq       mm0, 56
    por         mm1, mm0
    movq  [r0+r3*1], mm1
    RET

%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8, 4,4
    ; SSE2+ variant: top samples are filtered in MMX regs, then combined
    ; into one XMM register so each row is a single psrldq+movq
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d ; top_left
    jz .fix_lt_2
    test        r2d, r2d ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4
    test        r2d, r2d ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea          r1, [r0+r3*2]
    pslldq     xmm4, 8
    por        xmm3, xmm4           ; xmm3 = 16 filtered top samples
    movdqa     xmm2, xmm3
    psrldq     xmm2, 1
    pslldq     xmm5, 15
    por        xmm2, xmm5
    lea          r2, [r1+r3*2]
    movdqa     xmm1, xmm3
    pslldq     xmm1, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq     xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq     xmm0, 1
    lea          r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq     xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq     xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq     xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
;                                      int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmxext
cglobal pred8x8l_down_right_8, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1d, r1d ; top_left
    jz .fix_lt_1
.do_left:
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2,
mm0, 7, mm0 1584 PALIGNR mm1, mm4, 1, mm4 1585 test r1d, r1d ; top_left 1586 jz .fix_lt_2 1587 test r2d, r2d ; top_right 1588 jz .fix_tr_1 1589.do_top: 1590 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1591 movq mm5, mm4 1592 jmp .body 1593.fix_lt_1: 1594 movq mm5, mm3 1595 pxor mm5, mm4 1596 psrlq mm5, 56 1597 psllq mm5, 48 1598 pxor mm1, mm5 1599 jmp .do_left 1600.fix_lt_2: 1601 movq mm5, mm3 1602 pxor mm5, mm2 1603 psllq mm5, 56 1604 psrlq mm5, 56 1605 pxor mm2, mm5 1606 test r2d, r2d ; top_right 1607 jnz .do_top 1608.fix_tr_1: 1609 movq mm5, mm3 1610 pxor mm5, mm1 1611 psrlq mm5, 56 1612 psllq mm5, 56 1613 pxor mm1, mm5 1614 jmp .do_top 1615.body: 1616 lea r1, [r0+r3*2] 1617 movq mm1, mm7 1618 movq mm7, mm5 1619 movq mm5, mm6 1620 movq mm2, mm7 1621 lea r2, [r1+r3*2] 1622 PALIGNR mm2, mm6, 1, mm0 1623 movq mm3, mm7 1624 PALIGNR mm3, mm6, 7, mm0 1625 movq mm4, mm7 1626 lea r4, [r2+r3*2] 1627 psrlq mm4, 8 1628 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 1629 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 1630 movq [r4+r3*2], mm0 1631 movq mm2, mm1 1632 psrlq mm0, 8 1633 psllq mm2, 56 1634 psrlq mm1, 8 1635 por mm0, mm2 1636 movq [r4+r3*1], mm0 1637 movq mm2, mm1 1638 psrlq mm0, 8 1639 psllq mm2, 56 1640 psrlq mm1, 8 1641 por mm0, mm2 1642 movq [r2+r3*2], mm0 1643 movq mm2, mm1 1644 psrlq mm0, 8 1645 psllq mm2, 56 1646 psrlq mm1, 8 1647 por mm0, mm2 1648 movq [r2+r3*1], mm0 1649 movq mm2, mm1 1650 psrlq mm0, 8 1651 psllq mm2, 56 1652 psrlq mm1, 8 1653 por mm0, mm2 1654 movq [r1+r3*2], mm0 1655 movq mm2, mm1 1656 psrlq mm0, 8 1657 psllq mm2, 56 1658 psrlq mm1, 8 1659 por mm0, mm2 1660 movq [r1+r3*1], mm0 1661 movq mm2, mm1 1662 psrlq mm0, 8 1663 psllq mm2, 56 1664 psrlq mm1, 8 1665 por mm0, mm2 1666 movq [r0+r3*2], mm0 1667 psrlq mm0, 8 1668 psllq mm1, 56 1669 por mm0, mm1 1670 movq [r0+r3*1], mm0 1671 RET 1672 1673%macro PRED8x8L_DOWN_RIGHT 0 1674cglobal pred8x8l_down_right_8, 4,5 1675 sub r0, r3 1676 lea r4, [r0+r3*2] 1677 movq mm0, [r0+r3*1-8] 1678 punpckhbw mm0, [r0+r3*0-8] 
1679 movq mm1, [r4+r3*1-8] 1680 punpckhbw mm1, [r0+r3*2-8] 1681 mov r4, r0 1682 punpckhwd mm1, mm0 1683 lea r0, [r0+r3*4] 1684 movq mm2, [r0+r3*1-8] 1685 punpckhbw mm2, [r0+r3*0-8] 1686 lea r0, [r0+r3*2] 1687 movq mm3, [r0+r3*1-8] 1688 punpckhbw mm3, [r0+r3*0-8] 1689 punpckhwd mm3, mm2 1690 punpckhdq mm3, mm1 1691 lea r0, [r0+r3*2] 1692 movq mm0, [r0+r3*0-8] 1693 movq mm1, [r4] 1694 mov r0, r4 1695 movq mm4, mm3 1696 movq mm2, mm3 1697 PALIGNR mm4, mm0, 7, mm0 1698 PALIGNR mm1, mm2, 1, mm2 1699 test r1d, r1d 1700 jz .fix_lt_1 1701 jmp .do_left 1702.fix_lt_1: 1703 movq mm5, mm3 1704 pxor mm5, mm4 1705 psrlq mm5, 56 1706 psllq mm5, 48 1707 pxor mm1, mm5 1708 jmp .do_left 1709.fix_lt_2: 1710 movq mm5, mm3 1711 pxor mm5, mm2 1712 psllq mm5, 56 1713 psrlq mm5, 56 1714 pxor mm2, mm5 1715 test r2d, r2d 1716 jnz .do_top 1717.fix_tr_1: 1718 movq mm5, mm3 1719 pxor mm5, mm1 1720 psrlq mm5, 56 1721 psllq mm5, 56 1722 pxor mm1, mm5 1723 jmp .do_top 1724.do_left: 1725 movq mm0, mm4 1726 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1727 movq mm4, mm0 1728 movq mm7, mm2 1729 movq2dq xmm3, mm2 1730 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1731 psllq mm1, 56 1732 PALIGNR mm7, mm1, 7, mm3 1733 movq2dq xmm1, mm7 1734 movq mm0, [r0-8] 1735 movq mm3, [r0] 1736 movq mm1, [r0+8] 1737 movq mm2, mm3 1738 movq mm4, mm3 1739 PALIGNR mm2, mm0, 7, mm0 1740 PALIGNR mm1, mm4, 1, mm4 1741 test r1d, r1d 1742 jz .fix_lt_2 1743 test r2d, r2d 1744 jz .fix_tr_1 1745.do_top: 1746 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1747 movq2dq xmm4, mm4 1748 lea r1, [r0+r3*2] 1749 movdqa xmm0, xmm3 1750 pslldq xmm4, 8 1751 por xmm3, xmm4 1752 lea r2, [r1+r3*2] 1753 pslldq xmm4, 1 1754 por xmm1, xmm4 1755 psrldq xmm0, 7 1756 pslldq xmm0, 15 1757 psrldq xmm0, 7 1758 por xmm1, xmm0 1759 lea r0, [r2+r3*2] 1760 movdqa xmm2, xmm3 1761 psrldq xmm2, 1 1762INIT_XMM cpuname 1763 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 1764 movdqa xmm1, xmm0 1765 psrldq xmm1, 1 1766 movq [r0+r3*2], xmm0 1767 movq [r0+r3*1], xmm1 1768 psrldq 
xmm0, 2 1769 psrldq xmm1, 2 1770 movq [r2+r3*2], xmm0 1771 movq [r2+r3*1], xmm1 1772 psrldq xmm0, 2 1773 psrldq xmm1, 2 1774 movq [r1+r3*2], xmm0 1775 movq [r1+r3*1], xmm1 1776 psrldq xmm0, 2 1777 psrldq xmm1, 2 1778 movq [r4+r3*2], xmm0 1779 movq [r4+r3*1], xmm1 1780 RET 1781%endmacro 1782 1783INIT_MMX sse2 1784PRED8x8L_DOWN_RIGHT 1785INIT_MMX ssse3 1786PRED8x8L_DOWN_RIGHT 1787 1788;----------------------------------------------------------------------------- 1789; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, 1790; int has_topright, ptrdiff_t stride) 1791;----------------------------------------------------------------------------- 1792 1793INIT_MMX mmxext 1794cglobal pred8x8l_vertical_right_8, 4,5 1795 sub r0, r3 1796 lea r4, [r0+r3*2] 1797 movq mm0, [r0+r3*1-8] 1798 punpckhbw mm0, [r0+r3*0-8] 1799 movq mm1, [r4+r3*1-8] 1800 punpckhbw mm1, [r0+r3*2-8] 1801 mov r4, r0 1802 punpckhwd mm1, mm0 1803 lea r0, [r0+r3*4] 1804 movq mm2, [r0+r3*1-8] 1805 punpckhbw mm2, [r0+r3*0-8] 1806 lea r0, [r0+r3*2] 1807 movq mm3, [r0+r3*1-8] 1808 punpckhbw mm3, [r0+r3*0-8] 1809 punpckhwd mm3, mm2 1810 punpckhdq mm3, mm1 1811 lea r0, [r0+r3*2] 1812 movq mm0, [r0+r3*0-8] 1813 movq mm1, [r4] 1814 mov r0, r4 1815 movq mm4, mm3 1816 movq mm2, mm3 1817 PALIGNR mm4, mm0, 7, mm0 1818 PALIGNR mm1, mm2, 1, mm2 1819 test r1d, r1d 1820 jz .fix_lt_1 1821 jmp .do_left 1822.fix_lt_1: 1823 movq mm5, mm3 1824 pxor mm5, mm4 1825 psrlq mm5, 56 1826 psllq mm5, 48 1827 pxor mm1, mm5 1828 jmp .do_left 1829.fix_lt_2: 1830 movq mm5, mm3 1831 pxor mm5, mm2 1832 psllq mm5, 56 1833 psrlq mm5, 56 1834 pxor mm2, mm5 1835 test r2d, r2d 1836 jnz .do_top 1837.fix_tr_1: 1838 movq mm5, mm3 1839 pxor mm5, mm1 1840 psrlq mm5, 56 1841 psllq mm5, 56 1842 pxor mm1, mm5 1843 jmp .do_top 1844.do_left: 1845 movq mm0, mm4 1846 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1847 movq mm7, mm2 1848 movq mm0, [r0-8] 1849 movq mm3, [r0] 1850 movq mm1, [r0+8] 1851 movq mm2, mm3 1852 movq mm4, mm3 1853 PALIGNR mm2, mm0, 
7, mm0 1854 PALIGNR mm1, mm4, 1, mm4 1855 test r1d, r1d 1856 jz .fix_lt_2 1857 test r2d, r2d 1858 jz .fix_tr_1 1859.do_top: 1860 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1861 lea r1, [r0+r3*2] 1862 movq mm2, mm6 1863 movq mm3, mm6 1864 PALIGNR mm3, mm7, 7, mm0 1865 PALIGNR mm6, mm7, 6, mm1 1866 movq mm4, mm3 1867 pavgb mm3, mm2 1868 lea r2, [r1+r3*2] 1869 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5 1870 movq [r0+r3*1], mm3 1871 movq [r0+r3*2], mm0 1872 movq mm5, mm0 1873 movq mm6, mm3 1874 movq mm1, mm7 1875 movq mm2, mm1 1876 psllq mm2, 8 1877 movq mm3, mm1 1878 psllq mm3, 16 1879 lea r4, [r2+r3*2] 1880 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4 1881 PALIGNR mm6, mm0, 7, mm2 1882 movq [r1+r3*1], mm6 1883 psllq mm0, 8 1884 PALIGNR mm5, mm0, 7, mm1 1885 movq [r1+r3*2], mm5 1886 psllq mm0, 8 1887 PALIGNR mm6, mm0, 7, mm2 1888 movq [r2+r3*1], mm6 1889 psllq mm0, 8 1890 PALIGNR mm5, mm0, 7, mm1 1891 movq [r2+r3*2], mm5 1892 psllq mm0, 8 1893 PALIGNR mm6, mm0, 7, mm2 1894 movq [r4+r3*1], mm6 1895 psllq mm0, 8 1896 PALIGNR mm5, mm0, 7, mm1 1897 movq [r4+r3*2], mm5 1898 RET 1899 1900%macro PRED8x8L_VERTICAL_RIGHT 0 1901cglobal pred8x8l_vertical_right_8, 4,5,7 1902 ; manually spill XMM registers for Win64 because 1903 ; the code here is initialized with INIT_MMX 1904 WIN64_SPILL_XMM 7 1905 sub r0, r3 1906 lea r4, [r0+r3*2] 1907 movq mm0, [r0+r3*1-8] 1908 punpckhbw mm0, [r0+r3*0-8] 1909 movq mm1, [r4+r3*1-8] 1910 punpckhbw mm1, [r0+r3*2-8] 1911 mov r4, r0 1912 punpckhwd mm1, mm0 1913 lea r0, [r0+r3*4] 1914 movq mm2, [r0+r3*1-8] 1915 punpckhbw mm2, [r0+r3*0-8] 1916 lea r0, [r0+r3*2] 1917 movq mm3, [r0+r3*1-8] 1918 punpckhbw mm3, [r0+r3*0-8] 1919 punpckhwd mm3, mm2 1920 punpckhdq mm3, mm1 1921 lea r0, [r0+r3*2] 1922 movq mm0, [r0+r3*0-8] 1923 movq mm1, [r4] 1924 mov r0, r4 1925 movq mm4, mm3 1926 movq mm2, mm3 1927 PALIGNR mm4, mm0, 7, mm0 1928 PALIGNR mm1, mm2, 1, mm2 1929 test r1d, r1d 1930 jnz .do_left 1931.fix_lt_1: 1932 movq mm5, mm3 1933 pxor mm5, mm4 1934 psrlq mm5, 56 1935 
psllq mm5, 48 1936 pxor mm1, mm5 1937 jmp .do_left 1938.fix_lt_2: 1939 movq mm5, mm3 1940 pxor mm5, mm2 1941 psllq mm5, 56 1942 psrlq mm5, 56 1943 pxor mm2, mm5 1944 test r2d, r2d 1945 jnz .do_top 1946.fix_tr_1: 1947 movq mm5, mm3 1948 pxor mm5, mm1 1949 psrlq mm5, 56 1950 psllq mm5, 56 1951 pxor mm1, mm5 1952 jmp .do_top 1953.do_left: 1954 movq mm0, mm4 1955 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1956 movq2dq xmm0, mm2 1957 movq mm0, [r0-8] 1958 movq mm3, [r0] 1959 movq mm1, [r0+8] 1960 movq mm2, mm3 1961 movq mm4, mm3 1962 PALIGNR mm2, mm0, 7, mm0 1963 PALIGNR mm1, mm4, 1, mm4 1964 test r1d, r1d 1965 jz .fix_lt_2 1966 test r2d, r2d 1967 jz .fix_tr_1 1968.do_top: 1969 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1970 lea r1, [r0+r3*2] 1971 movq2dq xmm4, mm6 1972 pslldq xmm4, 8 1973 por xmm0, xmm4 1974 movdqa xmm6, [pw_ff00] 1975 movdqa xmm1, xmm0 1976 lea r2, [r1+r3*2] 1977 movdqa xmm2, xmm0 1978 movdqa xmm3, xmm0 1979 pslldq xmm0, 1 1980 pslldq xmm1, 2 1981 pavgb xmm2, xmm0 1982INIT_XMM cpuname 1983 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 1984 pandn xmm6, xmm4 1985 movdqa xmm5, xmm4 1986 psrlw xmm4, 8 1987 packuswb xmm6, xmm4 1988 movhlps xmm4, xmm6 1989 movhps [r0+r3*2], xmm5 1990 movhps [r0+r3*1], xmm2 1991 psrldq xmm5, 4 1992 movss xmm5, xmm6 1993 psrldq xmm2, 4 1994 movss xmm2, xmm4 1995 lea r0, [r2+r3*2] 1996 psrldq xmm5, 1 1997 psrldq xmm2, 1 1998 movq [r0+r3*2], xmm5 1999 movq [r0+r3*1], xmm2 2000 psrldq xmm5, 1 2001 psrldq xmm2, 1 2002 movq [r2+r3*2], xmm5 2003 movq [r2+r3*1], xmm2 2004 psrldq xmm5, 1 2005 psrldq xmm2, 1 2006 movq [r1+r3*2], xmm5 2007 movq [r1+r3*1], xmm2 2008 RET 2009%endmacro 2010 2011INIT_MMX sse2 2012PRED8x8L_VERTICAL_RIGHT 2013INIT_MMX ssse3 2014PRED8x8L_VERTICAL_RIGHT 2015 2016;----------------------------------------------------------------------------- 2017; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, 2018; int has_topright, ptrdiff_t stride) 
;-----------------------------------------------------------------------------

; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Builds a 16-pixel filtered top/top-right vector; odd rows are pavgb
; averages, even rows are 3-tap lowpassed, each pair shifted left by one.
%macro PRED8x8L_VERTICAL_LEFT 0
cglobal pred8x8l_vertical_left_8, 4,4
    sub        r0, r3                   ; r0 -> row above the block
    movq       mm0, [r0-8]
    movq       mm3, [r0]                ; top row
    movq       mm1, [r0+8]              ; top-right row
    movq       mm2, mm3
    movq       mm4, mm3
    PALIGNR    mm2, mm0, 7, mm0
    PALIGNR    mm1, mm4, 1, mm4
    test       r1d, r1d
    jz         .fix_lt_2
    test       r2d, r2d
    jz         .fix_tr_1
    jmp        .do_top
.fix_lt_2:
    ; missing top-left: substitute t0
    movq       mm5, mm3
    pxor       mm5, mm2
    psllq      mm5, 56
    psrlq      mm5, 56
    pxor       mm2, mm5
    test       r2d, r2d
    jnz        .do_top
.fix_tr_1:
    ; missing top-right byte: substitute t7
    movq       mm5, mm3
    pxor       mm5, mm1
    psrlq      mm5, 56
    psllq      mm5, 56
    pxor       mm1, mm5
    jmp        .do_top
.fix_tr_2:
    ; no top-right block: replicate t7
    punpckhbw  mm3, mm3
    pshufw     mm1, mm3, 0xFF
    jmp        .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4                ; filtered top row
    test       r2d, r2d
    jz         .fix_tr_2
    movq       mm0, [r0+8]
    movq       mm5, mm0
    movq       mm2, mm0
    movq       mm4, mm0
    psrlq      mm5, 56
    PALIGNR    mm2, mm3, 7, mm3
    PALIGNR    mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm3, mm1
    lea        r1, [r0+r3*2]
    pslldq     xmm3, 8
    por        xmm4, xmm3               ; xmm4 = filtered top | top-right
    movdqa     xmm2, xmm4
    movdqa     xmm1, xmm4
    movdqa     xmm3, xmm4
    psrldq     xmm2, 1
    pslldq     xmm1, 1
    pavgb      xmm3, xmm2               ; odd rows
    lea        r2, [r1+r3*2]
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq     xmm0, 1
    movq       [r0+r3*1], xmm3
    movq       [r0+r3*2], xmm0
    lea        r0, [r2+r3*2]
    psrldq     xmm3, 1
    psrldq     xmm0, 1
    movq       [r1+r3*1], xmm3
    movq       [r1+r3*2], xmm0
    psrldq     xmm3, 1
    psrldq     xmm0, 1
    movq       [r2+r3*1], xmm3
    movq       [r2+r3*2], xmm0
    psrldq     xmm3, 1
    psrldq     xmm0, 1
    movq       [r0+r3*1], xmm3
    movq       [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
;                                  int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Uses only the left-edge column; builds interleaved avg/lowpass pairs
; and emits rows by 2-byte shifts (pshufw replicates the last pixel).
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
    sub        r0, r3
    lea        r2, [r0+r3*2]
    movq       mm0, [r0+r3*1-8]
    test       r1d, r1d
    lea        r1, [r0+r3]
    cmovnz     r1, r0                   ; r1 -> top-left row if available, else first row
    punpckhbw  mm0, [r1+r3*0-8]
    movq       mm1, [r2+r3*1-8]
    punpckhbw  mm1, [r0+r3*2-8]
    mov        r2, r0
    punpckhwd  mm1, mm0
    lea        r0, [r0+r3*4]
    movq       mm2, [r0+r3*1-8]
    punpckhbw  mm2, [r0+r3*0-8]
    lea        r0, [r0+r3*2]
    movq       mm3, [r0+r3*1-8]
    punpckhbw  mm3, [r0+r3*0-8]
    punpckhwd  mm3, mm2
    punpckhdq  mm3, mm1                 ; gathered left-edge column
    lea        r0, [r0+r3*2]
    movq       mm0, [r0+r3*0-8]
    movq       mm1, [r1+r3*0-8]
    mov        r0, r2
    movq       mm4, mm3
    movq       mm2, mm3
    PALIGNR    mm4, mm0, 7, mm0
    PALIGNR    mm1, mm2, 1, mm2
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2                 ; filtered left column
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    lea        r1, [r0+r3*2]
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56 ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea        r2, [r1+r3*2]
    por        mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq       mm5, mm4
    punpcklbw  mm4, mm1 ; p4 p3 p2 p1
    punpckhbw  mm5, mm1 ; p8 p7 p6 p5
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq       [r0+r3*1], mm4
    movq       [r0+r3*2], mm5
    lea        r0, [r2+r3*2]
    movq       [r1+r3*1], mm6
    movq       [r1+r3*2], mm7
    movq       [r2+r3*1], mm0
    movq       [r2+r3*2], mm1
    movq       [r0+r3*1], mm2
    movq       [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmxext
cglobal pred8x8l_horizontal_down_8, 4,5
    sub        r0, r3
    lea        r4, [r0+r3*2]
    ; gather the left-edge column into mm3
    movq       mm0, [r0+r3*1-8]
    punpckhbw  mm0, [r0+r3*0-8]
    movq       mm1, [r4+r3*1-8]
    punpckhbw  mm1, [r0+r3*2-8]
    mov        r4, r0
    punpckhwd  mm1, mm0
    lea        r0, [r0+r3*4]
    movq       mm2, [r0+r3*1-8]
    punpckhbw  mm2, [r0+r3*0-8]
    lea        r0, [r0+r3*2]
    movq       mm3, [r0+r3*1-8]
    punpckhbw  mm3, [r0+r3*0-8]
    punpckhwd  mm3, mm2
    punpckhdq  mm3, mm1
    lea        r0, [r0+r3*2]
    movq       mm0, [r0+r3*0-8]
    movq       mm1, [r4]
    mov        r0, r4
    movq       mm4, mm3
    movq       mm2, mm3
    PALIGNR    mm4, mm0, 7, mm0
    PALIGNR    mm1, mm2, 1, mm2
    test       r1d, r1d
    jnz        .do_left
.fix_lt_1:
    movq       mm5, mm3
    pxor       mm5, mm4
    psrlq      mm5, 56
    psllq      mm5, 48
    pxor       mm1, mm5
    jmp        .do_left
.fix_lt_2:
    movq       mm5, mm3
    pxor       mm5, mm2
    psllq      mm5, 56
    psrlq      mm5, 56
    pxor       mm2, mm5
    test       r2d, r2d
    jnz        .do_top
.fix_tr_1:
    movq       mm5, mm3
    pxor       mm5, mm1
    psrlq      mm5, 56
    psllq      mm5, 56
    pxor       mm1, mm5
    jmp        .do_top
.do_left:
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    movq       mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    movq       mm0, [r0-8]
    movq       mm3, [r0]
    movq       mm1, [r0+8]
    movq       mm2, mm3
    movq       mm4, mm3
    PALIGNR    mm2, mm0, 7, mm0
    PALIGNR    mm1, mm4, 1, mm4
    test       r1d, r1d
    jz         .fix_lt_2
    test       r2d, r2d
    jz         .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq       mm5, mm4
    lea        r1, [r0+r3*2]
    psllq      mm7, 56
    movq       mm2, mm5
    movq       mm3, mm6
    movq       mm4, mm2
    PALIGNR    mm2, mm6, 7, mm5
    PALIGNR    mm6, mm7, 7, mm0
    lea        r2, [r1+r3*2]
    PALIGNR    mm4, mm3, 1, mm7
    movq       mm5, mm3
    pavgb      mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq       mm4, mm2
    movq       mm1, mm2
    lea        r4, [r2+r3*2]
    psrlq      mm4, 16
    psrlq      mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq       mm7, mm3
    punpcklbw  mm3, mm0                 ; interleave avg/lowpass pixel pairs
    punpckhbw  mm7, mm0
    movq       mm1, mm7
    movq       mm0, mm7
    movq       mm4, mm7
    movq       [r4+r3*2], mm3
    PALIGNR    mm7, mm3, 2, mm5
    movq       [r4+r3*1], mm7
    PALIGNR    mm1, mm3, 4, mm5
    movq       [r2+r3*2], mm1
    PALIGNR    mm0, mm3, 6, mm3
    movq       [r2+r3*1], mm0
    movq       mm2, mm6
    movq       mm3, mm6
    movq       [r1+r3*2], mm4
    PALIGNR    mm6, mm4, 2, mm5
    movq       [r1+r3*1], mm6
    PALIGNR    mm2, mm4, 4, mm5
    movq       [r0+r3*2], mm2
    PALIGNR    mm3, mm4, 6, mm4
    movq       [r0+r3*1], mm3
    RET
; SSE2/SSSE3 variant of pred8x8l_horizontal_down_8: edges gathered with MMX,
; the combined left+top vector is filtered once in XMM registers and rows
; come from 2-byte shifts of the interleaved result.
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
    sub        r0, r3
    lea        r4, [r0+r3*2]
    movq       mm0, [r0+r3*1-8]
    punpckhbw  mm0, [r0+r3*0-8]
    movq       mm1, [r4+r3*1-8]
    punpckhbw  mm1, [r0+r3*2-8]
    mov        r4, r0
    punpckhwd  mm1, mm0
    lea        r0, [r0+r3*4]
    movq       mm2, [r0+r3*1-8]
    punpckhbw  mm2, [r0+r3*0-8]
    lea        r0, [r0+r3*2]
    movq       mm3, [r0+r3*1-8]
    punpckhbw  mm3, [r0+r3*0-8]
    punpckhwd  mm3, mm2
    punpckhdq  mm3, mm1
    lea        r0, [r0+r3*2]
    movq       mm0, [r0+r3*0-8]
    movq       mm1, [r4]
    mov        r0, r4
    movq       mm4, mm3
    movq       mm2, mm3
    PALIGNR    mm4, mm0, 7, mm0
    PALIGNR    mm1, mm2, 1, mm2
    test       r1d, r1d
    jnz        .do_left
.fix_lt_1:
    movq       mm5, mm3
    pxor       mm5, mm4
    psrlq      mm5, 56
    psllq      mm5, 48
    pxor       mm1, mm5
    jmp        .do_left
.fix_lt_2:
    movq       mm5, mm3
    pxor       mm5, mm2
    psllq      mm5, 56
    psrlq      mm5, 56
    pxor       mm2, mm5
    test       r2d, r2d
    jnz        .do_top
.fix_tr_1:
    movq       mm5, mm3
    pxor       mm5, mm1
    psrlq      mm5, 56
    psllq      mm5, 56
    pxor       mm1, mm5
    jmp        .do_top
.fix_tr_2:
    punpckhbw  mm3, mm3
    pshufw     mm1, mm3, 0xFF
    jmp        .do_topright
.do_left:
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq       mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15
    psrldq     xmm2, 8
    por        xmm0, xmm2               ; xmm0 = filtered left column + lt
    movq       mm0, [r0-8]
    movq       mm3, [r0]
    movq       mm1, [r0+8]
    movq       mm2, mm3
    movq       mm4, mm3
    PALIGNR    mm2, mm0, 7, mm0
    PALIGNR    mm1, mm4, 1, mm4
    test       r1d, r1d
    jz         .fix_lt_2
    test       r2d, r2d
    jz         .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm1, mm4
    test       r2d, r2d
    jz         .fix_tr_2
    movq       mm0, [r0+8]
    movq       mm5, mm0
    movq       mm2, mm0
    movq       mm4, mm0
    psrlq      mm5, 56
    PALIGNR    mm2, mm3, 7, mm3
    PALIGNR    mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5               ; xmm1 = filtered top | top-right
INIT_XMM cpuname
    lea        r2, [r4+r3*2]
    movdqa     xmm2, xmm1
    movdqa     xmm3, xmm1
    PALIGNR    xmm1, xmm0, 7, xmm4
    PALIGNR    xmm2, xmm0, 9, xmm5
    lea        r1, [r2+r3*2]
    PALIGNR    xmm3, xmm0, 8, xmm0
    movdqa     xmm4, xmm1
    pavgb      xmm4, xmm3
    lea        r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw  xmm4, xmm0               ; interleave avg/lowpass pairs
    movhlps    xmm0, xmm4
    ; store rows bottom-up, two pixels (one byte pair) shifted out per row
    movq       [r0+r3*2], xmm4
    movq       [r2+r3*2], xmm0
    psrldq     xmm4, 2
    psrldq     xmm0, 2
    movq       [r0+r3*1], xmm4
    movq       [r2+r3*1], xmm0
    psrldq     xmm4, 2
    psrldq     xmm0, 2
    movq       [r1+r3*2], xmm4
    movq       [r4+r3*2], xmm0
    psrldq     xmm4, 2
    psrldq     xmm0, 2
    movq       [r1+r3*1], xmm4
    movq       [r4+r3*1], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN

;-------------------------------------------------------------------------------
; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
;                             ptrdiff_t stride)
;-------------------------------------------------------------------------------

; r0 = src, r1 = topright (unused here), r2 = stride.
; DC = (sum of 4 top + 4 left pixels + 4) >> 3, splatted to all 16 pixels.
INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
    pxor       mm7, mm7
    mov        r4, r0
    sub        r0, r2
    movd       mm0, [r0]                ; 4 top pixels
    psadbw     mm0, mm7                 ; horizontal byte sum
    movzx      r1d, byte [r0+r2*1-1]    ; left column pixels
    movd       r3d, mm0
    add        r3d, r1d
    movzx      r1d, byte [r0+r2*2-1]
    lea        r0, [r0+r2*2]
    add        r3d, r1d
    movzx      r1d, byte [r0+r2*1-1]
    add        r3d, r1d
    movzx      r1d, byte [r0+r2*2-1]
    add        r3d, r1d
    add        r3d, 4
    shr        r3d, 3
    imul       r3d, 0x01010101          ; replicate DC into all 4 bytes
    mov        [r4+r2*0], r3d
    mov        [r0+r2*0], r3d
    mov        [r0+r2*1], r3d
    mov        [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                 ptrdiff_t stride)
;-----------------------------------------------------------------------------

; TrueMotion ("TM") 4x4 prediction for VP8:
; pred[y][x] = clip(top[x] + left[y] - topleft), two rows per loop iteration.
; r0 = src, r2 = stride.
%macro PRED4x4_TM 0
cglobal pred4x4_tm_vp8_8, 3,6
    sub        r0, r2
    pxor       mm7, mm7
    movd       mm0, [r0]
    punpcklbw  mm0, mm7                 ; top row widened to words
    movzx      r4d, byte [r0-1]         ; topleft pixel
    mov        r5d, 2
.loop:
    movzx      r1d, byte [r0+r2*1-1]
    movzx      r3d, byte [r0+r2*2-1]
    sub        r1d, r4d                 ; left - topleft
    sub        r3d, r4d
    movd       mm2, r1d
    movd       mm4, r3d
%if cpuflag(mmxext)
    pshufw     mm2, mm2, 0
    pshufw     mm4, mm4, 0
%else
    ; plain MMX has no pshufw: broadcast the word manually
    punpcklwd  mm2, mm2
    punpcklwd  mm4, mm4
    punpckldq  mm2, mm2
    punpckldq  mm4, mm4
%endif
    paddw      mm2, mm0
    paddw      mm4, mm0
    packuswb   mm2, mm2                 ; saturating clip back to bytes
    packuswb   mm4, mm4
    movd       [r0+r2*1], mm2
    movd       [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec        r5d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED4x4_TM
INIT_MMX mmxext
PRED4x4_TM

; SSSE3 TM variant: pshufb with tm_shuf widens a pixel byte straight to
; four words, letting all four rows be computed without a loop.
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
    sub        r0, r2
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1                 ; top row as words
    movd       mm7, [r0-4]
    pshufb     mm7, mm6                 ; broadcast topleft as words
    lea        r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6                 ; broadcast each left pixel
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm0, mm7                 ; top - topleft
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd       [r0+r2*1], mm2
    movd       [r0+r2*2], mm3
    movd       [r1+r2*1], mm4
    movd       [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                       ptrdiff_t stride)
;-----------------------------------------------------------------------------

; Lowpass-filtered vertical prediction: the same filtered top row is
; written to all four rows. r0 = src, r1 = topright, r2 = stride.
INIT_MMX mmxext
cglobal pred4x4_vertical_vp8_8, 3,3
    sub        r0, r2
    movd       m1, [r0-1]
    movd       m0, [r0]
    mova       m2, m0 ;t0 t1 t2 t3
    punpckldq  m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea        r1, [r0+r2*2]
    psrlq      m0, 8 ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd       [r0+r2*1], m3
    movd       [r0+r2*2], m3
    movd       [r1+r2*1], m3
    movd       [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                    ptrdiff_t stride)
;-----------------------------------------------------------------------------
; r0 = src, r1 = topright, r2 = stride; each row shifts the filtered
; top+topright diagonal right by one pixel.
INIT_MMX mmxext
cglobal pred4x4_down_left_8, 3,3
    sub        r0, r2
    movq       m1, [r0]
    punpckldq  m1, [r1]                 ; top | topright
    movq       m2, m1
    movq       m3, m1
    psllq      m1, 8
    pxor       m2, m1
    psrlq      m2, 8
    pxor       m2, m3
    PRED4x4_LOWPASS m0, m1, m2, m3, m4
    lea        r1, [r0+r2*2]
    psrlq      m0, 8
    movd       [r0+r2*1], m0
    psrlq      m0, 8
    movd       [r0+r2*2], m0
    psrlq      m0, 8
    movd       [r1+r2*1], m0
    psrlq      m0, 8
    movd       [r1+r2*2], m0
    RET

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                        ptrdiff_t stride)
;------------------------------------------------------------------------------

; Even rows: pavgb of adjacent top pixels; odd rows: 3-tap lowpass;
; each second row pair is the first shifted right by one pixel.
INIT_MMX mmxext
cglobal pred4x4_vertical_left_8, 3,3
    sub        r0, r2
    movq       m1, [r0]
    punpckldq  m1, [r1]                 ; top | topright
    movq       m3, m1
    movq       m2, m1
    psrlq      m3, 8
    psrlq      m2, 16
    movq       m4, m3
    pavgb      m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea        r1, [r0+r2*2]
    movh       [r0+r2*1], m4
    movh       [r0+r2*2], m0
    psrlq      m4, 8
    psrlq      m0, 8
    movh       [r1+r2*1], m4
    movh       [r1+r2*2], m0
    RET

;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                        ptrdiff_t stride)
;------------------------------------------------------------------------------

; r0 = src, r1 = topright (unused), r2 = stride.
; Gathers the four left pixels, pads with the last one (l3), and builds
; interleaved avg/lowpass pairs; rows are 16-bit shifts of that result.
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_8, 3,3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movd       m0, [r0+r2*1-4]
    punpcklbw  m0, [r0+r2*2-4]
    movd       m1, [r1+r2*1-4]
    punpcklbw  m1, [r1+r2*2-4]
    punpckhwd  m0, m1                   ; left column l0..l3
    movq       m1, m0
    punpckhbw  m1, m1
    pshufw     m1, m1, 0xFF             ; replicate l3 for padding
    punpckhdq  m0, m1
    movq       m2, m0
    movq       m3, m0
    movq       m7, m0
    psrlq      m2, 16
    psrlq      m3, 8
    pavgb      m7, m3
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw  m7, m4                   ; interleave avg/lowpass pairs
    movd       [r0+r2*1], m7
    psrlq      m7, 16
    movd       [r0+r2*2], m7
    psrlq      m7, 16
    movd       [r1+r2*1], m7
    movd       [r1+r2*2], m1            ; last row = l3 replicated
    RET

;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
;                                          const uint8_t *topright,
;                                          ptrdiff_t stride)
;------------------------------------------------------------------------------

; r0 = src, r1 = topright (unused), r2 = stride.
; Packs left column + topleft + top row into one qword, then interleaves
; pavgb/lowpass pairs; rows are produced bottom-up by 16-bit shifts.
INIT_MMX mmxext
cglobal pred4x4_horizontal_down_8, 3,3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movh       m0, [r0-4] ; lt ..
    punpckldq  m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
    psllq      m0, 8 ; t2 t1 t0 lt .. .. .. ..
    movd       m1, [r1+r2*2-4] ; l3
    punpcklbw  m1, [r1+r2*1-4] ; l2 l3
    movd       m2, [r0+r2*2-4] ; l1
    punpcklbw  m2, [r0+r2*1-4] ; l0 l1
    punpckhwd  m1, m2 ; l0 l1 l2 l3
    punpckhdq  m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
    movq       m0, m1
    movq       m2, m1
    movq       m5, m1
    psrlq      m0, 16 ; .. .. t2 t1 t0 lt l0 l1
    psrlq      m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
    pavgb      m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw  m5, m3                   ; interleave avg/lowpass pairs
    psrlq      m3, 32
    PALIGNR    m3, m5, 6, m4
    movh       [r1+r2*2], m5
    psrlq      m5, 16
    movh       [r1+r2*1], m5
    psrlq      m5, 16
    movh       [r0+r2*2], m5
    movh       [r0+r2*1], m3
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
;                                         const uint8_t *topright,
;                                         ptrdiff_t stride)
;-----------------------------------------------------------------------------

; r0 = src, r1 = topright (unused), r2 = stride.
; Builds the t3..l2 edge qword via successive PALIGNRs from the rows'
; trailing bytes; even rows averaged, odd rows lowpassed.
INIT_MMX mmxext
cglobal pred4x4_vertical_right_8, 3,3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movh       m0, [r0] ; ........t3t2t1t0
    movq       m5, m0
    PALIGNR    m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
    pavgb      m5, m0
    PALIGNR    m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
    movq       m1, m0
    PALIGNR    m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
    movq       m2, m0
    PALIGNR    m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movq       m1, m3
    psrlq      m3, 16
    psllq      m1, 48
    movh       [r0+r2*1], m5
    movh       [r0+r2*2], m3
    PALIGNR    m5, m1, 7, m2
    psllq      m1, 8
    movh       [r1+r2*1], m5
    PALIGNR    m3, m1, 7, m1
    movh       [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                     ptrdiff_t stride)
;-----------------------------------------------------------------------------

; r0 = src, r1 = topright (unused), r2 = stride.
; Builds the l3..lt..t3 diagonal edge, lowpasses it once, and emits each
; row as a one-byte right shift of the previous one (bottom-up).
INIT_MMX mmxext
cglobal pred4x4_down_right_8, 3,3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m1, [r1-8]
    movq       m2, [r0+r2*1-8]
    punpckhbw  m2, [r0-8]
    movh       m3, [r0]
    punpckhwd  m1, m2
    PALIGNR    m3, m1, 5, m1            ; combine left column, lt and top row
    movq       m1, m3
    PALIGNR    m3, [r1+r2*1-8], 7, m4
    movq       m2, m3
    PALIGNR    m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh       [r1+r2*2], m0
    psrlq      m0, 8
    movh       [r1+r2*1], m0
    psrlq      m0, 8
    movh       [r0+r2*2], m0
    psrlq      m0, 8
    movh       [r0+r2*1], m0
    RET