;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; RV40 rounding-bias tables: 16 groups of 4 identical words each, indexed by
; rnd_bias = ((my & 6) * 4 + mx) >> 1 (see the rnd_bias computation below),
; scaled by 8 bytes per group.
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

; Copy (or average into dst, depending on CHROMAMC_AVG) an 8-wide block,
; 4 rows per iteration, for the mx == 0 && my == 0 case: no filtering needed.
; Uses r0=dst, r1=src, r2=stride, r3d=h; clobbers r4, mm0, mm1.
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq         [r0   ], mm0
    movq         [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq         [r0   ], mm0
    movq         [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1, the index into the RV40 bias tables
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m      ; x86-32 reloads dst: r5 was consumed as dest_reg alias
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]  ; mm0 = src[0..7]
    movq          m2, [r1+r6]  ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4       ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5       ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq          [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d      ; x
    movd          m6, r5d      ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp      ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16       ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4       ; mm4 = x words
    punpckldq     m6, m6       ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6       ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq          [rsp+8], m4  ; DD = x * y
    psubw         m5, m4       ; mm5 = B = 8x - xy
    psubw         m6, m4       ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7       ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq          [rsp  ], m4

    movq          m0, [r1  ]   ; mm0 = src[0..7]
    movq          m1, [r1+1]   ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1       ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0       ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1       ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4       ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]     ; reload next row's src[0..7] for the next iteration

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq          [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6       ; restore stack pointer
    RET
%endmacro

; 4-wide bilinear chroma MC. Processes two rows per iteration; the filtered
; bottom row of one step is carried over (m6/m0) as the top row of the next,
; so each source row is loaded and horizontally filtered only once.
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d      ; x
    movd          m3, r5d      ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2       ; m4 = 8-x
    psubw         m5, m3       ; m5 = 8-y

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6        ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    ; prime m6 with the horizontally-filtered first row
    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4       ; (8-x) * src[0..3]
    pmullw        m6, m2       ;    x  * src[1..4]
    paddw         m6, m0

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1       ; keep this row's horizontal filter for the next row

    pmullw        m6, m5       ; (8-y) * prev row
    pmullw        m1, m3       ;    y  * this row
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1       ; carry over for next loop iteration
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

; 2-wide chroma MC using pmaddwd on interleaved {A,B}/{C,D} coefficient pairs;
; src pixels are shuffled to [0,1,1,2] so one pmaddwd filters both columns.
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif

    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d      ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d      ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5       ; mm5 = {A,B,A,B}
    punpckldq     m6, m6       ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94 ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5       ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94 ; mm0 = src[0,1,1,2]
    movq          m2, m0       ; next row becomes this row's top line
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0       ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

; CHROMAMC_AVG/CHROMAMC_AVG4 plug-ins: no-op for "put", pavgb-average for "avg"
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

; SSSE3 versions: pack the two filter taps into bytes and use pmaddubsw,
; so each row needs one multiply-add instead of unpack + two pmullw.
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8        ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4       ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d      ;  y   *(x*255+8) =  y   *x<<8 |  y   *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1       ; interleave src[0..7] with src[1..8]
    movlhps       m7, m7
    movlhps       m6, m6

.next2rows:
    movq          m1, [r1+r2*1  ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4       ; carry bottom row over as next iteration's top
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5       ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1   ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4       ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8        ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d      ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d      ;  y   *(x*255+8) =  y   *x<<8 |  y   *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4       ; carry bottom row over as next iteration's top
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0   ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264