;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pd_65535
cextern pw_1023
%define pw_pixel_max pw_1023
cextern pw_16
cextern pw_1
cextern pb_0

pad10:  times 8 dw 10*1023
pad20:  times 8 dw 20*1023
pad30:  times 8 dw 30*1023
depad:  times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad:  times 8 dw 16*1022/32 ; needs to be mod 16

tap1: times 4 dw  1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5,  1

SECTION .text


%macro AVG_MOV 2
    pavgw %2, %1
    mova  %1, %2
%endmacro

%macro ADDW 3
%if mmsize == 8
    paddw %1, %2
%else
    movu  %3, %2
    paddw %1, %3
%endif
%endmacro

%macro FILT_H 4
    paddw %1, %4
    psubw %1, %2  ; a-b
    psraw %1, 2   ; (a-b)/4
    psubw %1, %2  ; (a-b)/4-b
    paddw %1, %3  ; (a-b)/4-b+c
    psraw %1, 2   ; ((a-b)/4-b+c)/4
    paddw %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

%macro PRELOAD_V 0
    lea      r3, [r2*3]
    sub      r1, r3
    movu     m0, [r1+r2]
    movu     m1, [r1+r2*2]
    add      r1, r3
    movu     m2, [r1]
    movu     m3, [r1+r2]
    movu     m4, [r1+r2*2]
    add      r1, r3
%endmacro

%macro FILT_V 8
    movu     %6, [r1]
    paddw    %1, %6
    mova     %7, %2
    paddw    %7, %5
    mova     %8, %3
    paddw    %8, %4
    FILT_H   %1, %7, %8, [pw_16]
    psraw    %1, 1
    CLIPW    %1, [pb_0], [pw_pixel_max]
%endmacro

%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro

%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    add  r0, %3*2
    add  r1, %3*2
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3]
    lea  r1, [r1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3+%3*2]
    lea  r1, [r1+r2*%3+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    mov  r%6, r0
%assign p1 %6+1
    mov  r %+ p1, r1
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+%3*2]
    lea  r1, [r %+ p1+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3]
    lea  r1, [r %+ p1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3+%3*2]
    lea  r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif
%endif
%endmacro
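; MCAxA_OP assembles the full-size (%4 x %4) function out of four calls to
; the half-size (%3-wide) stub, one per quadrant: dst/src unchanged,
; dst/src + %3*2 bytes (%3 pixels to the right, sizeof(pixel) == 2),
; dst/src + stride*%3 (%3 rows down), and both offsets combined.  x86_32
; reloads r0/r1 from the stack arguments after every call because the stub
; advances them; x86_64 parks copies in two extra GPRs instead (hence the
; "%6 + 2" register count).  On UNIX64 the fourth call is dropped and
; execution falls through into the stub that cglobal_mc emits immediately
; afterwards, so the stub's own ret returns straight to the original caller,
; saving one call/ret pair.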

;cpu, put/avg, mc, 4/8, ...
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif

cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif

stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
    movu     m0, [r1     ]
    OP_MOV   [r0     ], m0
    movu     m0, [r1+r2  ]
    OP_MOV   [r0+r2  ], m0
    movu     m0, [r1+r2*2]
    OP_MOV   [r0+r2*2], m0
    movu     m0, [r1+r3  ]
    OP_MOV   [r0+r3  ], m0
%endmacro

%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
    lea  r3, [r2*3]
    COPY4
    ret

INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
    lea  r3, [r2*3]
    COPY4
    lea  r0, [r0+r2*4]
    lea  r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10, 3,4
    mov r3d, 8
.loop:
    movu     m0, [r1      ]
    movu     m1, [r1   +16]
    OP_MOV   [r0      ], m0
    OP_MOV   [r0   +16], m1
    movu     m0, [r1+r2   ]
    movu     m1, [r1+r2+16]
    OP_MOV   [r0+r2   ], m0
    OP_MOV   [r0+r2+16], m1
    lea      r0, [r0+r2*2 ]
    lea      r1, [r1+r2*2 ]
    dec r3d
    jg .loop
    REP_RET
%endmacro

%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro

%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
    mov     r3d, %2
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define  p16 m8
%else
    %define  p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP     2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    OP_MOV   [r0], m2
    add      r0, r2
    add      r1, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20
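; A worked check of the shift trick in FILT_H (and its inlined copy in
; H_FILT_AVG below): with the pw_16 rounding constant already added to a,
; the macro evaluates
;   ((((a+16-b) >> 2) - b + c) >> 2) + c
; Arithmetic right shifts are floor divisions, floor(floor(x/4)/4) equals
; floor(x/16), and integer terms can be moved inside the floor, so this is
;   (a+16 - 5*b + 20*c) >> 4
; and the callers' final "psraw 1" yields (a - 5*b + 20*c + 16) >> 5,
; bit-exact with the spec's half-pel formula; the multiplies of a pmaddwd
; implementation are traded for adds and shifts with no loss of precision.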

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov      r4, r1
.body:
    mov     r3d, %2
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define  p16 m8
%else
    %define  p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP     2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    movu     m3, [r4]
    pavgw    m2, m3
    OP_MOV   [r0], m2
    add      r0, r2
    add      r1, r2
    add      r4, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10:
    add    r4, r2
.no_addr4:
    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
    add    r1, r2
    add    r0, r2
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC02

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r4, r2
    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    movu     m7, [r4]
    pavgw    m0, m7
    OP_MOV   [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC01
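; The vertical filters above keep a sliding window of six source rows in
; m0-m5.  Instead of moving data to advance the window, the code rotates the
; register *names*: each SWAP 0,1,2,3,4,5 shifts x86inc's register
; permutation, and one v_filt stub is emitted per rotation state (six for
; sse2, matching the window height).  A caller on output row j then calls
; stub number j mod 6, whose view of m0..m5 lines up with the caller's
; current rotation; the mmxext version only emits four stubs because its
; 4-row blocks never reach rotations 4 and 5.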

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used
;in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
    movu     m5, [r4-4]
    ADDW     m5, [r4+6], m7
    movu     m6, [r4-2]
    ADDW     m6, [r4+4], m7
    paddw    m5, [pw_16]
    psubw    m5, m6  ; a-b
    psraw    m5, 2   ; (a-b)/4
    psubw    m5, m6  ; (a-b)/4-b
    movu     m6, [r4+0]
    ADDW     m6, [r4+2], m7
    paddw    m5, m6  ; (a-b)/4-b+c
    psraw    m5, 2   ; ((a-b)/4-b+c)/4
    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw    m5, 1
    CLIPW    m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw    m0, m5
%if %0!=4
    movu     m5, [r1+r5]
%endif
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r0, r2
    sub      r4, r2
    mov      r5, r2
    neg      r5
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu     m5, [r1+r5]
%endif
    OP_MOV   [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC11

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
    mov r4, r1
    add r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
    lea r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
    lea r4, [r1+r2]
    add r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC33
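; The 2D (hv) functions below run the vertical 6-tap first and park the raw
; intermediates in a stack buffer for the horizontal pass.  Range check for
; the int16 intermediates, with 10-bit input and a, b, c each a sum of two
; pixels in [0, 2*1023]:
;   v = a - 5*b + 20*c  lies in  [-5*2046, 21*2046] = [-10230, 42966]
; The top end does not fit in a signed word, so pad20 (= 20*1023 = 20460) is
; subtracted, moving the range to [-30690, 22506], which does.  The
; horizontal pass then multiply-accumulates six of these with
; tap1/tap2/tap3 = 1,-5,20,20,-5,1; the taps sum to 32, so the stored bias
; totals -32*20*1023, and depad = 32*20*1023 + 512 removes it while adding
; the rounding term for the final ">>10".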

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
    psubw %1, %2  ; a-b
    psubw %2, %3  ; b-c
    psllw %2, 2
    psubw %1, %2  ; a-5*b+4*c
    psllw %3, 4
    paddw %1, %3  ; a-5*b+20*c
%endmacro

%macro FILT_VNRD 8
    movu     %6, [r1]
    paddw    %1, %6
    mova     %7, %2
    paddw    %7, %5
    mova     %8, %3
    paddw    %8, %4
    FILT_H2  %1, %7, %8
%endmacro

%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
    neg      r2           ; This actually saves instructions
    lea      r1, [r1+r2*2-mmsize+PAD]
    lea      r4, [rsp+PAD+gprsize]
    mov     r3d, COUNT
.v_loop:
    movu     m0, [r1]
    sub      r1, r2
    movu     m1, [r1]
    sub      r1, r2
    movu     m2, [r1]
    sub      r1, r2
    movu     m3, [r1]
    sub      r1, r2
    movu     m4, [r1]
    sub      r1, r2
%assign i 0
%rep %1-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    sub      r1, r2
    SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    add      r4, mmsize
    lea      r1, [r1+r2*8+mmsize]
%if %1==8
    lea      r1, [r1+r2*4]
%endif
    dec      r3d
    jg .v_loop
    neg      r2
    ret
%endmacro

INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8

%macro H_LOOP 1
%if num_mmregs > 8
    %define s1 m8
    %define s2 m9
    %define s3 m10
    %define d1 m11
%else
    %define s1 [tap1]
    %define s2 [tap2]
    %define s3 [tap3]
    %define d1 [depad]
%endif
h%1_loop_op:
    movu       m1, [r1+mmsize-4]
    movu       m2, [r1+mmsize-2]
    mova       m3, [r1+mmsize+0]
    movu       m4, [r1+mmsize+2]
    movu       m5, [r1+mmsize+4]
    movu       m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd    m1, s1
    pmaddwd    m2, s1
    pmaddwd    m3, s2
    pmaddwd    m4, s2
    pmaddwd    m5, s3
    pmaddwd    m6, s3
    paddd      m1, d1
    paddd      m2, d1
%else
    mova       m0, s1
    pmaddwd    m1, m0
    pmaddwd    m2, m0
    mova       m0, s2
    pmaddwd    m3, m0
    pmaddwd    m4, m0
    mova       m0, s3
    pmaddwd    m5, m0
    pmaddwd    m6, m0
    mova       m0, d1
    paddd      m1, m0
    paddd      m2, m0
%endif
    paddd      m3, m5
    paddd      m4, m6
    paddd      m1, m3
    paddd      m2, m4
    psrad      m1, 10
    psrad      m2, 10
    pslld      m2, 16
    pand       m1, [pd_65535]
    por        m1, m2
%if num_mmregs <= 8
    pxor       m0, m0
%endif
    CLIPW      m1, m0, m7
    add        r1, mmsize*3
    ret
%endmacro

INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8

%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
    mov      r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1)  ; align stack
    sub     rsp, PAD

    call put_hv%2_10

    mov     r3d, %2
    mova     m7, [pw_pixel_max]
%if num_mmregs > 8
    pxor     m0, m0
    mova     m8, [tap1]
    mova     m9, [tap2]
    mova    m10, [tap3]
    mova    m11, [depad]
%endif
    mov      r1, rsp
.h_loop:
    call h%2_loop_op

    OP_MOV   [r0], m1
    add      r0, r2
    dec     r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC22
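; mc12/mc32 below average the hv result with a purely vertical half-pel
; pixel that is reconstructed from the biased intermediate still sitting in
; the stack buffer.  The stored word is v - 20*1023, so adding
;   depad2 = 20*1023 + 16*1022 + 16
; yields v + 16*1022 + 16, which is unsigned-safe for psrlw (minimum
; -10230 + 16368 = 6138, maximum 42966 + 16368 = 59334 < 65536).  Because
; 16*1022 is a multiple of 32, the shift gives ((v+16)>>5) + 16*1022/32,
; and subtracting unpad (= 16*1022/32 = 511) leaves exactly the spec value
; (v+16)>>5, ready for CLIPW and pavgw.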

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
    mov      r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1)  ; align stack
    sub     rsp, PAD

    call put_hv%2_10

    xor     r4d, r4d
.body:
    mov     r3d, %2
    pxor     m0, m0
    mova     m7, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [tap1]
    mova     m9, [tap2]
    mova    m10, [tap3]
    mova    m11, [depad]
%endif
    mov      r1, rsp
.h_loop:
    call h%2_loop_op

    movu     m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
    paddw    m3, [depad2]
    psrlw    m3, 5
    psubw    m3, [unpad]
    CLIPW    m3, m0, m7
    pavgw    m1, m3

    OP_MOV   [r0], m1
    add      r0, r2
    dec     r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC12

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2      ; SIZE*16*3*sizeof(pixel)
    mov      r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1)  ; align stack
    sub     rsp, PAD

    call put_hv%2_10

    mov     r4d, 2            ; sizeof(pixel)
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC32

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_NRD 1
put_h%1_10:
    add     rsp, gprsize
    mov     r3d, %1
    xor     r4d, r4d
    mova     m6, [pad20]
.nextrow:
    movu     m2, [r5-4]
    movu     m3, [r5-2]
    movu     m4, [r5+0]
    ADDW     m2, [r5+6], m5
    ADDW     m3, [r5+4], m5
    ADDW     m4, [r5+2], m5

    FILT_H2  m2, m3, m4
    psubw    m2, m6
    mova     [rsp+r4], m2
    add     r4d, mmsize*3
    add      r5, r2
    dec     r3d
    jg .nextrow
    sub     rsp, gprsize
    ret
%endmacro

INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8

%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
    mov      r5, r1
.body:
%define PAD mmsize*8*3*2      ; SIZE*16*3*sizeof(pixel)
    mov      r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1)  ; align stack

    sub     rsp, PAD
    call put_h%2_10

    sub     rsp, PAD
    call put_hv%2_10

    mov     r4d, PAD-mmsize   ; H buffer
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC21

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
    lea      r5, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro

MC MC23
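; mc32, mc21 and mc23 all funnel into mc12's .body loop and differ only in
; what gets averaged into the hv result, selected through r4: mc12 uses
; r4 = 0 (the vertical intermediate in the same column), mc32 uses r4 = 2
; (one pixel to the right), and mc21/mc23 use r4 = PAD-mmsize so the read
; lands in the second stack buffer of horizontal-only intermediates written
; by put_h%2_10, which sits above the hv buffer; mc23 is mc21 with the
; horizontally filtered source row advanced by one line (r5 = r1+r2).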