;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text

; Every filter below is the H.264 half-pel 6-tap lowpass (1,-5,20,20,-5,1):
;     out = (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5
; evaluated as 5*(4*(C+D) - (B+E)) + (A+F) + 16 so all intermediate products
; stay within 16-bit words (pmullw by pw_5 after a <<2).

; Store helpers, selected by the put/avg macro parameter of each filter:
;   op_put*  writes the result to the destination;
;   op_avg*  rounds-averages (pavgb) the result with the bytes already there.
; The "...h" variants transfer via movh (low half of the register); the plain
; variants transfer a full register.  %1 = result, %2 = dst memory,
; %3 = scratch register (only actually used by op_avgh).
%macro op_avgh 3
    movh          %3, %2
    pavgb         %1, %3
    movh          %2, %1
%endmacro

%macro op_avg 2-3
    pavgb         %1, %2
    mova          %2, %1
%endmacro

%macro op_puth 2-3
    movh          %2, %1
%endmacro

%macro op_put 2-3
    mova          %2, %1
%endmacro

; 4-wide horizontal lowpass, 4 rows.  One row per iteration:
; m1 = B+E, m2 = C+D, m0 = A+F; m7 = 0 for byte->word unpacking.
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor          m7, m7
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r4d, 4
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0               ; B+E
    paddw         m2, m3               ; C+D
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3               ; A+F
    psllw         m2, 2
    psubw         m2, m1               ; 4*(C+D) - (B+E)
    pmullw        m2, m4               ; 20*(C+D) - 5*(B+E)
    paddw         m0, m5               ; A+F+16
    paddw         m0, m2
    psraw         m0, 5
    packuswb      m0, m0
    op_%1h        m0, [r0], m6
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg

; 8-wide horizontal lowpass, 8 rows (MMX: each row handled as a low and a
; high half of 4 words each, unpacked via punpcklbw/punpckhbw).
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov          r4d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2               ; C+D, low half
    paddw         m1, m3               ; C+D, high half
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    punpcklbw     m4, m7
    punpckhbw     m5, m7
    paddw         m2, m4               ; B+E, low half
    paddw         m5, m3               ; B+E, high half
    psubw         m0, m2
    psubw         m1, m5
    pmullw        m0, m6               ; 20*(C+D) - 5*(B+E)
    pmullw        m1, m6
    ; only the edge bytes of A and F are still missing; m3/m4 already hold
    ; the inner contributions from the unpacked [r1-1]/[r1+2] loads
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3               ; A+F, low half
    paddw         m4, m5               ; A+F, high half
    mova          m5, [pw_16]
    paddw         m2, m5
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m1
    op_%1         m0, [r0], m4
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg

; 8-wide horizontal lowpass, SSSE3: one unaligned 16-byte load per row,
; then palignr builds the five shifted copies instead of reloading.
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov          r4d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    movu          m1, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m7
    punpcklbw     m0, m7                ; m0 = words A.., m1 = high continuation
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2             ; src shifted by 1..5 bytes (words)
    palignr       m3, m0, 4
    palignr       m2, m0, 6
    palignr       m1, m0, 8
    palignr       m5, m0, 10
    paddw         m0, m5                ; A+F
    paddw         m2, m3                ; C+D
    paddw         m1, m4                ; B+E
    psllw         m2, 2
    psubw         m2, m1
    paddw         m0, [pw_16]
    pmullw        m2, m6                ; 20*(C+D) - 5*(B+E)
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r2
    dec          r4d
    jne .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg


; _l2 variants: same horizontal lowpass, but the rounded result is first
; pavgb-averaged with a second 8-bit source (src2) before being stored,
; as required for the "diagonal" quarter-pel positions.
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pxor          m7, m7
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r5d, 4
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0               ; B+E
    paddw         m2, m3               ; C+D
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3               ; A+F
    psllw         m2, 2
    psubw         m2, m1
    pmullw        m2, m4
    paddw         m0, m5
    paddw         m0, m2
    movh          m3, [r2]             ; src2 row
    psraw         m0, 5
    packuswb      m0, m0
    pavgb         m0, m3               ; average with src2
    op_%1h        m0, [r0], m6
    add           r0, r3
    add           r1, r3               ; note: src advances by dstStride here
    add           r2, r4
    dec          r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg


; 8-wide _l2: QPEL8_H_LOWPASS_OP plus the pavgb with src2 (see above).
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov          r5d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2               ; C+D low
    paddw         m1, m3               ; C+D high
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    punpcklbw     m4, m7
    punpckhbw     m5, m7
    paddw         m2, m4               ; B+E low
    paddw         m5, m3               ; B+E high
    psubw         m0, m2
    psubw         m1, m5
    pmullw        m0, m6
    pmullw        m1, m6
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3               ; A+F low
    paddw         m4, m5               ; A+F high
    mova          m5, [pw_16]
    paddw         m2, m5
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    mova          m4, [r2]             ; src2 row
    packuswb      m0, m1
    pavgb         m0, m4
    op_%1         m0, [r0], m4
    add           r0, r3
    add           r1, r3
    add           r2, r4
    dec          r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg


; SSSE3 version of the 8-wide _l2 filter (palignr technique as above).
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov          r5d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    lddqu         m1, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m7
    punpcklbw     m0, m7
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2
    palignr       m3, m0, 4
    palignr       m2, m0, 6
    palignr       m1, m0, 8
    palignr       m5, m0, 10
    paddw         m0, m5                ; A+F
    paddw         m2, m3                ; C+D
    paddw         m1, m4                ; B+E
    psllw         m2, 2
    movh          m3, [r2]              ; src2 row
    psubw         m2, m1
    paddw         m0, [pw_16]
    pmullw        m2, m6
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2
    pavgb         m2, m3                ; average with src2
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg


; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
;
; Emits one output row of the vertical 6-tap filter.  On entry m0..m4 hold
; the five previous source rows as words (m0 oldest) and m7 = 0; loads the
; sixth row from [r1], stores one filtered+saturated row to [r0], then SWAP
; rotates the register window so the next invocation continues seamlessly.
%macro FILT_V 1
    mova          m6, m2
    movh          m5, [r1]              ; next source row
    paddw         m6, m3                ; C+D
    psllw         m6, 2
    psubw         m6, m1                ; - B
    psubw         m6, m4                ; - E
    punpcklbw     m5, m7
    pmullw        m6, [pw_5]            ; 20*(C+D) - 5*(B+E)
    paddw         m0, [pw_16]
    add           r1, r3
    paddw         m0, m5                ; A+F+16
    paddw         m6, m0
    psraw         m6, 5
    packuswb      m6, m6
    op_%1h        m6, [r0], m0 ; 1
    add           r0, r2
    SWAP 0, 1, 2, 3, 4, 5               ; slide the 6-row window down by one
%endmacro

; 4-wide vertical lowpass, 4 rows.  Rewinds src by two rows, primes the
; five-row window (m0..m4), then runs FILT_V once per output row.
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub           r1, r3
    sub           r1, r3                ; src -= 2*srcStride (filter context)
    pxor          m7, m7
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg



; 8- or 16-row vertical lowpass (h selects 8 or 16).  The SSE2 variant
; rewinds src by two rows itself; the MMX variant is built with an "_op"
; suffix and does not — presumably its caller passes src already rewound;
; TODO(review): confirm against the C-side wrapper.
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub           r1, r3
    sub           r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
%endif
    pxor          m7, m7
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    cmp          r4d, 16
    jne .end
    FILT_V %1                           ; eight more rows for h == 16
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg


; All functions that use this are required to have args:
; src, tmp, srcSize
;
; Vertical pass of the 2-D (hv) filter: like FILT_V but stores the raw
; 16-bit 6-tap sum (with the +16 bias already folded in, no >>5) to the
; intermediate buffer at byte offset %1; the horizontal second pass does
; the combined shift.  Same m0..m4 sliding window + SWAP as FILT_V.
%macro FILT_HV 1 ; offset
    mova          m6, m2
    movh          m5, [r0]
    paddw         m6, m3                ; C+D
    psllw         m6, 2
    paddw         m0, [pw_16]
    psubw         m6, m1                ; - B
    psubw         m6, m4                ; - E
    punpcklbw     m5, m7
    pmullw        m6, [pw_5]            ; 20*(C+D) - 5*(B+E)
    paddw         m0, m5                ; A+F+16
    add           r0, r2
    paddw         m6, m0
    mova     [r1+%1], m6                ; store unshifted 16-bit row
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

; 4x4 hv lowpass as two separate entry points:
;   *_v: vertical 6-tap into a 16-bit tmp buffer (row stride 24 bytes);
;   *_h: horizontal 6-tap over tmp, using the cheap
;        (((a-b)>>2 - b + c)>>2 + c) >> 6 evaluation, where a = t[-2]+t[3],
;        b = t[-1]+t[2], c = t[0]+t[1] (16-bit-safe stand-in for the exact
;        (a - 5b + 20c) >> 10 of the two unshifted passes).
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn r2, r2d
    pxor          m7, m7
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV 0*24
    FILT_HV 1*24
    FILT_HV 2*24
    FILT_HV 3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn r2, r2d
    mov          r3d, 4
.loop:
    mova          m0, [r0]
    paddw         m0, [r0+10]           ; a = t[-2]+t[3]
    mova          m1, [r0+2]
    paddw         m1, [r0+8]            ; b = t[-1]+t[2]
    mova          m2, [r0+4]
    paddw         m2, [r0+6]            ; c = t[0]+t[1]
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1
    paddsw        m0, m2                ; saturating: sums can hit 16-bit edge
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6
    packuswb      m0, m0
    op_%1h        m0, [r1], m7
    add           r0, 24                ; tmp row stride (bytes)
    add           r1, r2
    dec          r3d
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg

; Vertical (first) pass of the 8/16-wide hv filter: 8 or 16 rows (size),
; tmp row stride 48 bytes.  Processes one 8-pixel-wide column strip per call
; (for width 16 the caller presumably invokes it twice — TODO confirm).
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn r2, r2d
    pxor          m7, m7
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV 0*48
    FILT_HV 1*48
    FILT_HV 2*48
    FILT_HV 3*48
    FILT_HV 4*48
    FILT_HV 5*48
    FILT_HV 6*48
    FILT_HV 7*48
    cmp          r3d, 16
    jne .end
    FILT_HV 8*48                        ; eight more rows for size == 16
    FILT_HV 9*48
    FILT_HV 10*48
    FILT_HV 11*48
    FILT_HV 12*48
    FILT_HV 13*48
    FILT_HV 14*48
    FILT_HV 15*48
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put



; Horizontal (second) pass over the 16-bit tmp rows, MMX: two 4-word groups
; per row, same ((a-b)>>2 - b + c)>>2 + c then >>6 evaluation as the 4-wide
; version above.  h rows, tmp stride 48 bytes.
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn r2, r2d
.loop:
    mova          m0, [r1]
    mova          m3, [r1+8]
    mova          m1, [r1+2]
    mova          m4, [r1+10]
    paddw         m0, m4                ; a, low group
    paddw         m1, m3                ; b, low group
    paddw         m3, [r1+18]           ; a, high group
    paddw         m4, [r1+16]           ; b, high group
    mova          m2, [r1+4]
    mova          m5, [r1+12]
    paddw         m2, [r1+6]            ; c, low group
    paddw         m5, [r1+14]           ; c, high group
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1
    psubw         m3, m4
    paddsw        m0, m2
    paddsw        m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6
    psraw         m3, 6
    packuswb      m0, m3
    op_%1         m0, [r0], m7
    add           r1, 48                ; tmp row stride (bytes)
    add           r0, r2
    dec          r4d
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg

; SSSE3 second pass: 8-wide path builds the shifted word vectors with
; palignr from two 16-byte tmp loads; 16-wide path does the same for two
; halves per row (note m7 is reused as data there, so op_avg's scratch
; argument is effectively dead in that path).
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp          r4d, 16
    je .op16
.loop8:
    mova          m1, [r1+16]
    mova          m0, [r1]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m5, m0, 10
    palignr       m4, m0, 8
    palignr       m3, m0, 6
    palignr       m2, m0, 4
    palignr       m1, m0, 2
    paddw         m0, m5                ; a
    paddw         m1, m4                ; b
    paddw         m2, m3                ; c
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1
    paddw         m0, m2
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6
    packuswb      m0, m0
    op_%1h        m0, [r0], m7
    add           r1, 48
    add           r0, r2
    dec          r4d
    jne .loop8
    jmp .done
.op16:
    mova          m4, [r1+32]
    mova          m5, [r1+16]
    mova          m7, [r1]
    mova          m3, m4
    mova          m2, m4
    mova          m1, m4
    mova          m0, m4
    palignr       m0, m5, 10            ; upper 8 pixels: shifted copies
    palignr       m1, m5, 8
    palignr       m2, m5, 6
    palignr       m3, m5, 4
    palignr       m4, m5, 2
    paddw         m0, m5                ; a, upper half
    paddw         m1, m4                ; b, upper half
    paddw         m2, m3                ; c, upper half
    mova          m6, m5
    mova          m4, m5
    mova          m3, m5
    palignr       m4, m7, 8             ; lower 8 pixels: shifted copies
    palignr       m6, m7, 2
    palignr       m3, m7, 10
    paddw         m4, m6                ; b, lower half
    mova          m6, m5
    palignr       m5, m7, 6
    palignr       m6, m7, 4
    paddw         m3, m7                ; a, lower half
    paddw         m5, m6                ; c, lower half
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1
    psubw         m3, m4
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6
    psraw         m3, 6
    packuswb      m3, m0                ; lower half in low bytes
    op_%1         m3, [r0], m7
    add           r1, 48
    add           r0, r2
    dec          r4d
    jne .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg


; Rescale the 16-bit hv intermediate (>>5), average with the 8-bit source
; src8, then put/avg to dst.  4-wide: four rows fully unrolled, tmp row
; stride 24 bytes (matching FILT_HV 0*24.. above); h is implicitly 4.
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mova          m0, [r1]
    mova          m1, [r1+24]
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0
    packuswb      m1, m1
    pavgb         m0, [r2]
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    lea           r2, [r2+r4*2]
    lea           r0, [r0+r3*2]
    mova          m0, [r1+48]
    mova          m1, [r1+72]
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0
    packuswb      m1, m1
    pavgb         m0, [r2]
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg


; 8-wide variant of the above: two rows per iteration, h/2 iterations,
; tmp row stride 48 bytes (matching FILT_HV 0*48.. above).
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
.loop:
    mova          m0, [r1]
    mova          m1, [r1+8]
    mova          m2, [r1+48]
    mova          m3, [r1+48+8]
    psraw         m0, 5
    psraw         m1, 5
    psraw         m2, 5
    psraw         m3, 5
    packuswb      m0, m1
    packuswb      m2, m3
    pavgb         m0, [r2]
    pavgb         m2, [r2+r4]
    op_%1         m0, [r0], m4
    op_%1         m2, [r0+r3], m5
    lea           r2, [r2+2*r4]
    add           r1, 48*2
    lea           r0, [r0+2*r3]
    sub          r5d, 2                 ; two rows consumed per iteration
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg


%if ARCH_X86_64
; 16-wide horizontal lowpass + src2 average, x86-64 only (needs xmm8-15):
; runs the SSSE3 palignr filter on both 8-pixel halves of each row in
; parallel — the [r1-2] load feeds the low half, [r1+6] the high half —
; then packs, averages with src2 and stores a full 16-byte row.
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov          r5d, 16
    pxor         m15, m15
    mova         m14, [pw_5]
    mova         m13, [pw_16]
.loop:
    lddqu         m1, [r1+6]
    lddqu         m7, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m15
    punpcklbw     m0, m15
    punpcklbw     m7, m15
    mova          m2, m1
    mova          m6, m0
    mova          m3, m1
    mova          m8, m0
    mova          m4, m1
    mova          m9, m0
    mova         m12, m0
    mova         m11, m1
    palignr      m11, m0, 10
    palignr      m12, m7, 10
    palignr       m4, m0, 2
    palignr       m9, m7, 2
    palignr       m3, m0, 4
    palignr       m8, m7, 4
    palignr       m2, m0, 6
    palignr       m6, m7, 6
    paddw        m11, m0               ; a, high half
    palignr       m1, m0, 8
    palignr       m0, m7, 8
    paddw         m7, m12              ; a, low half
    paddw         m2, m3               ; c, high half
    paddw         m6, m8               ; c, low half
    paddw         m1, m4               ; b, high half
    paddw         m0, m9               ; b, low half
    psllw         m2, 2
    psllw         m6, 2
    psubw         m2, m1
    psubw         m6, m0
    paddw        m11, m13              ; + 16
    paddw         m7, m13
    pmullw        m2, m14              ; 20*c - 5*b
    pmullw        m6, m14
    lddqu         m3, [r2]             ; src2 row
    paddw         m2, m11
    paddw         m6, m7
    psraw         m2, 5
    psraw         m6, 5
    packuswb      m6, m2               ; low half in low bytes
    pavgb         m6, m3
    op_%1         m6, [r0], m11
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif