; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
cextern pw_255
cextern pw_512
cextern pw_2048
cextern pw_8192
cextern pw_1023
cextern pw_1024
cextern pw_4096
%define pw_8           pw_512
%define pw_10          pw_2048
%define pw_12          pw_8192
%define pw_bi_10       pw_1024
%define pw_bi_12       pw_4096
%define max_pixels_8   pw_255
%define max_pixels_10  pw_1023
pw_bi_8:       times 16 dw  (1 <<  8)
max_pixels_12: times 16 dw ((1 << 12)-1)
cextern pd_1
cextern pb_0

%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1:
                times %2 d%3 -2, 58
                times %2 d%3 10, -2
                times %2 d%3 -4, 54
                times %2 d%3 16, -2
                times %2 d%3 -6, 46
                times %2 d%3 28, -4
                times %2 d%3 -4, 36
                times %2 d%3 36, -4
                times %2 d%3 -4, 28
                times %2 d%3 46, -6
                times %2 d%3 -2, 16
                times %2 d%3 54, -4
                times %2 d%3 -2, 10
                times %2 d%3 58, -2
%endmacro


EPEL_TABLE  8, 16, b, avx2
EPEL_TABLE 10,  8, w, avx2

EPEL_TABLE  8,  8, b, sse4
EPEL_TABLE 10,  4, w, sse4
EPEL_TABLE 12,  4, w, sse4

%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1:
                times %2 d%3  -1,  4
                times %2 d%3 -10, 58
                times %2 d%3  17, -5
                times %2 d%3   1,  0
                times %2 d%3  -1,  4
                times %2 d%3 -11, 40
                times %2 d%3  40,-11
                times %2 d%3   4, -1
                times %2 d%3   0,  1
                times %2 d%3  -5, 17
                times %2 d%3  58,-10
                times %2 d%3   4, -1
%endmacro

QPEL_TABLE  8,  8, b, sse4
QPEL_TABLE 10,  4, w, sse4
QPEL_TABLE 12,  4, w, sse4

QPEL_TABLE  8, 16, b, avx2
QPEL_TABLE 10,  8, w, avx2
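; The tables above store the HEVC interpolation coefficients with the taps
; interleaved in pairs ((c1,c2), (c3,c4), ...), each pair splatted across the
; vector, so a single pmaddubsw/pmaddwd multiplies and sums two taps per
; output pixel; epel is the 4-tap chroma filter and qpel the 8-tap luma
; filter of the HEVC specification.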
SECTION .text

%define MAX_PB_SIZE  64

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10

%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10

%if ARCH_X86_64

%macro SIMPLE_BILOAD 4   ; width, tab, r1, r2
%if %1 <= 4
    movq              %3, [%2]                  ; load data from source2
%elif %1 <= 8
    movdqa            %3, [%2]                  ; load data from source2
%elif %1 <= 12
%if cpuflag(avx2)
    mova              %3, [%2]
%else
    movdqa            %3, [%2]                  ; load data from source2
    movq              %4, [%2+16]               ; load data from source2
%endif ; avx2
%elif %1 <= 16
%if cpuflag(avx2)
    mova              %3, [%2]
%else
    movdqa            %3, [%2]                  ; load data from source2
    movdqa            %4, [%2+16]               ; load data from source2
%endif ; avx2
%else ; %1 = 32
    mova              %3, [%2]
    mova              %4, [%2+32]
%endif
%endmacro

%macro SIMPLE_LOAD 4     ; width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd              %4, [%3]                  ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq              %4, [%3]                  ; load data from source
%elif notcpuflag(avx)
    movu              %4, [%3]                  ; load data from source
%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
    movdqu            %4, [%3]
%else
    movu              %4, [%3]
%endif
%endmacro


%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
%if cpuflag(avx2)
%assign %%offset 32
%ifdef PIC
    lea              %5q, [hevc_epel_filters_avx2_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_avx2_%1
%endif
%else
%assign %%offset 16
%ifdef PIC
    lea              %5q, [hevc_epel_filters_sse4_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_sse4_%1
%endif
%endif ; cpuflag(avx2)
    sub              %2q, 1
%if cpuflag(avx2)
    shl              %2q, 6                     ; multiply by 64
%else
    shl              %2q, 5                     ; multiply by 32
%endif
    mova              %3, [FILTER + %2q]          ; taps 1-2 of the filter
    mova              %4, [FILTER + %2q+%%offset] ; taps 3-4 of the filter
%endmacro

%macro EPEL_HV_FILTER 1
%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  6
%define %%table  hevc_epel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  5
%define %%table  hevc_epel_filters_sse4_%1
%endif

%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift               ; multiply by 32 (sse4) or 64 (avx2)
    shl              myq, %%shift               ; multiply by 32 (sse4) or 64 (avx2)
    mova             m14, [FILTER + mxq]          ; taps 1-2 of the horizontal filter
    mova             m15, [FILTER + mxq+%%offset] ; taps 3-4 of the horizontal filter

%if cpuflag(avx2)
%define %%table  hevc_epel_filters_avx2_10
%else
%define %%table  hevc_epel_filters_sse4_10
%endif
%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    mova             m12, [FILTER + myq]          ; taps 1-2 of the vertical filter
    mova             m13, [FILTER + myq+%%offset] ; taps 3-4 of the vertical filter
    lea           r3srcq, [srcstrideq*3]
%endmacro
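; The put_* functions below write a 16-bit intermediate plane with a fixed
; stride of MAX_PB_SIZE int16_t elements; samples carry 14 bits of precision
; (bit depth plus the filter headroom), which the uni/bi variants later
; round back down to pixels.  Filter addressing example (sse4, mx = 3):
; (3-1) << 5 = 64, i.e. the third 32-byte row of the epel table.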
%macro QPEL_FILTER 2

%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  7
%define %%table  hevc_qpel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  6
%define %%table  hevc_qpel_filters_sse4_%1
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif
    sub              %2q, 1
    shl              %2q, %%shift               ; multiply by 64 (sse4) or 128 (avx2)
    mova             m12, [rfilterq + %2q]               ; taps 1-2 of the filter
    mova             m13, [rfilterq + %2q +   %%offset]  ; taps 3-4 of the filter
    mova             m14, [rfilterq + %2q + 2*%%offset]  ; taps 5-6 of the filter
    mova             m15, [rfilterq + %2q + 3*%%offset]  ; taps 7-8 of the filter
%endmacro

%macro EPEL_LOAD 4
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
%define %%load movdqu
%endif

    %%load            m0, [%2 ]
%ifnum %3
    %%load            m1, [%2+  %3]
    %%load            m2, [%2+2*%3]
    %%load            m3, [%2+3*%3]
%else
    %%load            m1, [%2+  %3q]
    %%load            m2, [%2+2*%3q]
    %%load            m3, [%2+r3srcq]
%endif
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 7
    SBUTTERFLY        bw, 2, 3, 7
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 7
    SBUTTERFLY        wd, 2, 3, 7
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
%endif
%endif
%endmacro


%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
%define %%load movd
%elif %3 == 8
%define %%load movq
%else
%define %%load movu
%endif
%else
%if %3 == 2
%define %%load movd
%elif %3 == 4
%define %%load movq
%else
%define %%load movu
%endif
%endif
    %%load            m0, [%2-3*%%stride]       ; load data from source
    %%load            m1, [%2-2*%%stride]
    %%load            m2, [%2-%%stride  ]
    %%load            m3, [%2           ]
    %%load            m4, [%2+%%stride  ]
    %%load            m5, [%2+2*%%stride]
    %%load            m6, [%2+3*%%stride]
    %%load            m7, [%2+4*%%stride]

%if %1 == 8
%if %3 > 8
    SBUTTERFLY        bw, 0, 1, %4
    SBUTTERFLY        bw, 2, 3, %4
    SBUTTERFLY        bw, 4, 5, %4
    SBUTTERFLY        bw, 6, 7, %4
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY        wd, 0, 1, %4
    SBUTTERFLY        wd, 2, 3, %4
    SBUTTERFLY        wd, 4, 5, %4
    SBUTTERFLY        wd, 6, 7, %4
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro

%macro QPEL_V_LOAD 5
    lea              %5q, [%2]
    sub              %5q, r3srcq
    movu              m0, [%5q ]                ; load x- 3*srcstride
    movu              m1, [%5q+   %3q ]         ; load x- 2*srcstride
    movu              m2, [%5q+ 2*%3q ]         ; load x-srcstride
    movu              m3, [%2       ]           ; load x
    movu              m4, [%2+   %3q]           ; load x+srcstride
    movu              m5, [%2+ 2*%3q]           ; load x+2*srcstride
    movu              m6, [%2+r3srcq]           ; load x+3*srcstride
    movu              m7, [%2+ 4*%3q]           ; load x+4*srcstride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 8
    SBUTTERFLY        bw, 2, 3, 8
    SBUTTERFLY        bw, 4, 5, 8
    SBUTTERFLY        bw, 6, 7, 8
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 8
    SBUTTERFLY        wd, 2, 3, 8
    SBUTTERFLY        wd, 4, 5, 8
    SBUTTERFLY        wd, 6, 7, 8
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro
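; The punpck/SBUTTERFLY step at the end of each load interleaves the samples
; of two neighbouring taps so they line up with the (c1,c2)(c3,c4)...
; coefficient pairs in the tables: one pmaddubsw/pmaddwd then yields
; c_i*x_i + c_(i+1)*x_(i+1) per lane.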
%macro PEL_12STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq           [%1], %2
    psrldq            %2, 8
    movd         [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8      %1, %2, %3
    movdqa      [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq           [%1], %2
    psrldq            %2, 8
    movd         [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
%if cpuflag(avx2)
    movu           [%1], %2
%else
    PEL_10STORE8      %1, %2, %3
    movdqa      [%1+16], %3
%endif
%endmacro

%macro PEL_10STORE32 3
    PEL_10STORE16     %1, %2, %3
    movu        [%1+32], %3
%endmacro

%macro PEL_8STORE2 3
    pextrw         [%1], %2, 0
%endmacro
%macro PEL_8STORE4 3
    movd           [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd           [%1], %2
    pextrw       [%1+4], %2, 2
%endmacro
%macro PEL_8STORE8 3
    movq           [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq           [%1], %2
    psrldq            %2, 8
    movd         [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
%if cpuflag(avx2)
    movdqu         [%1], %2
%else
    mova           [%1], %2
%endif ; avx2
%endmacro
%macro PEL_8STORE32 3
    movu           [%1], %2
%endmacro

%macro LOOP_END 3
    add              %1q, 2*MAX_PB_SIZE         ; dst += dststride
    add              %2q, %3q                   ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
%endmacro


%macro MC_PIXEL_COMPUTE 2-3 ; width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && %0 == 3
%if %1 > 16
    vextracti128     xm1, m0, 1
    pmovzxbw          m1, xm1
    psllw             m1, 14-%2
%endif
    pmovzxbw          m0, xm0
%else ; not avx
%if %1 > 8
    punpckhbw         m1, m0, m2
    psllw             m1, 14-%2
%endif
    punpcklbw         m0, m2
%endif
%endif ; avx
    psllw             m0, 14-%2
%endmacro
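; MC_PIXEL_COMPUTE does no filtering: pixels are widened to 16 bits and
; shifted left by 14-bitdepth (6 at 8 bit), putting copied pixels on the
; same 14-bit scale as filtered ones.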
%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
%if %0 == 8
%define %%reg0 %5
%define %%reg2 %6
%define %%reg1 %7
%define %%reg3 %8
%else
%define %%reg0 m0
%define %%reg2 m2
%define %%reg1 m1
%define %%reg3 m3
%endif
%if %1 == 8
%if cpuflag(avx2) && (%0 == 5)
%if %2 > 16
    vperm2i128       m10, m0, m1, q0301
%endif
    vinserti128       m0, m0, xm1, 1
    mova              m1, m10
%if %2 > 16
    vperm2i128       m10, m2, m3, q0301
%endif
    vinserti128       m2, m2, xm3, 1
    mova              m3, m10
%endif
    pmaddubsw      %%reg0, %3                   ; x1*c1+x2*c2
    pmaddubsw      %%reg2, %4                   ; x3*c3+x4*c4
    paddw          %%reg0, %%reg2
%if %2 > 8
    pmaddubsw      %%reg1, %3
    pmaddubsw      %%reg3, %4
    paddw          %%reg1, %%reg3
%endif
%else
    pmaddwd        %%reg0, %3
    pmaddwd        %%reg2, %4
    paddd          %%reg0, %%reg2
%if %2 > 4
    pmaddwd        %%reg1, %3
    pmaddwd        %%reg3, %4
    paddd          %%reg1, %%reg3
%if %1 != 8
    psrad          %%reg1, %1-8
%endif
%endif
%if %1 != 8
    psrad          %%reg0, %1-8
%endif
    packssdw       %%reg0, %%reg1
%endif
%endmacro

%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx, pack op

%if cpuflag(avx2)
%assign %%offset 32
%define %%table  hevc_qpel_filters_avx2_%2
%else
%assign %%offset 16
%define %%table  hevc_qpel_filters_sse4_%2
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif

%if %2 == 8
    pmaddubsw         m0, [rfilterq + %3q*8           ]  ; x1*c1+x2*c2
    pmaddubsw         m2, [rfilterq + %3q*8+  %%offset]  ; x3*c3+x4*c4
    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]  ; x5*c5+x6*c6
    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]  ; x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%else
    pmaddwd           m0, [rfilterq + %3q*8           ]
    pmaddwd           m2, [rfilterq + %3q*8+  %%offset]
    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, [rfilterq + %3q*8           ]
    pmaddwd           m3, [rfilterq + %3q*8+  %%offset]
    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
    p%4               m0, m1
%endif
%endmacro

%macro QPEL_COMPUTE 2-3 ; width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && (%0 == 3)

    vperm2i128       m10, m0, m1, q0301
    vinserti128       m0, m0, xm1, 1
    SWAP 1, 10

    vperm2i128       m10, m2, m3, q0301
    vinserti128       m2, m2, xm3, 1
    SWAP 3, 10

    vperm2i128       m10, m4, m5, q0301
    vinserti128       m4, m4, xm5, 1
    SWAP 5, 10

    vperm2i128       m10, m6, m7, q0301
    vinserti128       m6, m6, xm7, 1
    SWAP 7, 10
%endif

    pmaddubsw         m0, m12                   ; x1*c1+x2*c2
    pmaddubsw         m2, m13                   ; x3*c3+x4*c4
    pmaddubsw         m4, m14                   ; x5*c5+x6*c6
    pmaddubsw         m6, m15                   ; x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%if %1 > 8
    pmaddubsw         m1, m12
    pmaddubsw         m3, m13
    pmaddubsw         m5, m14
    pmaddubsw         m7, m15
    paddw             m1, m3
    paddw             m5, m7
    paddw             m1, m5
%endif
%else
    pmaddwd           m0, m12
    pmaddwd           m2, m13
    pmaddwd           m4, m14
    pmaddwd           m6, m15
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, m12
    pmaddwd           m3, m13
    pmaddwd           m5, m14
    pmaddwd           m7, m15
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
%endif
%endmacro
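; UNI_COMPUTE/BI_COMPUTE below use pmulhrsw for the final rounding shift:
; pmulhrsw(x, 1 << (15-s)) = (x + (1 << (s-1))) >> s.  Hence pw_8 = 1<<9
; (s = 6, uni, 8 bit), pw_bi_8 = 1<<8 (s = 7, the bi path gains one bit from
; the paddsw with the second source), pw_10 = 1<<11 (s = 4), and so on.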
%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, src2l, src2h, pw
    paddsw            %3, %5
%if %1 > 8
    paddsw            %4, %6
%endif
    UNI_COMPUTE       %1, %2, %3, %4, %7
%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
    vpermq            %3, %3, 216
    vpermq            %4, %4, 216
%endif
%endmacro

%macro UNI_COMPUTE 5
    pmulhrsw          %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw          %4, %5
%endif
%if %2 == 8
    packuswb          %3, %4
%else
    CLIPW             %3, [pb_0], [max_pixels_%2]
%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    CLIPW             %4, [pb_0], [max_pixels_%2]
%endif
%endif
%endmacro


; ******************************
; void put_hevc_pel_pixels(int16_t *dst,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my, int width)
; (dst stride is fixed at MAX_PB_SIZE elements; the uni/bi variants
;  write pixel output with an explicit dststride)
; ******************************

%macro HEVC_PUT_HEVC_PEL_PIXELS 2
HEVC_PEL_PIXELS     %1, %2
HEVC_UNI_PEL_PIXELS %1, %2
HEVC_BI_PEL_PIXELS  %1, %2
%endmacro

%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
    pxor              m2, m2
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    MC_PIXEL_COMPUTE  %1, %2, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
%endmacro

%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET
%endmacro

%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor              m2, m2
    movdqa            m5, [pw_bi_%2]
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    SIMPLE_BILOAD     %1, src2q, m3, m4
    MC_PIXEL_COMPUTE  %1, %2, 1
    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hX(int16_t *dst,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width);
; ******************************


%macro HEVC_PUT_HEVC_EPEL 2
%if cpuflag(avx2)
%define XMM_REGS  11
%else
%define XMM_REGS  8
%endif

cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    movdqa            m6, [pw_%2]
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5
    UNI_COMPUTE       %1, %2, m0, m1, m6
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa            m6, [pw_bi_%2]
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET

; ******************************
; void put_hevc_epel_v(int16_t *dst,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
    movifnidn        myd, mym
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
    movifnidn        myd, mym
    movdqa            m6, [pw_%2]
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5
    UNI_COMPUTE       %1, %2, m0, m1, m6
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
    movifnidn        myd, mym
    movdqa            m6, [pw_bi_%2]
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET
%endmacro

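; The hv variants below run the 4-tap filter horizontally first, keep the
; three previous 14-bit rows live in registers (m4-m6, plus m8-m10 for the
; wide 8-bit case) and feed them to a second, vertical EPEL_COMPUTE pass;
; the vertical pass uses the word ("10 bit") filter tables since the
; intermediates are already 16-bit.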
; ******************************
; void put_hevc_epel_hv(int16_t *dst,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************

%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16, dst, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    SWAP              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
%if cpuflag(avx2)
    vinserti128       m2, m0, xm4, 1
    vperm2i128        m3, m0, m4, q0301
    PEL_10STORE%1     dstq, m2, m3
%else
    PEL_10STORE%1     dstq, m0, m4
%endif
%else
    PEL_10STORE%1     dstq, m0, m1
%endif
    movdqa            m4, m5
    movdqa            m5, m6
    movdqa            m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16, dst, dststride, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    mova              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
%else
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
%endif
    PEL_%2STORE%1     dstq, m0, m1
    mova              m4, m5
    mova              m5, m6
    mova              m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    SWAP              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
    SIMPLE_BILOAD     %1, src2q, m8, m3
%if cpuflag(avx2)
    vinserti128       m1, m8, xm3, 1
    vperm2i128        m2, m8, m3, q0301
    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
%else
    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
%endif
%else
    SIMPLE_BILOAD     %1, src2q, m8, m9
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
%endif
    PEL_%2STORE%1     dstq, m0, m4
    mova              m4, m5
    mova              m5, m6
    mova              m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET
%endmacro

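; 8-tap luma paths: qpel_h reads samples at offsets -3..+4 around each pixel
; and reduces the four two-tap madd results with paddw/paddd; 10/12 bit
; sources additionally need a packssdw from the dword accumulators.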
; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int height, int mx, int my, int width)
; ******************************

%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16, dst, dststride, src, srcstride, height, mx, rfilter
    mova              m9, [pw_%2]
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2
%if %2 > 8
    packssdw          m0, m1
%endif
    UNI_COMPUTE       %1, %2, m0, m1, m9
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa            m9, [pw_bi_%2]
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET


; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
    movifnidn        myd, mym
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
    movifnidn        myd, mym
    movdqa            m9, [pw_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
    QPEL_COMPUTE      %1, %2
%if %2 > 8
    packssdw          m0, m1
%endif
    UNI_COMPUTE       %1, %2, m0, m1, m9
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    movifnidn        myd, mym
    movdqa            m9, [pw_bi_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1     dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET
%endmacro

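; qpel_hv primes seven rows of horizontally filtered context in m8-m14
; before entering the loop; every iteration filters one new row into m15,
; runs the vertical pass on the 14-bit columns (note the *_14 filter
; aliases to the 10-bit tables near the top of the file) and rotates the
; row window.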
; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my, int width)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift               ; multiply by 8 (sse4) / 16 (avx2); QPEL_HV_COMPUTE scales by 8 more
    shl              myq, %%shift               ; multiply by 8 (sse4) / 16 (avx2); QPEL_HV_COMPUTE scales by 8 more
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
    PEL_10STORE%1     dstq, m0, m1
%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    movdqa            m8, m9
    movdqa            m9, m10
    movdqa           m10, m11
    movdqa           m11, m12
    movdqa           m12, m13
    movdqa           m13, m14
    movdqa           m14, m15
%endif
    LOOP_END          dst, src, srcstride
    RET

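; For widths <= 4 only the low qword of each row register is live, so the
; window rotation above can use movq; wider blocks copy whole registers.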
cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift               ; multiply by 8 (sse4) / 16 (avx2); QPEL_HV_COMPUTE scales by 8 more
    shl              myq, %%shift               ; multiply by 8 (sse4) / 16 (avx2); QPEL_HV_COMPUTE scales by 8 more
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1     dstq, m0, m1

%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
    mova             m11, m12
    mova             m12, m13
    mova             m13, m14
    mova             m14, m15
%endif
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET

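; The bi variant below is the same pipeline with the second prediction
; plane (src2, int16_t, stride MAX_PB_SIZE) added in before the rounding.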
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift               ; multiply by 8 (sse4) / 16 (avx2); QPEL_HV_COMPUTE scales by 8 more
    shl              myq, %%shift               ; multiply by 8 (sse4) / 16 (avx2); QPEL_HV_COMPUTE scales by 8 more
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
    SIMPLE_BILOAD     %1, src2q, m8, m9        ; m9 not used in this case
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1     dstq, m0, m1

%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    movdqa            m8, m9
    movdqa            m9, m10
    movdqa           m10, m11
    movdqa           m11, m12
    movdqa           m12, m13
    movdqa           m13, m14
    movdqa           m14, m15
%endif
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, srcstrideq            ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET
%endmacro

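; ******************************
; void put_hevc_uni_w(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
;                     int height, int denom, int wx, int ox)
; void put_hevc_bi_w(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
;                    int16_t *src2, int height, int denom,
;                    int wx0, int wx1, int ox0, int ox1)
; HEVC explicit weighted prediction on the 14-bit intermediates:
;   uni: dst = clip(((src*wx + round) >> shift) + ox)
;   bi:  dst = clip((src*wx1 + src2*wx0 + offset) >> (shift+1))
; with shift = 14 - bitdepth + denom.
; ******************************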
%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
    mov              r4d, denomm
%define SHIFT  r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
%define SHIFT  denomd
%endif
    lea            SHIFT, [SHIFT+14-%2]         ; shift = 14 - bitd + denom
%if %1 <= 4
    pxor              m1, m1
%endif
    movd              m2, wxm                   ; WX
    movd              m4, SHIFT                 ; shift
%if %1 <= 4
    punpcklwd         m2, m1
%else
    punpcklwd         m2, m2
%endif
    dec            SHIFT
    movdqu            m5, [pd_1]
    movd              m6, SHIFT
    pshufd            m2, m2, 0
    mov            SHIFT, oxm
    pslld             m5, m6
%if %2 != 8
    shl            SHIFT, %2-8                  ; ox << (bitd - 8)
%endif
    movd              m3, SHIFT                 ; OX
    pshufd            m3, m3, 0
%if WIN64 || ARCH_X86_32
    mov            SHIFT, heightm
%endif
.loop:
    SIMPLE_LOAD       %1, 10, srcq, m0
%if %1 <= 4
    punpcklwd         m0, m1
    pmaddwd           m0, m2
    paddd             m0, m5
    psrad             m0, m4
    paddd             m0, m3
%else
    pmulhw            m6, m0, m2
    pmullw            m0, m2
    punpckhwd         m1, m0, m6
    punpcklwd         m0, m6
    paddd             m0, m5
    paddd             m1, m5
    psrad             m0, m4
    psrad             m1, m4
    paddd             m0, m3
    paddd             m1, m3
%endif
    packssdw          m0, m1
%if %2 == 8
    packuswb          m0, m0
%else
    CLIPW             m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE         ; src += 2*MAX_PB_SIZE
    dec          heightd                        ; cmp height
    jnz            .loop                        ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
    movifnidn        r5d, denomm
%if %1 <= 4
    pxor              m1, m1
%endif
    movd              m2, wx0m                  ; WX0
    lea              r5d, [r5d+14-%2]           ; shift = 14 - bitd + denom
    movd              m3, wx1m                  ; WX1
    movd              m0, r5d                   ; shift
%if %1 <= 4
    punpcklwd         m2, m1
    punpcklwd         m3, m1
%else
    punpcklwd         m2, m2
    punpcklwd         m3, m3
%endif
    inc              r5d
    movd              m5, r5d                   ; shift+1
    pshufd            m2, m2, 0
    mov              r5d, ox0m
    pshufd            m3, m3, 0
    add              r5d, ox1m
%if %2 != 8
    shl              r5d, %2-8                  ; ox << (bitd - 8)
%endif
    inc              r5d
    movd              m4, r5d                   ; offset
    pshufd            m4, m4, 0
%if UNIX64
%define h  heightd
%else
    mov              r5d, heightm
%define h  r5d
%endif
    pslld             m4, m0

.loop:
    SIMPLE_LOAD       %1, 10, srcq,  m0
    SIMPLE_LOAD       %1, 10, src2q, m8
%if %1 <= 4
    punpcklwd         m0, m1
    punpcklwd         m8, m1
    pmaddwd           m0, m3
    pmaddwd           m8, m2
    paddd             m0, m4
    paddd             m0, m8
    psrad             m0, m5
%else
    pmulhw            m6, m0, m3
    pmullw            m0, m3
    pmulhw            m7, m8, m2
    pmullw            m8, m2
    punpckhwd         m1, m0, m6
    punpcklwd         m0, m6
    punpckhwd         m9, m8, m7
    punpcklwd         m8, m7
    paddd             m0, m8
    paddd             m1, m9
    paddd             m0, m4
    paddd             m1, m4
    psrad             m0, m5
    psrad             m1, m5
%endif
    packssdw          m0, m1
%if %2 == 8
    packuswb          m0, m0
%else
    CLIPW             m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq            ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE         ; src += 2*MAX_PB_SIZE
    add            src2q, 2*MAX_PB_SIZE         ; src2 += 2*MAX_PB_SIZE
    dec                h                        ; cmp height
    jnz            .loop                        ; height loop
    RET
%endmacro

INIT_XMM sse4                                   ; adds ff_ and _sse4 to function name

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL 2, 8
HEVC_PUT_HEVC_EPEL 4, 8
HEVC_PUT_HEVC_EPEL 6, 8
HEVC_PUT_HEVC_EPEL 8, 8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8


HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV 2, 8
HEVC_PUT_HEVC_EPEL_HV 4, 8
HEVC_PUT_HEVC_EPEL_HV 6, 8
HEVC_PUT_HEVC_EPEL_HV 8, 8
HEVC_PUT_HEVC_EPEL_HV 16, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL 4, 8
HEVC_PUT_HEVC_QPEL 8, 8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2                                   ; adds ff_ and _avx2 to function names & enables 256b registers:
                                                ; m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0

HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10

HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10

HEVC_PUT_HEVC_EPEL_HV 16, 10
HEVC_PUT_HEVC_EPEL_HV 32, 8

HEVC_PUT_HEVC_QPEL 32, 8

HEVC_PUT_HEVC_QPEL 16, 10

HEVC_PUT_HEVC_QPEL_HV 16, 10

%endif ; AVX2
%endif ; ARCH_X86_64