; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
wiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
wiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

pb_m10_m9:  times 8 db -10, -9
pb_m6_m5:   times 8 db  -6, -5
pb_m2_m1:   times 8 db  -2, -1
pb_2_3:     times 8 db   2,  3
pb_6_7:     times 8 db   6,  7
pd_m262128: times 4 dd -262128

wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
wiener_round:  dd 1049600, 1048832

SECTION .text

INIT_XMM ssse3
%if ARCH_X86_32
DECLARE_REG_TMP 4, 6
 %if STACK_ALIGNMENT < 16
 %assign stack_size 13*16+384*12
 %else
 %assign stack_size 11*16+384*12
 %endif
cglobal wiener_filter7_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \
                              lpf, lpf_stride, w, flt
 %if STACK_ALIGNMENT < 16
 %define lpfm        dword [esp+calloff+16*10+0]
 %define lpf_stridem dword [esp+calloff+16*10+4]
 %define wm          dword [esp+calloff+16*10+8]
 %define hd          dword [esp+calloff+16*10+12]
 %define edgeb       byte  [esp+calloff+16*10+16]
 %else
 %define hd dword r6m
 %define edgeb byte r8m
 %endif
 %define PICmem dword [esp+calloff+4*0]
 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
 %define t1m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define t5m    dword [esp+calloff+4*6]
 %define t6m    dword [esp+calloff+4*7]
 %define t2 t2m
 %define t3 t3m
 %define t4 t4m
 %define t5 t5m
 %define t6 t6m
 %define m8  [esp+calloff+16*2]
 %define m9  [esp+calloff+16*3]
 %define m10 [esp+calloff+16*4]
 %define m11 [esp+calloff+16*5]
 %define m12 [esp+calloff+16*6]
 %define m13 [esp+calloff+16*7]
 %define m14 [esp+calloff+16*8]
 %define m15 [esp+calloff+16*9]
 %define base t0-wiener_shifts
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov       wd, [rstk+stack_offset+24]
    mov       lpf_stridem, lpf_strideq
    mov       wm, wd
    mov       r4, [rstk+stack_offset+28]
    mov       hd, r4
    mov       r4, [rstk+stack_offset+36]
    mov       [esp+16*11], r4 ; edge
 %endif
%else
DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
                              lpf_stride, w, edge, flt, h
 %define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn wd, wm
%endif
%if ARCH_X86_64
    mov       fltq, fltmp
    mov       edged, r8m
    mov       hd, r6m
    mov       t3d, r9m ; pixel_max
    movq      m13, [fltq]
    movq      m15, [fltq+16]
%else
 %if STACK_ALIGNMENT < 16
    mov       t0, [rstk+stack_offset+32]
    mov       t1, [rstk+stack_offset+40] ; pixel_max
    movq      m1, [t0]    ; fx
    movq      m3, [t0+16] ; fy
    LEA       t0, wiener_shifts
    mov       PICmem, t0
 %else
    LEA       t0, wiener_shifts
    mov       fltq, r7m
    movq      m1, [fltq]
    movq      m3, [fltq+16]
    mov       t1, r9m ; pixel_max
    mov       PICmem, t0
 %endif
%endif
    mova      m6, [base+wiener_shufA]
    mova      m7, [base+wiener_shufB]
%if ARCH_X86_64
    lea       t4, [wiener_shifts]
    add       wd, wd
    pshufd    m12, m13, q0000 ; x0 x1
    pshufd    m13, m13, q1111 ; x2 x3
    pshufd    m14, m15, q0000 ; y0 y1
    pshufd    m15, m15, q1111 ; y2 y3
    mova      m8, [wiener_shufC]
    mova      m9, [wiener_shufD]
    add       lpfq, wq
    lea       t1, [rsp+wq+16]
    add       dstq, wq
    neg       wq
    shr       t3d, 11
 %define base t4-wiener_shifts
    movd      m10, [base+wiener_round+t3*4]
    movq      m11, [base+wiener_shifts+t3*8]
    pshufd    m10, m10, q0000
    pshufd    m0, m11, q0000
    pshufd    m11, m11, q1111
    pmullw    m12, m0 ; upshift filter coefs to make the
    pmullw    m13, m0 ; horizontal downshift constant
    DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
 %define lpfm [rsp+0]
 %define lpf_stridem [rsp+8]
 %define base
%else
    add       wd, wd
    mova      m4, [base+wiener_shufC]
    mova      m5, [base+wiener_shufD]
    pshufd    m0, m1, q0000
    pshufd    m1, m1, q1111
    pshufd    m2, m3, q0000
    pshufd    m3, m3, q1111
    mova      m8, m4
    mova      m9, m5
    mova      m14, m2
    mova      m15, m3
    shr       t1, 11
    add       lpfq, wq
    movd      m4, [base+wiener_round+t1*4]
    movq      m5, [base+wiener_shifts+t1*8]
 %if STACK_ALIGNMENT < 16
    lea       t1, [esp+16*12+wq+16]
 %else
    lea       t1, [esp+16*10+wq+16]
 %endif
    add       dstq, wq
    neg       wq
    pshufd    m4, m4, q0000
    pshufd    m2, m5, q0000
    pshufd    m5, m5, q1111
    mov       wm, wq
    pmullw    m0, m2
    pmullw    m1, m2
    mova      m10, m4
    mova      m11, m5
    mova      m12, m0
    mova      m13, m1
%endif
    test      edgeb, 4 ; LR_HAVE_TOP
    jz        .no_top
    call      .h_top
%if ARCH_X86_64
    add       lpfq, lpf_strideq
%else
    add       lpfq, lpf_stridem
%endif
    mov       t6, t1
    mov       t5, t1
    add       t1, 384*2
    call      .h_top
%if ARCH_X86_64
    lea       r7, [lpfq+lpf_strideq*4]
    mov       lpfq, dstq
    mov       t4, t1
    add       t1, 384*2
    mov       lpf_stridem, lpf_strideq
    add       r7, lpf_strideq
    mov       lpfm, r7 ; below
%else
    mov       t4m, t1
    mov       t0, lpf_stridem
    lea       t1, [lpfq+t0*4]
    mov       lpfq, dstq
    add       t1, t0
    mov       lpfm, t1 ; below
    mov       t1, t4m
    mov       t0, PICmem
    add       t1, 384*2
%endif
    call      .h
    mov       t3, t1
    mov       t2, t1
    dec       hd
    jz        .v1
    add       lpfq, dst_strideq
    add       t1, 384*2
    call      .h
    mov       t2, t1
    dec       hd
    jz        .v2
    add       lpfq, dst_strideq
    add       t1, 384*2
    call      .h
    dec       hd
    jz        .v3
.main:
    lea       t0, [t1+384*2]
.main_loop:
    call      .hv
    dec       hd
    jnz       .main_loop
    test      edgeb, 8 ; LR_HAVE_BOTTOM
    jz        .v3
    mov       lpfq, lpfm
    call      .hv_bottom
    add       lpfq, lpf_stridem
    call      .hv_bottom
.v1:
    call      .v
    RET
.no_top:
%if ARCH_X86_64
    lea       r7, [lpfq+lpf_strideq*4]
    mov       lpfq, dstq
    mov       lpf_stridem, lpf_strideq
    lea       r7, [r7+lpf_strideq*2]
    mov       lpfm, r7
    call      .h
%else
    mov       t1m, t1
    mov       t0, lpf_stridem
    lea       t1, [lpfq+t0*4]
    mov       lpfq, dstq
    lea       t1, [t1+t0*2]
    mov       lpfm, t1
    mov       t0, PICmem
    mov       t1, t1m
    call      .h
%endif
    mov       t6, t1
    mov       t5, t1
    mov       t4, t1
    mov       t3, t1
    mov       t2, t1
    dec       hd
    jz        .v1
    add       lpfq, dst_strideq
    add       t1, 384*2
    call      .h
    mov       t2, t1
    dec       hd
    jz        .v2
    add       lpfq, dst_strideq
    add       t1, 384*2
    call      .h
    dec       hd
    jz        .v3
    lea       t0, [t1+384*2]
    call      .hv
    dec       hd
    jz        .v3
    add       t0, 384*8
    call      .hv
    dec       hd
    jnz       .main
.v3:
    call      .v
%if ARCH_X86_32
    mov       wq, wm
%endif
.v2:
    call      .v
%if ARCH_X86_32
    mov       wq, wm
%endif
    jmp       .v1
.extend_right:
%assign stack_offset_tmp stack_offset
%assign stack_offset stack_offset+8
%assign calloff 8
    pxor      m0, m0
    movd      m1, wd
    mova      m2, [base+pb_0to15]
    pshufb    m1, m0
    mova      m0, [base+pb_6_7]
    psubb     m0, m1
    pminub    m0, m2
    pshufb    m3, m0
    mova      m0, [base+pb_m2_m1]
    psubb     m0, m1
    pminub    m0, m2
    pshufb    m4, m0
    mova      m0, [base+pb_m10_m9]
    psubb     m0, m1
    pminub    m0, m2
    pshufb    m5, m0
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h:
%if ARCH_X86_64
    mov       wq, r5
%else
    mov       wq, wm
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .h_extend_left
    movq      m3, [leftq]
    movhps    m3, [lpfq+wq]
    add       leftq, 8
    jmp       .h_main
.h_extend_left:
    mova      m3, [lpfq+wq]            ; avoid accessing memory located
    pshufb    m3, [base+wiener_lshuf7] ; before the start of the buffer
    jmp       .h_main
.h_top:
%if ARCH_X86_64
    mov       wq, r5
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .h_extend_left
.h_loop:
    movu      m3, [lpfq+wq-8]
.h_main:
    mova      m4, [lpfq+wq+0]
    movu      m5, [lpfq+wq+8]
    test      edgeb, 2 ; LR_HAVE_RIGHT
    jnz       .h_have_right
    cmp       wd, -18
    jl        .h_have_right
    call      .extend_right
.h_have_right:
    pshufb    m0, m3, m6
    pshufb    m1, m4, m7
    paddw     m0, m1
    pshufb    m3, m8
    pmaddwd   m0, m12
    pshufb    m1, m4, m9
    paddw     m3, m1
    pshufb    m1, m4, m6
    pmaddwd   m3, m13
    pshufb    m2, m5, m7
    paddw     m1, m2
    mova      m2, [base+pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb    m4, m8
    pmaddwd   m1, m12
    pshufb    m5, m9
    paddw     m4, m5
    pmaddwd   m4, m13
    paddd     m0, m2
    paddd     m1, m2
    paddd     m0, m3
    paddd     m1, m4
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
    psraw     m0, 1
    mova      [t1+wq], m0
    add       wq, 16
    jl        .h_loop
%if ARCH_X86_32
    mov       wq, wm
%endif
    ret
ALIGN function_align
.hv:
    add       lpfq, dst_strideq
%if ARCH_X86_64
    mov       wq, r5
%else
    mov       t0m, t0
    mov       t1m, t1
    mov       t0, PICmem
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .hv_extend_left
    movq      m3, [leftq]
    movhps    m3, [lpfq+wq]
    add       leftq, 8
    jmp       .hv_main
.hv_extend_left:
    mova      m3, [lpfq+wq]
    pshufb    m3, [base+wiener_lshuf7]
    jmp       .hv_main
.hv_bottom:
%if ARCH_X86_64
    mov       wq, r5
%else
    mov       t0m, t0
    mov       t1m, t1
    mov       t0, PICmem
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .hv_extend_left
.hv_loop:
    movu      m3, [lpfq+wq-8]
.hv_main:
    mova      m4, [lpfq+wq+0]
    movu      m5, [lpfq+wq+8]
    test      edgeb, 2 ; LR_HAVE_RIGHT
    jnz       .hv_have_right
    cmp       wd, -18
    jl        .hv_have_right
    call      .extend_right
.hv_have_right:
%if ARCH_X86_32
    mov       t1, t4m
%endif
    pshufb    m0, m3, m6
    pshufb    m1, m4, m7
    paddw     m0, m1
    pshufb    m3, m8
    pmaddwd   m0, m12
    pshufb    m1, m4, m9
    paddw     m3, m1
    pshufb    m1, m4, m6
    pmaddwd   m3, m13
    pshufb    m2, m5, m7
    paddw     m1, m2
    mova      m2, [base+pd_m262128]
    pshufb    m4, m8
    pmaddwd   m1, m12
    pshufb    m5, m9
    paddw     m4, m5
    pmaddwd   m4, m13
    paddd     m0, m2
    paddd     m1, m2
%if ARCH_X86_64
    mova      m2, [t4+wq]
    paddw     m2, [t2+wq]
    mova      m5, [t3+wq]
%else
    mov       t0, t0m
    mova      m2, [t1+wq]
    mov       t1, t2m
    paddw     m2, [t1+wq]
    mov       t1, t3m
    mova      m5, [t1+wq]
    mov       t1, t5m
%endif
    paddd     m0, m3
    paddd     m1, m4
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
%if ARCH_X86_64
    mova      m4, [t5+wq]
    paddw     m4, [t1+wq]
    psraw     m0, 1
    paddw     m3, m0, [t6+wq]
%else
    mova      m4, [t1+wq]
    mov       t1, t1m
    paddw     m4, [t1+wq]
    psraw     m0, 1
    mov       t1, t6m
    paddw     m3, m0, [t1+wq]
%endif
    mova      [t0+wq], m0
    punpcklwd m0, m2, m5
    pmaddwd   m0, m15
    punpckhwd m2, m5
    pmaddwd   m2, m15
    punpcklwd m1, m3, m4
    pmaddwd   m1, m14
    punpckhwd m3, m4
    pmaddwd   m3, m14
    paddd     m0, m10
    paddd     m2, m10
    paddd     m0, m1
    paddd     m2, m3
    psrad     m0, 6
    psrad     m2, 6
    packssdw  m0, m2
    pmulhw    m0, m11
    pxor      m1, m1
    pmaxsw    m0, m1
    mova      [dstq+wq], m0
    add       wq, 16
%if ARCH_X86_64
    jl        .hv_loop
    mov       t6, t5
    mov       t5, t4
    mov       t4, t3
    mov       t3, t2
    mov       t2, t1
    mov       t1, t0
    mov       t0, t6
%else
    jge       .hv_end
    mov       t0, PICmem
    jmp       .hv_loop
.hv_end:
    mov       r5, t5m
    mov       t1, t4m
    mov       t6m, r5
    mov       t5m, t1
    mov       r5, t3m
    mov       t1, t2m
    mov       t4m, r5
    mov       t3m, t1
    mov       r5, t1m
    mov       t1, t0
    mov       t2m, r5
    mov       t0, t6m
    mov       wq, wm
%endif
    add       dstq, dst_strideq
    ret
.v:
%if ARCH_X86_64
    mov       wq, r5
.v_loop:
    mova      m1, [t4+wq]
    paddw     m1, [t2+wq]
    mova      m2, [t3+wq]
    mova      m4, [t1+wq]
    paddw     m3, m4, [t6+wq]
    paddw     m4, [t5+wq]
%else
    mov       t1m, t1
.v_loop:
    mov       t1, t4m
    mova      m1, [t1+wq]
    mov       t1, t2m
    paddw     m1, [t1+wq]
    mov       t1, t3m
    mova      m2, [t1+wq]
    mov       t1, t1m
    mova      m4, [t1+wq]
    mov       t1, t6m
    paddw     m3, m4, [t1+wq]
    mov       t1, t5m
    paddw     m4, [t1+wq]
%endif
    punpcklwd m0, m1, m2
    pmaddwd   m0, m15
    punpckhwd m1, m2
    pmaddwd   m1, m15
    punpcklwd m2, m3, m4
    pmaddwd   m2, m14
    punpckhwd m3, m4
    pmaddwd   m3, m14
    paddd     m0, m10
    paddd     m1, m10
    paddd     m0, m2
    paddd     m1, m3
    psrad     m0, 6
    psrad     m1, 6
    packssdw  m0, m1
    pmulhw    m0, m11
    pxor      m1, m1
    pmaxsw    m0, m1
    mova      [dstq+wq], m0
    add       wq, 16
    jl        .v_loop
%if ARCH_X86_64
    mov       t6, t5
    mov       t5, t4
    mov       t4, t3
    mov       t3, t2
    mov       t2, t1
%else
    mov       t1, t5m
    mov       r5, t4m
    mov       t6m, t1
    mov       t5m, r5
    mov       t1, t3m
    mov       r5, t2m
    mov       t4m, t1
    mov       t3m, r5
    mov       t1, t1m
    mov       t2m, t1
%endif
    add       dstq, dst_strideq
    ret

%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
 %assign stack_size 12*16+384*8
 %else
 %assign stack_size 11*16+384*8
 %endif
cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \
                              lpf, lpf_stride, w, flt
 %if STACK_ALIGNMENT < 16
 %define lpfm        dword [esp+calloff+4*6]
 %define lpf_stridem dword [esp+calloff+4*7]
 %define wm          dword [esp+calloff+16*10+0]
 %define hd          dword [esp+calloff+16*10+4]
 %define edgeb       byte  [esp+calloff+16*10+8]
 %else
 %define hd dword r6m
 %define edgeb byte r8m
 %endif
 %define PICmem dword [esp+calloff+4*0]
 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
 %define t1m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define t2 t2m
 %define t3 t3m
 %define t4 t4m
 %define m8  [esp+calloff+16*2]
 %define m9  [esp+calloff+16*3]
 %define m10 [esp+calloff+16*4]
 %define m11 [esp+calloff+16*5]
 %define m12 [esp+calloff+16*6]
 %define m13 [esp+calloff+16*7]
 %define m14 [esp+calloff+16*8]
 %define m15 [esp+calloff+16*9]
 %define base t0-wiener_shifts
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov       wd, [rstk+stack_offset+24]
    mov       lpf_stridem, lpf_strideq
    mov       wm, wd
    mov       r4, [rstk+stack_offset+28]
    mov       hd, r4
    mov       r4, [rstk+stack_offset+36]
    mov       [esp+16*10+8], r4 ; edge
 %endif
%else
cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \
                              lpf_stride, w, edge, flt, h
 %define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn wd, wm
%endif
%if ARCH_X86_64
    mov       fltq, fltmp
    mov       edged, r8m
    mov       hd, r6m
    mov       t3d, r9m ; pixel_max
    movq      m12, [fltq]
    movq      m14, [fltq+16]
%else
 %if STACK_ALIGNMENT < 16
    mov       t0, [rstk+stack_offset+32]
    mov       t1, [rstk+stack_offset+40] ; pixel_max
    movq      m1, [t0]    ; fx
    movq      m3, [t0+16] ; fy
    LEA       t0, wiener_shifts
    mov       PICmem, t0
 %else
    LEA       t0, wiener_shifts
    mov       fltq, r7m
    movq      m1, [fltq]
    movq      m3, [fltq+16]
    mov       t1, r9m ; pixel_max
    mov       PICmem, t0
 %endif
%endif
    mova      m5, [base+wiener_shufE]
    mova      m6, [base+wiener_shufB]
    mova      m7, [base+wiener_shufD]
%if ARCH_X86_64
    lea       t4, [wiener_shifts]
    add       wd, wd
    punpcklwd m11, m12, m12
    pshufd    m11, m11, q1111 ; x1
    pshufd    m12, m12, q1111 ; x2 x3
    punpcklwd m13, m14, m14
    pshufd    m13, m13, q1111 ; y1
    pshufd    m14, m14, q1111 ; y2 y3
    shr       t3d, 11
    mova      m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add       lpfq, wq
    lea       t1, [rsp+wq+16]
    add       dstq, wq
    neg       wq
 %define base t4-wiener_shifts
    movd      m9, [base+wiener_round+t3*4]
    movq      m10, [base+wiener_shifts+t3*8]
    pshufd    m9, m9, q0000
    pshufd    m0, m10, q0000
    pshufd    m10, m10, q1111
    mova      m15, [wiener_lshuf5]
    pmullw    m11, m0
    pmullw    m12, m0
    DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
 %define lpfm [rsp+0]
 %define lpf_stridem [rsp+8]
 %define base
%else
    add       wd, wd
    punpcklwd m0, m1, m1
    pshufd    m0, m0, q1111 ; x1
    pshufd    m1, m1, q1111 ; x2 x3
    punpcklwd m2, m3, m3
    pshufd    m2, m2, q1111 ; y1
    pshufd    m3, m3, q1111 ; y2 y3
    mova      m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
    mova      m13, m2
    mova      m14, m3
    mova      m8, m4
    shr       t1, 11
    add       lpfq, wq
    movd      m2, [base+wiener_round+t1*4]
    movq      m3, [base+wiener_shifts+t1*8]
 %if STACK_ALIGNMENT < 16
    lea       t1, [esp+16*11+wq+16]
 %else
    lea       t1, [esp+16*10+wq+16]
 %endif
    add       dstq, wq
    neg       wq
    pshufd    m2, m2, q0000
    pshufd    m4, m3, q0000
    pshufd    m3, m3, q1111
    mov       wm, wq
    pmullw    m0, m4
    pmullw    m1, m4
    mova      m4, [base+wiener_lshuf5]
    mova      m9, m2
    mova      m10, m3
    mova      m11, m0
    mova      m12, m1
    mova      m15, m4
%endif
    test      edgeb, 4 ; LR_HAVE_TOP
    jz        .no_top
    call      .h_top
%if ARCH_X86_64
    add       lpfq, lpf_strideq
%else
    add       lpfq, lpf_stridem
%endif
    mov       t4, t1
    add       t1, 384*2
    call      .h_top
%if ARCH_X86_64
    lea       r7, [lpfq+lpf_strideq*4]
    mov       lpfq, dstq
    mov       t3, t1
    add       t1, 384*2
    mov       lpf_stridem, lpf_strideq
    add       r7, lpf_strideq
    mov       lpfm, r7 ; below
%else
    mov       t3m, t1
    mov       t0, lpf_stridem
    lea       t1, [lpfq+t0*4]
    mov       lpfq, dstq
    add       t1, t0
    mov       lpfm, t1 ; below
    mov       t1, t3m
    add       t1, 384*2
%endif
    call      .h
    mov       t2, t1
    dec       hd
    jz        .v1
    add       lpfq, dst_strideq
    add       t1, 384*2
    call      .h
    dec       hd
    jz        .v2
.main:
    mov       t0, t4
.main_loop:
    call      .hv
    dec       hd
    jnz       .main_loop
    test      edgeb, 8 ; LR_HAVE_BOTTOM
    jz        .v2
    mov       lpfq, lpfm
    call      .hv_bottom
    add       lpfq, lpf_stridem
    call      .hv_bottom
.end:
    RET
.no_top:
%if ARCH_X86_64
    lea       r7, [lpfq+lpf_strideq*4]
    mov       lpfq, dstq
    mov       lpf_stridem, lpf_strideq
    lea       r7, [r7+lpf_strideq*2]
    mov       lpfm, r7
    call      .h
%else
    mov       t1m, t1
    mov       t0, lpf_stridem
    lea       t1, [lpfq+t0*4]
    mov       lpfq, dstq
    lea       t1, [t1+t0*2]
    mov       lpfm, t1
    mov       t1, t1m
    call      .h
%endif
    mov       t4, t1
    mov       t3, t1
    mov       t2, t1
    dec       hd
    jz        .v1
    add       lpfq, dst_strideq
    add       t1, 384*2
    call      .h
    dec       hd
    jz        .v2
    lea       t0, [t1+384*2]
    call      .hv
    dec       hd
    jz        .v2
    add       t0, 384*6
    call      .hv
    dec       hd
    jnz       .main
.v2:
    call      .v
%if ARCH_X86_64
    mov       t4, t3
    mov       t3, t2
    mov       t2, t1
%else
    mov       t0, t3m
    mov       r5, t2m
    mov       t1, t1m
    mov       t4m, t0
    mov       t3m, r5
    mov       t2m, t1
    mov       wq, wm
%endif
    add       dstq, dst_strideq
.v1:
    call      .v
    jmp       .end
.extend_right:
%assign stack_offset_tmp stack_offset
%assign stack_offset stack_offset+8
%assign calloff 8
%if ARCH_X86_32
    mov       t0, PICmem
%endif
    pxor      m1, m1
    movd      m2, wd
    mova      m0, [base+pb_2_3]
    pshufb    m2, m1
    mova      m1, [base+pb_m6_m5]
    psubb     m0, m2
    psubb     m1, m2
    mova      m2, [base+pb_0to15]
    pminub    m0, m2
    pminub    m1, m2
    pshufb    m3, m0
    pshufb    m4, m1
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h:
%if ARCH_X86_64
    mov       wq, r5
%else
    mov       wq, wm
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .h_extend_left
    mova      m4, [lpfq+wq]
    movd      m3, [leftq+4]
    pslldq    m4, 4
    por       m3, m4
    add       leftq, 8
    jmp       .h_main
.h_extend_left:
    mova      m3, [lpfq+wq] ; avoid accessing memory located
    pshufb    m3, m15       ; before the start of the buffer
    jmp       .h_main
.h_top:
%if ARCH_X86_64
    mov       wq, r5
%else
    mov       wq, wm
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .h_extend_left
.h_loop:
    movu      m3, [lpfq+wq-4]
.h_main:
    movu      m4, [lpfq+wq+4]
    test      edgeb, 2 ; LR_HAVE_RIGHT
    jnz       .h_have_right
    cmp       wd, -18
    jl        .h_have_right
    call      .extend_right
.h_have_right:
    pshufb    m0, m3, m5
    pmaddwd   m0, m11
    pshufb    m1, m4, m5
    pmaddwd   m1, m11
    pshufb    m2, m3, m6
    pshufb    m3, m7
    paddw     m2, m3
    pshufb    m3, m4, m6
    pmaddwd   m2, m12
    pshufb    m4, m7
    paddw     m3, m4
    pmaddwd   m3, m12
    paddd     m0, m8
    paddd     m1, m8
    paddd     m0, m2
    paddd     m1, m3
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
    psraw     m0, 1
    mova      [t1+wq], m0
    add       wq, 16
    jl        .h_loop
%if ARCH_X86_32
    mov       wq, wm
%endif
    ret
ALIGN function_align
.hv:
    add       lpfq, dst_strideq
%if ARCH_X86_64
    mov       wq, r5
%else
    mov       t0m, t0
    mov       t1m, t1
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .hv_extend_left
    mova      m4, [lpfq+wq]
    movd      m3, [leftq+4]
    pslldq    m4, 4
    por       m3, m4
    add       leftq, 8
    jmp       .hv_main
.hv_extend_left:
    mova      m3, [lpfq+wq]
    pshufb    m3, m15
    jmp       .hv_main
.hv_bottom:
%if ARCH_X86_64
    mov       wq, r5
%else
    mov       t0m, t0
    mov       t1m, t1
%endif
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .hv_extend_left
.hv_loop:
    movu      m3, [lpfq+wq-4]
.hv_main:
    movu      m4, [lpfq+wq+4]
    test      edgeb, 2 ; LR_HAVE_RIGHT
    jnz       .hv_have_right
    cmp       wd, -18
    jl        .hv_have_right
    call      .extend_right
.hv_have_right:
%if ARCH_X86_32
    mov       t1, t1m
    mov       t0, t3m
%endif
    pshufb    m0, m3, m5
    pmaddwd   m0, m11
    pshufb    m1, m4, m5
    pmaddwd   m1, m11
    pshufb    m2, m3, m6
    pshufb    m3, m7
    paddw     m2, m3
    pshufb    m3, m4, m6
    pmaddwd   m2, m12
    pshufb    m4, m7
    paddw     m3, m4
    pmaddwd   m3, m12
    paddd     m0, m8
    paddd     m1, m8
    paddd     m0, m2
%if ARCH_X86_64
    mova      m2, [t3+wq]
    paddw     m2, [t1+wq]
    paddd     m1, m3
    mova      m4, [t2+wq]
%else
    mova      m2, [t0+wq]
    mov       t0, t2m
    paddw     m2, [t1+wq]
    mov       t1, t4m
    paddd     m1, m3
    mova      m4, [t0+wq]
    mov       t0, t0m
%endif
    punpckhwd m3, m2, m4
    pmaddwd   m3, m14
    punpcklwd m2, m4
%if ARCH_X86_64
    mova      m4, [t4+wq]
%else
    mova      m4, [t1+wq]
%endif
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
    pmaddwd   m2, m14
    psraw     m0, 1
    mova      [t0+wq], m0
    punpckhwd m1, m0, m4
    pmaddwd   m1, m13
    punpcklwd m0, m4
    pmaddwd   m0, m13
    paddd     m3, m9
    paddd     m2, m9
    paddd     m1, m3
    paddd     m0, m2
    psrad     m1, 6
    psrad     m0, 6
    packssdw  m0, m1
    pmulhw    m0, m10
    pxor      m1, m1
    pmaxsw    m0, m1
    mova      [dstq+wq], m0
    add       wq, 16
    jl        .hv_loop
%if ARCH_X86_64
    mov       t4, t3
    mov       t3, t2
    mov       t2, t1
    mov       t1, t0
    mov       t0, t4
%else
    mov       r5, t3m
    mov       t1, t2m
    mov       t4m, r5
    mov       t3m, t1
    mov       r5, t1m
    mov       t1, t0
    mov       t2m, r5
    mov       t0, t4m
    mov       wq, wm
%endif
    add       dstq, dst_strideq
    ret
.v:
%if ARCH_X86_64
    mov       wq, r5
.v_loop:
    mova      m0, [t1+wq]
    paddw     m2, m0, [t3+wq]
    mova      m1, [t2+wq]
    mova      m4, [t4+wq]
%else
    mov       t1m, t1
.v_loop:
    mov       t0, t3m
    mova      m0, [t1+wq]
    mov       t1, t2m
    paddw     m2, m0, [t0+wq]
    mov       t0, t4m
    mova      m1, [t1+wq]
    mova      m4, [t0+wq]
%endif
    punpckhwd m3, m2, m1
    pmaddwd   m3, m14
    punpcklwd m2, m1
    pmaddwd   m2, m14
    punpckhwd m1, m0, m4
    pmaddwd   m1, m13
    punpcklwd m0, m4
    pmaddwd   m0, m13
    paddd     m3, m9
    paddd     m2, m9
    paddd     m1, m3
    paddd     m0, m2
    psrad     m1, 6
    psrad     m0, 6
    packssdw  m0, m1
    pmulhw    m0, m10
    pxor      m1, m1
    pmaxsw    m0, m1
    mova      [dstq+wq], m0
    add       wq, 16
%if ARCH_X86_64
    jl        .v_loop
%else
    jge       .v_end
    mov       t1, t1m
    jmp       .v_loop
.v_end:
%endif
    ret