; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

wiener_init:   db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
wiener_shufA:  db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
wiener_shufB:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
wiener_shufC:  db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
wiener_shufD:  db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
pb_unpcklwdw:  db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13

pb_right_ext_mask: times 24 db 0xff
                   times 8 db 0
pb_0:        times 16 db 0
pb_3:        times 16 db 3
pb_15:       times 16 db 15
pb_0_1:      times 8 db 0, 1
pb_14_15:    times 8 db 14, 15
pw_1:        times 8 dw 1
pw_16:       times 8 dw 16
pw_128:      times 8 dw 128
pw_256:      times 8 dw 256
pw_2048:     times 8 dw 2048
pw_2056:     times 8 dw 2056
pw_m16380:   times 8 dw -16380
pw_5_6:      times 4 dw 5, 6
pd_1024:     times 4 dd 1024
%if ARCH_X86_32
pd_512:      times 4 dd 512
pd_2048:     times 4 dd 2048
%endif
pd_0xF0080029: times 4 dd 0xF0080029
pd_0xF00801C7: times 4 dd 0xF00801C7

cextern sgr_x_by_x

SECTION .text

%if ARCH_X86_32
 %define PIC_base_offset $$

 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
  %assign pic_reg_stk_off 4
  %xdefine PIC_reg %1
  %if %2 == 1
    mov [esp], %1
  %endif
    LEA PIC_reg, PIC_base_offset
  %if %3 == 1
    XCHG_PIC_REG
  %endif
 %endmacro

 %macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
  %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
 %endmacro

 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
 %macro XCHG_PIC_REG 0
 %endmacro

 %define PIC_sym(sym) (sym)
%endif

%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
                                         lpf_stride, w, edge, flt, h, x
 %define base 0
    mov fltq, fltmp
    mov edged, r8m
    mov wd, wm
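; Note: the prologue below loads the 4+4 Wiener filter taps and points t1 at
; the first row of the on-stack ring buffer (384*2 bytes per row); dstq and
; lpfq are biased by wq so the per-row loops can run xq from -w up to 0.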
    mov hd, r6m
    movq m14, [fltq]
    add lpfq, wq
    lea t1, [rsp+wq*2+16]
    mova m15, [pw_2056]
    add dstq, wq
    movq m7, [fltq+16]
    neg wq
%if cpuflag(ssse3)
    pshufb m14, [wiener_init]
    mova m8, [wiener_shufA]
    pshufd m12, m14, q2222 ; x0 x0
    mova m9, [wiener_shufB]
    pshufd m13, m14, q3333 ; x1 x2
    mova m10, [wiener_shufC]
    punpcklqdq m14, m14    ; x3
    mova m11, [wiener_shufD]
%else
    mova m10, [pw_m16380]
    punpcklwd m14, m14
    pshufd m11, m14, q0000 ; x0
    pshufd m12, m14, q1111 ; x1
    pshufd m13, m14, q2222 ; x2
    pshufd m14, m14, q3333 ; x3
%endif
%else
DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
 %define m10 [base+wiener_shufC]
 %define m11 [base+wiener_shufD]
 %define stk_off 96
%else
 %define m10 [base+pw_m16380]
 %define m11 [stk+96]
 %define stk_off 112
%endif
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
 %define base r6-pb_right_ext_mask-21
 %define stk esp
 %define dstq leftq
 %define edgeb byte edged
 %define edged [stk+ 8]
 %define dstmp [stk+12]
 %define hd dword [stk+16]
 %define wq [stk+20]
 %define dst_strideq [stk+24]
 %define leftmp [stk+28]
 %define t2 [stk+32]
 %define t4 [stk+36]
 %define t5 [stk+40]
 %define t6 [stk+44]
 %define m8 [base+wiener_shufA]
 %define m9 [base+wiener_shufB]
 %define m12 [stk+48]
 %define m13 [stk+64]
 %define m14 [stk+80]
 %define m15 [base+pw_2056]
    mov r1, r7m ; flt
    mov r0, r0m ; dst
    mov r5, r5m ; w
    mov lpfq, lpfm
    mov r2, r8m ; edge
    mov r4, r6m ; h
    movq m3, [r1+ 0]
    movq m7, [r1+16]
    add r0, r5
    mov r1, r1m ; dst_stride
    add lpfq, r5
    mov edged, r2
    mov r2, r2m ; left
    mov dstmp, r0
    lea t1, [rsp+r5*2+stk_off]
    mov hd, r4
    neg r5
    mov lpf_strideq, lpf_stridem
    LEA r6, pb_right_ext_mask+21
    mov wq, r5
    mov dst_strideq, r1
    mov leftmp, r2
%if cpuflag(ssse3)
    pshufb m3, [base+wiener_init]
    pshufd m1, m3, q2222
    pshufd m2, m3, q3333
    punpcklqdq m3, m3
%else
    punpcklwd m3, m3
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
    mova m11, m0
%endif
    mova m12, m1
    mova m13, m2
    mova m14, m3
%endif
    pshufd m6, m7, q0000 ; y0 y1
    pshufd m7, m7, q1111 ; y2 y3
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, lpf_strideq
    mov t6, t1
    mov t5, t1
    add t1, 384*2
    call .h_top
    lea t3, [lpfq+lpf_strideq*4]
    mov lpfq, dstmp
    mov [rsp+gprsize*1], lpf_strideq
    add t3, lpf_strideq
    mov [rsp+gprsize*0], t3 ; below
    mov t4, t1
    add t1, 384*2
    call .h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
.main:
    lea t0, [t1+384*2]
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov lpfq, [rsp+gprsize*0]
    call .hv_bottom
    add lpfq, [rsp+gprsize*1]
    call .hv_bottom
.v1:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    RET
.no_top:
    lea t3, [lpfq+lpf_strideq*4]
    mov lpfq, dstmp
    mov [rsp+gprsize*1], lpf_strideq
    lea t3, [t3+lpf_strideq*2]
    mov [rsp+gprsize*0], t3
    call .h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v3
    add t0, 384*8
    call .hv
    dec hd
    jnz .main
.v3:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    jmp .v1
.extend_right:
    movd m2, [lpfq-4]
%if ARCH_X86_64
    push r0
    lea r0, [pb_right_ext_mask+21]
    movu m0, [r0+xq+0]
    movu m1, [r0+xq+8]
    pop r0
%else
    movu m0, [r6+xq+0]
    movu m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
    pshufb m2, [base+pb_3]
%else
    punpcklbw m2, m2
    pshuflw m2, m2, q3333
    punpcklqdq m2, m2
%endif
    pand m4, m0
    pand m5, m1
    pandn m0, m2
    pandn m1, m2
    por m4, m0
    por m5, m1
    ret
.h:
 %define stk esp+4 ; offset due to call
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, [base+wiener_l_shuf]
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .h_main
.h_top:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+xq-4]
.h_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp xd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
%macro %%h7 0
%if cpuflag(ssse3)
    pshufb m0, m4, m8
    pmaddubsw m0, m12
    pshufb m1, m5, m8
    pmaddubsw m1, m12
    pshufb m2, m4, m9
    pmaddubsw m2, m13
    pshufb m3, m5, m9
    pmaddubsw m3, m13
    paddw m0, m2
    pshufb m2, m4, m10
    pmaddubsw m2, m13
    paddw m1, m3
    pshufb m3, m5, m10
    pmaddubsw m3, m13
    pshufb m4, m11
    paddw m0, m2
    pmullw m2, m14, m4
    pshufb m5, m11
    paddw m1, m3
    pmullw m3, m14, m5
    psllw m4, 7
    psllw m5, 7
    paddw m0, m2
    mova m2, [base+pw_m16380]
    paddw m1, m3
    paddw m4, m2
    paddw m5, m2
    paddsw m0, m4
    paddsw m1, m5
%else
    psrldq m0, m4, 1
    pslldq m1, m4, 1
    pxor m3, m3
    punpcklbw m0, m3
    punpckhbw m1, m3
    paddw m0, m1
    pmullw m0, m11
    psrldq m1, m4, 2
    pslldq m2, m4, 2
    punpcklbw m1, m3
    punpckhbw m2, m3
    paddw m1, m2
    pmullw m1, m12
    paddw m0, m1
    pshufd m2, m4, q0321
    punpcklbw m2, m3
    pmullw m1, m14, m2
    paddw m0, m1
    psrldq m1, m4, 3
    pslldq m4, 3
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m13
    paddw m0, m1
    psllw m2, 7
    paddw m2, m10
    paddsw m0, m2
    psrldq m1, m5, 1
    pslldq m2, m5, 1
    punpcklbw m1, m3
    punpckhbw m2, m3
    paddw m1, m2
    pmullw m1, m11
    psrldq m2, m5, 2
    pslldq m4, m5, 2
    punpcklbw m2, m3
    punpckhbw m4, m3
    paddw m2, m4
    pmullw m2, m12
    paddw m1, m2
    pshufd m4, m5, q0321
    punpcklbw m4, m3
    pmullw m2, m14, m4
    paddw m1, m2
    psrldq m2, m5, 3
    pslldq m5, 3
    punpcklbw m2, m3
    punpckhbw m5, m3
    paddw m2, m5
    pmullw m2, m13
    paddw m1, m2
    psllw m4, 7
    paddw m4, m10
    paddsw m1, m4
%endif
%endmacro
    %%h7
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
    mova [t1+xq*2+ 0], m0
    mova [t1+xq*2+16], m1
    add xq, 16
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, dst_strideq
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, [base+wiener_l_shuf]
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+xq-4]
.hv_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp xd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    %%h7
%if ARCH_X86_64
    mova m2, [t4+xq*2]
    paddw m2, [t2+xq*2]
%else
    mov r2, t4
    mova m2, [r2+xq*2]
    mov r2, t2
    paddw m2, [r2+xq*2]
    mov r2, t5
%endif
    mova m3, [t3+xq*2]
%if ARCH_X86_64
    mova m5, [t5+xq*2]
%else
    mova m5, [r2+xq*2]
    mov r2, t6
%endif
    paddw m5, [t1+xq*2]
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
%if ARCH_X86_64
    paddw m4, m0, [t6+xq*2]
%else
    paddw m4, m0, [r2+xq*2]
    mov r2, t4
%endif
    mova [t0+xq*2], m0
    punpcklwd m0, m2, m3
    pmaddwd m0, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m0, m3
    mova m3, [t3+xq*2+16]
    paddd m4, m2
%if ARCH_X86_64
    mova m2, [t4+xq*2+16]
    paddw m2, [t2+xq*2+16]
    mova m5, [t5+xq*2+16]
%else
    mova m2, [r2+xq*2+16]
    mov r2, t2
    paddw m2, [r2+xq*2+16]
    mov r2, t5
    mova m5, [r2+xq*2+16]
    mov r2, t6
%endif
    paddw m5, [t1+xq*2+16]
    psrad m0, 11
    psrad m4, 11
    packssdw m0, m4
%if ARCH_X86_64
    paddw m4, m1, [t6+xq*2+16]
%else
    paddw m4, m1, [r2+xq*2+16]
    mov dstq, dstmp
%endif
    mova [t0+xq*2+16], m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    psrad m1, 11
    psrad m2, 11
    packssdw m1, m2
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .hv_loop
    add dstq, dst_strideq
%if ARCH_X86_64
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
%else
    mov dstmp, dstq
    mov r1, t5
    mov r2, t4
    mov t6, r1
    mov t5, r2
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, r1
%endif
    ret
%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
.v:
    mov xq, wq
.v_loop:
%if ARCH_X86_64
    mova m1, [t4+xq*2]
    paddw m1, [t2+xq*2]
%else
    mov r2, t4
    mova m1, [r2+xq*2]
    mov r2, t2
    paddw m1, [r2+xq*2]
    mov r2, t6
%endif
    mova m2, [t3+xq*2]
    mova m4, [t1+xq*2]
%if ARCH_X86_64
    paddw m3, m4, [t6+xq*2]
    paddw m4, [t5+xq*2]
%else
    paddw m3, m4, [r2+xq*2]
    mov r2, t5
    paddw m4, [r2+xq*2]
    mov r2, t4
%endif
    punpcklwd m0, m1, m2
    pmaddwd m0, m7
    punpckhwd m1, m2
    pmaddwd m1, m7
    punpcklwd m2, m3, m4
    pmaddwd m2, m6
    punpckhwd m3, m4
    pmaddwd m3, m6
    paddd m0, m2
    paddd m1, m3
%if ARCH_X86_64
    mova m2, [t4+xq*2+16]
    paddw m2, [t2+xq*2+16]
%else
    mova m2, [r2+xq*2+16]
    mov r2, t2
    paddw m2, [r2+xq*2+16]
    mov r2, t6
%endif
    mova m3, [t3+xq*2+16]
    mova m5, [t1+xq*2+16]
%if ARCH_X86_64
    paddw m4, m5, [t6+xq*2+16]
    paddw m5, [t5+xq*2+16]
%else
    paddw m4, m5, [r2+xq*2+16]
    mov r2, t5
    paddw m5, [r2+xq*2+16]
    movifnidn dstq, dstmp
%endif
    psrad m0, 11
    psrad m1, 11
    packssdw m0, m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    psrad m1, 11
    psrad m2, 11
    packssdw m1, m2
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .v_loop
    add dstq, dst_strideq
%if ARCH_X86_64
    mov t6, t5
    mov t5, t4
%else
    mov dstmp, dstq
    mov r1, t5
    mov r2, t4
    mov t6, r1
    mov t5, r2
%endif
    mov t4, t3
    mov t3, t2
    mov t2, t1
    ret
%endif

%if ARCH_X86_64
cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
                                        lpf_stride, w, edge, flt, h, x
    mov fltq, fltmp
    mov edged, r8m
    mov wd, wm
    mov hd, r6m
    movq m14, [fltq]
    add lpfq, wq
    mova m8, [pw_m16380]
    lea t1, [rsp+wq*2+16]
    mova m15, [pw_2056]
    add dstq, wq
    movq m7, [fltq+16]
    neg wq
%if cpuflag(ssse3)
    pshufb m14, [wiener_init]
    mova m9, [wiener_shufB]
    pshufd m13, m14, q3333 ; x1 x2
    mova m10, [wiener_shufC]
    punpcklqdq m14, m14    ; x3
    mova m11, [wiener_shufD]
    mova m12, [wiener_l_shuf]
%else
    punpcklwd m14, m14
    pshufd m11, m14, q1111 ; x1
    pshufd m13, m14, q2222 ; x2
    pshufd m14, m14, q3333 ; x3
%endif
%else
%if cpuflag(ssse3)
 %define stk_off 80
%else
 %define m11 [stk+80]
 %define stk_off 96
%endif
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
 %define stk esp
 %define leftmp [stk+28]
 %define m8 [base+pw_m16380]
 %define m12 [base+wiener_l_shuf]
 %define m14 [stk+48]
    mov r1, r7m ; flt
    mov r0, r0m ; dst
    mov r5, r5m ; w
    mov lpfq, lpfm
    mov r2, r8m ; edge
    mov r4, r6m ; h
    movq m2, [r1+ 0]
    movq m7, [r1+16]
    add r0, r5
    mov r1, r1m ; dst_stride
    add lpfq, r5
    mov edged, r2
    mov r2, r2m ; left
    mov dstmp, r0
    lea t1, [rsp+r5*2+stk_off]
    mov hd, r4
    neg r5
    mov lpf_strideq, lpf_stridem
    LEA r6, pb_right_ext_mask+21
    mov wq, r5
    mov dst_strideq, r1
    mov leftmp, r2
%if cpuflag(ssse3)
    pshufb m2, [base+wiener_init]
    pshufd m1, m2, q3333
    punpcklqdq m2, m2
%else
    punpcklwd m2, m2
    pshufd m0, m2, q1111
    pshufd m1, m2, q2222
    pshufd m2, m2, q3333
    mova m11, m0
%endif
    mova m13, m1
    mova m14, m2
%endif
    pshufd m6, m7, q0000 ; __ y1
    pshufd m7, m7, q1111 ; y2 y3
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, lpf_strideq
    mov t4, t1
    add t1, 384*2
    call .h_top
    lea xq, [lpfq+lpf_strideq*4]
    mov lpfq, dstmp
    mov t3, t1
    add t1, 384*2
    mov [rsp+gprsize*1], lpf_strideq
    add xq, lpf_strideq
    mov [rsp+gprsize*0], xq ; below
    call .h
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
.main:
    mov t0, t4
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov lpfq, [rsp+gprsize*0]
    call .hv_bottom
    add lpfq, [rsp+gprsize*1]
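    ; second row below the visible frame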
    call .hv_bottom
.end:
    RET
.no_top:
    lea t3, [lpfq+lpf_strideq*4]
    mov lpfq, dstmp
    mov [rsp+gprsize*1], lpf_strideq
    lea t3, [t3+lpf_strideq*2]
    mov [rsp+gprsize*0], t3
    call .h
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v2
    add t0, 384*6
    call .hv
    dec hd
    jnz .main
.v2:
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    add dstq, dst_strideq
    mov t4, t3
    mov t3, t2
    mov t2, t1
    movifnidn dstmp, dstq
.v1:
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    jmp .end
.h:
 %define stk esp+4
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, m12
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .h_main
.h_top:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+xq-4]
.h_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp xd, -17
    jl .h_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
%macro %%h5 0
%if cpuflag(ssse3)
    pshufb m0, m4, m9
    pmaddubsw m0, m13
    pshufb m1, m5, m9
    pmaddubsw m1, m13
    pshufb m2, m4, m10
    pmaddubsw m2, m13
    pshufb m3, m5, m10
    pmaddubsw m3, m13
    pshufb m4, m11
    paddw m0, m2
    pmullw m2, m14, m4
    pshufb m5, m11
    paddw m1, m3
    pmullw m3, m14, m5
    psllw m4, 7
    psllw m5, 7
    paddw m4, m8
    paddw m5, m8
    paddw m0, m2
    paddw m1, m3
    paddsw m0, m4
    paddsw m1, m5
%else
    psrldq m0, m4, 2
    pslldq m1, m4, 2
    pxor m3, m3
    punpcklbw m0, m3
    punpckhbw m1, m3
    paddw m0, m1
    pmullw m0, m11
    pshufd m2, m4, q0321
    punpcklbw m2, m3
    pmullw m1, m14, m2
    paddw m0, m1
    psrldq m1, m4, 3
    pslldq m4, 3
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m13
    paddw m0, m1
    psllw m2, 7
    paddw m2, m8
    paddsw m0, m2
    psrldq m1, m5, 2
    pslldq m4, m5, 2
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m11
    pshufd m4, m5, q0321
    punpcklbw m4, m3
    pmullw m2, m14, m4
    paddw m1, m2
    psrldq m2, m5, 3
    pslldq m5, 3
    punpcklbw m2, m3
    punpckhbw m5, m3
    paddw m2, m5
    pmullw m2, m13
    paddw m1, m2
    psllw m4, 7
    paddw m4, m8
    paddsw m1, m4
%endif
%endmacro
    %%h5
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
    mova [t1+xq*2+ 0], m0
    mova [t1+xq*2+16], m1
    add xq, 16
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, dst_strideq
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, m12
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .hv_main
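; .hv_bottom below is the same as .hv, except that the caller has already set
; lpfq to the row that should be read (used for the rows below the frame).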
982.hv_bottom: 983 mov xq, wq 984 test edgeb, 1 ; LR_HAVE_LEFT 985 jz .hv_extend_left 986.hv_loop: 987 movu m4, [lpfq+xq-4] 988.hv_main: 989 movu m5, [lpfq+xq+4] 990 test edgeb, 2 ; LR_HAVE_RIGHT 991 jnz .hv_have_right 992 cmp xd, -17 993 jl .hv_have_right 994 call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right 995.hv_have_right: 996 %%h5 997 mova m2, [t3+xq*2] 998 paddw m2, [t1+xq*2] 999 psraw m0, 3 1000 psraw m1, 3 1001 paddw m0, m15 1002 paddw m1, m15 1003%if ARCH_X86_64 1004 mova m3, [t2+xq*2] 1005 paddw m4, m0, [t4+xq*2] 1006%else 1007 mov r2, t2 1008 mova m3, [r2+xq*2] 1009 mov r2, t4 1010 paddw m4, m0, [r2+xq*2] 1011%endif 1012 mova [t0+xq*2], m0 1013 punpcklwd m0, m2, m3 1014 pmaddwd m0, m7 1015 punpckhwd m2, m3 1016 pmaddwd m2, m7 1017 punpcklwd m3, m4, m4 1018 pmaddwd m3, m6 1019 punpckhwd m4, m4 1020 pmaddwd m4, m6 1021 paddd m0, m3 1022 paddd m4, m2 1023 mova m2, [t3+xq*2+16] 1024 paddw m2, [t1+xq*2+16] 1025 psrad m0, 11 1026 psrad m4, 11 1027 packssdw m0, m4 1028%if ARCH_X86_64 1029 mova m3, [t2+xq*2+16] 1030 paddw m4, m1, [t4+xq*2+16] 1031%else 1032 paddw m4, m1, [r2+xq*2+16] 1033 mov r2, t2 1034 mova m3, [r2+xq*2+16] 1035 mov dstq, dstmp 1036%endif 1037 mova [t0+xq*2+16], m1 1038 punpcklwd m1, m2, m3 1039 pmaddwd m1, m7 1040 punpckhwd m2, m3 1041 pmaddwd m2, m7 1042 punpcklwd m3, m4, m4 1043 pmaddwd m3, m6 1044 punpckhwd m4, m4 1045 pmaddwd m4, m6 1046 paddd m1, m3 1047 paddd m2, m4 1048 psrad m1, 11 1049 psrad m2, 11 1050 packssdw m1, m2 1051 packuswb m0, m1 1052 mova [dstq+xq], m0 1053 add xq, 16 1054 jl .hv_loop 1055 add dstq, dst_strideq 1056 mov t4, t3 1057 mov t3, t2 1058 mov t2, t1 1059 mov t1, t0 1060 mov t0, t4 1061 movifnidn dstmp, dstq 1062 ret 1063%if cpuflag(ssse3) 1064.v: 1065 mov xq, wq 1066.v_loop: 1067 mova m3, [t1+xq*2] 1068 paddw m1, m3, [t3+xq*2] 1069%if ARCH_X86_64 1070 mova m2, [t2+xq*2] 1071 paddw m3, [t4+xq*2] 1072%else 1073 mov r2, t2 1074 mova m2, [r2+xq*2] 1075 mov r2, t4 1076 paddw m3, [r2+xq*2] 1077%endif 1078 punpcklwd m0, m1, m2 1079 pmaddwd m0, m7 1080 punpckhwd m1, m2 1081 pmaddwd m1, m7 1082 punpcklwd m2, m3 1083 pmaddwd m2, m6 1084 punpckhwd m3, m3 1085 pmaddwd m3, m6 1086 paddd m0, m2 1087 paddd m1, m3 1088 mova m4, [t1+xq*2+16] 1089 paddw m2, m4, [t3+xq*2+16] 1090%if ARCH_X86_64 1091 mova m3, [t2+xq*2+16] 1092 paddw m4, [t4+xq*2+16] 1093%else 1094 paddw m4, [r2+xq*2+16] 1095 mov r2, t2 1096 mova m3, [r2+xq*2+16] 1097 mov dstq, dstmp 1098%endif 1099 psrad m0, 11 1100 psrad m1, 11 1101 packssdw m0, m1 1102 punpcklwd m1, m2, m3 1103 pmaddwd m1, m7 1104 punpckhwd m2, m3 1105 pmaddwd m2, m7 1106 punpcklwd m3, m4 1107 pmaddwd m3, m6 1108 punpckhwd m4, m4 1109 pmaddwd m4, m6 1110 paddd m1, m3 1111 paddd m2, m4 1112 psrad m1, 11 1113 psrad m2, 11 1114 packssdw m1, m2 1115 packuswb m0, m1 1116 mova [dstq+xq], m0 1117 add xq, 16 1118 jl .v_loop 1119 ret 1120%endif 1121%endmacro 1122 1123INIT_XMM sse2 1124WIENER 1125 1126INIT_XMM ssse3 1127WIENER 1128 1129;;;;;;;;;;;;;;;;;;;;;;;;;; 1130;; self-guided ;; 1131;;;;;;;;;;;;;;;;;;;;;;;;;; 1132 1133%macro MULLD 2 1134 pmulhuw m5, %1, %2 1135 pmullw %1, %2 1136 pslld m5, 16 1137 paddd %1, m5 1138%endmacro 1139 1140%macro GATHERDD 2 1141 mova m5, m7 1142 movd r6d, %2 1143 %if ARCH_X86_64 1144 movd %1, [r5+r6] 1145 pextrw r6d, %2, 2 1146 pinsrw m5, [r5+r6+2], 3 1147 pextrw r6d, %2, 4 1148 pinsrw %1, [r5+r6+2], 5 1149 pextrw r6d, %2, 6 1150 pinsrw m5, [r5+r6+2], 7 1151 %else 1152 movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6] 1153 pextrw r6d, %2, 2 1154 pinsrw m5, 
[PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3 1155 pextrw r6d, %2, 4 1156 pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5 1157 pextrw r6d, %2, 6 1158 pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7 1159 %endif 1160 por %1, m5 1161%endmacro 1162 1163%if ARCH_X86_64 1164cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim 1165 mov xlimd, edgem 1166 movifnidn xd, xm 1167 mov hd, hm 1168 mov edged, xlimd 1169 and xlimd, 2 ; have_right 1170 add xd, xlimd 1171 xor xlimd, 2 ; 2*!have_right 1172%else 1173cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim 1174 %define wq r0m 1175 %define xlimd r1m 1176 %define hd hmp 1177 %define edgeb byte edgem 1178 1179 mov r6, edgem 1180 and r6, 2 ; have_right 1181 add xd, r6 1182 xor r6, 2 ; 2*!have_right 1183 mov xlimd, r6 1184 SETUP_PIC r6, 0 1185%endif 1186 1187 jnz .no_right 1188 add xd, 7 1189 and xd, ~7 1190.no_right: 1191 pxor m1, m1 1192 lea srcq, [srcq+xq] 1193 lea sumq, [sumq+xq*2-2] 1194 lea sumsqq, [sumsqq+xq*4-4] 1195 neg xq 1196 mov wq, xq 1197%if ARCH_X86_64 1198 lea r10, [pb_right_ext_mask+24] 1199%endif 1200.loop_y: 1201 mov xq, wq 1202 1203 ; load left 1204 test edgeb, 1 ; have_left 1205 jz .no_left 1206 test leftq, leftq 1207 jz .load_left_from_main 1208 movd m0, [leftq] 1209 pslldq m0, 12 1210 add leftq, 4 1211 jmp .expand_x 1212.no_left: 1213 movd m0, [srcq+xq] 1214 pshufb m0, [PIC_sym(pb_0)] 1215 jmp .expand_x 1216.load_left_from_main: 1217 movd m0, [srcq+xq-2] 1218 pslldq m0, 14 1219.expand_x: 1220 punpckhbw xm0, xm1 1221 1222 ; when we reach this, m0 contains left two px in highest words 1223 cmp xd, -8 1224 jle .loop_x 1225.partial_load_and_extend: 1226 movd m3, [srcq-4] 1227 pshufb m3, [PIC_sym(pb_3)] 1228 movq m2, [srcq+xq] 1229 punpcklbw m2, m1 1230 punpcklbw m3, m1 1231%if ARCH_X86_64 1232 movu m4, [r10+xq*2] 1233%else 1234 movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24] 1235%endif 1236 pand m2, m4 1237 pandn m4, m3 1238 por m2, m4 1239 jmp .loop_x_noload 1240.right_extend: 1241 pshufb m2, m0, [PIC_sym(pb_14_15)] 1242 jmp .loop_x_noload 1243 1244.loop_x: 1245 movq m2, [srcq+xq] 1246 punpcklbw m2, m1 1247.loop_x_noload: 1248 palignr m3, m2, m0, 12 1249 palignr m4, m2, m0, 14 1250 1251 punpcklwd m5, m3, m2 1252 punpckhwd m6, m3, m2 1253 paddw m3, m4 1254 punpcklwd m7, m4, m1 1255 punpckhwd m4, m1 1256 pmaddwd m5, m5 1257 pmaddwd m6, m6 1258 pmaddwd m7, m7 1259 pmaddwd m4, m4 1260 paddd m5, m7 1261 paddd m6, m4 1262 paddw m3, m2 1263 movu [sumq+xq*2], m3 1264 movu [sumsqq+xq*4+ 0], m5 1265 movu [sumsqq+xq*4+16], m6 1266 1267 mova m0, m2 1268 add xq, 8 1269 1270 ; if x <= -8 we can reload more pixels 1271 ; else if x < 0 we reload and extend (this implies have_right=0) 1272 ; else if x < xlimd we extend from previous load (this implies have_right=0) 1273 ; else we are done 1274 1275 cmp xd, -8 1276 jle .loop_x 1277 test xd, xd 1278 jl .partial_load_and_extend 1279 cmp xd, xlimd 1280 jl .right_extend 1281 1282 add sumsqq, (384+16)*4 1283 add sumq, (384+16)*2 1284 add srcq, strideq 1285 dec hd 1286 jg .loop_y 1287 RET 1288 1289%if ARCH_X86_64 1290cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim 1291 movifnidn edged, edgem 1292%else 1293cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y 1294 %define sumsq_baseq dword [esp+0] 1295 %define sum_baseq dword [esp+4] 1296 %define ylimd dword [esp+8] 1297 %define m8 [esp+12] 1298 mov edged, r4m 1299 mov hd, r3m 1300%endif 1301 mov xq, -2 1302%if ARCH_X86_64 1303 mov ylimd, edged 1304 
and ylimd, 8 ; have_bottom 1305 shr ylimd, 2 1306 sub ylimd, 2 ; -2 if have_bottom=0, else 0 1307 mov sumsq_baseq, sumsqq 1308 mov sum_baseq, sumq 1309.loop_x: 1310 mov sumsqq, sumsq_baseq 1311 mov sumq, sum_baseq 1312 lea yd, [hq+ylimq+2] 1313%else 1314 mov yd, edged 1315 and yd, 8 ; have_bottom 1316 shr yd, 2 1317 sub yd, 2 ; -2 if have_bottom=0, else 0 1318 mov sumsq_baseq, sumsqq 1319 mov sum_baseq, sumq 1320 mov ylimd, yd 1321.loop_x: 1322 mov sumsqd, sumsq_baseq 1323 mov sumd, sum_baseq 1324 lea yd, [hq+2] 1325 add yd, ylimd 1326%endif 1327 lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] 1328 lea sumq, [sumq+xq*2+2-(384+16)*2] 1329 test edgeb, 4 ; have_top 1330 jnz .load_top 1331 movu m0, [sumsqq+(384+16)*4*1] 1332 movu m1, [sumsqq+(384+16)*4*1+16] 1333 mova m2, m0 1334 mova m3, m1 1335 mova m4, m0 1336 mova m5, m1 1337 movu m6, [sumq+(384+16)*2*1] 1338 mova m7, m6 1339 mova m8, m6 1340 jmp .loop_y_noload 1341.load_top: 1342 movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left] 1343 movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right] 1344 movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left] 1345 movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right] 1346 movu m6, [sumq-(384+16)*2*1] ; l2 1347 movu m7, [sumq-(384+16)*2*0] ; l1 1348.loop_y: 1349%if ARCH_X86_64 1350 movu m8, [sumq+(384+16)*2*1] ; l0 1351%else 1352 movu m4, [sumq+(384+16)*2*1] ; l0 1353 mova m8, m4 1354%endif 1355 movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left] 1356 movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right] 1357.loop_y_noload: 1358 paddd m0, m2 1359 paddd m1, m3 1360 paddw m6, m7 1361 paddd m0, m4 1362 paddd m1, m5 1363 paddw m6, m8 1364 movu [sumsqq+ 0], m0 1365 movu [sumsqq+16], m1 1366 movu [sumq], m6 1367 1368 ; shift position down by one 1369 mova m0, m2 1370 mova m1, m3 1371 mova m2, m4 1372 mova m3, m5 1373 mova m6, m7 1374 mova m7, m8 1375 add sumsqq, (384+16)*4 1376 add sumq, (384+16)*2 1377 dec yd 1378 jg .loop_y 1379 cmp yd, ylimd 1380 jg .loop_y_noload 1381 add xd, 8 1382 cmp xd, wd 1383 jl .loop_x 1384 RET 1385 1386cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s 1387 movifnidn sd, sm 1388 sub aq, (384+16-1)*4 1389 sub bq, (384+16-1)*2 1390 add hd, 2 1391%if ARCH_X86_64 1392 LEA r5, sgr_x_by_x-0xF03 1393%else 1394 SETUP_PIC r5, 0 1395%endif 1396 movd m6, sd 1397 pshuflw m6, m6, q0000 1398 punpcklqdq m6, m6 1399 pxor m7, m7 1400 DEFINE_ARGS a, b, w, h, x 1401%if ARCH_X86_64 1402 mova m8, [pd_0xF00801C7] 1403 mova m9, [pw_256] 1404 psrld m10, m9, 13 ; pd_2048 1405 mova m11, [pb_unpcklwdw] 1406%else 1407 %define m8 [PIC_sym(pd_0xF00801C7)] 1408 %define m9 [PIC_sym(pw_256)] 1409 %define m10 [PIC_sym(pd_2048)] 1410 %define m11 [PIC_sym(pb_unpcklwdw)] 1411%endif 1412.loop_y: 1413 mov xq, -2 1414.loop_x: 1415 movq m0, [bq+xq*2] 1416 movq m1, [bq+xq*2+(384+16)*2] 1417 punpcklwd m0, m7 1418 punpcklwd m1, m7 1419 movu m2, [aq+xq*4] 1420 movu m3, [aq+xq*4+(384+16)*4] 1421 pslld m4, m2, 3 1422 pslld m5, m3, 3 1423 paddd m2, m4 ; aa * 9 1424 paddd m3, m5 1425 pmaddwd m4, m0, m0 1426 pmaddwd m5, m1, m1 1427 pmaddwd m0, m8 1428 pmaddwd m1, m8 1429 psubd m2, m4 ; p = aa * 9 - bb * bb 1430 psubd m3, m5 1431 MULLD m2, m6 1432 MULLD m3, m6 1433 paddusw m2, m8 1434 paddusw m3, m8 1435 psrld m2, 20 ; z 1436 psrld m3, 20 1437 GATHERDD m4, m2 ; xx 1438 GATHERDD m2, m3 1439 psrld m4, 24 1440 psrld m2, 24 1441 packssdw m3, m4, m2 1442 pshufb m4, m11 1443 MULLD m0, m4 1444 pshufb m2, m11 1445 MULLD m1, m2 1446 psubw m5, m9, m3 1447 paddd m0, m10 1448 paddd m1, m10 1449 psrld m0, 12 1450 psrld m1, 12 1451 movq [bq+xq*2], m5 1452 psrldq m5, 8 1453 movq 
[bq+xq*2+(384+16)*2], m5 1454 movu [aq+xq*4], m0 1455 movu [aq+xq*4+(384+16)*4], m1 1456 add xd, 4 1457 cmp xd, wd 1458 jl .loop_x 1459 add aq, (384+16)*4*2 1460 add bq, (384+16)*2*2 1461 sub hd, 2 1462 jg .loop_y 1463 RET 1464 1465%if ARCH_X86_64 1466cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ 1467 tmp_base, src_base, a_base, b_base, x, y 1468 movifnidn wd, wm 1469 mov hd, hm 1470 mova m15, [pw_16] 1471 mov tmp_baseq, tq 1472 mov src_baseq, srcq 1473 mov a_baseq, aq 1474 mov b_baseq, bq 1475 xor xd, xd 1476%else 1477cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y 1478 %define tmp_baseq [esp+8] 1479 %define src_baseq [esp+12] 1480 %define a_baseq [esp+16] 1481 %define b_baseq [esp+20] 1482 %define wd [esp+24] 1483 %define hd [esp+28] 1484 mov tmp_baseq, tq 1485 mov src_baseq, srcq 1486 mov a_baseq, aq 1487 mov b_baseq, bq 1488 mov wd, xd 1489 mov hd, yd 1490 xor xd, xd 1491 SETUP_PIC yd, 1, 1 1492 jmp .loop_start 1493%endif 1494 1495.loop_x: 1496 mov tq, tmp_baseq 1497 mov srcq, src_baseq 1498 mov aq, a_baseq 1499 mov bq, b_baseq 1500%if ARCH_X86_32 1501.loop_start: 1502 movu m0, [bq+xq*2-(384+16)*2-2] 1503 movu m2, [bq+xq*2-(384+16)*2+2] 1504 mova m1, [bq+xq*2-(384+16)*2] ; b:top 1505 paddw m0, m2 ; b:tl+tr 1506 movu m2, [bq+xq*2-2] 1507 movu m3, [bq+xq*2+2] 1508 paddw m1, [bq+xq*2] ; b:top+ctr 1509 paddw m2, m3 ; b:l+r 1510 mova [esp+0x80], m0 1511 mova [esp+0x70], m1 1512 mova [esp+0x60], m2 1513%endif 1514 movu m0, [aq+xq*4-(384+16)*4-4] 1515 movu m2, [aq+xq*4-(384+16)*4+4] 1516 mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] 1517 paddd m0, m2 ; a:tl+tr [first half] 1518 movu m2, [aq+xq*4-(384+16)*4-4+16] 1519 movu m4, [aq+xq*4-(384+16)*4+4+16] 1520 mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half] 1521 paddd m2, m4 ; a:tl+tr [second half] 1522 movu m4, [aq+xq*4-4] 1523 movu m5, [aq+xq*4+4] 1524 paddd m1, [aq+xq*4] ; a:top+ctr [first half] 1525 paddd m4, m5 ; a:l+r [first half] 1526 movu m5, [aq+xq*4+16-4] 1527 movu m6, [aq+xq*4+16+4] 1528 paddd m3, [aq+xq*4+16] ; a:top+ctr [second half] 1529 paddd m5, m6 ; a:l+r [second half] 1530%if ARCH_X86_64 1531 movu m6, [bq+xq*2-(384+16)*2-2] 1532 movu m8, [bq+xq*2-(384+16)*2+2] 1533 mova m7, [bq+xq*2-(384+16)*2] ; b:top 1534 paddw m6, m8 ; b:tl+tr 1535 movu m8, [bq+xq*2-2] 1536 movu m9, [bq+xq*2+2] 1537 paddw m7, [bq+xq*2] ; b:top+ctr 1538 paddw m8, m9 ; b:l+r 1539%endif 1540 1541 lea tq, [tq+xq*2] 1542 lea srcq, [srcq+xq*1] 1543 lea aq, [aq+xq*4+(384+16)*4] 1544 lea bq, [bq+xq*2+(384+16)*2] 1545 mov yd, hd 1546.loop_y: 1547%if ARCH_X86_64 1548 movu m9, [bq-2] 1549 movu m10, [bq+2] 1550 paddw m7, [bq] ; b:top+ctr+bottom 1551 paddw m9, m10 ; b:bl+br 1552 paddw m10, m7, m8 ; b:top+ctr+bottom+l+r 1553 paddw m6, m9 ; b:tl+tr+bl+br 1554 psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom 1555 paddw m10, m6 1556 psllw m10, 2 1557 psubw m10, m6 ; aa 1558 pxor m14, m14 1559 movq m12, [srcq] 1560 punpcklbw m12, m14 1561 punpcklwd m6, m10, m15 1562 punpckhwd m10, m15 1563 punpcklwd m13, m12, m15 1564 punpckhwd m12, m15 1565 pmaddwd m6, m13 ; aa*src[x]+256 [first half] 1566 pmaddwd m10, m12 ; aa*src[x]+256 [second half] 1567%else 1568 paddd m1, [aq] ; a:top+ctr+bottom [first half] 1569 paddd m3, [aq+16] ; a:top+ctr+bottom [second half] 1570 mova [esp+0x50], m1 1571 mova [esp+0x40], m3 1572 mova [esp+0x30], m4 1573 movu m6, [aq-4] 1574 movu m7, [aq+4] 1575 paddd m1, m4 ; a:top+ctr+bottom+l+r [first half] 1576 paddd m3, m5 ; a:top+ctr+bottom+l+r [second half] 1577 paddd m6, m7 ; a:bl+br [first half] 
1578 movu m7, [aq+16-4] 1579 movu m4, [aq+16+4] 1580 paddd m7, m4 ; a:bl+br [second half] 1581 paddd m0, m6 ; a:tl+tr+bl+br [first half] 1582 paddd m2, m7 ; a:tl+tr+bl+br [second half] 1583 paddd m1, m0 1584 paddd m3, m2 1585 pslld m1, 2 1586 pslld m3, 2 1587 psubd m1, m0 ; bb [first half] 1588 psubd m3, m2 ; bb [second half] 1589%endif 1590 1591%if ARCH_X86_64 1592 movu m11, [aq-4] 1593 movu m12, [aq+4] 1594 paddd m1, [aq] ; a:top+ctr+bottom [first half] 1595 paddd m11, m12 ; a:bl+br [first half] 1596 movu m12, [aq+16-4] 1597 movu m13, [aq+16+4] 1598 paddd m3, [aq+16] ; a:top+ctr+bottom [second half] 1599 paddd m12, m13 ; a:bl+br [second half] 1600 paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] 1601 paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] 1602 paddd m0, m11 ; a:tl+tr+bl+br [first half] 1603 paddd m2, m12 ; a:tl+tr+bl+br [second half] 1604 paddd m13, m0 1605 paddd m14, m2 1606 pslld m13, 2 1607 pslld m14, 2 1608 psubd m13, m0 ; bb [first half] 1609 psubd m14, m2 ; bb [second half] 1610 psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] 1611 psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] 1612%else 1613 mova m4, [esp+0x80] 1614 mova [esp+0x80], m5 1615 mova m5, [esp+0x70] 1616 mova [esp+0x70], m6 1617 mova m6, [esp+0x60] 1618 mova [esp+0x60], m7 1619 mova [esp+0x20], m1 1620 movu m7, [bq-2] 1621 movu m1, [bq+2] 1622 paddw m5, [bq] ; b:top+ctr+bottom 1623 paddw m7, m1 1624 paddw m1, m5, m6 ; b:top+ctr+bottom+l+r 1625 paddw m4, m7 ; b:tl+tr+bl+br 1626 psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom 1627 paddw m1, m4 1628 psllw m1, 2 1629 psubw m1, m4 ; aa 1630 movq m0, [srcq] 1631 XCHG_PIC_REG 1632 punpcklbw m0, [PIC_sym(pb_0)] 1633 punpcklwd m4, m1, [PIC_sym(pw_16)] 1634 punpckhwd m1, [PIC_sym(pw_16)] 1635 punpcklwd m2, m0, [PIC_sym(pw_16)] 1636 punpckhwd m0, [PIC_sym(pw_16)] 1637 XCHG_PIC_REG 1638 pmaddwd m4, m2 ; aa*src[x]+256 [first half] 1639 pmaddwd m1, m0 ; aa*src[x]+256 [second half] 1640%endif 1641 1642%if ARCH_X86_64 1643 paddd m6, m13 1644 paddd m10, m14 1645 psrad m6, 9 1646 psrad m10, 9 1647 packssdw m6, m10 1648 mova [tq], m6 1649%else 1650 paddd m4, [esp+0x20] 1651 paddd m1, m3 1652 psrad m4, 9 1653 psrad m1, 9 1654 packssdw m4, m1 1655 mova [tq], m4 1656%endif 1657 1658 ; shift to next row 1659%if ARCH_X86_64 1660 mova m0, m4 1661 mova m2, m5 1662 mova m4, m11 1663 mova m5, m12 1664 mova m6, m8 1665 mova m8, m9 1666%else 1667 mova m1, [esp+0x50] 1668 mova m3, [esp+0x40] 1669 mova m0, [esp+0x30] 1670 mova m2, [esp+0x80] 1671 mova m4, [esp+0x70] 1672 mova [esp+0x70], m5 1673 mova m5, [esp+0x60] 1674 mova [esp+0x80], m6 1675 mova [esp+0x60], m7 1676 psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] 1677 psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] 1678%endif 1679 1680 add srcq, strideq 1681 add aq, (384+16)*4 1682 add bq, (384+16)*2 1683 add tq, 384*2 1684 dec yd 1685 jg .loop_y 1686 add xd, 8 1687 cmp xd, wd 1688 jl .loop_x 1689 RET 1690 1691cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt 1692 movifnidn hd, hm 1693%if ARCH_X86_32 1694 SETUP_PIC r6, 0 1695%endif 1696 movd m0, wtm 1697 pshufb m0, [PIC_sym(pb_0_1)] 1698 psllw m0, 4 1699 pxor m7, m7 1700 DEFINE_ARGS dst, stride, t, w, h, idx 1701.loop_y: 1702 xor idxd, idxd 1703.loop_x: 1704 mova m1, [tq+idxq*2+ 0] 1705 mova m4, [tq+idxq*2+16] 1706 mova m5, [dstq+idxq] 1707 punpcklbw m2, m5, m7 1708 punpckhbw m5, m7 1709 psllw m3, m2, 4 1710 psllw m6, m5, 4 1711 psubw m1, m3 1712 psubw m4, m6 1713 pmulhrsw m1, m0 1714 pmulhrsw m4, m0 1715 paddw m1, m2 1716 paddw 
m4, m5 1717 packuswb m1, m4 1718 mova [dstq+idxq], m1 1719 add idxd, 16 1720 cmp idxd, wd 1721 jl .loop_x 1722 add dstq, strideq 1723 add tq, 384 * 2 1724 dec hd 1725 jg .loop_y 1726 RET 1727 1728%if ARCH_X86_64 1729cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim 1730 mov edged, edgem 1731 movifnidn wd, wm 1732 mov hd, hm 1733 mova m10, [pb_0] 1734 mova m11, [pb_0_1] 1735%else 1736cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge 1737 %define edgeb byte edgem 1738 %define wd xd 1739 %define wq wd 1740 %define wm r5m 1741 %define strideq r4m 1742 SUB esp, 8 1743 SETUP_PIC sumsqd, 1, 1 1744 1745 %define m10 [PIC_sym(pb_0)] 1746 %define m11 [PIC_sym(pb_0_1)] 1747%endif 1748 1749 test edgeb, 2 ; have_right 1750 jz .no_right 1751 xor xlimd, xlimd 1752 add wd, 2 1753 add wd, 15 1754 and wd, ~15 1755 jmp .right_done 1756.no_right: 1757 mov xlimd, 3 1758 dec wd 1759.right_done: 1760 pxor m1, m1 1761 lea srcq, [srcq+wq+1] 1762 lea sumq, [sumq+wq*2-2] 1763 lea sumsqq, [sumsqq+wq*4-4] 1764 neg wq 1765%if ARCH_X86_64 1766 lea r10, [pb_right_ext_mask+24] 1767%else 1768 mov wm, xd 1769 %define wq wm 1770%endif 1771 1772.loop_y: 1773 mov xq, wq 1774 ; load left 1775 test edgeb, 1 ; have_left 1776 jz .no_left 1777 test leftq, leftq 1778 jz .load_left_from_main 1779 movd m0, [leftq] 1780 movd m2, [srcq+xq-1] 1781 pslldq m2, 4 1782 por m0, m2 1783 pslldq m0, 11 1784 add leftq, 4 1785 jmp .expand_x 1786.no_left: 1787 movd m0, [srcq+xq-1] 1788 XCHG_PIC_REG 1789 pshufb m0, m10 1790 XCHG_PIC_REG 1791 jmp .expand_x 1792.load_left_from_main: 1793 movd m0, [srcq+xq-4] 1794 pslldq m0, 12 1795.expand_x: 1796 punpckhbw m0, m1 1797 1798 ; when we reach this, m0 contains left two px in highest words 1799 cmp xd, -8 1800 jle .loop_x 1801 test xd, xd 1802 jge .right_extend 1803.partial_load_and_extend: 1804 XCHG_PIC_REG 1805 movd m3, [srcq-1] 1806 movq m2, [srcq+xq] 1807 pshufb m3, m10 1808 punpcklbw m3, m1 1809 punpcklbw m2, m1 1810%if ARCH_X86_64 1811 movu m4, [r10+xq*2] 1812%else 1813 movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24] 1814 XCHG_PIC_REG 1815%endif 1816 pand m2, m4 1817 pandn m4, m3 1818 por m2, m4 1819 jmp .loop_x_noload 1820.right_extend: 1821 psrldq m2, m0, 14 1822 XCHG_PIC_REG 1823 pshufb m2, m11 1824 XCHG_PIC_REG 1825 jmp .loop_x_noload 1826 1827.loop_x: 1828 movq m2, [srcq+xq] 1829 punpcklbw m2, m1 1830.loop_x_noload: 1831 palignr m3, m2, m0, 8 1832 palignr m4, m2, m0, 10 1833 palignr m5, m2, m0, 12 1834 palignr m6, m2, m0, 14 1835 1836%if ARCH_X86_64 1837 paddw m0, m3, m2 1838 punpcklwd m7, m3, m2 1839 punpckhwd m3, m2 1840 paddw m0, m4 1841 punpcklwd m8, m4, m5 1842 punpckhwd m4, m5 1843 paddw m0, m5 1844 punpcklwd m9, m6, m1 1845 punpckhwd m5, m6, m1 1846 paddw m0, m6 1847 pmaddwd m7, m7 1848 pmaddwd m3, m3 1849 pmaddwd m8, m8 1850 pmaddwd m4, m4 1851 pmaddwd m9, m9 1852 pmaddwd m5, m5 1853 paddd m7, m8 1854 paddd m3, m4 1855 paddd m7, m9 1856 paddd m3, m5 1857 movu [sumq+xq*2], m0 1858 movu [sumsqq+xq*4+ 0], m7 1859 movu [sumsqq+xq*4+16], m3 1860%else 1861 paddw m0, m3, m2 1862 paddw m0, m4 1863 paddw m0, m5 1864 paddw m0, m6 1865 movu [sumq+xq*2], m0 1866 punpcklwd m7, m3, m2 1867 punpckhwd m3, m2 1868 punpcklwd m0, m4, m5 1869 punpckhwd m4, m5 1870 punpckhwd m5, m6, m1 1871 pmaddwd m7, m7 1872 pmaddwd m3, m3 1873 pmaddwd m0, m0 1874 pmaddwd m4, m4 1875 pmaddwd m5, m5 1876 paddd m7, m0 1877 paddd m3, m4 1878 paddd m3, m5 1879 punpcklwd m0, m6, m1 1880 pmaddwd m0, m0 1881 paddd m7, m0 1882 movu [sumsqq+xq*4+ 0], m7 1883 movu 
[sumsqq+xq*4+16], m3 1884%endif 1885 1886 mova m0, m2 1887 add xq, 8 1888 1889 ; if x <= -8 we can reload more pixels 1890 ; else if x < 0 we reload and extend (this implies have_right=0) 1891 ; else if x < xlimd we extend from previous load (this implies have_right=0) 1892 ; else we are done 1893 1894 cmp xd, -8 1895 jle .loop_x 1896 test xd, xd 1897 jl .partial_load_and_extend 1898 cmp xd, xlimd 1899 jl .right_extend 1900 1901 add srcq, strideq 1902 add sumsqq, (384+16)*4 1903 add sumq, (384+16)*2 1904 dec hd 1905 jg .loop_y 1906%if ARCH_X86_32 1907 ADD esp, 8 1908%endif 1909 RET 1910 1911%if ARCH_X86_64 1912cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim 1913 movifnidn edged, edgem 1914 mov ylimd, edged 1915%else 1916cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr 1917 %define wm [esp+0] 1918 %define hm [esp+4] 1919 %define edgem [esp+8] 1920 mov wm, xd 1921 mov hm, yd 1922 mov edgem, ylimd 1923%endif 1924 1925 and ylimd, 8 ; have_bottom 1926 shr ylimd, 2 1927 sub ylimd, 3 ; -3 if have_bottom=0, else -1 1928 mov xq, -2 1929%if ARCH_X86_64 1930.loop_x: 1931 lea yd, [hd+ylimd+2] 1932 lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] 1933 lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] 1934 test edgeb, 4 ; have_top 1935 jnz .load_top 1936 movu m0, [sumsq_ptrq+(384+16)*4*1] 1937 movu m1, [sumsq_ptrq+(384+16)*4*1+16] 1938 mova m2, m0 1939 mova m3, m1 1940 mova m4, m0 1941 mova m5, m1 1942 mova m6, m0 1943 mova m7, m1 1944 movu m10, [sum_ptrq+(384+16)*2*1] 1945 mova m11, m10 1946 mova m12, m10 1947 mova m13, m10 1948 jmp .loop_y_second_load 1949.load_top: 1950 movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] 1951 movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] 1952 movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] 1953 movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] 1954 mova m2, m0 1955 mova m3, m1 1956 movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 1957 movu m12, [sum_ptrq-(384+16)*2*0] ; l2 1958 mova m11, m10 1959.loop_y: 1960 movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] 1961 movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] 1962 movu m13, [sum_ptrq+(384+16)*2*1] ; l1 1963.loop_y_second_load: 1964 test yd, yd 1965 jle .emulate_second_load 1966 movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] 1967 movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] 1968 movu m14, [sum_ptrq+(384+16)*2*2] ; l0 1969.loop_y_noload: 1970 paddd m0, m2 1971 paddd m1, m3 1972 paddw m10, m11 1973 paddd m0, m4 1974 paddd m1, m5 1975 paddw m10, m12 1976 paddd m0, m6 1977 paddd m1, m7 1978 paddw m10, m13 1979 paddd m0, m8 1980 paddd m1, m9 1981 paddw m10, m14 1982 movu [sumsq_ptrq+ 0], m0 1983 movu [sumsq_ptrq+16], m1 1984 movu [sum_ptrq], m10 1985 1986 ; shift position down by one 1987 mova m0, m4 1988 mova m1, m5 1989 mova m2, m6 1990 mova m3, m7 1991 mova m4, m8 1992 mova m5, m9 1993 mova m10, m12 1994 mova m11, m13 1995 mova m12, m14 1996 add sumsq_ptrq, (384+16)*4*2 1997 add sum_ptrq, (384+16)*2*2 1998 sub yd, 2 1999 jge .loop_y 2000 ; l1 = l0 2001 mova m6, m8 2002 mova m7, m9 2003 mova m13, m14 2004 cmp yd, ylimd 2005 jg .loop_y_noload 2006 add xd, 8 2007 cmp xd, wd 2008 jl .loop_x 2009 RET 2010.emulate_second_load: 2011 mova m8, m6 2012 mova m9, m7 2013 mova m14, m13 2014 jmp .loop_y_noload 2015%else 2016.sumsq_loop_x: 2017 lea yd, [ylimd+2] 2018 add yd, hm 2019 lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] 2020 test byte edgem, 4 ; have_top 2021 jnz .sumsq_load_top 2022 movu m0, [sumsq_ptrq+(384+16)*4*1] 2023 movu m1, 
[sumsq_ptrq+(384+16)*4*1+16] 2024 mova m4, m0 2025 mova m5, m1 2026 mova m6, m0 2027 mova m7, m1 2028 mova [esp+0x1c], m0 2029 mova [esp+0x0c], m1 2030 jmp .sumsq_loop_y_second_load 2031.sumsq_load_top: 2032 movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] 2033 movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] 2034 movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] 2035 movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] 2036 mova [esp+0x1c], m0 2037 mova [esp+0x0c], m1 2038.sumsq_loop_y: 2039 movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] 2040 movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] 2041.sumsq_loop_y_second_load: 2042 test yd, yd 2043 jle .sumsq_emulate_second_load 2044 movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] 2045 movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] 2046.sumsq_loop_y_noload: 2047 paddd m0, [esp+0x1c] 2048 paddd m1, [esp+0x0c] 2049 paddd m0, m4 2050 paddd m1, m5 2051 paddd m0, m6 2052 paddd m1, m7 2053 paddd m0, m2 2054 paddd m1, m3 2055 movu [sumsq_ptrq+ 0], m0 2056 movu [sumsq_ptrq+16], m1 2057 2058 ; shift position down by one 2059 mova m0, m4 2060 mova m1, m5 2061 mova m4, m2 2062 mova m5, m3 2063 mova [esp+0x1c], m6 2064 mova [esp+0x0c], m7 2065 add sumsq_ptrq, (384+16)*4*2 2066 sub yd, 2 2067 jge .sumsq_loop_y 2068 ; l1 = l0 2069 mova m6, m2 2070 mova m7, m3 2071 cmp yd, ylimd 2072 jg .sumsq_loop_y_noload 2073 add xd, 8 2074 cmp xd, wm 2075 jl .sumsq_loop_x 2076 2077 mov xd, -2 2078.sum_loop_x: 2079 lea yd, [ylimd+2] 2080 add yd, hm 2081 lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] 2082 test byte edgem, 4 ; have_top 2083 jnz .sum_load_top 2084 movu m0, [sum_ptrq+(384+16)*2*1] 2085 mova m1, m0 2086 mova m2, m0 2087 mova m3, m0 2088 jmp .sum_loop_y_second_load 2089.sum_load_top: 2090 movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4 2091 movu m2, [sum_ptrq-(384+16)*2*0] ; l2 2092 mova m1, m0 2093.sum_loop_y: 2094 movu m3, [sum_ptrq+(384+16)*2*1] ; l1 2095.sum_loop_y_second_load: 2096 test yd, yd 2097 jle .sum_emulate_second_load 2098 movu m4, [sum_ptrq+(384+16)*2*2] ; l0 2099.sum_loop_y_noload: 2100 paddw m0, m1 2101 paddw m0, m2 2102 paddw m0, m3 2103 paddw m0, m4 2104 movu [sum_ptrq], m0 2105 2106 ; shift position down by one 2107 mova m0, m2 2108 mova m1, m3 2109 mova m2, m4 2110 add sum_ptrq, (384+16)*2*2 2111 sub yd, 2 2112 jge .sum_loop_y 2113 ; l1 = l0 2114 mova m3, m4 2115 cmp yd, ylimd 2116 jg .sum_loop_y_noload 2117 add xd, 8 2118 cmp xd, wm 2119 jl .sum_loop_x 2120 RET 2121.sumsq_emulate_second_load: 2122 mova m2, m6 2123 mova m3, m7 2124 jmp .sumsq_loop_y_noload 2125.sum_emulate_second_load: 2126 mova m4, m3 2127 jmp .sum_loop_y_noload 2128%endif 2129 2130cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s 2131 movifnidn sd, sm 2132 sub aq, (384+16-1)*4 2133 sub bq, (384+16-1)*2 2134 add hd, 2 2135%if ARCH_X86_64 2136 LEA r5, sgr_x_by_x-0xF03 2137%else 2138 SETUP_PIC r5, 0 2139%endif 2140 movd m6, sd 2141 pshuflw m6, m6, q0000 2142 punpcklqdq m6, m6 2143 pxor m7, m7 2144 DEFINE_ARGS a, b, w, h, x 2145%if ARCH_X86_64 2146 mova m8, [pd_0xF0080029] 2147 mova m9, [pw_256] 2148 psrld m10, m9, 15 ; pd_512 2149%else 2150 %define m8 [PIC_sym(pd_0xF0080029)] 2151 %define m9 [PIC_sym(pw_256)] 2152 %define m10 [PIC_sym(pd_512)] 2153%endif 2154.loop_y: 2155 mov xq, -2 2156.loop_x: 2157 movq m0, [bq+xq*2+0] 2158 movq m1, [bq+xq*2+8] 2159 punpcklwd m0, m7 2160 punpcklwd m1, m7 2161 movu m2, [aq+xq*4+ 0] 2162 movu m3, [aq+xq*4+16] 2163 pslld m4, m2, 3 ; aa * 8 2164 pslld m5, m3, 3 2165 paddd m2, m4 ; aa * 9 2166 paddd m3, m5 2167 paddd m4, m4 ; aa * 16 2168 
paddd m5, m5 2169 paddd m2, m4 ; aa * 25 2170 paddd m3, m5 2171 pmaddwd m4, m0, m0 2172 pmaddwd m5, m1, m1 2173 psubd m2, m4 ; p = aa * 25 - bb * bb 2174 psubd m3, m5 2175 MULLD m2, m6 2176 MULLD m3, m6 2177 paddusw m2, m8 2178 paddusw m3, m8 2179 psrld m2, 20 ; z 2180 psrld m3, 20 2181 GATHERDD m4, m2 ; xx 2182 GATHERDD m2, m3 2183 psrld m4, 24 2184 psrld m2, 24 2185 packssdw m3, m4, m2 2186 pmullw m4, m8 2187 pmullw m2, m8 2188 psubw m5, m9, m3 2189 pmaddwd m0, m4 2190 pmaddwd m1, m2 2191 paddd m0, m10 2192 paddd m1, m10 2193 psrld m0, 10 2194 psrld m1, 10 2195 movu [bq+xq*2], m5 2196 movu [aq+xq*4+ 0], m0 2197 movu [aq+xq*4+16], m1 2198 add xd, 8 2199 cmp xd, wd 2200 jl .loop_x 2201 add aq, (384+16)*4*2 2202 add bq, (384+16)*2*2 2203 sub hd, 2 2204 jg .loop_y 2205 RET 2206 2207%if ARCH_X86_64 2208cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \ 2209 tmp_base, src_base, a_base, b_base, x, y 2210 movifnidn wd, wm 2211 mov hd, hm 2212 mov tmp_baseq, tq 2213 mov src_baseq, srcq 2214 mov a_baseq, aq 2215 mov b_baseq, bq 2216 mova m9, [pw_5_6] 2217 mova m12, [pw_256] 2218 psrlw m10, m12, 8 ; pw_1 2219 psrlw m11, m12, 1 ; pw_128 2220 pxor m13, m13 2221%else 2222cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y 2223 %define tmp_baseq r0m 2224 %define src_baseq r1m 2225 %define a_baseq r3m 2226 %define b_baseq r4m 2227 %define wd r5m 2228 %define hd r6m 2229 2230 SUB esp, 8 2231 SETUP_PIC yd 2232 2233 %define m8 m5 2234 %define m9 [PIC_sym(pw_5_6)] 2235 %define m10 [PIC_sym(pw_1)] 2236 %define m11 [PIC_sym(pw_128)] 2237 %define m12 [PIC_sym(pw_256)] 2238 %define m13 m0 2239%endif 2240 xor xd, xd 2241.loop_x: 2242 mov tq, tmp_baseq 2243 mov srcq, src_baseq 2244 mov aq, a_baseq 2245 mov bq, b_baseq 2246 movu m0, [aq+xq*4-(384+16)*4-4] 2247 mova m1, [aq+xq*4-(384+16)*4] 2248 movu m2, [aq+xq*4-(384+16)*4+4] 2249 movu m3, [aq+xq*4-(384+16)*4-4+16] 2250 mova m4, [aq+xq*4-(384+16)*4+16] 2251 movu m5, [aq+xq*4-(384+16)*4+4+16] 2252 paddd m0, m2 2253 paddd m3, m5 2254 paddd m0, m1 2255 paddd m3, m4 2256 pslld m2, m0, 2 2257 pslld m5, m3, 2 2258 paddd m2, m0 2259 paddd m5, m3 2260 paddd m0, m2, m1 ; prev_odd_b [first half] 2261 paddd m1, m5, m4 ; prev_odd_b [second half] 2262 movu m3, [bq+xq*2-(384+16)*2-2] 2263 mova m4, [bq+xq*2-(384+16)*2] 2264 movu m5, [bq+xq*2-(384+16)*2+2] 2265 paddw m3, m5 2266 punpcklwd m5, m3, m4 2267 punpckhwd m3, m4 2268 pmaddwd m5, m9 2269 pmaddwd m3, m9 2270 mova m2, m5 2271 packssdw m2, m3 ; prev_odd_a 2272 lea tq, [tq+xq*2] 2273 lea srcq, [srcq+xq*1] 2274 lea aq, [aq+xq*4+(384+16)*4] 2275 lea bq, [bq+xq*2+(384+16)*2] 2276%if ARCH_X86_32 2277 mov [esp], PIC_reg 2278%endif 2279 mov yd, hd 2280 XCHG_PIC_REG 2281.loop_y: 2282 movu m3, [aq-4] 2283 mova m4, [aq] 2284 movu m5, [aq+4] 2285 paddd m3, m5 2286 paddd m3, m4 2287 pslld m5, m3, 2 2288 paddd m5, m3 2289 paddd m5, m4 ; cur_odd_b [first half] 2290 movu m3, [aq+16-4] 2291 mova m6, [aq+16] 2292 movu m7, [aq+16+4] 2293 paddd m3, m7 2294 paddd m3, m6 2295 pslld m7, m3, 2 2296 paddd m7, m3 2297 paddd m4, m7, m6 ; cur_odd_b [second half] 2298 movu m3, [bq-2] 2299 mova m6, [bq] 2300 movu m7, [bq+2] 2301 paddw m3, m7 2302 punpcklwd m7, m3, m6 2303 punpckhwd m3, m6 2304 pmaddwd m7, m9 2305 pmaddwd m3, m9 2306 packssdw m6, m7, m3 ; cur_odd_a 2307 2308 paddd m0, m5 ; cur_even_b [first half] 2309 paddd m1, m4 ; cur_even_b [second half] 2310 paddw m2, m6 ; cur_even_a 2311 2312 movq m3, [srcq] 2313%if ARCH_X86_64 2314 punpcklbw m3, m13 2315%else 2316 mova [td], m5 2317 pxor m7, m7 2318 punpcklbw 
m3, m7 2319%endif 2320 punpcklwd m7, m3, m10 2321 punpckhwd m3, m10 2322 punpcklwd m8, m2, m12 2323 punpckhwd m2, m12 2324 pmaddwd m7, m8 2325 pmaddwd m3, m2 2326 paddd m7, m0 2327 paddd m3, m1 2328 psrad m7, 9 2329 psrad m3, 9 2330 2331%if ARCH_X86_32 2332 pxor m13, m13 2333%endif 2334 movq m8, [srcq+strideq] 2335 punpcklbw m8, m13 2336 punpcklwd m0, m8, m10 2337 punpckhwd m8, m10 2338 punpcklwd m1, m6, m11 2339 punpckhwd m2, m6, m11 2340 pmaddwd m0, m1 2341 pmaddwd m8, m2 2342%if ARCH_X86_64 2343 paddd m0, m5 2344%else 2345 paddd m0, [td] 2346%endif 2347 paddd m8, m4 2348 psrad m0, 8 2349 psrad m8, 8 2350 2351 packssdw m7, m3 2352 packssdw m0, m8 2353%if ARCH_X86_32 2354 mova m5, [td] 2355%endif 2356 mova [tq+384*2*0], m7 2357 mova [tq+384*2*1], m0 2358 2359 mova m0, m5 2360 mova m1, m4 2361 mova m2, m6 2362 add aq, (384+16)*4*2 2363 add bq, (384+16)*2*2 2364 add tq, 384*2*2 2365 lea srcq, [srcq+strideq*2] 2366%if ARCH_X86_64 2367 sub yd, 2 2368%else 2369 sub dword [esp+4], 2 2370%endif 2371 jg .loop_y 2372 add xd, 8 2373 cmp xd, wd 2374 jl .loop_x 2375%if ARCH_X86_32 2376 ADD esp, 8 2377%endif 2378 RET 2379 2380%undef t2 2381cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt 2382 movifnidn wd, wm 2383 movd m0, wtm 2384%if ARCH_X86_64 2385 movifnidn hd, hm 2386 mova m10, [pd_1024] 2387 pxor m11, m11 2388%else 2389 SETUP_PIC hd, 0 2390 %define m10 [PIC_sym(pd_1024)] 2391 %define m11 m7 2392%endif 2393 pshufd m0, m0, 0 2394 DEFINE_ARGS dst, stride, t1, t2, w, h, idx 2395%if ARCH_X86_32 2396 %define hd hmp 2397%endif 2398 2399.loop_y: 2400 xor idxd, idxd 2401.loop_x: 2402 mova m1, [t1q+idxq*2+ 0] 2403 mova m2, [t1q+idxq*2+16] 2404 mova m3, [t2q+idxq*2+ 0] 2405 mova m4, [t2q+idxq*2+16] 2406 mova m6, [dstq+idxq] 2407%if ARCH_X86_32 2408 pxor m11, m11 2409%endif 2410 punpcklbw m5, m6, m11 2411 punpckhbw m6, m11 2412 psllw m7, m5, 4 2413 psubw m1, m7 2414 psubw m3, m7 2415 psllw m7, m6, 4 2416 psubw m2, m7 2417 psubw m4, m7 2418 punpcklwd m7, m1, m3 2419 punpckhwd m1, m3 2420 punpcklwd m3, m2, m4 2421 punpckhwd m2, m4 2422 pmaddwd m7, m0 2423 pmaddwd m1, m0 2424 pmaddwd m3, m0 2425 pmaddwd m2, m0 2426 paddd m7, m10 2427 paddd m1, m10 2428 paddd m3, m10 2429 paddd m2, m10 2430 psrad m7, 11 2431 psrad m1, 11 2432 psrad m3, 11 2433 psrad m2, 11 2434 packssdw m7, m1 2435 packssdw m3, m2 2436 paddw m7, m5 2437 paddw m3, m6 2438 packuswb m7, m3 2439 mova [dstq+idxq], m7 2440 add idxd, 16 2441 cmp idxd, wd 2442 jl .loop_x 2443 add dstq, strideq 2444 add t1q, 384 * 2 2445 add t2q, 384 * 2 2446 dec hd 2447 jg .loop_y 2448 RET 2449