; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf3:    db  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
sgr_lshuf5:    db  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

pb_right_ext_mask: times 24 db 0xff
                   times  8 db 0
pb_1:          times 16 db 1
pb_3:          times 16 db 3
pw_256:        times 8 dw 256
pw_2056:       times 8 dw 2056
pw_m16380:     times 8 dw -16380
pd_4096:       times 4 dd 4096
pd_34816:      times 4 dd 34816
pd_0xffff:     times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7

cextern sgr_x_by_x

SECTION .text

%macro movif64 2 ; dst, src
 %if ARCH_X86_64
    mov             %1, %2
 %endif
%endmacro

%macro movif32 2 ; dst, src
 %if ARCH_X86_32
    mov             %1, %2
 %endif
%endmacro

%if ARCH_X86_32
 %define PIC_base_offset $$

 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
  %assign pic_reg_stk_off 4
  %xdefine PIC_reg %1
  %if %2 == 1
    mov          [esp], %1
  %endif
    LEA        PIC_reg, PIC_base_offset
  %if %3 == 1
    XCHG_PIC_REG
  %endif
 %endmacro

 %macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
  %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov        PIC_reg, [esp+pic_reg_stk_off]
 %endmacro

 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
 %macro XCHG_PIC_REG 0
 %endmacro

 %define PIC_sym(sym) (sym)
%endif

%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt, x
 %define tmpstrideq strideq
 %define base 0
    mov           fltq, r6mp
    mov             wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    movq           m14, [fltq]
    add           lpfq, wq
    movq            m7, [fltq+16]
    add           dstq, wq
    lea             t1, [rsp+wq*2+16]
    mova           m15, [pw_2056]
    neg             wq
%if cpuflag(ssse3)
    pshufb         m14, [wiener_init]
    mova            m8, [wiener_shufA]
    pshufd         m12, m14, q2222 ; x0 x0
    mova            m9, [wiener_shufB]
    pshufd         m13, m14, q3333 ; x1 x2
    mova           m10, [wiener_shufC]
    punpcklqdq     m14, m14        ; x3
    mova           m11, [wiener_shufD]
%else
    mova           m10, [pw_m16380]
    punpcklwd      m14, m14
    pshufd         m11, m14, q0000 ; x0
    pshufd         m12, m14, q1111 ; x1
    pshufd         m13, m14, q2222 ; x2
    pshufd         m14, m14, q3333 ; x3
%endif
%else
DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
 %define m10         [base+wiener_shufC]
 %define m11         [base+wiener_shufD]
 %define stk_off     96
%else
 %define m10         [base+pw_m16380]
 %define m11         [stk+96]
 %define stk_off     112
%endif
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
 %define base        r6-pb_right_ext_mask-21
 %define stk         esp
 %define dstq        leftq
 %define edgeb       byte edged
 %define edged       [stk+ 8]
 %define dstmp       [stk+12]
 %define hd          dword [stk+16]
 %define wq          [stk+20]
 %define strideq     [stk+24]
 %define leftmp      [stk+28]
 %define t2          [stk+32]
 %define t4          [stk+36]
 %define t5          [stk+40]
 %define t6          [stk+44]
 %define m8          [base+wiener_shufA]
 %define m9          [base+wiener_shufB]
 %define m12         [stk+48]
 %define m13         [stk+64]
 %define m14         [stk+80]
 %define m15         [base+pw_2056]
    mov             r1, r6m ; flt
    mov             r0, r0m ; dst
    mov             r4, r4m ; w
    mov           lpfq, lpfm
    mov             r2, r7m ; edge
    mov             r5, r5m ; h
    movq            m3, [r1+ 0]
    movq            m7, [r1+16]
    add             r0, r4
    mov             r1, r1m ; stride
    add           lpfq, r4
    mov          edged, r2
    mov             r2, r2m ; left
    mov          dstmp, r0
    lea             t1, [rsp+r4*2+stk_off]
    mov             hd, r5
    neg             r4
    LEA             r6, pb_right_ext_mask+21
    mov             wq, r4
    mov        strideq, r1
    mov         leftmp, r2
    mov             r4, r1
%if cpuflag(ssse3)
    pshufb          m3, [base+wiener_init]
    pshufd          m1, m3, q2222
    pshufd          m2, m3, q3333
    punpcklqdq      m3, m3
%else
    punpcklwd       m3, m3
    pshufd          m0, m3, q0000
    pshufd          m1, m3, q1111
    pshufd          m2, m3, q2222
    pshufd          m3, m3, q3333
    mova           m11, m0
%endif
    mova           m12, m1
    mova           m13, m2
    mova           m14, m3
%endif
    psllw           m7, 5
    pshufd          m6, m7, q0000 ; y0 y1
    pshufd          m7, m7, q1111 ; y2 y3
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea             t3, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    add             t3, tmpstrideq
    mov          [rsp], t3 ; below
    mov             t4, t1
    add             t1, 384*2
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    RET
.no_top:
    lea             t3, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    lea             t3, [t3+tmpstrideq*2]
    mov          [rsp], t3
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
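; The t1..t6 pointers form a ring buffer of 384*2-byte rows of horizontally
; filtered intermediates on the stack; in this no-top path they all alias the
; first filtered row, so the 7-tap vertical window sees the top line
; replicated. Rough C-style sketch of the overall flow (identifiers below are
; illustrative only, not symbols from this file):
;   for (int y = 0; y < h; y++) {
;       wiener_h(t0_row, src_row(y));            // .h / horizontal part of .hv
;       wiener_v(dst_row(y), stored_rows + t0_row); // .v / vertical part of .hv
;       rotate(&t6, &t5, &t4, &t3, &t2, &t1, &t0);  // pointer swap, no copies
;   }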
269 add t1, 384*2 270 call .h 271 mov t2, t1 272 dec hd 273 jz .v2 274 add lpfq, strideq 275 add t1, 384*2 276 call .h 277 dec hd 278 jz .v3 279 lea t0, [t1+384*2] 280 call .hv 281 dec hd 282 jz .v3 283 add t0, 384*8 284 call .hv 285 dec hd 286 jnz .main 287.v3: 288 call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v 289.v2: 290 call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v 291 jmp .v1 292.extend_right: 293 movd m2, [lpfq-4] 294%if ARCH_X86_64 295 push r0 296 lea r0, [pb_right_ext_mask+21] 297 movu m0, [r0+xq+0] 298 movu m1, [r0+xq+8] 299 pop r0 300%else 301 movu m0, [r6+xq+0] 302 movu m1, [r6+xq+8] 303%endif 304%if cpuflag(ssse3) 305 pshufb m2, [base+pb_3] 306%else 307 punpcklbw m2, m2 308 pshuflw m2, m2, q3333 309 punpcklqdq m2, m2 310%endif 311 pand m4, m0 312 pand m5, m1 313 pandn m0, m2 314 pandn m1, m2 315 por m4, m0 316 por m5, m1 317 ret 318.h: 319 %define stk esp+4 ; offset due to call 320 mov xq, wq 321 test edgeb, 1 ; LR_HAVE_LEFT 322 jz .h_extend_left 323 movifnidn leftq, leftmp 324 mova m4, [lpfq+xq] 325 movd m5, [leftq] 326 add leftq, 4 327 pslldq m4, 4 328 por m4, m5 329 movifnidn leftmp, leftq 330 jmp .h_main 331.h_extend_left: 332%if cpuflag(ssse3) 333 mova m4, [lpfq+xq] 334 pshufb m4, [base+wiener_l_shuf] 335%else 336 mova m5, [lpfq+xq] 337 pshufd m4, m5, q2103 338 punpcklbw m5, m5 339 punpcklwd m5, m5 340 movss m4, m5 341%endif 342 jmp .h_main 343.h_top: 344 mov xq, wq 345 test edgeb, 1 ; LR_HAVE_LEFT 346 jz .h_extend_left 347.h_loop: 348 movu m4, [lpfq+xq-4] 349.h_main: 350 movu m5, [lpfq+xq+4] 351 test edgeb, 2 ; LR_HAVE_RIGHT 352 jnz .h_have_right 353 cmp xd, -18 354 jl .h_have_right 355 call .extend_right 356.h_have_right: 357%macro %%h7 0 358%if cpuflag(ssse3) 359 pshufb m0, m4, m8 360 pmaddubsw m0, m12 361 pshufb m1, m5, m8 362 pmaddubsw m1, m12 363 pshufb m2, m4, m9 364 pmaddubsw m2, m13 365 pshufb m3, m5, m9 366 pmaddubsw m3, m13 367 paddw m0, m2 368 pshufb m2, m4, m10 369 pmaddubsw m2, m13 370 paddw m1, m3 371 pshufb m3, m5, m10 372 pmaddubsw m3, m13 373 pshufb m4, m11 374 paddw m0, m2 375 pmullw m2, m14, m4 376 pshufb m5, m11 377 paddw m1, m3 378 pmullw m3, m14, m5 379 psllw m4, 7 380 psllw m5, 7 381 paddw m0, m2 382 mova m2, [base+pw_m16380] 383 paddw m1, m3 384 paddw m4, m2 385 paddw m5, m2 386 paddsw m0, m4 387 paddsw m1, m5 388%else 389 psrldq m0, m4, 1 390 pslldq m1, m4, 1 391 pxor m3, m3 392 punpcklbw m0, m3 393 punpckhbw m1, m3 394 paddw m0, m1 395 pmullw m0, m11 396 psrldq m1, m4, 2 397 pslldq m2, m4, 2 398 punpcklbw m1, m3 399 punpckhbw m2, m3 400 paddw m1, m2 401 pmullw m1, m12 402 paddw m0, m1 403 pshufd m2, m4, q0321 404 punpcklbw m2, m3 405 pmullw m1, m14, m2 406 paddw m0, m1 407 psrldq m1, m4, 3 408 pslldq m4, 3 409 punpcklbw m1, m3 410 punpckhbw m4, m3 411 paddw m1, m4 412 pmullw m1, m13 413 paddw m0, m1 414 psllw m2, 7 415 paddw m2, m10 416 paddsw m0, m2 417 psrldq m1, m5, 1 418 pslldq m2, m5, 1 419 punpcklbw m1, m3 420 punpckhbw m2, m3 421 paddw m1, m2 422 pmullw m1, m11 423 psrldq m2, m5, 2 424 pslldq m4, m5, 2 425 punpcklbw m2, m3 426 punpckhbw m4, m3 427 paddw m2, m4 428 pmullw m2, m12 429 paddw m1, m2 430 pshufd m4, m5, q0321 431 punpcklbw m4, m3 432 pmullw m2, m14, m4 433 paddw m1, m2 434 psrldq m2, m5, 3 435 pslldq m5, 3 436 punpcklbw m2, m3 437 punpckhbw m5, m3 438 paddw m2, m5 439 pmullw m2, m13 440 paddw m1, m2 441 psllw m4, 7 442 paddw m4, m10 443 paddsw m1, m4 444%endif 445%endmacro 446 %%h7 447 psraw m0, 3 448 psraw m1, 3 449 paddw m0, m15 450 paddw m1, m15 451 mova [t1+xq*2+ 0], m0 452 mova [t1+xq*2+16], m1 
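; .h writes one row of 16-bit intermediates to t1: the 7-tap horizontal dot
; product (the centre tap's implicit +128 weight appears to be restored via
; the <<7 term, and the -16380 bias lets the saturating paddsw mimic the
; reference clip) is shifted right by 3 and re-biased by 2056 (pw_2056) so the
; vertical pass can work on a convenient 16-bit range. Roughly:
;   t1[x] = (sat16(sum7(flt, px, x) + (px[x] << 7) - 16380) >> 3) + 2056
; (an illustrative summary of the code above, not a formula from the source).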
453 add xq, 16 454 jl .h_loop 455 ret 456ALIGN function_align 457.hv: 458 add lpfq, strideq 459 mov xq, wq 460 test edgeb, 1 ; LR_HAVE_LEFT 461 jz .hv_extend_left 462 movifnidn leftq, leftmp 463 mova m4, [lpfq+xq] 464 movd m5, [leftq] 465 add leftq, 4 466 pslldq m4, 4 467 por m4, m5 468 movifnidn leftmp, leftq 469 jmp .hv_main 470.hv_extend_left: 471%if cpuflag(ssse3) 472 mova m4, [lpfq+xq] 473 pshufb m4, [base+wiener_l_shuf] 474%else 475 mova m5, [lpfq+xq] 476 pshufd m4, m5, q2103 477 punpcklbw m5, m5 478 punpcklwd m5, m5 479 movss m4, m5 480%endif 481 jmp .hv_main 482.hv_bottom: 483 mov xq, wq 484 test edgeb, 1 ; LR_HAVE_LEFT 485 jz .hv_extend_left 486.hv_loop: 487 movu m4, [lpfq+xq-4] 488.hv_main: 489 movu m5, [lpfq+xq+4] 490 test edgeb, 2 ; LR_HAVE_RIGHT 491 jnz .hv_have_right 492 cmp xd, -18 493 jl .hv_have_right 494 call .extend_right 495.hv_have_right: 496 %%h7 497%if ARCH_X86_64 498 mova m2, [t4+xq*2] 499 paddw m2, [t2+xq*2] 500%else 501 mov r2, t4 502 mova m2, [r2+xq*2] 503 mov r2, t2 504 paddw m2, [r2+xq*2] 505 mov r2, t5 506%endif 507 mova m3, [t3+xq*2] 508%if ARCH_X86_64 509 mova m5, [t5+xq*2] 510%else 511 mova m5, [r2+xq*2] 512 mov r2, t6 513%endif 514 paddw m5, [t1+xq*2] 515 psraw m0, 3 516 psraw m1, 3 517 paddw m0, m15 518 paddw m1, m15 519%if ARCH_X86_64 520 paddw m4, m0, [t6+xq*2] 521%else 522 paddw m4, m0, [r2+xq*2] 523 mov r2, t4 524%endif 525 mova [t0+xq*2], m0 526 punpcklwd m0, m2, m3 527 pmaddwd m0, m7 528 punpckhwd m2, m3 529 pmaddwd m2, m7 530 punpcklwd m3, m4, m5 531 pmaddwd m3, m6 532 punpckhwd m4, m5 533 pmaddwd m4, m6 534 paddd m0, m3 535 mova m3, [t3+xq*2+16] 536 paddd m4, m2 537%if ARCH_X86_64 538 mova m2, [t4+xq*2+16] 539 paddw m2, [t2+xq*2+16] 540 mova m5, [t5+xq*2+16] 541%else 542 mova m2, [r2+xq*2+16] 543 mov r2, t2 544 paddw m2, [r2+xq*2+16] 545 mov r2, t5 546 mova m5, [r2+xq*2+16] 547 mov r2, t6 548%endif 549 paddw m5, [t1+xq*2+16] 550 packuswb m0, m4 551%if ARCH_X86_64 552 paddw m4, m1, [t6+xq*2+16] 553%else 554 paddw m4, m1, [r2+xq*2+16] 555 mov dstq, dstmp 556%endif 557 mova [t0+xq*2+16], m1 558 punpcklwd m1, m2, m3 559 pmaddwd m1, m7 560 punpckhwd m2, m3 561 pmaddwd m2, m7 562 punpcklwd m3, m4, m5 563 pmaddwd m3, m6 564 punpckhwd m4, m5 565 pmaddwd m4, m6 566 paddd m1, m3 567 paddd m2, m4 568 packuswb m1, m2 569 psrlw m0, 8 570 psrlw m1, 8 571 packuswb m0, m1 572 mova [dstq+xq], m0 573 add xq, 16 574 jl .hv_loop 575 add dstq, strideq 576%if ARCH_X86_64 577 mov t6, t5 578 mov t5, t4 579 mov t4, t3 580 mov t3, t2 581 mov t2, t1 582 mov t1, t0 583 mov t0, t6 584%else 585 mov dstmp, dstq 586 mov r1, t5 587 mov r2, t4 588 mov t6, r1 589 mov t5, r2 590 mov t4, t3 591 mov t3, t2 592 mov t2, t1 593 mov t1, t0 594 mov t0, r1 595%endif 596 ret 597%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code 598.v: 599 mov xq, wq 600.v_loop: 601%if ARCH_X86_64 602 mova m1, [t4+xq*2] 603 paddw m1, [t2+xq*2] 604%else 605 mov r2, t4 606 mova m1, [r2+xq*2] 607 mov r2, t2 608 paddw m1, [r2+xq*2] 609 mov r2, t6 610%endif 611 mova m2, [t3+xq*2] 612 mova m4, [t1+xq*2] 613%if ARCH_X86_64 614 paddw m3, m4, [t6+xq*2] 615 paddw m4, [t5+xq*2] 616%else 617 paddw m3, m4, [r2+xq*2] 618 mov r2, t5 619 paddw m4, [r2+xq*2] 620 mov r2, t4 621%endif 622 punpcklwd m0, m1, m2 623 pmaddwd m0, m7 624 punpckhwd m1, m2 625 pmaddwd m1, m7 626 punpcklwd m2, m3, m4 627 pmaddwd m2, m6 628 punpckhwd m3, m4 629 pmaddwd m3, m6 630 paddd m0, m2 631 paddd m1, m3 632%if ARCH_X86_64 633 mova m2, [t4+xq*2+16] 634 paddw m2, [t2+xq*2+16] 635%else 636 mova m2, [r2+xq*2+16] 637 mov r2, t2 638 paddw 
m2, [r2+xq*2+16] 639 mov r2, t6 640%endif 641 mova m3, [t3+xq*2+16] 642 mova m5, [t1+xq*2+16] 643%if ARCH_X86_64 644 paddw m4, m5, [t6+xq*2+16] 645 paddw m5, [t5+xq*2+16] 646%else 647 paddw m4, m5, [r2+xq*2+16] 648 mov r2, t5 649 paddw m5, [r2+xq*2+16] 650 movifnidn dstq, dstmp 651%endif 652 packuswb m0, m1 653 punpcklwd m1, m2, m3 654 pmaddwd m1, m7 655 punpckhwd m2, m3 656 pmaddwd m2, m7 657 punpcklwd m3, m4, m5 658 pmaddwd m3, m6 659 punpckhwd m4, m5 660 pmaddwd m4, m6 661 paddd m1, m3 662 paddd m2, m4 663 packuswb m1, m2 664 psrlw m0, 8 665 psrlw m1, 8 666 packuswb m0, m1 667 mova [dstq+xq], m0 668 add xq, 16 669 jl .v_loop 670 add dstq, strideq 671%if ARCH_X86_64 672 mov t6, t5 673 mov t5, t4 674%else 675 mov dstmp, dstq 676 mov r1, t5 677 mov r2, t4 678 mov t6, r1 679 mov t5, r2 680%endif 681 mov t4, t3 682 mov t3, t2 683 mov t2, t1 684 ret 685%endif 686 687%if ARCH_X86_64 688cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ 689 w, h, edge, flt, x 690 mov fltq, r6mp 691 mov wd, wm 692 movifnidn hd, hm 693 mov edged, r7m 694 movq m14, [fltq] 695 add lpfq, wq 696 movq m7, [fltq+16] 697 add dstq, wq 698 mova m8, [pw_m16380] 699 lea t1, [rsp+wq*2+16] 700 mova m15, [pw_2056] 701 neg wq 702%if cpuflag(ssse3) 703 pshufb m14, [wiener_init] 704 mova m9, [wiener_shufB] 705 pshufd m13, m14, q3333 ; x1 x2 706 mova m10, [wiener_shufC] 707 punpcklqdq m14, m14 ; x3 708 mova m11, [wiener_shufD] 709 mova m12, [wiener_l_shuf] 710%else 711 punpcklwd m14, m14 712 pshufd m11, m14, q1111 ; x1 713 pshufd m13, m14, q2222 ; x2 714 pshufd m14, m14, q3333 ; x3 715%endif 716%else 717%if cpuflag(ssse3) 718 %define stk_off 80 719%else 720 %define m11 [stk+80] 721 %define stk_off 96 722%endif 723cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride 724 %define stk esp 725 %define leftmp [stk+28] 726 %define m8 [base+pw_m16380] 727 %define m12 [base+wiener_l_shuf] 728 %define m14 [stk+48] 729 mov r1, r6m ; flt 730 mov r0, r0m ; dst 731 mov r4, r4m ; w 732 mov lpfq, lpfm 733 mov r2, r7m ; edge 734 mov r5, r5m ; h 735 movq m2, [r1+ 0] 736 movq m7, [r1+16] 737 add r0, r4 738 mov r1, r1m ; stride 739 add lpfq, r4 740 mov edged, r2 741 mov r2, r2m ; left 742 mov dstmp, r0 743 lea t1, [rsp+r4*2+stk_off] 744 mov hd, r5 745 neg r4 746 LEA r6, pb_right_ext_mask+21 747 mov wq, r4 748 mov strideq, r1 749 mov leftmp, r2 750 mov r4, r1 751%if cpuflag(ssse3) 752 pshufb m2, [base+wiener_init] 753 pshufd m1, m2, q3333 754 punpcklqdq m2, m2 755%else 756 punpcklwd m2, m2 757 pshufd m0, m2, q1111 758 pshufd m1, m2, q2222 759 pshufd m2, m2, q3333 760 mova m11, m0 761%endif 762 mova m13, m1 763 mova m14, m2 764%endif 765 psllw m7, 5 766 pshufd m6, m7, q0000 ; __ y1 767 pshufd m7, m7, q1111 ; y2 y3 768 test edgeb, 4 ; LR_HAVE_TOP 769 jz .no_top 770 call .h_top 771 add lpfq, strideq 772 mov t4, t1 773 add t1, 384*2 774 call .h_top 775 lea xq, [lpfq+tmpstrideq*4] 776 mov lpfq, dstmp 777 mov t3, t1 778 add t1, 384*2 779 add xq, tmpstrideq 780 mov [rsp], xq ; below 781 call .h 782 mov t2, t1 783 dec hd 784 jz .v1 785 add lpfq, strideq 786 add t1, 384*2 787 call .h 788 dec hd 789 jz .v2 790.main: 791 mov t0, t4 792.main_loop: 793 call .hv 794 dec hd 795 jnz .main_loop 796 test edgeb, 8 ; LR_HAVE_BOTTOM 797 jz .v2 798 mov lpfq, [rsp] 799 call .hv_bottom 800 add lpfq, strideq 801 call .hv_bottom 802.end: 803 RET 804.no_top: 805 lea t3, [lpfq+tmpstrideq*4] 806 mov lpfq, dstmp 807 lea t3, [t3+tmpstrideq*2] 808 mov [rsp], t3 809 call .h 810 mov t4, t1 811 mov t3, t1 812 mov t2, t1 813 dec hd 
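; wiener_filter5 follows the same structure as wiener_filter7 above, but with
; a shorter history: only the t1..t4 row slots are kept and the vertical
; filter uses the y1..y3 taps (m6 holds "__ y1"), so one less row of context
; is needed at the top and bottom edges.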
814 jz .v1 815 add lpfq, strideq 816 add t1, 384*2 817 call .h 818 dec hd 819 jz .v2 820 lea t0, [t1+384*2] 821 call .hv 822 dec hd 823 jz .v2 824 add t0, 384*6 825 call .hv 826 dec hd 827 jnz .main 828.v2: 829 call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v 830 add dstq, strideq 831 mov t4, t3 832 mov t3, t2 833 mov t2, t1 834 movifnidn dstmp, dstq 835.v1: 836 call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v 837 jmp .end 838.h: 839 %define stk esp+4 840 mov xq, wq 841 test edgeb, 1 ; LR_HAVE_LEFT 842 jz .h_extend_left 843 movifnidn leftq, leftmp 844 mova m4, [lpfq+xq] 845 movd m5, [leftq] 846 add leftq, 4 847 pslldq m4, 4 848 por m4, m5 849 movifnidn leftmp, leftq 850 jmp .h_main 851.h_extend_left: 852%if cpuflag(ssse3) 853 mova m4, [lpfq+xq] 854 pshufb m4, m12 855%else 856 mova m5, [lpfq+xq] 857 pshufd m4, m5, q2103 858 punpcklbw m5, m5 859 punpcklwd m5, m5 860 movss m4, m5 861%endif 862 jmp .h_main 863.h_top: 864 mov xq, wq 865 test edgeb, 1 ; LR_HAVE_LEFT 866 jz .h_extend_left 867.h_loop: 868 movu m4, [lpfq+xq-4] 869.h_main: 870 movu m5, [lpfq+xq+4] 871 test edgeb, 2 ; LR_HAVE_RIGHT 872 jnz .h_have_right 873 cmp xd, -17 874 jl .h_have_right 875 call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right 876.h_have_right: 877%macro %%h5 0 878%if cpuflag(ssse3) 879 pshufb m0, m4, m9 880 pmaddubsw m0, m13 881 pshufb m1, m5, m9 882 pmaddubsw m1, m13 883 pshufb m2, m4, m10 884 pmaddubsw m2, m13 885 pshufb m3, m5, m10 886 pmaddubsw m3, m13 887 pshufb m4, m11 888 paddw m0, m2 889 pmullw m2, m14, m4 890 pshufb m5, m11 891 paddw m1, m3 892 pmullw m3, m14, m5 893 psllw m4, 7 894 psllw m5, 7 895 paddw m4, m8 896 paddw m5, m8 897 paddw m0, m2 898 paddw m1, m3 899 paddsw m0, m4 900 paddsw m1, m5 901%else 902 psrldq m0, m4, 2 903 pslldq m1, m4, 2 904 pxor m3, m3 905 punpcklbw m0, m3 906 punpckhbw m1, m3 907 paddw m0, m1 908 pmullw m0, m11 909 pshufd m2, m4, q0321 910 punpcklbw m2, m3 911 pmullw m1, m14, m2 912 paddw m0, m1 913 psrldq m1, m4, 3 914 pslldq m4, 3 915 punpcklbw m1, m3 916 punpckhbw m4, m3 917 paddw m1, m4 918 pmullw m1, m13 919 paddw m0, m1 920 psllw m2, 7 921 paddw m2, m8 922 paddsw m0, m2 923 psrldq m1, m5, 2 924 pslldq m4, m5, 2 925 punpcklbw m1, m3 926 punpckhbw m4, m3 927 paddw m1, m4 928 pmullw m1, m11 929 pshufd m4, m5, q0321 930 punpcklbw m4, m3 931 pmullw m2, m14, m4 932 paddw m1, m2 933 psrldq m2, m5, 3 934 pslldq m5, 3 935 punpcklbw m2, m3 936 punpckhbw m5, m3 937 paddw m2, m5 938 pmullw m2, m13 939 paddw m1, m2 940 psllw m4, 7 941 paddw m4, m8 942 paddsw m1, m4 943%endif 944%endmacro 945 %%h5 946 psraw m0, 3 947 psraw m1, 3 948 paddw m0, m15 949 paddw m1, m15 950 mova [t1+xq*2+ 0], m0 951 mova [t1+xq*2+16], m1 952 add xq, 16 953 jl .h_loop 954 ret 955ALIGN function_align 956.hv: 957 add lpfq, strideq 958 mov xq, wq 959 test edgeb, 1 ; LR_HAVE_LEFT 960 jz .hv_extend_left 961 movifnidn leftq, leftmp 962 mova m4, [lpfq+xq] 963 movd m5, [leftq] 964 add leftq, 4 965 pslldq m4, 4 966 por m4, m5 967 movifnidn leftmp, leftq 968 jmp .hv_main 969.hv_extend_left: 970%if cpuflag(ssse3) 971 mova m4, [lpfq+xq] 972 pshufb m4, m12 973%else 974 mova m5, [lpfq+xq] 975 pshufd m4, m5, q2103 976 punpcklbw m5, m5 977 punpcklwd m5, m5 978 movss m4, m5 979%endif 980 jmp .hv_main 981.hv_bottom: 982 mov xq, wq 983 test edgeb, 1 ; LR_HAVE_LEFT 984 jz .hv_extend_left 985.hv_loop: 986 movu m4, [lpfq+xq-4] 987.hv_main: 988 movu m5, [lpfq+xq+4] 989 test edgeb, 2 ; LR_HAVE_RIGHT 990 jnz .hv_have_right 991 cmp xd, -17 992 jl .hv_have_right 993 call mangle(private_prefix 
%+ _wiener_filter7_8bpc %+ SUFFIX).extend_right 994.hv_have_right: 995 %%h5 996 mova m2, [t3+xq*2] 997 paddw m2, [t1+xq*2] 998 psraw m0, 3 999 psraw m1, 3 1000 paddw m0, m15 1001 paddw m1, m15 1002%if ARCH_X86_64 1003 mova m3, [t2+xq*2] 1004 paddw m4, m0, [t4+xq*2] 1005%else 1006 mov r2, t2 1007 mova m3, [r2+xq*2] 1008 mov r2, t4 1009 paddw m4, m0, [r2+xq*2] 1010%endif 1011 mova [t0+xq*2], m0 1012 punpcklwd m0, m2, m3 1013 pmaddwd m0, m7 1014 punpckhwd m2, m3 1015 pmaddwd m2, m7 1016 punpcklwd m3, m4, m4 1017 pmaddwd m3, m6 1018 punpckhwd m4, m4 1019 pmaddwd m4, m6 1020 paddd m0, m3 1021 paddd m4, m2 1022 mova m2, [t3+xq*2+16] 1023 paddw m2, [t1+xq*2+16] 1024 packuswb m0, m4 1025%if ARCH_X86_64 1026 mova m3, [t2+xq*2+16] 1027 paddw m4, m1, [t4+xq*2+16] 1028%else 1029 paddw m4, m1, [r2+xq*2+16] 1030 mov r2, t2 1031 mova m3, [r2+xq*2+16] 1032 mov dstq, dstmp 1033%endif 1034 mova [t0+xq*2+16], m1 1035 punpcklwd m1, m2, m3 1036 pmaddwd m1, m7 1037 punpckhwd m2, m3 1038 pmaddwd m2, m7 1039 punpcklwd m3, m4, m4 1040 pmaddwd m3, m6 1041 punpckhwd m4, m4 1042 pmaddwd m4, m6 1043 paddd m1, m3 1044 paddd m2, m4 1045 packuswb m1, m2 1046 psrlw m0, 8 1047 psrlw m1, 8 1048 packuswb m0, m1 1049 mova [dstq+xq], m0 1050 add xq, 16 1051 jl .hv_loop 1052 add dstq, strideq 1053 mov t4, t3 1054 mov t3, t2 1055 mov t2, t1 1056 mov t1, t0 1057 mov t0, t4 1058 movifnidn dstmp, dstq 1059 ret 1060%if cpuflag(ssse3) 1061.v: 1062 mov xq, wq 1063.v_loop: 1064 mova m3, [t1+xq*2] 1065 paddw m1, m3, [t3+xq*2] 1066%if ARCH_X86_64 1067 mova m2, [t2+xq*2] 1068 paddw m3, [t4+xq*2] 1069%else 1070 mov r2, t2 1071 mova m2, [r2+xq*2] 1072 mov r2, t4 1073 paddw m3, [r2+xq*2] 1074%endif 1075 punpcklwd m0, m1, m2 1076 pmaddwd m0, m7 1077 punpckhwd m1, m2 1078 pmaddwd m1, m7 1079 punpcklwd m2, m3 1080 pmaddwd m2, m6 1081 punpckhwd m3, m3 1082 pmaddwd m3, m6 1083 paddd m0, m2 1084 paddd m1, m3 1085 mova m4, [t1+xq*2+16] 1086 paddw m2, m4, [t3+xq*2+16] 1087%if ARCH_X86_64 1088 mova m3, [t2+xq*2+16] 1089 paddw m4, [t4+xq*2+16] 1090%else 1091 paddw m4, [r2+xq*2+16] 1092 mov r2, t2 1093 mova m3, [r2+xq*2+16] 1094 mov dstq, dstmp 1095%endif 1096 packuswb m0, m1 1097 punpcklwd m1, m2, m3 1098 pmaddwd m1, m7 1099 punpckhwd m2, m3 1100 pmaddwd m2, m7 1101 punpcklwd m3, m4 1102 pmaddwd m3, m6 1103 punpckhwd m4, m4 1104 pmaddwd m4, m6 1105 paddd m1, m3 1106 paddd m2, m4 1107 packuswb m1, m2 1108 psrlw m0, 8 1109 psrlw m1, 8 1110 packuswb m0, m1 1111 mova [dstq+xq], m0 1112 add xq, 16 1113 jl .v_loop 1114 ret 1115%endif 1116%endmacro 1117 1118INIT_XMM sse2 1119WIENER 1120 1121INIT_XMM ssse3 1122WIENER 1123 1124;;;;;;;;;;;;;;;;;;;;;;;;;; 1125;; self-guided ;; 1126;;;;;;;;;;;;;;;;;;;;;;;;;; 1127 1128%macro GATHERDD 3 ; dst, src, tmp 1129 movd %3d, %2 1130 %if ARCH_X86_64 1131 movd %1, [r13+%3] 1132 pextrw %3d, %2, 2 1133 pinsrw %1, [r13+%3+2], 3 1134 pextrw %3d, %2, 4 1135 pinsrw %1, [r13+%3+2], 5 1136 pextrw %3d, %2, 6 1137 pinsrw %1, [r13+%3+2], 7 1138 %else 1139 movd %1, [base+sgr_x_by_x-0xf03+%3] 1140 pextrw %3, %2, 2 1141 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 1142 pextrw %3, %2, 4 1143 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 1144 pextrw %3, %2, 6 1145 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 1146 %endif 1147%endmacro 1148 1149%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore 1150 %if ARCH_X86_64 1151 %define tmp r14 1152 %else 1153 %define tmp %4 1154 %endif 1155 GATHERDD %1, %2, tmp 1156 GATHERDD %2, %3, tmp 1157 movif32 %4, %5 1158 psrld %1, 24 1159 psrld %2, 24 1160 packssdw %1, %2 1161%endmacro 1162 1163%macro 
MULLD 3 ; dst, src, tmp 1164 pmulhuw %3, %1, %2 1165 pmullw %1, %2 1166 pslld %3, 16 1167 paddd %1, %3 1168%endmacro 1169 1170%if ARCH_X86_32 1171DECLARE_REG_TMP 0, 1, 2, 3, 5 1172 %if STACK_ALIGNMENT < 16 1173 %assign extra_stack 5*16 1174 %else 1175 %assign extra_stack 3*16 1176 %endif 1177cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \ 1178 dst, stride, left, lpf, w 1179 %if STACK_ALIGNMENT < 16 1180 %define dstm dword [esp+calloff+16*0+4*6] 1181 %define stridemp dword [esp+calloff+16*0+4*7] 1182 %define leftm dword [esp+calloff+16*3+4*0] 1183 %define lpfm dword [esp+calloff+16*3+4*1] 1184 %define w0m dword [esp+calloff+16*3+4*2] 1185 %define hd dword [esp+calloff+16*3+4*3] 1186 %define edgeb byte [esp+calloff+16*3+4*4] 1187 %define edged dword [esp+calloff+16*3+4*4] 1188 %define leftmp leftm 1189 %else 1190 %define w0m wm 1191 %define hd dword r5m 1192 %define edgeb byte r7m 1193 %define edged dword r7m 1194 %endif 1195 %define hvsrcm dword [esp+calloff+4*0] 1196 %define w1m dword [esp+calloff+4*1] 1197 %define t0m dword [esp+calloff+4*2] 1198 %define t2m dword [esp+calloff+4*3] 1199 %define t3m dword [esp+calloff+4*4] 1200 %define t4m dword [esp+calloff+4*5] 1201 %define m8 [base+pb_1] 1202 %define m9 [esp+calloff+16*2] 1203 %define m10 [base+pd_0xf00800a4] 1204 %define m11 [base+sgr_lshuf5] 1205 %define m12 [base+pd_34816] 1206 %define m13 [base+pb_0to15] 1207 %define r10 r4 1208 %define base r6-$$ 1209 %assign calloff 0 1210 %if STACK_ALIGNMENT < 16 1211 mov strideq, [rstk+stack_offset+ 8] 1212 mov leftq, [rstk+stack_offset+12] 1213 mov lpfq, [rstk+stack_offset+16] 1214 mov wd, [rstk+stack_offset+20] 1215 mov dstm, dstq 1216 mov stridemp, strideq 1217 mov leftm, leftq 1218 mov r1, [rstk+stack_offset+24] 1219 mov r2, [rstk+stack_offset+32] 1220 mov lpfm, lpfq 1221 mov hd, r1 1222 mov edged, r2 1223 %endif 1224%else 1225DECLARE_REG_TMP 8, 7, 9, 11, 12 1226cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \ 1227 w, h, edge, params 1228%endif 1229%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1230 mov wd, wm 1231%endif 1232%if ARCH_X86_64 1233 mov paramsq, r6mp 1234 lea r13, [sgr_x_by_x-0xf03] 1235 movifnidn hd, hm 1236 mov edged, r7m 1237 movu m9, [paramsq] 1238 add lpfq, wq 1239 mova m8, [pb_1] 1240 lea t1, [rsp+wq*2+20] 1241 mova m10, [pd_0xf00800a4] 1242 add dstq, wq 1243 lea t3, [rsp+wq*4+400*12+16] 1244 mova m12, [pd_34816] ; (1 << 11) + (1 << 15) 1245 lea t4, [rsp+wq*2+400*20+16] 1246 pshufhw m7, m9, q0000 1247 pshufb m9, [pw_256] ; s0 1248 punpckhqdq m7, m7 ; w0 1249 neg wq 1250 mova m13, [pb_0to15] 1251 pxor m6, m6 1252 mova m11, [sgr_lshuf5] 1253 psllw m7, 4 1254 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1255 %define lpfm [rsp] 1256%else 1257 mov r1, [rstk+stack_offset+28] ; params 1258 LEA r6, $$ 1259 movu m1, [r1] 1260 add lpfm, wq 1261 lea t1, [rsp+extra_stack+wq*2+20] 1262 add dstq, wq 1263 lea t3, [rsp+extra_stack+wq*4+400*12+16] 1264 mov dstm, dstq 1265 lea t4, [rsp+extra_stack+wq*2+400*20+16] 1266 mov t3m, t3 1267 pshufhw m7, m1, q0000 1268 mov t4m, t4 1269 pshufb m1, [base+pw_256] ; s0 1270 punpckhqdq m7, m7 ; w0 1271 psllw m7, 4 1272 neg wq 1273 mova m9, m1 1274 pxor m6, m6 1275 mov w1m, wd 1276 sub wd, 2 1277 mov lpfq, lpfm 1278 mov w0m, wd 1279 %define strideq r5 1280%endif 1281 test edgeb, 4 ; LR_HAVE_TOP 1282 jz .no_top 1283 call .h_top 1284 add lpfq, stridemp 1285 movif32 t2m, t1 1286 mov t2, t1 1287 call .top_fixup 1288 add t1, 400*6 1289 call .h_top 1290 movif32 strideq, stridemp 1291 lea r10, 
[lpfq+strideq*4] 1292 mov lpfq, dstq 1293 add r10, strideq 1294 mov lpfm, r10 ; below 1295 movif32 t0m, t2 1296 mov t0, t2 1297 dec hd 1298 jz .height1 1299 or edged, 16 1300 call .h 1301.main: 1302 add lpfq, stridemp 1303 movif32 t4, t4m 1304 call .hv 1305 call .prep_n 1306 sub hd, 2 1307 jl .extend_bottom 1308.main_loop: 1309 movif32 lpfq, hvsrcm 1310 add lpfq, stridemp 1311%if ARCH_X86_64 1312 test hb, hb 1313%else 1314 mov r4, hd 1315 test r4, r4 1316%endif 1317 jz .odd_height 1318 call .h 1319 add lpfq, stridemp 1320 call .hv 1321 movif32 dstq, dstm 1322 call .n0 1323 call .n1 1324 sub hd, 2 1325 movif32 t0, t0m 1326 jge .main_loop 1327 test edgeb, 8 ; LR_HAVE_BOTTOM 1328 jz .extend_bottom 1329 mov lpfq, lpfm 1330 call .h_top 1331 add lpfq, stridemp 1332 call .hv_bottom 1333.end: 1334 movif32 dstq, dstm 1335 call .n0 1336 call .n1 1337.end2: 1338 RET 1339.height1: 1340 movif32 t4, t4m 1341 call .hv 1342 call .prep_n 1343 jmp .odd_height_end 1344.odd_height: 1345 call .hv 1346 movif32 dstq, dstm 1347 call .n0 1348 call .n1 1349.odd_height_end: 1350 call .v 1351 movif32 dstq, dstm 1352 call .n0 1353 jmp .end2 1354.extend_bottom: 1355 call .v 1356 jmp .end 1357.no_top: 1358 movif32 strideq, stridemp 1359 lea r10, [lpfq+strideq*4] 1360 mov lpfq, dstq 1361 lea r10, [r10+strideq*2] 1362 mov lpfm, r10 1363 call .h 1364 lea t2, [t1+400*6] 1365 movif32 t2m, t2 1366 call .top_fixup 1367 dec hd 1368 jz .no_top_height1 1369 or edged, 16 1370 mov t0, t1 1371 mov t1, t2 1372 movif32 t0m, t0 1373 jmp .main 1374.no_top_height1: 1375 movif32 t3, t3m 1376 movif32 t4, t4m 1377 call .v 1378 call .prep_n 1379 jmp .odd_height_end 1380.extend_right: 1381%assign stack_offset stack_offset+8 1382%assign calloff 8 1383 movd m1, wd 1384 movd m3, [lpfq-1] 1385 pshufb m1, m6 1386 pshufb m3, m6 1387 psubb m2, m8, m1 1388 pcmpgtb m2, m13 1389 pand m5, m2 1390 pandn m2, m3 1391 por m5, m2 1392 ret 1393%assign stack_offset stack_offset-4 1394%assign calloff 4 1395.h: ; horizontal boxsum 1396%if ARCH_X86_64 1397 lea wq, [r4-2] 1398%else 1399 %define leftq r4 1400%endif 1401 test edgeb, 1 ; LR_HAVE_LEFT 1402 jz .h_extend_left 1403 movif32 leftq, leftm 1404 movddup m4, [leftq-4] 1405 movif32 wq, w0m 1406 mova m5, [lpfq+wq+2] 1407 add leftmp, 4 1408 palignr m5, m4, 13 1409 jmp .h_main 1410.h_extend_left: 1411 movif32 wq, w0m 1412 mova m5, [lpfq+wq+2] 1413 pshufb m5, m11 1414 jmp .h_main 1415.h_top: 1416%if ARCH_X86_64 1417 lea wq, [r4-2] 1418%endif 1419 test edgeb, 1 ; LR_HAVE_LEFT 1420 jz .h_extend_left 1421 movif32 wq, w0m 1422.h_loop: 1423 movu m5, [lpfq+wq-1] 1424.h_main: 1425 test edgeb, 2 ; LR_HAVE_RIGHT 1426 jnz .h_have_right 1427 cmp wd, -10 1428 jl .h_have_right 1429 call .extend_right 1430.h_have_right: 1431 punpcklbw m4, m5, m6 1432 punpckhbw m5, m6 1433 palignr m2, m5, m4, 2 1434 paddw m0, m4, m2 1435 palignr m3, m5, m4, 6 1436 paddw m0, m3 1437 punpcklwd m1, m2, m3 1438 pmaddwd m1, m1 1439 punpckhwd m2, m3 1440 pmaddwd m2, m2 1441 palignr m5, m4, 8 1442 paddw m0, m5 1443 punpcklwd m3, m4, m5 1444 pmaddwd m3, m3 1445 paddd m1, m3 1446 punpckhwd m3, m4, m5 1447 pmaddwd m3, m3 1448 shufps m4, m5, q2121 1449 paddw m0, m4 ; sum 1450 punpcklwd m5, m4, m6 1451 pmaddwd m5, m5 1452 punpckhwd m4, m6 1453 pmaddwd m4, m4 1454 paddd m2, m3 1455 test edgeb, 16 ; y > 0 1456 jz .h_loop_end 1457 paddw m0, [t1+wq*2+400*0] 1458 paddd m1, [t1+wq*2+400*2] 1459 paddd m2, [t1+wq*2+400*4] 1460.h_loop_end: 1461 paddd m1, m5 ; sumsq 1462 paddd m2, m4 1463 mova [t1+wq*2+400*0], m0 1464 mova [t1+wq*2+400*2], m1 1465 mova 
[t1+wq*2+400*4], m2 1466 add wq, 8 1467 jl .h_loop 1468 ret 1469.top_fixup: 1470%if ARCH_X86_64 1471 lea wq, [r4-2] 1472%else 1473 mov wd, w0m 1474%endif 1475.top_fixup_loop: ; the sums of the first row needs to be doubled 1476 mova m0, [t1+wq*2+400*0] 1477 mova m1, [t1+wq*2+400*2] 1478 mova m2, [t1+wq*2+400*4] 1479 paddw m0, m0 1480 paddd m1, m1 1481 paddd m2, m2 1482 mova [t2+wq*2+400*0], m0 1483 mova [t2+wq*2+400*2], m1 1484 mova [t2+wq*2+400*4], m2 1485 add wq, 8 1486 jl .top_fixup_loop 1487 ret 1488ALIGN function_align 1489.hv: ; horizontal boxsum + vertical boxsum + ab 1490%if ARCH_X86_64 1491 lea wq, [r4-2] 1492%else 1493 mov hvsrcm, lpfq 1494%endif 1495 test edgeb, 1 ; LR_HAVE_LEFT 1496 jz .hv_extend_left 1497 movif32 leftq, leftm 1498 movddup m4, [leftq-4] 1499 movif32 wq, w0m 1500 mova m5, [lpfq+wq+2] 1501 add leftmp, 4 1502 palignr m5, m4, 13 1503 jmp .hv_main 1504.hv_extend_left: 1505 movif32 wq, w0m 1506 mova m5, [lpfq+wq+2] 1507 pshufb m5, m11 1508 jmp .hv_main 1509.hv_bottom: 1510%if ARCH_X86_64 1511 lea wq, [r4-2] 1512%else 1513 mov hvsrcm, lpfq 1514%endif 1515 test edgeb, 1 ; LR_HAVE_LEFT 1516 jz .hv_extend_left 1517 movif32 wq, w0m 1518%if ARCH_X86_32 1519 jmp .hv_loop_start 1520%endif 1521.hv_loop: 1522 movif32 lpfq, hvsrcm 1523.hv_loop_start: 1524 movu m5, [lpfq+wq-1] 1525.hv_main: 1526 test edgeb, 2 ; LR_HAVE_RIGHT 1527 jnz .hv_have_right 1528 cmp wd, -10 1529 jl .hv_have_right 1530 call .extend_right 1531.hv_have_right: 1532 movif32 t3, hd 1533 punpcklbw m4, m5, m6 1534 punpckhbw m5, m6 1535 palignr m3, m5, m4, 2 1536 paddw m0, m4, m3 1537 palignr m1, m5, m4, 6 1538 paddw m0, m1 1539 punpcklwd m2, m3, m1 1540 pmaddwd m2, m2 1541 punpckhwd m3, m1 1542 pmaddwd m3, m3 1543 palignr m5, m4, 8 1544 paddw m0, m5 1545 punpcklwd m1, m4, m5 1546 pmaddwd m1, m1 1547 paddd m2, m1 1548 punpckhwd m1, m4, m5 1549 pmaddwd m1, m1 1550 shufps m4, m5, q2121 1551 paddw m0, m4 ; h sum 1552 punpcklwd m5, m4, m6 1553 pmaddwd m5, m5 1554 punpckhwd m4, m6 1555 pmaddwd m4, m4 1556 paddd m3, m1 1557 paddd m2, m5 ; h sumsq 1558 paddd m3, m4 1559 paddw m1, m0, [t1+wq*2+400*0] 1560 paddd m4, m2, [t1+wq*2+400*2] 1561 paddd m5, m3, [t1+wq*2+400*4] 1562%if ARCH_X86_64 1563 test hd, hd 1564%else 1565 test t3, t3 1566%endif 1567 jz .hv_last_row 1568.hv_main2: 1569 paddw m1, [t2+wq*2+400*0] ; hv sum 1570 paddd m4, [t2+wq*2+400*2] ; hv sumsq 1571 paddd m5, [t2+wq*2+400*4] 1572 mova [t0+wq*2+400*0], m0 1573 pslld m0, m4, 4 1574 mova [t0+wq*2+400*2], m2 1575 mova [t0+wq*2+400*4], m3 1576 pslld m2, m4, 3 1577 paddd m4, m0 1578 pslld m0, m5, 4 1579 paddd m4, m2 ; a * 25 1580 pslld m2, m5, 3 1581 paddd m5, m0 1582 paddd m5, m2 1583 punpcklwd m0, m1, m6 ; b 1584 punpckhwd m1, m6 1585 pmaddwd m2, m0, m0 ; b * b 1586 pmaddwd m3, m1, m1 1587 psubd m4, m2 ; p 1588 psubd m5, m3 1589 MULLD m4, m9, m2 ; p * s 1590 MULLD m5, m9, m2 1591 pmaddwd m0, m10 ; b * 164 1592 pmaddwd m1, m10 1593 paddusw m4, m10 1594 paddusw m5, m10 1595 psrld m4, 20 ; min(z, 255) 1596 movif32 t3, t3m 1597 psrld m5, 20 1598 GATHER_X_BY_X m3, m4, m5, t2, t2m 1599 punpcklwd m4, m3, m3 1600 punpckhwd m5, m3, m3 1601 MULLD m0, m4, m2 1602 MULLD m1, m5, m2 1603 paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) 1604 paddd m1, m12 1605 mova [t4+wq*2+4], m3 1606 psrld m0, 12 ; b 1607 psrld m1, 12 1608 mova [t3+wq*4+ 8], m0 1609 mova [t3+wq*4+24], m1 1610 add wq, 8 1611 jl .hv_loop 1612 mov t2, t1 1613 mov t1, t0 1614 mov t0, t2 1615 movif32 t2m, t2 1616 movif32 t0m, t0 1617 ret 1618.hv_last_row: ; esoteric edge case for odd heights 1619 mova 
[t1+wq*2+400*0], m1 1620 paddw m1, m0 1621 mova [t1+wq*2+400*2], m4 1622 paddd m4, m2 1623 mova [t1+wq*2+400*4], m5 1624 paddd m5, m3 1625 jmp .hv_main2 1626.v: ; vertical boxsum + ab 1627%if ARCH_X86_64 1628 lea wq, [r4-2] 1629%else 1630 mov wd, w0m 1631%endif 1632.v_loop: 1633 mova m0, [t1+wq*2+400*0] 1634 mova m2, [t1+wq*2+400*2] 1635 mova m3, [t1+wq*2+400*4] 1636 paddw m1, m0, [t2+wq*2+400*0] 1637 paddd m4, m2, [t2+wq*2+400*2] 1638 paddd m5, m3, [t2+wq*2+400*4] 1639 paddw m0, m0 1640 paddd m2, m2 1641 paddd m3, m3 1642 paddw m1, m0 ; hv sum 1643 paddd m4, m2 ; hv sumsq 1644 pslld m0, m4, 4 1645 paddd m5, m3 1646 pslld m2, m4, 3 1647 paddd m4, m0 1648 pslld m0, m5, 4 1649 paddd m4, m2 ; a * 25 1650 pslld m2, m5, 3 1651 paddd m5, m0 1652 paddd m5, m2 1653 punpcklwd m0, m1, m6 1654 punpckhwd m1, m6 1655 pmaddwd m2, m0, m0 ; b * b 1656 pmaddwd m3, m1, m1 1657 psubd m4, m2 ; p 1658 psubd m5, m3 1659 MULLD m4, m9, m2 ; p * s 1660 MULLD m5, m9, m2 1661 pmaddwd m0, m10 ; b * 164 1662 pmaddwd m1, m10 1663 paddusw m4, m10 1664 paddusw m5, m10 1665 psrld m4, 20 ; min(z, 255) 1666 psrld m5, 20 1667 GATHER_X_BY_X m3, m4, m5, t2, t2m 1668 punpcklwd m4, m3, m3 1669 punpckhwd m5, m3, m3 1670 MULLD m0, m4, m2 1671 MULLD m1, m5, m2 1672 paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) 1673 paddd m1, m12 1674 mova [t4+wq*2+4], m3 1675 psrld m0, 12 ; b 1676 psrld m1, 12 1677 mova [t3+wq*4+ 8], m0 1678 mova [t3+wq*4+24], m1 1679 add wq, 8 1680 jl .v_loop 1681 ret 1682.prep_n: ; initial neighbor setup 1683 movif64 wq, r4 1684 movif32 wd, w1m 1685.prep_n_loop: 1686 movu m0, [t4+wq*2+ 2] 1687 movu m3, [t4+wq*2+ 4] 1688 movu m1, [t3+wq*4+ 4] 1689 movu m4, [t3+wq*4+ 8] 1690 movu m2, [t3+wq*4+20] 1691 movu m5, [t3+wq*4+24] 1692 paddw m3, m0 1693 paddd m4, m1 1694 paddd m5, m2 1695 paddw m3, [t4+wq*2+ 0] 1696 paddd m4, [t3+wq*4+ 0] 1697 paddd m5, [t3+wq*4+16] 1698 paddw m0, m3 1699 psllw m3, 2 1700 paddd m1, m4 1701 pslld m4, 2 1702 paddd m2, m5 1703 pslld m5, 2 1704 paddw m0, m3 ; a 565 1705 paddd m1, m4 ; b 565 1706 paddd m2, m5 1707 mova [t4+wq*2+400*2+ 0], m0 1708 mova [t3+wq*4+400*4+ 0], m1 1709 mova [t3+wq*4+400*4+16], m2 1710 add wq, 8 1711 jl .prep_n_loop 1712 ret 1713ALIGN function_align 1714.n0: ; neighbor + output (even rows) 1715 movif64 wq, r4 1716 movif32 wd, w1m 1717.n0_loop: 1718 movu m0, [t4+wq*2+ 2] 1719 movu m3, [t4+wq*2+ 4] 1720 movu m1, [t3+wq*4+ 4] 1721 movu m4, [t3+wq*4+ 8] 1722 movu m2, [t3+wq*4+20] 1723 movu m5, [t3+wq*4+24] 1724 paddw m3, m0 1725 paddd m4, m1 1726 paddd m5, m2 1727 paddw m3, [t4+wq*2+ 0] 1728 paddd m4, [t3+wq*4+ 0] 1729 paddd m5, [t3+wq*4+16] 1730 paddw m0, m3 1731 psllw m3, 2 1732 paddd m1, m4 1733 pslld m4, 2 1734 paddd m2, m5 1735 pslld m5, 2 1736 paddw m0, m3 ; a 565 1737 paddd m1, m4 ; b 565 1738 paddd m2, m5 1739 paddw m3, m0, [t4+wq*2+400*2+ 0] 1740 paddd m4, m1, [t3+wq*4+400*4+ 0] 1741 paddd m5, m2, [t3+wq*4+400*4+16] 1742 mova [t4+wq*2+400*2+ 0], m0 1743 mova [t3+wq*4+400*4+ 0], m1 1744 mova [t3+wq*4+400*4+16], m2 1745 movq m0, [dstq+wq] 1746 punpcklbw m0, m6 1747 punpcklwd m1, m0, m6 ; src 1748 punpcklwd m2, m3, m6 ; a 1749 pmaddwd m2, m1 ; a * src 1750 punpckhwd m1, m0, m6 1751 punpckhwd m3, m6 1752 pmaddwd m3, m1 1753 psubd m4, m2 ; b - a * src + (1 << 8) 1754 psubd m5, m3 1755 psrad m4, 9 1756 psrad m5, 9 1757 packssdw m4, m5 1758 pmulhrsw m4, m7 1759 paddw m0, m4 1760 packuswb m0, m0 1761 movq [dstq+wq], m0 1762 add wq, 8 1763 jl .n0_loop 1764 add dstq, stridemp 1765 ret 1766ALIGN function_align 1767.n1: ; neighbor + output (odd rows) 1768 movif64 wq, r4 
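; .n0 and .n1 share the same a/b data: the 5x5 box filter only produces a/b
; every other row, so even output rows (.n0) add the freshly computed
; 565-weighted row to the one stored in t4/t3 and shift by 9, while odd rows
; (.n1, below) reuse the stored row alone and shift by 8.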
1769 movif32 wd, w1m 1770.n1_loop: 1771 movq m0, [dstq+wq] 1772 mova m3, [t4+wq*2+400*2+ 0] 1773 mova m4, [t3+wq*4+400*4+ 0] 1774 mova m5, [t3+wq*4+400*4+16] 1775 punpcklbw m0, m6 1776 punpcklwd m1, m0, m6 ; src 1777 punpcklwd m2, m3, m6 ; a 1778 pmaddwd m2, m1 ; a * src 1779 punpckhwd m1, m0, m6 1780 punpckhwd m3, m6 1781 pmaddwd m3, m1 1782 psubd m4, m2 ; b - a * src + (1 << 7) 1783 psubd m5, m3 1784 psrad m4, 8 1785 psrad m5, 8 1786 packssdw m4, m5 1787 pmulhrsw m4, m7 1788 paddw m0, m4 1789 packuswb m0, m0 1790 movq [dstq+wq], m0 1791 add wq, 8 1792 jl .n1_loop 1793 add dstq, stridemp 1794 movif32 dstm, dstq 1795 ret 1796 1797%if ARCH_X86_32 1798 %if STACK_ALIGNMENT < 16 1799 %assign extra_stack 4*16 1800 %else 1801 %assign extra_stack 2*16 1802 %endif 1803cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \ 1804 dst, stride, left, lpf, w 1805 %if STACK_ALIGNMENT < 16 1806 %define dstm dword [esp+calloff+16*2+4*0] 1807 %define stridemp dword [esp+calloff+16*2+4*1] 1808 %define leftm dword [esp+calloff+16*2+4*2] 1809 %define lpfm dword [esp+calloff+16*2+4*3] 1810 %define w0m dword [esp+calloff+16*2+4*4] 1811 %define hd dword [esp+calloff+16*2+4*5] 1812 %define edgeb byte [esp+calloff+16*2+4*6] 1813 %define edged dword [esp+calloff+16*2+4*6] 1814 %define leftmp leftm 1815 %else 1816 %define w0m wm 1817 %define hd dword r5m 1818 %define edgeb byte r7m 1819 %define edged dword r7m 1820 %endif 1821 %define hvsrcm dword [esp+calloff+4*0] 1822 %define w1m dword [esp+calloff+4*1] 1823 %define t3m dword [esp+calloff+4*2] 1824 %define t4m dword [esp+calloff+4*3] 1825 %define m8 [base+pb_0to15] 1826 %define m9 [esp+calloff+16*1] 1827 %define m10 [base+pd_0xf00801c7] 1828 %define m11 [base+pd_34816] 1829 %define m12 m6 1830 %define m13 [base+sgr_lshuf3] 1831 %define base r6-$$ 1832 %assign calloff 0 1833 %if STACK_ALIGNMENT < 16 1834 mov strideq, [rstk+stack_offset+ 8] 1835 mov leftq, [rstk+stack_offset+12] 1836 mov lpfq, [rstk+stack_offset+16] 1837 mov wd, [rstk+stack_offset+20] 1838 mov dstm, dstq 1839 mov stridemp, strideq 1840 mov leftm, leftq 1841 mov r1, [rstk+stack_offset+24] 1842 mov r2, [rstk+stack_offset+32] 1843 mov lpfm, lpfq 1844 mov hd, r1 1845 mov edged, r2 1846 %endif 1847%else 1848cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \ 1849 w, h, edge, params 1850%endif 1851%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1852 mov wd, wm 1853%endif 1854%if ARCH_X86_64 1855 mov paramsq, r6mp 1856 lea r13, [sgr_x_by_x-0xf03] 1857 mov hd, hm 1858 mov edged, r7m 1859 movq m9, [paramsq+4] 1860 add lpfq, wq 1861 lea t1, [rsp+wq*2+12] 1862 mova m8, [pb_0to15] 1863 add dstq, wq 1864 lea t3, [rsp+wq*4+400*12+8] 1865 mova m10, [pd_0xf00801c7] 1866 lea t4, [rsp+wq*2+400*32+8] 1867 mova m11, [pd_34816] 1868 pshuflw m7, m9, q3333 1869 pshufb m9, [pw_256] ; s1 1870 punpcklqdq m7, m7 ; w1 1871 neg wq 1872 pxor m6, m6 1873 mova m13, [sgr_lshuf3] 1874 psllw m7, 4 1875 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1876 %define lpfm [rsp] 1877%else 1878 mov r1, [rstk+stack_offset+28] ; params 1879 LEA r6, $$ 1880 movq m1, [r1+4] 1881 add lpfm, wq 1882 lea t1, [rsp+extra_stack+wq*2+20] 1883 add dstq, wq 1884 lea t3, [rsp+extra_stack+wq*4+400*12+16] 1885 mov dstm, dstq 1886 lea t4, [rsp+extra_stack+wq*2+400*32+16] 1887 mov t3m, t3 1888 pshuflw m7, m1, q3333 1889 mov t4m, t4 1890 pshufb m1, [base+pw_256] ; s1 1891 punpcklqdq m7, m7 ; w1 1892 psllw m7, 4 1893 neg wq 1894 mova m9, m1 1895 pxor m6, m6 1896 mov w1m, wd 1897 sub wd, 2 1898 mov lpfq, lpfm 1899 mov w0m, wd 
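; On x86-32 two width counters are kept: w1m is the plain negated width used
; by the neighbour/output passes (.prep_n/.n0/.n1), and w0m is the negated
; width minus 2 used by the box-sum passes (.h/.hv*/.v*), which produce extra
; columns of sums for the neighbour reads. The 64-bit build derives the same
; two values from r4 (movif64 wq, r4 vs. lea wq, [r4-2]).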
1900 %define strideq r5 1901%endif 1902 test edgeb, 4 ; LR_HAVE_TOP 1903 jz .no_top 1904 call .h_top 1905 add lpfq, stridemp 1906 mov t2, t1 1907 add t1, 400*6 1908 call .h_top 1909 movif32 strideq, stridemp 1910 lea r10, [lpfq+strideq*4] 1911 mov lpfq, dstq 1912 add r10, strideq 1913 mov lpfm, r10 ; below 1914 movif32 t4, t4m 1915 call .hv0 1916.main: 1917 dec hd 1918 jz .height1 1919 movif32 lpfq, hvsrcm 1920 add lpfq, stridemp 1921 call .hv1 1922 call .prep_n 1923 sub hd, 2 1924 jl .extend_bottom 1925.main_loop: 1926 movif32 lpfq, hvsrcm 1927 add lpfq, stridemp 1928 call .hv0 1929%if ARCH_X86_64 1930 test hb, hb 1931%else 1932 mov r4, hd 1933 test r4, r4 1934%endif 1935 jz .odd_height 1936 movif32 lpfq, hvsrcm 1937 add lpfq, stridemp 1938 call .hv1 1939 call .n0 1940 call .n1 1941 sub hd, 2 1942 jge .main_loop 1943 test edgeb, 8 ; LR_HAVE_BOTTOM 1944 jz .extend_bottom 1945 mov lpfq, lpfm 1946 call .hv0_bottom 1947 movif32 lpfq, hvsrcm 1948 add lpfq, stridemp 1949 call .hv1_bottom 1950.end: 1951 call .n0 1952 call .n1 1953.end2: 1954 RET 1955.height1: 1956 call .v1 1957 call .prep_n 1958 jmp .odd_height_end 1959.odd_height: 1960 call .v1 1961 call .n0 1962 call .n1 1963.odd_height_end: 1964 call .v0 1965 call .v1 1966 call .n0 1967 jmp .end2 1968.extend_bottom: 1969 call .v0 1970 call .v1 1971 jmp .end 1972.no_top: 1973 movif32 strideq, stridemp 1974 lea r10, [lpfq+strideq*4] 1975 mov lpfq, dstq 1976 lea r10, [r10+strideq*2] 1977 mov lpfm, r10 1978 call .h 1979%if ARCH_X86_64 1980 lea wq, [r4-2] 1981%else 1982 mov wq, w0m 1983 mov hvsrcm, lpfq 1984%endif 1985 lea t2, [t1+400*6] 1986.top_fixup_loop: 1987 mova m0, [t1+wq*2+400*0] 1988 mova m1, [t1+wq*2+400*2] 1989 mova m2, [t1+wq*2+400*4] 1990 mova [t2+wq*2+400*0], m0 1991 mova [t2+wq*2+400*2], m1 1992 mova [t2+wq*2+400*4], m2 1993 add wq, 8 1994 jl .top_fixup_loop 1995 movif32 t3, t3m 1996 movif32 t4, t4m 1997 call .v0 1998 jmp .main 1999.extend_right: 2000%assign stack_offset stack_offset+8 2001%assign calloff 8 2002 movd m0, [lpfq-1] 2003 movd m1, wd 2004 mova m3, m8 2005 pshufb m0, m6 2006 pshufb m1, m6 2007 mova m2, m6 2008 psubb m2, m1 2009 pcmpgtb m2, m3 2010 pand m5, m2 2011 pandn m2, m0 2012 por m5, m2 2013 ret 2014%assign stack_offset stack_offset-4 2015%assign calloff 4 2016.h: ; horizontal boxsum 2017%if ARCH_X86_64 2018 lea wq, [r4-2] 2019%else 2020 %define leftq r4 2021%endif 2022 test edgeb, 1 ; LR_HAVE_LEFT 2023 jz .h_extend_left 2024 movif32 leftq, leftm 2025 movddup m4, [leftq-4] 2026 movif32 wq, w0m 2027 mova m5, [lpfq+wq+2] 2028 add leftmp, 4 2029 palignr m5, m4, 14 2030 jmp .h_main 2031.h_extend_left: 2032 movif32 wq, w0m 2033 mova m5, [lpfq+wq+2] 2034 pshufb m5, m13 2035 jmp .h_main 2036.h_top: 2037%if ARCH_X86_64 2038 lea wq, [r4-2] 2039%endif 2040 test edgeb, 1 ; LR_HAVE_LEFT 2041 jz .h_extend_left 2042 movif32 wq, w0m 2043.h_loop: 2044 movu m5, [lpfq+wq] 2045.h_main: 2046 test edgeb, 2 ; LR_HAVE_RIGHT 2047 jnz .h_have_right 2048 cmp wd, -9 2049 jl .h_have_right 2050 call .extend_right 2051.h_have_right: 2052 punpcklbw m4, m5, m6 2053 punpckhbw m5, m6 2054 palignr m0, m5, m4, 2 2055 paddw m1, m4, m0 2056 punpcklwd m2, m4, m0 2057 pmaddwd m2, m2 2058 punpckhwd m3, m4, m0 2059 pmaddwd m3, m3 2060 palignr m5, m4, 4 2061 paddw m1, m5 ; sum 2062 punpcklwd m4, m5, m6 2063 pmaddwd m4, m4 2064 punpckhwd m5, m6 2065 pmaddwd m5, m5 2066 paddd m2, m4 ; sumsq 2067 paddd m3, m5 2068 mova [t1+wq*2+400*0], m1 2069 mova [t1+wq*2+400*2], m2 2070 mova [t1+wq*2+400*4], m3 2071 add wq, 8 2072 jl .h_loop 2073 ret 2074ALIGN 
function_align 2075.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 2076%if ARCH_X86_64 2077 lea wq, [r4-2] 2078%else 2079 mov hvsrcm, lpfq 2080%endif 2081 test edgeb, 1 ; LR_HAVE_LEFT 2082 jz .hv0_extend_left 2083 movif32 leftq, leftm 2084 movddup m4, [leftq-4] 2085 movif32 wq, w0m 2086 mova m5, [lpfq+wq+2] 2087 add leftmp, 4 2088 palignr m5, m4, 14 2089 jmp .hv0_main 2090.hv0_extend_left: 2091 movif32 wq, w0m 2092 mova m5, [lpfq+wq+2] 2093 pshufb m5, m13 2094 jmp .hv0_main 2095.hv0_bottom: 2096%if ARCH_X86_64 2097 lea wq, [r4-2] 2098%else 2099 mov hvsrcm, lpfq 2100%endif 2101 test edgeb, 1 ; LR_HAVE_LEFT 2102 jz .hv0_extend_left 2103 movif32 wq, w0m 2104%if ARCH_X86_32 2105 jmp .hv0_loop_start 2106%endif 2107.hv0_loop: 2108 movif32 lpfq, hvsrcm 2109.hv0_loop_start: 2110 movu m5, [lpfq+wq] 2111.hv0_main: 2112 test edgeb, 2 ; LR_HAVE_RIGHT 2113 jnz .hv0_have_right 2114 cmp wd, -9 2115 jl .hv0_have_right 2116 call .extend_right 2117.hv0_have_right: 2118 punpcklbw m4, m5, m6 2119 punpckhbw m5, m6 2120 palignr m0, m5, m4, 2 2121 paddw m1, m4, m0 2122 punpcklwd m2, m4, m0 2123 pmaddwd m2, m2 2124 punpckhwd m3, m4, m0 2125 pmaddwd m3, m3 2126 palignr m5, m4, 4 2127 paddw m1, m5 ; sum 2128 punpcklwd m4, m5, m6 2129 pmaddwd m4, m4 2130 punpckhwd m5, m6 2131 pmaddwd m5, m5 2132 paddd m2, m4 ; sumsq 2133 paddd m3, m5 2134 paddw m0, m1, [t1+wq*2+400*0] 2135 paddd m4, m2, [t1+wq*2+400*2] 2136 paddd m5, m3, [t1+wq*2+400*4] 2137 mova [t1+wq*2+400*0], m1 2138 mova [t1+wq*2+400*2], m2 2139 mova [t1+wq*2+400*4], m3 2140 paddw m1, m0, [t2+wq*2+400*0] 2141 paddd m2, m4, [t2+wq*2+400*2] 2142 paddd m3, m5, [t2+wq*2+400*4] 2143 mova [t2+wq*2+400*0], m0 2144 mova [t2+wq*2+400*2], m4 2145 mova [t2+wq*2+400*4], m5 2146 pslld m4, m2, 3 2147 pslld m5, m3, 3 2148 paddd m4, m2 ; a * 9 2149 paddd m5, m3 2150 punpcklwd m0, m1, m6 ; b 2151 pmaddwd m2, m0, m0 ; b * b 2152 punpckhwd m1, m6 2153 pmaddwd m3, m1, m1 2154 psubd m4, m2 ; p 2155 psubd m5, m3 2156 MULLD m4, m9, m12 ; p * s 2157 MULLD m5, m9, m12 2158 pmaddwd m0, m10 ; b * 455 2159 pmaddwd m1, m10 2160 paddusw m4, m10 2161 paddusw m5, m10 2162 psrld m4, 20 ; min(z, 255) 2163 movif32 t3, t3m 2164 psrld m5, 20 2165 GATHER_X_BY_X m3, m4, m5, r0, dstm 2166 punpcklwd m4, m3, m3 2167 punpckhwd m5, m3, m3 2168 MULLD m0, m4, m12 2169 MULLD m1, m5, m12 2170%if ARCH_X86_32 2171 pxor m6, m6 2172%endif 2173 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2174 paddd m1, m11 2175 mova [t4+wq*2+4], m3 2176 psrld m0, 12 2177 psrld m1, 12 2178 mova [t3+wq*4+ 8], m0 2179 mova [t3+wq*4+24], m1 2180 add wq, 8 2181 jl .hv0_loop 2182 ret 2183ALIGN function_align 2184.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 2185%if ARCH_X86_64 2186 lea wq, [r4-2] 2187%else 2188 mov hvsrcm, lpfq 2189%endif 2190 test edgeb, 1 ; LR_HAVE_LEFT 2191 jz .hv1_extend_left 2192 movif32 leftq, leftm 2193 movddup m4, [leftq-4] 2194 movif32 wq, w0m 2195 mova m5, [lpfq+wq+2] 2196 add leftmp, 4 2197 palignr m5, m4, 14 2198 jmp .hv1_main 2199.hv1_extend_left: 2200 movif32 wq, w0m 2201 mova m5, [lpfq+wq+2] 2202 pshufb m5, m13 2203 jmp .hv1_main 2204.hv1_bottom: 2205%if ARCH_X86_64 2206 lea wq, [r4-2] 2207%else 2208 mov hvsrcm, lpfq 2209%endif 2210 test edgeb, 1 ; LR_HAVE_LEFT 2211 jz .hv1_extend_left 2212 movif32 wq, w0m 2213%if ARCH_X86_32 2214 jmp .hv1_loop_start 2215%endif 2216.hv1_loop: 2217 movif32 lpfq, hvsrcm 2218.hv1_loop_start: 2219 movu m5, [lpfq+wq] 2220.hv1_main: 2221 test edgeb, 2 ; LR_HAVE_RIGHT 2222 jnz .hv1_have_right 2223 cmp wd, -9 2224 jl .hv1_have_right 2225 call 
.extend_right 2226.hv1_have_right: 2227 punpcklbw m4, m5, m6 2228 punpckhbw m5, m6 2229 palignr m1, m5, m4, 2 2230 paddw m0, m4, m1 2231 punpcklwd m2, m4, m1 2232 pmaddwd m2, m2 2233 punpckhwd m3, m4, m1 2234 pmaddwd m3, m3 2235 palignr m5, m4, 4 2236 paddw m0, m5 ; h sum 2237 punpcklwd m1, m5, m6 2238 pmaddwd m1, m1 2239 punpckhwd m5, m6 2240 pmaddwd m5, m5 2241 paddd m2, m1 ; h sumsq 2242 paddd m3, m5 2243 paddw m1, m0, [t2+wq*2+400*0] 2244 paddd m4, m2, [t2+wq*2+400*2] 2245 paddd m5, m3, [t2+wq*2+400*4] 2246 mova [t2+wq*2+400*0], m0 2247 mova [t2+wq*2+400*2], m2 2248 mova [t2+wq*2+400*4], m3 2249 pslld m2, m4, 3 2250 pslld m3, m5, 3 2251 paddd m4, m2 ; a * 9 2252 paddd m5, m3 2253 punpcklwd m0, m1, m6 ; b 2254 pmaddwd m2, m0, m0 ; b * b 2255 punpckhwd m1, m6 2256 pmaddwd m3, m1, m1 2257 psubd m4, m2 ; p 2258 psubd m5, m3 2259 MULLD m4, m9, m12 ; p * s 2260 MULLD m5, m9, m12 2261 pmaddwd m0, m10 ; b * 455 2262 pmaddwd m1, m10 2263 paddusw m4, m10 2264 paddusw m5, m10 2265 psrld m4, 20 ; min(z, 255) 2266 movif32 t3, t3m 2267 psrld m5, 20 2268 GATHER_X_BY_X m3, m4, m5, r0, dstm 2269 punpcklwd m4, m3, m3 2270 punpckhwd m5, m3, m3 2271 MULLD m0, m4, m12 2272 MULLD m1, m5, m12 2273%if ARCH_X86_32 2274 pxor m6, m6 2275%endif 2276 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2277 paddd m1, m11 2278 mova [t4+wq*2+400*2 +4], m3 2279 psrld m0, 12 2280 psrld m1, 12 2281 mova [t3+wq*4+400*4+ 8], m0 2282 mova [t3+wq*4+400*4+24], m1 2283 add wq, 8 2284 jl .hv1_loop 2285 mov r10, t2 2286 mov t2, t1 2287 mov t1, r10 2288 ret 2289.v0: ; vertical boxsums + ab (even rows) 2290%if ARCH_X86_64 2291 lea wq, [r4-2] 2292%else 2293 mov wd, w0m 2294%endif 2295.v0_loop: 2296 mova m0, [t1+wq*2+400*0] 2297 mova m4, [t1+wq*2+400*2] 2298 mova m5, [t1+wq*2+400*4] 2299 paddw m0, m0 2300 paddd m4, m4 2301 paddd m5, m5 2302 paddw m1, m0, [t2+wq*2+400*0] 2303 paddd m2, m4, [t2+wq*2+400*2] 2304 paddd m3, m5, [t2+wq*2+400*4] 2305 mova [t2+wq*2+400*0], m0 2306 mova [t2+wq*2+400*2], m4 2307 mova [t2+wq*2+400*4], m5 2308 pslld m4, m2, 3 2309 pslld m5, m3, 3 2310 paddd m4, m2 ; a * 9 2311 paddd m5, m3 2312 punpcklwd m0, m1, m6 ; b 2313 pmaddwd m2, m0, m0 ; b * b 2314 punpckhwd m1, m6 2315 pmaddwd m3, m1, m1 2316 psubd m4, m2 ; p 2317 psubd m5, m3 2318 MULLD m4, m9, m12 ; p * s 2319 MULLD m5, m9, m12 2320 pmaddwd m0, m10 ; b * 455 2321 pmaddwd m1, m10 2322 paddusw m4, m10 2323 paddusw m5, m10 2324 psrld m4, 20 ; min(z, 255) 2325 psrld m5, 20 2326 GATHER_X_BY_X m3, m4, m5, r0, dstm 2327 punpcklwd m4, m3, m3 2328 punpckhwd m5, m3, m3 2329 MULLD m0, m4, m12 2330 MULLD m1, m5, m12 2331%if ARCH_X86_32 2332 pxor m6, m6 2333%endif 2334 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2335 paddd m1, m11 2336 mova [t4+wq*2+4], m3 2337 psrld m0, 12 2338 psrld m1, 12 2339 mova [t3+wq*4+ 8], m0 2340 mova [t3+wq*4+24], m1 2341 add wq, 8 2342 jl .v0_loop 2343 ret 2344.v1: ; vertical boxsums + ab (odd rows) 2345%if ARCH_X86_64 2346 lea wq, [r4-2] 2347%else 2348 mov wd, w0m 2349%endif 2350.v1_loop: 2351 mova m0, [t1+wq*2+400*0] 2352 mova m4, [t1+wq*2+400*2] 2353 mova m5, [t1+wq*2+400*4] 2354 paddw m1, m0, [t2+wq*2+400*0] 2355 paddd m2, m4, [t2+wq*2+400*2] 2356 paddd m3, m5, [t2+wq*2+400*4] 2357 mova [t2+wq*2+400*0], m0 2358 mova [t2+wq*2+400*2], m4 2359 mova [t2+wq*2+400*4], m5 2360 pslld m4, m2, 3 2361 pslld m5, m3, 3 2362 paddd m4, m2 ; a * 9 2363 paddd m5, m3 2364 punpcklwd m0, m1, m6 ; b 2365 pmaddwd m2, m0, m0 ; b * b 2366 punpckhwd m1, m6 2367 pmaddwd m3, m1, m1 2368 psubd m4, m2 ; p 2369 psubd m5, m3 2370 MULLD m4, m9, m12 ; p * 
s 2371 MULLD m5, m9, m12 2372 pmaddwd m0, m10 ; b * 455 2373 pmaddwd m1, m10 2374 paddusw m4, m10 2375 paddusw m5, m10 2376 psrld m4, 20 ; min(z, 255) 2377 psrld m5, 20 2378 GATHER_X_BY_X m3, m4, m5, r0, dstm 2379 punpcklwd m4, m3, m3 2380 punpckhwd m5, m3, m3 2381 MULLD m0, m4, m12 2382 MULLD m1, m5, m12 2383%if ARCH_X86_32 2384 pxor m6, m6 2385%endif 2386 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2387 paddd m1, m11 2388 mova [t4+wq*2+400*2+ 4], m3 2389 psrld m0, 12 2390 psrld m1, 12 2391 mova [t3+wq*4+400*4+ 8], m0 2392 mova [t3+wq*4+400*4+24], m1 2393 add wq, 8 2394 jl .v1_loop 2395 mov r10, t2 2396 mov t2, t1 2397 mov t1, r10 2398 ret 2399.prep_n: ; initial neighbor setup 2400 movif64 wq, r4 2401 movif32 wd, w1m 2402.prep_n_loop: 2403 movu m0, [t4+wq*2+400*0+ 4] 2404 movu m1, [t3+wq*4+400*0+ 8] 2405 movu m2, [t3+wq*4+400*0+24] 2406 movu m3, [t4+wq*2+400*0+ 2] 2407 movu m4, [t3+wq*4+400*0+ 4] 2408 movu m5, [t3+wq*4+400*0+20] 2409 paddw m0, [t4+wq*2+400*0+ 0] 2410 paddd m1, [t3+wq*4+400*0+ 0] 2411 paddd m2, [t3+wq*4+400*0+16] 2412 paddw m3, m0 2413 paddd m4, m1 2414 paddd m5, m2 2415 psllw m3, 2 ; a[-1] 444 2416 pslld m4, 2 ; b[-1] 444 2417 pslld m5, 2 2418 psubw m3, m0 ; a[-1] 343 2419 psubd m4, m1 ; b[-1] 343 2420 psubd m5, m2 2421 mova [t4+wq*2+400*4], m3 2422 mova [t3+wq*4+400*8+ 0], m4 2423 mova [t3+wq*4+400*8+16], m5 2424 movu m0, [t4+wq*2+400*2+ 4] 2425 movu m1, [t3+wq*4+400*4+ 8] 2426 movu m2, [t3+wq*4+400*4+24] 2427 movu m3, [t4+wq*2+400*2+ 2] 2428 movu m4, [t3+wq*4+400*4+ 4] 2429 movu m5, [t3+wq*4+400*4+20] 2430 paddw m0, [t4+wq*2+400*2+ 0] 2431 paddd m1, [t3+wq*4+400*4+ 0] 2432 paddd m2, [t3+wq*4+400*4+16] 2433 paddw m3, m0 2434 paddd m4, m1 2435 paddd m5, m2 2436 psllw m3, 2 ; a[ 0] 444 2437 pslld m4, 2 ; b[ 0] 444 2438 pslld m5, 2 2439 mova [t4+wq*2+400* 6], m3 2440 mova [t3+wq*4+400*12+ 0], m4 2441 mova [t3+wq*4+400*12+16], m5 2442 psubw m3, m0 ; a[ 0] 343 2443 psubd m4, m1 ; b[ 0] 343 2444 psubd m5, m2 2445 mova [t4+wq*2+400* 8], m3 2446 mova [t3+wq*4+400*16+ 0], m4 2447 mova [t3+wq*4+400*16+16], m5 2448 add wq, 8 2449 jl .prep_n_loop 2450 ret 2451ALIGN function_align 2452.n0: ; neighbor + output (even rows) 2453 movif64 wq, r4 2454 movif32 wd, w1m 2455.n0_loop: 2456 movu m3, [t4+wq*2+400*0+4] 2457 movu m1, [t4+wq*2+400*0+2] 2458 paddw m3, [t4+wq*2+400*0+0] 2459 paddw m1, m3 2460 psllw m1, 2 ; a[ 1] 444 2461 psubw m2, m1, m3 ; a[ 1] 343 2462 paddw m3, m2, [t4+wq*2+400*4] 2463 paddw m3, [t4+wq*2+400*6] 2464 mova [t4+wq*2+400*4], m2 2465 mova [t4+wq*2+400*6], m1 2466 movu m4, [t3+wq*4+400*0+8] 2467 movu m1, [t3+wq*4+400*0+4] 2468 paddd m4, [t3+wq*4+400*0+0] 2469 paddd m1, m4 2470 pslld m1, 2 ; b[ 1] 444 2471 psubd m2, m1, m4 ; b[ 1] 343 2472 paddd m4, m2, [t3+wq*4+400* 8+ 0] 2473 paddd m4, [t3+wq*4+400*12+ 0] 2474 mova [t3+wq*4+400* 8+ 0], m2 2475 mova [t3+wq*4+400*12+ 0], m1 2476 movu m5, [t3+wq*4+400*0+24] 2477 movu m1, [t3+wq*4+400*0+20] 2478 paddd m5, [t3+wq*4+400*0+16] 2479 paddd m1, m5 2480 pslld m1, 2 2481 psubd m2, m1, m5 2482 paddd m5, m2, [t3+wq*4+400* 8+16] 2483 paddd m5, [t3+wq*4+400*12+16] 2484 mova [t3+wq*4+400* 8+16], m2 2485 mova [t3+wq*4+400*12+16], m1 2486 movq m0, [dstq+wq] 2487 punpcklbw m0, m6 2488 punpcklwd m1, m0, m6 2489 punpcklwd m2, m3, m6 2490 pmaddwd m2, m1 ; a * src 2491 punpckhwd m1, m0, m6 2492 punpckhwd m3, m6 2493 pmaddwd m3, m1 2494 psubd m4, m2 ; b - a * src + (1 << 8) 2495 psubd m5, m3 2496 psrad m4, 9 2497 psrad m5, 9 2498 packssdw m4, m5 2499 pmulhrsw m4, m7 2500 paddw m0, m4 2501 packuswb m0, m0 2502 movq [dstq+wq], m0 2503 
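; Each 3x3 output pixel blends the source with the self-guided estimate: the
; a/b neighbour sums above weight the three contributing rows 3:4:3 (the
; "343"/"444" row buffers kept in t4/t3), then roughly
;   dst = clip8(src + pmulhrsw((b - a*src + (1 << 8)) >> 9, w1 << 4))
; where w1<<4 is the weight preloaded in m7 (an illustrative summary of the
; surrounding code, not a formula from the source).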
add wq, 8 2504 jl .n0_loop 2505 add dstq, stridemp 2506 ret 2507ALIGN function_align 2508.n1: ; neighbor + output (odd rows) 2509 movif64 wq, r4 2510 movif32 wd, w1m 2511.n1_loop: 2512 movu m3, [t4+wq*2+400*2+4] 2513 movu m1, [t4+wq*2+400*2+2] 2514 paddw m3, [t4+wq*2+400*2+0] 2515 paddw m1, m3 2516 psllw m1, 2 ; a[ 1] 444 2517 psubw m2, m1, m3 ; a[ 1] 343 2518 paddw m3, m2, [t4+wq*2+400*6] 2519 paddw m3, [t4+wq*2+400*8] 2520 mova [t4+wq*2+400*6], m1 2521 mova [t4+wq*2+400*8], m2 2522 movu m4, [t3+wq*4+400*4+8] 2523 movu m1, [t3+wq*4+400*4+4] 2524 paddd m4, [t3+wq*4+400*4+0] 2525 paddd m1, m4 2526 pslld m1, 2 ; b[ 1] 444 2527 psubd m2, m1, m4 ; b[ 1] 343 2528 paddd m4, m2, [t3+wq*4+400*12+ 0] 2529 paddd m4, [t3+wq*4+400*16+ 0] 2530 mova [t3+wq*4+400*12+ 0], m1 2531 mova [t3+wq*4+400*16+ 0], m2 2532 movu m5, [t3+wq*4+400*4+24] 2533 movu m1, [t3+wq*4+400*4+20] 2534 paddd m5, [t3+wq*4+400*4+16] 2535 paddd m1, m5 2536 pslld m1, 2 2537 psubd m2, m1, m5 2538 paddd m5, m2, [t3+wq*4+400*12+16] 2539 paddd m5, [t3+wq*4+400*16+16] 2540 mova [t3+wq*4+400*12+16], m1 2541 mova [t3+wq*4+400*16+16], m2 2542 movq m0, [dstq+wq] 2543 punpcklbw m0, m6 2544 punpcklwd m1, m0, m6 2545 punpcklwd m2, m3, m6 2546 pmaddwd m2, m1 ; a * src 2547 punpckhwd m1, m0, m6 2548 punpckhwd m3, m6 2549 pmaddwd m3, m1 2550 psubd m4, m2 ; b - a * src + (1 << 8) 2551 psubd m5, m3 2552 psrad m4, 9 2553 psrad m5, 9 2554 packssdw m4, m5 2555 pmulhrsw m4, m7 2556 paddw m0, m4 2557 packuswb m0, m0 2558 movq [dstq+wq], m0 2559 add wq, 8 2560 jl .n1_loop 2561 add dstq, stridemp 2562 movif32 dstm, dstq 2563 ret 2564 2565%if ARCH_X86_32 2566 %if STACK_ALIGNMENT < 16 2567 %assign extra_stack 10*16 2568 %else 2569 %assign extra_stack 8*16 2570 %endif 2571cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \ 2572 dst, stride, left, lpf, w 2573 %if STACK_ALIGNMENT < 16 2574 %define dstm dword [esp+calloff+16*8+4*0] 2575 %define stridemp dword [esp+calloff+16*8+4*1] 2576 %define leftm dword [esp+calloff+16*8+4*2] 2577 %define lpfm dword [esp+calloff+16*8+4*3] 2578 %define w0m dword [esp+calloff+16*8+4*4] 2579 %define hd dword [esp+calloff+16*8+4*5] 2580 %define edgeb byte [esp+calloff+16*8+4*6] 2581 %define edged dword [esp+calloff+16*8+4*6] 2582 %define leftmp leftm 2583 %else 2584 %define w0m wm 2585 %define hd dword r5m 2586 %define edgeb byte r7m 2587 %define edged dword r7m 2588 %endif 2589 %define hvsrcm dword [esp+calloff+4*0] 2590 %define w1m dword [esp+calloff+4*1] 2591 %define t3m dword [esp+calloff+4*2] 2592 %define t4m dword [esp+calloff+4*3] 2593 %xdefine m8 m6 2594 %define m9 [base+pd_0xffff] 2595 %define m10 [base+pd_34816] 2596 %define m11 [base+pd_0xf00801c7] 2597 %define m12 [base+pd_0xf00800a4] 2598 %define m13 [esp+calloff+16*4] 2599 %define m14 [esp+calloff+16*5] 2600 %define m15 [esp+calloff+16*6] 2601 %define m6 [esp+calloff+16*7] 2602 %define base r6-$$ 2603 %assign calloff 0 2604 %if STACK_ALIGNMENT < 16 2605 mov strideq, [rstk+stack_offset+ 8] 2606 mov leftq, [rstk+stack_offset+12] 2607 mov lpfq, [rstk+stack_offset+16] 2608 mov wd, [rstk+stack_offset+20] 2609 mov dstm, dstq 2610 mov stridemp, strideq 2611 mov leftm, leftq 2612 mov r1, [rstk+stack_offset+24] 2613 mov r2, [rstk+stack_offset+32] 2614 mov lpfm, lpfq 2615 mov hd, r1 2616 mov edged, r2 2617 %endif 2618%else 2619cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ 2620 w, h, edge, params 2621%endif 2622%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 2623 mov wd, wm 2624%endif 2625%if ARCH_X86_64 2626 mov paramsq, r6mp 2627 lea r13, 
    movifnidn hd, hm
    mov edged, r7m
    mova m15, [paramsq]
    add lpfq, wq
    mova m9, [pd_0xffff]
    lea t1, [rsp+wq*2+44]
    mova m10, [pd_34816]
    add dstq, wq
    lea t3, [rsp+wq*4+400*24+40]
    mova m11, [pd_0xf00801c7]
    lea t4, [rsp+wq*2+400*52+40]
    mova m12, [base+pd_0xf00800a4]
    neg wq
    pshuflw m13, m15, q0000
    pshuflw m14, m15, q2222
    pshufhw m15, m15, q1010
    punpcklqdq m13, m13 ; s0
    punpcklqdq m14, m14 ; s1
    punpckhqdq m15, m15 ; w0 w1
    pxor m6, m6
    psllw m15, 2
    DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov r1, [rstk+stack_offset+28] ; params
    LEA r6, $$
    mova m2, [r1]
    add lpfm, wq
    lea t1, [rsp+extra_stack+wq*2+52]
    add dstq, wq
    lea t3, [rsp+extra_stack+wq*4+400*24+48]
    mov dstm, dstq
    lea t4, [rsp+extra_stack+wq*2+400*52+48]
    mov t3m, t3
    mov t4m, t4
    neg wq
    pshuflw m0, m2, q0000
    pshuflw m1, m2, q2222
    pshufhw m2, m2, q1010
    punpcklqdq m0, m0 ; s0
    punpcklqdq m1, m1 ; s1
    punpckhqdq m2, m2 ; w0 w1
    mov w1m, wd
    pxor m3, m3
    psllw m2, 2
    mova m13, m0
    mova m14, m1
    sub wd, 2
    mova m15, m2
    mova m6, m3
    mov lpfq, lpfm
    mov w0m, wd
 %define strideq r5
%endif
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, stridemp
    mov t2, t1
%if ARCH_X86_64
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
%else
    mov wq, w0m
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
%endif
    add t1, 400*12
    call .h_top
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov lpfm, r10 ; below
    movif32 t4, t4m
    call .hv0
.main:
    dec hd
    jz .height1
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv0
%if ARCH_X86_64
    test hd, hd
%else
    mov r4, hd
    test r4, r4
%endif
    jz .odd_height
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, lpfm
    call .hv0_bottom
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov lpfm, r10
    call .h
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wq, w0m
    mov hvsrcm, lpfq
%endif
    lea t2, [t1+400*12]
.top_fixup_loop:
    mova m0, [t1+wq*2+400* 0]
    mova m1, [t1+wq*2+400* 2]
    mova m2, [t1+wq*2+400* 4]
    paddw m0, m0
    mova m3, [t1+wq*2+400* 6]
    paddd m1, m1
    mova m4, [t1+wq*2+400* 8]
    paddd m2, m2
    mova m5, [t1+wq*2+400*10]
    mova [t2+wq*2+400* 0], m0
    mova [t2+wq*2+400* 2], m1
    mova [t2+wq*2+400* 4], m2
    mova [t2+wq*2+400* 6], m3
    mova [t2+wq*2+400* 8], m4
    mova [t2+wq*2+400*10], m5
    add wq, 8
    jl .top_fixup_loop
    movif32 t3, t3m
    movif32 t4, t4m
    call .v0
    jmp .main
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
%if ARCH_X86_64
    SWAP m8, m6
%endif
    movd m1, wd
    movd m3, [lpfq-1]
    pshufb m1, m8
    pshufb m3, m8
    psubb m2, [base+pb_1], m1
    pcmpgtb m2, [base+pb_0to15]
    pand m5, m2
    pandn m2, m3
    por m5, m2
%if ARCH_X86_64
    SWAP m6, m8
%endif
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea wq, [r4-2]
%else
 %define leftq r4
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 13
    jmp .h_main
.h_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, [base+sgr_lshuf5]
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea wq, [r4-2]
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 wq, w0m
.h_loop:
    movu m5, [lpfq+wq-1]
.h_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor m8, m8
%else
    SWAP m8, m6
%endif
    jnz .h_have_right
    cmp wd, -10
    jl .h_have_right
    call .extend_right
.h_have_right:
    punpcklbw m4, m5, m8
    punpckhbw m5, m8
    palignr m3, m5, m4, 2
    palignr m0, m5, m4, 4
    paddw m1, m3, m0
    punpcklwd m2, m3, m0
    pmaddwd m2, m2
    punpckhwd m3, m0
    pmaddwd m3, m3
    palignr m0, m5, m4, 6
    paddw m1, m0 ; sum3
    punpcklwd m7, m0, m8
    pmaddwd m7, m7
    punpckhwd m0, m8
    pmaddwd m0, m0
%if ARCH_X86_64
    SWAP m6, m8
%endif
    paddd m2, m7 ; sumsq3
    palignr m5, m4, 8
    punpcklwd m7, m5, m4
    paddw m8, m4, m5
    pmaddwd m7, m7
    punpckhwd m5, m4
    pmaddwd m5, m5
    paddd m3, m0
    mova [t1+wq*2+400* 6], m1
    mova [t1+wq*2+400* 8], m2
    mova [t1+wq*2+400*10], m3
    paddw m8, m1 ; sum5
    paddd m7, m2 ; sumsq5
    paddd m5, m3
    mova [t1+wq*2+400* 0], m8
    mova [t1+wq*2+400* 2], m7
    mova [t1+wq*2+400* 4], m5
    add wq, 8
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 13
    jmp .hv0_main
.hv0_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, [base+sgr_lshuf5]
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32 lpfq, hvsrcm
.hv0_loop_start:
    movu m5, [lpfq+wq-1]
.hv0_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor m8, m8
%else
    SWAP m8, m6
%endif
    jnz .hv0_have_right
    cmp wd, -10
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    punpcklbw m4, m5, m8
    punpckhbw m5, m8
    palignr m3, m5, m4, 2
    palignr m0, m5, m4, 4
    movif32 t3, t3m
    paddw m1, m3, m0
    punpcklwd m2, m3, m0
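    ; interleaving the two shifted pixel vectors and multiplying the result
    ; with itself (pmaddwd) gives pairwise sums of squares, so sumsq3 is
    ; built up alongside the plain sum3 accumulated in m1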
    pmaddwd m2, m2
    punpckhwd m3, m0
    pmaddwd m3, m3
    palignr m0, m5, m4, 6
    paddw m1, m0 ; h sum3
    punpcklwd m7, m0, m8
    pmaddwd m7, m7
    punpckhwd m0, m8
%if ARCH_X86_64
    SWAP m6, m8
%endif
    pmaddwd m0, m0
    paddd m2, m7 ; h sumsq3
    palignr m5, m4, 8
    punpcklwd m7, m5, m4
    paddw m8, m4, m5
    pmaddwd m7, m7
    punpckhwd m5, m4
    pmaddwd m5, m5
    paddd m3, m0
    paddw m8, m1 ; h sum5
    paddd m7, m2 ; h sumsq5
    paddd m5, m3
    mova [t3+wq*4+400*8+ 8], m8
    mova [t3+wq*4+400*0+ 8], m7
    mova [t3+wq*4+400*0+24], m5
    paddw m8, [t1+wq*2+400* 0]
    paddd m7, [t1+wq*2+400* 2]
    paddd m5, [t1+wq*2+400* 4]
    mova [t1+wq*2+400* 0], m8
    mova [t1+wq*2+400* 2], m7
    mova [t1+wq*2+400* 4], m5
    paddw m0, m1, [t1+wq*2+400* 6]
    paddd m4, m2, [t1+wq*2+400* 8]
    paddd m5, m3, [t1+wq*2+400*10]
    mova [t1+wq*2+400* 6], m1
    mova [t1+wq*2+400* 8], m2
    mova [t1+wq*2+400*10], m3
    paddw m1, m0, [t2+wq*2+400* 6]
    paddd m2, m4, [t2+wq*2+400* 8]
    paddd m3, m5, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m0
    mova [t2+wq*2+400* 8], m4
    mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
    pxor m7, m7
%else
    SWAP m7, m6
%endif
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a3 * 9
    paddd m5, m3
    punpcklwd m0, m1, m7 ; b3
    pmaddwd m2, m0, m0
    punpckhwd m1, m7
    pmaddwd m3, m1, m1
%if ARCH_X86_64
    SWAP m7, m6
%endif
    psubd m4, m2 ; p3
    psubd m5, m3
    MULLD m4, m14, m7 ; p3 * s1
    MULLD m5, m14, m7
    pmaddwd m0, m11 ; b3 * 455
    pmaddwd m1, m11
    paddusw m4, m11
    paddusw m5, m11
    psrld m4, 20 ; min(z3, 255)
    psrld m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD m0, m4, m7
    MULLD m1, m5, m7
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    mova [t4+wq*2+400*2+ 4], m3
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add wq, 8
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 13
    jmp .hv1_main
.hv1_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, [base+sgr_lshuf5]
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32 lpfq, hvsrcm
.hv1_loop_start:
    movu m5, [lpfq+wq-1]
.hv1_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor m8, m8
%else
    SWAP m8, m6
%endif
    jnz .hv1_have_right
    cmp wd, -10
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    punpcklbw m4, m5, m8
    punpckhbw m5, m8
    palignr m7, m5, m4, 2
    palignr m3, m5, m4, 4
    paddw m2, m7, m3
    punpcklwd m0, m7, m3
    pmaddwd m0, m0
    punpckhwd m7, m3
    pmaddwd m7, m7
    palignr m3, m5, m4, 6
    paddw m2, m3 ; h sum3
    punpcklwd m1, m3, m8
    pmaddwd m1, m1
    punpckhwd m3, m8
%if ARCH_X86_64
    SWAP m6, m8
%endif
    pmaddwd m3, m3
    paddd m0, m1 ; h sumsq3
    palignr m5, m4, 8
    punpckhwd m1, m4, m5
    paddw m8, m4, m5
    pmaddwd m1, m1
    punpcklwd m4, m5
    pmaddwd m4, m4
    paddd m7, m3
    paddw m5, m2, [t2+wq*2+400* 6]
    mova [t2+wq*2+400* 6], m2
    paddw m8, m2 ; h sum5
    paddd m2, m0, [t2+wq*2+400* 8]
    paddd m3, m7, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 8], m0
    mova [t2+wq*2+400*10], m7
    paddd m4, m0 ; h sumsq5
    paddd m1, m7
    pslld m0, m2, 3
    pslld m7, m3, 3
    paddd m2, m0 ; a3 * 9
    paddd m3, m7
%if ARCH_X86_32
    mova [esp+20], m8
    pxor m8, m8
%else
    SWAP m8, m6
%endif
    punpcklwd m0, m5, m8 ; b3
    pmaddwd m7, m0, m0
    punpckhwd m5, m8
    pmaddwd m8, m5, m5
    psubd m2, m7 ; p3
    psubd m3, m8
    MULLD m2, m14, m8 ; p3 * s1
    MULLD m3, m14, m8
    pmaddwd m0, m11 ; b3 * 455
    pmaddwd m5, m11
    paddusw m2, m11
    paddusw m3, m11
    psrld m2, 20 ; min(z3, 255)
    movif32 t3, t3m
    psrld m3, 20
    GATHER_X_BY_X m8, m2, m3, r0, dstm
    punpcklwd m2, m8, m8
    punpckhwd m3, m8, m8
    MULLD m0, m2, m7
    MULLD m5, m3, m7
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m5, m10
    psrld m0, 12
    psrld m5, 12
    mova [t4+wq*2+400*4+ 4], m8
    mova [t3+wq*4+400*8+ 8], m0
    mova [t3+wq*4+400*8+24], m5
%if ARCH_X86_32
    mova m8, [esp+20]
%else
    SWAP m6, m8
    pxor m6, m6
%endif
    paddw m5, m8, [t2+wq*2+400*0]
    paddd m2, m4, [t2+wq*2+400*2]
    paddd m3, m1, [t2+wq*2+400*4]
    paddw m5, [t1+wq*2+400*0]
    paddd m2, [t1+wq*2+400*2]
    paddd m3, [t1+wq*2+400*4]
    mova [t2+wq*2+400*0], m8
    pslld m0, m2, 4
    mova [t2+wq*2+400*2], m4
    pslld m8, m3, 4
    mova [t2+wq*2+400*4], m1
    pslld m4, m2, 3
    paddd m2, m0
    pslld m7, m3, 3
    paddd m3, m8
    paddd m2, m4 ; a5 * 25
    paddd m3, m7
%if ARCH_X86_32
    pxor m7, m7
%else
    SWAP m7, m6
%endif
    punpcklwd m0, m5, m7 ; b5
    pmaddwd m4, m0, m0
    punpckhwd m5, m7
    pmaddwd m1, m5, m5
%if ARCH_X86_64
    SWAP m7, m6
%endif
    psubd m2, m4 ; p5
    psubd m3, m1
    MULLD m2, m13, m7 ; p5 * s0
    MULLD m3, m13, m7
    pmaddwd m0, m12 ; b5 * 164
    pmaddwd m5, m12
    paddusw m2, m12
    paddusw m3, m12
    psrld m2, 20 ; min(z5, 255)
    psrld m3, 20
    GATHER_X_BY_X m1, m2, m3, r0, dstm
    punpcklwd m2, m1, m1
    punpckhwd m3, m1, m1
    MULLD m0, m2, m7
    MULLD m5, m3, m7
    paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m5, m10
    mova [t4+wq*2+4], m1
    psrld m0, 12
    psrld m5, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m5
    add wq, 8
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wd, w0m
%endif
.v0_loop:
    mova m0, [t1+wq*2+400* 6]
    mova m4, [t1+wq*2+400* 8]
    mova m5, [t1+wq*2+400*10]
    paddw m0, m0
    paddd m4, m4
    paddd m5, m5
    paddw m1, m0, [t2+wq*2+400* 6]
    paddd m2, m4, [t2+wq*2+400* 8]
    paddd m3, m5, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m0
    mova [t2+wq*2+400* 8], m4
    mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
    pxor m7, m7
%else
    SWAP m7, m6
%endif
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a3 * 9
    paddd m5, m3
    punpcklwd m0, m1, m7 ; b3
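    ; p3 = a3*9 - b3^2 is the scaled box variance; (p3*s1) >> 20, capped at
    ; 255 by the saturating add, is then used to index the sgr_x_by_x table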
    pmaddwd m2, m0, m0
    punpckhwd m1, m7
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p3
    psubd m5, m3
%if ARCH_X86_64
    SWAP m7, m6
%endif
    MULLD m4, m14, m7 ; p3 * s1
    MULLD m5, m14, m7
    pmaddwd m0, m11 ; b3 * 455
    pmaddwd m1, m11
    paddusw m4, m11
    paddusw m5, m11
    psrld m4, 20 ; min(z3, 255)
    psrld m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD m0, m4, m7
    MULLD m1, m5, m7
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    mova [t4+wq*2+400*2+4], m3
    psrld m0, 12
    psrld m1, 12
    mova m3, [t1+wq*2+400*0]
    mova m4, [t1+wq*2+400*2]
    mova m5, [t1+wq*2+400*4]
    mova [t3+wq*4+400*8+ 8], m3
    mova [t3+wq*4+400*0+ 8], m4
    mova [t3+wq*4+400*0+24], m5
    paddw m3, m3 ; cc5
    paddd m4, m4
    paddd m5, m5
    mova [t1+wq*2+400*0], m3
    mova [t1+wq*2+400*2], m4
    mova [t1+wq*2+400*4], m5
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add wq, 8
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wd, w0m
%endif
.v1_loop:
    mova m4, [t1+wq*2+400* 6]
    mova m5, [t1+wq*2+400* 8]
    mova m7, [t1+wq*2+400*10]
    paddw m1, m4, [t2+wq*2+400* 6]
    paddd m2, m5, [t2+wq*2+400* 8]
    paddd m3, m7, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m4
    mova [t2+wq*2+400* 8], m5
    mova [t2+wq*2+400*10], m7
%if ARCH_X86_32
    pxor m7, m7
%else
    SWAP m7, m6
%endif
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a3 * 9
    paddd m5, m3
    punpcklwd m0, m1, m7 ; b3
    pmaddwd m2, m0, m0
    punpckhwd m1, m7
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p3
    psubd m5, m3
%if ARCH_X86_64
    SWAP m7, m6
%endif
    MULLD m4, m14, m7 ; p3 * s1
    MULLD m5, m14, m7
    pmaddwd m0, m11 ; b3 * 455
    pmaddwd m1, m11
    paddusw m4, m11
    paddusw m5, m11
    psrld m4, 20 ; min(z3, 255)
    psrld m5, 20
    GATHER_X_BY_X m3, m4, m5, r0, dstm
    punpcklwd m4, m3, m3
    punpckhwd m5, m3, m3
    MULLD m0, m4, m7
    MULLD m1, m5, m7
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    mova [t4+wq*2+400*4+4], m3
    psrld m0, 12
    psrld m8, m1, 12
    mova m4, [t3+wq*4+400*8+ 8]
    mova m5, [t3+wq*4+400*0+ 8]
    mova m7, [t3+wq*4+400*0+24]
    paddw m1, m4, [t2+wq*2+400*0]
    paddd m2, m5, [t2+wq*2+400*2]
    paddd m3, m7, [t2+wq*2+400*4]
    paddw m1, [t1+wq*2+400*0]
    paddd m2, [t1+wq*2+400*2]
    paddd m3, [t1+wq*2+400*4]
    mova [t2+wq*2+400*0], m4
    mova [t2+wq*2+400*2], m5
    mova [t2+wq*2+400*4], m7
    pslld m4, m2, 4
    mova [t3+wq*4+400*8+ 8], m0
    pslld m5, m3, 4
    mova [t3+wq*4+400*8+24], m8
    pslld m7, m2, 3
    paddd m2, m4
    pslld m8, m3, 3
    paddd m3, m5
    paddd m2, m7 ; a5 * 25
    paddd m3, m8
%if ARCH_X86_32
    pxor m7, m7
%else
    SWAP m7, m6
%endif
    punpcklwd m0, m1, m7 ; b5
    pmaddwd m4, m0, m0
    punpckhwd m1, m7
    pmaddwd m5, m1, m1
    psubd m2, m4 ; p5
    psubd m3, m5
%if ARCH_X86_64
    SWAP m7, m6
%endif
    MULLD m2, m13, m7 ; p5 * s0
    MULLD m3, m13, m7
    pmaddwd m0, m12 ; b5 * 164
    pmaddwd m1, m12
    paddusw m2, m12
    paddusw m3, m12
    psrld m2, 20 ; min(z5, 255)
    psrld m3, 20
    GATHER_X_BY_X m4, m2, m3, r0, dstm
    punpcklwd m2, m4, m4
    punpckhwd m3, m4, m4
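    ; 455 and 164 are the rounded reciprocals of the box areas,
    ; 455 = round((1 << 12) / 9) and 164 = round((1 << 12) / 25), so the
    ; shift by 12 below effectively divides the weighted box sum by n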
    MULLD m0, m2, m7
    MULLD m1, m3, m7
    paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m1, m10
    mova [t4+wq*2+4], m4
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m1
    add wq, 8
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    movif64 wq, r4
    movif32 wd, w1m
.prep_n_loop:
    movu m0, [t4+wq*2+400*0+ 2]
    movu m1, [t3+wq*4+400*0+ 4]
    movu m2, [t3+wq*4+400*0+20]
    movu m7, [t4+wq*2+400*0+ 4]
    movu m8, [t3+wq*4+400*0+ 8]
    paddw m3, m0, [t4+wq*2+400*0+ 0]
    paddd m4, m1, [t3+wq*4+400*0+ 0]
    paddd m5, m2, [t3+wq*4+400*0+16]
    paddw m3, m7
    paddd m4, m8
    movu m7, [t3+wq*4+400*0+24]
    paddw m0, m3
    paddd m1, m4
    psllw m3, 2
    pslld m4, 2
    paddd m5, m7
    paddd m2, m5
    pslld m5, 2
    paddw m0, m3 ; a5 565
    paddd m1, m4 ; b5 565
    paddd m2, m5
    mova [t4+wq*2+400* 6+ 0], m0
    mova [t3+wq*4+400*12+ 0], m1
    mova [t3+wq*4+400*12+16], m2
    movu m0, [t4+wq*2+400*2+ 4]
    movu m1, [t3+wq*4+400*4+ 8]
    movu m2, [t3+wq*4+400*4+24]
    movu m3, [t4+wq*2+400*2+ 2]
    movu m4, [t3+wq*4+400*4+ 4]
    movu m5, [t3+wq*4+400*4+20]
    paddw m0, [t4+wq*2+400*2+ 0]
    paddd m1, [t3+wq*4+400*4+ 0]
    paddd m2, [t3+wq*4+400*4+16]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    psllw m3, 2 ; a3[-1] 444
    pslld m4, 2 ; b3[-1] 444
    pslld m5, 2
    psubw m3, m0 ; a3[-1] 343
    psubd m4, m1 ; b3[-1] 343
    psubd m5, m2
    mova [t4+wq*2+400* 8+ 0], m3
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m5
    movu m0, [t4+wq*2+400*4+ 4]
    movu m1, [t3+wq*4+400*8+ 8]
    movu m2, [t3+wq*4+400*8+24]
    movu m3, [t4+wq*2+400*4+ 2]
    movu m4, [t3+wq*4+400*8+ 4]
    movu m5, [t3+wq*4+400*8+20]
    paddw m0, [t4+wq*2+400*4+ 0]
    paddd m1, [t3+wq*4+400*8+ 0]
    paddd m2, [t3+wq*4+400*8+16]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    psllw m3, 2 ; a3[ 0] 444
    pslld m4, 2 ; b3[ 0] 444
    pslld m5, 2
    mova [t4+wq*2+400*10+ 0], m3
    mova [t3+wq*4+400*20+ 0], m4
    mova [t3+wq*4+400*20+16], m5
    psubw m3, m0 ; a3[ 0] 343
    psubd m4, m1 ; b3[ 0] 343
    psubd m5, m2
    mova [t4+wq*2+400*12+ 0], m3
    mova [t3+wq*4+400*24+ 0], m4
    mova [t3+wq*4+400*24+16], m5
    add wq, 8
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64 wq, r4
    movif32 wd, w1m
.n0_loop:
    movu m0, [t4+wq*2+ 4]
    movu m2, [t4+wq*2+ 2]
    paddw m0, [t4+wq*2+ 0]
    paddw m0, m2
    paddw m2, m0
    psllw m0, 2
    paddw m0, m2 ; a5
    movu m4, [t3+wq*4+ 8]
    movu m5, [t3+wq*4+24]
    movu m1, [t3+wq*4+ 4]
    movu m3, [t3+wq*4+20]
    paddd m4, [t3+wq*4+ 0]
    paddd m5, [t3+wq*4+16]
    paddd m4, m1
    paddd m5, m3
    paddd m1, m4
    paddd m3, m5
    pslld m4, 2
    pslld m5, 2
    paddd m4, m1 ; b5
    paddd m5, m3
    movu m2, [t4+wq*2+400* 6]
    paddw m2, m0
    mova [t4+wq*2+400* 6], m0
    paddd m0, m4, [t3+wq*4+400*12+ 0]
    paddd m1, m5, [t3+wq*4+400*12+16]
    mova [t3+wq*4+400*12+ 0], m4
    mova [t3+wq*4+400*12+16], m5
    mova [rsp+16+ARCH_X86_32*4], m1
    movu m3, [t4+wq*2+400*2+4]
    movu m5, [t4+wq*2+400*2+2]
    paddw m3, [t4+wq*2+400*2+0]
    paddw m5, m3
    psllw m5, 2 ; a3[ 1] 444
    psubw m4, m5, m3 ; a3[ 1] 343
    movu m3, [t4+wq*2+400* 8]
    paddw m3, [t4+wq*2+400*10]
    paddw m3, m4
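    ; the 3x3 a/b planes use the (3,4,3)/(4,4,4) neighbour weighting; the
    ; freshly computed 343/444 row sums are cached in the t4/t3 ring buffers
    ; below so the following row can reuse them instead of recomputing them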
    mova [t4+wq*2+400* 8], m4
    mova [t4+wq*2+400*10], m5
    movu m1, [t3+wq*4+400*4+ 8]
    movu m5, [t3+wq*4+400*4+ 4]
    movu m7, [t3+wq*4+400*4+24]
    movu m8, [t3+wq*4+400*4+20]
    paddd m1, [t3+wq*4+400*4+ 0]
    paddd m7, [t3+wq*4+400*4+16]
    paddd m5, m1
    paddd m8, m7
    pslld m5, 2 ; b3[ 1] 444
    pslld m8, 2
    psubd m4, m5, m1 ; b3[ 1] 343
%if ARCH_X86_32
    mova [esp+52], m8
    psubd m8, m7
%else
    psubd m6, m8, m7
    SWAP m8, m6
%endif
    paddd m1, m4, [t3+wq*4+400*16+ 0]
    paddd m7, m8, [t3+wq*4+400*16+16]
    paddd m1, [t3+wq*4+400*20+ 0]
    paddd m7, [t3+wq*4+400*20+16]
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m8
    mova [t3+wq*4+400*20+ 0], m5
%if ARCH_X86_32
    mova m8, [esp+52]
%else
    SWAP m8, m6
    pxor m6, m6
%endif
    mova [t3+wq*4+400*20+16], m8
    mova [rsp+32+ARCH_X86_32*4], m7
    movq m4, [dstq+wq]
    punpcklbw m4, m6
    punpcklwd m5, m4, m6
    punpcklwd m7, m2, m6
    pmaddwd m7, m5 ; a5 * src
    punpcklwd m8, m3, m6
    pmaddwd m8, m5 ; a3 * src
    punpckhwd m5, m4, m6
    punpckhwd m2, m6
    pmaddwd m2, m5
    punpckhwd m3, m6
    pmaddwd m3, m5
    psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13)
    psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13)
    psrld m0, 9
    pslld m1, 7
    pand m0, m9
    pandn m8, m9, m1
    por m0, m8
    mova m1, [rsp+16+ARCH_X86_32*4]
    psubd m1, m2
    mova m2, [rsp+32+ARCH_X86_32*4]
    psubd m2, m3
    mova m3, [base+pd_4096]
    psrld m1, 9
    pslld m2, 7
    pand m1, m9
    pandn m5, m9, m2
    por m1, m5
    pmaddwd m0, m15
    pmaddwd m1, m15
    paddd m0, m3
    paddd m1, m3
    psrad m0, 13
    psrad m1, 13
    packssdw m0, m1
    paddw m0, m4
    packuswb m0, m0
    movq [dstq+wq], m0
    add wq, 8
    jl .n0_loop
    add dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64 wq, r4
    movif32 wd, w1m
.n1_loop:
    movu m3, [t4+wq*2+400*4+4]
    movu m5, [t4+wq*2+400*4+2]
    paddw m3, [t4+wq*2+400*4+0]
    paddw m5, m3
    psllw m5, 2 ; a3[ 1] 444
    psubw m4, m5, m3 ; a3[ 1] 343
    paddw m3, m4, [t4+wq*2+400*12]
    paddw m3, [t4+wq*2+400*10]
    mova [t4+wq*2+400*10], m5
    mova [t4+wq*2+400*12], m4
    movu m1, [t3+wq*4+400*8+ 8]
    movu m5, [t3+wq*4+400*8+ 4]
    movu m7, [t3+wq*4+400*8+24]
    movu m8, [t3+wq*4+400*8+20]
    paddd m1, [t3+wq*4+400*8+ 0]
    paddd m7, [t3+wq*4+400*8+16]
    paddd m5, m1
    paddd m8, m7
    pslld m5, 2 ; b3[ 1] 444
    pslld m8, 2
    psubd m4, m5, m1 ; b3[ 1] 343
    psubd m0, m8, m7
    paddd m1, m4, [t3+wq*4+400*24+ 0]
    paddd m7, m0, [t3+wq*4+400*24+16]
    paddd m1, [t3+wq*4+400*20+ 0]
    paddd m7, [t3+wq*4+400*20+16]
    mova [t3+wq*4+400*20+ 0], m5
    mova [t3+wq*4+400*20+16], m8
    mova [t3+wq*4+400*24+ 0], m4
    mova [t3+wq*4+400*24+16], m0
    movq m5, [dstq+wq]
    mova m2, [t4+wq*2+400* 6]
    punpcklbw m5, m6
    punpcklwd m4, m5, m6
    punpcklwd m8, m2, m6
    pmaddwd m8, m4 ; a5 * src
    punpcklwd m0, m3, m6
    pmaddwd m0, m4 ; a3 * src
    punpckhwd m4, m5, m6
    punpckhwd m2, m6
    pmaddwd m2, m4
    punpckhwd m3, m6
    pmaddwd m3, m4
    psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13)
    mova m0, [t3+wq*4+400*12+ 0]
    psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13)
    mova m4, [t3+wq*4+400*12+16]
    psubd m4, m2
    psubd m7, m3
    pslld m1, 7
    psrld m0, 8
    psrld m4, 8
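    ; the shifted 5x5 and 3x3 terms are packed into the low/high 16 bits of
    ; each dword (pand/pandn with pd_0xffff) so that a single pmaddwd with
    ; the (w0, w1) word pairs in m15 blends both filter outputs at once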
    pslld m7, 7
    pandn m3, m9, m1
    pand m0, m9
    por m0, m3
    pand m4, m9
    pandn m2, m9, m7
    por m2, m4
    mova m1, [base+pd_4096]
    pmaddwd m0, m15
    pmaddwd m2, m15
    paddd m0, m1
    paddd m2, m1
    psrad m0, 13
    psrad m2, 13
    packssdw m0, m2
    paddw m0, m5
    packuswb m0, m0
    movq [dstq+wq], m0
    add wq, 8
    jl .n1_loop
    add dstq, stridemp
    movif32 dstm, dstq
    ret