; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

pb_right_ext_mask: times 16 db 0xff
                   times 16 db 0
pb_14x0_1_2:       times 14 db 0
                   db 1, 2
pb_0_to_15_min_n:  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
                   db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
pb_unpcklwdw:      db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
pb_0:              times 16 db 0
pb_2:              times 16 db 2
pb_3:              times 16 db 3
pb_4:              times 16 db 4
pb_15:             times 16 db 15
pb_0_1:            times 8 db 0, 1
pb_6_7:            times 8 db 6, 7
pb_14_15:          times 8 db 14, 15
pw_1:              times 8 dw 1
pw_16:             times 8 dw 16
pw_128:            times 8 dw 128
pw_255:            times 8 dw 255
pw_256:            times 8 dw 256
pw_2048:           times 8 dw 2048
pw_16380:          times 8 dw 16380
pw_5_6:            times 4 dw 5, 6
pw_0_128:          times 4 dw 0, 128
pd_1024:           times 4 dd 1024
%if ARCH_X86_32
pd_256:            times 4 dd 256
pd_512:            times 4 dd 512
pd_2048:           times 4 dd 2048
%endif
pd_0xF0080029:     times 4 dd 0xF0080029
pd_0xF00801C7:     times 4 dd 0xF00801C7

cextern sgr_x_by_x

SECTION .text

%if ARCH_X86_32
 %define PIC_base_offset $$

 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
  %assign pic_reg_stk_off 4
  %xdefine PIC_reg %1
  %if %2 == 1
    mov        [esp], %1
  %endif
    LEA      PIC_reg, PIC_base_offset
  %if %3 == 1
    XCHG_PIC_REG
  %endif
 %endmacro

 %macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
  %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
 %endmacro

 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
 %macro XCHG_PIC_REG 0
 %endmacro

 %define PIC_sym(sym) (sym)
%endif

%macro PALIGNR 4 ; dst, src1, src2, shift
 %if cpuflag(ssse3)
    palignr       %1, %2, %3, %4
 %else
  %assign %%i regnumof%+%1 + 1
  %define %%tmp m %+ %%i
    psrldq        %1, %3, %4
    pslldq     %%tmp, %2, 16-%4
    por           %1, %%tmp
 %endif
%endmacro

%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
 %if cpuflag(ssse3)
    pmaddubsw     %1, %2
 %else
  %if %5 == 1
    pxor          %3, %3
  %endif
    punpckhbw     %4, %1, %3
    punpcklbw     %1, %3
    pmaddwd       %4, %2
    pmaddwd       %1, %2
    packssdw      %1, %4
 %endif
%endmacro
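; NOTE (informal summary, not part of the upstream comments): the two macros
; above provide SSE2 fallbacks for SSSE3 instructions. PALIGNR rebuilds the
; concatenated byte shift from psrldq/pslldq/por, and PMADDUBSW widens the
; unsigned source bytes to words (punpck{l,h}bw against a zero register),
; multiply-adds in dwords and re-packs with signed saturation; its callers
; pre-widen the signed byte coefficients to words on the SSE2 path, so the
; result matches pmaddubsw's u8*s8 pairwise sums.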

;;;;;;;;;;;;;;;;;;;;;;
;;      wiener      ;;
;;;;;;;;;;;;;;;;;;;;;;

%macro WIENER_H 0
%if ARCH_X86_64
cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
    mov        edged, edgem
    movifnidn     wd, wm
    mov           hd, hm
%else
cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
    mov           r5, edgem
    mov     [esp+12], r5
    mov           wd, wm
    mov           hd, hm
    SETUP_PIC hd
 %define m15 m0
 %define m14 m1
 %define m13 m2
 %define m12 m3
%endif

    movq         m15, [fhq]
%if cpuflag(ssse3)
    pshufb       m12, m15, [PIC_sym(pb_6_7)]
    pshufb       m13, m15, [PIC_sym(pb_4)]
    pshufb       m14, m15, [PIC_sym(pb_2)]
    pshufb       m15, m15, [PIC_sym(pb_0)]
%else
    pshuflw      m12, m15, q3333
    punpcklbw    m15, m15
    pshufhw      m13, m15, q0000
    pshuflw      m14, m15, q2222
    pshuflw      m15, m15, q0000
    punpcklqdq   m12, m12
    punpckhqdq   m13, m13
    punpcklqdq   m14, m14
    punpcklqdq   m15, m15
    psraw        m13, 8
    psraw        m14, 8
    psraw        m15, 8
%endif

%if ARCH_X86_64
    mova         m11, [pw_2048]
    mova         m10, [pw_16380]
    lea          r11, [pb_right_ext_mask]

    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
%else
 %define m10 [PIC_sym(pw_16380)]
 %define m11 [PIC_sym(pw_2048)]
 %define m12 [esp+0x14]
 %define m13 [esp+0x24]
 %define m14 [esp+0x34]
 %define m15 [esp+0x44]
    mova         m12, m3
    mova         m13, m2
    mova         m14, m1
    mova         m15, m0

    DEFINE_ARGS dst, left, src, stride, x, w, h, edge
 %define srcptrq srcq
 %define dstptrq dstq
 %define hd dword [esp+ 0]
 %define edgeb byte [esp+12]
 %define xlimd dword [esp+16]
%endif

    ; if (edge & has_right) align_w_to_16
    ; else w -= 3, and use that as limit in x loop
    test       edgeb, 2 ; has_right
    jnz .align
    mov        xlimd, -3
    jmp .loop
.align:
    add           wd, 15
    and           wd, ~15
%if ARCH_X86_64
    xor        xlimd, xlimd
%else
    mov        xlimd, 0
%endif

    ; main y loop for horizontal filter
.loop:
%if ARCH_X86_64
    mov      srcptrq, srcq
    mov      dstptrq, dstq
    lea           xd, [wq+xlimq]
%else
    mov      [esp+8], srcq
    mov      [esp+4], dstq
    mov           xd, xlimd
    add           xd, wd
%endif

    ; load left edge pixels
    test       edgeb, 1 ; have_left
    jz .emu_left
    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
    jz .load_left_combined
    movd          m0, [leftq]
    movd          m1, [srcq]
    punpckldq     m0, m1
    pslldq        m0, 9
    add        leftq, 4
    jmp .left_load_done
.load_left_combined:
    movq          m0, [srcq-3]
    pslldq        m0, 10
    jmp .left_load_done
.emu_left:
    movd          m0, [srcq]
%if cpuflag(ssse3)
    pshufb        m0, [PIC_sym(pb_14x0_1_2)]
%else
    pslldq        m1, m0, 13
    punpcklbw     m0, m0
    pshuflw       m0, m0, q0000
    punpcklqdq    m0, m0
    psrldq        m0, 2
    por           m0, m1
%endif

    ; load right edge pixels
.left_load_done:
    cmp           xd, 16
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    je .splat_right

    ; for very small images (w=[1-2]), edge-extend the original cache,
    ; ugly, but only runs in very odd cases
%if cpuflag(ssse3)
    add           wd, wd
 %if ARCH_X86_64
    pshufb        m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
 %else
    pshufb        m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
 %endif
    shr           wd, 1
%else
    shl           wd, 4
    pcmpeqd       m2, m2
    movd          m3, wd
    psrldq        m2, 2
    punpckhbw     m1, m0, m0
    pshufhw       m1, m1, q1122
    psllq         m1, m3
    pand          m0, m2
    pandn         m2, m1
    por           m0, m2
    shr           wd, 4
%endif

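; NOTE (informal summary, not part of the upstream comments): the comparisons
; above pick one of three entry points into the x loop below:
;   .main_load      - a full 16-byte load is still available
;   .load_and_splat - partial load, with the last pixel splatted rightwards
;   .splat_right    - nothing left to load, extend the previous load only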
    ; main x loop, mostly this starts in .main_load
.splat_right:
    ; no need to load new pixels, just extend them from the (possibly previously
    ; extended) previous load into m0
%if cpuflag(ssse3)
    pshufb        m1, m0, [PIC_sym(pb_15)]
%else
    punpckhbw     m1, m0, m0
    pshufhw       m1, m1, q3333
    punpckhqdq    m1, m1
%endif
    jmp .main_loop
.load_and_splat:
    ; load new pixels and extend edge for right-most
    movu          m1, [srcptrq+3]
%if ARCH_X86_64
    sub          r11, xq
    movu          m2, [r11+16]
    add          r11, xq
%else
    sub      PIC_reg, xd
    movu          m2, [PIC_sym(pb_right_ext_mask)+16]
    add      PIC_reg, xd
%endif
    movd          m3, [srcptrq+2+xq]
%if cpuflag(ssse3)
    pshufb        m3, [PIC_sym(pb_0)]
%else
    punpcklbw     m3, m3
    pshuflw       m3, m3, q0000
    punpcklqdq    m3, m3
%endif
    pand          m1, m2
    pxor          m2, [PIC_sym(pb_right_ext_mask)]
    pand          m3, m2
    pxor          m2, [PIC_sym(pb_right_ext_mask)]
    por           m1, m3
    jmp .main_loop
.main_load:
    ; load subsequent line
    movu          m1, [srcptrq+3]
.main_loop:
%if ARCH_X86_64
    PALIGNR       m2, m1, m0, 10
    PALIGNR       m3, m1, m0, 11
    PALIGNR       m4, m1, m0, 12
    PALIGNR       m5, m1, m0, 13
    PALIGNR       m6, m1, m0, 14
    PALIGNR       m7, m1, m0, 15

    punpcklbw     m0, m2, m1
    punpckhbw     m2, m1
    punpcklbw     m8, m3, m7
    punpckhbw     m3, m7
    punpcklbw     m7, m4, m6
    punpckhbw     m4, m6
    PMADDUBSW     m0, m15, m6, m9, 1
    PMADDUBSW     m2, m15, m6, m9, 0
    PMADDUBSW     m8, m14, m6, m9, 0
    PMADDUBSW     m3, m14, m6, m9, 0
    PMADDUBSW     m7, m13, m6, m9, 0
    PMADDUBSW     m4, m13, m6, m9, 0
    paddw         m0, m8
    paddw         m2, m3
 %if cpuflag(ssse3)
    pxor          m6, m6
 %endif
    punpcklbw     m3, m5, m6
    punpckhbw     m5, m6
    psllw         m8, m3, 7
    psllw         m6, m5, 7
    psubw         m8, m10
    psubw         m6, m10
    pmullw        m3, m12
    pmullw        m5, m12
    paddw         m0, m7
    paddw         m2, m4
    paddw         m0, m3
    paddw         m2, m5
    paddsw        m0, m8 ; see the avx2 for an explanation
    paddsw        m2, m6 ; of how the clipping works here
    psraw         m0, 3
    psraw         m2, 3
    paddw         m0, m11
    paddw         m2, m11
    mova [dstptrq+ 0], m0
    mova [dstptrq+16], m2
%else
    PALIGNR       m2, m1, m0, 10
    punpcklbw     m3, m2, m1
    punpckhbw     m2, m1
    PMADDUBSW     m3, m15, m4, m5, 1
    PMADDUBSW     m2, m15, m4, m5, 0
    PALIGNR       m4, m1, m0, 11
    PALIGNR       m5, m1, m0, 15
    punpcklbw     m6, m4, m5
    punpckhbw     m4, m5
    PMADDUBSW     m6, m14, m5, m7, 1
    PMADDUBSW     m4, m14, m5, m7, 0
    paddw         m3, m6
    paddw         m2, m4
    PALIGNR       m4, m1, m0, 12
    PALIGNR       m5, m1, m0, 14
    punpcklbw     m6, m4, m5
    punpckhbw     m4, m5
    PMADDUBSW     m6, m13, m5, m7, 1
    PMADDUBSW     m4, m13, m5, m7, 0
    paddw         m3, m6
    paddw         m2, m4
    PALIGNR       m6, m1, m0, 13
 %if cpuflag(ssse3)
    pxor          m5, m5
 %endif
    punpcklbw     m4, m6, m5
    punpckhbw     m6, m5
    psllw         m5, m4, 7
    psllw         m7, m6, 7
    psubw         m5, m10
    psubw         m7, m10
    pmullw        m4, m12
    pmullw        m6, m12
    paddw         m3, m4
    paddw         m2, m6
    paddsw        m3, m5
    paddsw        m2, m7
    psraw         m3, 3
    psraw         m2, 3
    paddw         m3, m11
    paddw         m2, m11
    mova [dstptrq+ 0], m3
    mova [dstptrq+16], m2
%endif

    mova          m0, m1
    add      srcptrq, 16
    add      dstptrq, 32
    sub           xd, 16
    cmp           xd, 16
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    cmp           xd, xlimd
    jg .splat_right

%if ARCH_X86_32
    mov         srcq, [esp+8]
    mov         dstq, [esp+4]
%endif
    add         srcq, strideq
    add         dstq, 384*2
    dec           hd
    jg .loop
    RET
%endmacro

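; NOTE (informal summary, not part of the upstream comments): each horizontal
; output above is stored as a biased 16-bit intermediate, roughly
;   mid[x] = ((fh[0..6]*src[x-3..x+3] + 128*src[x] - 16380) >> 3) + 2048
; with signed saturation applied before the shift (see the avx2 version for
; the full explanation); the mid buffer has a fixed stride of 384*2 bytes,
; which is what the vertical pass below consumes.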
%macro WIENER_V 0
%if ARCH_X86_64
cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
    mov        edged, edgem
    movifnidn    fvq, fvmp
    movifnidn     hd, hm
    movq         m15, [fvq]
    pshufd       m14, m15, q1111
    pshufd       m15, m15, q0000
    paddw        m14, [pw_0_128]
    mova         m12, [pd_1024]

    DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr

    mov        ylimd, edged
    and        ylimd, 8 ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 3
%else
cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
 %define ylimd [esp+12]

    mov          r5d, edgem
    and          r5d, 8
    shr          r5d, 2
    sub          r5d, 3
    mov        ylimd, r5d
    mov          fvq, fvmp
    mov        edged, edgem

    SETUP_PIC edged

    movq          m0, [fvq]
    pshufd        m1, m0, q1111
    pshufd        m0, m0, q0000
    paddw         m1, [PIC_sym(pw_0_128)]
    mova  [esp+0x50], m0
    mova  [esp+0x40], m1

    DEFINE_ARGS dst, stride, mid, w, h, y, edge
 %define mptrq midq
 %define dstptrq dstq
 %define edgeb byte [esp]
%endif

    ; main x loop for vertical filter, does one column of 8 pixels
.loop_x:
    mova          m3, [midq] ; middle line

    ; load top pixels
    test       edgeb, 4 ; have_top
    jz .emu_top
    mova          m0, [midq-384*4]
    mova          m2, [midq-384*2]
    mova          m1, m0
    jmp .load_bottom_pixels
.emu_top:
    mova          m0, m3
    mova          m1, m3
    mova          m2, m3

    ; load bottom pixels
.load_bottom_pixels:
    mov           yd, hd
%if ARCH_X86_64
    mov        mptrq, midq
    mov      dstptrq, dstq
    add           yd, ylimd
%else
    mov      [esp+8], midq
    mov      [esp+4], dstq
    add           yd, ylimd
%endif
    jg .load_threelines

    ; the remainder here is somewhat messy but only runs in very weird
    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
    ; so performance is not terribly important here...
    je .load_twolines
    cmp           yd, -1
    je .load_oneline
    ; h == 1 case
    mova          m5, m3
    mova          m4, m3
    mova          m6, m3
    jmp .loop
.load_oneline:
    ; h == 2 case
    mova          m4, [midq+384*2]
    mova          m5, m4
    mova          m6, m4
    jmp .loop
.load_twolines:
    ; h == 3 case
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    mova          m6, m5
    jmp .loop
.load_threelines:
    ; h > 3 case
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    ; third line loaded in main loop below

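; NOTE (informal summary, not part of the upstream comments): the loop below
; keeps a sliding window of seven mid rows in m0-m6 (m3 is the centre row);
; each iteration loads one new row, applies the symmetric 7-tap fv filter via
; pmaddwd on interleaved row pairs, then shifts the window down by one row,
; so the common case costs a single load per output row.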
    ; main y loop for vertical filter
.loop_load:
    ; load one line into m6. if that pixel is no longer available, do
    ; nothing, since m6 still has the data from the previous line in it. We
    ; try to structure the loop so that the common case is evaluated fastest
    mova          m6, [mptrq+384*6]
.loop:
%if ARCH_X86_64
    paddw         m7, m0, m6
    paddw         m8, m1, m5
    paddw         m9, m2, m4
    punpcklwd    m10, m7, m8
    punpckhwd     m7, m8
    punpcklwd    m11, m9, m3
    punpckhwd     m9, m3
    pmaddwd      m10, m15
    pmaddwd       m7, m15
    pmaddwd      m11, m14
    pmaddwd       m9, m14
    paddd        m10, m12
    paddd         m7, m12
    paddd        m10, m11
    paddd         m7, m9
    psrad        m10, 11
    psrad         m7, 11
    packssdw     m10, m7
    packuswb     m10, m10
    movq   [dstptrq], m10
%else
    mova  [esp+0x30], m1
    mova  [esp+0x20], m2
    mova  [esp+0x10], m3
    paddw         m0, m6
    paddw         m1, m5
    paddw         m2, m4
    punpcklwd     m7, m2, m3
    punpckhwd     m2, m3
    punpcklwd     m3, m0, m1
    punpckhwd     m0, m1
    mova          m1, [esp+0x50]
    pmaddwd       m3, m1
    pmaddwd       m0, m1
    mova          m1, [esp+0x40]
    pmaddwd       m7, m1
    pmaddwd       m2, m1
    paddd         m3, [PIC_sym(pd_1024)]
    paddd         m0, [PIC_sym(pd_1024)]
    paddd         m3, m7
    paddd         m0, m2
    psrad         m3, 11
    psrad         m0, 11
    packssdw      m3, m0
    packuswb      m3, m3
    movq      [dstq], m3
    mova          m1, [esp+0x30]
    mova          m2, [esp+0x20]
    mova          m3, [esp+0x10]
%endif
    ; shift pixels one position
    mova          m0, m1
    mova          m1, m2
    mova          m2, m3
    mova          m3, m4
    mova          m4, m5
    mova          m5, m6
    add        mptrq, 384*2
    add      dstptrq, strideq
    dec           yd
    jg .loop_load
    ; for the bottom pixels, continue using m6 (as extended edge)
    cmp           yd, ylimd
    jg .loop

%if ARCH_X86_32
    mov         midq, [esp+8]
    mov         dstq, [esp+4]
%endif
    add         midq, 16
    add         dstq, 8
    sub           wd, 8
    jg .loop_x
    RET
%endmacro

INIT_XMM sse2
WIENER_H
WIENER_V

INIT_XMM ssse3
WIENER_H
WIENER_V

;;;;;;;;;;;;;;;;;;;;;;;;;;
;;     self-guided      ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro MULLD 2
    pmulhuw       m5, %1, %2
    pmullw        %1, %2
    pslld         m5, 16
    paddd         %1, m5
%endmacro

%macro GATHERDD 2
    mova          m5, m7
    movd         r6d, %2
 %if ARCH_X86_64
    movd          %1, [r5+r6]
    pextrw       r6d, %2, 2
    pinsrw        m5, [r5+r6+2], 3
    pextrw       r6d, %2, 4
    pinsrw        %1, [r5+r6+2], 5
    pextrw       r6d, %2, 6
    pinsrw        m5, [r5+r6+2], 7
 %else
    movd          %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
    pextrw       r6d, %2, 2
    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
    pextrw       r6d, %2, 4
    pinsrw        %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
    pextrw       r6d, %2, 6
    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
 %endif
    por           %1, m5
%endmacro

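; NOTE (informal summary, not part of the upstream comments): MULLD emulates
; a 32x16-bit multiply per dword lane, with %2 splatted in words: pmullw
; supplies the low 16 bits of each partial product and pmulhuw the high 16,
; recombined with pslld/paddd. GATHERDD emulates a dword gather from the
; sgr_x_by_x table with pextrw/pinsrw; the upper lanes only get their high
; word filled in (callers shift right by 24 afterwards), and two destination
; registers (%1 and m5, merged by por) keep the insert chains parallel.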
%if ARCH_X86_64
cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
    mov        xlimd, edgem
    movifnidn     xd, xm
    mov           hd, hm
    mov        edged, xlimd
    and        xlimd, 2 ; have_right
    add           xd, xlimd
    xor        xlimd, 2 ; 2*!have_right
%else
cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
 %define wq r0m
 %define xlimd r1m
 %define hd hmp
 %define edgeb byte edgem

    mov           r6, edgem
    and           r6, 2 ; have_right
    add           xd, r6
    xor           r6, 2 ; 2*!have_right
    mov        xlimd, r6
    SETUP_PIC r6, 0
%endif

    jnz .no_right
    add           xd, 7
    and           xd, ~7
.no_right:
    pxor          m1, m1
    lea         srcq, [srcq+xq]
    lea         sumq, [sumq+xq*2-2]
    lea       sumsqq, [sumsqq+xq*4-4]
    neg           xq
    mov           wq, xq
%if ARCH_X86_64
    lea          r10, [pb_right_ext_mask+16]
%endif
.loop_y:
    mov           xq, wq

    ; load left
    test       edgeb, 1 ; have_left
    jz .no_left
    test       leftq, leftq
    jz .load_left_from_main
    movd          m0, [leftq]
    pslldq        m0, 12
    add        leftq, 4
    jmp .expand_x
.no_left:
    movd          m0, [srcq+xq]
    pshufb        m0, [PIC_sym(pb_0)]
    jmp .expand_x
.load_left_from_main:
    movd          m0, [srcq+xq-2]
    pslldq        m0, 14
.expand_x:
    punpckhbw    xm0, xm1

    ; when we reach this, m0 contains left two px in highest words
    cmp           xd, -8
    jle .loop_x
.partial_load_and_extend:
    movd          m3, [srcq-4]
    pshufb        m3, [PIC_sym(pb_3)]
    movq          m2, [srcq+xq]
    punpcklbw     m2, m1
    punpcklbw     m3, m1
%if ARCH_X86_64
    movu          m4, [r10+xq*2]
%else
    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
%endif
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    pshufb        m2, m0, [PIC_sym(pb_14_15)]
    jmp .loop_x_noload

.loop_x:
    movq          m2, [srcq+xq]
    punpcklbw     m2, m1
.loop_x_noload:
    palignr       m3, m2, m0, 12
    palignr       m4, m2, m0, 14

    punpcklwd     m5, m3, m2
    punpckhwd     m6, m3, m2
    paddw         m3, m4
    punpcklwd     m7, m4, m1
    punpckhwd     m4, m1
    pmaddwd       m5, m5
    pmaddwd       m6, m6
    pmaddwd       m7, m7
    pmaddwd       m4, m4
    paddd         m5, m7
    paddd         m6, m4
    paddw         m3, m2
    movu  [sumq+xq*2], m3
    movu [sumsqq+xq*4+ 0], m5
    movu [sumsqq+xq*4+16], m6

    mova          m0, m2
    add           xq, 8

    ; if x <= -8 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -8
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    add         srcq, strideq
    dec           hd
    jg .loop_y
    RET

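; NOTE (informal summary, not part of the upstream comments): the box sums
; are separable. sgr_box3_h above writes, per pixel, the horizontal 3-tap
; sums
;   sum[x]   = s[x-1] + s[x] + s[x+1]          (16 bit)
;   sumsq[x] = s[x-1]^2 + s[x]^2 + s[x+1]^2    (32 bit)
; into (384+16)-wide rows; sgr_box3_v below then adds three vertically
; adjacent rows in place to complete the 3x3 box.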
%if ARCH_X86_64
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
    movifnidn  edged, edgem
%else
cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
 %define sumsq_baseq dword [esp+0]
 %define sum_baseq dword [esp+4]
 %define ylimd dword [esp+8]
 %define m8 [esp+12]
    mov        edged, r4m
    mov           hd, r3m
%endif
    mov           xq, -2
%if ARCH_X86_64
    mov        ylimd, edged
    and        ylimd, 8 ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 2 ; -2 if have_bottom=0, else 0
    mov  sumsq_baseq, sumsqq
    mov    sum_baseq, sumq
.loop_x:
    mov       sumsqq, sumsq_baseq
    mov         sumq, sum_baseq
    lea           yd, [hq+ylimq+2]
%else
    mov           yd, edged
    and           yd, 8 ; have_bottom
    shr           yd, 2
    sub           yd, 2 ; -2 if have_bottom=0, else 0
    mov  sumsq_baseq, sumsqq
    mov    sum_baseq, sumq
    mov        ylimd, yd
.loop_x:
    mov       sumsqd, sumsq_baseq
    mov         sumd, sum_baseq
    lea           yd, [hq+2]
    add           yd, ylimd
%endif
    lea       sumsqq, [sumsqq+xq*4+4-(384+16)*4]
    lea         sumq, [sumq+xq*2+2-(384+16)*2]
    test       edgeb, 4 ; have_top
    jnz .load_top
    movu          m0, [sumsqq+(384+16)*4*1]
    movu          m1, [sumsqq+(384+16)*4*1+16]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    movu          m6, [sumq+(384+16)*2*1]
    mova          m7, m6
    mova          m8, m6
    jmp .loop_y_noload
.load_top:
    movu          m0, [sumsqq-(384+16)*4*1]    ; l2sq [left]
    movu          m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right]
    movu          m2, [sumsqq-(384+16)*4*0]    ; l1sq [left]
    movu          m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right]
    movu          m6, [sumq-(384+16)*2*1]      ; l2
    movu          m7, [sumq-(384+16)*2*0]      ; l1
.loop_y:
%if ARCH_X86_64
    movu          m8, [sumq+(384+16)*2*1]      ; l0
%else
    movu          m4, [sumq+(384+16)*2*1]      ; l0
    mova          m8, m4
%endif
    movu          m4, [sumsqq+(384+16)*4*1]    ; l0sq [left]
    movu          m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right]
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw         m6, m7
    paddd         m0, m4
    paddd         m1, m5
    paddw         m6, m8
    movu [sumsqq+ 0], m0
    movu [sumsqq+16], m1
    movu      [sumq], m6

    ; shift position down by one
    mova          m0, m2
    mova          m1, m3
    mova          m2, m4
    mova          m3, m5
    mova          m6, m7
    mova          m7, m8
    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    dec           yd
    jg .loop_y
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    RET

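; NOTE (informal summary, not part of the upstream comments): sgr_calc_ab1
; below implements, for the 3x3 box (n = 9, one_by_x = 455):
;   p = sumsq*9 - sum*sum
;   z = (p*s + (1 << 19)) >> 20, clamped to [0, 255]
;   x = sgr_x_by_x[z]
; pd_0xF00801C7 packs the 455 multiplier (low word) with the rounding bias
; and the paddusw-based clamp (high word); the outputs written back are
; 256-x into the b rows and (x*sum*455 + (1 << 11)) >> 12 into the a rows.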
cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
    movifnidn     sd, sm
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
%if ARCH_X86_64
    LEA           r5, sgr_x_by_x-0xF03
%else
    SETUP_PIC r5, 0
%endif
    movd          m6, sd
    pshuflw       m6, m6, q0000
    punpcklqdq    m6, m6
    pxor          m7, m7
    DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64
    mova          m8, [pd_0xF00801C7]
    mova          m9, [pw_256]
    psrld        m10, m9, 13 ; pd_2048
    mova         m11, [pb_unpcklwdw]
%else
 %define m8 [PIC_sym(pd_0xF00801C7)]
 %define m9 [PIC_sym(pw_256)]
 %define m10 [PIC_sym(pd_2048)]
 %define m11 [PIC_sym(pb_unpcklwdw)]
%endif
.loop_y:
    mov           xq, -2
.loop_x:
    movq          m0, [bq+xq*2]
    movq          m1, [bq+xq*2+(384+16)*2]
    punpcklwd     m0, m7
    punpcklwd     m1, m7
    movu          m2, [aq+xq*4]
    movu          m3, [aq+xq*4+(384+16)*4]
    pslld         m4, m2, 3
    pslld         m5, m3, 3
    paddd         m2, m4 ; aa * 9
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    pmaddwd       m0, m8
    pmaddwd       m1, m8
    psubd         m2, m4 ; p = aa * 9 - bb * bb
    psubd         m3, m5
    MULLD         m2, m6
    MULLD         m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20 ; z
    psrld         m3, 20
    GATHERDD      m4, m2 ; xx
    GATHERDD      m2, m3
    psrld         m4, 24
    psrld         m2, 24
    packssdw      m3, m4, m2
    pshufb        m4, m11
    MULLD         m0, m4
    pshufb        m2, m11
    MULLD         m1, m2
    psubw         m5, m9, m3
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 12
    psrld         m1, 12
    movq   [bq+xq*2], m5
    psrldq        m5, 8
    movq [bq+xq*2+(384+16)*2], m5
    movu  [aq+xq*4], m0
    movu [aq+xq*4+(384+16)*4], m1
    add           xd, 4
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET

%if ARCH_X86_64
cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
                            tmp_base, src_base, a_base, b_base, x, y
    movifnidn     wd, wm
    mov           hd, hm
    mova         m15, [pw_16]
    mov    tmp_baseq, tq
    mov    src_baseq, srcq
    mov      a_baseq, aq
    mov      b_baseq, bq
    xor           xd, xd
%else
cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
 %define tmp_baseq [esp+8]
 %define src_baseq [esp+12]
 %define a_baseq [esp+16]
 %define b_baseq [esp+20]
 %define wd [esp+24]
 %define hd [esp+28]
    mov    tmp_baseq, tq
    mov    src_baseq, srcq
    mov      a_baseq, aq
    mov      b_baseq, bq
    mov           wd, xd
    mov           hd, yd
    xor           xd, xd
    SETUP_PIC yd, 1, 1
    jmp .loop_start
%endif

.loop_x:
    mov           tq, tmp_baseq
    mov         srcq, src_baseq
    mov           aq, a_baseq
    mov           bq, b_baseq
%if ARCH_X86_32
.loop_start:
    movu          m0, [bq+xq*2-(384+16)*2-2]
    movu          m2, [bq+xq*2-(384+16)*2+2]
    mova          m1, [bq+xq*2-(384+16)*2] ; b:top
    paddw         m0, m2 ; b:tl+tr
    movu          m2, [bq+xq*2-2]
    movu          m3, [bq+xq*2+2]
    paddw         m1, [bq+xq*2] ; b:top+ctr
    paddw         m2, m3 ; b:l+r
    mova  [esp+0x80], m0
    mova  [esp+0x70], m1
    mova  [esp+0x60], m2
%endif
    movu          m0, [aq+xq*4-(384+16)*4-4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    mova          m1, [aq+xq*4-(384+16)*4] ; a:top [first half]
    paddd         m0, m2 ; a:tl+tr [first half]
    movu          m2, [aq+xq*4-(384+16)*4-4+16]
    movu          m4, [aq+xq*4-(384+16)*4+4+16]
    mova          m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half]
    paddd         m2, m4 ; a:tl+tr [second half]
    movu          m4, [aq+xq*4-4]
    movu          m5, [aq+xq*4+4]
    paddd         m1, [aq+xq*4] ; a:top+ctr [first half]
    paddd         m4, m5 ; a:l+r [first half]
    movu          m5, [aq+xq*4+16-4]
    movu          m6, [aq+xq*4+16+4]
    paddd         m3, [aq+xq*4+16] ; a:top+ctr [second half]
    paddd         m5, m6 ; a:l+r [second half]
%if ARCH_X86_64
    movu          m6, [bq+xq*2-(384+16)*2-2]
    movu          m8, [bq+xq*2-(384+16)*2+2]
    mova          m7, [bq+xq*2-(384+16)*2] ; b:top
    paddw         m6, m8 ; b:tl+tr
    movu          m8, [bq+xq*2-2]
    movu          m9, [bq+xq*2+2]
    paddw         m7, [bq+xq*2] ; b:top+ctr
    paddw         m8, m9 ; b:l+r
%endif

    lea           tq, [tq+xq*2]
    lea         srcq, [srcq+xq*1]
    lea           aq, [aq+xq*4+(384+16)*4]
    lea           bq, [bq+xq*2+(384+16)*2]
    mov           yd, hd
.loop_y:
%if ARCH_X86_64
    movu          m9, [bq-2]
    movu         m10, [bq+2]
    paddw         m7, [bq] ; b:top+ctr+bottom
    paddw         m9, m10 ; b:bl+br
    paddw        m10, m7, m8 ; b:top+ctr+bottom+l+r
    paddw         m6, m9 ; b:tl+tr+bl+br
    psubw         m7, [bq-(384+16)*2*2] ; b:ctr+bottom
    paddw        m10, m6
    psllw        m10, 2
    psubw        m10, m6 ; aa
    pxor         m14, m14
    movq         m12, [srcq]
    punpcklbw    m12, m14
    punpcklwd     m6, m10, m15
    punpckhwd    m10, m15
    punpcklwd    m13, m12, m15
    punpckhwd    m12, m15
    pmaddwd       m6, m13 ; aa*src[x]+256 [first half]
    pmaddwd      m10, m12 ; aa*src[x]+256 [second half]
%else
    paddd         m1, [aq] ; a:top+ctr+bottom [first half]
    paddd         m3, [aq+16] ; a:top+ctr+bottom [second half]
    mova  [esp+0x50], m1
    mova  [esp+0x40], m3
    mova  [esp+0x30], m4
    movu          m6, [aq-4]
    movu          m7, [aq+4]
    paddd         m1, m4 ; a:top+ctr+bottom+l+r [first half]
    paddd         m3, m5 ; a:top+ctr+bottom+l+r [second half]
    paddd         m6, m7 ; a:bl+br [first half]
    movu          m7, [aq+16-4]
    movu          m4, [aq+16+4]
    paddd         m7, m4 ; a:bl+br [second half]
    paddd         m0, m6 ; a:tl+tr+bl+br [first half]
    paddd         m2, m7 ; a:tl+tr+bl+br [second half]
    paddd         m1, m0
    paddd         m3, m2
    pslld         m1, 2
    pslld         m3, 2
    psubd         m1, m0 ; bb [first half]
    psubd         m3, m2 ; bb [second half]
%endif

%if ARCH_X86_64
    movu         m11, [aq-4]
    movu         m12, [aq+4]
    paddd         m1, [aq] ; a:top+ctr+bottom [first half]
    paddd        m11, m12 ; a:bl+br [first half]
    movu         m12, [aq+16-4]
    movu         m13, [aq+16+4]
    paddd         m3, [aq+16] ; a:top+ctr+bottom [second half]
    paddd        m12, m13 ; a:bl+br [second half]
    paddd        m13, m1, m4 ; a:top+ctr+bottom+l+r [first half]
    paddd        m14, m3, m5 ; a:top+ctr+bottom+l+r [second half]
    paddd         m0, m11 ; a:tl+tr+bl+br [first half]
    paddd         m2, m12 ; a:tl+tr+bl+br [second half]
    paddd        m13, m0
    paddd        m14, m2
    pslld        m13, 2
    pslld        m14, 2
    psubd        m13, m0 ; bb [first half]
    psubd        m14, m2 ; bb [second half]
    psubd         m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
    psubd         m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
%else
    mova          m4, [esp+0x80]
    mova  [esp+0x80], m5
    mova          m5, [esp+0x70]
    mova  [esp+0x70], m6
    mova          m6, [esp+0x60]
    mova  [esp+0x60], m7
    mova  [esp+0x20], m1
    movu          m7, [bq-2]
    movu          m1, [bq+2]
    paddw         m5, [bq] ; b:top+ctr+bottom
    paddw         m7, m1
    paddw         m1, m5, m6 ; b:top+ctr+bottom+l+r
    paddw         m4, m7 ; b:tl+tr+bl+br
    psubw         m5, [bq-(384+16)*2*2] ; b:ctr+bottom
    paddw         m1, m4
    psllw         m1, 2
    psubw         m1, m4 ; aa
    movq          m0, [srcq]
    XCHG_PIC_REG
    punpcklbw     m0, [PIC_sym(pb_right_ext_mask)+16]
    punpcklwd     m4, m1, [PIC_sym(pw_16)]
    punpckhwd     m1, [PIC_sym(pw_16)]
    punpcklwd     m2, m0, [PIC_sym(pw_16)]
    punpckhwd     m0, [PIC_sym(pw_16)]
    XCHG_PIC_REG
    pmaddwd       m4, m2 ; aa*src[x]+256 [first half]
    pmaddwd       m1, m0 ; aa*src[x]+256 [second half]
%endif

%if ARCH_X86_64
    paddd         m6, m13
    paddd        m10, m14
    psrad         m6, 9
    psrad        m10, 9
    packssdw      m6, m10
    mova        [tq], m6
%else
    paddd         m4, [esp+0x20]
    paddd         m1, m3
    psrad         m4, 9
    psrad         m1, 9
    packssdw      m4, m1
    mova        [tq], m4
%endif

    ; shift to next row
%if ARCH_X86_64
    mova          m0, m4
    mova          m2, m5
    mova          m4, m11
    mova          m5, m12
    mova          m6, m8
    mova          m8, m9
%else
    mova          m1, [esp+0x50]
    mova          m3, [esp+0x40]
    mova          m0, [esp+0x30]
    mova          m2, [esp+0x80]
    mova          m4, [esp+0x70]
    mova  [esp+0x70], m5
    mova          m5, [esp+0x60]
    mova  [esp+0x80], m6
    mova  [esp+0x60], m7
    psubd         m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
    psubd         m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
%endif

    add         srcq, strideq
    add           aq, (384+16)*4
    add           bq, (384+16)*2
    add           tq, 384*2
    dec           yd
    jg .loop_y
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    RET

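; NOTE (informal summary, not part of the upstream comments): the finish
; filter above weights the 3x3 a/b neighbourhood with 4 on the plus-shaped
; positions and 3 on the diagonals (computed as (plus + diag)*4 - diag), then
; stores t[x] = (aa*src[x] + bb + (1 << 8)) >> 9 as the 16-bit intermediate
; consumed by sgr_weighted1 below, which blends it back into dst roughly as
;   dst[x] += (wt*(t[x] - (dst[x] << 4)) + (1 << 10)) >> 11
; using pmulhrsw with the weight pre-shifted left by 4.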
cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
    movifnidn     hd, hm
%if ARCH_X86_32
    SETUP_PIC r6, 0
%endif
    movd          m0, wtm
    pshufb        m0, [PIC_sym(pb_0_1)]
    psllw         m0, 4
    pxor          m7, m7
    DEFINE_ARGS dst, stride, t, w, h, idx
.loop_y:
    xor         idxd, idxd
.loop_x:
    mova          m1, [tq+idxq*2+ 0]
    mova          m4, [tq+idxq*2+16]
    mova          m5, [dstq+idxq]
    punpcklbw     m2, m5, m7
    punpckhbw     m5, m7
    psllw         m3, m2, 4
    psllw         m6, m5, 4
    psubw         m1, m3
    psubw         m4, m6
    pmulhrsw      m1, m0
    pmulhrsw      m4, m0
    paddw         m1, m2
    paddw         m4, m5
    packuswb      m1, m4
    mova [dstq+idxq], m1
    add         idxd, 16
    cmp         idxd, wd
    jl .loop_x
    add         dstq, strideq
    add           tq, 384 * 2
    dec           hd
    jg .loop_y
    RET

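; NOTE (informal summary, not part of the upstream comments): the box5
; functions below mirror the box3 pair with five taps per direction; since
; the radius-2 half of the self-guided filter is only evaluated on every
; second row, the vertical pass advances two rows per iteration and the
; finish filter later produces two output rows per iteration.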
%if ARCH_X86_64
cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    mov        edged, edgem
    movifnidn     wd, wm
    mov           hd, hm
    mova         m10, [pb_0]
    mova         m11, [pb_0_1]
%else
cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
 %define edgeb byte edgem
 %define wd xd
 %define wq wd
 %define wm r5m
 %define strideq r4m
    SUB          esp, 8
    SETUP_PIC sumsqd, 1, 1

 %define m10 [PIC_sym(pb_0)]
 %define m11 [PIC_sym(pb_0_1)]
%endif

    test       edgeb, 2 ; have_right
    jz .no_right
    xor        xlimd, xlimd
    add           wd, 2
    add           wd, 15
    and           wd, ~15
    jmp .right_done
.no_right:
    mov        xlimd, 3
    dec           wd
.right_done:
    pxor          m1, m1
    lea         srcq, [srcq+wq+1]
    lea         sumq, [sumq+wq*2-2]
    lea       sumsqq, [sumsqq+wq*4-4]
    neg           wq
%if ARCH_X86_64
    lea          r10, [pb_right_ext_mask+16]
%else
    mov           wm, xd
 %define wq wm
%endif

.loop_y:
    mov           xq, wq
    ; load left
    test       edgeb, 1 ; have_left
    jz .no_left
    test       leftq, leftq
    jz .load_left_from_main
    movd          m0, [leftq]
    movd          m2, [srcq+xq-1]
    pslldq        m2, 4
    por           m0, m2
    pslldq        m0, 11
    add        leftq, 4
    jmp .expand_x
.no_left:
    movd          m0, [srcq+xq-1]
    XCHG_PIC_REG
    pshufb        m0, m10
    XCHG_PIC_REG
    jmp .expand_x
.load_left_from_main:
    movd          m0, [srcq+xq-4]
    pslldq        m0, 12
.expand_x:
    punpckhbw     m0, m1

    ; when we reach this, m0 contains left two px in highest words
    cmp           xd, -8
    jle .loop_x
    test          xd, xd
    jge .right_extend
.partial_load_and_extend:
    XCHG_PIC_REG
    movd          m3, [srcq-1]
    movq          m2, [srcq+xq]
    pshufb        m3, m10
    punpcklbw     m3, m1
    punpcklbw     m2, m1
%if ARCH_X86_64
    movu          m4, [r10+xq*2]
%else
    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
    XCHG_PIC_REG
%endif
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    psrldq        m2, m0, 14
    XCHG_PIC_REG
    pshufb        m2, m11
    XCHG_PIC_REG
    jmp .loop_x_noload

.loop_x:
    movq          m2, [srcq+xq]
    punpcklbw     m2, m1
.loop_x_noload:
    palignr       m3, m2, m0, 8
    palignr       m4, m2, m0, 10
    palignr       m5, m2, m0, 12
    palignr       m6, m2, m0, 14

%if ARCH_X86_64
    paddw         m0, m3, m2
    punpcklwd     m7, m3, m2
    punpckhwd     m3, m2
    paddw         m0, m4
    punpcklwd     m8, m4, m5
    punpckhwd     m4, m5
    paddw         m0, m5
    punpcklwd     m9, m6, m1
    punpckhwd     m5, m6, m1
    paddw         m0, m6
    pmaddwd       m7, m7
    pmaddwd       m3, m3
    pmaddwd       m8, m8
    pmaddwd       m4, m4
    pmaddwd       m9, m9
    pmaddwd       m5, m5
    paddd         m7, m8
    paddd         m3, m4
    paddd         m7, m9
    paddd         m3, m5
    movu  [sumq+xq*2], m0
    movu [sumsqq+xq*4+ 0], m7
    movu [sumsqq+xq*4+16], m3
%else
    paddw         m0, m3, m2
    paddw         m0, m4
    paddw         m0, m5
    paddw         m0, m6
    movu  [sumq+xq*2], m0
    punpcklwd     m7, m3, m2
    punpckhwd     m3, m2
    punpcklwd     m0, m4, m5
    punpckhwd     m4, m5
    punpckhwd     m5, m6, m1
    pmaddwd       m7, m7
    pmaddwd       m3, m3
    pmaddwd       m0, m0
    pmaddwd       m4, m4
    pmaddwd       m5, m5
    paddd         m7, m0
    paddd         m3, m4
    paddd         m3, m5
    punpcklwd     m0, m6, m1
    pmaddwd       m0, m0
    paddd         m7, m0
    movu [sumsqq+xq*4+ 0], m7
    movu [sumsqq+xq*4+16], m3
%endif

    mova          m0, m2
    add           xq, 8

    ; if x <= -8 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -8
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add         srcq, strideq
    add       sumsqq, (384+16)*4
    add         sumq, (384+16)*2
    dec           hd
    jg .loop_y
%if ARCH_X86_32
    ADD          esp, 8
%endif
    RET

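; NOTE (informal summary, not part of the upstream comments): on x86-32 there
; are not enough XMM registers to carry the sumsq and sum columns of the
; 5-row window at once, so the 32-bit path of sgr_box5_v below makes two
; separate passes over the buffers: first the squared sums (.sumsq_* labels),
; then the plain sums (.sum_* labels).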
%if ARCH_X86_64
cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
    movifnidn  edged, edgem
    mov        ylimd, edged
%else
cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
 %define wm [esp+0]
 %define hm [esp+4]
 %define edgem [esp+8]
    mov           wm, xd
    mov           hm, yd
    mov        edgem, ylimd
%endif

    and        ylimd, 8 ; have_bottom
    shr        ylimd, 2
    sub        ylimd, 3 ; -3 if have_bottom=0, else -1
    mov           xq, -2
%if ARCH_X86_64
.loop_x:
    lea           yd, [hd+ylimd+2]
    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea     sum_ptrq, [  sumq+xq*2+2-(384+16)*2]
    test       edgeb, 4 ; have_top
    jnz .load_top
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    mova          m6, m0
    mova          m7, m1
    movu         m10, [sum_ptrq+(384+16)*2*1]
    mova         m11, m10
    mova         m12, m10
    mova         m13, m10
    jmp .loop_y_second_load
.load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]    ; l3/4sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
    movu          m4, [sumsq_ptrq-(384+16)*4*0]    ; l2sq [left]
    movu          m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
    mova          m2, m0
    mova          m3, m1
    movu         m10, [sum_ptrq-(384+16)*2*1]      ; l3/4
    movu         m12, [sum_ptrq-(384+16)*2*0]      ; l2
    mova         m11, m10
.loop_y:
    movu          m6, [sumsq_ptrq+(384+16)*4*1]    ; l1sq [left]
    movu          m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
    movu         m13, [sum_ptrq+(384+16)*2*1]      ; l1
.loop_y_second_load:
    test          yd, yd
    jle .emulate_second_load
    movu          m8, [sumsq_ptrq+(384+16)*4*2]    ; l0sq [left]
    movu          m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
    movu         m14, [sum_ptrq+(384+16)*2*2]      ; l0
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw        m10, m11
    paddd         m0, m4
    paddd         m1, m5
    paddw        m10, m12
    paddd         m0, m6
    paddd         m1, m7
    paddw        m10, m13
    paddd         m0, m8
    paddd         m1, m9
    paddw        m10, m14
    movu [sumsq_ptrq+ 0], m0
    movu [sumsq_ptrq+16], m1
    movu  [sum_ptrq], m10

    ; shift position down by one
    mova          m0, m4
    mova          m1, m5
    mova          m2, m6
    mova          m3, m7
    mova          m4, m8
    mova          m5, m9
    mova         m10, m12
    mova         m11, m13
    mova         m12, m14
    add   sumsq_ptrq, (384+16)*4*2
    add     sum_ptrq, (384+16)*2*2
    sub           yd, 2
    jge .loop_y
    ; l1 = l0
    mova          m6, m8
    mova          m7, m9
    mova         m13, m14
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    RET
.emulate_second_load:
    mova          m8, m6
    mova          m9, m7
    mova         m14, m13
    jmp .loop_y_noload
%else
.sumsq_loop_x:
    lea           yd, [ylimd+2]
    add           yd, hm
    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    test  byte edgem, 4 ; have_top
    jnz .sumsq_load_top
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
    mova          m4, m0
    mova          m5, m1
    mova          m6, m0
    mova          m7, m1
    mova  [esp+0x1c], m0
    mova  [esp+0x0c], m1
    jmp .sumsq_loop_y_second_load
.sumsq_load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]    ; l3/4sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
    movu          m4, [sumsq_ptrq-(384+16)*4*0]    ; l2sq [left]
    movu          m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
    mova  [esp+0x1c], m0
    mova  [esp+0x0c], m1
.sumsq_loop_y:
    movu          m6, [sumsq_ptrq+(384+16)*4*1]    ; l1sq [left]
    movu          m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
.sumsq_loop_y_second_load:
    test          yd, yd
    jle .sumsq_emulate_second_load
    movu          m2, [sumsq_ptrq+(384+16)*4*2]    ; l0sq [left]
    movu          m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
.sumsq_loop_y_noload:
    paddd         m0, [esp+0x1c]
    paddd         m1, [esp+0x0c]
    paddd         m0, m4
    paddd         m1, m5
    paddd         m0, m6
    paddd         m1, m7
    paddd         m0, m2
    paddd         m1, m3
    movu [sumsq_ptrq+ 0], m0
    movu [sumsq_ptrq+16], m1

    ; shift position down by one
    mova          m0, m4
    mova          m1, m5
    mova          m4, m2
    mova          m5, m3
    mova  [esp+0x1c], m6
    mova  [esp+0x0c], m7
    add   sumsq_ptrq, (384+16)*4*2
    sub           yd, 2
    jge .sumsq_loop_y
    ; l1 = l0
    mova          m6, m2
    mova          m7, m3
    cmp           yd, ylimd
    jg .sumsq_loop_y_noload
    add           xd, 8
    cmp           xd, wm
    jl .sumsq_loop_x

    mov           xd, -2
.sum_loop_x:
    lea           yd, [ylimd+2]
    add           yd, hm
    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test  byte edgem, 4 ; have_top
    jnz .sum_load_top
    movu          m0, [sum_ptrq+(384+16)*2*1]
    mova          m1, m0
    mova          m2, m0
    mova          m3, m0
    jmp .sum_loop_y_second_load
.sum_load_top:
    movu          m0, [sum_ptrq-(384+16)*2*1] ; l3/4
    movu          m2, [sum_ptrq-(384+16)*2*0] ; l2
    mova          m1, m0
.sum_loop_y:
    movu          m3, [sum_ptrq+(384+16)*2*1] ; l1
.sum_loop_y_second_load:
    test          yd, yd
    jle .sum_emulate_second_load
    movu          m4, [sum_ptrq+(384+16)*2*2] ; l0
.sum_loop_y_noload:
    paddw         m0, m1
    paddw         m0, m2
    paddw         m0, m3
    paddw         m0, m4
    movu  [sum_ptrq], m0

    ; shift position down by one
    mova          m0, m2
    mova          m1, m3
    mova          m2, m4
    add     sum_ptrq, (384+16)*2*2
    sub           yd, 2
    jge .sum_loop_y
    ; l1 = l0
    mova          m3, m4
    cmp           yd, ylimd
    jg .sum_loop_y_noload
    add           xd, 8
    cmp           xd, wm
    jl .sum_loop_x
    RET
.sumsq_emulate_second_load:
    mova          m2, m6
    mova          m3, m7
    jmp .sumsq_loop_y_noload
.sum_emulate_second_load:
    mova          m4, m3
    jmp .sum_loop_y_noload
%endif

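; NOTE (informal summary, not part of the upstream comments): sgr_calc_ab2
; below is the n = 25 (5x5) analogue of sgr_calc_ab1: p = sumsq*25 - sum*sum
; with the same z lookup, but the multiplier is 41 = 164/4 (low word of
; pd_0xF0080029) with a final (... + (1 << 9)) >> 10 scaling, equivalent to
; the reference's (x*sum*164 + (1 << 11)) >> 12.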
cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
    movifnidn     sd, sm
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
%if ARCH_X86_64
    LEA           r5, sgr_x_by_x-0xF03
%else
    SETUP_PIC r5, 0
%endif
    movd          m6, sd
    pshuflw       m6, m6, q0000
    punpcklqdq    m6, m6
    pxor          m7, m7
    DEFINE_ARGS a, b, w, h, x
%if ARCH_X86_64
    mova          m8, [pd_0xF0080029]
    mova          m9, [pw_256]
    psrld        m10, m9, 15 ; pd_512
%else
 %define m8 [PIC_sym(pd_0xF0080029)]
 %define m9 [PIC_sym(pw_256)]
 %define m10 [PIC_sym(pd_512)]
%endif
.loop_y:
    mov           xq, -2
.loop_x:
    movq          m0, [bq+xq*2+0]
    movq          m1, [bq+xq*2+8]
    punpcklwd     m0, m7
    punpcklwd     m1, m7
    movu          m2, [aq+xq*4+ 0]
    movu          m3, [aq+xq*4+16]
    pslld         m4, m2, 3 ; aa * 8
    pslld         m5, m3, 3
    paddd         m2, m4 ; aa * 9
    paddd         m3, m5
    paddd         m4, m4 ; aa * 16
    paddd         m5, m5
    paddd         m2, m4 ; aa * 25
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    psubd         m2, m4 ; p = aa * 25 - bb * bb
    psubd         m3, m5
    MULLD         m2, m6
    MULLD         m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20 ; z
    psrld         m3, 20
    GATHERDD      m4, m2 ; xx
    GATHERDD      m2, m3
    psrld         m4, 24
    psrld         m2, 24
    packssdw      m3, m4, m2
    pmullw        m4, m8
    pmullw        m2, m8
    psubw         m5, m9, m3
    pmaddwd       m0, m4
    pmaddwd       m1, m2
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 10
    psrld         m1, 10
    movu   [bq+xq*2], m5
    movu [aq+xq*4+ 0], m0
    movu [aq+xq*4+16], m1
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET

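; NOTE (informal summary, not part of the upstream comments): the 5x5 finish
; filter below weights each a/b row with 6 on the centre column and 5 on its
; horizontal neighbours (pw_5_6 via pmaddwd), combines even/odd row pairs,
; and emits two t rows per iteration: the even row as (aa*src + bb) >> 9 and
; the odd row, built from its own row only, as (aa*src + bb) >> 8, with the
; rounding folded in through the pw_256/pw_128 interleaves.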
%if ARCH_X86_64
cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
                            tmp_base, src_base, a_base, b_base, x, y
    movifnidn     wd, wm
    mov           hd, hm
    mov    tmp_baseq, tq
    mov    src_baseq, srcq
    mov      a_baseq, aq
    mov      b_baseq, bq
    mova          m9, [pw_5_6]
    mova         m12, [pw_256]
    psrlw        m10, m12, 8 ; pw_1
    psrlw        m11, m12, 1 ; pw_128
    pxor         m13, m13
%else
cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
 %define tmp_baseq r0m
 %define src_baseq r1m
 %define a_baseq r3m
 %define b_baseq r4m
 %define wd r5m
 %define hd r6m

    SUB          esp, 8
    SETUP_PIC yd

 %define m8 m5
 %define m9 [PIC_sym(pw_5_6)]
 %define m10 [PIC_sym(pw_1)]
 %define m11 [PIC_sym(pw_128)]
 %define m12 [PIC_sym(pw_256)]
 %define m13 m0
%endif
    xor           xd, xd
.loop_x:
    mov           tq, tmp_baseq
    mov         srcq, src_baseq
    mov           aq, a_baseq
    mov           bq, b_baseq
    movu          m0, [aq+xq*4-(384+16)*4-4]
    mova          m1, [aq+xq*4-(384+16)*4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    movu          m3, [aq+xq*4-(384+16)*4-4+16]
    mova          m4, [aq+xq*4-(384+16)*4+16]
    movu          m5, [aq+xq*4-(384+16)*4+4+16]
    paddd         m0, m2
    paddd         m3, m5
    paddd         m0, m1
    paddd         m3, m4
    pslld         m2, m0, 2
    pslld         m5, m3, 2
    paddd         m2, m0
    paddd         m5, m3
    paddd         m0, m2, m1 ; prev_odd_b [first half]
    paddd         m1, m5, m4 ; prev_odd_b [second half]
    movu          m3, [bq+xq*2-(384+16)*2-2]
    mova          m4, [bq+xq*2-(384+16)*2]
    movu          m5, [bq+xq*2-(384+16)*2+2]
    paddw         m3, m5
    punpcklwd     m5, m3, m4
    punpckhwd     m3, m4
    pmaddwd       m5, m9
    pmaddwd       m3, m9
    mova          m2, m5
    packssdw      m2, m3 ; prev_odd_a
    lea           tq, [tq+xq*2]
    lea         srcq, [srcq+xq*1]
    lea           aq, [aq+xq*4+(384+16)*4]
    lea           bq, [bq+xq*2+(384+16)*2]
%if ARCH_X86_32
    mov        [esp], PIC_reg
%endif
    mov           yd, hd
    XCHG_PIC_REG
.loop_y:
    movu          m3, [aq-4]
    mova          m4, [aq]
    movu          m5, [aq+4]
    paddd         m3, m5
    paddd         m3, m4
    pslld         m5, m3, 2
    paddd         m5, m3
    paddd         m5, m4 ; cur_odd_b [first half]
    movu          m3, [aq+16-4]
    mova          m6, [aq+16]
    movu          m7, [aq+16+4]
    paddd         m3, m7
    paddd         m3, m6
    pslld         m7, m3, 2
    paddd         m7, m3
    paddd         m4, m7, m6 ; cur_odd_b [second half]
    movu          m3, [bq-2]
    mova          m6, [bq]
    movu          m7, [bq+2]
    paddw         m3, m7
    punpcklwd     m7, m3, m6
    punpckhwd     m3, m6
    pmaddwd       m7, m9
    pmaddwd       m3, m9
    packssdw      m6, m7, m3 ; cur_odd_a

    paddd         m0, m5 ; cur_even_b [first half]
    paddd         m1, m4 ; cur_even_b [second half]
    paddw         m2, m6 ; cur_even_a

    movq          m3, [srcq]
%if ARCH_X86_64
    punpcklbw     m3, m13
%else
    mova        [td], m5
    pxor          m7, m7
    punpcklbw     m3, m7
%endif
    punpcklwd     m7, m3, m10
    punpckhwd     m3, m10
    punpcklwd     m8, m2, m12
    punpckhwd     m2, m12
    pmaddwd       m7, m8
    pmaddwd       m3, m2
    paddd         m7, m0
    paddd         m3, m1
    psrad         m7, 9
    psrad         m3, 9

%if ARCH_X86_32
    pxor         m13, m13
%endif
    movq          m8, [srcq+strideq]
    punpcklbw     m8, m13
    punpcklwd     m0, m8, m10
    punpckhwd     m8, m10
    punpcklwd     m1, m6, m11
    punpckhwd     m2, m6, m11
    pmaddwd       m0, m1
    pmaddwd       m8, m2
%if ARCH_X86_64
    paddd         m0, m5
%else
    paddd         m0, [td]
%endif
    paddd         m8, m4
    psrad         m0, 8
    psrad         m8, 8

    packssdw      m7, m3
    packssdw      m0, m8
%if ARCH_X86_32
    mova          m5, [td]
%endif
    mova [tq+384*2*0], m7
    mova [tq+384*2*1], m0

    mova          m0, m5
    mova          m1, m4
    mova          m2, m6
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    add           tq, 384*2*2
    lea         srcq, [srcq+strideq*2]
%if ARCH_X86_64
    sub           yd, 2
%else
    sub dword [esp+4], 2
%endif
    jg .loop_y
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
%if ARCH_X86_32
    ADD          esp, 8
%endif
    RET

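; NOTE (informal summary, not part of the upstream comments): sgr_weighted2
; below blends two filtered planes in one pass; per pixel, with
; u = dst[x] << 4, it computes roughly
;   dst[x] += (wt0*(t1[x] - u) + wt1*(t2[x] - u) + (1 << 10)) >> 11
; where the interleaved differences are combined by pmaddwd against the
; packed (wt0, wt1) pair splatted from wtm.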
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
    movifnidn     wd, wm
    movd          m0, wtm
%if ARCH_X86_64
    movifnidn     hd, hm
    mova         m10, [pd_1024]
    pxor         m11, m11
%else
    SETUP_PIC hd, 0
 %define m10 [PIC_sym(pd_1024)]
 %define m11 m7
%endif
    pshufd        m0, m0, 0
    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
%if ARCH_X86_32
 %define hd hmp
%endif

.loop_y:
    xor         idxd, idxd
.loop_x:
    mova          m1, [t1q+idxq*2+ 0]
    mova          m2, [t1q+idxq*2+16]
    mova          m3, [t2q+idxq*2+ 0]
    mova          m4, [t2q+idxq*2+16]
    mova          m6, [dstq+idxq]
%if ARCH_X86_32
    pxor         m11, m11
%endif
    punpcklbw     m5, m6, m11
    punpckhbw     m6, m11
    psllw         m7, m5, 4
    psubw         m1, m7
    psubw         m3, m7
    psllw         m7, m6, 4
    psubw         m2, m7
    psubw         m4, m7
    punpcklwd     m7, m1, m3
    punpckhwd     m1, m3
    punpcklwd     m3, m2, m4
    punpckhwd     m2, m4
    pmaddwd       m7, m0
    pmaddwd       m1, m0
    pmaddwd       m3, m0
    pmaddwd       m2, m0
    paddd         m7, m10
    paddd         m1, m10
    paddd         m3, m10
    paddd         m2, m10
    psrad         m7, 11
    psrad         m1, 11
    psrad         m3, 11
    psrad         m2, 11
    packssdw      m7, m1
    packssdw      m3, m2
    paddw         m7, m5
    paddw         m3, m6
    packuswb      m7, m3
    mova [dstq+idxq], m7
    add         idxd, 16
    cmp         idxd, wd
    jl .loop_x
    add         dstq, strideq
    add          t1q, 384 * 2
    add          t2q, 384 * 2
    dec           hd
    jg .loop_y
    RET