1; Copyright © 2021, VideoLAN and dav1d authors 2; Copyright © 2021, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 30 31wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 32wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 33wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 34wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 35wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 36wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 37wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 38sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 39sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 40pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 41 42pb_m14_m13: times 8 db -14,-13 43pb_m10_m9: times 8 db -10, -9 44pb_m6_m5: times 8 db -6, -5 45pb_m2_m1: times 8 db -2, -1 46pb_2_3: times 8 db 2, 3 47pb_6_7: times 8 db 6, 7 48pw_25: times 8 dw 25 49pw_256: times 8 dw 256 50pw_1023: times 8 dw 1023 51pd_8: times 4 dd 8 52pd_4096: times 4 dd 4096 53pd_34816: times 4 dd 34816 54pd_m262128: times 4 dd -262128 55pd_0xffff: times 4 dd 0xffff 56pd_0xf00800a4: times 4 dd 0xf00800a4 57pd_0xf00801c7: times 4 dd 0xf00801c7 58 59wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 60wiener_round: dd 1049600, 1048832 61 62cextern sgr_x_by_x 63 64SECTION .text 65 66%macro movif64 2 ; dst, src 67 %if ARCH_X86_64 68 mov %1, %2 69 %endif 70%endmacro 71 72%macro movif32 2 ; dst, src 73 %if ARCH_X86_32 74 mov %1, %2 75 %endif 76%endmacro 77 78INIT_XMM ssse3 79%if ARCH_X86_32 80DECLARE_REG_TMP 4, 6 81 %if STACK_ALIGNMENT < 16 82 %assign extra_stack 14*16 83 %else 84 %assign extra_stack 12*16 85 %endif 86cglobal wiener_filter7_16bpc, 5, 7, 8, -384*12-16-extra_stack, \ 87 dst, dst_stride, left, lpf, lpf_stride, w, flt 88 %if STACK_ALIGNMENT < 16 89 %define lpfm dword [esp+calloff+16*12+ 0] 90 %define lpf_stridem dword [esp+calloff+16*12+ 4] 91 %define wm dword [esp+calloff+16*12+ 8] 92 %define hd dword [esp+calloff+16*12+12] 93 %define edgeb byte [esp+calloff+16*12+16] 94 %define edged dword [esp+calloff+16*12+16] 95 %else 96 %define hd dword r6m 97 %define edgeb byte r8m 98 %endif 99 %define PICmem dword [esp+calloff+4*0] 100 %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers 101 %define t1m dword [esp+calloff+4*2] 102 %define t2m dword [esp+calloff+4*3] 103 %define t3m dword [esp+calloff+4*4] 104 %define t4m dword [esp+calloff+4*5] 105 %define t5m dword [esp+calloff+4*6] 106 %define t6m dword [esp+calloff+4*7] 107 %define t2 t2m 108 %define t3 t3m 109 %define t4 t4m 110 %define t5 t5m 111 %define t6 t6m 112 %define m8 [esp+calloff+16*2] 113 %define m9 [esp+calloff+16*3] 114 %define m10 [esp+calloff+16*4] 115 %define m11 [esp+calloff+16*5] 116 %define m12 [esp+calloff+16*6] 117 %define m13 [esp+calloff+16*7] 118 %define m14 [esp+calloff+16*8] 119 %define m15 [esp+calloff+16*9] 120 %define r10 r5 121 %define base t0-wiener_shifts 122 %assign calloff 0 123 %if STACK_ALIGNMENT < 16 124 mov wd, [rstk+stack_offset+24] 125 mov lpf_stridem, lpf_strideq 126 mov wm, wd 127 mov r4, [rstk+stack_offset+28] 128 mov hd, r4 129 mov r4, [rstk+stack_offset+36] 130 mov edged, r4 ; edge 131 %endif 132%else 133DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers 134cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ 135 lpf_stride, w, edge, flt, h 136 %define base 137%endif 138%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 139 movifnidn wd, wm 140%endif 141%if ARCH_X86_64 142 mov fltq, fltmp 143 mov edged, r8m 144 mov hd, r6m 145 mov t3d, r9m ; pixel_max 146 movq m13, [fltq] 147 movq m15, [fltq+16] 148%else 149 %if STACK_ALIGNMENT < 16 150 mov t0, [rstk+stack_offset+32] 151 mov t1, [rstk+stack_offset+40] ; pixel_max 152 movq m1, [t0] ; fx 153 movq m3, [t0+16] ; fy 154 LEA t0, wiener_shifts 155 %else 156 LEA t0, wiener_shifts 157 mov fltq, r7m 158 movq m1, [fltq] 159 movq m3, [fltq+16] 160 mov t1, r9m ; pixel_max 161 %endif 162 mov PICmem, t0 163%endif 164 mova m6, [base+wiener_shufA] 165 mova m7, [base+wiener_shufB] 166%if ARCH_X86_64 167 lea t4, [wiener_shifts] 168 add wd, wd 169 pshufd m12, m13, q0000 ; x0 x1 170 pshufd m13, m13, q1111 ; x2 x3 171 pshufd m14, m15, q0000 ; y0 y1 172 pshufd m15, m15, q1111 ; y2 y3 173 mova m8, [wiener_shufC] 174 mova m9, [wiener_shufD] 175 add lpfq, wq 176 lea t1, [rsp+wq+16] 177 add dstq, wq 178 neg wq 179 shr t3d, 11 180 %define base t4-wiener_shifts 181 movd m10, [base+wiener_round+t3*4] 182 movq m11, [base+wiener_shifts+t3*8] 183 pshufd m10, m10, q0000 184 pshufd m0, m11, q0000 185 pshufd m11, m11, q1111 186 pmullw m12, m0 ; upshift filter coefs to make the 187 pmullw m13, m0 ; horizontal downshift constant 188 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w 189 %define lpfm [rsp+0] 190 %define lpf_stridem [rsp+8] 191 %define base 192 %define wiener_lshuf7_mem [wiener_lshuf7] 193 %define pd_m262128_mem [pd_m262128] 194%else 195 add wd, wd 196 mova m4, [base+wiener_shufC] 197 mova m5, [base+wiener_shufD] 198 pshufd m0, m1, q0000 199 pshufd m1, m1, q1111 200 pshufd m2, m3, q0000 201 pshufd m3, m3, q1111 202 mova m8, m4 203 mova m9, m5 204 mova m14, m2 205 mova m15, m3 206 shr t1, 11 207 add lpfq, wq 208 mova m3, [base+pd_m262128] 209 movd m4, [base+wiener_round+t1*4] 210 movq m5, [base+wiener_shifts+t1*8] 211 lea t1, [esp+extra_stack+wq+16] 212 add dstq, wq 213 neg wq 214 pshufd m4, m4, q0000 215 pshufd m2, m5, q0000 216 pshufd m5, m5, q1111 217 mov wm, wq 218 pmullw m0, m2 219 pmullw m1, m2 220 mova m2, [base+wiener_lshuf7] 221 %define pd_m262128_mem [esp+calloff+16*10] 222 mova pd_m262128_mem, m3 223 mova m10, m4 224 mova m11, m5 225 mova m12, m0 226 mova m13, m1 227 %define wiener_lshuf7_mem [esp+calloff+16*11] 228 mova wiener_lshuf7_mem, m2 229%endif 230 test edgeb, 4 ; LR_HAVE_TOP 231 jz .no_top 232 call .h_top 233%if ARCH_X86_64 234 add lpfq, lpf_strideq 235%else 236 add lpfq, lpf_stridem 237%endif 238 mov t6, t1 239 mov t5, t1 240 add t1, 384*2 241 call .h_top 242 movif32 lpf_strideq, lpf_stridem 243 lea r10, [lpfq+lpf_strideq*4] 244 mov lpfq, dstq 245 mov t4, t1 246 add t1, 384*2 247 movif64 lpf_stridem, lpf_strideq 248 add r10, lpf_strideq 249 mov lpfm, r10 ; below 250 call .h 251 mov t3, t1 252 mov t2, t1 253 dec hd 254 jz .v1 255 add lpfq, dst_strideq 256 add t1, 384*2 257 call .h 258 mov t2, t1 259 dec hd 260 jz .v2 261 add lpfq, dst_strideq 262 add t1, 384*2 263 call .h 264 dec hd 265 jz .v3 266.main: 267 lea t0, [t1+384*2] 268.main_loop: 269 call .hv 270 dec hd 271 jnz .main_loop 272 test edgeb, 8 ; LR_HAVE_BOTTOM 273 jz .v3 274 mov lpfq, lpfm 275 call .hv_bottom 276 add lpfq, lpf_stridem 277 call .hv_bottom 278.v1: 279 call .v 280 RET 281.no_top: 282 movif32 lpf_strideq, lpf_stridem 283 lea r10, [lpfq+lpf_strideq*4] 284 mov lpfq, dstq 285 movif64 lpf_stridem, lpf_strideq 286 lea r10, [r10+lpf_strideq*2] 287 mov lpfm, r10 288 call .h 289 mov t6, t1 290 mov t5, t1 291 mov t4, t1 292 mov t3, t1 293 mov t2, t1 294 dec hd 295 jz .v1 296 add lpfq, dst_strideq 297 add t1, 384*2 298 call .h 299 mov t2, t1 300 dec hd 301 jz .v2 302 add lpfq, dst_strideq 303 add t1, 384*2 304 call .h 305 dec hd 306 jz .v3 307 lea t0, [t1+384*2] 308 call .hv 309 dec hd 310 jz .v3 311 add t0, 384*8 312 call .hv 313 dec hd 314 jnz .main 315.v3: 316 call .v 317 movif32 wq, wm 318.v2: 319 call .v 320 movif32 wq, wm 321 jmp .v1 322.extend_right: 323%assign stack_offset stack_offset+8 324%assign calloff 8 325 movif32 t0, PICmem 326 pxor m0, m0 327 movd m1, wd 328 mova m2, [base+pb_0to15] 329 pshufb m1, m0 330 mova m0, [base+pb_6_7] 331 psubb m0, m1 332 pminub m0, m2 333 pshufb m3, m0 334 mova m0, [base+pb_m2_m1] 335 psubb m0, m1 336 pminub m0, m2 337 pshufb m4, m0 338 mova m0, [base+pb_m10_m9] 339 psubb m0, m1 340 pminub m0, m2 341 pshufb m5, m0 342 movif32 t0, t0m 343 ret 344%assign stack_offset stack_offset-4 345%assign calloff 4 346.h: 347 movif64 wq, r5 348 movif32 wq, wm 349 test edgeb, 1 ; LR_HAVE_LEFT 350 jz .h_extend_left 351 movq m3, [leftq] 352 movhps m3, [lpfq+wq] 353 add leftq, 8 354 jmp .h_main 355.h_extend_left: 356 mova m3, [lpfq+wq] ; avoid accessing memory located 357 pshufb m3, wiener_lshuf7_mem ; before the start of the buffer 358 jmp .h_main 359.h_top: 360 movif64 wq, r5 361 test edgeb, 1 ; LR_HAVE_LEFT 362 jz .h_extend_left 363.h_loop: 364 movu m3, [lpfq+wq-8] 365.h_main: 366 mova m4, [lpfq+wq+0] 367 movu m5, [lpfq+wq+8] 368 test edgeb, 2 ; LR_HAVE_RIGHT 369 jnz .h_have_right 370 cmp wd, -18 371 jl .h_have_right 372 call .extend_right 373.h_have_right: 374 pshufb m0, m3, m6 375 pshufb m1, m4, m7 376 paddw m0, m1 377 pshufb m3, m8 378 pmaddwd m0, m12 379 pshufb m1, m4, m9 380 paddw m3, m1 381 pshufb m1, m4, m6 382 pmaddwd m3, m13 383 pshufb m2, m5, m7 384 paddw m1, m2 385 mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) 386 pshufb m4, m8 387 pmaddwd m1, m12 388 pshufb m5, m9 389 paddw m4, m5 390 pmaddwd m4, m13 391 paddd m0, m2 392 paddd m1, m2 393 paddd m0, m3 394 paddd m1, m4 395 psrad m0, 4 396 psrad m1, 4 397 packssdw m0, m1 398 psraw m0, 1 399 mova [t1+wq], m0 400 add wq, 16 401 jl .h_loop 402 movif32 wq, wm 403 ret 404ALIGN function_align 405.hv: 406 add lpfq, dst_strideq 407 movif64 wq, r5 408 movif32 t0m, t0 409 movif32 t1m, t1 410 test edgeb, 1 ; LR_HAVE_LEFT 411 jz .hv_extend_left 412 movq m3, [leftq] 413 movhps m3, [lpfq+wq] 414 add leftq, 8 415 jmp .hv_main 416.hv_extend_left: 417 mova m3, [lpfq+wq] 418 pshufb m3, wiener_lshuf7_mem 419 jmp .hv_main 420.hv_bottom: 421 movif64 wq, r5 422 movif32 t0m, t0 423 movif32 t1m, t1 424 test edgeb, 1 ; LR_HAVE_LEFT 425 jz .hv_extend_left 426.hv_loop: 427 movu m3, [lpfq+wq-8] 428.hv_main: 429 mova m4, [lpfq+wq+0] 430 movu m5, [lpfq+wq+8] 431 test edgeb, 2 ; LR_HAVE_RIGHT 432 jnz .hv_have_right 433 cmp wd, -18 434 jl .hv_have_right 435 call .extend_right 436.hv_have_right: 437 movif32 t1, t4m 438 movif32 t0, t2m 439 pshufb m0, m3, m6 440 pshufb m1, m4, m7 441 paddw m0, m1 442 pshufb m3, m8 443 pmaddwd m0, m12 444 pshufb m1, m4, m9 445 paddw m3, m1 446 pshufb m1, m4, m6 447 pmaddwd m3, m13 448 pshufb m2, m5, m7 449 paddw m1, m2 450 mova m2, pd_m262128_mem 451 pshufb m4, m8 452 pmaddwd m1, m12 453 pshufb m5, m9 454 paddw m4, m5 455 pmaddwd m4, m13 456 paddd m0, m2 457 paddd m1, m2 458%if ARCH_X86_64 459 mova m2, [t4+wq] 460 paddw m2, [t2+wq] 461 mova m5, [t3+wq] 462%else 463 mova m2, [t1+wq] 464 paddw m2, [t0+wq] 465 mov t1, t3m 466 mov t0, t5m 467 mova m5, [t1+wq] 468 mov t1, t1m 469%endif 470 paddd m0, m3 471 paddd m1, m4 472 psrad m0, 4 473 psrad m1, 4 474 packssdw m0, m1 475%if ARCH_X86_64 476 mova m4, [t5+wq] 477 paddw m4, [t1+wq] 478 psraw m0, 1 479 paddw m3, m0, [t6+wq] 480%else 481 mova m4, [t0+wq] 482 paddw m4, [t1+wq] 483 mov t0, t0m 484 mov t1, t6m 485 psraw m0, 1 486 paddw m3, m0, [t1+wq] 487%endif 488 mova [t0+wq], m0 489 punpcklwd m0, m2, m5 490 pmaddwd m0, m15 491 punpckhwd m2, m5 492 pmaddwd m2, m15 493 punpcklwd m1, m3, m4 494 pmaddwd m1, m14 495 punpckhwd m3, m4 496 pmaddwd m3, m14 497 paddd m0, m10 498 paddd m2, m10 499 paddd m0, m1 500 paddd m2, m3 501 psrad m0, 6 502 psrad m2, 6 503 packssdw m0, m2 504 pmulhw m0, m11 505 pxor m1, m1 506 pmaxsw m0, m1 507 mova [dstq+wq], m0 508 add wq, 16 509 jl .hv_loop 510%if ARCH_X86_64 511 mov t6, t5 512 mov t5, t4 513 mov t4, t3 514 mov t3, t2 515 mov t2, t1 516 mov t1, t0 517 mov t0, t6 518%else 519 mov r5, t5m 520 mov t1, t4m 521 mov t6m, r5 522 mov t5m, t1 523 mov r5, t3m 524 mov t1, t2m 525 mov t4m, r5 526 mov t3m, t1 527 mov r5, t1m 528 mov t1, t0 529 mov t2m, r5 530 mov t0, t6m 531 mov wq, wm 532%endif 533 add dstq, dst_strideq 534 ret 535.v: 536 movif64 wq, r5 537 movif32 t0m, t0 538 movif32 t1m, t1 539.v_loop: 540%if ARCH_X86_64 541 mova m1, [t4+wq] 542 paddw m1, [t2+wq] 543 mova m2, [t3+wq] 544 mova m4, [t1+wq] 545 paddw m3, m4, [t6+wq] 546 paddw m4, [t5+wq] 547%else 548 mov t0, t4m 549 mov t1, t2m 550 mova m1, [t0+wq] 551 paddw m1, [t1+wq] 552 mov t0, t3m 553 mov t1, t1m 554 mova m2, [t0+wq] 555 mova m4, [t1+wq] 556 mov t0, t6m 557 mov t1, t5m 558 paddw m3, m4, [t0+wq] 559 paddw m4, [t1+wq] 560%endif 561 punpcklwd m0, m1, m2 562 pmaddwd m0, m15 563 punpckhwd m1, m2 564 pmaddwd m1, m15 565 punpcklwd m2, m3, m4 566 pmaddwd m2, m14 567 punpckhwd m3, m4 568 pmaddwd m3, m14 569 paddd m0, m10 570 paddd m1, m10 571 paddd m0, m2 572 paddd m1, m3 573 psrad m0, 6 574 psrad m1, 6 575 packssdw m0, m1 576 pmulhw m0, m11 577 pxor m1, m1 578 pmaxsw m0, m1 579 mova [dstq+wq], m0 580 add wq, 16 581 jl .v_loop 582%if ARCH_X86_64 583 mov t6, t5 584 mov t5, t4 585 mov t4, t3 586 mov t3, t2 587 mov t2, t1 588%else 589 mov t0, t5m 590 mov t1, t4m 591 mov r5, t3m 592 mov t6m, t0 593 mov t5m, t1 594 mov t4m, r5 595 mov r5, t2m 596 mov t1, t1m 597 mov t0, t0m 598 mov t3m, r5 599 mov t2m, t1 600%endif 601 add dstq, dst_strideq 602 ret 603 604%if ARCH_X86_32 605 %if STACK_ALIGNMENT < 16 606 %assign stack_size 12*16+384*8 607 %else 608 %assign stack_size 11*16+384*8 609 %endif 610cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ 611 lpf, lpf_stride, w, flt 612 %if STACK_ALIGNMENT < 16 613 %define lpfm dword [esp+calloff+4*6] 614 %define lpf_stridem dword [esp+calloff+4*7] 615 %define wm dword [esp+calloff+16*10+0] 616 %define hd dword [esp+calloff+16*10+4] 617 %define edgeb byte [esp+calloff+16*10+8] 618 %define edged dword [esp+calloff+16*10+8] 619 %else 620 %define hd dword r6m 621 %define edgeb byte r8m 622 %endif 623 %define PICmem dword [esp+calloff+4*0] 624 %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers 625 %define t1m dword [esp+calloff+4*2] 626 %define t2m dword [esp+calloff+4*3] 627 %define t3m dword [esp+calloff+4*4] 628 %define t4m dword [esp+calloff+4*5] 629 %define t2 t2m 630 %define t3 t3m 631 %define t4 t4m 632 %define m8 [esp+calloff+16*2] 633 %define m9 [esp+calloff+16*3] 634 %define m10 [esp+calloff+16*4] 635 %define m11 [esp+calloff+16*5] 636 %define m12 [esp+calloff+16*6] 637 %define m13 [esp+calloff+16*7] 638 %define m14 [esp+calloff+16*8] 639 %define m15 [esp+calloff+16*9] 640 %define base t0-wiener_shifts 641 %assign calloff 0 642 %if STACK_ALIGNMENT < 16 643 mov wd, [rstk+stack_offset+24] 644 mov lpf_stridem, lpf_strideq 645 mov wm, wd 646 mov r4, [rstk+stack_offset+28] 647 mov hd, r4 648 mov r4, [rstk+stack_offset+36] 649 mov edged, r4 ; edge 650 %endif 651%else 652cglobal wiener_filter5_16bpc, 5, 14, 16, 384*8+16, dst, dst_stride, left, lpf, \ 653 lpf_stride, w, edge, flt, h 654 %define base 655%endif 656%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 657 movifnidn wd, wm 658%endif 659%if ARCH_X86_64 660 mov fltq, fltmp 661 mov edged, r8m 662 mov hd, r6m 663 mov t3d, r9m ; pixel_max 664 movq m12, [fltq] 665 movq m14, [fltq+16] 666%else 667 %if STACK_ALIGNMENT < 16 668 mov t0, [rstk+stack_offset+32] 669 mov t1, [rstk+stack_offset+40] ; pixel_max 670 movq m1, [t0] ; fx 671 movq m3, [t0+16] ; fy 672 LEA t0, wiener_shifts 673 %else 674 LEA t0, wiener_shifts 675 mov fltq, r7m 676 movq m1, [fltq] 677 movq m3, [fltq+16] 678 mov t1, r9m ; pixel_max 679 %endif 680 mov PICmem, t0 681%endif 682 mova m5, [base+wiener_shufE] 683 mova m6, [base+wiener_shufB] 684 mova m7, [base+wiener_shufD] 685%if ARCH_X86_64 686 lea t4, [wiener_shifts] 687 add wd, wd 688 punpcklwd m11, m12, m12 689 pshufd m11, m11, q1111 ; x1 690 pshufd m12, m12, q1111 ; x2 x3 691 punpcklwd m13, m14, m14 692 pshufd m13, m13, q1111 ; y1 693 pshufd m14, m14, q1111 ; y2 y3 694 shr t3d, 11 695 mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) 696 add lpfq, wq 697 lea t1, [rsp+wq+16] 698 add dstq, wq 699 neg wq 700 %define base t4-wiener_shifts 701 movd m9, [base+wiener_round+t3*4] 702 movq m10, [base+wiener_shifts+t3*8] 703 pshufd m9, m9, q0000 704 pshufd m0, m10, q0000 705 pshufd m10, m10, q1111 706 mova m15, [wiener_lshuf5] 707 pmullw m11, m0 708 pmullw m12, m0 709 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w 710 %define lpfm [rsp+0] 711 %define lpf_stridem [rsp+8] 712 %define base 713%else 714 add wd, wd 715 punpcklwd m0, m1, m1 716 pshufd m0, m0, q1111 ; x1 717 pshufd m1, m1, q1111 ; x2 x3 718 punpcklwd m2, m3, m3 719 pshufd m2, m2, q1111 ; y1 720 pshufd m3, m3, q1111 ; y2 y3 721 mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) 722 mova m13, m2 723 mova m14, m3 724 mova m8, m4 725 shr t1, 11 726 add lpfq, wq 727 movd m2, [base+wiener_round+t1*4] 728 movq m3, [base+wiener_shifts+t1*8] 729 %if STACK_ALIGNMENT < 16 730 lea t1, [esp+16*11+wq+16] 731 %else 732 lea t1, [esp+16*10+wq+16] 733 %endif 734 add dstq, wq 735 neg wq 736 pshufd m2, m2, q0000 737 pshufd m4, m3, q0000 738 pshufd m3, m3, q1111 739 mov wm, wq 740 pmullw m0, m4 741 pmullw m1, m4 742 mova m4, [base+wiener_lshuf5] 743 mova m9, m2 744 mova m10, m3 745 mova m11, m0 746 mova m12, m1 747 mova m15, m4 748%endif 749 test edgeb, 4 ; LR_HAVE_TOP 750 jz .no_top 751 call .h_top 752%if ARCH_X86_64 753 add lpfq, lpf_strideq 754%else 755 add lpfq, lpf_stridem 756%endif 757 mov t4, t1 758 add t1, 384*2 759 call .h_top 760 movif32 lpf_strideq, lpf_stridem 761 lea r10, [lpfq+lpf_strideq*4] 762 mov lpfq, dstq 763 mov t3, t1 764 add t1, 384*2 765 movif64 lpf_stridem, lpf_strideq 766 add r10, lpf_strideq 767 mov lpfm, r10 ; below 768 call .h 769 mov t2, t1 770 dec hd 771 jz .v1 772 add lpfq, dst_strideq 773 add t1, 384*2 774 call .h 775 dec hd 776 jz .v2 777.main: 778 mov t0, t4 779.main_loop: 780 call .hv 781 dec hd 782 jnz .main_loop 783 test edgeb, 8 ; LR_HAVE_BOTTOM 784 jz .v2 785 mov lpfq, lpfm 786 call .hv_bottom 787 add lpfq, lpf_stridem 788 call .hv_bottom 789.end: 790 RET 791.no_top: 792 movif32 lpf_strideq, lpf_stridem 793 lea r10, [lpfq+lpf_strideq*4] 794 mov lpfq, dstq 795 movif64 lpf_stridem, lpf_strideq 796 lea r10, [r10+lpf_strideq*2] 797 mov lpfm, r10 798 call .h 799 mov t4, t1 800 mov t3, t1 801 mov t2, t1 802 dec hd 803 jz .v1 804 add lpfq, dst_strideq 805 add t1, 384*2 806 call .h 807 dec hd 808 jz .v2 809 lea t0, [t1+384*2] 810 call .hv 811 dec hd 812 jz .v2 813 add t0, 384*6 814 call .hv 815 dec hd 816 jnz .main 817.v2: 818 call .v 819%if ARCH_X86_64 820 mov t4, t3 821 mov t3, t2 822 mov t2, t1 823%else 824 mov t0, t3m 825 mov r5, t2m 826 mov t1, t1m 827 mov t4m, t0 828 mov t3m, r5 829 mov t2m, t1 830 mov wq, wm 831%endif 832 add dstq, dst_strideq 833.v1: 834 call .v 835 jmp .end 836.extend_right: 837%assign stack_offset stack_offset+8 838%assign calloff 8 839 movif32 t0, PICmem 840 pxor m1, m1 841 movd m2, wd 842 mova m0, [base+pb_2_3] 843 pshufb m2, m1 844 mova m1, [base+pb_m6_m5] 845 psubb m0, m2 846 psubb m1, m2 847 mova m2, [base+pb_0to15] 848 pminub m0, m2 849 pminub m1, m2 850 pshufb m3, m0 851 pshufb m4, m1 852 ret 853%assign stack_offset stack_offset-4 854%assign calloff 4 855.h: 856 movif64 wq, r5 857 movif32 wq, wm 858 test edgeb, 1 ; LR_HAVE_LEFT 859 jz .h_extend_left 860 mova m4, [lpfq+wq] 861 movd m3, [leftq+4] 862 pslldq m4, 4 863 por m3, m4 864 add leftq, 8 865 jmp .h_main 866.h_extend_left: 867 mova m3, [lpfq+wq] ; avoid accessing memory located 868 pshufb m3, m15 ; before the start of the buffer 869 jmp .h_main 870.h_top: 871 movif64 wq, r5 872 movif32 wq, wm 873 test edgeb, 1 ; LR_HAVE_LEFT 874 jz .h_extend_left 875.h_loop: 876 movu m3, [lpfq+wq-4] 877.h_main: 878 movu m4, [lpfq+wq+4] 879 test edgeb, 2 ; LR_HAVE_RIGHT 880 jnz .h_have_right 881 cmp wd, -18 882 jl .h_have_right 883 call .extend_right 884.h_have_right: 885 pshufb m0, m3, m5 886 pmaddwd m0, m11 887 pshufb m1, m4, m5 888 pmaddwd m1, m11 889 pshufb m2, m3, m6 890 pshufb m3, m7 891 paddw m2, m3 892 pshufb m3, m4, m6 893 pmaddwd m2, m12 894 pshufb m4, m7 895 paddw m3, m4 896 pmaddwd m3, m12 897 paddd m0, m8 898 paddd m1, m8 899 paddd m0, m2 900 paddd m1, m3 901 psrad m0, 4 902 psrad m1, 4 903 packssdw m0, m1 904 psraw m0, 1 905 mova [t1+wq], m0 906 add wq, 16 907 jl .h_loop 908 movif32 wq, wm 909 ret 910ALIGN function_align 911.hv: 912 add lpfq, dst_strideq 913 movif64 wq, r5 914 movif32 t0m, t0 915 movif32 t1m, t1 916 test edgeb, 1 ; LR_HAVE_LEFT 917 jz .hv_extend_left 918 mova m4, [lpfq+wq] 919 movd m3, [leftq+4] 920 pslldq m4, 4 921 por m3, m4 922 add leftq, 8 923 jmp .hv_main 924.hv_extend_left: 925 mova m3, [lpfq+wq] 926 pshufb m3, m15 927 jmp .hv_main 928.hv_bottom: 929 movif64 wq, r5 930 movif32 t0m, t0 931 movif32 t1m, t1 932 test edgeb, 1 ; LR_HAVE_LEFT 933 jz .hv_extend_left 934.hv_loop: 935 movu m3, [lpfq+wq-4] 936.hv_main: 937 movu m4, [lpfq+wq+4] 938 test edgeb, 2 ; LR_HAVE_RIGHT 939 jnz .hv_have_right 940 cmp wd, -18 941 jl .hv_have_right 942 call .extend_right 943.hv_have_right: 944 movif32 t1, t1m 945 movif32 t0, t3m 946 pshufb m0, m3, m5 947 pmaddwd m0, m11 948 pshufb m1, m4, m5 949 pmaddwd m1, m11 950 pshufb m2, m3, m6 951 pshufb m3, m7 952 paddw m2, m3 953 pshufb m3, m4, m6 954 pmaddwd m2, m12 955 pshufb m4, m7 956 paddw m3, m4 957 pmaddwd m3, m12 958 paddd m0, m8 959 paddd m1, m8 960 paddd m0, m2 961%if ARCH_X86_64 962 mova m2, [t3+wq] 963 paddw m2, [t1+wq] 964 paddd m1, m3 965 mova m4, [t2+wq] 966%else 967 mova m2, [t0+wq] 968 mov t0, t2m 969 paddw m2, [t1+wq] 970 mov t1, t4m 971 paddd m1, m3 972 mova m4, [t0+wq] 973 mov t0, t0m 974%endif 975 punpckhwd m3, m2, m4 976 pmaddwd m3, m14 977 punpcklwd m2, m4 978%if ARCH_X86_64 979 mova m4, [t4+wq] 980%else 981 mova m4, [t1+wq] 982%endif 983 psrad m0, 4 984 psrad m1, 4 985 packssdw m0, m1 986 pmaddwd m2, m14 987 psraw m0, 1 988 mova [t0+wq], m0 989 punpckhwd m1, m0, m4 990 pmaddwd m1, m13 991 punpcklwd m0, m4 992 pmaddwd m0, m13 993 paddd m3, m9 994 paddd m2, m9 995 paddd m1, m3 996 paddd m0, m2 997 psrad m1, 6 998 psrad m0, 6 999 packssdw m0, m1 1000 pmulhw m0, m10 1001 pxor m1, m1 1002 pmaxsw m0, m1 1003 mova [dstq+wq], m0 1004 add wq, 16 1005 jl .hv_loop 1006%if ARCH_X86_64 1007 mov t4, t3 1008 mov t3, t2 1009 mov t2, t1 1010 mov t1, t0 1011 mov t0, t4 1012%else 1013 mov r5, t3m 1014 mov t1, t2m 1015 mov t4m, r5 1016 mov t3m, t1 1017 mov r5, t1m 1018 mov t1, t0 1019 mov t2m, r5 1020 mov t0, t4m 1021 mov wq, wm 1022%endif 1023 add dstq, dst_strideq 1024 ret 1025.v: 1026 movif64 wq, r5 1027 movif32 t1m, t1 1028.v_loop: 1029%if ARCH_X86_64 1030 mova m0, [t1+wq] 1031 paddw m2, m0, [t3+wq] 1032 mova m1, [t2+wq] 1033 mova m4, [t4+wq] 1034%else 1035 mov t0, t3m 1036 mova m0, [t1+wq] 1037 mov t1, t2m 1038 paddw m2, m0, [t0+wq] 1039 mov t0, t4m 1040 mova m1, [t1+wq] 1041 mova m4, [t0+wq] 1042%endif 1043 punpckhwd m3, m2, m1 1044 pmaddwd m3, m14 1045 punpcklwd m2, m1 1046 pmaddwd m2, m14 1047 punpckhwd m1, m0, m4 1048 pmaddwd m1, m13 1049 punpcklwd m0, m4 1050 pmaddwd m0, m13 1051 paddd m3, m9 1052 paddd m2, m9 1053 paddd m1, m3 1054 paddd m0, m2 1055 psrad m1, 6 1056 psrad m0, 6 1057 packssdw m0, m1 1058 pmulhw m0, m10 1059 pxor m1, m1 1060 pmaxsw m0, m1 1061 mova [dstq+wq], m0 1062 add wq, 16 1063%if ARCH_X86_64 1064 jl .v_loop 1065%else 1066 jge .v_end 1067 mov t1, t1m 1068 jmp .v_loop 1069.v_end: 1070%endif 1071 ret 1072 1073%macro GATHERDD 3 ; dst, src, tmp 1074 movd %3d, %2 1075 %if ARCH_X86_64 1076 movd %1, [r13+%3] 1077 pextrw %3d, %2, 2 1078 pinsrw %1, [r13+%3+2], 3 1079 pextrw %3d, %2, 4 1080 pinsrw %1, [r13+%3+2], 5 1081 pextrw %3d, %2, 6 1082 pinsrw %1, [r13+%3+2], 7 1083 %else 1084 movd %1, [base+sgr_x_by_x-0xf03+%3] 1085 pextrw %3, %2, 2 1086 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 1087 pextrw %3, %2, 4 1088 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 1089 pextrw %3, %2, 6 1090 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 1091 %endif 1092%endmacro 1093 1094%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore 1095 %if ARCH_X86_64 1096 %define tmp r14 1097 %else 1098 %define tmp %4 1099 %endif 1100 GATHERDD %1, %2, tmp 1101 GATHERDD %2, %3, tmp 1102 movif32 %4, %5 1103 psrld %1, 24 1104 psrld %2, 24 1105 packssdw %1, %2 1106%endmacro 1107 1108%macro MAXSD 3-4 0 ; dst, src, restore_tmp 1109 pcmpgtd %3, %1, %2 1110 pand %1, %3 1111 pandn %3, %2 1112 por %1, %3 1113 %if %4 == 1 1114 pxor %3, %3 1115 %endif 1116%endmacro 1117 1118%macro MULLD 3 ; dst, src, tmp 1119 pmulhuw %3, %1, %2 1120 pmullw %1, %2 1121 pslld %3, 16 1122 paddd %1, %3 1123%endmacro 1124 1125%if ARCH_X86_32 1126DECLARE_REG_TMP 0, 1, 2, 3, 4 1127 %if STACK_ALIGNMENT < 16 1128 %assign extra_stack 5*16 1129 %else 1130 %assign extra_stack 3*16 1131 %endif 1132cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ 1133 dst, dst_stride, left, lpf, lpf_stride, w, params, h 1134 %if STACK_ALIGNMENT < 16 1135 %define dstm dword [esp+calloff+16*0+4*6] 1136 %define dst_stridemp dword [esp+calloff+16*3+4*7] 1137 %define leftm dword [esp+calloff+16*3+4*0] 1138 %define lpfm dword [esp+calloff+16*3+4*1] 1139 %define lpf_stridem dword [esp+calloff+16*3+4*2] 1140 %define w0m dword [esp+calloff+16*3+4*3] 1141 %define hd dword [esp+calloff+16*3+4*4] 1142 %define edgeb byte [esp+calloff+16*3+4*5] 1143 %define edged dword [esp+calloff+16*3+4*5] 1144 %define leftmp leftm 1145 %else 1146 %define w0m wm 1147 %define hd dword r6m 1148 %define edgeb byte r8m 1149 %define edged dword r8m 1150 %endif 1151 %define hvsrcm dword [esp+calloff+4*0] 1152 %define w1m dword [esp+calloff+4*1] 1153 %define t0m dword [esp+calloff+4*2] 1154 %define t2m dword [esp+calloff+4*3] 1155 %define t3m dword [esp+calloff+4*4] 1156 %define t4m dword [esp+calloff+4*5] 1157 %define m8 [base+pd_8] 1158 %define m9 [base+pw_25] 1159 %define m10 [esp+calloff+16*2] 1160 %define m11 [base+pd_0xf00800a4] 1161 %define m12 [base+pw_256] 1162 %define m13 [base+pd_34816] 1163 %define m14 [base+pw_1023] 1164 %define m15 [base+sgr_lshuf5] 1165 %define r10 r5 1166 %define base r6-$$ 1167 %assign calloff 0 1168 %if STACK_ALIGNMENT < 16 1169 mov dst_strideq, [rstk+stack_offset+ 8] 1170 mov leftq, [rstk+stack_offset+12] 1171 mov lpfq, [rstk+stack_offset+16] 1172 mov lpf_strideq, [rstk+stack_offset+20] 1173 mov wd, [rstk+stack_offset+24] 1174 mov dstm, dstq 1175 mov dst_stridemp, dst_strideq 1176 mov leftm, leftq 1177 mov r1, [rstk+stack_offset+28] 1178 mov r2, [rstk+stack_offset+36] 1179 mov lpfm, lpfq 1180 mov lpf_stridem, lpf_strideq 1181 mov hd, r1 1182 mov edged, r2 1183 %endif 1184%else 1185cglobal sgr_filter_5x5_16bpc, 5, 15, 16, -400*24-16, dst, dst_stride, left, lpf, \ 1186 lpf_stride, w, edge, params, h 1187%endif 1188%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1189 movifnidn wd, wm 1190%endif 1191%if ARCH_X86_64 1192 mov paramsq, paramsmp 1193 lea r13, [sgr_x_by_x-0xf03] 1194 mov edged, r8m 1195 add wd, wd 1196 mov hd, r6m 1197 movu m10, [paramsq] 1198 mova m12, [pw_256] 1199 add lpfq, wq 1200 mova m8, [pd_8] 1201 lea t1, [rsp+wq+20] 1202 mova m9, [pw_25] 1203 add dstq, wq 1204 lea t3, [rsp+wq*2+400*12+16] 1205 mova m11, [pd_0xf00800a4] 1206 lea t4, [rsp+wq+400*20+16] 1207 pshufhw m7, m10, q0000 1208 pshufb m10, m12 ; s0 1209 punpckhqdq m7, m7 ; w0 1210 neg wq 1211 mova m13, [pd_34816] ; (1 << 11) + (1 << 15) 1212 pxor m6, m6 1213 mova m14, [pw_1023] 1214 psllw m7, 4 1215 mova m15, [sgr_lshuf5] 1216 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w 1217 %define lpfm [rsp+0] 1218 %define lpf_stridem [rsp+8] 1219%else 1220 mov r1, [rstk+stack_offset+32] ; params 1221 LEA r6, $$ 1222 add wd, wd 1223 movu m1, [r1] 1224 add lpfm, wq 1225 lea t1, [rsp+extra_stack+wq+20] 1226 add dstq, wq 1227 lea t3, [rsp+extra_stack+wq*2+400*12+16] 1228 mov dstm, dstq 1229 lea t4, [rsp+extra_stack+wq+400*20+16] 1230 mov t3m, t3 1231 pshufhw m7, m1, q0000 1232 mov t4m, t4 1233 pshufb m1, m12 ; s0 1234 punpckhqdq m7, m7 ; w0 1235 psllw m7, 4 1236 neg wq 1237 mova m10, m1 1238 pxor m6, m6 1239 mov w1m, wd 1240 sub wd, 4 1241 mov lpfq, lpfm 1242 mov lpf_strideq, lpf_stridem 1243 mov w0m, wd 1244%endif 1245 test edgeb, 4 ; LR_HAVE_TOP 1246 jz .no_top 1247 call .h_top 1248 add lpfq, lpf_strideq 1249 movif32 t2m, t1 1250 mov t2, t1 1251 call .top_fixup 1252 add t1, 400*6 1253 call .h_top 1254 lea r10, [lpfq+lpf_strideq*4] 1255 mov lpfq, dstq 1256 movif64 lpf_stridem, lpf_strideq 1257 add r10, lpf_strideq 1258 mov lpfm, r10 ; below 1259 movif32 t0m, t2 1260 mov t0, t2 1261 dec hd 1262 jz .height1 1263 or edged, 16 1264 call .h 1265.main: 1266 add lpfq, dst_stridemp 1267 movif32 t4, t4m 1268 call .hv 1269 call .prep_n 1270 sub hd, 2 1271 jl .extend_bottom 1272.main_loop: 1273 movif32 lpfq, hvsrcm 1274 add lpfq, dst_stridemp 1275%if ARCH_X86_64 1276 test hb, hb 1277%else 1278 mov r5, hd 1279 test r5, r5 1280%endif 1281 jz .odd_height 1282 call .h 1283 add lpfq, dst_stridemp 1284 call .hv 1285 movif32 dstq, dstm 1286 call .n0 1287 call .n1 1288 sub hd, 2 1289 movif32 t0, t0m 1290 jge .main_loop 1291 test edgeb, 8 ; LR_HAVE_BOTTOM 1292 jz .extend_bottom 1293 mov lpfq, lpfm 1294 call .h_top 1295 add lpfq, lpf_stridem 1296 call .hv_bottom 1297.end: 1298 movif32 dstq, dstm 1299 call .n0 1300 call .n1 1301.end2: 1302 RET 1303.height1: 1304 movif32 t4, t4m 1305 call .hv 1306 call .prep_n 1307 jmp .odd_height_end 1308.odd_height: 1309 call .hv 1310 movif32 dstq, dstm 1311 call .n0 1312 call .n1 1313.odd_height_end: 1314 call .v 1315 movif32 dstq, dstm 1316 call .n0 1317 jmp .end2 1318.extend_bottom: 1319 call .v 1320 jmp .end 1321.no_top: 1322 lea r10, [lpfq+lpf_strideq*4] 1323 mov lpfq, dstq 1324 movif64 lpf_stridem, lpf_strideq 1325 lea r10, [r10+lpf_strideq*2] 1326 mov lpfm, r10 1327 call .h 1328 lea t2, [t1+400*6] 1329 movif32 t2m, t2 1330 call .top_fixup 1331 dec hd 1332 jz .no_top_height1 1333 or edged, 16 1334 mov t0, t1 1335 mov t1, t2 1336 movif32 t0m, t0 1337 jmp .main 1338.no_top_height1: 1339 movif32 t3, t3m 1340 movif32 t4, t4m 1341 call .v 1342 call .prep_n 1343 jmp .odd_height_end 1344.extend_right: 1345%assign stack_offset stack_offset+8 1346%assign calloff 8 1347 movd m1, wd 1348 mova m3, [base+pb_m14_m13] 1349 mova m0, [base+pb_0to15] 1350 pshufb m1, m6 1351 psubb m2, m12, m1 1352 psubb m3, m1 1353 movd m1, [lpfq-2] 1354 pcmpgtb m2, m0 1355 pcmpgtb m3, m0 1356 pshufb m1, m12 1357 pand m4, m2 1358 pand m5, m3 1359 pandn m2, m1 1360 pandn m3, m1 1361 por m4, m2 1362 por m5, m3 1363 ret 1364%assign stack_offset stack_offset-4 1365%assign calloff 4 1366.h: ; horizontal boxsum 1367%if ARCH_X86_64 1368 lea wq, [r5-4] 1369%else 1370 %define leftq r5 1371%endif 1372 test edgeb, 1 ; LR_HAVE_LEFT 1373 jz .h_extend_left 1374 movif32 leftq, leftm 1375 movddup m5, [leftq] 1376 movif32 wq, w0m 1377 mova m4, [lpfq+wq+4] 1378 add leftmp, 8 1379 palignr m4, m5, 10 1380 jmp .h_main 1381.h_extend_left: 1382 movif32 wq, w0m 1383 mova m4, [lpfq+wq+4] 1384 pshufb m4, m15 1385 jmp .h_main 1386.h_top: 1387%if ARCH_X86_64 1388 lea wq, [r5-4] 1389%endif 1390 test edgeb, 1 ; LR_HAVE_LEFT 1391 jz .h_extend_left 1392 movif32 wq, w0m 1393.h_loop: 1394 movu m4, [lpfq+wq- 2] 1395.h_main: 1396 movu m5, [lpfq+wq+14] 1397 test edgeb, 2 ; LR_HAVE_RIGHT 1398 jnz .h_have_right 1399 cmp wd, -20 1400 jl .h_have_right 1401 call .extend_right 1402.h_have_right: 1403 palignr m2, m5, m4, 2 1404 paddw m0, m4, m2 1405 palignr m3, m5, m4, 6 1406 paddw m0, m3 1407 punpcklwd m1, m2, m3 1408 pmaddwd m1, m1 1409 punpckhwd m2, m3 1410 pmaddwd m2, m2 1411 palignr m5, m4, 8 1412 paddw m0, m5 1413 punpcklwd m3, m4, m5 1414 pmaddwd m3, m3 1415 paddd m1, m3 1416 punpckhwd m3, m4, m5 1417 pmaddwd m3, m3 1418 shufps m4, m5, q2121 1419 paddw m0, m4 ; sum 1420 punpcklwd m5, m4, m6 1421 pmaddwd m5, m5 1422 punpckhwd m4, m6 1423 pmaddwd m4, m4 1424 paddd m2, m3 1425 test edgeb, 16 ; y > 0 1426 jz .h_loop_end 1427 paddw m0, [t1+wq+400*0] 1428 paddd m1, [t1+wq+400*2] 1429 paddd m2, [t1+wq+400*4] 1430.h_loop_end: 1431 paddd m1, m5 ; sumsq 1432 paddd m2, m4 1433 mova [t1+wq+400*0], m0 1434 mova [t1+wq+400*2], m1 1435 mova [t1+wq+400*4], m2 1436 add wq, 16 1437 jl .h_loop 1438 ret 1439.top_fixup: 1440%if ARCH_X86_64 1441 lea wq, [r5-4] 1442%else 1443 mov wd, w0m 1444%endif 1445.top_fixup_loop: ; the sums of the first row needs to be doubled 1446 mova m0, [t1+wq+400*0] 1447 mova m1, [t1+wq+400*2] 1448 mova m2, [t1+wq+400*4] 1449 paddw m0, m0 1450 paddd m1, m1 1451 paddd m2, m2 1452 mova [t2+wq+400*0], m0 1453 mova [t2+wq+400*2], m1 1454 mova [t2+wq+400*4], m2 1455 add wq, 16 1456 jl .top_fixup_loop 1457 ret 1458ALIGN function_align 1459.hv: ; horizontal boxsum + vertical boxsum + ab 1460%if ARCH_X86_64 1461 lea wq, [r5-4] 1462%else 1463 mov hvsrcm, lpfq 1464%endif 1465 test edgeb, 1 ; LR_HAVE_LEFT 1466 jz .hv_extend_left 1467 movif32 leftq, leftm 1468 movddup m5, [leftq] 1469 movif32 wq, w0m 1470 mova m4, [lpfq+wq+4] 1471 add leftmp, 8 1472 palignr m4, m5, 10 1473 jmp .hv_main 1474.hv_extend_left: 1475 movif32 wq, w0m 1476 mova m4, [lpfq+wq+4] 1477 pshufb m4, m15 1478 jmp .hv_main 1479.hv_bottom: 1480%if ARCH_X86_64 1481 lea wq, [r5-4] 1482%else 1483 mov hvsrcm, lpfq 1484%endif 1485 test edgeb, 1 ; LR_HAVE_LEFT 1486 jz .hv_extend_left 1487 movif32 wq, w0m 1488%if ARCH_X86_32 1489 jmp .hv_loop_start 1490%endif 1491.hv_loop: 1492 movif32 lpfq, hvsrcm 1493.hv_loop_start: 1494 movu m4, [lpfq+wq- 2] 1495.hv_main: 1496 movu m5, [lpfq+wq+14] 1497 test edgeb, 2 ; LR_HAVE_RIGHT 1498 jnz .hv_have_right 1499 cmp wd, -20 1500 jl .hv_have_right 1501 call .extend_right 1502.hv_have_right: 1503 movif32 t3, hd 1504 palignr m3, m5, m4, 2 1505 paddw m0, m4, m3 1506 palignr m1, m5, m4, 6 1507 paddw m0, m1 1508 punpcklwd m2, m3, m1 1509 pmaddwd m2, m2 1510 punpckhwd m3, m1 1511 pmaddwd m3, m3 1512 palignr m5, m4, 8 1513 paddw m0, m5 1514 punpcklwd m1, m4, m5 1515 pmaddwd m1, m1 1516 paddd m2, m1 1517 punpckhwd m1, m4, m5 1518 pmaddwd m1, m1 1519 shufps m4, m5, q2121 1520 paddw m0, m4 ; h sum 1521 punpcklwd m5, m4, m6 1522 pmaddwd m5, m5 1523 punpckhwd m4, m6 1524 pmaddwd m4, m4 1525 paddd m3, m1 1526 paddd m2, m5 ; h sumsq 1527 paddd m3, m4 1528 paddw m1, m0, [t1+wq+400*0] 1529 paddd m4, m2, [t1+wq+400*2] 1530 paddd m5, m3, [t1+wq+400*4] 1531%if ARCH_X86_64 1532 test hd, hd 1533%else 1534 test t3, t3 1535%endif 1536 jz .hv_last_row 1537.hv_main2: 1538 paddw m1, [t2+wq+400*0] ; hv sum 1539 paddd m4, [t2+wq+400*2] ; hv sumsq 1540 paddd m5, [t2+wq+400*4] 1541 mova [t0+wq+400*0], m0 1542 mova [t0+wq+400*2], m2 1543 mova [t0+wq+400*4], m3 1544 psrlw m3, m1, 1 1545 paddd m4, m8 1546 pavgw m3, m6 ; (b + 2) >> 2 1547 paddd m5, m8 1548 psrld m4, 4 ; (a + 8) >> 4 1549 punpcklwd m2, m3, m6 1550 psrld m5, 4 1551 punpckhwd m3, m6 1552 MULLD m4, m9, m0 ; a * 25 1553 MULLD m5, m9, m0 1554 pmaddwd m2, m2 ; b * b 1555 pmaddwd m3, m3 1556 punpcklwd m0, m1, m6 ; b 1557 punpckhwd m1, m6 1558 MAXSD m4, m2, m6 1559 MAXSD m5, m3, m6, 1 1560 psubd m4, m2 ; p 1561 psubd m5, m3 1562 MULLD m4, m10, m2 ; p * s 1563 MULLD m5, m10, m2 1564 pmaddwd m0, m11 ; b * 164 1565 pmaddwd m1, m11 1566 paddusw m4, m11 1567 paddusw m5, m11 1568 psrld m3, m4, 20 ; min(z, 255) 1569 movif32 t3, t3m 1570 psrld m4, m5, 20 1571 GATHER_X_BY_X m2, m3, m4, t2, t2m 1572 punpcklwd m3, m2, m2 1573 punpckhwd m4, m2, m2 1574 MULLD m0, m3, m5 1575 MULLD m1, m4, m5 1576 psubw m5, m12, m2 ; a 1577 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 1578 paddd m1, m13 1579 mova [t4+wq+4], m5 1580 psrld m0, 12 ; b 1581 psrld m1, 12 1582 mova [t3+wq*2+ 8], m0 1583 mova [t3+wq*2+24], m1 1584 add wq, 16 1585 jl .hv_loop 1586 mov t2, t1 1587 mov t1, t0 1588 mov t0, t2 1589 movif32 t2m, t2 1590 movif32 t0m, t0 1591 ret 1592.hv_last_row: ; esoteric edge case for odd heights 1593 mova [t1+wq+400*0], m1 1594 paddw m1, m0 1595 mova [t1+wq+400*2], m4 1596 paddd m4, m2 1597 mova [t1+wq+400*4], m5 1598 paddd m5, m3 1599 jmp .hv_main2 1600.v: ; vertical boxsum + ab 1601%if ARCH_X86_64 1602 lea wq, [r5-4] 1603%else 1604 mov wd, w0m 1605%endif 1606.v_loop: 1607 mova m0, [t1+wq+400*0] 1608 mova m2, [t1+wq+400*2] 1609 mova m3, [t1+wq+400*4] 1610 paddw m1, m0, [t2+wq+400*0] 1611 paddd m4, m2, [t2+wq+400*2] 1612 paddd m5, m3, [t2+wq+400*4] 1613 paddw m0, m0 1614 paddd m2, m2 1615 paddd m3, m3 1616 paddw m1, m0 ; hv sum 1617 paddd m4, m2 ; hv sumsq 1618 paddd m5, m3 1619 psrlw m3, m1, 1 1620 paddd m4, m8 1621 pavgw m3, m6 ; (b + 2) >> 2 1622 paddd m5, m8 1623 psrld m4, 4 ; (a + 8) >> 4 1624 punpcklwd m2, m3, m6 1625 psrld m5, 4 1626 punpckhwd m3, m6 1627 MULLD m4, m9, m0 ; a * 25 1628 MULLD m5, m9, m0 1629 pmaddwd m2, m2 ; b * b 1630 pmaddwd m3, m3 1631 punpcklwd m0, m1, m6 ; b 1632 punpckhwd m1, m6 1633 MAXSD m4, m2, m6 1634 MAXSD m5, m3, m6, 1 1635 psubd m4, m2 ; p 1636 psubd m5, m3 1637 MULLD m4, m10, m2 ; p * s 1638 MULLD m5, m10, m2 1639 pmaddwd m0, m11 ; b * 164 1640 pmaddwd m1, m11 1641 paddusw m4, m11 1642 paddusw m5, m11 1643 psrld m3, m4, 20 ; min(z, 255) 1644 psrld m4, m5, 20 1645 GATHER_X_BY_X m2, m3, m4, t2, t2m 1646 punpcklwd m3, m2, m2 1647 punpckhwd m4, m2, m2 1648 MULLD m0, m3, m5 1649 MULLD m1, m4, m5 1650 psubw m5, m12, m2 ; a 1651 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 1652 paddd m1, m13 1653 mova [t4+wq+4], m5 1654 psrld m0, 12 ; b 1655 psrld m1, 12 1656 mova [t3+wq*2+ 8], m0 1657 mova [t3+wq*2+24], m1 1658 add wq, 16 1659 jl .v_loop 1660 ret 1661.prep_n: ; initial neighbor setup 1662 movif64 wq, r5 1663 movif32 wd, w1m 1664.prep_n_loop: 1665 movu m0, [t4+wq*1+ 2] 1666 movu m3, [t4+wq*1+ 4] 1667 movu m1, [t3+wq*2+ 4] 1668 movu m4, [t3+wq*2+ 8] 1669 movu m2, [t3+wq*2+20] 1670 movu m5, [t3+wq*2+24] 1671 paddw m3, m0 1672 paddd m4, m1 1673 paddd m5, m2 1674 paddw m3, [t4+wq*1+ 0] 1675 paddd m4, [t3+wq*2+ 0] 1676 paddd m5, [t3+wq*2+16] 1677 paddw m0, m3 1678 psllw m3, 2 1679 paddd m1, m4 1680 pslld m4, 2 1681 paddd m2, m5 1682 pslld m5, 2 1683 paddw m0, m3 ; a 565 1684 paddd m1, m4 ; b 565 1685 paddd m2, m5 1686 mova [t4+wq*1+400*2+ 0], m0 1687 mova [t3+wq*2+400*4+ 0], m1 1688 mova [t3+wq*2+400*4+16], m2 1689 add wq, 16 1690 jl .prep_n_loop 1691 ret 1692ALIGN function_align 1693.n0: ; neighbor + output (even rows) 1694 movif64 wq, r5 1695 movif32 wd, w1m 1696.n0_loop: 1697 movu m0, [t4+wq*1+ 2] 1698 movu m3, [t4+wq*1+ 4] 1699 movu m1, [t3+wq*2+ 4] 1700 movu m4, [t3+wq*2+ 8] 1701 movu m2, [t3+wq*2+20] 1702 movu m5, [t3+wq*2+24] 1703 paddw m3, m0 1704 paddd m4, m1 1705 paddd m5, m2 1706 paddw m3, [t4+wq*1+ 0] 1707 paddd m4, [t3+wq*2+ 0] 1708 paddd m5, [t3+wq*2+16] 1709 paddw m0, m3 1710 psllw m3, 2 1711 paddd m1, m4 1712 pslld m4, 2 1713 paddd m2, m5 1714 pslld m5, 2 1715 paddw m0, m3 ; a 565 1716 paddd m1, m4 ; b 565 1717 paddd m2, m5 1718 paddw m3, m0, [t4+wq*1+400*2+ 0] 1719 paddd m4, m1, [t3+wq*2+400*4+ 0] 1720 paddd m5, m2, [t3+wq*2+400*4+16] 1721 mova [t4+wq*1+400*2+ 0], m0 1722 mova [t3+wq*2+400*4+ 0], m1 1723 mova [t3+wq*2+400*4+16], m2 1724 mova m0, [dstq+wq] 1725 punpcklwd m1, m0, m6 ; src 1726 punpcklwd m2, m3, m6 ; a 1727 pmaddwd m2, m1 ; a * src 1728 punpckhwd m1, m0, m6 1729 punpckhwd m3, m6 1730 pmaddwd m3, m1 1731 paddd m2, m4 ; a * src + b + (1 << 8) 1732 paddd m3, m5 1733 psrld m2, 9 1734 psrld m3, 9 1735 packssdw m2, m3 1736 psllw m1, m0, 4 1737 psubw m2, m1 1738 pmulhrsw m2, m7 1739 paddw m0, m2 1740 pmaxsw m0, m6 1741 pminsw m0, m14 1742 mova [dstq+wq], m0 1743 add wq, 16 1744 jl .n0_loop 1745 add dstq, dst_stridemp 1746 ret 1747ALIGN function_align 1748.n1: ; neighbor + output (odd rows) 1749 movif64 wq, r5 1750 movif32 wd, w1m 1751.n1_loop: 1752 mova m0, [dstq+wq] 1753 mova m3, [t4+wq*1+400*2+ 0] 1754 mova m4, [t3+wq*2+400*4+ 0] 1755 mova m5, [t3+wq*2+400*4+16] 1756 punpcklwd m1, m0, m6 ; src 1757 punpcklwd m2, m3, m6 ; a 1758 pmaddwd m2, m1 1759 punpckhwd m1, m0, m6 1760 punpckhwd m3, m6 1761 pmaddwd m3, m1 1762 paddd m2, m4 ; a * src + b + (1 << 7) 1763 paddd m3, m5 1764 psrld m2, 8 1765 psrld m3, 8 1766 packssdw m2, m3 1767 psllw m1, m0, 4 1768 psubw m2, m1 1769 pmulhrsw m2, m7 1770 paddw m0, m2 1771 pmaxsw m0, m6 1772 pminsw m0, m14 1773 mova [dstq+wq], m0 1774 add wq, 16 1775 jl .n1_loop 1776 add dstq, dst_stridemp 1777 movif32 dstm, dstq 1778 ret 1779 1780%if ARCH_X86_32 1781 %if STACK_ALIGNMENT < 16 1782 %assign extra_stack 4*16 1783 %else 1784 %assign extra_stack 2*16 1785 %endif 1786cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ 1787 dst, dst_stride, left, lpf, lpf_stride, w, params, h 1788 %if STACK_ALIGNMENT < 16 1789 %define dstm dword [esp+calloff+16*2+4*0] 1790 %define dst_stridemp dword [esp+calloff+16*2+4*1] 1791 %define leftm dword [esp+calloff+16*2+4*2] 1792 %define lpfm dword [esp+calloff+16*2+4*3] 1793 %define lpf_stridem dword [esp+calloff+16*2+4*4] 1794 %define w0m dword [esp+calloff+16*2+4*5] 1795 %define hd dword [esp+calloff+16*2+4*6] 1796 %define edgeb byte [esp+calloff+16*2+4*7] 1797 %define edged dword [esp+calloff+16*2+4*7] 1798 %define leftmp leftm 1799 %else 1800 %define w0m wm 1801 %define hd dword r6m 1802 %define edgeb byte r8m 1803 %define edged dword r8m 1804 %endif 1805 %define hvsrcm dword [esp+calloff+4*0] 1806 %define w1m dword [esp+calloff+4*1] 1807 %define t3m dword [esp+calloff+4*2] 1808 %define t4m dword [esp+calloff+4*3] 1809 %define m8 [base+pd_8] 1810 %define m9 [esp+calloff+16*1] 1811 %define m10 [base+pd_0xf00801c7] 1812 %define m11 [base+pd_34816] 1813 %define m12 [base+pw_256] 1814 %define m13 [base+pw_1023] 1815 %define m14 [base+sgr_lshuf3] 1816 %define m15 m6 1817 %define base r6-$$ 1818 %assign calloff 0 1819 %if STACK_ALIGNMENT < 16 1820 mov dst_strideq, [rstk+stack_offset+ 8] 1821 mov leftq, [rstk+stack_offset+12] 1822 mov lpfq, [rstk+stack_offset+16] 1823 mov lpf_strideq, [rstk+stack_offset+20] 1824 mov wd, [rstk+stack_offset+24] 1825 mov dstm, dstq 1826 mov dst_stridemp, dst_strideq 1827 mov leftm, leftq 1828 mov r1, [rstk+stack_offset+28] 1829 mov r2, [rstk+stack_offset+36] 1830 mov lpfm, lpfq 1831 mov lpf_stridem, lpf_strideq 1832 mov hd, r1 1833 mov edged, r2 1834 %endif 1835%else 1836cglobal sgr_filter_3x3_16bpc, 5, 15, 16, 400*42+8, dst, dst_stride, left, lpf, \ 1837 lpf_stride, w, edge, params, h 1838%endif 1839%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1840 movifnidn wd, wm 1841%endif 1842%if ARCH_X86_64 1843 mov paramsq, paramsmp 1844 lea r13, [sgr_x_by_x-0xf03] 1845 mov edged, r8m 1846 add wd, wd 1847 mov hd, r6m 1848 movq m9, [paramsq+4] 1849 mova m12, [pw_256] 1850 add lpfq, wq 1851 lea t1, [rsp+wq+12] 1852 mova m8, [pd_8] 1853 add dstq, wq 1854 lea t3, [rsp+wq*2+400*12+8] 1855 mova m10, [pd_0xf00801c7] 1856 lea t4, [rsp+wq+400*32+8] 1857 mova m11, [pd_34816] 1858 pshuflw m7, m9, q3333 1859 pshufb m9, m12 ; s1 1860 punpcklqdq m7, m7 ; w1 1861 neg wq 1862 pxor m6, m6 1863 mova m13, [pw_1023] 1864 psllw m7, 4 1865 mova m14, [sgr_lshuf3] 1866 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w 1867 %define lpfm [rsp] 1868%else 1869 mov r1, [rstk+stack_offset+32] ; params 1870 LEA r6, $$ 1871 add wd, wd 1872 movq m1, [r1+4] 1873 add lpfm, wq 1874 lea t1, [rsp+extra_stack+wq+20] 1875 add dstq, wq 1876 lea t3, [rsp+extra_stack+wq*2+400*12+16] 1877 mov dstm, dstq 1878 lea t4, [rsp+extra_stack+wq+400*32+16] 1879 mov t3m, t3 1880 pshuflw m7, m1, q3333 1881 mov t4m, t4 1882 pshufb m1, m12 ; s1 1883 punpcklqdq m7, m7 ; w1 1884 psllw m7, 4 1885 neg wq 1886 mova m9, m1 1887 pxor m6, m6 1888 mov w1m, wd 1889 sub wd, 4 1890 mov lpfq, lpfm 1891 mov lpf_strideq, lpf_stridem 1892 mov w0m, wd 1893%endif 1894 test edgeb, 4 ; LR_HAVE_TOP 1895 jz .no_top 1896 call .h_top 1897 add lpfq, lpf_strideq 1898 mov t2, t1 1899 add t1, 400*6 1900 call .h_top 1901 lea r10, [lpfq+lpf_strideq*4] 1902 mov lpfq, dstq 1903 add r10, lpf_strideq 1904 mov lpfm, r10 ; below 1905 movif32 t4, t4m 1906 call .hv0 1907.main: 1908 dec hd 1909 jz .height1 1910 movif32 lpfq, hvsrcm 1911 add lpfq, dst_stridemp 1912 call .hv1 1913 call .prep_n 1914 sub hd, 2 1915 jl .extend_bottom 1916.main_loop: 1917 movif32 lpfq, hvsrcm 1918 add lpfq, dst_stridemp 1919 call .hv0 1920%if ARCH_X86_64 1921 test hb, hb 1922%else 1923 mov r5, hd 1924 test r5, r5 1925%endif 1926 jz .odd_height 1927 movif32 lpfq, hvsrcm 1928 add lpfq, dst_stridemp 1929 call .hv1 1930 call .n0 1931 call .n1 1932 sub hd, 2 1933 jge .main_loop 1934 test edgeb, 8 ; LR_HAVE_BOTTOM 1935 jz .extend_bottom 1936 mov lpfq, lpfm 1937 call .hv0_bottom 1938%if ARCH_X86_64 1939 add lpfq, lpf_strideq 1940%else 1941 mov lpfq, hvsrcm 1942 add lpfq, lpf_stridem 1943%endif 1944 call .hv1_bottom 1945.end: 1946 call .n0 1947 call .n1 1948.end2: 1949 RET 1950.height1: 1951 call .v1 1952 call .prep_n 1953 jmp .odd_height_end 1954.odd_height: 1955 call .v1 1956 call .n0 1957 call .n1 1958.odd_height_end: 1959 call .v0 1960 call .v1 1961 call .n0 1962 jmp .end2 1963.extend_bottom: 1964 call .v0 1965 call .v1 1966 jmp .end 1967.no_top: 1968 lea r10, [lpfq+lpf_strideq*4] 1969 mov lpfq, dstq 1970 lea r10, [r10+lpf_strideq*2] 1971 mov lpfm, r10 1972 call .h 1973%if ARCH_X86_64 1974 lea wq, [r5-4] 1975%else 1976 mov wq, w0m 1977 mov hvsrcm, lpfq 1978%endif 1979 lea t2, [t1+400*6] 1980.top_fixup_loop: 1981 mova m0, [t1+wq+400*0] 1982 mova m1, [t1+wq+400*2] 1983 mova m2, [t1+wq+400*4] 1984 mova [t2+wq+400*0], m0 1985 mova [t2+wq+400*2], m1 1986 mova [t2+wq+400*4], m2 1987 add wq, 16 1988 jl .top_fixup_loop 1989 movif32 t3, t3m 1990 movif32 t4, t4m 1991 call .v0 1992 jmp .main 1993.extend_right: 1994%assign stack_offset stack_offset+8 1995%assign calloff 8 1996 movd m1, wd 1997 mova m2, [base+pb_m2_m1] 1998 mova m3, [base+pb_0to15] 1999 movd m5, [lpfq-2] 2000 pshufb m1, m6 2001 pshufb m5, m12 2002 psubb m2, m1 2003 pcmpgtb m2, m3 2004 pand m4, m2 2005 pandn m2, m5 2006 por m4, m2 2007 ret 2008%assign stack_offset stack_offset-4 2009%assign calloff 4 2010.h: ; horizontal boxsum 2011%if ARCH_X86_64 2012 lea wq, [r5-4] 2013%else 2014 %define leftq r5 2015%endif 2016 test edgeb, 1 ; LR_HAVE_LEFT 2017 jz .h_extend_left 2018 movif32 leftq, leftm 2019 movddup m5, [leftq] 2020 movif32 wq, w0m 2021 mova m4, [lpfq+wq+4] 2022 add leftmp, 8 2023 palignr m4, m5, 12 2024 jmp .h_main 2025.h_extend_left: 2026 movif32 wq, w0m 2027 mova m4, [lpfq+wq+4] 2028 pshufb m4, m14 2029 jmp .h_main 2030.h_top: 2031%if ARCH_X86_64 2032 lea wq, [r5-4] 2033%endif 2034 test edgeb, 1 ; LR_HAVE_LEFT 2035 jz .h_extend_left 2036 movif32 wq, w0m 2037.h_loop: 2038 movu m4, [lpfq+wq+ 0] 2039.h_main: 2040 movu m5, [lpfq+wq+16] 2041 test edgeb, 2 ; LR_HAVE_RIGHT 2042 jnz .h_have_right 2043 cmp wd, -18 2044 jl .h_have_right 2045 call .extend_right 2046.h_have_right: 2047 palignr m0, m5, m4, 2 2048 paddw m1, m4, m0 2049 punpcklwd m2, m4, m0 2050 pmaddwd m2, m2 2051 punpckhwd m3, m4, m0 2052 pmaddwd m3, m3 2053 palignr m5, m4, 4 2054 paddw m1, m5 ; sum 2055 punpcklwd m4, m5, m6 2056 pmaddwd m4, m4 2057 punpckhwd m5, m6 2058 pmaddwd m5, m5 2059 paddd m2, m4 ; sumsq 2060 paddd m3, m5 2061 mova [t1+wq+400*0], m1 2062 mova [t1+wq+400*2], m2 2063 mova [t1+wq+400*4], m3 2064 add wq, 16 2065 jl .h_loop 2066 ret 2067ALIGN function_align 2068.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 2069%if ARCH_X86_64 2070 lea wq, [r5-4] 2071%else 2072 mov hvsrcm, lpfq 2073%endif 2074 test edgeb, 1 ; LR_HAVE_LEFT 2075 jz .hv0_extend_left 2076 movif32 leftq, leftm 2077 movddup m5, [leftq] 2078 movif32 wq, w0m 2079 mova m4, [lpfq+wq+4] 2080 add leftmp, 8 2081 palignr m4, m5, 12 2082 jmp .hv0_main 2083.hv0_extend_left: 2084 movif32 wq, w0m 2085 mova m4, [lpfq+wq+4] 2086 pshufb m4, m14 2087 jmp .hv0_main 2088.hv0_bottom: 2089%if ARCH_X86_64 2090 lea wq, [r5-4] 2091%else 2092 mov hvsrcm, lpfq 2093%endif 2094 test edgeb, 1 ; LR_HAVE_LEFT 2095 jz .hv0_extend_left 2096 movif32 wq, w0m 2097%if ARCH_X86_32 2098 jmp .hv0_loop_start 2099%endif 2100.hv0_loop: 2101 movif32 lpfq, hvsrcm 2102.hv0_loop_start: 2103 movu m4, [lpfq+wq+ 0] 2104.hv0_main: 2105 movu m5, [lpfq+wq+16] 2106 test edgeb, 2 ; LR_HAVE_RIGHT 2107 jnz .hv0_have_right 2108 cmp wd, -18 2109 jl .hv0_have_right 2110 call .extend_right 2111.hv0_have_right: 2112 palignr m0, m5, m4, 2 2113 paddw m1, m4, m0 2114 punpcklwd m2, m4, m0 2115 pmaddwd m2, m2 2116 punpckhwd m3, m4, m0 2117 pmaddwd m3, m3 2118 palignr m5, m4, 4 2119 paddw m1, m5 ; sum 2120 punpcklwd m4, m5, m6 2121 pmaddwd m4, m4 2122 punpckhwd m5, m6 2123 pmaddwd m5, m5 2124 paddd m2, m4 ; sumsq 2125 paddd m3, m5 2126 paddw m0, m1, [t1+wq+400*0] 2127 paddd m4, m2, [t1+wq+400*2] 2128 paddd m5, m3, [t1+wq+400*4] 2129 mova [t1+wq+400*0], m1 2130 mova [t1+wq+400*2], m2 2131 mova [t1+wq+400*4], m3 2132 paddw m1, m0, [t2+wq+400*0] 2133 paddd m2, m4, [t2+wq+400*2] 2134 paddd m3, m5, [t2+wq+400*4] 2135 mova [t2+wq+400*0], m0 2136 mova [t2+wq+400*2], m4 2137 mova [t2+wq+400*4], m5 2138 paddd m2, m8 2139 paddd m3, m8 2140 psrld m2, 4 ; (a + 8) >> 4 2141 psrld m3, 4 2142 pslld m4, m2, 3 2143 pslld m5, m3, 3 2144 paddd m4, m2 ; ((a + 8) >> 4) * 9 2145 paddd m5, m3 2146 psrlw m3, m1, 1 2147 pavgw m3, m6 ; (b + 2) >> 2 2148 punpcklwd m2, m3, m6 2149 pmaddwd m2, m2 2150 punpckhwd m3, m6 2151 pmaddwd m3, m3 2152 punpcklwd m0, m1, m6 ; b 2153 punpckhwd m1, m6 2154 MAXSD m4, m2, m15 2155 MAXSD m5, m3, m15 2156 psubd m4, m2 ; p 2157 psubd m5, m3 2158 MULLD m4, m9, m15 ; p * s 2159 MULLD m5, m9, m15 2160 pmaddwd m0, m10 ; b * 455 2161 pmaddwd m1, m10 2162 paddusw m4, m10 2163 paddusw m5, m10 2164 psrld m3, m4, 20 ; min(z, 255) 2165 movif32 t3, t3m 2166 psrld m4, m5, 20 2167 GATHER_X_BY_X m2, m3, m4, r0, dstm 2168 punpcklwd m3, m2, m2 2169 punpckhwd m4, m2, m2 2170 MULLD m0, m3, m15 2171 MULLD m1, m4, m15 2172 psubw m5, m12, m2 2173%if ARCH_X86_32 2174 pxor m6, m6 2175%endif 2176 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2177 paddd m1, m11 2178 mova [t4+wq+4], m5 2179 psrld m0, 12 2180 psrld m1, 12 2181 mova [t3+wq*2+ 8], m0 2182 mova [t3+wq*2+24], m1 2183 add wq, 16 2184 jl .hv0_loop 2185 ret 2186ALIGN function_align 2187.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 2188%if ARCH_X86_64 2189 lea wq, [r5-4] 2190%else 2191 mov hvsrcm, lpfq 2192%endif 2193 test edgeb, 1 ; LR_HAVE_LEFT 2194 jz .hv1_extend_left 2195 movif32 leftq, leftm 2196 movddup m5, [leftq] 2197 movif32 wq, w0m 2198 mova m4, [lpfq+wq+4] 2199 add leftmp, 8 2200 palignr m4, m5, 12 2201 jmp .hv1_main 2202.hv1_extend_left: 2203 movif32 wq, w0m 2204 mova m4, [lpfq+wq+4] 2205 pshufb m4, m14 2206 jmp .hv1_main 2207.hv1_bottom: 2208%if ARCH_X86_64 2209 lea wq, [r5-4] 2210%else 2211 mov hvsrcm, lpfq 2212%endif 2213 test edgeb, 1 ; LR_HAVE_LEFT 2214 jz .hv1_extend_left 2215 movif32 wq, w0m 2216%if ARCH_X86_32 2217 jmp .hv1_loop_start 2218%endif 2219.hv1_loop: 2220 movif32 lpfq, hvsrcm 2221.hv1_loop_start: 2222 movu m4, [lpfq+wq+ 0] 2223.hv1_main: 2224 movu m5, [lpfq+wq+16] 2225 test edgeb, 2 ; LR_HAVE_RIGHT 2226 jnz .hv1_have_right 2227 cmp wd, -18 2228 jl .hv1_have_right 2229 call .extend_right 2230.hv1_have_right: 2231 palignr m1, m5, m4, 2 2232 paddw m0, m4, m1 2233 punpcklwd m2, m4, m1 2234 pmaddwd m2, m2 2235 punpckhwd m3, m4, m1 2236 pmaddwd m3, m3 2237 palignr m5, m4, 4 2238 paddw m0, m5 ; h sum 2239 punpcklwd m1, m5, m6 2240 pmaddwd m1, m1 2241 punpckhwd m5, m6 2242 pmaddwd m5, m5 2243 paddd m2, m1 ; h sumsq 2244 paddd m3, m5 2245 paddw m1, m0, [t2+wq+400*0] 2246 paddd m4, m2, [t2+wq+400*2] 2247 paddd m5, m3, [t2+wq+400*4] 2248 mova [t2+wq+400*0], m0 2249 mova [t2+wq+400*2], m2 2250 mova [t2+wq+400*4], m3 2251 paddd m4, m8 2252 paddd m5, m8 2253 psrld m4, 4 ; (a + 8) >> 4 2254 psrld m5, 4 2255 pslld m2, m4, 3 2256 pslld m3, m5, 3 2257 paddd m4, m2 ; ((a + 8) >> 4) * 9 2258 paddd m5, m3 2259 psrlw m3, m1, 1 2260 pavgw m3, m6 ; (b + 2) >> 2 2261 punpcklwd m2, m3, m6 2262 pmaddwd m2, m2 2263 punpckhwd m3, m6 2264 pmaddwd m3, m3 2265 punpcklwd m0, m1, m6 ; b 2266 punpckhwd m1, m6 2267 MAXSD m4, m2, m15 2268 MAXSD m5, m3, m15 2269 psubd m4, m2 ; p 2270 psubd m5, m3 2271 MULLD m4, m9, m15 ; p * s 2272 MULLD m5, m9, m15 2273 pmaddwd m0, m10 ; b * 455 2274 pmaddwd m1, m10 2275 paddusw m4, m10 2276 paddusw m5, m10 2277 psrld m3, m4, 20 ; min(z, 255) 2278 movif32 t3, t3m 2279 psrld m4, m5, 20 2280 GATHER_X_BY_X m2, m3, m4, r0, dstm 2281 punpcklwd m3, m2, m2 2282 punpckhwd m4, m2, m2 2283 MULLD m0, m3, m15 2284 MULLD m1, m4, m15 2285 psubw m5, m12, m2 2286%if ARCH_X86_32 2287 pxor m6, m6 2288%endif 2289 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2290 paddd m1, m11 2291 mova [t4+wq*1+400*2 +4], m5 2292 psrld m0, 12 2293 psrld m1, 12 2294 mova [t3+wq*2+400*4+ 8], m0 2295 mova [t3+wq*2+400*4+24], m1 2296 add wq, 16 2297 jl .hv1_loop 2298 mov r10, t2 2299 mov t2, t1 2300 mov t1, r10 2301 ret 2302.v0: ; vertical boxsums + ab (even rows) 2303%if ARCH_X86_64 2304 lea wq, [r5-4] 2305%else 2306 mov wd, w0m 2307%endif 2308.v0_loop: 2309 mova m0, [t1+wq+400*0] 2310 mova m4, [t1+wq+400*2] 2311 mova m5, [t1+wq+400*4] 2312 paddw m0, m0 2313 paddd m4, m4 2314 paddd m5, m5 2315 paddw m1, m0, [t2+wq+400*0] 2316 paddd m2, m4, [t2+wq+400*2] 2317 paddd m3, m5, [t2+wq+400*4] 2318 mova [t2+wq+400*0], m0 2319 mova [t2+wq+400*2], m4 2320 mova [t2+wq+400*4], m5 2321 paddd m2, m8 2322 paddd m3, m8 2323 psrld m2, 4 ; (a + 8) >> 4 2324 psrld m3, 4 2325 pslld m4, m2, 3 2326 pslld m5, m3, 3 2327 paddd m4, m2 ; ((a + 8) >> 4) * 9 2328 paddd m5, m3 2329 psrlw m3, m1, 1 2330 pavgw m3, m6 ; (b + 2) >> 2 2331 punpcklwd m2, m3, m6 2332 pmaddwd m2, m2 2333 punpckhwd m3, m6 2334 pmaddwd m3, m3 2335 punpcklwd m0, m1, m6 ; b 2336 punpckhwd m1, m6 2337 MAXSD m4, m2, m15 2338 MAXSD m5, m3, m15 2339 psubd m4, m2 ; p 2340 psubd m5, m3 2341 MULLD m4, m9, m15 ; p * s 2342 MULLD m5, m9, m15 2343 pmaddwd m0, m10 ; b * 455 2344 pmaddwd m1, m10 2345 paddusw m4, m10 2346 paddusw m5, m10 2347 psrld m3, m4, 20 ; min(z, 255) 2348 psrld m4, m5, 20 2349 GATHER_X_BY_X m2, m3, m4, r0, dstm 2350 punpcklwd m3, m2, m2 2351 punpckhwd m4, m2, m2 2352 MULLD m0, m3, m15 2353 MULLD m1, m4, m15 2354 psubw m5, m12, m2 2355%if ARCH_X86_32 2356 pxor m6, m6 2357%endif 2358 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2359 paddd m1, m11 2360 mova [t4+wq*1+400*0+ 4], m5 2361 psrld m0, 12 2362 psrld m1, 12 2363 mova [t3+wq*2+400*0+ 8], m0 2364 mova [t3+wq*2+400*0+24], m1 2365 add wq, 16 2366 jl .v0_loop 2367 ret 2368.v1: ; vertical boxsums + ab (odd rows) 2369%if ARCH_X86_64 2370 lea wq, [r5-4] 2371%else 2372 mov wd, w0m 2373%endif 2374.v1_loop: 2375 mova m0, [t1+wq+400*0] 2376 mova m4, [t1+wq+400*2] 2377 mova m5, [t1+wq+400*4] 2378 paddw m1, m0, [t2+wq+400*0] 2379 paddd m2, m4, [t2+wq+400*2] 2380 paddd m3, m5, [t2+wq+400*4] 2381 mova [t2+wq+400*0], m0 2382 mova [t2+wq+400*2], m4 2383 mova [t2+wq+400*4], m5 2384 paddd m2, m8 2385 paddd m3, m8 2386 psrld m2, 4 ; (a + 8) >> 4 2387 psrld m3, 4 2388 pslld m4, m2, 3 2389 pslld m5, m3, 3 2390 paddd m4, m2 ; ((a + 8) >> 4) * 9 2391 paddd m5, m3 2392 psrlw m3, m1, 1 2393 pavgw m3, m6 ; (b + 2) >> 2 2394 punpcklwd m2, m3, m6 2395 pmaddwd m2, m2 2396 punpckhwd m3, m6 2397 pmaddwd m3, m3 2398 punpcklwd m0, m1, m6 ; b 2399 punpckhwd m1, m6 2400 MAXSD m4, m2, m15 2401 MAXSD m5, m3, m15 2402 psubd m4, m2 ; p 2403 psubd m5, m3 2404 MULLD m4, m9, m15 ; p * s 2405 MULLD m5, m9, m15 2406 pmaddwd m0, m10 ; b * 455 2407 pmaddwd m1, m10 2408 paddusw m4, m10 2409 paddusw m5, m10 2410 psrld m3, m4, 20 ; min(z, 255) 2411 psrld m4, m5, 20 2412 GATHER_X_BY_X m2, m3, m4, r0, dstm 2413 punpcklwd m3, m2, m2 2414 punpckhwd m4, m2, m2 2415 MULLD m0, m3, m15 2416 MULLD m1, m4, m15 2417 psubw m5, m12, m2 2418%if ARCH_X86_32 2419 pxor m6, m6 2420%endif 2421 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2422 paddd m1, m11 2423 mova [t4+wq*1+400*2+ 4], m5 2424 psrld m0, 12 2425 psrld m1, 12 2426 mova [t3+wq*2+400*4+ 8], m0 2427 mova [t3+wq*2+400*4+24], m1 2428 add wq, 16 2429 jl .v1_loop 2430 mov r10, t2 2431 mov t2, t1 2432 mov t1, r10 2433 ret 2434.prep_n: ; initial neighbor setup 2435 movif64 wq, r5 2436 movif32 wd, w1m 2437.prep_n_loop: 2438 movu m0, [t4+wq*1+400*0+ 4] 2439 movu m1, [t3+wq*2+400*0+ 8] 2440 movu m2, [t3+wq*2+400*0+24] 2441 movu m3, [t4+wq*1+400*0+ 2] 2442 movu m4, [t3+wq*2+400*0+ 4] 2443 movu m5, [t3+wq*2+400*0+20] 2444 paddw m0, [t4+wq*1+400*0+ 0] 2445 paddd m1, [t3+wq*2+400*0+ 0] 2446 paddd m2, [t3+wq*2+400*0+16] 2447 paddw m3, m0 2448 paddd m4, m1 2449 paddd m5, m2 2450 psllw m3, 2 ; a[-1] 444 2451 pslld m4, 2 ; b[-1] 444 2452 pslld m5, 2 2453 psubw m3, m0 ; a[-1] 343 2454 psubd m4, m1 ; b[-1] 343 2455 psubd m5, m2 2456 mova [t4+wq*1+400*4], m3 2457 mova [t3+wq*2+400*8+ 0], m4 2458 mova [t3+wq*2+400*8+16], m5 2459 movu m0, [t4+wq*1+400*2+ 4] 2460 movu m1, [t3+wq*2+400*4+ 8] 2461 movu m2, [t3+wq*2+400*4+24] 2462 movu m3, [t4+wq*1+400*2+ 2] 2463 movu m4, [t3+wq*2+400*4+ 4] 2464 movu m5, [t3+wq*2+400*4+20] 2465 paddw m0, [t4+wq*1+400*2+ 0] 2466 paddd m1, [t3+wq*2+400*4+ 0] 2467 paddd m2, [t3+wq*2+400*4+16] 2468 paddw m3, m0 2469 paddd m4, m1 2470 paddd m5, m2 2471 psllw m3, 2 ; a[ 0] 444 2472 pslld m4, 2 ; b[ 0] 444 2473 pslld m5, 2 2474 mova [t4+wq*1+400* 6], m3 2475 mova [t3+wq*2+400*12+ 0], m4 2476 mova [t3+wq*2+400*12+16], m5 2477 psubw m3, m0 ; a[ 0] 343 2478 psubd m4, m1 ; b[ 0] 343 2479 psubd m5, m2 2480 mova [t4+wq*1+400* 8], m3 2481 mova [t3+wq*2+400*16+ 0], m4 2482 mova [t3+wq*2+400*16+16], m5 2483 add wq, 16 2484 jl .prep_n_loop 2485 ret 2486ALIGN function_align 2487.n0: ; neighbor + output (even rows) 2488 movif64 wq, r5 2489 movif32 wd, w1m 2490.n0_loop: 2491 movu m3, [t4+wq*1+400*0+4] 2492 movu m1, [t4+wq*1+400*0+2] 2493 paddw m3, [t4+wq*1+400*0+0] 2494 paddw m1, m3 2495 psllw m1, 2 ; a[ 1] 444 2496 psubw m2, m1, m3 ; a[ 1] 343 2497 paddw m3, m2, [t4+wq*1+400*4] 2498 paddw m3, [t4+wq*1+400*6] 2499 mova [t4+wq*1+400*4], m2 2500 mova [t4+wq*1+400*6], m1 2501 movu m4, [t3+wq*2+400*0+8] 2502 movu m1, [t3+wq*2+400*0+4] 2503 paddd m4, [t3+wq*2+400*0+0] 2504 paddd m1, m4 2505 pslld m1, 2 ; b[ 1] 444 2506 psubd m2, m1, m4 ; b[ 1] 343 2507 paddd m4, m2, [t3+wq*2+400* 8+ 0] 2508 paddd m4, [t3+wq*2+400*12+ 0] 2509 mova [t3+wq*2+400* 8+ 0], m2 2510 mova [t3+wq*2+400*12+ 0], m1 2511 movu m5, [t3+wq*2+400*0+24] 2512 movu m1, [t3+wq*2+400*0+20] 2513 paddd m5, [t3+wq*2+400*0+16] 2514 paddd m1, m5 2515 pslld m1, 2 2516 psubd m2, m1, m5 2517 paddd m5, m2, [t3+wq*2+400* 8+16] 2518 paddd m5, [t3+wq*2+400*12+16] 2519 mova [t3+wq*2+400* 8+16], m2 2520 mova [t3+wq*2+400*12+16], m1 2521 mova m0, [dstq+wq] 2522 punpcklwd m1, m0, m6 2523 punpcklwd m2, m3, m6 2524 pmaddwd m2, m1 ; a * src 2525 punpckhwd m1, m0, m6 2526 punpckhwd m3, m6 2527 pmaddwd m3, m1 2528 paddd m2, m4 ; a * src + b + (1 << 8) 2529 paddd m3, m5 2530 psrld m2, 9 2531 psrld m3, 9 2532 packssdw m2, m3 2533 psllw m1, m0, 4 2534 psubw m2, m1 2535 pmulhrsw m2, m7 2536 paddw m0, m2 2537 pmaxsw m0, m6 2538 pminsw m0, m13 2539 mova [dstq+wq], m0 2540 add wq, 16 2541 jl .n0_loop 2542 add dstq, dst_stridemp 2543 ret 2544ALIGN function_align 2545.n1: ; neighbor + output (odd rows) 2546 movif64 wq, r5 2547 movif32 wd, w1m 2548.n1_loop: 2549 movu m3, [t4+wq*1+400*2+4] 2550 movu m1, [t4+wq*1+400*2+2] 2551 paddw m3, [t4+wq*1+400*2+0] 2552 paddw m1, m3 2553 psllw m1, 2 ; a[ 1] 444 2554 psubw m2, m1, m3 ; a[ 1] 343 2555 paddw m3, m2, [t4+wq*1+400*6] 2556 paddw m3, [t4+wq*1+400*8] 2557 mova [t4+wq*1+400*6], m1 2558 mova [t4+wq*1+400*8], m2 2559 movu m4, [t3+wq*2+400*4+8] 2560 movu m1, [t3+wq*2+400*4+4] 2561 paddd m4, [t3+wq*2+400*4+0] 2562 paddd m1, m4 2563 pslld m1, 2 ; b[ 1] 444 2564 psubd m2, m1, m4 ; b[ 1] 343 2565 paddd m4, m2, [t3+wq*2+400*12+ 0] 2566 paddd m4, [t3+wq*2+400*16+ 0] 2567 mova [t3+wq*2+400*12+ 0], m1 2568 mova [t3+wq*2+400*16+ 0], m2 2569 movu m5, [t3+wq*2+400*4+24] 2570 movu m1, [t3+wq*2+400*4+20] 2571 paddd m5, [t3+wq*2+400*4+16] 2572 paddd m1, m5 2573 pslld m1, 2 2574 psubd m2, m1, m5 2575 paddd m5, m2, [t3+wq*2+400*12+16] 2576 paddd m5, [t3+wq*2+400*16+16] 2577 mova [t3+wq*2+400*12+16], m1 2578 mova [t3+wq*2+400*16+16], m2 2579 mova m0, [dstq+wq] 2580 punpcklwd m1, m0, m6 2581 punpcklwd m2, m3, m6 2582 pmaddwd m2, m1 ; a * src 2583 punpckhwd m1, m0, m6 2584 punpckhwd m3, m6 2585 pmaddwd m3, m1 2586 paddd m2, m4 ; a * src + b + (1 << 8) 2587 paddd m3, m5 2588 psrld m2, 9 2589 psrld m3, 9 2590 packssdw m2, m3 2591 psllw m1, m0, 4 2592 psubw m2, m1 2593 pmulhrsw m2, m7 2594 paddw m0, m2 2595 pmaxsw m0, m6 2596 pminsw m0, m13 2597 mova [dstq+wq], m0 2598 add wq, 16 2599 jl .n1_loop 2600 add dstq, dst_stridemp 2601 movif32 dstm, dstq 2602 ret 2603 2604%if ARCH_X86_32 2605 %if STACK_ALIGNMENT < 16 2606 %assign extra_stack 10*16 2607 %else 2608 %assign extra_stack 8*16 2609 %endif 2610cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ 2611 dst, dst_stride, left, lpf, lpf_stride, w, params, h 2612 %if STACK_ALIGNMENT < 16 2613 %define dstm dword [esp+calloff+16*8+4*0] 2614 %define dst_stridemp dword [esp+calloff+16*8+4*1] 2615 %define leftm dword [esp+calloff+16*8+4*2] 2616 %define lpfm dword [esp+calloff+16*8+4*3] 2617 %define lpf_stridem dword [esp+calloff+16*8+4*4] 2618 %define w0m dword [esp+calloff+16*8+4*5] 2619 %define hd dword [esp+calloff+16*8+4*6] 2620 %define edgeb byte [esp+calloff+16*8+4*7] 2621 %define edged dword [esp+calloff+16*8+4*7] 2622 %define leftmp leftm 2623 %else 2624 %define w0m wm 2625 %define hd dword r6m 2626 %define edgeb byte r8m 2627 %define edged dword r8m 2628 %endif 2629 %define hvsrcm dword [esp+calloff+4*0] 2630 %define w1m dword [esp+calloff+4*1] 2631 %define t3m dword [esp+calloff+4*2] 2632 %define t4m dword [esp+calloff+4*3] 2633 %xdefine m8 m6 2634 %define m9 [base+pd_8] 2635 %define m10 [base+pd_34816] 2636 %define m11 [base+pd_0xf00801c7] 2637 %define m12 [base+pw_256] 2638 %define m13 [esp+calloff+16*4] 2639 %define m14 [esp+calloff+16*5] 2640 %define m15 [esp+calloff+16*6] 2641 %define m6 [esp+calloff+16*7] 2642 %define base r6-$$ 2643 %assign calloff 0 2644 %if STACK_ALIGNMENT < 16 2645 mov dst_strideq, [rstk+stack_offset+ 8] 2646 mov leftq, [rstk+stack_offset+12] 2647 mov lpfq, [rstk+stack_offset+16] 2648 mov lpf_strideq, [rstk+stack_offset+20] 2649 mov wd, [rstk+stack_offset+24] 2650 mov dstm, dstq 2651 mov dst_stridemp, dst_strideq 2652 mov leftm, leftq 2653 mov r1, [rstk+stack_offset+28] 2654 mov r2, [rstk+stack_offset+36] 2655 mov lpfm, lpfq 2656 mov lpf_stridem, lpf_strideq 2657 mov hd, r1 2658 mov edged, r2 2659 %endif 2660%else 2661cglobal sgr_filter_mix_16bpc, 5, 15, 16, -400*66-40, dst, dst_stride, left, \ 2662 lpf, lpf_stride, w, edge, \ 2663 params, h 2664%endif 2665%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 2666 movifnidn wd, wm 2667%endif 2668%if ARCH_X86_64 2669 mov paramsq, paramsmp 2670 lea r13, [sgr_x_by_x-0xf03] 2671 mov edged, r8m 2672 add wd, wd 2673 mov hd, r6m 2674 mova m15, [paramsq] 2675 add lpfq, wq 2676 mova m9, [pd_8] 2677 lea t1, [rsp+wq+44] 2678 mova m10, [pd_34816] 2679 add dstq, wq 2680 mova m12, [pw_256] 2681 lea t3, [rsp+wq*2+400*24+40] 2682 mova m11, [pd_0xf00801c7] 2683 lea t4, [rsp+wq+400*52+40] 2684 neg wq 2685 pshuflw m13, m15, q0000 2686 pshuflw m14, m15, q2222 2687 pshufhw m15, m15, q1010 2688 punpcklqdq m13, m13 ; s0 2689 punpcklqdq m14, m14 ; s1 2690 punpckhqdq m15, m15 ; w0 w1 2691 pxor m6, m6 2692 psllw m15, 2 2693 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w 2694 %define lpfm [rsp] 2695%else 2696 mov r1, [rstk+stack_offset+32] ; params 2697 LEA r6, $$ 2698 add wd, wd 2699 mova m2, [r1] 2700 add lpfm, wq 2701 lea t1, [rsp+extra_stack+wq+52] 2702 add dstq, wq 2703 lea t3, [rsp+extra_stack+wq*2+400*24+48] 2704 mov dstm, dstq 2705 lea t4, [rsp+extra_stack+wq+400*52+48] 2706 mov t3m, t3 2707 mov t4m, t4 2708 neg wq 2709 pshuflw m0, m2, q0000 2710 pshuflw m1, m2, q2222 2711 pshufhw m2, m2, q1010 2712 punpcklqdq m0, m0 ; s0 2713 punpcklqdq m1, m1 ; s1 2714 punpckhqdq m2, m2 ; w0 w1 2715 mov w1m, wd 2716 pxor m3, m3 2717 psllw m2, 2 2718 mova m13, m0 2719 mova m14, m1 2720 sub wd, 4 2721 mova m15, m2 2722 mova m6, m3 2723 mov lpfq, lpfm 2724 mov lpf_strideq, lpf_stridem 2725 mov w0m, wd 2726%endif 2727 test edgeb, 4 ; LR_HAVE_TOP 2728 jz .no_top 2729 call .h_top 2730 add lpfq, lpf_strideq 2731 mov t2, t1 2732%if ARCH_X86_64 2733 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup 2734%else 2735 mov wq, w0m 2736 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop 2737%endif 2738 add t1, 400*12 2739 call .h_top 2740 lea r10, [lpfq+lpf_strideq*4] 2741 mov lpfq, dstq 2742 add r10, lpf_strideq 2743 mov lpfm, r10 ; below 2744 movif32 t4, t4m 2745 call .hv0 2746.main: 2747 dec hd 2748 jz .height1 2749 movif32 lpfq, hvsrcm 2750 add lpfq, dst_stridemp 2751 call .hv1 2752 call .prep_n 2753 sub hd, 2 2754 jl .extend_bottom 2755.main_loop: 2756 movif32 lpfq, hvsrcm 2757 add lpfq, dst_stridemp 2758 call .hv0 2759%if ARCH_X86_64 2760 test hd, hd 2761%else 2762 mov r5, hd 2763 test r5, r5 2764%endif 2765 jz .odd_height 2766 movif32 lpfq, hvsrcm 2767 add lpfq, dst_stridemp 2768 call .hv1 2769 call .n0 2770 call .n1 2771 sub hd, 2 2772 jge .main_loop 2773 test edgeb, 8 ; LR_HAVE_BOTTOM 2774 jz .extend_bottom 2775 mov lpfq, lpfm 2776 call .hv0_bottom 2777%if ARCH_X86_64 2778 add lpfq, lpf_strideq 2779%else 2780 mov lpfq, hvsrcm 2781 add lpfq, lpf_stridem 2782%endif 2783 call .hv1_bottom 2784.end: 2785 call .n0 2786 call .n1 2787.end2: 2788 RET 2789.height1: 2790 call .v1 2791 call .prep_n 2792 jmp .odd_height_end 2793.odd_height: 2794 call .v1 2795 call .n0 2796 call .n1 2797.odd_height_end: 2798 call .v0 2799 call .v1 2800 call .n0 2801 jmp .end2 2802.extend_bottom: 2803 call .v0 2804 call .v1 2805 jmp .end 2806.no_top: 2807 lea r10, [lpfq+lpf_strideq*4] 2808 mov lpfq, dstq 2809 lea r10, [r10+lpf_strideq*2] 2810 mov lpfm, r10 2811 call .h 2812%if ARCH_X86_64 2813 lea wq, [r5-4] 2814%else 2815 mov wq, w0m 2816 mov hvsrcm, lpfq 2817%endif 2818 lea t2, [t1+400*12] 2819.top_fixup_loop: 2820 mova m0, [t1+wq+400* 0] 2821 mova m1, [t1+wq+400* 2] 2822 mova m2, [t1+wq+400* 4] 2823 paddw m0, m0 2824 mova m3, [t1+wq+400* 6] 2825 paddd m1, m1 2826 mova m4, [t1+wq+400* 8] 2827 paddd m2, m2 2828 mova m5, [t1+wq+400*10] 2829 mova [t2+wq+400* 0], m0 2830 mova [t2+wq+400* 2], m1 2831 mova [t2+wq+400* 4], m2 2832 mova [t2+wq+400* 6], m3 2833 mova [t2+wq+400* 8], m4 2834 mova [t2+wq+400*10], m5 2835 add wq, 16 2836 jl .top_fixup_loop 2837 movif32 t3, t3m 2838 movif32 t4, t4m 2839 call .v0 2840 jmp .main 2841.h: ; horizontal boxsum 2842%assign stack_offset stack_offset+4 2843%assign calloff 4 2844%if ARCH_X86_64 2845 lea wq, [r5-4] 2846%else 2847 %define leftq r5 2848%endif 2849 test edgeb, 1 ; LR_HAVE_LEFT 2850 jz .h_extend_left 2851 movif32 leftq, leftm 2852 movddup m5, [leftq] 2853 movif32 wq, w0m 2854 mova m4, [lpfq+wq+4] 2855 add leftmp, 8 2856 palignr m4, m5, 10 2857 jmp .h_main 2858.h_extend_left: 2859 movif32 wq, w0m 2860 mova m4, [lpfq+wq+4] 2861 pshufb m4, [base+sgr_lshuf5] 2862 jmp .h_main 2863.h_top: 2864%if ARCH_X86_64 2865 lea wq, [r5-4] 2866%endif 2867 test edgeb, 1 ; LR_HAVE_LEFT 2868 jz .h_extend_left 2869 movif32 wq, w0m 2870.h_loop: 2871 movu m4, [lpfq+wq- 2] 2872.h_main: 2873 movu m5, [lpfq+wq+14] 2874 test edgeb, 2 ; LR_HAVE_RIGHT 2875 jnz .h_have_right 2876 cmp wd, -20 2877 jl .h_have_right 2878%if ARCH_X86_32 2879 pxor m8, m8 2880%endif 2881 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right 2882.h_have_right: 2883 palignr m3, m5, m4, 2 2884 palignr m0, m5, m4, 4 2885 paddw m1, m3, m0 2886 punpcklwd m2, m3, m0 2887 pmaddwd m2, m2 2888 punpckhwd m3, m0 2889 pmaddwd m3, m3 2890 palignr m0, m5, m4, 6 2891 paddw m1, m0 ; sum3 2892 punpcklwd m7, m0, m6 2893 pmaddwd m7, m7 2894 punpckhwd m0, m6 2895 pmaddwd m0, m0 2896 paddd m2, m7 ; sumsq3 2897 palignr m5, m4, 8 2898 punpcklwd m7, m5, m4 2899 paddw m8, m4, m5 2900 pmaddwd m7, m7 2901 punpckhwd m5, m4 2902 pmaddwd m5, m5 2903 paddd m3, m0 2904 mova [t1+wq+400* 6], m1 2905 mova [t1+wq+400* 8], m2 2906 mova [t1+wq+400*10], m3 2907 paddw m8, m1 ; sum5 2908 paddd m7, m2 ; sumsq5 2909 paddd m5, m3 2910 mova [t1+wq+400* 0], m8 2911 mova [t1+wq+400* 2], m7 2912 mova [t1+wq+400* 4], m5 2913 add wq, 16 2914 jl .h_loop 2915 ret 2916ALIGN function_align 2917.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) 2918%if ARCH_X86_64 2919 lea wq, [r5-4] 2920%else 2921 mov hvsrcm, lpfq 2922%endif 2923 test edgeb, 1 ; LR_HAVE_LEFT 2924 jz .hv0_extend_left 2925 movif32 leftq, leftm 2926 movddup m5, [leftq] 2927 movif32 wq, w0m 2928 mova m4, [lpfq+wq+4] 2929 add leftmp, 8 2930 palignr m4, m5, 10 2931 jmp .hv0_main 2932.hv0_extend_left: 2933 movif32 wq, w0m 2934 mova m4, [lpfq+wq+4] 2935 pshufb m4, [base+sgr_lshuf5] 2936 jmp .hv0_main 2937.hv0_bottom: 2938%if ARCH_X86_64 2939 lea wq, [r5-4] 2940%else 2941 mov hvsrcm, lpfq 2942%endif 2943 test edgeb, 1 ; LR_HAVE_LEFT 2944 jz .hv0_extend_left 2945 movif32 wq, w0m 2946%if ARCH_X86_32 2947 jmp .hv0_loop_start 2948%endif 2949.hv0_loop: 2950 movif32 lpfq, hvsrcm 2951.hv0_loop_start: 2952 movu m4, [lpfq+wq- 2] 2953.hv0_main: 2954 movu m5, [lpfq+wq+14] 2955 test edgeb, 2 ; LR_HAVE_RIGHT 2956 jnz .hv0_have_right 2957 cmp wd, -20 2958 jl .hv0_have_right 2959%if ARCH_X86_32 2960 pxor m8, m8 2961%endif 2962 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right 2963.hv0_have_right: 2964 palignr m3, m5, m4, 2 2965 palignr m0, m5, m4, 4 2966 movif32 t3, t3m 2967 paddw m1, m3, m0 2968 punpcklwd m2, m3, m0 2969 pmaddwd m2, m2 2970 punpckhwd m3, m0 2971 pmaddwd m3, m3 2972 palignr m0, m5, m4, 6 2973 paddw m1, m0 ; h sum3 2974 punpcklwd m7, m0, m6 2975 pmaddwd m7, m7 2976 punpckhwd m0, m6 2977 pmaddwd m0, m0 2978 paddd m2, m7 ; h sumsq3 2979 palignr m5, m4, 8 2980 punpcklwd m7, m5, m4 2981 paddw m8, m4, m5 2982 pmaddwd m7, m7 2983 punpckhwd m5, m4 2984 pmaddwd m5, m5 2985 paddd m3, m0 2986 paddw m8, m1 ; h sum5 2987 paddd m7, m2 ; h sumsq5 2988 paddd m5, m3 2989 mova [t3+wq*2+400*8+ 8], m8 2990 mova [t3+wq*2+400*0+ 8], m7 2991 mova [t3+wq*2+400*0+24], m5 2992 paddw m8, [t1+wq+400* 0] 2993 paddd m7, [t1+wq+400* 2] 2994 paddd m5, [t1+wq+400* 4] 2995 mova [t1+wq+400* 0], m8 2996 mova [t1+wq+400* 2], m7 2997 mova [t1+wq+400* 4], m5 2998 paddw m0, m1, [t1+wq+400* 6] 2999 paddd m4, m2, [t1+wq+400* 8] 3000 paddd m5, m3, [t1+wq+400*10] 3001 mova [t1+wq+400* 6], m1 3002 mova [t1+wq+400* 8], m2 3003 mova [t1+wq+400*10], m3 3004 paddw m1, m0, [t2+wq+400* 6] 3005 paddd m2, m4, [t2+wq+400* 8] 3006 paddd m3, m5, [t2+wq+400*10] 3007 mova [t2+wq+400* 6], m0 3008 mova [t2+wq+400* 8], m4 3009 mova [t2+wq+400*10], m5 3010 paddd m2, m9 3011 paddd m3, m9 3012 psrld m2, 4 ; (a3 + 8) >> 4 3013 psrld m3, 4 3014%if ARCH_X86_32 3015 pxor m7, m7 3016%else 3017 SWAP m7, m6 3018%endif 3019 pslld m4, m2, 3 3020 pslld m5, m3, 3 3021 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 3022 paddd m5, m3 3023 psrlw m3, m1, 1 3024 pavgw m3, m7 ; (b3 + 2) >> 2 3025 punpcklwd m2, m3, m7 3026 pmaddwd m2, m2 3027 punpckhwd m3, m7 3028 pmaddwd m3, m3 3029 punpcklwd m0, m1, m7 ; b3 3030 punpckhwd m1, m7 3031%if ARCH_X86_64 3032 SWAP m7, m6 3033%endif 3034 MAXSD m4, m2, m7 3035 MAXSD m5, m3, m7 3036 psubd m4, m2 ; p3 3037 psubd m5, m3 3038 MULLD m4, m14, m7 ; p3 * s1 3039 MULLD m5, m14, m7 3040 pmaddwd m0, m11 ; b3 * 455 3041 pmaddwd m1, m11 3042 paddusw m4, m11 3043 paddusw m5, m11 3044 psrld m3, m4, 20 ; min(z3, 255) 3045 psrld m4, m5, 20 3046 GATHER_X_BY_X m2, m3, m4, r0, dstm 3047 punpcklwd m3, m2, m2 3048 punpckhwd m4, m2, m2 3049 MULLD m0, m3, m7 3050 MULLD m1, m4, m7 3051 psubw m5, m12, m2 3052 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3053 paddd m1, m10 3054 mova [t4+wq*1+400*2+ 4], m5 3055 psrld m0, 12 3056 psrld m1, 12 3057 mova [t3+wq*2+400*4+ 8], m0 3058 mova [t3+wq*2+400*4+24], m1 3059 add wq, 16 3060 jl .hv0_loop 3061 ret 3062ALIGN function_align 3063.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 3064%if ARCH_X86_64 3065 lea wq, [r5-4] 3066%else 3067 mov hvsrcm, lpfq 3068%endif 3069 test edgeb, 1 ; LR_HAVE_LEFT 3070 jz .hv1_extend_left 3071 movif32 leftq, leftm 3072 movddup m5, [leftq] 3073 movif32 wq, w0m 3074 mova m4, [lpfq+wq+4] 3075 add leftmp, 8 3076 palignr m4, m5, 10 3077 jmp .hv1_main 3078.hv1_extend_left: 3079 movif32 wq, w0m 3080 mova m4, [lpfq+wq+4] 3081 pshufb m4, [base+sgr_lshuf5] 3082 jmp .hv1_main 3083.hv1_bottom: 3084%if ARCH_X86_64 3085 lea wq, [r5-4] 3086%else 3087 mov hvsrcm, lpfq 3088%endif 3089 test edgeb, 1 ; LR_HAVE_LEFT 3090 jz .hv1_extend_left 3091 movif32 wq, w0m 3092%if ARCH_X86_32 3093 jmp .hv1_loop_start 3094%endif 3095.hv1_loop: 3096 movif32 lpfq, hvsrcm 3097.hv1_loop_start: 3098 movu m4, [lpfq+wq- 2] 3099.hv1_main: 3100 movu m5, [lpfq+wq+14] 3101 test edgeb, 2 ; LR_HAVE_RIGHT 3102 jnz .hv1_have_right 3103 cmp wd, -20 3104 jl .hv1_have_right 3105%if ARCH_X86_32 3106 pxor m8, m8 3107%endif 3108 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right 3109.hv1_have_right: 3110 palignr m7, m5, m4, 2 3111 palignr m3, m5, m4, 4 3112 paddw m2, m7, m3 3113 punpcklwd m0, m7, m3 3114 pmaddwd m0, m0 3115 punpckhwd m7, m3 3116 pmaddwd m7, m7 3117 palignr m3, m5, m4, 6 3118 paddw m2, m3 ; h sum3 3119 punpcklwd m1, m3, m6 3120 pmaddwd m1, m1 3121 punpckhwd m3, m6 3122 pmaddwd m3, m3 3123 paddd m0, m1 ; h sumsq3 3124 palignr m5, m4, 8 3125 punpckhwd m1, m4, m5 3126 paddw m8, m4, m5 3127 pmaddwd m1, m1 3128 punpcklwd m4, m5 3129 pmaddwd m4, m4 3130 paddd m7, m3 3131 paddw m5, m2, [t2+wq+400* 6] 3132 mova [t2+wq+400* 6], m2 3133 paddw m8, m2 ; h sum5 3134 paddd m2, m0, [t2+wq+400* 8] 3135 paddd m3, m7, [t2+wq+400*10] 3136 mova [t2+wq+400* 8], m0 3137 mova [t2+wq+400*10], m7 3138 paddd m4, m0 ; h sumsq5 3139 paddd m1, m7 3140 paddd m2, m9 3141 paddd m3, m9 3142 psrld m2, 4 ; (a3 + 8) >> 4 3143 psrld m3, 4 3144 pslld m0, m2, 3 3145 pslld m7, m3, 3 3146 paddd m2, m0 ; ((a3 + 8) >> 4) * 9 3147 paddd m3, m7 3148 psrlw m7, m5, 1 3149 pavgw m7, m6 ; (b3 + 2) >> 2 3150 punpcklwd m0, m7, m6 3151 pmaddwd m0, m0 3152 punpckhwd m7, m6 3153 pmaddwd m7, m7 3154%if ARCH_X86_32 3155 mova [esp+20], m8 3156%else 3157 SWAP m8, m6 3158%endif 3159 MAXSD m2, m0, m8 3160 MAXSD m3, m7, m8 3161 pxor m8, m8 3162 psubd m2, m0 ; p3 3163 psubd m3, m7 3164 punpcklwd m0, m5, m8 ; b3 3165 punpckhwd m5, m8 3166 MULLD m2, m14, m8 ; p3 * s1 3167 MULLD m3, m14, m8 3168 pmaddwd m0, m11 ; b3 * 455 3169 pmaddwd m5, m11 3170 paddusw m2, m11 3171 paddusw m3, m11 3172 psrld m8, m2, 20 ; min(z3, 255) 3173 movif32 t3, t3m 3174 psrld m2, m3, 20 3175 GATHER_X_BY_X m7, m8, m2, r0, dstm 3176 punpcklwd m2, m7, m7 3177 punpckhwd m8, m7, m7 3178 MULLD m0, m2, m3 3179 MULLD m5, m8, m3 3180 psubw m3, m12, m7 3181%if ARCH_X86_32 3182 mova m8, [esp+20] 3183%endif 3184 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3185 paddd m5, m10 3186 psrld m0, 12 3187 psrld m5, 12 3188 mova [t4+wq*1+400*4+4], m3 3189 mova [t3+wq*2+400*8+ 8], m0 3190 mova [t3+wq*2+400*8+24], m5 3191%if ARCH_X86_64 3192 SWAP m6, m8 3193 pxor m6, m6 3194%endif 3195 paddw m5, m8, [t2+wq+400*0] 3196 paddd m2, m4, [t2+wq+400*2] 3197 paddd m3, m1, [t2+wq+400*4] 3198 paddw m5, [t1+wq+400*0] 3199 paddd m2, [t1+wq+400*2] 3200 paddd m3, [t1+wq+400*4] 3201 mova [t2+wq+400*0], m8 3202 mova [t2+wq+400*2], m4 3203 mova [t2+wq+400*4], m1 3204 mova m4, [base+pw_25] 3205 paddd m2, m9 3206 paddd m3, m9 3207 psrld m2, 4 ; (a5 + 8) >> 4 3208 psrld m3, 4 3209 MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 3210 MULLD m3, m4, m7 3211%if ARCH_X86_32 3212 pxor m7, m7 3213%else 3214 SWAP m7, m6 3215%endif 3216 psrlw m1, m5, 1 3217 pavgw m1, m7 ; (b5 + 2) >> 2 3218 punpcklwd m4, m1, m7 3219 pmaddwd m4, m4 3220 punpckhwd m1, m7 3221 pmaddwd m1, m1 3222 punpcklwd m0, m5, m7 ; b5 3223 punpckhwd m5, m7 3224%if ARCH_X86_64 3225 SWAP m7, m6 3226%endif 3227 MAXSD m2, m4, m7 3228 psubd m2, m4 ; p5 3229 mova m4, [base+pd_0xf00800a4] 3230 MAXSD m3, m1, m7 3231 psubd m3, m1 3232 MULLD m2, m13, m7 ; p5 * s0 3233 MULLD m3, m13, m7 3234 pmaddwd m0, m4 ; b5 * 164 3235 pmaddwd m5, m4 3236 paddusw m2, m4 3237 paddusw m3, m4 3238 psrld m1, m2, 20 ; min(z5, 255) 3239 psrld m2, m3, 20 3240 GATHER_X_BY_X m4, m1, m2, r0, dstm 3241 punpcklwd m2, m4, m4 3242 punpckhwd m3, m4, m4 3243 MULLD m0, m2, m7 3244 MULLD m5, m3, m7 3245 psubw m1, m12, m4 3246 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 3247 paddd m5, m10 3248 mova [t4+wq*1+400*0+ 4], m1 3249 psrld m0, 12 3250 psrld m5, 12 3251 mova [t3+wq*2+400*0+ 8], m0 3252 mova [t3+wq*2+400*0+24], m5 3253 add wq, 16 3254 jl .hv1_loop 3255 mov r10, t2 3256 mov t2, t1 3257 mov t1, r10 3258 ret 3259.v0: ; vertical boxsums + ab3 (even rows) 3260%if ARCH_X86_64 3261 lea wq, [r5-4] 3262%else 3263 mov wd, w0m 3264%endif 3265.v0_loop: 3266 mova m0, [t1+wq+400* 6] 3267 mova m4, [t1+wq+400* 8] 3268 mova m5, [t1+wq+400*10] 3269 paddw m0, m0 3270 paddd m4, m4 3271 paddd m5, m5 3272 paddw m1, m0, [t2+wq+400* 6] 3273 paddd m2, m4, [t2+wq+400* 8] 3274 paddd m3, m5, [t2+wq+400*10] 3275 mova [t2+wq+400* 6], m0 3276 mova [t2+wq+400* 8], m4 3277 mova [t2+wq+400*10], m5 3278 paddd m2, m9 3279 paddd m3, m9 3280 psrld m2, 4 ; (a3 + 8) >> 4 3281 psrld m3, 4 3282%if ARCH_X86_32 3283 pxor m7, m7 3284%else 3285 SWAP m7, m6 3286%endif 3287 pslld m4, m2, 3 3288 pslld m5, m3, 3 3289 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 3290 paddd m5, m3 3291 psrlw m3, m1, 1 3292 pavgw m3, m7 ; (b3 + 2) >> 2 3293 punpcklwd m2, m3, m7 3294 pmaddwd m2, m2 3295 punpckhwd m3, m7 3296 pmaddwd m3, m3 3297 punpcklwd m0, m1, m7 ; b3 3298 punpckhwd m1, m7 3299%if ARCH_X86_64 3300 SWAP m7, m6 3301%endif 3302 MAXSD m4, m2, m7 3303 MAXSD m5, m3, m7 3304 psubd m4, m2 ; p3 3305 psubd m5, m3 3306 MULLD m4, m14, m7 ; p3 * s1 3307 MULLD m5, m14, m7 3308 pmaddwd m0, m11 ; b3 * 455 3309 pmaddwd m1, m11 3310 paddusw m4, m11 3311 paddusw m5, m11 3312 psrld m3, m4, 20 ; min(z3, 255) 3313 psrld m4, m5, 20 3314 GATHER_X_BY_X m2, m3, m4, r0, dstm 3315 punpcklwd m3, m2, m2 3316 punpckhwd m4, m2, m2 3317 MULLD m0, m3, m7 3318 MULLD m1, m4, m7 3319 psubw m5, m12, m2 3320 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3321 paddd m1, m10 3322 mova [t4+wq*1+400*2+4], m5 3323 psrld m0, 12 3324 psrld m1, 12 3325 mova m3, [t1+wq+400*0] 3326 mova m4, [t1+wq+400*2] 3327 mova m5, [t1+wq+400*4] 3328 mova [t3+wq*2+400*8+ 8], m3 3329 mova [t3+wq*2+400*0+ 8], m4 3330 mova [t3+wq*2+400*0+24], m5 3331 paddw m3, m3 ; cc5 3332 paddd m4, m4 3333 paddd m5, m5 3334 mova [t1+wq+400*0], m3 3335 mova [t1+wq+400*2], m4 3336 mova [t1+wq+400*4], m5 3337 mova [t3+wq*2+400*4+ 8], m0 3338 mova [t3+wq*2+400*4+24], m1 3339 add wq, 16 3340 jl .v0_loop 3341 ret 3342.v1: ; vertical boxsums + ab (odd rows) 3343%if ARCH_X86_64 3344 lea wq, [r5-4] 3345%else 3346 mov wd, w0m 3347%endif 3348.v1_loop: 3349 mova m4, [t1+wq+400* 6] 3350 mova m5, [t1+wq+400* 8] 3351 mova m7, [t1+wq+400*10] 3352 paddw m1, m4, [t2+wq+400* 6] 3353 paddd m2, m5, [t2+wq+400* 8] 3354 paddd m3, m7, [t2+wq+400*10] 3355 mova [t2+wq+400* 6], m4 3356 mova [t2+wq+400* 8], m5 3357 mova [t2+wq+400*10], m7 3358 paddd m2, m9 3359 paddd m3, m9 3360 psrld m2, 4 ; (a3 + 8) >> 4 3361 psrld m3, 4 3362%if ARCH_X86_32 3363 pxor m7, m7 3364%else 3365 SWAP m7, m6 3366%endif 3367 pslld m4, m2, 3 3368 pslld m5, m3, 3 3369 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 3370 paddd m5, m3 3371 psrlw m3, m1, 1 3372 pavgw m3, m7 ; (b3 + 2) >> 2 3373 punpcklwd m2, m3, m7 3374 pmaddwd m2, m2 3375 punpckhwd m3, m7 3376 pmaddwd m3, m3 3377 punpcklwd m0, m1, m7 ; b3 3378 punpckhwd m1, m7 3379%if ARCH_X86_64 3380 SWAP m7, m6 3381%endif 3382 MAXSD m4, m2, m7 3383 MAXSD m5, m3, m7 3384 psubd m4, m2 ; p3 3385 psubd m5, m3 3386 MULLD m4, m14, m7 ; p3 * s1 3387 MULLD m5, m14, m7 3388 pmaddwd m0, m11 ; b3 * 455 3389 pmaddwd m1, m11 3390 paddusw m4, m11 3391 paddusw m5, m11 3392 psrld m3, m4, 20 ; min(z3, 255) 3393 psrld m4, m5, 20 3394 GATHER_X_BY_X m2, m3, m4, r0, dstm 3395 punpcklwd m3, m2, m2 3396 punpckhwd m4, m2, m2 3397 MULLD m0, m3, m7 3398 MULLD m1, m4, m7 3399 psubw m5, m12, m2 3400 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 3401 paddd m1, m10 3402 mova [t4+wq*1+400*4+4], m5 3403 psrld m0, 12 3404 psrld m8, m1, 12 3405 mova m4, [t3+wq*2+400*8+ 8] 3406 mova m5, [t3+wq*2+400*0+ 8] 3407 mova m7, [t3+wq*2+400*0+24] 3408 paddw m1, m4, [t2+wq+400*0] 3409 paddd m2, m5, [t2+wq+400*2] 3410 paddd m3, m7, [t2+wq+400*4] 3411 paddw m1, [t1+wq+400*0] 3412 paddd m2, [t1+wq+400*2] 3413 paddd m3, [t1+wq+400*4] 3414 mova [t2+wq+400*0], m4 3415 mova [t2+wq+400*2], m5 3416 mova [t2+wq+400*4], m7 3417 mova m4, [base+pw_25] 3418 mova [t3+wq*2+400*8+ 8], m0 3419 mova [t3+wq*2+400*8+24], m8 3420 paddd m2, m9 3421 paddd m3, m9 3422 psrld m2, 4 ; (a5 + 8) >> 4 3423 psrld m3, 4 3424 MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 3425 MULLD m3, m4, m7 3426%if ARCH_X86_32 3427 pxor m7, m7 3428%else 3429 SWAP m7, m6 3430%endif 3431 psrlw m5, m1, 1 3432 pavgw m5, m7 ; (b5 + 2) >> 2 3433 punpcklwd m4, m5, m7 3434 pmaddwd m4, m4 3435 punpckhwd m5, m7 3436 pmaddwd m5, m5 3437 punpcklwd m0, m1, m7 ; b5 3438 punpckhwd m1, m7 3439%if ARCH_X86_64 3440 SWAP m7, m6 3441%endif 3442 MAXSD m2, m4, m7 3443 psubd m2, m4 ; p5 3444 mova m4, [base+pd_0xf00800a4] 3445 MAXSD m3, m5, m7 3446 psubd m3, m5 3447 MULLD m2, m13, m7 ; p5 * s0 3448 MULLD m3, m13, m7 3449 pmaddwd m0, m4 ; b5 * 164 3450 pmaddwd m1, m4 3451 paddusw m2, m4 3452 paddusw m3, m4 3453 psrld m5, m2, 20 ; min(z5, 255) 3454 psrld m2, m3, 20 3455 GATHER_X_BY_X m4, m5, m2, r0, dstm 3456 punpcklwd m2, m4, m4 3457 punpckhwd m3, m4, m4 3458 psubw m5, m12, m4 3459 MULLD m0, m2, m7 3460 MULLD m1, m3, m7 3461 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 3462 paddd m1, m10 3463 mova [t4+wq*1+400*0+ 4], m5 3464 psrld m0, 12 3465 psrld m1, 12 3466 mova [t3+wq*2+400*0+ 8], m0 3467 mova [t3+wq*2+400*0+24], m1 3468 add wq, 16 3469 jl .v1_loop 3470 mov r10, t2 3471 mov t2, t1 3472 mov t1, r10 3473 ret 3474.prep_n: ; initial neighbor setup 3475 movif64 wq, r5 3476 movif32 wd, w1m 3477.prep_n_loop: 3478 movu m0, [t4+wq*1+400*0+ 2] 3479 movu m1, [t3+wq*2+400*0+ 4] 3480 movu m2, [t3+wq*2+400*0+20] 3481 movu m7, [t4+wq*1+400*0+ 4] 3482 movu m8, [t3+wq*2+400*0+ 8] 3483 paddw m3, m0, [t4+wq*1+400*0+ 0] 3484 paddd m4, m1, [t3+wq*2+400*0+ 0] 3485 paddd m5, m2, [t3+wq*2+400*0+16] 3486 paddw m3, m7 3487 paddd m4, m8 3488 movu m7, [t3+wq*2+400*0+24] 3489 paddw m0, m3 3490 paddd m1, m4 3491 psllw m3, 2 3492 pslld m4, 2 3493 paddd m5, m7 3494 paddd m2, m5 3495 pslld m5, 2 3496 paddw m0, m3 ; a5 565 3497 paddd m1, m4 ; b5 565 3498 paddd m2, m5 3499 mova [t4+wq*1+400* 6+ 0], m0 3500 mova [t3+wq*2+400*12+ 0], m1 3501 mova [t3+wq*2+400*12+16], m2 3502 movu m0, [t4+wq*1+400*2+ 4] 3503 movu m3, [t4+wq*1+400*2+ 2] 3504 paddw m0, [t4+wq*1+400*2+ 0] 3505 movu m1, [t3+wq*2+400*4+ 8] 3506 movu m4, [t3+wq*2+400*4+ 4] 3507 paddd m1, [t3+wq*2+400*4+ 0] 3508 movu m2, [t3+wq*2+400*4+24] 3509 movu m5, [t3+wq*2+400*4+20] 3510 paddd m2, [t3+wq*2+400*4+16] 3511 paddw m3, m0 3512 paddd m4, m1 3513 paddd m5, m2 3514 psllw m3, 2 ; a3[-1] 444 3515 pslld m4, 2 ; b3[-1] 444 3516 pslld m5, 2 3517 psubw m3, m0 ; a3[-1] 343 3518 psubd m4, m1 ; b3[-1] 343 3519 psubd m5, m2 3520 mova [t4+wq*1+400* 8+ 0], m3 3521 mova [t3+wq*2+400*16+ 0], m4 3522 mova [t3+wq*2+400*16+16], m5 3523 movu m0, [t4+wq*1+400*4+ 4] 3524 movu m3, [t4+wq*1+400*4+ 2] 3525 paddw m0, [t4+wq*1+400*4+ 0] 3526 movu m1, [t3+wq*2+400*8+ 8] 3527 movu m4, [t3+wq*2+400*8+ 4] 3528 paddd m1, [t3+wq*2+400*8+ 0] 3529 movu m2, [t3+wq*2+400*8+24] 3530 movu m5, [t3+wq*2+400*8+20] 3531 paddd m2, [t3+wq*2+400*8+16] 3532 paddw m3, m0 3533 paddd m4, m1 3534 paddd m5, m2 3535 psllw m3, 2 ; a3[ 0] 444 3536 pslld m4, 2 ; b3[ 0] 444 3537 pslld m5, 2 3538 mova [t4+wq*1+400*10+ 0], m3 3539 mova [t3+wq*2+400*20+ 0], m4 3540 mova [t3+wq*2+400*20+16], m5 3541 psubw m3, m0 ; a3[ 0] 343 3542 psubd m4, m1 ; b3[ 0] 343 3543 psubd m5, m2 3544 mova [t4+wq*1+400*12+ 0], m3 3545 mova [t3+wq*2+400*24+ 0], m4 3546 mova [t3+wq*2+400*24+16], m5 3547 add wq, 16 3548 jl .prep_n_loop 3549 ret 3550ALIGN function_align 3551.n0: ; neighbor + output (even rows) 3552 movif64 wq, r5 3553 movif32 wd, w1m 3554.n0_loop: 3555 movu m0, [t4+wq*1+ 4] 3556 movu m2, [t4+wq*1+ 2] 3557 paddw m0, [t4+wq*1+ 0] 3558 paddw m0, m2 3559 paddw m2, m0 3560 psllw m0, 2 3561 paddw m0, m2 ; a5 3562 movu m4, [t3+wq*2+ 8] 3563 movu m5, [t3+wq*2+24] 3564 movu m1, [t3+wq*2+ 4] 3565 movu m3, [t3+wq*2+20] 3566 paddd m4, [t3+wq*2+ 0] 3567 paddd m5, [t3+wq*2+16] 3568 paddd m4, m1 3569 paddd m5, m3 3570 paddd m1, m4 3571 paddd m3, m5 3572 pslld m4, 2 3573 pslld m5, 2 3574 paddd m4, m1 ; b5 3575 paddd m5, m3 3576 movu m2, [t4+wq*1+400* 6] 3577 paddw m2, m0 3578 mova [t4+wq*1+400* 6], m0 3579 paddd m0, m4, [t3+wq*2+400*12+ 0] 3580 paddd m1, m5, [t3+wq*2+400*12+16] 3581 mova [t3+wq*2+400*12+ 0], m4 3582 mova [t3+wq*2+400*12+16], m5 3583 mova [rsp+16+ARCH_X86_32*4], m1 3584 movu m3, [t4+wq*1+400*2+4] 3585 movu m5, [t4+wq*1+400*2+2] 3586 paddw m3, [t4+wq*1+400*2+0] 3587 paddw m5, m3 3588 psllw m5, 2 ; a3[ 1] 444 3589 psubw m4, m5, m3 ; a3[ 1] 343 3590 movu m3, [t4+wq*1+400* 8] 3591 paddw m3, [t4+wq*1+400*10] 3592 paddw m3, m4 3593 mova [t4+wq*1+400* 8], m4 3594 mova [t4+wq*1+400*10], m5 3595 movu m1, [t3+wq*2+400*4+ 8] 3596 movu m5, [t3+wq*2+400*4+ 4] 3597 movu m7, [t3+wq*2+400*4+24] 3598 movu m8, [t3+wq*2+400*4+20] 3599 paddd m1, [t3+wq*2+400*4+ 0] 3600 paddd m7, [t3+wq*2+400*4+16] 3601 paddd m5, m1 3602 paddd m8, m7 3603 pslld m5, 2 ; b3[ 1] 444 3604 pslld m8, 2 3605 psubd m4, m5, m1 ; b3[ 1] 343 3606%if ARCH_X86_32 3607 mova [esp+52], m8 3608 psubd m8, m7 3609%else 3610 psubd m6, m8, m7 3611 SWAP m8, m6 3612%endif 3613 paddd m1, m4, [t3+wq*2+400*16+ 0] 3614 paddd m7, m8, [t3+wq*2+400*16+16] 3615 paddd m1, [t3+wq*2+400*20+ 0] 3616 paddd m7, [t3+wq*2+400*20+16] 3617 mova [t3+wq*2+400*16+ 0], m4 3618 mova [t3+wq*2+400*16+16], m8 3619 mova [t3+wq*2+400*20+ 0], m5 3620%if ARCH_X86_32 3621 mova m8, [esp+52] 3622%else 3623 SWAP m8, m6 3624 pxor m6, m6 3625%endif 3626 mova [t3+wq*2+400*20+16], m8 3627 mova [rsp+32+ARCH_X86_32*4], m7 3628 movu m4, [dstq+wq] 3629 punpcklwd m7, m2, m6 3630 punpckhwd m2, m6 3631 punpcklwd m8, m3, m6 3632 punpckhwd m3, m6 3633 punpcklwd m5, m4, m6 3634 punpckhwd m4, m6 3635 pmaddwd m7, m5 ; a5 * src 3636 pmaddwd m8, m5 ; a3 * src 3637 pmaddwd m2, m4 3638 pmaddwd m3, m4 3639 pslld m5, 13 3640 pslld m4, 13 3641 psubd m0, m5 3642 psubd m1, m5 3643 paddd m0, m7 ; a5 * src + b5 + (1 << 8) - (src << 13) 3644 paddd m1, m8 ; a3 * src + b3 + (1 << 8) - (src << 13) 3645 mova m7, [base+pd_0xffff] 3646 psrld m0, 9 3647 pslld m1, 7 3648 pand m0, m7 3649 pandn m8, m7, m1 3650 por m0, m8 3651 psubd m1, m4, [rsp+16+ARCH_X86_32*4] 3652 psubd m8, m4, [rsp+32+ARCH_X86_32*4] 3653 psubd m2, m1 3654 psubd m3, m8 3655 mova m1, [base+pd_4096] 3656 psrld m2, 9 3657 pslld m3, 7 3658 pand m2, m7 3659 pandn m7, m3 3660 por m2, m7 3661 pmaddwd m0, m15 3662 pmaddwd m2, m15 3663%if ARCH_X86_32 3664 pxor m7, m7 3665%else 3666 SWAP m7, m6 3667%endif 3668 paddd m5, m1 3669 paddd m4, m1 3670 paddd m0, m5 3671 paddd m2, m4 3672 psrad m0, 8 3673 psrad m2, 8 3674 packssdw m0, m2 ; clip 3675 pmaxsw m0, m7 3676 psrlw m0, 5 3677 mova [dstq+wq], m0 3678 add wq, 16 3679 jl .n0_loop 3680 add dstq, dst_stridemp 3681 ret 3682%if ARCH_X86_64 3683 SWAP m6, m7 3684%endif 3685ALIGN function_align 3686.n1: ; neighbor + output (odd rows) 3687 movif64 wq, r5 3688 movif32 wd, w1m 3689.n1_loop: 3690 movu m3, [t4+wq*1+400*4+4] 3691 movu m5, [t4+wq*1+400*4+2] 3692 paddw m3, [t4+wq*1+400*4+0] 3693 paddw m5, m3 3694 psllw m5, 2 ; a3[ 1] 444 3695 psubw m4, m5, m3 ; a3[ 1] 343 3696 paddw m3, m4, [t4+wq*1+400*12] 3697 paddw m3, [t4+wq*1+400*10] 3698 mova [t4+wq*1+400*10], m5 3699 mova [t4+wq*1+400*12], m4 3700 movu m1, [t3+wq*2+400*8+ 8] 3701 movu m5, [t3+wq*2+400*8+ 4] 3702 movu m7, [t3+wq*2+400*8+24] 3703 movu m8, [t3+wq*2+400*8+20] 3704 paddd m1, [t3+wq*2+400*8+ 0] 3705 paddd m7, [t3+wq*2+400*8+16] 3706 paddd m5, m1 3707 paddd m8, m7 3708 pslld m5, 2 ; b3[ 1] 444 3709 pslld m8, 2 3710 psubd m4, m5, m1 ; b3[ 1] 343 3711 psubd m0, m8, m7 3712 paddd m1, m4, [t3+wq*2+400*24+ 0] 3713 paddd m7, m0, [t3+wq*2+400*24+16] 3714 paddd m1, [t3+wq*2+400*20+ 0] 3715 paddd m7, [t3+wq*2+400*20+16] 3716 mova [t3+wq*2+400*20+ 0], m5 3717 mova [t3+wq*2+400*20+16], m8 3718 mova [t3+wq*2+400*24+ 0], m4 3719 mova [t3+wq*2+400*24+16], m0 3720 mova m5, [dstq+wq] 3721 mova m8, [t4+wq*1+400* 6] 3722 punpcklwd m4, m5, m6 3723 punpckhwd m5, m6 3724 punpcklwd m0, m8, m6 3725 punpckhwd m8, m6 3726 punpcklwd m2, m3, m6 3727 punpckhwd m3, m6 3728 pmaddwd m0, m4 ; a5 * src 3729 pmaddwd m2, m4 ; a3 * src 3730 pmaddwd m8, m5 3731 pmaddwd m3, m5 3732 paddd m1, m2 ; a3 * src + b3 + (1 << 8) - (src << 13) 3733 pslld m4, 12 3734 pslld m5, 12 3735 psubd m2, m4, [t3+wq*2+400*12+ 0] 3736 psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) 3737 psubd m2, m5, [t3+wq*2+400*12+16] 3738 psubd m8, m2 3739 paddd m4, m4 3740 paddd m5, m5 3741 paddd m7, m3 3742 mova m2, [base+pd_0xffff] 3743 psubd m1, m4 3744 psubd m7, m5 3745 psrld m0, 8 3746 psrld m8, 8 3747 pslld m1, 7 3748 pslld m7, 7 3749 pand m0, m2 3750 pand m8, m2 3751 pandn m3, m2, m1 3752 pandn m2, m7 3753 por m0, m3 3754 por m8, m2 3755 mova m1, [base+pd_4096] 3756 pmaddwd m0, m15 3757 pmaddwd m8, m15 3758%if ARCH_X86_64 3759 pxor m6, m6 3760 SWAP m7, m6 3761%else 3762 pxor m7, m7 3763%endif 3764 paddd m4, m1 3765 paddd m5, m1 3766 paddd m0, m4 3767 paddd m8, m5 3768 psrad m0, 8 3769 psrad m8, 8 3770 packssdw m0, m8 ; clip 3771 pmaxsw m0, m7 3772 psrlw m0, 5 3773 mova [dstq+wq], m0 3774 add wq, 16 3775 jl .n1_loop 3776 add dstq, dst_stridemp 3777 movif32 dstm, dstq 3778 ret 3779