; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

sgr_lshuf3:    db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_lshuf5:    db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
wiener_shufA:  db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
wiener_shufB:  db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
wiener_shufC:  db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
wiener_shufE:  db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
pb_0to31:      db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
               db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

pb_m10_m9:     times 2 db -10, -9
pb_m6_m5:      times 2 db -6, -5
pb_m2_m1:      times 2 db -2, -1
pb_2_3:        times 2 db 2, 3
pb_6_7:        times 2 db 6, 7
pw_1023:       times 2 dw 1023
pd_8:          dd 8
pd_25:         dd 25
pd_4096:       dd 4096
pd_34816:      dd 34816
pd_m262128:    dd -262128
pd_0xf00800a4: dd 0xf00800a4
pd_0xf00801c7: dd 0xf00801c7

%define pw_256 sgr_lshuf5

cextern sgr_x_by_x_avx2

SECTION .text

%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro

DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers

INIT_YMM avx2
cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
                                                     lpf_stride, w, edge, flt, h
%define base t4-wiener_hshift
    mov             fltq, fltmp
    mov             edged, r8m
    movifnidn       wd, wm
    mov             hd, r6m
    mov             t3d, r9m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd    m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd
    vpbroadcastd    m13, [fltq+ 4] ; x2 x3
    shr             t3d, 11
    vbroadcasti128  m8, [wiener_shufC]
    add             lpfq, wq
    vbroadcasti128  m9, [wiener_shufD]
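; t3 = pixel_max >> 11 (0 for 10-bit, 1 for 12-bit); it selects the
; per-bitdepth wiener_hshift/wiener_round/wiener_vshift constants below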
    lea             t1, [rsp+wq+16]
    vpbroadcastd    m14, [fltq+16] ; y0 y1
    add             dstq, wq
    vpbroadcastd    m15, [fltq+20] ; y2 y3
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd    m10, [base+wiener_round+t3*4]
    vpbroadcastd    m11, [base+wiener_vshift+t3*4]
    pmullw          m12, m0 ; upshift filter coefs to make the
    pmullw          m13, m0 ; horizontal downshift constant
    test            edgeb, 4 ; LR_HAVE_TOP
    jz              .no_top
    call            .h_top
    add             lpfq, lpf_strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call            .h_top
    lea             r7, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    mov             [rsp+8*1], lpf_strideq
    add             r7, lpf_strideq
    mov             [rsp+8*0], r7 ; below
    call            .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz              .v1
    add             lpfq, dst_strideq
    add             t1, 384*2
    call            .h
    mov             t2, t1
    dec             hd
    jz              .v2
    add             lpfq, dst_strideq
    add             t1, 384*2
    call            .h
    dec             hd
    jz              .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call            .hv
    dec             hd
    jnz             .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz              .v3
    mov             lpfq, [rsp+8*0]
    call            .hv_bottom
    add             lpfq, [rsp+8*1]
    call            .hv_bottom
.v1:
    call            .v
    RET
.no_top:
    lea             r7, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    mov             [rsp+8*1], lpf_strideq
    lea             r7, [r7+lpf_strideq*2]
    mov             [rsp+8*0], r7
    call            .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz              .v1
    add             lpfq, dst_strideq
    add             t1, 384*2
    call            .h
    mov             t2, t1
    dec             hd
    jz              .v2
    add             lpfq, dst_strideq
    add             t1, 384*2
    call            .h
    dec             hd
    jz              .v3
    lea             t0, [t1+384*2]
    call            .hv
    dec             hd
    jz              .v3
    add             t0, 384*8
    call            .hv
    dec             hd
    jnz             .main
.v3:
    call            .v
.v2:
    call            .v
    jmp             .v1
.extend_right:
    movd            xm1, r10d
    vpbroadcastd    m0, [pb_6_7]
    movu            m2, [pb_0to31]
    vpbroadcastb    m1, xm1
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    vpbroadcastd    m0, [pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    vpbroadcastd    m0, [pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    ret
.h:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
    movq            xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add             leftq, 8
    jmp             .h_main
.h_extend_left:
    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    shufpd          m3, m4, 0x05
    pshufb          m3, [wiener_lshuf7]
    jmp             .h_main2
.h_top:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -36
    jl              .h_have_right
    call            .extend_right
.h_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova            [t1+r10], m0
    add             r10, 32
    jl              .h_loop
    ret
ALIGN function_align
.hv:
    add             lpfq, dst_strideq
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
    movq            xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add             leftq, 8
    jmp             .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-8]
    pshufb          m3, [wiener_lshuf7]
    jmp             .hv_main
.hv_bottom:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
    movu            m5, [lpfq+r10+8]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -36
    jl              .hv_have_right
    call            .extend_right
.hv_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128]
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10]
    mova            [t0+r10], m0
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 5
    psrad           m2, 5
    packusdw        m0, m2
    pmulhuw         m0, m11
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .hv_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add             dstq, dst_strideq
    ret
.v:
    mov             r10, wq
.v_loop:
    mova            m1, [t4+r10]
    paddw           m1, [t2+r10]
    mova            m2, [t3+r10]
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add             dstq, dst_strideq
    ret
cglobal wiener_filter5_16bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
                                                   lpf_stride, w, edge, flt, h
%define base t4-wiener_hshift
    mov             fltq, fltmp
    mov             edged, r8m
    movifnidn       wd, wm
    mov             hd, r6m
    mov             t3d, r9m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
    vpbroadcastw    m11, [fltq+ 2] ; x1
    vbroadcasti128  m6, [wiener_shufB]
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufD]
    add             wd, wd
    vpbroadcastd    m12, [fltq+ 4] ; x2 x3
    shr             t3d, 11
    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add             lpfq, wq
    lea             t1, [rsp+wq+16]
    vpbroadcastw    m13, [fltq+18] ; y1
    add             dstq, wq
    vpbroadcastd    m14, [fltq+20] ; y2 y3
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd    m9, [base+wiener_round+t3*4]
    vpbroadcastd    m10, [base+wiener_vshift+t3*4]
    movu            xm15, [wiener_lshuf5]
    pmullw          m11, m0
    vinserti128     m15, [pb_0to31], 1
    pmullw          m12, m0
    test            edgeb, 4 ; LR_HAVE_TOP
    jz              .no_top
    call            .h_top
    add             lpfq, lpf_strideq
    mov             t4, t1
    add             t1, 384*2
    call            .h_top
    lea             r7, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    mov             [rsp+8*1], lpf_strideq
    add             r7, lpf_strideq
    mov             [rsp+8*0], r7 ; below
    call            .h
    mov             t2, t1
    dec             hd
    jz              .v1
    add             lpfq, dst_strideq
    add             t1, 384*2
    call            .h
    dec             hd
    jz              .v2
.main:
    mov             t0, t4
.main_loop:
    call            .hv
    dec             hd
    jnz             .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz              .v2
    mov             lpfq, [rsp+8*0]
    call            .hv_bottom
    add             lpfq, [rsp+8*1]
    call            .hv_bottom
.end:
    RET
.no_top:
    lea             r7, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    mov             [rsp+8*1], lpf_strideq
    lea             r7, [r7+lpf_strideq*2]
    mov             [rsp+8*0], r7
    call            .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz              .v1
    add             lpfq, dst_strideq
    add             t1, 384*2
    call            .h
    dec             hd
    jz              .v2
    lea             t0, [t1+384*2]
    call            .hv
    dec             hd
    jz              .v2
    add             t0, 384*6
    call            .hv
    dec             hd
    jnz             .main
.v2:
    call            .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add             dstq, dst_strideq
.v1:
    call            .v
    jmp             .end
.extend_right:
    movd            xm2, r10d
    vpbroadcastd    m0, [pb_2_3]
    vpbroadcastd    m1, [pb_m6_m5]
    vpbroadcastb    m2, xm2
    psubb           m0, m2
    psubb           m1, m2
    movu            m2, [pb_0to31]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
.h:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
    movd            xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add             leftq, 8
    jmp             .h_main
.h_extend_left:
    vbroadcasti128  m4, [lpfq+r10] ; avoid accessing memory located
    mova            m3, [lpfq+r10] ; before the start of the buffer
    palignr         m3, m4, 12
    pshufb          m3, m15
    jmp             .h_main
.h_top:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-4]
.h_main:
    movu            m4, [lpfq+r10+4]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -34
    jl              .h_have_right
    call            .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova            [t1+r10], m0
    add             r10, 32
    jl              .h_loop
    ret
ALIGN function_align
.hv:
    add             lpfq, dst_strideq
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
    movd            xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add             leftq, 8
    jmp             .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-4]
    pshufb          m3, m15
    jmp             .hv_main
.hv_bottom:
    mov             r10, wq
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-4]
.hv_main:
    movu            m4, [lpfq+r10+4]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -34
    jl              .hv_have_right
    call            .extend_right
.hv_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    mova            m2, [t3+r10]
    paddw           m2, [t1+r10]
    paddd           m1, m3
    mova            m4, [t2+r10]
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
    mova            m4, [t4+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova            [t0+r10], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .hv_loop
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    add             dstq, dst_strideq
    ret
.v:
    mov             r10, wq
.v_loop:
    mova            m0, [t1+r10]
    paddw           m2, m0, [t3+r10]
    mova            m1, [t2+r10]
    mova            m4, [t4+r10]
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .v_loop
    ret

cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \
                                                    lpf_stride, w, edge, params, h
    movifnidn       wd, wm
    mov             paramsq, paramsmp
    lea             r13, [sgr_x_by_x_avx2+256*4]
    mov             edged, r8m
    mov             hd, r6m
    add             wd, wd
    vpbroadcastw    m7, [paramsq+8] ; w0
    add             lpfq, wq
    vpbroadcastd    m8, [pd_8]
    lea             t1, [rsp+wq+20]
    vpbroadcastd    m9, [pd_25]
    add             dstq, wq
    vpbroadcastd    m10, [paramsq+0] ; s0
    lea             t3, [rsp+wq*2+400*12+16]
    vpbroadcastd    m11, [pd_0xf00800a4]
    lea             t4, [rsp+wq+400*20+16]
    vpbroadcastd    m12, [pw_256]
    neg             wq
    vpbroadcastd    m13, [pd_34816] ; (1 << 11) + (1 << 15)
    pxor            m6, m6
    vpbroadcastd    m14, [pw_1023]
    psllw           m7, 4
    mova            xm15, [sgr_lshuf5]
    test            edgeb, 4 ; LR_HAVE_TOP
    jz              .no_top
    call            .h_top
    add             lpfq, lpf_strideq
    mov             t2, t1
    call            .top_fixup
    add             t1, 400*6
    call            .h_top
    lea             r10, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    mov             [rsp+8*1], lpf_strideq
    add             r10, lpf_strideq
    mov             [rsp+8*0], r10 ; below
    mov             t0, t2
    dec             hd
    jz              .height1
    or              edged, 16
    call            .h
.main:
    add             lpfq, dst_strideq
    call            .hv
    call            .prep_n
    sub             hd, 2
    jl              .extend_bottom
.main_loop:
    add             lpfq, dst_strideq
    test            hd, hd
    jz              .odd_height
    call            .h
    add             lpfq, dst_strideq
    call            .hv
    call            .n0
    call            .n1
    sub             hd, 2
    jge             .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz              .extend_bottom
    mov             lpfq, [rsp+8*0]
    call            .h_top
    add             lpfq, [rsp+8*1]
    call            .hv_bottom
.end:
    call            .n0
    call            .n1
.end2:
    RET
.height1:
    call            .hv
    call            .prep_n
    jmp             .odd_height_end
.odd_height:
    call            .hv
    call            .n0
    call            .n1
.odd_height_end:
    call            .v
    call            .n0
    jmp             .end2
.extend_bottom:
    call            .v
    jmp             .end
.no_top:
    lea             r10, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    mov             [rsp+8*1], lpf_strideq
    lea             r10, [r10+lpf_strideq*2]
    mov             [rsp+8*0], r10
    call            .h
    lea             t2, [t1+400*6]
    call            .top_fixup
    dec             hd
    jz              .no_top_height1
    or              edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp             .main
.no_top_height1:
    call            .v
    call            .prep_n
    jmp             .odd_height_end
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 0]
    movu            m2, [r13+r10+16]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp             .h_main
.h_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, xm15
    vinserti128     m4, [lpfq+wq+10], 1
    jmp             .h_main
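; .h_top handles rows read from the lpf buffer; with LR_HAVE_LEFT set, the
; pixels to the left are presumably read directly from that buffer in
; .h_loop below rather than blended in from the left[] array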
.h_top:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -36
    jl              .h_have_right
    call            .extend_right
.h_have_right:
    palignr         m2, m5, m4, 2
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121
    paddw           m0, m4 ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test            edgeb, 16 ; y > 0
    jz              .h_loop_end
    paddw           m0, [t1+r10+400*0]
    paddd           m1, [t1+r10+400*2]
    paddd           m2, [t1+r10+400*4]
.h_loop_end:
    paddd           m1, m5 ; sumsq
    paddd           m2, m4
    mova            [t1+r10+400*0], m0
    mova            [t1+r10+400*2], m1
    mova            [t1+r10+400*4], m2
    add             r10, 32
    jl              .h_loop
    ret
.top_fixup:
    lea             r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m1
    mova            [t2+r10+400*4], m2
    add             r10, 32
    jl              .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp             .hv_main
.hv_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, xm15
    vinserti128     m4, [lpfq+wq+10], 1
    jmp             .hv_main
.hv_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10- 2]
.hv_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -36
    jl              .hv_have_right
    call            .extend_right
.hv_have_right:
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4 ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5 ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    test            hd, hd
    jz              .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+400*0] ; hv sum
    paddd           m4, [t2+r10+400*2] ; hv sumsq
    paddd           m5, [t2+r10+400*4]
    mova            [t0+r10+400*0], m0
    mova            [t0+r10+400*2], m2
    mova            [t0+r10+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6 ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4 ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9 ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2 ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2 ; p
    psubd           m5, m3
    pmulld          m4, m10 ; p * s
    pmulld          m5, m10
    pmaddwd         m0, m11 ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
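; the paddusw above adds 0xf008 to the high word of each dword with unsigned
; saturation: 0xf008 << 16 contributes the 1 << 19 rounding bias plus a -256
; offset, so the arithmetic shift below yields min(z, 255) - 256 directly,
; which indexes the sgr_x_by_x table via r13 (biased by 256*4)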
    psrad           m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    psubw           m2, m12, m2 ; a
    paddd           m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova            [t4+r10+4], m2
    psrld           m0, 12 ; b
    psrld           m1, 12
    mova            [t3+r10*2+ 8], xm0
    vextracti128    [t3+r10*2+40], m0, 1
    mova            [t3+r10*2+24], xm1
    vextracti128    [t3+r10*2+56], m1, 1
    add             r10, 32
    jl              .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova            [t1+r10+400*0], m1
    paddw           m1, m0
    mova            [t1+r10+400*2], m4
    paddd           m4, m2
    mova            [t1+r10+400*4], m5
    paddd           m5, m3
    jmp             .hv_main2
.v: ; vertical boxsum + ab
    lea             r10, [wq-4]
.v_loop:
    mova            m0, [t1+r10+400*0]
    mova            m2, [t1+r10+400*2]
    mova            m3, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0 ; hv sum
    paddd           m4, m2 ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6 ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4 ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9 ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2 ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2 ; p
    psubd           m5, m3
    pmulld          m4, m10 ; p * s
    pmulld          m5, m10
    pmaddwd         m0, m11 ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrad           m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    psubw           m2, m12, m2 ; a
    paddd           m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova            [t4+r10+4], m2
    psrld           m0, 12 ; b
    psrld           m1, 12
    mova            [t3+r10*2+ 8], xm0
    vextracti128    [t3+r10*2+40], m0, 1
    mova            [t3+r10*2+24], xm1
    vextracti128    [t3+r10*2+56], m1, 1
    add             r10, 32
    jl              .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3 ; a 565
    paddd           m1, m4 ; b 565
    paddd           m2, m5
    mova            [t4+r10*1+400*2+ 0], m0
    mova            [t3+r10*2+400*4+ 0], m1
    mova            [t3+r10*2+400*4+32], m2
    add             r10, 32
    jl              .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov             r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3 ; a 565
    paddd           m1, m4 ; b 565
    paddd           m2, m5
    paddw           m3, m0, [t4+r10*1+400*2+ 0]
    paddd           m4, m1, [t3+r10*2+400*4+ 0]
    paddd           m5, m2, [t3+r10*2+400*4+32]
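; m3-m5 now hold the combined two-row (565 + 565) neighbor sums; the freshly
; computed row is stored below, replacing the banked one for the next call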
    mova            [t4+r10*1+400*2+ 0], m0
    mova            [t3+r10*2+400*4+ 0], m1
    mova            [t3+r10*2+400*4+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6 ; src
    punpcklwd       m2, m3, m6 ; a
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1 ; a * src + b + (1 << 8)
    paddd           m3, m4
    psrld           m2, 9
    psrld           m3, 9
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .n0_loop
    add             dstq, dst_strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov             r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+400*2+ 0]
    mova            m4, [t3+r10*2+400*4+ 0]
    mova            m5, [t3+r10*2+400*4+32]
    punpcklwd       m1, m0, m6 ; src
    punpcklwd       m2, m3, m6 ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1 ; a * src + b + (1 << 7)
    paddd           m3, m4
    psrld           m2, 8
    psrld           m3, 8
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .n1_loop
    add             dstq, dst_strideq
    ret

cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
                                                   lpf_stride, w, edge, params, h
    movifnidn       wd, wm
    mov             paramsq, paramsmp
    lea             r13, [sgr_x_by_x_avx2+256*4]
    mov             edged, r8m
    add             wd, wd
    mov             hd, r6m
    add             lpfq, wq
    vpbroadcastw    m7, [paramsq+10] ; w1
    lea             t1, [rsp+wq+12]
    vpbroadcastd    m8, [pd_8]
    add             dstq, wq
    vpbroadcastd    m9, [paramsq+ 4] ; s1
    lea             t3, [rsp+wq*2+400*12+8]
    vpbroadcastd    m10, [pd_0xf00801c7]
    lea             t4, [rsp+wq+400*32+8]
    vpbroadcastd    m11, [pd_34816]
    neg             wq
    vpbroadcastd    m12, [pw_256]
    pxor            m6, m6
    vpbroadcastd    m13, [pw_1023]
    psllw           m7, 4
    mova            xm14, [sgr_lshuf3]
    test            edgeb, 4 ; LR_HAVE_TOP
    jz              .no_top
    call            .h_top
    add             lpfq, lpf_strideq
    mov             t2, t1
    add             t1, 400*6
    call            .h_top
    lea             r10, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    add             r10, lpf_strideq
    mov             [rsp], r10 ; below
    call            .hv0
.main:
    dec             hd
    jz              .height1
    add             lpfq, dst_strideq
    call            .hv1
    call            .prep_n
    sub             hd, 2
    jl              .extend_bottom
.main_loop:
    add             lpfq, dst_strideq
    call            .hv0
    test            hd, hd
    jz              .odd_height
    add             lpfq, dst_strideq
    call            .hv1
    call            .n0
    call            .n1
    sub             hd, 2
    jge             .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz              .extend_bottom
    mov             lpfq, [rsp]
    call            .hv0_bottom
    add             lpfq, lpf_strideq
    call            .hv1_bottom
.end:
    call            .n0
    call            .n1
.end2:
    RET
.height1:
    call            .v1
    call            .prep_n
    jmp             .odd_height_end
.odd_height:
    call            .v1
    call            .n0
    call            .n1
.odd_height_end:
    call            .v0
    call            .v1
    call            .n0
    jmp             .end2
.extend_bottom:
    call            .v0
    call            .v1
    jmp             .end
.no_top:
    lea             r10, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+lpf_strideq*2]
    mov             [rsp], r10
    call            .h
    lea             r10, [wq-4]
    lea             t2, [t1+400*6]
.top_fixup_loop:
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m1
    mova            [t2+r10+400*4], m2
    add             r10, 32
    jl              .top_fixup_loop
    call            .v0
    jmp             .main
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 2]
    movu            m2, [r13+r10+18]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 12
    jmp             .h_main
.h_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, xm14
    vinserti128     m4, [lpfq+wq+12], 1
    jmp             .h_main
.h_top:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10+ 0]
.h_main:
    movu            m5, [lpfq+r10+16]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -34
    jl              .h_have_right
    call            .extend_right
.h_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5 ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4 ; sumsq
    paddd           m3, m5
    mova            [t1+r10+400*0], m1
    mova            [t1+r10+400*2], m2
    mova            [t1+r10+400*4], m3
    add             r10, 32
    jl              .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv0_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 12
    jmp             .hv0_main
.hv0_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, xm14
    vinserti128     m4, [lpfq+wq+12], 1
    jmp             .hv0_main
.hv0_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10+ 0]
.hv0_main:
    movu            m5, [lpfq+r10+16]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv0_have_right
    cmp             r10d, -34
    jl              .hv0_have_right
    call            .extend_right
.hv0_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5 ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4 ; sumsq
    paddd           m3, m5
    paddw           m0, m1, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    mova            [t1+r10+400*0], m1
    mova            [t1+r10+400*2], m2
    mova            [t1+r10+400*4], m3
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m4
    mova            [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psubw           m2, m12, m2
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*0+ 4], m2
    mova            [t3+r10*2+400*0+ 8], xm0
    vextracti128    [t3+r10*2+400*0+40], m0, 1
    mova            [t3+r10*2+400*0+24], xm1
    vextracti128    [t3+r10*2+400*0+56], m1, 1
    add             r10, 32
    jl              .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv1_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 12
    jmp             .hv1_main
.hv1_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, xm14
    vinserti128     m4, [lpfq+wq+12], 1
    jmp             .hv1_main
.hv1_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10+ 0]
.hv1_main:
    movu            m5, [lpfq+r10+16]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv1_have_right
    cmp             r10d, -34
    jl              .hv1_have_right
    call            .extend_right
.hv1_have_right:
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5 ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1 ; h sumsq
    paddd           m3, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m2
    mova            [t2+r10+400*4], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4 ; (a + 8) >> 4
    psrld           m5, 4
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psubw           m2, m12, m2
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*2 +4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl              .hv1_loop
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab (even rows)
    lea             r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m4
    mova            [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psubw           m2, m12, m2
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*0+ 4], m2
    mova            [t3+r10*2+400*0+ 8], xm0
    vextracti128    [t3+r10*2+400*0+40], m0, 1
    mova            [t3+r10*2+400*0+24], xm1
    vextracti128    [t3+r10*2+400*0+56], m1, 1
    add             r10, 32
    jl              .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova            [t2+r10+400*0], m0
    mova            [t2+r10+400*2], m4
    mova            [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4 ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6 ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2 ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9 ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10 ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psubw           m2, m12, m2
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*2+ 4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl              .v1_loop
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    mova            xm0, [t4+r10*1+400*0+0]
    paddw           xm0, [t4+r10*1+400*0+4]
    paddw           xm2, xm0, [t4+r10*1+400*0+2]
    mova            m1, [t3+r10*2+400*0+0]
    paddd           m1, [t3+r10*2+400*0+8]
    paddd           m3, m1, [t3+r10*2+400*0+4]
    psllw           xm2, 2 ; a[-1] 444
    pslld           m3, 2 ; b[-1] 444
    psubw           xm2, xm0 ; a[-1] 343
    psubd           m3, m1 ; b[-1] 343
    mova            [t4+r10*1+400* 4], xm2
    mova            [t3+r10*2+400* 8], m3
    mova            xm0, [t4+r10*1+400*2+0]
    paddw           xm0, [t4+r10*1+400*2+4]
    paddw           xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw           xm2, 2 ; a[ 0] 444
    pslld           m3, 2 ; b[ 0] 444
    mova            [t4+r10*1+400* 6], xm2
    mova            [t3+r10*2+400*12], m3
    psubw           xm2, xm0 ; a[ 0] 343
    psubd           m3, m1 ; b[ 0] 343
    mova            [t4+r10*1+400* 8], xm2
    mova            [t3+r10*2+400*16], m3
    add             r10, 16
    jl              .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov             r10, wq
.n0_loop:
    mova            m3, [t4+r10*1+400*0+0]
    paddw           m3, [t4+r10*1+400*0+4]
    paddw           m1, m3, [t4+r10*1+400*0+2]
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*4]
    paddw           m3, [t4+r10*1+400*6]
    mova            [t4+r10*1+400*4], m2
    mova            [t4+r10*1+400*6], m1
    mova            m4, [t3+r10*2+400*0+0]
    paddd           m4, [t3+r10*2+400*0+8]
    paddd           m1, m4, [t3+r10*2+400*0+4]
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
    paddd           m4, [t3+r10*2+400*12+ 0]
    mova            [t3+r10*2+400* 8+ 0], m2
    mova            [t3+r10*2+400*12+ 0], m1
    mova            m5, [t3+r10*2+400*0+32]
    paddd           m5, [t3+r10*2+400*0+40]
    paddd           m1, m5, [t3+r10*2+400*0+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400* 8+32]
    paddd           m5, [t3+r10*2+400*12+32]
    mova            [t3+r10*2+400* 8+32], m2
    mova            [t3+r10*2+400*12+32], m1
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1 ; a * src + b + (1 << 8)
    paddd           m3, m4
    psrld           m2, 9
    psrld           m3, 9
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .n0_loop
    add             dstq, dst_strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov             r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+400*2+0]
    paddw           m3, [t4+r10*1+400*2+4]
    paddw           m1, m3, [t4+r10*1+400*2+2]
    psllw           m1, 2 ; a[ 1] 444
    psubw           m2, m1, m3 ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*6]
    paddw           m3, [t4+r10*1+400*8]
    mova            [t4+r10*1+400*6], m1
    mova            [t4+r10*1+400*8], m2
    mova            m4, [t3+r10*2+400*4+0]
    paddd           m4, [t3+r10*2+400*4+8]
    paddd           m1, m4, [t3+r10*2+400*4+4]
    pslld           m1, 2 ; b[ 1] 444
    psubd           m2, m1, m4 ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400*12+ 0]
    paddd           m4, [t3+r10*2+400*16+ 0]
    mova            [t3+r10*2+400*12+ 0], m1
    mova            [t3+r10*2+400*16+ 0], m2
    mova            m5, [t3+r10*2+400*4+32]
    paddd           m5, [t3+r10*2+400*4+40]
    paddd           m1, m5, [t3+r10*2+400*4+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400*12+32]
    paddd           m5, [t3+r10*2+400*16+32]
    mova            [t3+r10*2+400*12+32], m1
    mova            [t3+r10*2+400*16+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1 ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1 ; a * src + b + (1 << 8)
    paddd           m3, m4
    psrld           m2, 9
    psrld           m3, 9
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova            [dstq+r10], m0
    add             r10, 32
    jl              .n1_loop
    add             dstq, dst_strideq
    ret

cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \
                                                   lpf_stride, w, edge, params, h
    movifnidn       wd, wm
    mov             paramsq, paramsmp
    lea             r13, [sgr_x_by_x_avx2+256*4]
    mov             edged, r8m
    add             wd, wd
    mov             hd, r6m
    add             lpfq, wq
    vpbroadcastd    m9, [pd_8]
    lea             t1, [rsp+wq+12]
    vpbroadcastd    m10, [pd_34816]
    add             dstq, wq
    vpbroadcastd    m11, [pw_256]
    lea             t3, [rsp+wq*2+400*24+8]
    vpbroadcastd    m12, [pd_0xf00801c7]
    lea             t4, [rsp+wq+400*52+8]
    vpbroadcastd    m15, [paramsq+8] ; w0 w1
    neg             wq
    vpbroadcastd    m13, [paramsq+0] ; s0
    pxor            m7, m7
    vpbroadcastd    m14, [paramsq+4] ; s1
    psllw           m15, 2
    test            edgeb, 4 ; LR_HAVE_TOP
    jz              .no_top
    call            .h_top
    add             lpfq, lpf_strideq
    mov             t2, t1
    call            mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add             t1, 400*12
    call            .h_top
    lea             r10, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    add             r10, lpf_strideq
    mov             [rsp], r10 ; below
    call            .hv0
.main:
    dec             hd
    jz              .height1
    add             lpfq, dst_strideq
    call            .hv1
    call            .prep_n
    sub             hd, 2
    jl              .extend_bottom
.main_loop:
    add             lpfq, dst_strideq
    call            .hv0
    test            hd, hd
    jz              .odd_height
    add             lpfq, dst_strideq
    call            .hv1
    call            .n0
    call            .n1
    sub             hd, 2
    jge             .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz              .extend_bottom
    mov             lpfq, [rsp]
    call            .hv0_bottom
    add             lpfq, lpf_strideq
    call            .hv1_bottom
.end:
    call            .n0
    call            .n1
.end2:
    RET
.height1:
    call            .v1
    call            .prep_n
    jmp             .odd_height_end
.odd_height:
    call            .v1
    call            .n0
    call            .n1
.odd_height_end:
    call            .v0
    call            .v1
    call            .n0
    jmp             .end2
.extend_bottom:
    call            .v0
    call            .v1
    jmp             .end
.no_top:
    lea             r10, [lpfq+lpf_strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+lpf_strideq*2]
    mov             [rsp], r10
    call            .h
    lea             r10, [wq-4]
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+r10+400* 0]
    mova            m1, [t1+r10+400* 2]
    mova            m2, [t1+r10+400* 4]
    paddw           m0, m0
    mova            m3, [t1+r10+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+400*10]
    mova            [t2+r10+400* 0], m0
    mova            [t2+r10+400* 2], m1
    mova            [t2+r10+400* 4], m2
    mova            [t2+r10+400* 6], m3
    mova            [t2+r10+400* 8], m4
    mova            [t2+r10+400*10], m5
    add             r10, 32
    jl              .top_fixup_loop
    call            .v0
    jmp             .main
.h: ; horizontal boxsum
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp             .h_main
.h_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp             .h_main
.h_top:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -36
    jl              .h_have_right
    call            mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0 ; sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6 ; sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    mova            [t1+r10+400* 6], m1
    mova            [t1+r10+400* 8], m2
    mova            [t1+r10+400*10], m3
    paddw           m8, m1 ; sum5
    paddd           m5, m2 ; sumsq5
    paddd           m6, m3
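; the 5x5 sums are built on top of the 3x3 ones (sum5 = sum3 plus the two
; outermost columns), so a single horizontal pass feeds both box sizes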
    mova            [t1+r10+400* 0], m8
    mova            [t1+r10+400* 2], m5
    mova            [t1+r10+400* 4], m6
    add             r10, 32
    jl              .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv0_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp             .hv0_main
.hv0_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp             .hv0_main
.hv0_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10- 2]
.hv0_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv0_have_right
    cmp             r10d, -36
    jl              .hv0_have_right
    call            mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0 ; h sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6 ; h sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    paddw           m8, m1 ; h sum5
    paddd           m5, m2 ; h sumsq5
    paddd           m6, m3
    mova            [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
    mova            [t3+r10*2+400*0+ 8], m5 ; in case height is odd
    mova            [t3+r10*2+400*0+40], m6
    paddw           m8, [t1+r10+400* 0]
    paddd           m5, [t1+r10+400* 2]
    paddd           m6, [t1+r10+400* 4]
    mova            [t1+r10+400* 0], m8
    mova            [t1+r10+400* 2], m5
    mova            [t1+r10+400* 4], m6
    paddw           m0, m1, [t1+r10+400* 6]
    paddd           m4, m2, [t1+r10+400* 8]
    paddd           m5, m3, [t1+r10+400*10]
    mova            [t1+r10+400* 6], m1
    mova            [t1+r10+400* 8], m2
    mova            [t1+r10+400*10], m3
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova            [t2+r10+400* 6], m0
    mova            [t2+r10+400* 8], m4
    mova            [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7 ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2 ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m2, m11, m2
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*2+ 4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl              .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
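; odd rows produce both box sizes: ab3 from this row pair's sums, ab5 by
; also adding in the 5x5 sums banked in the t1/t2 row buffers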
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv1_extend_left
    vpbroadcastq    xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add             leftq, 8
    palignr         m4, m5, 10
    jmp             .hv1_main
.hv1_extend_left:
    mova            xm4, [lpfq+wq]
    pshufb          xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp             .hv1_main
.hv1_bottom:
    lea             r10, [wq-4]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10- 2]
.hv1_main:
    movu            m5, [lpfq+r10+14]
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv1_have_right
    cmp             r10d, -36
    jl              .hv1_have_right
    call            mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    palignr         m6, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m6, m3
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    palignr         m3, m5, m4, 6
    paddw           m2, m3 ; h sum3
    punpcklwd       m1, m3, m7
    pmaddwd         m1, m1
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    paddd           m0, m1 ; h sumsq3
    shufpd          m1, m4, m5, 0x05
    punpckhwd       m5, m4, m1
    paddw           m8, m4, m1
    pmaddwd         m5, m5
    punpcklwd       m4, m1
    pmaddwd         m4, m4
    paddd           m6, m3
    paddw           m1, m2, [t2+r10+400* 6]
    mova            [t2+r10+400* 6], m2
    paddw           m8, m2 ; h sum5
    paddd           m2, m0, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova            [t2+r10+400* 8], m0
    mova            [t2+r10+400*10], m6
    paddd           m4, m0 ; h sumsq5
    paddd           m5, m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m6, m3, 3
    paddd           m2, m0 ; ((a3 + 8) >> 4) * 9
    paddd           m3, m6
    psrlw           m6, m1, 1
    pavgw           m6, m7 ; (b3 + 2) >> 2
    punpcklwd       m0, m6, m7
    pmaddwd         m0, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m6
    pmaxud          m2, m0
    psubd           m2, m0 ; p3
    pmaxud          m3, m6
    psubd           m3, m6
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmulld          m2, m14 ; p3 * s1
    pmulld          m3, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrad           m7, m2, 20 ; min(z3, 255) - 256
    vpgatherdd      m6, [r13+m7*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m7, [r13+m2*4], m3
    pmulld          m0, m6
    packssdw        m6, m7
    pmulld          m7, m1
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m7, m10
    psubw           m6, m11, m6
    psrld           m0, 12
    psrld           m7, 12
    paddw           m1, m8, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova            [t2+r10+400*0], m8
    mova            [t2+r10+400*2], m4
    mova            [t2+r10+400*4], m5
    mova            [t4+r10*1+400*4 +4], m6
    mova            [t3+r10*2+400*8+ 8], xm0
    vextracti128    [t3+r10*2+400*8+40], m0, 1
    mova            [t3+r10*2+400*8+24], xm7
    vextracti128    [t3+r10*2+400*8+56], m7, 1
    vpbroadcastd    m4, [pd_25]
    pxor            m7, m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4 ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7 ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7 ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4 ; p5
    vpbroadcastd    m4, [pd_0xf00800a4]
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13 ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4 ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20 ; min(z5, 255) - 256
    vpgatherdd      m4, [r13+m5*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m5, [r13+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m4, m11, m4
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*0+ 4], m4
    mova            [t3+r10*2+400*0+ 8], xm0
    vextracti128    [t3+r10*2+400*0+40], m0, 1
    mova            [t3+r10*2+400*0+24], xm1
    vextracti128    [t3+r10*2+400*0+56], m1, 1
    add             r10, 32
    jl              .hv1_loop
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea             r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400* 6]
    mova            m4, [t1+r10+400* 8]
    mova            m5, [t1+r10+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova            [t2+r10+400* 6], m0
    mova            [t2+r10+400* 8], m4
    mova            [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7 ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2 ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m2, m11, m2
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    mova            [t3+r10*2+400*8+ 8], m3
    mova            [t3+r10*2+400*0+ 8], m4
    mova            [t3+r10*2+400*0+40], m5
    paddw           m3, m3 ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova            [t1+r10+400*0], m3
    mova            [t1+r10+400*2], m4
    mova            [t1+r10+400*4], m5
    mova            [t4+r10*1+400*2+ 4], m2
    mova            [t3+r10*2+400*4+ 8], xm0
    vextracti128    [t3+r10*2+400*4+40], m0, 1
    mova            [t3+r10*2+400*4+24], xm1
    vextracti128    [t3+r10*2+400*4+56], m1, 1
    add             r10, 32
    jl              .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea             r10, [wq-4]
.v1_loop:
    mova            m4, [t1+r10+400* 6]
    mova            m5, [t1+r10+400* 8]
    mova            m6, [t1+r10+400*10]
    paddw           m1, m4, [t2+r10+400* 6]
    paddd           m2, m5, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova            [t2+r10+400* 6], m4
    mova            [t2+r10+400* 8], m5
    mova            [t2+r10+400*10], m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7 ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7 ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2 ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14 ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12 ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m2, m11, m2
    psrld           m0, 12
    psrld           m8, m1, 12
    mova            [t4+r10*1+400*4+4], m2
    mova            m4, [t3+r10*2+400*8+ 8]
    mova            m5, [t3+r10*2+400*0+ 8]
    mova            m6, [t3+r10*2+400*0+40]
    paddw           m1, m4, [t2+r10+400*0]
    paddd           m2, m5, [t2+r10+400*2]
    paddd           m3, m6, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova            [t2+r10+400*0], m4
    mova            [t2+r10+400*2], m5
    mova            [t2+r10+400*4], m6
    vpbroadcastd    m4, [pd_25]
    mova            [t3+r10*2+400*8+ 8], xm0
    vextracti128    [t3+r10*2+400*8+40], m0, 1
    mova            [t3+r10*2+400*8+24], xm8
    vextracti128    [t3+r10*2+400*8+56], m8, 1
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4 ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4 ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7 ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7 ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4 ; p5
    vpbroadcastd    m4, [pd_0xf00800a4]
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13 ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4 ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20 ; min(z5, 255) - 256
    vpgatherdd      m4, [r13+m5*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m5, [r13+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m4, m11, m4
    psrld           m0, 12
    psrld           m1, 12
    mova            [t4+r10*1+400*0+ 4], m4
    mova            [t3+r10*2+400*0+ 8], xm0
    vextracti128    [t3+r10*2+400*0+40], m0, 1
    mova            [t3+r10*2+400*0+24], xm1
    vextracti128    [t3+r10*2+400*0+56], m1, 1
    add             r10, 32
    jl              .v1_loop
    mov             r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov             r10, wq
.prep_n_loop:
    movu            xm0, [t4+r10*1+400*0+2]
    paddw           xm2, xm0, [t4+r10*1+400*0+0]
    paddw           xm2, [t4+r10*1+400*0+4]
    movu            m1, [t3+r10*2+400*0+4]
    paddd           m3, m1, [t3+r10*2+400*0+0]
    paddd           m3, [t3+r10*2+400*0+8]
    paddw           xm0, xm2
    paddd           m1, m3
    psllw           xm2, 2
    pslld           m3, 2
    paddw           xm0, xm2 ; a5 565
    paddd           m1, m3 ; b5 565
    mova            [t4+r10*1+400* 6], xm0
    mova            [t3+r10*2+400*12], m1
    mova            xm0, [t4+r10*1+400*2+0]
    paddw           xm0, [t4+r10*1+400*2+4]
    paddw           xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw           xm2, 2 ; a3[-1] 444
    pslld           m3, 2 ; b3[-1] 444
    psubw           xm2, xm0 ; a3[-1] 343
    psubd           m3, m1 ; b3[-1] 343
    mova            [t4+r10*1+400* 8], xm2
    mova            [t3+r10*2+400*16], m3
    mova            xm0, [t4+r10*1+400*4+0]
    paddw           xm0, [t4+r10*1+400*4+4]
    paddw           xm2, xm0, [t4+r10*1+400*4+2]
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m3, m1, [t3+r10*2+400*8+4]
    psllw           xm2, 2 ; a3[ 0] 444
    pslld           m3, 2 ; b3[ 0] 444
    mova            [t4+r10*1+400*10], xm2
    mova            [t3+r10*2+400*20], m3
    psubw           xm2, xm0 ; a3[ 0] 343
    psubd           m3, m1 ; b3[ 0] 343
    mova            [t4+r10*1+400*12], xm2
    mova            [t3+r10*2+400*24], m3
    add             r10, 16
    jl              .prep_n_loop
    ret
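; the output stages below interleave the 5x5 and 3x3 results with pblendw so
; that a single pmaddwd against the packed w0/w1 weights in m15 applies both
; filter weights at once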
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov             r10, wq
.n0_loop:
    movu            xm2, [t4+r10*1+2]
    paddw           xm0, xm2, [t4+r10*1+0]
    paddw           xm0, [t4+r10*1+4]
    paddw           xm2, xm0
    psllw           xm0, 2
    paddw           xm0, xm2 ; a5
    movu            m1, [t3+r10*2+4]
    paddd           m4, m1, [t3+r10*2+0]
    paddd           m4, [t3+r10*2+8]
    paddd           m1, m4
    pslld           m4, 2
    paddd           m4, m1 ; b5
    paddw           xm2, xm0, [t4+r10*1+400* 6]
    mova            [t4+r10*1+400* 6], xm0
    paddd           m0, m4, [t3+r10*2+400*12]
    mova            [t3+r10*2+400*12], m4
    mova            xm3, [t4+r10*1+400*2+0]
    paddw           xm3, [t4+r10*1+400*2+4]
    paddw           xm5, xm3, [t4+r10*1+400*2+2]
    psllw           xm5, 2 ; a3[ 1] 444
    psubw           xm4, xm5, xm3 ; a3[ 1] 343
    paddw           xm3, xm4, [t4+r10*1+400* 8]
    paddw           xm3, [t4+r10*1+400*10]
    mova            [t4+r10*1+400* 8], xm4
    mova            [t4+r10*1+400*10], xm5
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m5, m1, [t3+r10*2+400*4+4]
    pslld           m5, 2 ; b3[ 1] 444
    psubd           m4, m5, m1 ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*16]
    paddd           m1, [t3+r10*2+400*20]
    mova            [t3+r10*2+400*16], m4
    mova            [t3+r10*2+400*20], m5
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, xm2 ; a5
    pmovzxwd        m3, xm3 ; a3
    pmaddwd         m2, m4 ; a5 * src
    pmaddwd         m3, m4 ; a3 * src
    pslld           m4, 13
    psubd           m0, m4
    psubd           m1, m4
    paddd           m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13)
    paddd           m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13)
    psrld           m0, 9
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    vpbroadcastd    m1, [pd_4096]
    paddd           m4, m1
    paddd           m0, m4
    psrad           m0, 7
    vextracti128    xm1, m0, 1
    packusdw        xm0, xm1 ; clip
    psrlw           xm0, 6
    mova            [dstq+r10], xm0
    add             r10, 16
    jl              .n0_loop
    add             dstq, dst_strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov             r10, wq
.n1_loop:
    mova            xm3, [t4+r10*1+400*4+0]
    paddw           xm3, [t4+r10*1+400*4+4]
    paddw           xm5, xm3, [t4+r10*1+400*4+2]
    psllw           xm5, 2 ; a3[ 1] 444
    psubw           xm4, xm5, xm3 ; a3[ 1] 343
    paddw           xm3, xm4, [t4+r10*1+400*12]
    paddw           xm3, [t4+r10*1+400*10]
    mova            [t4+r10*1+400*10], xm5
    mova            [t4+r10*1+400*12], xm4
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m5, m1, [t3+r10*2+400*8+4]
    pslld           m5, 2 ; b3[ 1] 444
    psubd           m4, m5, m1 ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*24]
    paddd           m1, [t3+r10*2+400*20]
    mova            [t3+r10*2+400*20], m5
    mova            [t3+r10*2+400*24], m4
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m0, [t4+r10*1+400* 6]
    pmovzxwd        m3, xm3
    pmaddwd         m0, m4 ; a5 * src
    pmaddwd         m3, m4 ; a3 * src
    pslld           m4, 12
    psubd           m2, m4, [t3+r10*2+400*12]
    paddd           m4, m4
    psubd           m1, m4
    psubd           m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13)
    paddd           m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13)
    psrld           m0, 8
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    vpbroadcastd    m1, [pd_4096]
    paddd           m4, m1
    paddd           m0, m4
    psrad           m0, 7
    vextracti128    xm1, m0, 1
    packusdw        xm0, xm1 ; clip
    psrlw           xm0, 6
    mova            [dstq+r10], xm0
    add             r10, 16
    jl              .n1_loop
    add             dstq, dst_strideq
    ret

%endif ; ARCH_X86_64