; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
          db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round: dd 1049600, 1048832

pb_m10_m9: times 2 db -10, -9
pb_m6_m5: times 2 db -6, -5
pb_m2_m1: times 2 db -2, -1
pb_2_3: times 2 db 2, 3
pb_6_7: times 2 db 6, 7
pw_1023: times 2 dw 1023
pd_8: dd 8
pd_25: dd 25
pd_4096: dd 4096
pd_34816: dd 34816
pd_m262128: dd -262128
pd_0xf00800a4: dd 0xf00800a4
pd_0xf00801c7: dd 0xf00801c7

%define pw_256 sgr_lshuf5

cextern sgr_x_by_x_avx2

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers

INIT_YMM avx2
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                          w, h, edge, flt
%define base t4-wiener_hshift
    mov fltq, r6mp
    movifnidn wd, wm
    movifnidn hd, hm
    mov edged, r7m
    mov t3d, r8m ; pixel_max
    vbroadcasti128 m6, [wiener_shufA]
    vpbroadcastd m12, [fltq+ 0] ; x0 x1
    lea t4, [wiener_hshift]
    vbroadcasti128 m7, [wiener_shufB]
    add wd, wd
    vpbroadcastd m13, [fltq+ 4] ; x2 x3
    shr t3d, 11
    vpbroadcastd m14, [fltq+16] ; y0 y1
    add lpfq, wq
    vpbroadcastd m15, [fltq+20] ; y2 y3
    add dstq, wq
    vbroadcasti128 m8, [wiener_shufC]
    lea t1, [rsp+wq+16]
    vbroadcasti128 m9, [wiener_shufD]
    neg wq
    vpbroadcastd m0, [base+wiener_hshift+t3*4]
    vpbroadcastd m10, [base+wiener_round+t3*4]
    vpbroadcastd m11, [base+wiener_vshift+t3*4]
    pmullw m12, m0 ; upshift filter coefs to make the
    pmullw m13, m0 ; horizontal downshift constant
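    ; bitdepth-dependent scaling: the horizontal coefs are multiplied by
    ; 4 for 10-bit and by 1 for 12-bit (wiener_hshift), so both bitdepths
    ; can share the same shift amounts below; the wiener_round and
    ; wiener_vshift tables compensate for this in the vertical pass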
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t6, t1
    mov t5, t1
    add t1, 384*2
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    mov t4, t1
    add t1, 384*2
    add r10, strideq
    mov [rsp], r10 ; below
    call .h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
.main:
    lea t0, [t1+384*2]
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov lpfq, [rsp]
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v3
    add t0, 384*8
    call .hv
    dec hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.extend_right:
    movd xm1, r10d
    vpbroadcastd m0, [pb_6_7]
    movu m2, [pb_0to31]
    vpbroadcastb m1, xm1
    psubb m0, m1
    pminub m0, m2
    pshufb m3, m0
    vpbroadcastd m0, [pb_m2_m1]
    psubb m0, m1
    pminub m0, m2
    pshufb m4, m0
    vpbroadcastd m0, [pb_m10_m9]
    psubb m0, m1
    pminub m0, m2
    pshufb m5, m0
    ret
.h:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq xm3, [leftq]
    vpblendd m3, [lpfq+r10-8], 0xfc
    add leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located
    mova m4, [lpfq+r10]           ; before the start of the buffer
    shufpd m3, m4, 0x05
    pshufb m3, [wiener_lshuf7]
    jmp .h_main2
.h_top:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m3, [lpfq+r10-8]
.h_main:
    mova m4, [lpfq+r10+0]
.h_main2:
    movu m5, [lpfq+r10+8]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb m0, m3, m6
    pshufb m1, m4, m7
    paddw m0, m1
    pshufb m3, m8
    pmaddwd m0, m12
    pshufb m1, m4, m9
    paddw m3, m1
    pshufb m1, m4, m6
    pmaddwd m3, m13
    pshufb m2, m5, m7
    paddw m1, m2
    vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb m4, m8
    pmaddwd m1, m12
    pshufb m5, m9
    paddw m4, m5
    pmaddwd m4, m13
    paddd m0, m2
    paddd m1, m2
    paddd m0, m3
    paddd m1, m4
    psrad m0, 4
    psrad m1, 4
    packssdw m0, m1
    psraw m0, 1
    mova [t1+r10], m0
    add r10, 32
    jl .h_loop
    ret
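    ; .hv below filters a new input row horizontally and, in the same
    ; pass, computes one output row vertically from the rows buffered
    ; in t1-t6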
ALIGN function_align
.hv:
    add lpfq, strideq
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq xm3, [leftq]
    vpblendd m3, [lpfq+r10-8], 0xfc
    add leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu m3, [lpfq+r10-8]
    pshufb m3, [wiener_lshuf7]
    jmp .hv_main
.hv_bottom:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m3, [lpfq+r10-8]
.hv_main:
    mova m4, [lpfq+r10+0]
    movu m5, [lpfq+r10+8]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb m0, m3, m6
    pshufb m1, m4, m7
    paddw m0, m1
    pshufb m3, m8
    pmaddwd m0, m12
    pshufb m1, m4, m9
    paddw m3, m1
    pshufb m1, m4, m6
    pmaddwd m3, m13
    pshufb m2, m5, m7
    paddw m1, m2
    vpbroadcastd m2, [pd_m262128]
    pshufb m4, m8
    pmaddwd m1, m12
    pshufb m5, m9
    paddw m4, m5
    pmaddwd m4, m13
    paddd m0, m2
    paddd m1, m2
    mova m2, [t4+r10]
    paddw m2, [t2+r10]
    mova m5, [t3+r10]
    paddd m0, m3
    paddd m1, m4
    psrad m0, 4
    psrad m1, 4
    packssdw m0, m1
    mova m4, [t5+r10]
    paddw m4, [t1+r10]
    psraw m0, 1
    paddw m3, m0, [t6+r10]
    mova [t0+r10], m0
    punpcklwd m0, m2, m5
    pmaddwd m0, m15
    punpckhwd m2, m5
    pmaddwd m2, m15
    punpcklwd m1, m3, m4
    pmaddwd m1, m14
    punpckhwd m3, m4
    pmaddwd m3, m14
    paddd m0, m10
    paddd m2, m10
    paddd m0, m1
    paddd m2, m3
    psrad m0, 5
    psrad m2, 5
    packusdw m0, m2
    pmulhuw m0, m11
    mova [dstq+r10], m0
    add r10, 32
    jl .hv_loop
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
    add dstq, strideq
    ret
.v:
    mov r10, wq
.v_loop:
    mova m1, [t4+r10]
    paddw m1, [t2+r10]
    mova m2, [t3+r10]
    mova m4, [t1+r10]
    paddw m3, m4, [t6+r10]
    paddw m4, [t5+r10]
    punpcklwd m0, m1, m2
    pmaddwd m0, m15
    punpckhwd m1, m2
    pmaddwd m1, m15
    punpcklwd m2, m3, m4
    pmaddwd m2, m14
    punpckhwd m3, m4
    pmaddwd m3, m14
    paddd m0, m10
    paddd m1, m10
    paddd m0, m2
    paddd m1, m3
    psrad m0, 5
    psrad m1, 5
    packusdw m0, m1
    pmulhuw m0, m11
    mova [dstq+r10], m0
    add r10, 32
    jl .v_loop
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, strideq
    ret

cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                          w, h, edge, flt
%define base t4-wiener_hshift
    mov fltq, r6mp
    movifnidn wd, wm
    movifnidn hd, hm
    mov edged, r7m
    mov t3d, r8m ; pixel_max
    vbroadcasti128 m5, [wiener_shufE]
    vpbroadcastw m11, [fltq+ 2] ; x1
    vbroadcasti128 m6, [wiener_shufB]
    lea t4, [wiener_hshift]
    vbroadcasti128 m7, [wiener_shufD]
    add wd, wd
    vpbroadcastd m12, [fltq+ 4] ; x2 x3
    shr t3d, 11
    vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add lpfq, wq
    vpbroadcastw m13, [fltq+18] ; y1
    add dstq, wq
    vpbroadcastd m14, [fltq+20] ; y2 y3
    lea t1, [rsp+wq+16]
    neg wq
    vpbroadcastd m0, [base+wiener_hshift+t3*4]
    vpbroadcastd m9, [base+wiener_round+t3*4]
    vpbroadcastd m10, [base+wiener_vshift+t3*4]
    movu xm15, [wiener_lshuf5]
    pmullw m11, m0
    vinserti128 m15, [pb_0to31], 1
    pmullw m12, m0
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t4, t1
    add t1, 384*2
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    mov t3, t1
    add t1, 384*2
    add r10, strideq
    mov [rsp], r10 ; below
    call .h
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
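    ; t0 shares a buffer with one of the older ring slots (initially t4);
    ; this appears safe because .hv reads the old rows at each offset
    ; before storing the new row to [t0]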
.main:
    mov t0, t4
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov lpfq, [rsp]
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v2
    add t0, 384*6
    call .hv
    dec hd
    jnz .main
.v2:
    call .v
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, strideq
.v1:
    call .v
    jmp .end
.extend_right:
    movd xm2, r10d
    vpbroadcastd m0, [pb_2_3]
    vpbroadcastd m1, [pb_m6_m5]
    vpbroadcastb m2, xm2
    psubb m0, m2
    psubb m1, m2
    movu m2, [pb_0to31]
    pminub m0, m2
    pminub m1, m2
    pshufb m3, m0
    pshufb m4, m1
    ret
.h:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm3, [leftq+4]
    vpblendd m3, [lpfq+r10-4], 0xfe
    add leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located
    mova m3, [lpfq+r10]           ; before the start of the buffer
    palignr m3, m4, 12
    pshufb m3, m15
    jmp .h_main
.h_top:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m3, [lpfq+r10-4]
.h_main:
    movu m4, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb m0, m3, m5
    pmaddwd m0, m11
    pshufb m1, m4, m5
    pmaddwd m1, m11
    pshufb m2, m3, m6
    pshufb m3, m7
    paddw m2, m3
    pshufb m3, m4, m6
    pmaddwd m2, m12
    pshufb m4, m7
    paddw m3, m4
    pmaddwd m3, m12
    paddd m0, m8
    paddd m1, m8
    paddd m0, m2
    paddd m1, m3
    psrad m0, 4
    psrad m1, 4
    packssdw m0, m1
    psraw m0, 1
    mova [t1+r10], m0
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, strideq
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm3, [leftq+4]
    vpblendd m3, [lpfq+r10-4], 0xfe
    add leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu m3, [lpfq+r10-4]
    pshufb m3, m15
    jmp .hv_main
.hv_bottom:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m3, [lpfq+r10-4]
.hv_main:
    movu m4, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb m0, m3, m5
    pmaddwd m0, m11
    pshufb m1, m4, m5
    pmaddwd m1, m11
    pshufb m2, m3, m6
    pshufb m3, m7
    paddw m2, m3
    pshufb m3, m4, m6
    pmaddwd m2, m12
    pshufb m4, m7
    paddw m3, m4
    pmaddwd m3, m12
    paddd m0, m8
    paddd m1, m8
    paddd m0, m2
    mova m2, [t3+r10]
    paddw m2, [t1+r10]
    paddd m1, m3
    mova m4, [t2+r10]
    punpckhwd m3, m2, m4
    pmaddwd m3, m14
    punpcklwd m2, m4
    mova m4, [t4+r10]
    psrad m0, 4
    psrad m1, 4
    packssdw m0, m1
    pmaddwd m2, m14
    psraw m0, 1
    mova [t0+r10], m0
    punpckhwd m1, m0, m4
    pmaddwd m1, m13
    punpcklwd m0, m4
    pmaddwd m0, m13
    paddd m3, m9
    paddd m2, m9
    paddd m1, m3
    paddd m0, m2
    psrad m1, 5
    psrad m0, 5
    packusdw m0, m1
    pmulhuw m0, m10
    mova [dstq+r10], m0
    add r10, 32
    jl .hv_loop
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t4
    add dstq, strideq
    ret
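    ; .v emits one output row purely from the buffered rows, without
    ; reading a new input line (used to drain the buffer at the bottom)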
.v:
    mov r10, wq
.v_loop:
    mova m0, [t1+r10]
    paddw m2, m0, [t3+r10]
    mova m1, [t2+r10]
    mova m4, [t4+r10]
    punpckhwd m3, m2, m1
    pmaddwd m3, m14
    punpcklwd m2, m1
    pmaddwd m2, m14
    punpckhwd m1, m0, m4
    pmaddwd m1, m13
    punpcklwd m0, m4
    pmaddwd m0, m13
    paddd m3, m9
    paddd m2, m9
    paddd m1, m3
    paddd m0, m2
    psrad m1, 5
    psrad m0, 5
    packusdw m0, m1
    pmulhuw m0, m10
    mova [dstq+r10], m0
    add r10, 32
    jl .v_loop
    ret

cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \
                                          w, h, edge, params
    movifnidn wd, wm
    mov paramsq, r6mp
    lea r13, [sgr_x_by_x_avx2+256*4]
    movifnidn hd, hm
    mov edged, r7m
    add wd, wd
    vpbroadcastw m7, [paramsq+8] ; w0
    add lpfq, wq
    vpbroadcastd m8, [pd_8]
    add dstq, wq
    vpbroadcastd m9, [pd_25]
    lea t3, [rsp+wq*2+400*12+16]
    vpbroadcastd m10, [paramsq+0] ; s0
    lea t4, [rsp+wq+400*20+16]
    vpbroadcastd m11, [pd_0xf00800a4]
    lea t1, [rsp+wq+20]
    mova xm12, [sgr_lshuf5]
    neg wq
    vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15)
    pxor m6, m6
    vpbroadcastd m14, [pw_1023]
    psllw m7, 4
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    call .top_fixup
    add t1, 400*6
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov [rsp], r10 ; below
    mov t0, t2
    dec hd
    jz .height1
    or edged, 16
    call .h
.main:
    add lpfq, strideq
    call .hv
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, strideq
    test hd, hd
    jz .odd_height
    call .h
    add lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .h_top
    add lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    lea t2, [t1+400*6]
    call .top_fixup
    dec hd
    jz .no_top_height1
    or edged, 16
    mov t0, t1
    mov t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    vpbroadcastw m0, [lpfq-2]
    movu m1, [r13+r10+ 0]
    movu m2, [r13+r10+16]
    vpblendvb m4, m0, m1
    vpblendvb m5, m0, m2
    ret
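    ; per row, .h stores the 16-bit box sums at [t1+400*0] and the
    ; 32-bit sums of squares (low/high halves) at [t1+400*2]/[t1+400*4]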
.h: ; horizontal boxsum
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, xm12
    vinserti128 m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+r10- 2]
.h_main:
    movu m5, [lpfq+r10+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr m2, m5, m4, 2
    paddw m0, m4, m2
    palignr m3, m5, m4, 6
    paddw m0, m3
    punpcklwd m1, m2, m3
    pmaddwd m1, m1
    punpckhwd m2, m3
    pmaddwd m2, m2
    shufpd m5, m4, m5, 0x05
    paddw m0, m5
    punpcklwd m3, m4, m5
    pmaddwd m3, m3
    paddd m1, m3
    punpckhwd m3, m4, m5
    pmaddwd m3, m3
    shufps m4, m5, q2121
    paddw m0, m4 ; sum
    punpcklwd m5, m4, m6
    pmaddwd m5, m5
    punpckhwd m4, m6
    pmaddwd m4, m4
    paddd m2, m3
    test edgeb, 16 ; y > 0
    jz .h_loop_end
    paddw m0, [t1+r10+400*0]
    paddd m1, [t1+r10+400*2]
    paddd m2, [t1+r10+400*4]
.h_loop_end:
    paddd m1, m5 ; sumsq
    paddd m2, m4
    mova [t1+r10+400*0], m0
    mova [t1+r10+400*2], m1
    mova [t1+r10+400*4], m2
    add r10, 32
    jl .h_loop
    ret
.top_fixup:
    lea r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova m0, [t1+r10+400*0]
    mova m1, [t1+r10+400*2]
    mova m2, [t1+r10+400*4]
    paddw m0, m0
    paddd m1, m1
    paddd m2, m2
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add r10, 32
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 10
    jmp .hv_main
.hv_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, xm12
    vinserti128 m4, [lpfq+wq+10], 1
    jmp .hv_main
.hv_bottom:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+r10- 2]
.hv_main:
    movu m5, [lpfq+r10+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    palignr m3, m5, m4, 2
    paddw m0, m4, m3
    palignr m1, m5, m4, 6
    paddw m0, m1
    punpcklwd m2, m3, m1
    pmaddwd m2, m2
    punpckhwd m3, m1
    pmaddwd m3, m3
    shufpd m5, m4, m5, 0x05
    paddw m0, m5
    punpcklwd m1, m4, m5
    pmaddwd m1, m1
    paddd m2, m1
    punpckhwd m1, m4, m5
    pmaddwd m1, m1
    shufps m4, m5, q2121
    paddw m0, m4 ; h sum
    punpcklwd m5, m4, m6
    pmaddwd m5, m5
    punpckhwd m4, m6
    pmaddwd m4, m4
    paddd m3, m1
    paddd m2, m5 ; h sumsq
    paddd m3, m4
    paddw m1, m0, [t1+r10+400*0]
    paddd m4, m2, [t1+r10+400*2]
    paddd m5, m3, [t1+r10+400*4]
    test hd, hd
    jz .hv_last_row
.hv_main2:
    paddw m1, [t2+r10+400*0] ; hv sum
    paddd m4, [t2+r10+400*2] ; hv sumsq
    paddd m5, [t2+r10+400*4]
    mova [t0+r10+400*0], m0
    mova [t0+r10+400*2], m2
    mova [t0+r10+400*4], m3
    psrlw m3, m1, 1
    paddd m4, m8
    pavgw m3, m6 ; (b + 2) >> 2
    paddd m5, m8
    psrld m4, 4 ; (a + 8) >> 4
    punpcklwd m2, m3, m6
    psrld m5, 4
    punpckhwd m3, m6
    pmulld m4, m9 ; a * 25
    pmulld m5, m9
    pmaddwd m2, m2 ; b * b
    pmaddwd m3, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmaxud m4, m2
    pmaxud m5, m3
    psubd m4, m2 ; p
    psubd m5, m3
    pmulld m4, m10 ; p * s
    pmulld m5, m10
    pmaddwd m0, m11 ; b * 164
    pmaddwd m1, m11
    paddusw m4, m11
    paddusw m5, m11
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m13
    mova [t4+r10+4], m2
    psrld m0, 12 ; b
    psrld m1, 12
    mova [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add r10, 32
    jl .hv_loop
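    ; rotate the boxsum row buffers; t0 and t2 end up pointing at the
    ; same buffer, which appears harmless since [t2] is always read
    ; before [t0] is written at the same offset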
    mov t2, t1
    mov t1, t0
    mov t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10+400*0], m1
    paddw m1, m0
    mova [t1+r10+400*2], m4
    paddd m4, m2
    mova [t1+r10+400*4], m5
    paddd m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea r10, [wq-4]
.v_loop:
    mova m0, [t1+r10+400*0]
    mova m2, [t1+r10+400*2]
    mova m3, [t1+r10+400*4]
    paddw m1, m0, [t2+r10+400*0]
    paddd m4, m2, [t2+r10+400*2]
    paddd m5, m3, [t2+r10+400*4]
    paddw m0, m0
    paddd m2, m2
    paddd m3, m3
    paddw m1, m0 ; hv sum
    paddd m4, m2 ; hv sumsq
    paddd m5, m3
    psrlw m3, m1, 1
    paddd m4, m8
    pavgw m3, m6 ; (b + 2) >> 2
    paddd m5, m8
    psrld m4, 4 ; (a + 8) >> 4
    punpcklwd m2, m3, m6
    psrld m5, 4
    punpckhwd m3, m6
    pmulld m4, m9 ; a * 25
    pmulld m5, m9
    pmaddwd m2, m2 ; b * b
    pmaddwd m3, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmaxud m4, m2
    pmaxud m5, m3
    psubd m4, m2 ; p
    psubd m5, m3
    pmulld m4, m10 ; p * s
    pmulld m5, m10
    pmaddwd m0, m11 ; b * 164
    pmaddwd m1, m11
    paddusw m4, m11
    paddusw m5, m11
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m13
    mova [t4+r10+4], m2
    psrld m0, 12 ; b
    psrld m1, 12
    mova [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add r10, 32
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu m0, [t4+r10*1+ 2]
    movu m1, [t3+r10*2+ 4]
    movu m2, [t3+r10*2+36]
    paddw m3, m0, [t4+r10*1+ 0]
    paddd m4, m1, [t3+r10*2+ 0]
    paddd m5, m2, [t3+r10*2+32]
    paddw m3, [t4+r10*1+ 4]
    paddd m4, [t3+r10*2+ 8]
    paddd m5, [t3+r10*2+40]
    paddw m0, m3
    psllw m3, 2
    paddd m1, m4
    pslld m4, 2
    paddd m2, m5
    pslld m5, 2
    paddw m0, m3 ; a 565
    paddd m1, m4 ; b 565
    paddd m2, m5
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    add r10, 32
    jl .prep_n_loop
    ret
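    ; the three horizontal neighbours of a and b are weighted 5:6:5
    ; ("565"); .n0 (even rows) combines the new and previous row sums,
    ; .n1 (odd rows) reuses the stored sum, compensated by the 9 vs 8
    ; shift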
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu m0, [t4+r10*1+ 2]
    movu m1, [t3+r10*2+ 4]
    movu m2, [t3+r10*2+36]
    paddw m3, m0, [t4+r10*1+ 0]
    paddd m4, m1, [t3+r10*2+ 0]
    paddd m5, m2, [t3+r10*2+32]
    paddw m3, [t4+r10*1+ 4]
    paddd m4, [t3+r10*2+ 8]
    paddd m5, [t3+r10*2+40]
    paddw m0, m3
    psllw m3, 2
    paddd m1, m4
    pslld m4, 2
    paddd m2, m5
    pslld m5, 2
    paddw m0, m3 ; a 565
    paddd m1, m4 ; b 565
    paddd m2, m5
    paddw m3, m0, [t4+r10*1+400*2+ 0]
    paddd m4, m1, [t3+r10*2+400*4+ 0]
    paddd m5, m2, [t3+r10*2+400*4+32]
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    mova m0, [dstq+r10]
    punpcklwd m1, m0, m6 ; src
    punpcklwd m2, m3, m6 ; a
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    vinserti128 m1, m4, xm5, 1
    vperm2i128 m4, m5, 0x31
    psubd m1, m2 ; b - a * src + (1 << 8)
    psubd m4, m3
    psrad m1, 9
    psrad m4, 9
    packssdw m1, m4
    pmulhrsw m1, m7
    paddw m0, m1
    pmaxsw m0, m6
    pminsw m0, m14
    mova [dstq+r10], m0
    add r10, 32
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    mova m0, [dstq+r10]
    mova m3, [t4+r10*1+400*2+ 0]
    mova m4, [t3+r10*2+400*4+ 0]
    mova m5, [t3+r10*2+400*4+32]
    punpcklwd m1, m0, m6 ; src
    punpcklwd m2, m3, m6 ; a
    pmaddwd m2, m1
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    vinserti128 m1, m4, xm5, 1
    vperm2i128 m4, m5, 0x31
    psubd m1, m2 ; b - a * src + (1 << 7)
    psubd m4, m3
    psrad m1, 8
    psrad m4, 8
    packssdw m1, m4
    pmulhrsw m1, m7
    paddw m0, m1
    pmaxsw m0, m6
    pminsw m0, m14
    mova [dstq+r10], m0
    add r10, 32
    jl .n1_loop
    add dstq, strideq
    ret

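; self-guided filter, 3x3 (radius-1) box variant; the a/b planes are
; smoothed with the 343/444 neighbour weighting in .prep_n/.n0/.n1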
cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \
                                          w, h, edge, params
    movifnidn wd, wm
    mov paramsq, r6mp
    lea r13, [sgr_x_by_x_avx2+256*4]
    add wd, wd
    movifnidn hd, hm
    mov edged, r7m
    add lpfq, wq
    vpbroadcastw m7, [paramsq+10] ; w1
    add dstq, wq
    vpbroadcastd m9, [paramsq+ 4] ; s1
    lea t3, [rsp+wq*2+400*12+8]
    vpbroadcastd m8, [pd_8]
    lea t4, [rsp+wq+400*32+8]
    vpbroadcastd m10, [pd_0xf00801c7]
    lea t1, [rsp+wq+12]
    vpbroadcastd m11, [pd_34816]
    neg wq
    mova xm12, [sgr_lshuf3]
    pxor m6, m6
    vpbroadcastd m13, [pw_1023]
    psllw m7, 4
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    add t1, 400*6
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov [rsp], r10 ; below
    call .hv0
.main:
    dec hd
    jz .height1
    add lpfq, strideq
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, strideq
    call .hv0
    test hd, hd
    jz .odd_height
    add lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .hv0_bottom
    add lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    lea r10, [wq-4]
    lea t2, [t1+400*6]
.top_fixup_loop:
    mova m0, [t1+r10+400*0]
    mova m1, [t1+r10+400*2]
    mova m2, [t1+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.extend_right:
    vpbroadcastw m0, [lpfq-2]
    movu m1, [r13+r10+ 2]
    movu m2, [r13+r10+18]
    vpblendvb m4, m0, m1
    vpblendvb m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 12
    jmp .h_main
.h_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, xm12
    vinserti128 m4, [lpfq+wq+12], 1
    jmp .h_main
.h_top:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+r10+ 0]
.h_main:
    movu m5, [lpfq+r10+16]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr m0, m5, m4, 2
    paddw m1, m4, m0
    punpcklwd m2, m4, m0
    pmaddwd m2, m2
    punpckhwd m3, m4, m0
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw m1, m5 ; sum
    punpcklwd m4, m5, m6
    pmaddwd m4, m4
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd m2, m4 ; sumsq
    paddd m3, m5
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 12
    jmp .hv0_main
.hv0_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, xm12
    vinserti128 m4, [lpfq+wq+12], 1
    jmp .hv0_main
.hv0_bottom:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu m4, [lpfq+r10+ 0]
.hv0_main:
    movu m5, [lpfq+r10+16]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp r10d, -34
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    palignr m0, m5, m4, 2
    paddw m1, m4, m0
    punpcklwd m2, m4, m0
    pmaddwd m2, m2
    punpckhwd m3, m4, m0
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw m1, m5 ; sum
    punpcklwd m4, m5, m6
    pmaddwd m4, m4
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd m2, m4 ; sumsq
    paddd m3, m5
    paddw m0, m1, [t1+r10+400*0]
    paddd m4, m2, [t1+r10+400*2]
    paddd m5, m3, [t1+r10+400*4]
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    paddw m1, m0, [t2+r10+400*0]
    paddd m2, m4, [t2+r10+400*2]
    paddd m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd m2, m8
    paddd m3, m8
    psrld m2, 4 ; (a + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m6 ; (b + 2) >> 2
    punpcklwd m2, m3, m6
    pmaddwd m2, m2
    punpckhwd m3, m6
    pmaddwd m3, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmaxud m4, m2
    psubd m4, m2 ; p
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m9 ; p * s
    pmulld m5, m9
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*0+ 4], m2
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add r10, 32
    jl .hv0_loop
    ret
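    ; odd rows (.hv1) add the new row to the two-row sums buffered in t2
    ; to form the 3-row box totals, then swap t1/t2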
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 12
    jmp .hv1_main
.hv1_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, xm12
    vinserti128 m4, [lpfq+wq+12], 1
    jmp .hv1_main
.hv1_bottom:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu m4, [lpfq+r10+ 0]
.hv1_main:
    movu m5, [lpfq+r10+16]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp r10d, -34
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    palignr m1, m5, m4, 2
    paddw m0, m4, m1
    punpcklwd m2, m4, m1
    pmaddwd m2, m2
    punpckhwd m3, m4, m1
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw m0, m5 ; h sum
    punpcklwd m1, m5, m6
    pmaddwd m1, m1
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd m2, m1 ; h sumsq
    paddd m3, m5
    paddw m1, m0, [t2+r10+400*0]
    paddd m4, m2, [t2+r10+400*2]
    paddd m5, m3, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m2
    mova [t2+r10+400*4], m3
    paddd m4, m8
    paddd m5, m8
    psrld m4, 4 ; (a + 8) >> 4
    psrld m5, 4
    pslld m2, m4, 3
    pslld m3, m5, 3
    paddd m4, m2 ; ((a + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m6 ; (b + 2) >> 2
    punpcklwd m2, m3, m6
    pmaddwd m2, m2
    punpckhwd m3, m6
    pmaddwd m3, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmaxud m4, m2
    psubd m4, m2 ; p
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m9 ; p * s
    pmulld m5, m9
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add r10, 32
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
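    ; .v0/.v1 recompute the vertical sums and ab outputs from the
    ; buffered rows when no new input row is available (bottom edge and
    ; odd heights)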
.v0: ; vertical boxsums + ab (even rows)
    lea r10, [wq-4]
.v0_loop:
    mova m0, [t1+r10+400*0]
    mova m4, [t1+r10+400*2]
    mova m5, [t1+r10+400*4]
    paddw m0, m0
    paddd m4, m4
    paddd m5, m5
    paddw m1, m0, [t2+r10+400*0]
    paddd m2, m4, [t2+r10+400*2]
    paddd m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd m2, m8
    paddd m3, m8
    psrld m2, 4 ; (a + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m6 ; (b + 2) >> 2
    punpcklwd m2, m3, m6
    pmaddwd m2, m2
    punpckhwd m3, m6
    pmaddwd m3, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmaxud m4, m2
    psubd m4, m2 ; p
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m9 ; p * s
    pmulld m5, m9
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*0+ 4], m2
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea r10, [wq-4]
.v1_loop:
    mova m0, [t1+r10+400*0]
    mova m4, [t1+r10+400*2]
    mova m5, [t1+r10+400*4]
    paddw m1, m0, [t2+r10+400*0]
    paddd m2, m4, [t2+r10+400*2]
    paddd m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd m2, m8
    paddd m3, m8
    psrld m2, 4 ; (a + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m6 ; (b + 2) >> 2
    punpcklwd m2, m3, m6
    pmaddwd m2, m2
    punpckhwd m3, m6
    pmaddwd m3, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmaxud m4, m2
    psubd m4, m2 ; p
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m9 ; p * s
    pmulld m5, m9
    pmaddwd m0, m10 ; b * 455
    pmaddwd m1, m10
    paddusw m4, m10
    paddusw m5, m10
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add r10, 32
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    mova xm0, [t4+r10*1+400*0+0]
    paddw xm0, [t4+r10*1+400*0+4]
    paddw xm2, xm0, [t4+r10*1+400*0+2]
    mova m1, [t3+r10*2+400*0+0]
    paddd m1, [t3+r10*2+400*0+8]
    paddd m3, m1, [t3+r10*2+400*0+4]
    psllw xm2, 2 ; a[-1] 444
    pslld m3, 2  ; b[-1] 444
    psubw xm2, xm0 ; a[-1] 343
    psubd m3, m1   ; b[-1] 343
    mova [t4+r10*1+400* 4], xm2
    mova [t3+r10*2+400* 8], m3
    mova xm0, [t4+r10*1+400*2+0]
    paddw xm0, [t4+r10*1+400*2+4]
    paddw xm2, xm0, [t4+r10*1+400*2+2]
    mova m1, [t3+r10*2+400*4+0]
    paddd m1, [t3+r10*2+400*4+8]
    paddd m3, m1, [t3+r10*2+400*4+4]
    psllw xm2, 2 ; a[ 0] 444
    pslld m3, 2  ; b[ 0] 444
    mova [t4+r10*1+400* 6], xm2
    mova [t3+r10*2+400*12], m3
    psubw xm2, xm0 ; a[ 0] 343
    psubd m3, m1   ; b[ 0] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    add r10, 16
    jl .prep_n_loop
    ret
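    ; 444 = 4*(3-tap row sum) and 343 = 444 - (3-tap row sum), i.e.
    ; column weights (4,4,4) and (3,4,3); .n0/.n1 combine three
    ; consecutive rows as 343 + 444 + 343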
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    mova m3, [t4+r10*1+400*0+0]
    paddw m3, [t4+r10*1+400*0+4]
    paddw m1, m3, [t4+r10*1+400*0+2]
    psllw m1, 2 ; a[ 1] 444
    psubw m2, m1, m3 ; a[ 1] 343
    paddw m3, m2, [t4+r10*1+400*4]
    paddw m3, [t4+r10*1+400*6]
    mova [t4+r10*1+400*4], m2
    mova [t4+r10*1+400*6], m1
    mova m4, [t3+r10*2+400*0+0]
    paddd m4, [t3+r10*2+400*0+8]
    paddd m1, m4, [t3+r10*2+400*0+4]
    pslld m1, 2 ; b[ 1] 444
    psubd m2, m1, m4 ; b[ 1] 343
    paddd m4, m2, [t3+r10*2+400* 8+ 0]
    paddd m4, [t3+r10*2+400*12+ 0]
    mova [t3+r10*2+400* 8+ 0], m2
    mova [t3+r10*2+400*12+ 0], m1
    mova m5, [t3+r10*2+400*0+32]
    paddd m5, [t3+r10*2+400*0+40]
    paddd m1, m5, [t3+r10*2+400*0+36]
    pslld m1, 2
    psubd m2, m1, m5
    paddd m5, m2, [t3+r10*2+400* 8+32]
    paddd m5, [t3+r10*2+400*12+32]
    mova [t3+r10*2+400* 8+32], m2
    mova [t3+r10*2+400*12+32], m1
    mova m0, [dstq+r10]
    punpcklwd m1, m0, m6
    punpcklwd m2, m3, m6
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    vinserti128 m1, m4, xm5, 1
    vperm2i128 m4, m5, 0x31
    psubd m1, m2 ; b - a * src + (1 << 8)
    psubd m4, m3
    psrad m1, 9
    psrad m4, 9
    packssdw m1, m4
    pmulhrsw m1, m7
    paddw m0, m1
    pmaxsw m0, m6
    pminsw m0, m13
    mova [dstq+r10], m0
    add r10, 32
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    mova m3, [t4+r10*1+400*2+0]
    paddw m3, [t4+r10*1+400*2+4]
    paddw m1, m3, [t4+r10*1+400*2+2]
    psllw m1, 2 ; a[ 1] 444
    psubw m2, m1, m3 ; a[ 1] 343
    paddw m3, m2, [t4+r10*1+400*6]
    paddw m3, [t4+r10*1+400*8]
    mova [t4+r10*1+400*6], m1
    mova [t4+r10*1+400*8], m2
    mova m4, [t3+r10*2+400*4+0]
    paddd m4, [t3+r10*2+400*4+8]
    paddd m1, m4, [t3+r10*2+400*4+4]
    pslld m1, 2 ; b[ 1] 444
    psubd m2, m1, m4 ; b[ 1] 343
    paddd m4, m2, [t3+r10*2+400*12+ 0]
    paddd m4, [t3+r10*2+400*16+ 0]
    mova [t3+r10*2+400*12+ 0], m1
    mova [t3+r10*2+400*16+ 0], m2
    mova m5, [t3+r10*2+400*4+32]
    paddd m5, [t3+r10*2+400*4+40]
    paddd m1, m5, [t3+r10*2+400*4+36]
    pslld m1, 2
    psubd m2, m1, m5
    paddd m5, m2, [t3+r10*2+400*12+32]
    paddd m5, [t3+r10*2+400*16+32]
    mova [t3+r10*2+400*12+32], m1
    mova [t3+r10*2+400*16+32], m2
    mova m0, [dstq+r10]
    punpcklwd m1, m0, m6
    punpcklwd m2, m3, m6
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    vinserti128 m1, m4, xm5, 1
    vperm2i128 m4, m5, 0x31
    psubd m1, m2 ; b - a * src + (1 << 8)
    psubd m4, m3
    psrad m1, 9
    psrad m4, 9
    packssdw m1, m4
    pmulhrsw m1, m7
    paddw m0, m1
    pmaxsw m0, m6
    pminsw m0, m13
    mova [dstq+r10], m0
    add r10, 32
    jl .n1_loop
    add dstq, strideq
    ret

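; combined 5x5 + 3x3 variant: both box sizes are computed in shared
; passes and the two filter outputs are blended with the packed w0/w1
; weights in .n0/.n1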
cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
                                          w, h, edge, params
    movifnidn wd, wm
    mov paramsq, r6mp
    lea r13, [sgr_x_by_x_avx2+256*4]
    add wd, wd
    movifnidn hd, hm
    mov edged, r7m
    add lpfq, wq
    vpbroadcastd m15, [paramsq+8] ; w0 w1
    add dstq, wq
    vpbroadcastd m13, [paramsq+0] ; s0
    lea t3, [rsp+wq*2+400*24+8]
    vpbroadcastd m14, [paramsq+4] ; s1
    lea t4, [rsp+wq+400*52+8]
    vpbroadcastd m9, [pd_8]
    lea t1, [rsp+wq+12]
    vpbroadcastd m10, [pd_34816]
    neg wq
    vpbroadcastd m11, [pd_4096]
    pxor m7, m7
    vpbroadcastd m12, [pd_0xf00801c7]
    psllw m15, 2
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add t1, 400*12
    call .h_top
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov [rsp], r10 ; below
    call .hv0
.main:
    dec hd
    jz .height1
    add lpfq, strideq
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, strideq
    call .hv0
    test hd, hd
    jz .odd_height
    add lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .hv0_bottom
    add lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov [rsp], r10
    call .h
    lea r10, [wq-4]
    lea t2, [t1+400*12]
.top_fixup_loop:
    mova m0, [t1+r10+400* 0]
    mova m1, [t1+r10+400* 2]
    mova m2, [t1+r10+400* 4]
    paddw m0, m0
    mova m3, [t1+r10+400* 6]
    paddd m1, m1
    mova m4, [t1+r10+400* 8]
    paddd m2, m2
    mova m5, [t1+r10+400*10]
    mova [t2+r10+400* 0], m0
    mova [t2+r10+400* 2], m1
    mova [t2+r10+400* 4], m2
    mova [t2+r10+400* 6], m3
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    add r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsum
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, [sgr_lshuf5]
    vinserti128 m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+r10- 2]
.h_main:
    movu m5, [lpfq+r10+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -36
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr m3, m5, m4, 2
    palignr m0, m5, m4, 4
    paddw m1, m3, m0
    punpcklwd m2, m3, m0
    pmaddwd m2, m2
    punpckhwd m3, m0
    pmaddwd m3, m3
    palignr m0, m5, m4, 6
    paddw m1, m0 ; sum3
    punpcklwd m6, m0, m7
    pmaddwd m6, m6
    punpckhwd m0, m7
    pmaddwd m0, m0
    paddd m2, m6 ; sumsq3
    shufpd m6, m4, m5, 0x05
    punpcklwd m5, m6, m4
    paddw m8, m4, m6
    pmaddwd m5, m5
    punpckhwd m6, m4
    pmaddwd m6, m6
    paddd m3, m0
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw m8, m1 ; sum5
    paddd m5, m2 ; sumsq5
    paddd m6, m3
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    add r10, 32
    jl .h_loop
    ret
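    ; the 5-tap row sums above reuse the 3-tap ones (sum5 = sum3 plus
    ; the two outermost pixels, likewise for the sums of squares), so
    ; both box sizes share most of the horizontal work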
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, [sgr_lshuf5]
    vinserti128 m4, [lpfq+wq+10], 1
    jmp .hv0_main
.hv0_bottom:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu m4, [lpfq+r10- 2]
.hv0_main:
    movu m5, [lpfq+r10+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp r10d, -36
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr m3, m5, m4, 2
    palignr m0, m5, m4, 4
    paddw m1, m3, m0
    punpcklwd m2, m3, m0
    pmaddwd m2, m2
    punpckhwd m3, m0
    pmaddwd m3, m3
    palignr m0, m5, m4, 6
    paddw m1, m0 ; h sum3
    punpcklwd m6, m0, m7
    pmaddwd m6, m6
    punpckhwd m0, m7
    pmaddwd m0, m0
    paddd m2, m6 ; h sumsq3
    shufpd m6, m4, m5, 0x05
    punpcklwd m5, m6, m4
    paddw m8, m4, m6
    pmaddwd m5, m5
    punpckhwd m6, m4
    pmaddwd m6, m6
    paddd m3, m0
    paddw m8, m1 ; h sum5
    paddd m5, m2 ; h sumsq5
    paddd m6, m3
    mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
    mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*2+400*0+40], m6
    paddw m8, [t1+r10+400* 0]
    paddd m5, [t1+r10+400* 2]
    paddd m6, [t1+r10+400* 4]
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    paddw m0, m1, [t1+r10+400* 6]
    paddd m4, m2, [t1+r10+400* 8]
    paddd m5, m3, [t1+r10+400*10]
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw m1, m0, [t2+r10+400* 6]
    paddd m2, m4, [t2+r10+400* 8]
    paddd m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m7 ; (b3 + 2) >> 2
    punpcklwd m2, m3, m7
    pmaddwd m2, m2
    punpckhwd m3, m7
    pmaddwd m3, m3
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pmaxud m4, m2
    psubd m4, m2 ; p3
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m14 ; p3 * s1
    pmulld m5, m14
    pmaddwd m0, m12 ; b3 * 455
    pmaddwd m1, m12
    paddusw m4, m12
    paddusw m5, m12
    psrad m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x3
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add r10, 32
    jl .hv0_loop
    ret
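    ; odd rows (.hv1) also produce the ab5 data for the preceding
    ; two-row pair, as the 5x5 filter only generates output every
    ; second row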
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq xm5, [leftq]
    vinserti128 m5, [lpfq+wq], 1
    mova m4, [lpfq+wq]
    add leftq, 8
    palignr m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    mova xm4, [lpfq+wq]
    pshufb xm4, [sgr_lshuf5]
    vinserti128 m4, [lpfq+wq+10], 1
    jmp .hv1_main
.hv1_bottom:
    lea r10, [wq-4]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu m4, [lpfq+r10- 2]
.hv1_main:
    movu m5, [lpfq+r10+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp r10d, -36
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    palignr m6, m5, m4, 2
    palignr m3, m5, m4, 4
    paddw m2, m6, m3
    punpcklwd m0, m6, m3
    pmaddwd m0, m0
    punpckhwd m6, m3
    pmaddwd m6, m6
    palignr m3, m5, m4, 6
    paddw m2, m3 ; h sum3
    punpcklwd m1, m3, m7
    pmaddwd m1, m1
    punpckhwd m3, m7
    pmaddwd m3, m3
    paddd m0, m1 ; h sumsq3
    shufpd m1, m4, m5, 0x05
    punpckhwd m5, m4, m1
    paddw m8, m4, m1
    pmaddwd m5, m5
    punpcklwd m4, m1
    pmaddwd m4, m4
    paddd m6, m3
    paddw m1, m2, [t2+r10+400* 6]
    mova [t2+r10+400* 6], m2
    paddw m8, m2 ; h sum5
    paddd m2, m0, [t2+r10+400* 8]
    paddd m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 8], m0
    mova [t2+r10+400*10], m6
    paddd m4, m0 ; h sumsq5
    paddd m5, m6
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m0, m2, 3
    pslld m6, m3, 3
    paddd m2, m0 ; ((a3 + 8) >> 4) * 9
    paddd m3, m6
    psrlw m6, m1, 1
    pavgw m6, m7 ; (b3 + 2) >> 2
    punpcklwd m0, m6, m7
    pmaddwd m0, m0
    punpckhwd m6, m7
    pmaddwd m6, m6
    pmaxud m2, m0
    psubd m2, m0 ; p3
    pmaxud m3, m6
    psubd m3, m6
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pmulld m2, m14 ; p3 * s1
    pmulld m3, m14
    pmaddwd m0, m12 ; b3 * 455
    pmaddwd m1, m12
    paddusw m2, m12
    paddusw m3, m12
    psrad m7, m2, 20 ; min(z3, 255) - 256
    vpgatherdd m6, [r13+m7*4], m2 ; x3
    psrad m2, m3, 20
    vpgatherdd m7, [r13+m2*4], m3
    pmulld m0, m6
    packssdw m6, m7
    pmulld m7, m1
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m7, m10
    psrld m0, 12
    psrld m7, 12
    paddw m1, m8, [t2+r10+400*0]
    paddd m2, m4, [t2+r10+400*2]
    paddd m3, m5, [t2+r10+400*4]
    paddw m1, [t1+r10+400*0]
    paddd m2, [t1+r10+400*2]
    paddd m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m8
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    mova [t4+r10*1+400*4+ 4], m6
    mova [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova [t3+r10*2+400*8+24], xm7
    vextracti128 [t3+r10*2+400*8+56], m7, 1
    vpbroadcastd m4, [pd_25]
    pxor m7, m7
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a5 + 8) >> 4
    psrld m3, 4
    pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
    pmulld m3, m4
    psrlw m5, m1, 1
    pavgw m5, m7 ; (b5 + 2) >> 2
    punpcklwd m4, m5, m7
    pmaddwd m4, m4
    punpckhwd m5, m7
    pmaddwd m5, m5
    punpcklwd m0, m1, m7 ; b5
    punpckhwd m1, m7
    pmaxud m2, m4
    psubd m2, m4 ; p5
    vpbroadcastd m4, [pd_0xf00800a4]
    pmaxud m3, m5
    psubd m3, m5
    pmulld m2, m13 ; p5 * s0
    pmulld m3, m13
    pmaddwd m0, m4 ; b5 * 164
    pmaddwd m1, m4
    paddusw m2, m4
    paddusw m3, m4
    psrad m5, m2, 20 ; min(z5, 255) - 256
    vpgatherdd m4, [r13+m5*4], m2 ; x5
    psrad m2, m3, 20
    vpgatherdd m5, [r13+m2*4], m3
    pmulld m0, m4
    pmulld m1, m5
    packssdw m4, m5
    paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*0+ 4], m4
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add r10, 32
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea r10, [wq-4]
.v0_loop:
    mova m0, [t1+r10+400* 6]
    mova m4, [t1+r10+400* 8]
    mova m5, [t1+r10+400*10]
    paddw m0, m0
    paddd m4, m4
    paddd m5, m5
    paddw m1, m0, [t2+r10+400* 6]
    paddd m2, m4, [t2+r10+400* 8]
    paddd m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m7 ; (b3 + 2) >> 2
    punpcklwd m2, m3, m7
    pmaddwd m2, m2
    punpckhwd m3, m7
    pmaddwd m3, m3
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pmaxud m4, m2
    psubd m4, m2 ; p3
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m14 ; p3 * s1
    pmulld m5, m14
    pmaddwd m0, m12 ; b3 * 455
    pmaddwd m1, m12
    paddusw m4, m12
    paddusw m5, m12
    psrad m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x3
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova m3, [t1+r10+400*0]
    mova m4, [t1+r10+400*2]
    mova m5, [t1+r10+400*4]
    mova [t3+r10*2+400*8+ 8], m3
    mova [t3+r10*2+400*0+ 8], m4
    mova [t3+r10*2+400*0+40], m5
    paddw m3, m3 ; cc5
    paddd m4, m4
    paddd m5, m5
    mova [t1+r10+400*0], m3
    mova [t1+r10+400*2], m4
    mova [t1+r10+400*4], m5
    mova [t4+r10*1+400*2+ 4], m2
    mova [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add r10, 32
    jl .v0_loop
    ret
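    ; .v1 additionally rebuilds the ab5 outputs from the clean
    ; single-row copies that .hv0/.v0 saved in t3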
.v1: ; vertical boxsums + ab (odd rows)
    lea r10, [wq-4]
.v1_loop:
    mova m4, [t1+r10+400* 6]
    mova m5, [t1+r10+400* 8]
    mova m6, [t1+r10+400*10]
    paddw m1, m4, [t2+r10+400* 6]
    paddd m2, m5, [t2+r10+400* 8]
    paddd m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 6], m4
    mova [t2+r10+400* 8], m5
    mova [t2+r10+400*10], m6
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
    pavgw m3, m7 ; (b3 + 2) >> 2
    punpcklwd m2, m3, m7
    pmaddwd m2, m2
    punpckhwd m3, m7
    pmaddwd m3, m3
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pmaxud m4, m2
    psubd m4, m2 ; p3
    pmaxud m5, m3
    psubd m5, m3
    pmulld m4, m14 ; p3 * s1
    pmulld m5, m14
    pmaddwd m0, m12 ; b3 * 455
    pmaddwd m1, m12
    paddusw m4, m12
    paddusw m5, m12
    psrad m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd m2, [r13+m3*4], m4 ; x3
    psrad m4, m5, 20
    vpgatherdd m3, [r13+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    packssdw m2, m3
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m8, m1, 12
    mova [t4+r10*1+400*4+4], m2
    mova m4, [t3+r10*2+400*8+ 8]
    mova m5, [t3+r10*2+400*0+ 8]
    mova m6, [t3+r10*2+400*0+40]
    paddw m1, m4, [t2+r10+400*0]
    paddd m2, m5, [t2+r10+400*2]
    paddd m3, m6, [t2+r10+400*4]
    paddw m1, [t1+r10+400*0]
    paddd m2, [t1+r10+400*2]
    paddd m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m4
    mova [t2+r10+400*2], m5
    mova [t2+r10+400*4], m6
    vpbroadcastd m4, [pd_25]
    mova [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova [t3+r10*2+400*8+24], xm8
    vextracti128 [t3+r10*2+400*8+56], m8, 1
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a5 + 8) >> 4
    psrld m3, 4
    pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
    pmulld m3, m4
    psrlw m5, m1, 1
    pavgw m5, m7 ; (b5 + 2) >> 2
    punpcklwd m4, m5, m7
    pmaddwd m4, m4
    punpckhwd m5, m7
    pmaddwd m5, m5
    punpcklwd m0, m1, m7 ; b5
    punpckhwd m1, m7
    pmaxud m2, m4
    psubd m2, m4 ; p5
    vpbroadcastd m4, [pd_0xf00800a4]
    pmaxud m3, m5
    psubd m3, m5
    pmulld m2, m13 ; p5 * s0
    pmulld m3, m13
    pmaddwd m0, m4 ; b5 * 164
    pmaddwd m1, m4
    paddusw m2, m4
    paddusw m3, m4
    psrad m5, m2, 20 ; min(z5, 255) - 256
    vpgatherdd m4, [r13+m5*4], m2 ; x5
    psrad m2, m3, 20
    vpgatherdd m5, [r13+m2*4], m3
    pmulld m0, m4
    pmulld m1, m5
    packssdw m4, m5
    paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova [t4+r10*1+400*0+ 4], m4
    mova [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add r10, 32
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu xm0, [t4+r10*1+400*0+2]
    paddw xm2, xm0, [t4+r10*1+400*0+0]
    paddw xm2, [t4+r10*1+400*0+4]
    movu m1, [t3+r10*2+400*0+4]
    paddd m3, m1, [t3+r10*2+400*0+0]
    paddd m3, [t3+r10*2+400*0+8]
    paddw xm0, xm2
    paddd m1, m3
    psllw xm2, 2
    pslld m3, 2
    paddw xm0, xm2 ; a5 565
    paddd m1, m3   ; b5 565
    mova [t4+r10*1+400* 6], xm0
    mova [t3+r10*2+400*12], m1
    mova xm0, [t4+r10*1+400*2+0]
    paddw xm0, [t4+r10*1+400*2+4]
    paddw xm2, xm0, [t4+r10*1+400*2+2]
    mova m1, [t3+r10*2+400*4+0]
    paddd m1, [t3+r10*2+400*4+8]
    paddd m3, m1, [t3+r10*2+400*4+4]
    psllw xm2, 2 ; a3[-1] 444
    pslld m3, 2  ; b3[-1] 444
    psubw xm2, xm0 ; a3[-1] 343
    psubd m3, m1   ; b3[-1] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    mova xm0, [t4+r10*1+400*4+0]
    paddw xm0, [t4+r10*1+400*4+4]
    paddw xm2, xm0, [t4+r10*1+400*4+2]
    mova m1, [t3+r10*2+400*8+0]
    paddd m1, [t3+r10*2+400*8+8]
    paddd m3, m1, [t3+r10*2+400*8+4]
    psllw xm2, 2 ; a3[ 0] 444
    pslld m3, 2  ; b3[ 0] 444
    mova [t4+r10*1+400*10], xm2
    mova [t3+r10*2+400*20], m3
    psubw xm2, xm0 ; a3[ 0] 343
    psubd m3, m1   ; b3[ 0] 343
    mova [t4+r10*1+400*12], xm2
    mova [t3+r10*2+400*24], m3
    add r10, 16
    jl .prep_n_loop
    ret
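    ; .n0/.n1 interleave the 5x5 and 3x3 (b - a*src) terms word-wise
    ; (pblendw with mask 0xaa) so that a single pmaddwd against the
    ; packed w0/w1 pair applies both filter weights at once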
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu xm2, [t4+r10*1+2]
    paddw xm0, xm2, [t4+r10*1+0]
    paddw xm0, [t4+r10*1+4]
    paddw xm2, xm0
    psllw xm0, 2
    paddw xm0, xm2 ; a5
    movu m1, [t3+r10*2+4]
    paddd m4, m1, [t3+r10*2+0]
    paddd m4, [t3+r10*2+8]
    paddd m1, m4
    pslld m4, 2
    paddd m4, m1 ; b5
    paddw xm2, xm0, [t4+r10*1+400* 6]
    mova [t4+r10*1+400* 6], xm0
    paddd m0, m4, [t3+r10*2+400*12]
    mova [t3+r10*2+400*12], m4
    mova xm3, [t4+r10*1+400*2+0]
    paddw xm3, [t4+r10*1+400*2+4]
    paddw xm5, xm3, [t4+r10*1+400*2+2]
    psllw xm5, 2 ; a3[ 1] 444
    psubw xm4, xm5, xm3 ; a3[ 1] 343
    paddw xm3, xm4, [t4+r10*1+400* 8]
    paddw xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400* 8], xm4
    mova [t4+r10*1+400*10], xm5
    mova m1, [t3+r10*2+400*4+0]
    paddd m1, [t3+r10*2+400*4+8]
    paddd m5, m1, [t3+r10*2+400*4+4]
    pslld m5, 2 ; b3[ 1] 444
    psubd m4, m5, m1 ; b3[ 1] 343
    paddd m1, m4, [t3+r10*2+400*16]
    paddd m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*16], m4
    mova [t3+r10*2+400*20], m5
    pmovzxwd m4, [dstq+r10]
    pmovzxwd m2, xm2 ; a5
    pmovzxwd m3, xm3 ; a3
    pmaddwd m2, m4 ; a5 * src
    pmaddwd m3, m4 ; a3 * src
    pslld m4, 13
    psubd m0, m2 ; b5 - a5 * src + (1 << 8)
    psubd m1, m3 ; b3 - a3 * src + (1 << 8)
    psrld m0, 9
    pslld m1, 7
    pblendw m0, m1, 0xaa
    pmaddwd m0, m15
    paddd m4, m11
    paddd m0, m4
    psrad m0, 7
    vextracti128 xm1, m0, 1
    packusdw xm0, xm1 ; clip
    psrlw xm0, 6
    mova [dstq+r10], xm0
    add r10, 16
    jl .n0_loop
    add dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    mova xm3, [t4+r10*1+400*4+0]
    paddw xm3, [t4+r10*1+400*4+4]
    paddw xm5, xm3, [t4+r10*1+400*4+2]
    psllw xm5, 2 ; a3[ 1] 444
    psubw xm4, xm5, xm3 ; a3[ 1] 343
    paddw xm3, xm4, [t4+r10*1+400*12]
    paddw xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400*10], xm5
    mova [t4+r10*1+400*12], xm4
    mova m1, [t3+r10*2+400*8+0]
    paddd m1, [t3+r10*2+400*8+8]
    paddd m5, m1, [t3+r10*2+400*8+4]
    pslld m5, 2 ; b3[ 1] 444
    psubd m4, m5, m1 ; b3[ 1] 343
    paddd m1, m4, [t3+r10*2+400*24]
    paddd m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*20], m5
    mova [t3+r10*2+400*24], m4
    pmovzxwd m4, [dstq+r10]
    pmovzxwd m2, [t4+r10*1+400* 6]
    pmovzxwd m3, xm3
    mova m0, [t3+r10*2+400*12]
    pmaddwd m2, m4 ; a5 * src
    pmaddwd m3, m4 ; a3 * src
    pslld m4, 13
    psubd m0, m2 ; b5 - a5 * src + (1 << 8)
    psubd m1, m3 ; b3 - a3 * src + (1 << 8)
    psrld m0, 8
    pslld m1, 7
    pblendw m0, m1, 0xaa
    pmaddwd m0, m15
    paddd m4, m11
    paddd m0, m4
    psrad m0, 7
    vextracti128 xm1, m0, 1
    packusdw xm0, xm1 ; clip
    psrlw xm0, 6
    mova [dstq+r10], xm0
    add r10, 16
    jl .n1_loop
    add dstq, strideq
    ret

%endif ; ARCH_X86_64