; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_0to31:      db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
               db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
wiener_shufA:  db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
wiener_shufB:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
wiener_shufC:  db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
sgr_r_ext:     times 16 db 1
               times 16 db 9

; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of
; cache but eliminates some shifts in the inner sgr loop which is overall a win
const sgr_x_by_x_avx2
    dd 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
    dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8
    dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5
    dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
    dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3
    dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
    dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1
    dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0

        times 4 db -1 ; needed for 16-bit sgr
pb_m5:  times 4 db -5
pb_3:   times 4 db 3
pw_5_6: dw 5, 6

sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_shuf:   db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
            db 9, -1, 10, -1, 11, -1, 12, -1

pw_256:        times 2 dw 256
pw_2056:       times 2 dw 2056
pw_m16380:     times 2 dw -16380
pd_25:         dd 25
pd_34816:      dd 34816
pd_m4096:      dd -4096
pd_0xf00801c7: dd 0xf00801c7
pd_0xf00800a4: dd 0xf00800a4

SECTION .text
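; REPX applies the instruction template in its first argument to each of the
; remaining arguments, e.g. "REPX {psrad x, 11}, m0, m2, m1, m3" expands to
; four psrad instructions.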

%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro

DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers

INIT_YMM avx2
cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
                                         lpf_stride, w, edge, flt, h
    mov fltq, fltmp
    mov edged, r8m
    mov wd, wm
    mov hd, r6m
    vbroadcasti128 m6, [wiener_shufA]
    vpbroadcastb m11, [fltq+ 0] ; x0 x0
    vbroadcasti128 m7, [wiener_shufB]
    vpbroadcastd m12, [fltq+ 2]
    vbroadcasti128 m8, [wiener_shufC]
    packsswb m12, m12 ; x1 x2
    vpbroadcastw m13, [fltq+ 6] ; x3
    vbroadcasti128 m9, [sgr_shuf+6]
    add lpfq, wq
    vpbroadcastd m10, [pw_m16380]
    lea t1, [rsp+wq*2+16]
    vpbroadcastd m14, [fltq+16] ; y0 y1
    add dstq, wq
    vpbroadcastd m15, [fltq+20] ; y2 y3
    neg wq
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, lpf_strideq
    mov t6, t1
    mov t5, t1
    add t1, 384*2
    call .h_top
    lea r7, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov t4, t1
    add t1, 384*2
    mov [rsp+8*1], lpf_strideq
    add r7, lpf_strideq
    mov [rsp+8*0], r7 ; below
    call .h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
.main:
    lea t0, [t1+384*2]
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov lpfq, [rsp+8*0]
    call .hv_bottom
    add lpfq, [rsp+8*1]
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea r7, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov [rsp+8*1], lpf_strideq
    lea r7, [r7+lpf_strideq*2]
    mov [rsp+8*0], r7
    call .h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v3
    add t0, 384*8
    call .hv
    dec hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.extend_right:
    movd xm2, r10d
    vpbroadcastd m0, [pb_3]
    vpbroadcastd m1, [pb_m5]
    vpbroadcastb m2, xm2
    movu m3, [pb_0to31]
    psubb m0, m2
    psubb m1, m2
    pminub m0, m3
    pminub m1, m3
    pshufb m4, m0
    pshufb m5, m1
    ret
.h:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm4, [leftq]
    vpblendd m4, [lpfq+r10-4], 0xfe
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
    mova m4, [lpfq+r10]           ; before the start of the buffer
    palignr m4, m5, 12
    pshufb m4, [wiener_l_shuf]
    jmp .h_main
.h_top:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+r10-4]
.h_main:
    movu m5, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb m0, m4, m6
    pmaddubsw m0, m11
    pshufb m1, m5, m6
    pmaddubsw m1, m11
    pshufb m2, m4, m7
    pmaddubsw m2, m12
    pshufb m3, m5, m7
    pmaddubsw m3, m12
    paddw m0, m2
    pshufb m2, m4, m8
    pmaddubsw m2, m12
    paddw m1, m3
    pshufb m3, m5, m8
    pmaddubsw m3, m12
    pshufb m4, m9
    paddw m0, m2
    pmullw m2, m4, m13
    pshufb m5, m9
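    ; Note on the block below (inferred from the constants, not from the C
    ; reference): the filter coefficients are stored with 128 subtracted from
    ; the middle tap so they fit in signed bytes; psllw by 7 adds that 128*px
    ; term back, while pw_m16380 and pw_2056 supply the rounding and bias that
    ; keep the >>3 intermediate within the signed 16-bit range.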
    paddw m1, m3
    pmullw m3, m5, m13
    psllw m4, 7
    psllw m5, 7
    paddw m4, m10
    paddw m5, m10
    paddw m0, m2
    vpbroadcastd m2, [pw_2056]
    paddw m1, m3
    paddsw m0, m4
    paddsw m1, m5
    psraw m0, 3
    psraw m1, 3
    paddw m0, m2
    paddw m1, m2
    mova [t1+r10*2+ 0], m0
    mova [t1+r10*2+32], m1
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, dst_strideq
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd xm4, [leftq]
    vpblendd m4, [lpfq+r10-4], 0xfe
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    movu m4, [lpfq+r10-4]
    pshufb m4, [wiener_l_shuf]
    jmp .hv_main
.hv_bottom:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+r10-4]
.hv_main:
    movu m5, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb m0, m4, m6
    pmaddubsw m0, m11
    pshufb m1, m5, m6
    pmaddubsw m1, m11
    pshufb m2, m4, m7
    pmaddubsw m2, m12
    pshufb m3, m5, m7
    pmaddubsw m3, m12
    paddw m0, m2
    pshufb m2, m4, m8
    pmaddubsw m2, m12
    paddw m1, m3
    pshufb m3, m5, m8
    pmaddubsw m3, m12
    pshufb m4, m9
    paddw m0, m2
    pmullw m2, m4, m13
    pshufb m5, m9
    paddw m1, m3
    pmullw m3, m5, m13
    psllw m4, 7
    psllw m5, 7
    paddw m4, m10
    paddw m5, m10
    paddw m0, m2
    paddw m1, m3
    mova m2, [t4+r10*2]
    paddw m2, [t2+r10*2]
    mova m3, [t3+r10*2]
    paddsw m0, m4
    vpbroadcastd m4, [pw_2056]
    paddsw m1, m5
    mova m5, [t5+r10*2]
    paddw m5, [t1+r10*2]
    psraw m0, 3
    psraw m1, 3
    paddw m0, m4
    paddw m1, m4
    paddw m4, m0, [t6+r10*2]
    mova [t0+r10*2], m0
    punpcklwd m0, m2, m3
    pmaddwd m0, m15
    punpckhwd m2, m3
    pmaddwd m2, m15
    punpcklwd m3, m4, m5
    pmaddwd m3, m14
    punpckhwd m4, m5
    pmaddwd m4, m14
    paddd m0, m3
    paddd m4, m2
    mova m2, [t4+r10*2+32]
    paddw m2, [t2+r10*2+32]
    mova m3, [t3+r10*2+32]
    mova m5, [t5+r10*2+32]
    paddw m5, [t1+r10*2+32]
    psrad m0, 11
    psrad m4, 11
    packssdw m0, m4
    paddw m4, m1, [t6+r10*2+32]
    mova [t0+r10*2+32], m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m15
    punpckhwd m2, m3
    pmaddwd m2, m15
    punpcklwd m3, m4, m5
    pmaddwd m3, m14
    punpckhwd m4, m5
    pmaddwd m4, m14
    paddd m1, m3
    paddd m2, m4
    psrad m1, 11
    psrad m2, 11
    packssdw m1, m2
    packuswb m0, m1
    mova [dstq+r10], m0
    add r10, 32
    jl .hv_loop
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
    add dstq, dst_strideq
    ret
.v:
    mov r10, wq
.v_loop:
    mova m2, [t4+r10*2+ 0]
    paddw m2, [t2+r10*2+ 0]
    mova m4, [t3+r10*2+ 0]
    mova m6, [t1+r10*2+ 0]
    paddw m8, m6, [t6+r10*2+ 0]
    paddw m6, [t5+r10*2+ 0]
    mova m3, [t4+r10*2+32]
    paddw m3, [t2+r10*2+32]
    mova m5, [t3+r10*2+32]
    mova m7, [t1+r10*2+32]
    paddw m9, m7, [t6+r10*2+32]
    paddw m7, [t5+r10*2+32]
    punpcklwd m0, m2, m4
    pmaddwd m0, m15
    punpckhwd m2, m4
    pmaddwd m2, m15
    punpcklwd m4, m8, m6
    pmaddwd m4, m14
    punpckhwd m6, m8, m6
    pmaddwd m6, m14
    punpcklwd m1, m3, m5
    pmaddwd m1, m15
    punpckhwd m3, m5
    pmaddwd m3, m15
    punpcklwd m5, m9, m7
    pmaddwd m5, m14
    punpckhwd m7, m9, m7
    pmaddwd m7, m14
    paddd m0, m4
    paddd m2, m6
    paddd m1, m5
    paddd m3, m7
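    ; scale the 32-bit vertical dot products back down before packing to bytes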
    REPX {psrad x, 11}, m0, m2, m1, m3
    packssdw m0, m2
    packssdw m1, m3
    packuswb m0, m1
    mova [dstq+r10], m0
    add r10, 32
    jl .v_loop
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, dst_strideq
    ret

cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
                                        lpf_stride, w, edge, flt, h
    mov fltq, fltmp
    mov edged, r8m
    mov wd, wm
    mov hd, r6m
    vbroadcasti128 m6, [wiener_shufB]
    vpbroadcastd m12, [fltq+ 2]
    vbroadcasti128 m7, [wiener_shufC]
    packsswb m12, m12 ; x1 x2
    vpbroadcastw m13, [fltq+ 6] ; x3
    vbroadcasti128 m8, [sgr_shuf+6]
    add lpfq, wq
    vpbroadcastd m9, [pw_m16380]
    vpbroadcastd m10, [pw_2056]
    lea t1, [rsp+wq*2+16]
    mova m11, [wiener_l_shuf]
    vpbroadcastd m14, [fltq+16] ; __ y1
    add dstq, wq
    vpbroadcastd m15, [fltq+20] ; y2 y3
    neg wq
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, lpf_strideq
    mov t4, t1
    add t1, 384*2
    call .h_top
    lea r7, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov t3, t1
    add t1, 384*2
    mov [rsp+8*1], lpf_strideq
    add r7, lpf_strideq
    mov [rsp+8*0], r7 ; below
    call .h
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
.main:
    mov t0, t4
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov lpfq, [rsp+8*0]
    call .hv_bottom
    add lpfq, [rsp+8*1]
    call .hv_bottom
.end:
    RET
.no_top:
    lea r7, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov [rsp+8*1], lpf_strideq
    lea r7, [r7+lpf_strideq*2]
    mov [rsp+8*0], r7
    call .h
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, dst_strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v2
    add t0, 384*6
    call .hv
    dec hd
    jnz .main
.v2:
    call .v
    mov t4, t3
    mov t3, t2
    mov t2, t1
    add dstq, dst_strideq
.v1:
    call .v
    jmp .end
.h:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd xm4, [leftq]
    vpblendd m4, [lpfq+r10-4], 0xfe
    add leftq, 4
    jmp .h_main
.h_extend_left:
    vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
    mova m4, [lpfq+r10]           ; before the start of the buffer
    palignr m4, m5, 12
    pshufb m4, m11
    jmp .h_main
.h_top:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+r10-4]
.h_main:
    movu m5, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -33
    jl .h_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.h_have_right:
    pshufb m0, m4, m6
    pmaddubsw m0, m12
    pshufb m1, m5, m6
    pmaddubsw m1, m12
    pshufb m2, m4, m7
    pmaddubsw m2, m12
    pshufb m3, m5, m7
    pmaddubsw m3, m12
    pshufb m4, m8
    paddw m0, m2
    pmullw m2, m4, m13
    pshufb m5, m8
    paddw m1, m3
    pmullw m3, m5, m13
    psllw m4, 7
    psllw m5, 7
    paddw m4, m9
    paddw m5, m9
    paddw m0, m2
    paddw m1, m3
    paddsw m0, m4
    paddsw m1, m5
    psraw m0, 3
    psraw m1, 3
    paddw m0, m10
    paddw m1, m10
    mova [t1+r10*2+ 0], m0
    mova [t1+r10*2+32], m1
    add r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, dst_strideq
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
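    ; LR_HAVE_LEFT: splice the 4 pixels stored in left[] in front of the row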
    movd xm4, [leftq]
    vpblendd m4, [lpfq+r10-4], 0xfe
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    movu m4, [lpfq+r10-4]
    pshufb m4, m11
    jmp .hv_main
.hv_bottom:
    mov r10, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+r10-4]
.hv_main:
    movu m5, [lpfq+r10+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -33
    jl .hv_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.hv_have_right:
    pshufb m0, m4, m6
    pmaddubsw m0, m12
    pshufb m1, m5, m6
    pmaddubsw m1, m12
    pshufb m2, m4, m7
    pmaddubsw m2, m12
    pshufb m3, m5, m7
    pmaddubsw m3, m12
    pshufb m4, m8
    paddw m0, m2
    pmullw m2, m4, m13
    pshufb m5, m8
    paddw m1, m3
    pmullw m3, m5, m13
    psllw m4, 7
    psllw m5, 7
    paddw m4, m9
    paddw m5, m9
    paddw m0, m2
    paddw m1, m3
    mova m2, [t3+r10*2]
    paddw m2, [t1+r10*2]
    mova m3, [t2+r10*2]
    paddsw m0, m4
    paddsw m1, m5
    psraw m0, 3
    psraw m1, 3
    paddw m0, m10
    paddw m1, m10
    paddw m4, m0, [t4+r10*2]
    mova [t0+r10*2], m0
    punpcklwd m0, m2, m3
    pmaddwd m0, m15
    punpckhwd m2, m3
    pmaddwd m2, m15
    punpcklwd m3, m4, m4
    pmaddwd m3, m14
    punpckhwd m4, m4
    pmaddwd m4, m14
    paddd m0, m3
    paddd m4, m2
    mova m2, [t3+r10*2+32]
    paddw m2, [t1+r10*2+32]
    mova m3, [t2+r10*2+32]
    psrad m0, 11
    psrad m4, 11
    packssdw m0, m4
    paddw m4, m1, [t4+r10*2+32]
    mova [t0+r10*2+32], m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m15
    punpckhwd m2, m3
    pmaddwd m2, m15
    punpcklwd m3, m4, m4
    pmaddwd m3, m14
    punpckhwd m4, m4
    pmaddwd m4, m14
    paddd m1, m3
    paddd m2, m4
    psrad m1, 11
    psrad m2, 11
    packssdw m1, m2
    packuswb m0, m1
    mova [dstq+r10], m0
    add r10, 32
    jl .hv_loop
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t4
    add dstq, dst_strideq
    ret
.v:
    mov r10, wq
    psrld m13, m14, 16 ; y1 __
.v_loop:
    mova m6, [t1+r10*2+ 0]
    paddw m2, m6, [t3+r10*2+ 0]
    mova m4, [t2+r10*2+ 0]
    mova m7, [t1+r10*2+32]
    paddw m3, m7, [t3+r10*2+32]
    mova m5, [t2+r10*2+32]
    paddw m6, [t4+r10*2+ 0]
    paddw m7, [t4+r10*2+32]
    punpcklwd m0, m2, m4
    pmaddwd m0, m15
    punpckhwd m2, m4
    pmaddwd m2, m15
    punpcklwd m1, m3, m5
    pmaddwd m1, m15
    punpckhwd m3, m5
    pmaddwd m3, m15
    punpcklwd m5, m7, m6
    pmaddwd m4, m5, m14
    punpckhwd m7, m6
    pmaddwd m6, m7, m14
    pmaddwd m5, m13
    pmaddwd m7, m13
    paddd m0, m4
    paddd m2, m6
    paddd m1, m5
    paddd m3, m7
    REPX {psrad x, 11}, m0, m2, m1, m3
    packssdw m0, m2
    packssdw m1, m3
    packuswb m0, m1
    mova [dstq+r10], m0
    add r10, 32
    jl .v_loop
    ret

cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
                                        lpf_stride, w, edge, params, h
%define base r12-sgr_x_by_x_avx2-256*4
    lea r12, [sgr_x_by_x_avx2+256*4]
    mov paramsq, paramsmp
    mov wd, wm
    mov edged, r8m
    mov hd, r6m
    vbroadcasti128 m8, [base+sgr_shuf+0]
    add lpfq, wq
    vbroadcasti128 m9, [base+sgr_shuf+8]
    lea t1, [rsp+wq*2+20]
    vbroadcasti128 m10, [base+sgr_shuf+2]
    add dstq, wq
    vbroadcasti128 m11, [base+sgr_shuf+6]
    lea t3, [rsp+wq*4+16+400*12]
    vpbroadcastd m12, [paramsq+0] ; s0
    neg wq
    vpbroadcastd m13, [base+pd_0xf00800a4]
    pxor m6, m6
    vpbroadcastw m7, [paramsq+8] ; w0
    vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
    psllw m7, 4
    vpbroadcastd m15, [base+pd_m4096]
    lea r10, [lpfq+lpf_strideq*4]
    mov [rsp+8*1], lpf_strideq
    add r10, lpf_strideq
    mov [rsp+8*0], r10 ; below
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, lpf_strideq
    mov t2, t1
    call .top_fixup
    add t1, 400*6
    call .h_top
    lea r10, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov [rsp+8*1], lpf_strideq
    add r10, lpf_strideq
    mov [rsp+8*0], r10 ; below
    mov t0, t2
    dec hd
    jz .height1
    or edged, 16
    call .h
.main:
    add lpfq, dst_strideq
    call .hv
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, dst_strideq
    test hd, hd
    jz .odd_height
    call .h
    add lpfq, dst_strideq
    call .hv
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp+8*0]
    call .h_top
    add lpfq, [rsp+8*1]
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea r10, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov [rsp+8*1], lpf_strideq
    lea r10, [r10+lpf_strideq*2]
    mov [rsp+8*0], r10
    call .h
    lea t2, [t1+400*6]
    call .top_fixup
    dec hd
    jz .no_top_height1
    or edged, 16
    mov t0, t1
    mov t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    movd xm2, r10d
    mova m0, [sgr_r_ext]
    vpbroadcastb m2, xm2
    psubb m0, m2
    pminub m0, [pb_0to31]
    pshufb m5, m0
    ret
.h: ; horizontal boxsum
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd xm0, [leftq]
    mova xm5, [lpfq+wq]
    palignr xm5, xm0, 12
    add leftq, 4
    jmp .h_main
.h_extend_left:
    mova xm5, [lpfq+wq]
    pshufb xm5, [base+sgr_l_shuf]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu xm5, [lpfq+r10-2]
.h_main:
    vinserti128 m5, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb m3, m5, m8
    pmullw m4, m3, m3
    pshufb m2, m5, m9
    paddw m0, m3, m2
    shufps m3, m2, q2121
    paddw m0, m3
    punpcklwd m1, m2, m3
    pmaddwd m1, m1
    punpckhwd m2, m3
    pmaddwd m2, m2
    punpcklwd m3, m4, m6
    paddd m1, m3
    punpckhwd m4, m6
    paddd m2, m4
    pshufb m4, m5, m10
    paddw m0, m4
    pshufb m5, m11
    paddw m0, m5 ; sum
    punpcklwd m3, m4, m5
    pmaddwd m3, m3
    punpckhwd m4, m5
    pmaddwd m4, m4
    test edgeb, 16 ; y > 0
    jz .h_loop_end
    paddw m0, [t1+r10*2+400*0]
    paddd m1, [t1+r10*2+400*2]
    paddd m2, [t1+r10*2+400*4]
.h_loop_end:
    paddd m1, m3 ; sumsq
    paddd m2, m4
    mova [t1+r10*2+400*0], m0
    mova [t1+r10*2+400*2], m1
    mova [t1+r10*2+400*4], m2
    add r10, 16
    jl .h_loop
    ret
.top_fixup:
    lea r10, [wq-2]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova m0, [t1+r10*2+400*0]
    mova m1, [t1+r10*2+400*2]
    mova m2, [t1+r10*2+400*4]
    paddw m0, m0
    paddd m1, m1
    paddd m2, m2
    mova [t2+r10*2+400*0], m0
    mova [t2+r10*2+400*2], m1
    mova [t2+r10*2+400*4], m2
    add r10, 16
    jl .top_fixup_loop
    ret
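; Row buffer layout (t0/t1/t2, 400*6 bytes apart): +400*0 holds the horizontal
; box sums as words, +400*2 and +400*4 the squared sums as dwords (low and
; high halves of each 16-pixel chunk).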
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastd xm0, [leftq]
    mova xm5, [lpfq+wq]
    palignr xm5, xm0, 12
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    mova xm5, [lpfq+wq]
    pshufb xm5, [base+sgr_l_shuf]
    jmp .hv_main
.hv_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu xm5, [lpfq+r10-2]
.hv_main:
    vinserti128 m5, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb m1, m5, m8
    pmullw m4, m1, m1
    pshufb m3, m5, m9
    paddw m0, m1, m3
    shufps m1, m3, q2121
    paddw m0, m1
    punpcklwd m2, m3, m1
    pmaddwd m2, m2
    punpckhwd m3, m1
    pmaddwd m3, m3
    punpcklwd m1, m4, m6
    paddd m2, m1
    punpckhwd m4, m6
    paddd m3, m4
    pshufb m1, m5, m10
    paddw m0, m1
    pshufb m5, m11
    paddw m0, m5 ; h sum
    punpcklwd m4, m5, m1
    pmaddwd m4, m4
    punpckhwd m5, m1
    pmaddwd m5, m5
    paddw m1, m0, [t1+r10*2+400*0]
    paddd m2, m4 ; h sumsq
    paddd m3, m5
    paddd m4, m2, [t1+r10*2+400*2]
    paddd m5, m3, [t1+r10*2+400*4]
    test hd, hd
    jz .hv_last_row
.hv_main2:
    paddw m1, [t2+r10*2+400*0] ; hv sum
    paddd m4, [t2+r10*2+400*2] ; hv sumsq
    paddd m5, [t2+r10*2+400*4]
    mova [t0+r10*2+400*0], m0
    mova [t0+r10*2+400*2], m2
    mova [t0+r10*2+400*4], m3
    vpbroadcastd m2, [pd_25]
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmulld m4, m2 ; a * 25
    pmulld m5, m2
    pmaddwd m2, m0, m0 ; b * b
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p
    psubd m5, m3
    pmulld m4, m12 ; p * s
    pmulld m5, m12
    pmaddwd m0, m13 ; b * 164
    pmaddwd m1, m13
    paddusw m4, m13
    paddusw m5, m13
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r12+m3*4], m4
    psrad m4, m5, 20
    vpgatherdd m3, [r12+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m14
    pand m0, m15
    pand m1, m15
    por m0, m2 ; a | (b << 12)
    por m1, m3
    mova [t3+r10*4+ 8], xm0           ; The neighbor calculations require
    vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b.
    mova [t3+r10*4+24], xm1           ; Packing them allows for 12+20, but
    vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way.
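    ; (see .prep_n below for how the packed values are unpacked again, using an
    ; OR-based mask that yields -a directly)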
    add r10, 16
    jl .hv_loop
    mov t2, t1
    mov t1, t0
    mov t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10*2+400*0], m1
    paddw m1, m0
    mova [t1+r10*2+400*2], m4
    paddd m4, m2
    mova [t1+r10*2+400*4], m5
    paddd m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea r10, [wq-2]
.v_loop:
    mova m0, [t1+r10*2+400*0]
    mova m2, [t1+r10*2+400*2]
    mova m3, [t1+r10*2+400*4]
    paddw m1, m0, [t2+r10*2+400*0]
    paddd m4, m2, [t2+r10*2+400*2]
    paddd m5, m3, [t2+r10*2+400*4]
    paddw m0, m0
    paddd m2, m2
    paddd m3, m3
    paddw m1, m0 ; hv sum
    paddd m4, m2 ; hv sumsq
    paddd m5, m3
    vpbroadcastd m2, [pd_25]
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pmulld m4, m2 ; a * 25
    pmulld m5, m2
    pmaddwd m2, m0, m0 ; b * b
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p
    psubd m5, m3
    pmulld m4, m12 ; p * s
    pmulld m5, m12
    pmaddwd m0, m13 ; b * 164
    pmaddwd m1, m13
    paddusw m4, m13
    paddusw m5, m13
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r12+m3*4], m4
    psrad m4, m5, 20
    vpgatherdd m3, [r12+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m14
    pand m0, m15
    pand m1, m15
    por m0, m2 ; a | (b << 12)
    por m1, m3
    mova [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add r10, 16
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu m0, [t3+r10*4+ 4]
    movu m1, [t3+r10*4+36]
    paddd m2, m0, [t3+r10*4+ 0]
    paddd m3, m1, [t3+r10*4+32]
    paddd m2, [t3+r10*4+ 8]
    paddd m3, [t3+r10*4+40]
    paddd m0, m2
    pslld m2, 2
    paddd m1, m3
    pslld m3, 2
    paddd m2, m0 ; ab 565
    paddd m3, m1
    ; a = 4096 - (ab & 4095) = -(ab | ~4095), so by
    ; using OR instead of AND for the masking we get
    ; the subtraction for free (with a negated result)
    por m0, m15, m2 ; -a
    psrld m2, 12    ; b
    por m1, m15, m3
    psrld m3, 12
    mova [t3+r10*4+400*4+ 0], m0
    mova [t3+r10*4+400*8+ 0], m2
    mova [t3+r10*4+400*4+32], m1
    mova [t3+r10*4+400*8+32], m3
    add r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu m0, [t3+r10*4+ 4]
    movu m1, [t3+r10*4+36]
    paddd m2, m0, [t3+r10*4+ 0]
    paddd m3, m1, [t3+r10*4+32]
    paddd m2, [t3+r10*4+ 8]
    paddd m3, [t3+r10*4+40]
    paddd m0, m2
    pslld m2, 2
    paddd m1, m3
    pslld m3, 2
    paddd m2, m0
    paddd m3, m1
    por m0, m15, m2
    psrld m2, 12
    por m1, m15, m3
    psrld m3, 12
    paddd m4, m0, [t3+r10*4+400*4+ 0] ; -a
    paddd m5, m1, [t3+r10*4+400*4+32]
    mova [t3+r10*4+400*4+ 0], m0
    mova [t3+r10*4+400*4+32], m1
    paddd m0, m2, [t3+r10*4+400*8+ 0] ; b
    paddd m1, m3, [t3+r10*4+400*8+32]
    mova [t3+r10*4+400*8+ 0], m2
    mova [t3+r10*4+400*8+32], m3
    pmovzxbd m2, [dstq+r10+0]
    pmovzxbd m3, [dstq+r10+8]
    pmaddwd m4, m2 ; -a * src
    pmaddwd m5, m3
    packssdw m2, m3
    psubd m0, m4 ; a * src + b + (1 << 8)
    psubd m1, m5
    psrld m0, 9
    psrld m1, 9
    packssdw m0, m1
    psllw m1, m2, 4
    psubw m0, m1
    pmulhrsw m0, m7
    paddw m0, m2
    vextracti128 xm1, m0, 1
    packuswb xm0, xm1
    pshufd xm0, xm0, q3120
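    ; the q3120 shuffle above restores pixel order after the per-lane
    ; packssdw/packuswb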
    mova [dstq+r10], xm0
    add r10, 16
    jl .n0_loop
    add dstq, dst_strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    pmovzxbd m2, [dstq+r10+0]
    pmovzxbd m3, [dstq+r10+8]
    pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; -a * src
    pmaddwd m5, m3, [t3+r10*4+400*4+32]
    mova m0, [t3+r10*4+400*8+ 0] ; b
    mova m1, [t3+r10*4+400*8+32]
    packssdw m2, m3
    psubd m0, m4 ; a * src + b + (1 << 7)
    psubd m1, m5
    psrld m0, 8
    psrld m1, 8
    packssdw m0, m1
    psllw m1, m2, 4
    psubw m0, m1
    pmulhrsw m0, m7
    paddw m0, m2
    vextracti128 xm1, m0, 1
    packuswb xm0, xm1
    pshufd xm0, xm0, q3120
    mova [dstq+r10], xm0
    add r10, 16
    jl .n1_loop
    add dstq, dst_strideq
    ret

cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
                                        lpf_stride, w, edge, params, h
%define base r14-sgr_x_by_x_avx2-256*4
    mov paramsq, paramsmp
    mov edged, r8m
    mov wd, wm
    mov hd, r6m
    lea r14, [sgr_x_by_x_avx2+256*4]
    vbroadcasti128 m8, [base+sgr_shuf+2]
    add lpfq, wq
    vbroadcasti128 m9, [base+sgr_shuf+4]
    lea t1, [rsp+wq*2+20]
    vbroadcasti128 m10, [base+sgr_shuf+6]
    add dstq, wq
    vpbroadcastd m11, [paramsq+ 4] ; s1
    lea t3, [rsp+wq*4+16+400*12]
    vpbroadcastd m12, [base+pd_0xf00801c7]
    neg wq
    vpbroadcastw m7, [paramsq+10] ; w1
    pxor m6, m6
    vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15)
    psllw m7, 4
    vpbroadcastd m14, [base+pd_m4096]
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, lpf_strideq
    mov t2, t1
    add t1, 400*6
    call .h_top
    lea t4, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov [rsp+8*1], lpf_strideq
    add t4, lpf_strideq
    mov [rsp+8*0], t4 ; below
    mov t0, t2
    call .hv
.main:
    mov t5, t3
    add t3, 400*4
    dec hd
    jz .height1
    add lpfq, dst_strideq
    call .hv
    call .prep_n
    dec hd
    jz .extend_bottom
.main_loop:
    add lpfq, dst_strideq
    call .hv
    call .n
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp+8*0]
    call .hv_bottom
    call .n
    add lpfq, [rsp+8*1]
    call .hv_bottom
.end:
    call .n
    RET
.height1:
    call .v
    call .prep_n
    mov t2, t1
    call .v
    jmp .end
.extend_bottom:
    call .v
    call .n
    mov t2, t1
    call .v
    jmp .end
.no_top:
    lea t4, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    mov [rsp+8*1], lpf_strideq
    lea t4, [t4+lpf_strideq*2]
    mov [rsp+8*0], t4
    call .h
    lea t0, [t1+400*6]
    mov t2, t1
    call .v
    jmp .main
.h: ; horizontal boxsum
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd xm0, [leftq]
    mova xm5, [lpfq+wq]
    palignr xm5, xm0, 12
    add leftq, 4
    jmp .h_main
.h_extend_left:
    mova xm5, [lpfq+wq]
    pshufb xm5, [base+sgr_l_shuf]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu xm5, [lpfq+r10-2]
.h_main:
    vinserti128 m5, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -17
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
    pshufb m0, m5, m8
    pmullw m2, m0, m0
    pshufb m4, m5, m9
    paddw m0, m4
    pshufb m5, m10
    paddw m0, m5 ; sum
    punpcklwd m3, m4, m5
    pmaddwd m3, m3
    punpckhwd m4, m5
    pmaddwd m4, m4
    punpcklwd m1, m2, m6
    punpckhwd m2, m6
    mova [t1+r10*2+400*0], m0
    paddd m1, m3 ; sumsq
    paddd m2, m4
    mova [t1+r10*2+400*2], m1
    mova [t1+r10*2+400*4], m2
    add r10, 16
    jl .h_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastd xm0, [leftq]
    mova xm5, [lpfq+wq]
    palignr xm5, xm0, 12
    add leftq, 4
    jmp .hv_main
.hv_extend_left:
    mova xm5, [lpfq+wq]
    pshufb xm5, [base+sgr_l_shuf]
    jmp .hv_main
.hv_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu xm5, [lpfq+r10-2]
.hv_main:
    vinserti128 m5, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp r10d, -17
    jl .hv_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv_have_right:
    pshufb m0, m5, m8
    pmullw m3, m0, m0
    pshufb m1, m5, m9
    paddw m0, m1
    pshufb m5, m10
    paddw m0, m5 ; h sum
    punpcklwd m4, m5, m1
    pmaddwd m4, m4
    punpckhwd m5, m1
    pmaddwd m5, m5
    paddw m1, m0, [t2+r10*2+400*0]
    paddw m1, [t1+r10*2+400*0] ; hv sum
    punpcklwd m2, m3, m6
    punpckhwd m3, m6
    paddd m4, m2 ; h sumsq
    paddd m5, m3
    paddd m2, m4, [t2+r10*2+400*2]
    paddd m3, m5, [t2+r10*2+400*4]
    paddd m2, [t1+r10*2+400*2] ; hv sumsq
    paddd m3, [t1+r10*2+400*4]
    mova [t0+r10*2+400*0], m0
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    mova [t0+r10*2+400*2], m4
    pslld m4, m2, 3
    mova [t0+r10*2+400*4], m5
    pslld m5, m3, 3
    paddd m4, m2 ; a * 9
    pmaddwd m2, m0, m0 ; b * b
    paddd m5, m3
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p
    psubd m5, m3
    pmulld m4, m11 ; p * s
    pmulld m5, m11
    pmaddwd m0, m12 ; b * 455
    pmaddwd m1, m12
    paddusw m4, m12
    paddusw m5, m12
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r14+m3*4], m4
    psrad m4, m5, 20
    vpgatherdd m3, [r14+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m13
    pand m0, m14
    pand m1, m14
    por m0, m2 ; a | (b << 12)
    por m1, m3
    mova [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add r10, 16
    jl .hv_loop
    mov t2, t1
    mov t1, t0
    mov t0, t2
    ret
.v: ; vertical boxsum + ab
    lea r10, [wq-2]
.v_loop:
    mova m1, [t1+r10*2+400*0]
    paddw m1, m1
    paddw m1, [t2+r10*2+400*0] ; hv sum
    mova m2, [t1+r10*2+400*2]
    mova m3, [t1+r10*2+400*4]
    paddd m2, m2
    paddd m3, m3
    paddd m2, [t2+r10*2+400*2] ; hv sumsq
    paddd m3, [t2+r10*2+400*4]
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a * 9
    pmaddwd m2, m0, m0 ; b * b
    paddd m5, m3
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p
    psubd m5, m3
    pmulld m4, m11 ; p * s
    pmulld m5, m11
    pmaddwd m0, m12 ; b * 455
    pmaddwd m1, m12
    paddusw m4, m12
    paddusw m5, m12
    psrad m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd m2, [r14+m3*4], m4
    psrad m4, m5, 20
    vpgatherdd m3, [r14+m4*4], m5
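    ; The paddusw bias above keeps the sign bit of every dword set, so the same
    ; register also serves as an all-lanes-enabled mask for vpgatherdd (which
    ; clears it), and the arithmetic >>20 yields min(z, 255) - 256, matching
    ; the sgr_x_by_x_avx2+256*4 table base.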
    pmulld m0, m2
    pmulld m1, m3
    paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m13
    pand m0, m14
    pand m1, m14
    por m0, m2 ; a | (b << 12)
    por m1, m3
    mova [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add r10, 16
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
    mov t4, t3
    add t3, 400*4
.prep_n_loop:
    mova m2, [t5+r10*4+0]
    mova m3, [t4+r10*4+0]
    paddd m2, [t5+r10*4+8]
    paddd m3, [t4+r10*4+8]
    paddd m0, m2, [t5+r10*4+4]
    paddd m1, m3, [t4+r10*4+4]
    pslld m0, 2
    paddd m1, m1 ; ab[ 0] 222
    psubd m0, m2 ; ab[-1] 343
    mova [t3+r10*4+400*4], m1
    paddd m1, m1
    mova [t5+r10*4], m0
    psubd m1, m3 ; ab[ 0] 343
    mova [t4+r10*4], m1
    add r10, 8
    jl .prep_n_loop
    ret
; a+b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
ALIGN function_align
.n: ; neighbor + output
    mov r10, wq
.n_loop:
    mova m4, [t3+r10*4+ 0]
    paddd m4, [t3+r10*4+ 8]
    paddd m5, m4, [t3+r10*4+ 4]
    paddd m5, m5 ; ab[+1] 222
    mova m2, [t3+r10*4+400*4+ 0]
    paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
    mova m3, [t3+r10*4+400*4+32]
    paddd m1, m3, [t5+r10*4+32]
    mova [t3+r10*4+400*4+ 0], m5
    paddd m5, m5
    psubd m5, m4 ; ab[+1] 343
    mova [t5+r10*4+ 0], m5
    paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343
    mova m4, [t3+r10*4+32]
    paddd m4, [t3+r10*4+40]
    paddd m5, m4, [t3+r10*4+36]
    paddd m5, m5
    mova [t3+r10*4+400*4+32], m5
    paddd m5, m5
    psubd m5, m4
    mova [t5+r10*4+32], m5
    por m4, m14, m0
    psrld m0, 12
    paddd m3, m5
    por m5, m14, m2
    psrld m2, 12
    paddd m4, m5 ; -a
    por m5, m14, m1
    psrld m1, 12
    paddd m0, m2 ; b + (1 << 8)
    por m2, m14, m3
    psrld m3, 12
    paddd m5, m2
    pmovzxbd m2, [dstq+r10+0]
    paddd m1, m3
    pmovzxbd m3, [dstq+r10+8]
    pmaddwd m4, m2 ; -a * src
    pmaddwd m5, m3
    packssdw m2, m3
    psubd m0, m4 ; a * src + b + (1 << 8)
    psubd m1, m5
    psrld m0, 9
    psrld m1, 9
    packssdw m0, m1
    psllw m1, m2, 4
    psubw m0, m1
    pmulhrsw m0, m7
    paddw m0, m2
    vextracti128 xm1, m0, 1
    packuswb xm0, xm1
    pshufd xm0, xm0, q3120
    mova [dstq+r10], xm0
    add r10, 16
    jl .n_loop
    mov r10, t5
    mov t5, t4
    mov t4, r10
    add dstq, dst_strideq
    ret

cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
                                        lpf_stride, w, edge, params, h
%define base r12-sgr_x_by_x_avx2-256*4
    lea r12, [sgr_x_by_x_avx2+256*4]
    mov paramsq, paramsmp
    mov wd, wm
    mov edged, r8m
    mov hd, r6m
    vbroadcasti128 m9, [base+sgr_shuf+0]
    add lpfq, wq
    vbroadcasti128 m10, [base+sgr_shuf+8]
    lea t1, [rsp+wq*2+12]
    vbroadcasti128 m11, [base+sgr_shuf+2]
    add dstq, wq
    vbroadcasti128 m12, [base+sgr_shuf+6]
    lea t3, [rsp+wq*4+400*24+8]
    vpbroadcastd m15, [paramsq+8] ; w0 w1
    neg wq
    vpbroadcastd m13, [paramsq+0] ; s0
    pxor m7, m7
    vpbroadcastd m14, [paramsq+4] ; s1
    psllw m15, 2 ; to reuse existing pd_m4096 register for rounding
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
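    ; The mix filter computes the 5x5 and 3x3 box filters side by side and
    ; blends their outputs with the packed w0/w1 weights in m15; with
    ; LR_HAVE_TOP, the padding rows stored above the frame in the lpf buffer
    ; are box-summed first.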
    call .h_top
    add lpfq, lpf_strideq
    mov t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
    add t1, 400*12
    call .h_top
    lea r10, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    add r10, lpf_strideq
    mov [rsp], r10 ; below
    call .hv0
.main:
    dec hd
    jz .height1
    add lpfq, dst_strideq
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    add lpfq, dst_strideq
    call .hv0
    test hd, hd
    jz .odd_height
    add lpfq, dst_strideq
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, [rsp]
    call .hv0_bottom
    add lpfq, lpf_strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea r10, [lpfq+lpf_strideq*4]
    mov lpfq, dstq
    lea r10, [r10+lpf_strideq*2]
    mov [rsp], r10
    call .h
    lea t2, [t1+400*12]
    lea r10, [wq-2]
.top_fixup_loop:
    mova m0, [t1+r10*2+400* 0]
    mova m1, [t1+r10*2+400* 2]
    mova m2, [t1+r10*2+400* 4]
    paddw m0, m0
    mova m3, [t1+r10*2+400* 6]
    paddd m1, m1
    mova m4, [t1+r10*2+400* 8]
    paddd m2, m2
    mova m5, [t1+r10*2+400*10]
    mova [t2+r10*2+400* 0], m0
    mova [t2+r10*2+400* 2], m1
    mova [t2+r10*2+400* 4], m2
    mova [t2+r10*2+400* 6], m3
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    add r10, 16
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsums
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd xm0, [leftq]
    mova xm5, [lpfq+wq]
    palignr xm5, xm0, 12
    add leftq, 4
    jmp .h_main
.h_extend_left:
    mova xm5, [lpfq+wq]
    pshufb xm5, [base+sgr_l_shuf]
    jmp .h_main
.h_top:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu xm5, [lpfq+r10-2]
.h_main:
    vinserti128 m5, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp r10d, -18
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
    pshufb m6, m5, m9
    pshufb m4, m5, m10
    paddw m8, m6, m4
    shufps m0, m6, m4, q2121
    pmullw m3, m0, m0
    pshufb m2, m5, m11
    paddw m0, m2
    pshufb m5, m12
    paddw m0, m5 ; sum3
    punpcklwd m1, m2, m5
    pmaddwd m1, m1
    punpckhwd m2, m5
    pmaddwd m2, m2
    punpcklwd m5, m6, m4
    pmaddwd m5, m5
    punpckhwd m6, m4
    pmaddwd m6, m6
    punpcklwd m4, m3, m7
    paddd m1, m4 ; sumsq3
    punpckhwd m3, m7
    paddd m2, m3
    mova [t1+r10*2+400* 6], m0
    mova [t1+r10*2+400* 8], m1
    mova [t1+r10*2+400*10], m2
    paddw m8, m0 ; sum5
    paddd m5, m1 ; sumsq5
    paddd m6, m2
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    add r10, 16
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastd xm0, [leftq]
    mova xm5, [lpfq+wq]
    palignr xm5, xm0, 12
    add leftq, 4
    jmp .hv0_main
.hv0_extend_left:
    mova xm5, [lpfq+wq]
    pshufb xm5, [base+sgr_l_shuf]
    jmp .hv0_main
.hv0_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu xm5, [lpfq+r10-2]
.hv0_main:
    vinserti128 m5, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp r10d, -18
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv0_have_right:
    pshufb m6, m5, m9
    pshufb m4, m5, m10
    paddw m8, m6, m4
    shufps m1, m6, m4, q2121
    pmullw m0, m1, m1
    pshufb m3, m5, m11
    paddw m1, m3
    pshufb m5, m12
    paddw m1, m5 ; sum3
    punpcklwd m2, m3, m5
    pmaddwd m2, m2
    punpckhwd m3, m5
    pmaddwd m3, m3
    punpcklwd m5, m6, m4
    pmaddwd m5, m5
    punpckhwd m6, m4
    pmaddwd m6, m6
    punpcklwd m4, m0, m7
    paddd m2, m4 ; sumsq3
    punpckhwd m0, m7
    paddd m3, m0
    paddw m8, m1 ; sum5
    paddd m5, m2 ; sumsq5
    paddd m6, m3
    mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
    mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*4+400*0+40], m6
    paddw m8, [t1+r10*2+400* 0]
    paddd m5, [t1+r10*2+400* 2]
    paddd m6, [t1+r10*2+400* 4]
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    paddw m0, m1, [t1+r10*2+400* 6]
    paddd m4, m2, [t1+r10*2+400* 8]
    paddd m5, m3, [t1+r10*2+400*10]
    mova [t1+r10*2+400* 6], m1
    mova [t1+r10*2+400* 8], m2
    mova [t1+r10*2+400*10], m3
    paddw m1, m0, [t2+r10*2+400* 6]
    paddd m2, m4, [t2+r10*2+400* 8]
    paddd m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a3 * 9
    pmaddwd m2, m0, m0 ; b3 * b
    paddd m5, m3
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p3
    vpbroadcastd m2, [base+pd_0xf00801c7]
    psubd m5, m3
    pmulld m4, m14 ; p3 * s1
    pmulld m5, m14
    pmaddwd m0, m2 ; b3 * 455
    pmaddwd m1, m2
    paddusw m4, m2
    paddusw m5, m2
    psrad m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd m2, [r12+m3*4], m4
    psrad m4, m5, 20
    vpgatherdd m3, [r12+m4*4], m5
    vpbroadcastd m4, [base+pd_34816]
    pmulld m0, m2
    vpbroadcastd m5, [base+pd_m4096]
    pmulld m1, m3
    paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m4
    pand m0, m5
    pand m1, m5
    por m0, m2 ; a3 | (b3 << 12)
    por m1, m3
    mova [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add r10, 16
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastd xm0, [leftq]
    mova xm5, [lpfq+wq]
    palignr xm5, xm0, 12
    add leftq, 4
    jmp .hv1_main
.hv1_extend_left:
    mova xm5, [lpfq+wq]
    pshufb xm5, [base+sgr_l_shuf]
    jmp .hv1_main
.hv1_bottom:
    lea r10, [wq-2]
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu xm5, [lpfq+r10-2]
.hv1_main:
    vinserti128 m5, [lpfq+r10+6], 1
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp r10d, -18
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv1_have_right:
    pshufb m6, m5, m9
    pshufb m3, m5, m10
    paddw m8, m6, m3
    shufps m2, m6, m3, q2121
    pmullw m1, m2, m2
    pshufb m0, m5, m11
    paddw m2, m0
    pshufb m5, m12
    paddw m2, m5 ; sum3
    punpcklwd m4, m5, m0
    pmaddwd m4, m4
    punpckhwd m5, m0
    pmaddwd m5, m5
    punpcklwd m0, m6, m3
    pmaddwd m0, m0
    punpckhwd m6, m3
    pmaddwd m6, m6
    punpcklwd m3, m1, m7
    paddd m4, m3 ; sumsq3
    punpckhwd m1, m7
    paddd m5, m1
    paddw m1, m2, [t2+r10*2+400* 6]
    mova [t2+r10*2+400* 6], m2
    paddw m8, m2 ; sum5
    paddd m2, m4, [t2+r10*2+400* 8]
    paddd m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    paddd m4, m0 ; sumsq5
    paddd m5, m6
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pslld m6, m2, 3
    pslld m7, m3, 3
    paddd m6, m2 ; a3 * 9
    pmaddwd m2, m0, m0 ; b3 * b3
    paddd m7, m3
    pmaddwd m3, m1, m1
    psubd m6, m2 ; p3
    vpbroadcastd m2, [base+pd_0xf00801c7]
    psubd m7, m3
    pmulld m6, m14 ; p3 * s1
    pmulld m7, m14
    pmaddwd m0, m2 ; b3 * 455
    pmaddwd m1, m2
    paddusw m6, m2
    paddusw m7, m2
    psrad m3, m6, 20 ; min(z3, 255) - 256
    vpgatherdd m2, [r12+m3*4], m6
    psrad m6, m7, 20
    vpgatherdd m3, [r12+m6*4], m7
    vpbroadcastd m6, [base+pd_34816]
    pmulld m0, m2
    vpbroadcastd m7, [base+pd_m4096]
    pmulld m1, m3
    paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m6
    pand m0, m7
    pand m7, m1
    por m0, m2 ; a3 | (b3 << 12)
    por m7, m3
    paddw m1, m8, [t2+r10*2+400*0]
    paddd m2, m4, [t2+r10*2+400*2]
    paddd m3, m5, [t2+r10*2+400*4]
    paddw m1, [t1+r10*2+400*0]
    paddd m2, [t1+r10*2+400*2]
    paddd m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m8
    mova [t2+r10*2+400*2], m4
    mova [t2+r10*2+400*4], m5
    mova [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova [t3+r10*4+400*8+24], xm7
    vextracti128 [t3+r10*4+400*8+56], m7, 1
    vpbroadcastd m4, [base+pd_25]
    pxor m7, m7
    punpcklwd m0, m1, m7 ; b5
    punpckhwd m1, m7
    pmulld m2, m4 ; a5 * 25
    pmulld m3, m4
    pmaddwd m4, m0, m0 ; b5 * b5
    pmaddwd m5, m1, m1
    psubd m2, m4 ; p5
    vpbroadcastd m4, [base+pd_0xf00800a4]
    psubd m3, m5
    pmulld m2, m13 ; p5 * s0
    pmulld m3, m13
    pmaddwd m0, m4 ; b5 * 164
    pmaddwd m1, m4
    paddusw m2, m4
    paddusw m3, m4
    psrad m5, m2, 20 ; min(z5, 255) - 256
    vpgatherdd m4, [r12+m5*4], m2
    psrad m2, m3, 20
    vpgatherdd m5, [r12+m2*4], m3
    pmulld m0, m4
    pmulld m1, m5
    paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m1, m6
    vpbroadcastd m6, [base+pd_m4096]
    pand m0, m6
    pand m1, m6
    por m0, m4 ; a5 | (b5 << 12)
    por m1, m5
    mova [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add r10, 16
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
    lea r10, [wq-2]
    vpbroadcastd m6, [base+pd_34816]
    vpbroadcastd m8, [base+pd_m4096]
.v0_loop:
    mova m0, [t1+r10*2+400* 6]
    mova m4, [t1+r10*2+400* 8]
    mova m5, [t1+r10*2+400*10]
    paddw m0, m0
    paddd m4, m4
    paddd m5, m5
    paddw m1, m0, [t2+r10*2+400* 6]
    paddd m2, m4, [t2+r10*2+400* 8]
    paddd m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a3 * 9
    pmaddwd m2, m0, m0 ; b3 * b3
    paddd m5, m3
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p3
    vpbroadcastd m2, [base+pd_0xf00801c7]
    psubd m5, m3
    pmulld m4, m14 ; p3 * s1
    pmulld m5, m14
    pmaddwd m0, m2 ; b3 * 455
    pmaddwd m1, m2
    paddusw m4, m2
    paddusw m5, m2
    psrad m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd m2, [r12+m3*4], m4
    psrad m4, m5, 20
    vpgatherdd m3, [r12+m4*4], m5
    pmulld m0, m2
    pmulld m1, m3
    paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m6
    pand m0, m8
    pand m1, m8
    por m0, m2 ; a3 | (b3 << 12)
    por m1, m3
    mova m2, [t1+r10*2+400*0]
    mova m3, [t1+r10*2+400*2]
    mova m4, [t1+r10*2+400*4]
    mova [t3+r10*4+400*8+ 8], m2
    mova [t3+r10*4+400*0+ 8], m3
    mova [t3+r10*4+400*0+40], m4
    paddw m2, m2 ; cc5
    paddd m3, m3
    paddd m4, m4
    mova [t1+r10*2+400*0], m2
    mova [t1+r10*2+400*2], m3
    mova [t1+r10*2+400*4], m4
    mova [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add r10, 16
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea r10, [wq-2]
.v1_loop:
    mova m4, [t1+r10*2+400* 6]
    mova m5, [t1+r10*2+400* 8]
    mova m6, [t1+r10*2+400*10]
    paddw m1, m4, [t2+r10*2+400* 6]
    paddd m2, m5, [t2+r10*2+400* 8]
    paddd m3, m6, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m4
    mova [t2+r10*2+400* 8], m5
    mova [t2+r10*2+400*10], m6
    punpcklwd m0, m1, m7 ; b3
    punpckhwd m1, m7
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a3 * 9
    pmaddwd m2, m0, m0 ; b3 * b3
    paddd m5, m3
    pmaddwd m3, m1, m1
    psubd m4, m2 ; p3
    vpbroadcastd m2, [base+pd_0xf00801c7]
    psubd m5, m3
    pmulld m4, m14 ; p3 * s1
    pmulld m5, m14
    pmaddwd m0, m2 ; b3 * 455
    pmaddwd m1, m2
    paddusw m4, m2
    paddusw m5, m2
    psrad m3, m4, 20 ; min(z3, 255) - 256
    vpgatherdd m2, [r12+m3*4], m4
    psrad m4, m5, 20
    vpgatherdd m3, [r12+m4*4], m5
    vpbroadcastd m4, [base+pd_34816]
    pmulld m0, m2
    vpbroadcastd m8, [base+pd_m4096]
    pmulld m1, m3
    paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m4
    pand m0, m8
    pand m8, m1
    por m0, m2 ; a3 | (b3 << 12)
    por m8, m3
    mova m4, [t3+r10*4+400*8+ 8]
    mova m5, [t3+r10*4+400*0+ 8]
    mova m6, [t3+r10*4+400*0+40]
    paddw m1, m4, [t2+r10*2+400*0]
    paddd m2, m5, [t2+r10*2+400*2]
    paddd m3, m6, [t2+r10*2+400*4]
    paddw m1, [t1+r10*2+400*0]
    paddd m2, [t1+r10*2+400*2]
    paddd m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m4
    mova [t2+r10*2+400*2], m5
    mova [t2+r10*2+400*4], m6
    vpbroadcastd m4, [base+pd_25]
    mova [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova [t3+r10*4+400*8+24], xm8
    vextracti128 [t3+r10*4+400*8+56], m8, 1
    punpcklwd m0, m1, m7 ; b5
    punpckhwd m1, m7
    pmulld m2, m4 ; a5 * 25
    pmulld m3, m4
    pmaddwd m4, m0, m0 ; b5 * b5
    pmaddwd m5, m1, m1
    psubd m2, m4 ; p5
    vpbroadcastd m4, [base+pd_0xf00800a4]
    psubd m3, m5
    pmulld m2, m13 ; p5 * s0
    pmulld m3, m13
    pmaddwd m0, m4 ; b5 * 164
    pmaddwd m1, m4
    paddusw m2, m4
    paddusw m3, m4
    psrad m5, m2, 20 ; min(z5, 255) - 256
    vpgatherdd m4, [r12+m5*4], m2
    psrad m2, m3, 20
    vpgatherdd m5, [r12+m2*4], m3
    pmulld m0, m4
    vpbroadcastd m6, [base+pd_34816]
    pmulld m1, m5
    paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m1, m6
    vpbroadcastd m6, [base+pd_m4096]
    pand m0, m6
    pand m1, m6
    por m0, m4 ; a5 | (b5 << 12)
    por m1, m5
    mova [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add r10, 16
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov r10, wq
.prep_n_loop:
    movu m0, [t3+r10*4+400*0+4]
    paddd m1, m0, [t3+r10*4+400*0+0]
    mova m4, [t3+r10*4+400*4+0]
    paddd m1, [t3+r10*4+400*0+8]
    mova m5, [t3+r10*4+400*8+0]
    paddd m4, [t3+r10*4+400*4+8]
    paddd m5, [t3+r10*4+400*8+8]
    paddd m2, m4, [t3+r10*4+400*4+4]
    paddd m3, m5, [t3+r10*4+400*8+4]
    paddd m0, m1
    pslld m1, 2
    pslld m2, 2
    paddd m1, m0 ; ab5 565
    paddd m3, m3 ; ab3[ 0] 222
    psubd m2, m4 ; ab3[-1] 343
    mova [t3+r10*4+400*20], m3
    por m0, m6, m1 ; a5 565
    mova [t3+r10*4+400*24], m2
    psrld m1, 12   ; b5 565
    mova [t3+r10*4+400*12], m0
    paddd m3, m3
    mova [t3+r10*4+400*16], m1
    psubd m3, m5 ; ab3[ 0] 343
    mova [t3+r10*4+400*28], m3
    add r10, 8
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov r10, wq
.n0_loop:
    movu m0, [t3+r10*4+4]
    paddd m4, m0, [t3+r10*4+0]
    paddd m4, [t3+r10*4+8]
    paddd m0, m4
    pslld m4, 2
    paddd m4, m0
    por m0, m6, m4
    psrld m4, 12
    paddd m2, m0, [t3+r10*4+400*12] ; -a5
    mova [t3+r10*4+400*12], m0
    paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
    mova [t3+r10*4+400*16], m4
    mova m3, [t3+r10*4+400*4+0]
    paddd m3, [t3+r10*4+400*4+8]
    paddd m5, m3, [t3+r10*4+400*4+4]
    paddd m5, m5 ; ab3[ 1] 222
    mova m4, [t3+r10*4+400*20]
    paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd m5, m5
    psubd m5, m3 ; ab3[ 1] 343
    mova [t3+r10*4+400*24], m5
    paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
    por m3, m6, m1
    psrld m1, 12
    por m5, m6, m4
    psrld m4, 12
    paddd m3, m5 ; -a3
    paddd m1, m4 ; b3 + (1 << 8)
    pmovzxbd m4, [dstq+r10]
    pmaddwd m2, m4 ; -a5 * src
    pmaddwd m3, m4 ; -a3 * src
    pslld m4, 13
    psubd m0, m4
    psubd m1, m4
    psubd m0, m2 ; a5 * src + b5 + (1 << 8)
    psubd m1, m3 ; a3 * src + b3 + (1 << 8)
    psrld m0, 9
    pslld m1, 7
    pblendw m0, m1, 0xaa
    pmaddwd m0, m15
    psubd m4, m6
    paddd m0, m4
    psrad m0, 13
    vextracti128 xm1, m0, 1
    packssdw xm0, xm1
    packuswb xm0, xm0
    movq [dstq+r10], xm0
    add r10, 8
    jl .n0_loop
    add dstq, dst_strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov r10, wq
.n1_loop:
    mova m3, [t3+r10*4+400*8+0]
    paddd m3, [t3+r10*4+400*8+8]
    paddd m5, m3, [t3+r10*4+400*8+4]
    paddd m5, m5 ; ab3[ 1] 222
    mova m4, [t3+r10*4+400*20]
    paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd m5, m5
    psubd m5, m3 ; ab3[ 1] 343
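    ; Rolling scheme for the 3x3 part: each row keeps a 3,4,3-weighted ("343")
    ; and a 2,2,2-weighted ("222") horizontal sum of ab3, and consecutive rows
    ; are combined so only one fresh ab3 row is needed per output row.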
    mova [t3+r10*4+400*28], m5
    paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
    por m3, m6, m1
    psrld m1, 12
    por m5, m6, m4
    psrld m4, 12
    paddd m3, m5 ; -a3
    paddd m1, m4 ; b3 + (1 << 8)
    pmovzxbd m4, [dstq+r10]
    pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src
    mova m0, [t3+r10*4+400*16]        ; b5 + (1 << 7)
    pmaddwd m3, m4 ; -a3 * src
    pslld m4, 12
    psubd m0, m4
    paddd m4, m4
    psubd m1, m4
    psubd m0, m2 ; a5 * src + b5 + (1 << 7)
    psubd m1, m3 ; a3 * src + b3 + (1 << 8)
    psrld m0, 8
    pslld m1, 7
    pblendw m0, m1, 0xaa
    pmaddwd m0, m15
    psubd m4, m6
    paddd m0, m4
    psrad m0, 13
    vextracti128 xm1, m0, 1
    packssdw xm0, xm1
    packuswb xm0, xm0
    movq [dstq+r10], xm0
    add r10, 8
    jl .n1_loop
    add dstq, dst_strideq
    ret
%endif ; ARCH_X86_64