; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Syntax: NASM, using the x86inc.asm macro layer (cglobal / DEFINE_ARGS / RET).
; Target: x86-64 only (everything is guarded by ARCH_X86_64), AVX2 (INIT_YMM).
; Argument/register naming (e.g. dstq, wd, hd) is generated by x86inc from the
; names given on each cglobal / DEFINE_ARGS line.

%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; 32 bytes of 0xff followed by 32 bytes of 0. Loading 32 bytes starting at
; (pb_right_ext_mask + 32 - n) yields a mask whose first n bytes are set.
pb_right_ext_mask: times 32 db 0xff
                   times 32 db 0
; pshufb control: bytes 0..13 take source byte 0, bytes 14/15 take bytes 1/2
pb_14x0_1_2:       times 14 db 0
                   db 1, 2
; two pshufb rows, selected by w (1 or 2) in wiener_filter_h for tiny widths
pb_0_to_15_min_n:  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
                   db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
; pshufb control replicating byte 15 across the register
pb_15:             times 16 db 15
pw_16:             times 2 dw 16
pw_256:            times 2 dw 256
pw_2048:           times 2 dw 2048
pw_16380:          times 2 dw 16380
pw_0_128:          dw 0, 128
pw_5_6:            dw 5, 6
pd_6:              dd 6              ; not referenced by the code below
pd_1024:           dd 1024
; high word 0xf008: saturating bias for the table index in sgr_calc_ab*;
; low word (0x0029 = 41, 0x01c7 = 455): multiplier used by the same functions
pd_0xf0080029:     dd 0xf0080029
pd_0xf00801c7:     dd 0xf00801c7

cextern sgr_x_by_x

SECTION .text

;------------------------------------------------------------------------------
; wiener_filter_h(dst, left, src, stride, fh, w, h, edge)
;
; Horizontal symmetric 7-tap pass. Reads 8-bit pixels from src (row pitch
; `stride`) and stores 16-bit results to dst (row pitch 384*2 bytes), 32 output
; values per inner iteration.
;   edge & 1 : have_left    edge & 2 : have_right
;   left     : 4 bytes per row of left-neighbour pixels, or NULL, in which
;              case the pixels are read from src-3 directly
; Coefficients: single bytes at fh+0, fh+2, fh+4 (outer three tap pairs) and
; the 16-bit word at fh+6 (centre tap). Presumably fh is an int16_t array whose
; outer taps fit in a byte -- confirm against the caller.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
    vpbroadcastb  m15, [fhq+0]
    vpbroadcastb  m14, [fhq+2]
    vpbroadcastb  m13, [fhq+4]
    vpbroadcastw  m12, [fhq+6]
    vpbroadcastd  m11, [pw_2048]
    vpbroadcastd  m10, [pw_16380]
    lea           r11, [pb_right_ext_mask]

    ; fh is no longer needed; its register is reused as the x counter
    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim

    ; have_right: round w up to a multiple of 32 and stop the x loop at 0.
    ; otherwise:  x starts at w-3 and the x loop stops once x <= -3.
    test          edged, 2                  ; have_right
    jnz .align
    mov           xlimq, -3
    jmp .loop
.align:
    add           wd, 31
    and           wd, ~31
    xor           xlimd, xlimd

    ; per-row (y) loop of the horizontal filter
.loop:
    mov           srcptrq, srcq
    mov           dstptrq, dstq
    lea           xq, [wq+xlimq]

    ; build the left context in the top bytes of xm0
    test          edged, 1                  ; have_left
    jz .emu_left
    test          leftq, leftq              ; left == NULL for the edge-extended bottom/top
    jz .load_left_combined
    movd          xm0, [leftq]
    add           leftq, 4
    pinsrd        xm0, [srcq], 1
    pslldq        xm0, 9
    jmp .left_load_done
.load_left_combined:
    movq          xm0, [srcq-3]
    pslldq        xm0, 10
    jmp .left_load_done
.emu_left:
    ; no left neighbour: replicate src[0], followed by src[1] and src[2]
    movd          xm0, [srcq]
    pshufb        xm0, [pb_14x0_1_2]

    ; decide how the first 32 pixels to the right are obtained
.left_load_done:
    cmp           xd, 32
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    je .splat_right

    ; x < 0: very small images (w=[1-2]); edge-extend the bytes already in
    ; xm0. ugly, but only runs in very odd cases
    add           wd, wd
    pshufb        xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
    shr           wd, 1

    ; x loop entry points; the common case enters at .main_load
.splat_right:
    ; nothing new to load: replicate the last byte of the (possibly already
    ; extended) previous load held in xm0
    pshufb        xm1, xm0, [pb_15]
    jmp .main_loop
.load_and_splat:
    ; load new pixels, keep the first x of them and fill the remainder with
    ; the last valid pixel (src[x+2])
    movu          m1, [srcptrq+3]
    sub           r11, xq
    movu          m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
    add           r11, xq
    vpbroadcastb  m3, [srcptrq+2+xq]
    pand          m1, m2
    pandn         m3, m2, m3
    por           m1, m3
    jmp .main_loop
.main_load:
    ; plain load of the next 32 pixels
    movu          m1, [srcptrq+3]
.main_loop:
    ; m0 = [ previous 16 bytes | first 16 new bytes ], m1 = 32 new bytes
    vinserti128   m0, xm1, 1

    ; m2..m7 = the six earlier tap positions, m1 = the last one
    palignr       m2, m1, m0, 10
    palignr       m3, m1, m0, 11
    palignr       m4, m1, m0, 12
    palignr       m5, m1, m0, 13
    palignr       m6, m1, m0, 14
    palignr       m7, m1, m0, 15

    ; interleave the symmetric tap pairs; zero-extend the centre tap (m5)
    punpcklbw     m0, m2, m1
    punpckhbw     m2, m1
    punpcklbw     m8, m3, m7
    punpckhbw     m3, m7
    punpcklbw     m7, m4, m6
    punpckhbw     m4, m6
    pxor          m9, m9
    punpcklbw     m6, m5, m9
    punpckhbw     m5, m9

    pmaddubsw     m0, m15
    pmaddubsw     m2, m15
    pmaddubsw     m8, m14
    pmaddubsw     m3, m14
    pmaddubsw     m7, m13
    pmaddubsw     m4, m13
    paddw         m0, m8
    paddw         m2, m3
    ; centre pixel: (px << 7) - 16380 is added with signed saturation below,
    ; px * fh[3] with a plain add
    psllw         m8, m6, 7
    psllw         m3, m5, 7
    psubw         m8, m10
    psubw         m3, m10
    pmullw        m6, m12
    pmullw        m5, m12
    paddw         m0, m7
    paddw         m2, m4
    paddw         m0, m6
    paddw         m2, m5
    paddsw        m0, m8
    paddsw        m2, m3
    psraw         m0, 3
    psraw         m2, 3
    paddw         m0, m11
    paddw         m2, m11
    ; the unpacks worked per 128-bit lane, so store the four quarters in
    ; pixel order
    mova          [dstptrq], xm0
    mova          [dstptrq+16], xm2
    vextracti128  [dstptrq+32], m0, 1
    vextracti128  [dstptrq+48], m2, 1
    ; the last 16 loaded bytes become the next iteration's left context
    vextracti128  xm0, m1, 1
    add           srcptrq, 32
    add           dstptrq, 64
    sub           xq, 32
    cmp           xd, 32
    jg .main_load
    test          xd, xd
    jg .load_and_splat
    cmp           xd, xlimd
    jg .splat_right

    add           srcq, strideq
    add           dstq, 384*2
    dec           hd
    jg .loop
    RET

;------------------------------------------------------------------------------
; wiener_filter_v(dst, stride, mid, w, h, fv, edge)
;
; Vertical symmetric 7-tap pass over the 16-bit intermediate `mid` (row pitch
; 384*2 bytes), producing 8-bit pixels in dst (row pitch `stride`). Works on
; one 16-pixel-wide column at a time.
;   edge & 4 : have_top     edge & 8 : have_bottom
; m15 = (fv[0], fv[1]) word pairs, m14 = (fv[2], fv[3] + 128) word pairs.
; Rounding: +1024, arithmetic shift right by 11, then signed/unsigned packing.
;------------------------------------------------------------------------------
cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
    vpbroadcastd  m14, [fvq+4]
    vpbroadcastd  m15, [fvq]
    vpbroadcastd  m13, [pw_0_128]
    paddw         m14, m13
    vpbroadcastd  m12, [pd_1024]

    ; fv is consumed; its register now holds ylim
    DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
    mov           ylimd, edged
    and           ylimd, 8                  ; have_bottom
    shr           ylimd, 2
    sub           ylimd, 3                  ; -3 if have_bottom=0, else -1

    ; x loop: one column of 16 pixels per iteration
.loop_x:
    mova          m3, [midq]                ; middle line

    ; rows above the current one
    test          edged, 4                  ; have_top
    jz .emu_top
    mova          m0, [midq-384*4]
    mova          m2, [midq-384*2]
    mova          m1, m0
    jmp .load_bottom_pixels
.emu_top:
    mova          m0, m3
    mova          m1, m3
    mova          m2, m3

    ; rows below the current one
.load_bottom_pixels:
    mov           yd, hd
    mov           mptrq, midq
    mov           dstptrq, dstq
    add           yd, ylimd
    jg .load_threelines

    ; the remainder here is somewhat messy but only runs in very weird
    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
    ; so performance is not terribly important here...
    ; (the flags tested by `je` below are still those of `add yd, ylimd`)
    je .load_twolines
    cmp           yd, -1
    je .load_oneline
    ; y == -2 (h == 1 without have_bottom): no rows below
    mova          m5, m3
    mova          m4, m3
    mova          m6, m3
    jmp .loop
.load_oneline:
    ; y == -1 (h == 2 without have_bottom): one row below
    mova          m4, [midq+384*2]
    mova          m5, m4
    mova          m6, m4
    jmp .loop
.load_twolines:
    ; y == 0 (h == 3 without have_bottom): two rows below
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    mova          m6, m5
    jmp .loop
.load_threelines:
    ; y > 0: at least three rows below
    mova          m4, [midq+384*2]
    mova          m5, [midq+384*4]
    ; the third row is loaded at the top of the y loop

    ; per-row (y) loop of the vertical filter
.loop_load:
    ; load one line into m6. when no further line is available the loop is
    ; re-entered at .loop instead, so m6 keeps the previous line's data. the
    ; loop is structured so that the common case is evaluated fastest
    mova          m6, [mptrq+384*6]
.loop:
    ; m0..m6 = seven consecutive rows; add the symmetric pairs first
    paddw         m7, m0, m6
    paddw         m8, m1, m5
    paddw         m9, m2, m4
    punpcklwd     m10, m7, m8
    punpckhwd     m7, m8
    punpcklwd     m11, m9, m3
    punpckhwd     m9, m3
    pmaddwd       m10, m15
    pmaddwd       m7, m15
    pmaddwd       m11, m14
    pmaddwd       m9, m14
    paddd         m10, m11
    paddd         m7, m9
    paddd         m10, m12
    paddd         m7, m12
    psrad         m10, 11
    psrad         m7, 11
    packssdw      m10, m7
    packuswb      m10, m10
    vpermq        m10, m10, q3120           ; gather the 16 result bytes in xm10
    mova          [dstptrq], xm10
    ; slide the seven-row window down by one row
    mova          m0, m1
    mova          m1, m2
    mova          m2, m3
    mova          m3, m4
    mova          m4, m5
    mova          m5, m6
    add           dstptrq, strideq
    add           mptrq, 384*2
    dec           yd
    jg .loop_load
    ; for the bottom pixels, continue using m6 (as extended edge)
    cmp           yd, ylimd
    jg .loop

    add           dstq, 16
    add           midq, 32
    sub           wd, 16
    jg .loop_x
    RET

;------------------------------------------------------------------------------
; sgr_box3_h(sumsq, sum, left, src, stride, w, h, edge)
;
; For every row, stores the sum of three horizontally adjacent 8-bit pixels
; (16-bit, to sum) and the sum of their squares (32-bit, to sumsq), 16 output
; positions per inner iteration. Output row pitch is (384+16) elements.
;   edge & 1 : have_left    edge & 2 : have_right
;   left     : 4 bytes per row, or NULL to read the left pixels from src
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    mov           xlimd, edged
    and           xlimd, 2                  ; have_right
    add           wd, xlimd
    xor           xlimd, 2                  ; 2*!have_right
    jnz .no_right
    add           wd, 15
    and           wd, ~15
.no_right:
    pxor          m1, m1
    ; point at the end of the row; x counts up from -w towards 0
    lea           srcq, [srcq+wq]
    lea           sumq, [sumq+wq*2-2]
    lea           sumsqq, [sumsqq+wq*4-4]
    neg           wq
    lea           r10, [pb_right_ext_mask+32]
.loop_y:
    mov           xq, wq

    ; left context goes into the top word(s) of xm0
    test          edged, 1                  ; have_left
    jz .no_left
    test          leftq, leftq
    jz .load_left_from_main
    pinsrw        xm0, [leftq+2], 7
    add           leftq, 4
    jmp .expand_x
.no_left:
    vpbroadcastb  xm0, [srcq+xq]
    jmp .expand_x
.load_left_from_main:
    pinsrw        xm0, [srcq+xq-2], 7
.expand_x:
    punpckhbw     xm0, xm1

    ; xm0 now holds the two left pixels as its two highest words
    cmp           xd, -16
    jle .loop_x
.partial_load_and_extend:
    ; fewer than 16 pixels remain: keep the valid ones (word mask from
    ; r10+x*2) and fill the rest with the last pixel, src[-1]
    vpbroadcastb  m3, [srcq-1]
    pmovzxbw      m2, [srcq+xq]
    punpcklbw     m3, m1
    movu          m4, [r10+xq*2]
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    ; no pixels remain: replicate the highest word of xm0
    psrldq        xm2, xm0, 14
    vpbroadcastw  m2, xm2
    jmp .loop_x_noload

.loop_x:
    pmovzxbw      m2, [srcq+xq]
.loop_x_noload:
    vinserti128   m0, xm2, 1
    palignr       m3, m2, m0, 12            ; pixels two positions earlier
    palignr       m4, m2, m0, 14            ; pixels one position earlier

    punpcklwd     m5, m3, m2
    punpckhwd     m6, m3, m2
    paddw         m3, m4
    punpcklwd     m7, m4, m1
    punpckhwd     m4, m1
    pmaddwd       m5, m5
    pmaddwd       m6, m6
    pmaddwd       m7, m7
    pmaddwd       m4, m4
    paddd         m5, m7
    paddd         m6, m4
    paddw         m3, m2
    movu          [sumq+xq*2], m3
    movu          [sumsqq+xq*4+ 0], xm5
    movu          [sumsqq+xq*4+16], xm6
    vextracti128  [sumsqq+xq*4+32], m5, 1
    vextracti128  [sumsqq+xq*4+48], m6, 1

    vextracti128  xm0, m2, 1
    add           xq, 16

    ; if x <= -16 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -16
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add           sumsqq, (384+16)*4
    add           sumq, (384+16)*2
    add           srcq, strideq
    dec           hd
    jg .loop_y
    RET

;------------------------------------------------------------------------------
; sgr_box3_v(sumsq, sum, w, h, edge)
;
; In-place vertical pass: each row of sumsq/sum is replaced by the sum of the
; row above, the row itself and the row below. Handles 16 columns per outer
; iteration, starting at x = -2.
;   edge & 4 : have_top     edge & 8 : have_bottom
; Without have_top the first row loaded is used for all three inputs; at the
; bottom the last row loaded is reused.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
    mov           xq, -2
    mov           ylimd, edged
    and           ylimd, 8                  ; have_bottom
    shr           ylimd, 2
    sub           ylimd, 2                  ; -2 if have_bottom=0, else 0
.loop_x:
    lea           yd, [hq+ylimq+2]
    lea           sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea           sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test          edged, 4                  ; have_top
    jnz .load_top
    ; no top edge: replicate the first available row three times
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    movu          m6, [sum_ptrq+(384+16)*2*1]
    mova          m7, m6
    mova          m8, m6
    jmp .loop_y_noload
.load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l2sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+32]   ; l2sq [right]
    movu          m2, [sumsq_ptrq-(384+16)*4*0]      ; l1sq [left]
    movu          m3, [sumsq_ptrq-(384+16)*4*0+32]   ; l1sq [right]
    movu          m6, [sum_ptrq-(384+16)*2*1]        ; l2
    movu          m7, [sum_ptrq-(384+16)*2*0]        ; l1
.loop_y:
    movu          m4, [sumsq_ptrq+(384+16)*4*1]      ; l0sq [left]
    movu          m5, [sumsq_ptrq+(384+16)*4*1+32]   ; l0sq [right]
    movu          m8, [sum_ptrq+(384+16)*2*1]        ; l0
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw         m6, m7
    paddd         m0, m4
    paddd         m1, m5
    paddw         m6, m8
    movu          [sumsq_ptrq+ 0], m0
    movu          [sumsq_ptrq+32], m1
    movu          [sum_ptrq], m6

    ; slide the three-row window down by one row
    mova          m0, m2
    mova          m1, m3
    mova          m2, m4
    mova          m3, m5
    mova          m6, m7
    mova          m7, m8
    add           sumsq_ptrq, (384+16)*4
    add           sum_ptrq, (384+16)*2
    dec           yd
    jg .loop_y
    ; past the last loadable row: keep reusing the last row
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET

;------------------------------------------------------------------------------
; sgr_calc_ab1(a, b, w, h, s)
;
; In-place conversion of the 3x3 box sums. For each element, with aa taken
; from a (32-bit) and bb from b (16-bit):
;   p  = aa * 9 - bb * bb
;   z  = index derived from p * s (see the bias trick below)
;   xx = byte looked up in sgr_x_by_x
;   a  = (xx * bb * 455 + 2048) >> 12
;   b  = 256 - xx
; Two rows of 8 columns are handled per inner iteration; x starts at -2 and
; both pointers are first moved back by one row minus one element.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
    ; The biased base pairs with the index bias below: paddusw forces the high
    ; word of p*s into [0xf008, 0xffff], so (dword >> 20) is in [0xf00, 0xfff];
    ; a dword gather at r5+z followed by >> 24 then yields the single byte
    ; sgr_x_by_x[z - 0xf00].
    lea           r5, [sgr_x_by_x-0xf03]
%ifidn sd, sm
    movd          xm6, sd
    vpbroadcastd  m6, xm6
%else
    vpbroadcastd  m6, sm
%endif
    vpbroadcastd  m8, [pd_0xf00801c7]
    vpbroadcastd  m9, [pw_256]
    pcmpeqb       m7, m7                    ; all-ones gather mask template
    psrld         m10, m9, 13               ; pd_2048
    DEFINE_ARGS a, b, w, h, x

.loop_y:
    mov           xq, -2
.loop_x:
    pmovzxwd      m0, [bq+xq*2]
    pmovzxwd      m1, [bq+xq*2+(384+16)*2]
    movu          m2, [aq+xq*4]
    movu          m3, [aq+xq*4+(384+16)*4]
    pslld         m4, m2, 3
    pslld         m5, m3, 3
    paddd         m2, m4                    ; aa * 9
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    pmaddwd       m0, m8                    ; bb * 455 (high words of m0 are zero)
    pmaddwd       m1, m8
    psubd         m2, m4                    ; p = aa * 9 - bb * bb
    psubd         m3, m5
    pmulld        m2, m6
    pmulld        m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20                    ; z
    psrld         m3, 20
    mova          m5, m7                    ; the gather clears its mask, so refresh it
    vpgatherdd    m4, [r5+m2], m5           ; xx
    mova          m5, m7
    vpgatherdd    m2, [r5+m3], m5
    psrld         m4, 24
    psrld         m2, 24
    pmulld        m0, m4
    pmulld        m1, m2
    packssdw      m4, m2
    psubw         m4, m9, m4
    vpermq        m4, m4, q3120             ; low half = first row, high half = second row
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 12
    psrld         m1, 12
    movu          [bq+xq*2], xm4
    vextracti128  [bq+xq*2+(384+16)*2], m4, 1
    movu          [aq+xq*4], m0
    movu          [aq+xq*4+(384+16)*4], m1
    add           xd, 8
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET

;------------------------------------------------------------------------------
; sgr_finish_filter1(t, src, stride, a, b, w, h)
;
; For each output position combines the 3x3 neighbourhood of a (32-bit) and b
; (16-bit), weighting the centre and the four edge neighbours by 4 and the
; four corners by 3, then stores
;   t = (weighted_b * src + weighted_a + 256) >> 9
; as 16-bit values (t row pitch 384*2 bytes). 16 columns per outer iteration;
; the partial sums of the rows above are carried from row to row.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
    vpbroadcastd  m15, [pw_16]
    xor           xd, xd
.loop_x:
    lea           tmp_ptrq, [tq+xq*2]
    lea           src_ptrq, [srcq+xq*1]
    lea           a_ptrq, [aq+xq*4+(384+16)*4]
    lea           b_ptrq, [bq+xq*2+(384+16)*2]
    movu          m0, [aq+xq*4-(384+16)*4-4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    mova          m1, [aq+xq*4-(384+16)*4]           ; a:top [first half]
    paddd         m0, m2                             ; a:tl+tr [first half]
    movu          m2, [aq+xq*4-(384+16)*4-4+32]
    movu          m4, [aq+xq*4-(384+16)*4+4+32]
    mova          m3, [aq+xq*4-(384+16)*4+32]        ; a:top [second half]
    paddd         m2, m4                             ; a:tl+tr [second half]
    movu          m4, [aq+xq*4-4]
    movu          m5, [aq+xq*4+4]
    paddd         m1, [aq+xq*4]                      ; a:top+ctr [first half]
    paddd         m4, m5                             ; a:l+r [first half]
    movu          m5, [aq+xq*4+32-4]
    movu          m6, [aq+xq*4+32+4]
    paddd         m3, [aq+xq*4+32]                   ; a:top+ctr [second half]
    paddd         m5, m6                             ; a:l+r [second half]

    movu          m6, [bq+xq*2-(384+16)*2-2]
    movu          m8, [bq+xq*2-(384+16)*2+2]
    mova          m7, [bq+xq*2-(384+16)*2]           ; b:top
    paddw         m6, m8                             ; b:tl+tr
    movu          m8, [bq+xq*2-2]
    movu          m9, [bq+xq*2+2]
    paddw         m7, [bq+xq*2]                      ; b:top+ctr
    paddw         m8, m9                             ; b:l+r
    mov           yd, hd
.loop_y:
    movu          m9, [b_ptrq-2]
    movu          m10, [b_ptrq+2]
    paddw         m7, [b_ptrq]                       ; b:top+ctr+bottom
    paddw         m9, m10                            ; b:bl+br
    paddw         m10, m7, m8                        ; b:top+ctr+bottom+l+r
    paddw         m6, m9                             ; b:tl+tr+bl+br
    psubw         m7, [b_ptrq-(384+16)*2*2]          ; b:ctr+bottom
    paddw         m10, m6
    psllw         m10, 2
    psubw         m10, m6                            ; aa
    pmovzxbw      m12, [src_ptrq]
    ; pair (aa, 16) with (src, 16) so that pmaddwd also adds 16*16 = 256
    punpcklwd     m6, m10, m15
    punpckhwd     m10, m15
    punpcklwd     m13, m12, m15
    punpckhwd     m12, m15
    pmaddwd       m6, m13                            ; aa*src[x]+256 [first half]
    pmaddwd       m10, m12                           ; aa*src[x]+256 [second half]

    movu          m11, [a_ptrq-4]
    movu          m12, [a_ptrq+4]
    paddd         m1, [a_ptrq]                       ; a:top+ctr+bottom [first half]
    paddd         m11, m12                           ; a:bl+br [first half]
    movu          m12, [a_ptrq+32-4]
    movu          m13, [a_ptrq+32+4]
    paddd         m3, [a_ptrq+32]                    ; a:top+ctr+bottom [second half]
    paddd         m12, m13                           ; a:bl+br [second half]
    paddd         m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
    paddd         m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
    paddd         m0, m11                            ; a:tl+tr+bl+br [first half]
    paddd         m2, m12                            ; a:tl+tr+bl+br [second half]
    paddd         m13, m0
    paddd         m14, m2
    pslld         m13, 2
    pslld         m14, 2
    psubd         m13, m0                            ; bb [first half]
    psubd         m14, m2                            ; bb [second half]
    ; regroup the 128-bit lanes to match the per-lane word unpacks above
    vperm2i128    m0, m13, m14, 0x31
    vinserti128   m13, xm14, 1
    psubd         m1, [a_ptrq-(384+16)*4*2]          ; a:ctr+bottom [first half]
    psubd         m3, [a_ptrq-(384+16)*4*2+32]       ; a:ctr+bottom [second half]

    paddd         m6, m13
    paddd         m10, m0
    psrad         m6, 9
    psrad         m10, 9
    packssdw      m6, m10
    mova          [tmp_ptrq], m6

    ; carry the partial sums over to the next row
    mova          m0, m4
    mova          m2, m5
    mova          m4, m11
    mova          m5, m12
    mova          m6, m8
    mova          m8, m9

    add           a_ptrq, (384+16)*4
    add           b_ptrq, (384+16)*2
    add           tmp_ptrq, 384*2
    add           src_ptrq, strideq
    dec           yd
    jg .loop_y
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET

;------------------------------------------------------------------------------
; sgr_weighted1(dst, stride, t, w, h, wt)
;
; Blends one filtered plane into dst in place, 32 pixels per inner iteration:
;   dst = clip_u8(px + pmulhrsw(t - (px << 4), wt << 4))
; t holds 16-bit values with a row pitch of 384*2 bytes; wt is passed by value.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_weighted1, 6, 6, 7, dst, stride, t, w, h, wt
    movd          xm0, wtd
    vpbroadcastw  m0, xm0
    psllw         m0, 4
    DEFINE_ARGS dst, stride, t, w, h, idx
.loop_y:
    xor           idxd, idxd
.loop_x:
    mova          m1, [tq+idxq*2+ 0]
    mova          m4, [tq+idxq*2+32]
    pmovzxbw      m2, [dstq+idxq+ 0]
    pmovzxbw      m5, [dstq+idxq+16]
    psllw         m3, m2, 4
    psllw         m6, m5, 4
    psubw         m1, m3
    psubw         m4, m6
    pmulhrsw      m1, m0
    pmulhrsw      m4, m0
    paddw         m1, m2
    paddw         m4, m5
    packuswb      m1, m4
    vpermq        m1, m1, q3120             ; undo the per-lane interleave of packuswb
    mova          [dstq+idxq], m1
    add           idxd, 32
    cmp           idxd, wd
    jl .loop_x
    add           dstq, strideq
    add           tq, 384 * 2
    dec           hd
    jg .loop_y
    RET

;------------------------------------------------------------------------------
; sgr_box5_h(sumsq, sum, left, src, stride, w, h, edge)
;
; Same scheme as sgr_box3_h, but over five horizontally adjacent pixels.
;   edge & 1 : have_left    edge & 2 : have_right
;   left     : 4 bytes per row, or NULL to read the left pixels from src
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
    test          edged, 2                  ; have_right
    jz .no_right
    xor           xlimd, xlimd
    add           wd, 2
    add           wd, 15
    and           wd, ~15
    jmp .right_done
.no_right:
    mov           xlimd, 3
    sub           wd, 1
.right_done:
    pxor          m1, m1
    ; point at the end of the row; x counts up from -w towards 0
    lea           srcq, [srcq+wq+1]
    lea           sumq, [sumq+wq*2-2]
    lea           sumsqq, [sumsqq+wq*4-4]
    neg           wq
    lea           r10, [pb_right_ext_mask+32]
.loop_y:
    mov           xq, wq

    ; left context goes into the top bytes of xm0
    test          edged, 1                  ; have_left
    jz .no_left
    test          leftq, leftq
    jz .load_left_from_main
    movd          xm0, [leftq]
    pinsrd        xm0, [srcq+xq-1], 1
    pslldq        xm0, 11
    add           leftq, 4
    jmp .expand_x
.no_left:
    vpbroadcastb  xm0, [srcq+xq-1]
    jmp .expand_x
.load_left_from_main:
    pinsrd        xm0, [srcq+xq-4], 3
.expand_x:
    punpckhbw     xm0, xm1

    ; xm0 now holds the left-context pixels as words in its upper part
    cmp           xd, -16
    jle .loop_x
    test          xd, xd
    jge .right_extend
.partial_load_and_extend:
    ; fewer than 16 pixels remain: keep the valid ones (word mask from
    ; r10+x*2) and fill the rest with src[-1]
    vpbroadcastb  m3, [srcq-1]
    pmovzxbw      m2, [srcq+xq]
    punpcklbw     m3, m1
    movu          m4, [r10+xq*2]
    pand          m2, m4
    pandn         m4, m3
    por           m2, m4
    jmp .loop_x_noload
.right_extend:
    ; no pixels remain: replicate the highest word of xm0
    psrldq        xm2, xm0, 14
    vpbroadcastw  m2, xm2
    jmp .loop_x_noload

.loop_x:
    pmovzxbw      m2, [srcq+xq]
.loop_x_noload:
    vinserti128   m0, xm2, 1
    palignr       m3, m2, m0, 8             ; four positions earlier
    palignr       m4, m2, m0, 10            ; three positions earlier
    palignr       m5, m2, m0, 12            ; two positions earlier
    palignr       m6, m2, m0, 14            ; one position earlier

    paddw         m0, m3, m2
    punpcklwd     m7, m3, m2
    punpckhwd     m3, m2
    paddw         m0, m4
    punpcklwd     m8, m4, m5
    punpckhwd     m4, m5
    paddw         m0, m5
    punpcklwd     m9, m6, m1
    punpckhwd     m5, m6, m1
    paddw         m0, m6
    pmaddwd       m7, m7
    pmaddwd       m3, m3
    pmaddwd       m8, m8
    pmaddwd       m4, m4
    pmaddwd       m9, m9
    pmaddwd       m5, m5
    paddd         m7, m8
    paddd         m3, m4
    paddd         m7, m9
    paddd         m3, m5
    movu          [sumq+xq*2], m0
    movu          [sumsqq+xq*4+ 0], xm7
    movu          [sumsqq+xq*4+16], xm3
    vextracti128  [sumsqq+xq*4+32], m7, 1
    vextracti128  [sumsqq+xq*4+48], m3, 1

    vextracti128  xm0, m2, 1
    add           xq, 16

    ; if x <= -16 we can reload more pixels
    ; else if x < 0 we reload and extend (this implies have_right=0)
    ; else if x < xlimd we extend from previous load (this implies have_right=0)
    ; else we are done

    cmp           xd, -16
    jle .loop_x
    test          xd, xd
    jl .partial_load_and_extend
    cmp           xd, xlimd
    jl .right_extend

    add           sumsqq, (384+16)*4
    add           sumq, (384+16)*2
    add           srcq, strideq
    dec           hd
    jg .loop_y
    RET

;------------------------------------------------------------------------------
; sgr_box5_v(sumsq, sum, w, h, edge)
;
; In-place vertical pass summing five rows, writing every second row (the
; pointers advance by two rows per iteration). 16 columns per outer iteration,
; starting at x = -2.
;   edge & 4 : have_top     edge & 8 : have_bottom
; Rows missing at the top or bottom are replaced by the nearest loaded row.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
    mov           xq, -2
    mov           ylimd, edged
    and           ylimd, 8                  ; have_bottom
    shr           ylimd, 2
    sub           ylimd, 3                  ; -3 if have_bottom=0, else -1
.loop_x:
    lea           yd, [hq+ylimq+2]
    lea           sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
    lea           sum_ptrq, [sumq+xq*2+2-(384+16)*2]
    test          edged, 4                  ; have_top
    jnz .load_top
    ; no top edge: the first available row stands in for the upper four rows
    movu          m0, [sumsq_ptrq+(384+16)*4*1]
    movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
    mova          m2, m0
    mova          m3, m1
    mova          m4, m0
    mova          m5, m1
    mova          m6, m0
    mova          m7, m1
    movu          m10, [sum_ptrq+(384+16)*2*1]
    mova          m11, m10
    mova          m12, m10
    mova          m13, m10
    jmp .loop_y_second_load
.load_top:
    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
    movu          m1, [sumsq_ptrq-(384+16)*4*1+32]   ; l3/4sq [right]
    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
    movu          m5, [sumsq_ptrq-(384+16)*4*0+32]   ; l2sq [right]
    mova          m2, m0
    mova          m3, m1
    movu          m10, [sum_ptrq-(384+16)*2*1]       ; l3/4
    movu          m12, [sum_ptrq-(384+16)*2*0]       ; l2
    mova          m11, m10
.loop_y:
    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
    movu          m7, [sumsq_ptrq+(384+16)*4*1+32]   ; l1sq [right]
    movu          m13, [sum_ptrq+(384+16)*2*1]       ; l1
.loop_y_second_load:
    test          yd, yd
    jle .emulate_second_load
    movu          m8, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
    movu          m9, [sumsq_ptrq+(384+16)*4*2+32]   ; l0sq [right]
    movu          m14, [sum_ptrq+(384+16)*2*2]       ; l0
.loop_y_noload:
    paddd         m0, m2
    paddd         m1, m3
    paddw         m10, m11
    paddd         m0, m4
    paddd         m1, m5
    paddw         m10, m12
    paddd         m0, m6
    paddd         m1, m7
    paddw         m10, m13
    paddd         m0, m8
    paddd         m1, m9
    paddw         m10, m14
    movu          [sumsq_ptrq+ 0], m0
    movu          [sumsq_ptrq+32], m1
    movu          [sum_ptrq], m10

    ; slide the five-row window down by two rows
    mova          m0, m4
    mova          m1, m5
    mova          m2, m6
    mova          m3, m7
    mova          m4, m8
    mova          m5, m9
    mova          m10, m12
    mova          m11, m13
    mova          m12, m14
    add           sumsq_ptrq, (384+16)*4*2
    add           sum_ptrq, (384+16)*2*2
    sub           yd, 2
    jge .loop_y
    ; l1 = l0
    mova          m6, m8
    mova          m7, m9
    mova          m13, m14
    cmp           yd, ylimd
    jg .loop_y_noload
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET
.emulate_second_load:
    ; no further row to load: l0 = l1
    mova          m8, m6
    mova          m9, m7
    mova          m14, m13
    jmp .loop_y_noload

;------------------------------------------------------------------------------
; sgr_calc_ab2(a, b, w, h, s)
;
; In-place conversion of the 5x5 box sums; the same scheme as sgr_calc_ab1
; with different constants:
;   p  = aa * 25 - bb * bb
;   xx = byte looked up in sgr_x_by_x (index derived from p * s)
;   a  = (xx * 41 * bb + 512) >> 10
;   b  = 256 - xx
; One row of 16 columns per inner iteration; the pointers advance by two rows
; per outer iteration.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
    sub           aq, (384+16-1)*4
    sub           bq, (384+16-1)*2
    add           hd, 2
    ; biased base: dword gather at r5+z followed by >> 24 yields the byte
    ; sgr_x_by_x[z - 0xf00], with z in [0xf00, 0xfff] after the paddusw bias
    lea           r5, [sgr_x_by_x-0xf03]
%ifidn sd, sm
    movd          xm6, sd
    vpbroadcastd  m6, xm6
%else
    vpbroadcastd  m6, sm
%endif
    vpbroadcastd  m8, [pd_0xf0080029]
    vpbroadcastd  m9, [pw_256]
    pcmpeqb       m7, m7                    ; all-ones gather mask template
    psrld         m10, m9, 15               ; pd_512
    DEFINE_ARGS a, b, w, h, x
.loop_y:
    mov           xq, -2
.loop_x:
    pmovzxwd      m0, [bq+xq*2+ 0]
    pmovzxwd      m1, [bq+xq*2+16]
    movu          m2, [aq+xq*4+ 0]
    movu          m3, [aq+xq*4+32]
    pslld         m4, m2, 3                 ; aa * 8
    pslld         m5, m3, 3
    paddd         m2, m4                    ; aa * 9
    paddd         m3, m5
    paddd         m4, m4                    ; aa * 16
    paddd         m5, m5
    paddd         m2, m4                    ; aa * 25
    paddd         m3, m5
    pmaddwd       m4, m0, m0
    pmaddwd       m5, m1, m1
    psubd         m2, m4                    ; p = aa * 25 - bb * bb
    psubd         m3, m5
    pmulld        m2, m6
    pmulld        m3, m6
    paddusw       m2, m8
    paddusw       m3, m8
    psrld         m2, 20                    ; z
    psrld         m3, 20
    mova          m5, m7                    ; the gather clears its mask, so refresh it
    vpgatherdd    m4, [r5+m2], m5           ; xx
    mova          m5, m7
    vpgatherdd    m2, [r5+m3], m5
    psrld         m4, 24
    psrld         m2, 24
    packssdw      m3, m4, m2
    pmullw        m4, m8                    ; xx * 41 in the low word of each dword
    pmullw        m2, m8
    psubw         m3, m9, m3
    vpermq        m3, m3, q3120
    pmaddwd       m0, m4
    pmaddwd       m1, m2
    paddd         m0, m10
    paddd         m1, m10
    psrld         m0, 10
    psrld         m1, 10
    movu          [bq+xq*2], m3
    movu          [aq+xq*4+ 0], m0
    movu          [aq+xq*4+32], m1
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    add           aq, (384+16)*4*2
    add           bq, (384+16)*2*2
    sub           hd, 2
    jg .loop_y
    RET

;------------------------------------------------------------------------------
; sgr_finish_filter2(t, src, stride, a, b, w, h)
;
; Produces two output rows per inner iteration from every second row of a/b.
; Each a/b row is first reduced horizontally to 5*(left + right) + 6*centre.
;   first row : (src * (prev_b + cur_b) + (prev_a + cur_a) + 256) >> 9
;   second row: (src * cur_b + cur_a + 128) >> 8
; Results are stored as 16-bit values in t (row pitch 384*2 bytes).
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \
                                       tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
    vpbroadcastd  m9, [pw_5_6]
    vpbroadcastd  m12, [pw_256]
    psrlw         m11, m12, 1               ; pw_128
    psrlw         m10, m12, 8               ; pw_1
    xor           xd, xd
.loop_x:
    lea           tmp_ptrq, [tq+xq*2]
    lea           src_ptrq, [srcq+xq*1]
    lea           a_ptrq, [aq+xq*4+(384+16)*4]
    lea           b_ptrq, [bq+xq*2+(384+16)*2]
    movu          m0, [aq+xq*4-(384+16)*4-4]
    mova          m1, [aq+xq*4-(384+16)*4]
    movu          m2, [aq+xq*4-(384+16)*4+4]
    movu          m3, [aq+xq*4-(384+16)*4-4+32]
    mova          m4, [aq+xq*4-(384+16)*4+32]
    movu          m5, [aq+xq*4-(384+16)*4+4+32]
    ; (l + c + r) * 5 + c
    paddd         m0, m2
    paddd         m3, m5
    paddd         m0, m1
    paddd         m3, m4
    pslld         m2, m0, 2
    pslld         m5, m3, 2
    paddd         m2, m0
    paddd         m5, m3
    paddd         m0, m2, m1                ; prev_odd_b [first half]
    paddd         m1, m5, m4                ; prev_odd_b [second half]
    movu          m3, [bq+xq*2-(384+16)*2-2]
    mova          m4, [bq+xq*2-(384+16)*2]
    movu          m5, [bq+xq*2-(384+16)*2+2]
    ; (l + r) * 5 + c * 6 via pmaddwd with pw_5_6
    paddw         m3, m5
    punpcklwd     m5, m3, m4
    punpckhwd     m3, m4
    pmaddwd       m5, m9
    pmaddwd       m3, m9
    packssdw      m2, m5, m3                ; prev_odd_a
    mov           yd, hd
.loop_y:
    movu          m3, [a_ptrq-4]
    mova          m4, [a_ptrq]
    movu          m5, [a_ptrq+4]
    movu          m6, [a_ptrq+32-4]
    mova          m7, [a_ptrq+32]
    movu          m8, [a_ptrq+32+4]
    paddd         m3, m5
    paddd         m6, m8
    paddd         m3, m4
    paddd         m6, m7
    pslld         m5, m3, 2
    pslld         m8, m6, 2
    paddd         m5, m3
    paddd         m8, m6
    paddd         m3, m5, m4                ; cur_odd_b [first half]
    paddd         m4, m8, m7                ; cur_odd_b [second half]
    movu          m5, [b_ptrq-2]
    mova          m6, [b_ptrq]
    movu          m7, [b_ptrq+2]
    paddw         m5, m7
    punpcklwd     m7, m5, m6
    punpckhwd     m5, m6
    pmaddwd       m7, m9
    pmaddwd       m5, m9
    packssdw      m5, m7, m5                ; cur_odd_a

    paddd         m0, m3                    ; cur_even_b [first half]
    paddd         m1, m4                    ; cur_even_b [second half]
    paddw         m2, m5                    ; cur_even_a

    ; first output row: pair (src, 1) with (a, 256) so pmaddwd adds 256
    pmovzxbw      m6, [src_ptrq]
    vperm2i128    m8, m0, m1, 0x31
    vinserti128   m0, xm1, 1
    punpcklwd     m7, m6, m10
    punpckhwd     m6, m10
    punpcklwd     m1, m2, m12
    punpckhwd     m2, m12
    pmaddwd       m7, m1
    pmaddwd       m6, m2
    paddd         m7, m0
    paddd         m6, m8
    psrad         m7, 9
    psrad         m6, 9

    ; second output row: pair (src, 1) with (a, 128) so pmaddwd adds 128
    pmovzxbw      m8, [src_ptrq+strideq]
    punpcklwd     m0, m8, m10
    punpckhwd     m8, m10
    punpcklwd     m1, m5, m11
    punpckhwd     m2, m5, m11
    pmaddwd       m0, m1
    pmaddwd       m8, m2
    vinserti128   m2, m3, xm4, 1
    vperm2i128    m1, m3, m4, 0x31
    paddd         m0, m2
    paddd         m8, m1
    psrad         m0, 8
    psrad         m8, 8

    packssdw      m7, m6
    packssdw      m0, m8
    mova          [tmp_ptrq+384*2*0], m7
    mova          [tmp_ptrq+384*2*1], m0

    ; the current row's reductions become "prev" for the next pair of rows
    mova          m0, m3
    mova          m1, m4
    mova          m2, m5
    add           a_ptrq, (384+16)*4*2
    add           b_ptrq, (384+16)*2*2
    add           tmp_ptrq, 384*2*2
    lea           src_ptrq, [src_ptrq+strideq*2]
    sub           yd, 2
    jg .loop_y
    add           xd, 16
    cmp           xd, wd
    jl .loop_x
    RET

;------------------------------------------------------------------------------
; sgr_weighted2(dst, stride, t1, t2, w, h, wt)
;
; Blends two filtered planes into dst in place, 32 pixels per inner iteration:
;   d1  = t1 - (px << 4),  d2 = t2 - (px << 4)
;   dst = clip_u8(px + ((d1 * wt[0] + d2 * wt[1] + 1024) >> 11))
; wt points at two consecutive 16-bit weights (loaded as one dword); t1 and t2
; hold 16-bit values with a row pitch of 384*2 bytes.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal sgr_weighted2, 7, 7, 11, dst, stride, t1, t2, w, h, wt
    vpbroadcastd  m0, [wtq]
    vpbroadcastd  m10, [pd_1024]
    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
.loop_y:
    xor           idxd, idxd
.loop_x:
    mova          m1, [t1q+idxq*2+ 0]
    mova          m2, [t1q+idxq*2+32]
    mova          m3, [t2q+idxq*2+ 0]
    mova          m4, [t2q+idxq*2+32]
    pmovzxbw      m5, [dstq+idxq+ 0]
    pmovzxbw      m6, [dstq+idxq+16]
    psllw         m7, m5, 4
    psllw         m8, m6, 4
    psubw         m1, m7
    psubw         m2, m8
    psubw         m3, m7
    psubw         m4, m8
    ; interleave (d1, d2) so one pmaddwd applies both weights
    punpcklwd     m9, m1, m3
    punpckhwd     m1, m3
    punpcklwd     m3, m2, m4
    punpckhwd     m2, m4
    pmaddwd       m9, m0
    pmaddwd       m1, m0
    pmaddwd       m3, m0
    pmaddwd       m2, m0
    paddd         m9, m10
    paddd         m1, m10
    paddd         m3, m10
    paddd         m2, m10
    psrad         m9, 11
    psrad         m1, 11
    psrad         m3, 11
    psrad         m2, 11
    packssdw      m1, m9, m1
    packssdw      m2, m3, m2
    paddw         m1, m5
    paddw         m2, m6
    packuswb      m1, m2
    vpermq        m1, m1, q3120             ; undo the per-lane interleave of packuswb
    mova          [dstq+idxq], m1
    add           idxd, 32
    cmp           idxd, wd
    jl .loop_x
    add           dstq, strideq
    add           t1q, 384 * 2
    add           t2q, 384 * 2
    dec           hd
    jg .loop_y
    RET
%endif ; ARCH_X86_64