; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28%include "x86/filmgrain_common.asm" 29 30SECTION_RODATA 16 31pd_16: times 4 dd 16 32pw_1: times 8 dw 1 33pw_16384: times 8 dw 16384 34pw_8192: times 8 dw 8192 35pw_23_22: dw 23, 22 36 times 3 dw 0, 32 37pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 38pw_27_17_17_27: dw 27, 17, 17, 27 39 times 2 dw 0, 32 40rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 41pw_seed_xor: times 2 dw 0xb524 42 times 2 dw 0x49d8 43pb_1: times 4 db 1 44hmul_bits: dw 32768, 16384, 8192, 4096 45round: dw 2048, 1024, 512 46mul_bits: dw 256, 128, 64, 32, 16 47round_vals: dw 32, 64, 128, 256, 512, 1024 48max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 49min: dw 0, 16*4, 16*16 50; these two should be next to each other 51pw_4: times 2 dw 4 52pw_16: times 2 dw 16 53 54%macro JMP_TABLE 1-* 55 %xdefine %1_table %%table 56 %xdefine %%base %1_table 57 %xdefine %%prefix mangle(private_prefix %+ _%1) 58 %%table: 59 %rep %0 - 1 60 dd %%prefix %+ .ar%2 - %%base 61 %rotate 1 62 %endrep 63%endmacro 64 65JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 66JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 67JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 68JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 69 70SECTION .text 71 72%if ARCH_X86_32 73%undef base 74%define PIC_ptr(a) base+a 75%else 76%define PIC_ptr(a) a 77%endif 78 79%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) 80 81%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg 82%assign %%idx 0 83%define %%tmp %2 84%if %0 == 8 85%define %%tmp %8 86%endif 87%rep (%6/2) 88%if %%idx == 0 89 movd %5 %+ d, %2 90 pshuflw %%tmp, %2, q3232 91%else 92 movd %5 %+ d, %%tmp 93%if %6 == 8 94%if %%idx == 2 95 punpckhqdq %%tmp, %%tmp 96%elif %%idx == 4 97 psrlq %%tmp, 32 98%endif 99%endif 100%endif 101 movzx %4 %+ d, %5 %+ w 102 shr %5 %+ d, 16 103 104%if %%idx == 0 105 movd %1, [%3+%4*%7] 
106%else 107 pinsrw %1, [%3+%4*%7], %%idx + 0 108%endif 109 pinsrw %1, [%3+%5*%7], %%idx + 1 110%assign %%idx %%idx+2 111%endrep 112%endmacro 113 114%macro SPLATD 2 ; dst, src 115%ifnidn %1, %2 116 movd %1, %2 117%endif 118 pshufd %1, %1, q0000 119%endmacro 120 121%macro SPLATW 2 ; dst, src 122%ifnidn %1, %2 123 movd %1, %2 124%endif 125 pshuflw %1, %1, q0000 126 punpcklqdq %1, %1 127%endmacro 128 129 130INIT_XMM ssse3 131%if ARCH_X86_64 132cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax 133 lea r4, [pb_mask] 134%define base r4-pb_mask 135%else 136cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax 137 LEA r4, $$ 138%define base r4-$$ 139%endif 140 movq m1, [base+rnd_next_upperbit_mask] 141 movq m4, [base+mul_bits] 142 movq m7, [base+hmul_bits] 143 mov r3d, [fg_dataq+FGData.grain_scale_shift] 144 lea r5d, [bdmaxq+1] 145 shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc 146 sub r3, r5 147 SPLATW m6, [base+round+r3*2-2] 148 mova m5, [base+pb_mask] 149 SPLATW m0, [fg_dataq+FGData.seed] 150 mov r3, -73*82*2 151 sub bufq, r3 152%if ARCH_X86_64 153 lea r6, [gaussian_sequence] 154%endif 155.loop: 156 pand m2, m0, m1 157 psrlw m3, m2, 10 158 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 159 pmullw m2, m4 ; bits 0x0f00 are set 160 pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 161 psllq m2, m3, 30 162 por m2, m3 163 psllq m3, m2, 15 164 por m2, m3 ; aggregate each bit into next seed's high bit 165 pmulhuw m3, m0, m7 166 por m2, m3 ; 4 next output seeds 167 pshuflw m0, m2, q3333 168 psrlw m2, 5 169%if ARCH_X86_64 170 vpgatherdw m3, m2, r6, r5, r7, 4, 2 171%else 172 vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 173%endif 174 paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 175 ; shifts by 0, which pmulhrsw does not support 176 pmulhrsw m3, m6 177 movq [bufq+r3], m3 178 add r3, 4*2 179 jl .loop 180 181 ; auto-regression code 182 movsxd r3, [fg_dataq+FGData.ar_coeff_lag] 183 movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] 184 lea 
r3, [r3+base+generate_grain_y_16bpc_ssse3_table] 185 jmp r3 186 187.ar1: 188%if WIN64 189 DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 190 lea bufq, [r0-2*(82*73-(82*3+79))] 191 PUSH r8 192%else 193%if ARCH_X86_64 194 DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 195%else ; x86-32 196 DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 197 PUSH r6 198%define shiftd r1d 199%endif 200 sub bufq, 2*(82*73-(82*3+79)) 201%endif 202 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 203 movd m4, [fg_dataq+FGData.ar_coeffs_y] 204 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 205%if WIN64 206 DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 207%elif ARCH_X86_64 208 DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 209%else ; x86-32 210%undef shiftd 211 DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 212%define hd dword r0m 213%define maxd dword minm 214%endif 215%if cpuflag(sse4) 216 pmovsxbw m4, m4 217%else 218 pxor m3, m3 219 pcmpgtb m3, m4 220 punpcklbw m4, m3 221%endif 222 pinsrw m4, [base+pw_1], 3 223 pshufd m5, m4, q1111 224 pshufd m4, m4, q0000 225 SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd 226 mov hd, 70 227 sar maxd, 1 228 mov mind, maxd 229 xor mind, -1 230.y_loop_ar1: 231 mov xq, -76 232 movsx val3d, word [bufq+xq*2-2] 233.x_loop_ar1: 234 movu m0, [bufq+xq*2-82*2-2] ; top/left 235 psrldq m2, m0, 2 ; top 236 psrldq m1, m0, 4 ; top/right 237 punpcklwd m0, m2 238 punpcklwd m1, m3 239 pmaddwd m0, m4 240 pmaddwd m1, m5 241 paddd m0, m1 242.x_loop_ar1_inner: 243 movd val0d, m0 244 psrldq m0, 4 245 imul val3d, cf3d 246 add val3d, val0d 247 sar val3d, shiftb 248 movsx val0d, word [bufq+xq*2] 249 add val3d, val0d 250 cmp val3d, maxd 251 cmovg val3d, maxd 252 cmp val3d, mind 253 cmovl val3d, mind 254 mov word [bufq+xq*2], val3w 255 ; keep val3d in-place as left for next x iteration 256 inc xq 257 jz .x_loop_ar1_end 258 test xq, 3 259 jnz .x_loop_ar1_inner 260 jmp .x_loop_ar1 261 262.x_loop_ar1_end: 263 add bufq, 82*2 264 dec hd 
265 jg .y_loop_ar1 266%if WIN64 267 POP r8 268%elif ARCH_X86_32 269 POP r6 270%undef maxd 271%undef hd 272%endif 273.ar0: 274 RET 275 276.ar2: 277%if ARCH_X86_32 278%assign stack_offset_old stack_offset 279 ALLOC_STACK -16*8 280%endif 281 DEFINE_ARGS buf, fg_data, bdmax, shift 282 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 283 movd m0, [base+round_vals-12+shiftq*2] 284 pshuflw m0, m0, q0000 285 movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 286 pxor m2, m2 287 punpcklwd m0, m2 288 pcmpgtb m2, m6 289 punpckhbw m3, m6, m2 290 punpcklbw m6, m2 291 pshufd m2, m6, q3333 292 pshufd m1, m6, q2222 293 pshufd m7, m6, q1111 294 pshufd m6, m6, q0000 295 pshufd m4, m3, q1111 296 pshufd m3, m3, q0000 297%if ARCH_X86_64 298 SWAP 0, 12 299 SWAP 1, 8 300 SWAP 2, 9 301 SWAP 3, 10 302 SWAP 4, 11 303%else 304%define m12 [rsp+0*16] 305%define m8 [rsp+1*16] 306%define m9 [rsp+2*16] 307%define m10 [rsp+3*16] 308%define m11 [rsp+4*16] 309 mova m12, m0 310 mova m8, m1 311 mova m9, m2 312 mova m10, m3 313 mova m11, m4 314 mov bdmaxd, bdmaxm 315%endif 316 sar bdmaxd, 1 317 SPLATW m0, bdmaxd ; max_grain 318 pcmpeqw m1, m1 319%if !cpuflag(sse4) 320 pcmpeqw m2, m2 321 psrldq m2, 14 322 pslldq m2, 2 323 pxor m2, m1 324%endif 325 pxor m1, m0 ; min_grain 326%if ARCH_X86_64 327 SWAP 0, 13 328 SWAP 1, 14 329 SWAP 2, 15 330%else 331%define m13 [rsp+5*16] 332%define m14 [rsp+6*16] 333 mova m13, m0 334 mova m14, m1 335%if !cpuflag(sse4) 336%define m15 [rsp+7*16] 337 mova m15, m2 338%endif 339%endif 340 sub bufq, 2*(82*73-(82*3+79)) 341 DEFINE_ARGS buf, fg_data, h, x 342 mov hd, 70 343.y_loop_ar2: 344 mov xq, -76 345 346.x_loop_ar2: 347 movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] 348 movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 349 psrldq m2, m0, 2 350 psrldq m3, m0, 4 351 psrldq m4, m0, 6 352 psrldq m5, m0, 8 353 punpcklwd m0, m2 354 punpcklwd m3, m4 355 punpcklwd m5, m1 356 psrldq m2, m1, 2 357 psrldq m4, m1, 4 358 punpcklwd m2, m4 359 psrldq m4, m1, 6 360 psrldq m1, 8 361 punpcklwd m4, m1 362 
pmaddwd m0, m6 363 pmaddwd m3, m7 364 pmaddwd m5, m8 365 pmaddwd m2, m9 366 pmaddwd m4, m10 367 paddd m0, m3 368 paddd m5, m2 369 paddd m0, m4 370 paddd m0, m5 ; accumulated top 2 rows 371 paddd m0, m12 372 373 movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] 374 pshufd m4, m1, q3321 375 pxor m2, m2 376 pcmpgtw m2, m4 377 punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] 378.x_loop_ar2_inner: 379 pmaddwd m2, m1, m11 380 paddd m2, m0 381 psrldq m0, 4 ; shift top to next pixel 382 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 383 paddd m2, m4 384 packssdw m2, m2 385 pminsw m2, m13 386 pmaxsw m2, m14 387 psrldq m4, 4 388 pslldq m2, 2 389 psrldq m1, 2 390%if cpuflag(sse4) 391 pblendw m1, m2, 00000010b 392%else 393 pand m1, m15 394 pandn m3, m15, m2 395 por m1, m3 396%endif 397 ; overwrite previous pixel, this should be ok 398 movd [bufq+xq*2-2], m1 399 inc xq 400 jz .x_loop_ar2_end 401 test xq, 3 402 jnz .x_loop_ar2_inner 403 jmp .x_loop_ar2 404 405.x_loop_ar2_end: 406 add bufq, 82*2 407 dec hd 408 jg .y_loop_ar2 409%if ARCH_X86_32 410%undef m8 411%undef m9 412%undef m10 413%undef m11 414%undef m12 415%undef m13 416%undef m14 417%undef m15 418%endif 419 RET 420 421.ar3: 422 DEFINE_ARGS buf, fg_data, bdmax, shift 423%if WIN64 424 mov r6, rsp 425 and rsp, ~15 426 sub rsp, 64 427 %define tmp rsp 428%elif ARCH_X86_64 429 %define tmp rsp+stack_offset-72 430%else 431%assign stack_offset stack_offset_old 432 ALLOC_STACK -16*12 433 %define tmp rsp 434 mov bdmaxd, bdmaxm 435%endif 436 sar bdmaxd, 1 437 SPLATW m7, bdmaxd ; max_grain 438 pcmpeqw m6, m6 439%if !cpuflag(sse4) 440 pcmpeqw m4, m4 441 psrldq m4, 14 442 pslldq m4, 4 443 pxor m4, m6 444%endif 445 pxor m6, m7 ; min_grain 446 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 447 448%if ARCH_X86_64 449 SWAP 6, 14 450 SWAP 7, 15 451%else 452%define m14 [rsp+10*16] 453%define m15 [esp+11*16] 454 mova m14, m6 455 mova m15, m7 456%endif 457 458 ; build cf0-1 until 18-19 in m5-12 and r0/1 459 pxor m1, m1 460 movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; 
cf0-15 461 pcmpgtb m1, m0 462 punpckhbw m2, m0, m1 463 punpcklbw m0, m1 464 465%if cpuflag(sse4) 466 pshufd m4, m2, q3333 467%else 468 pshufd m5, m2, q3333 469 mova [tmp+48], m5 470%endif 471 pshufd m3, m2, q2222 472 pshufd m1, m2, q0000 473 pshufd m2, m2, q1111 474 pshufd m7, m0, q2222 475 pshufd m6, m0, q1111 476 pshufd m5, m0, q0000 477 pshufd m0, m0, q3333 478 479%if ARCH_X86_64 480 SWAP 0, 8 481 SWAP 1, 9 482 SWAP 2, 10 483 SWAP 3, 11 484 SWAP 4, 12 485%else 486%define m8 [rsp+4*16] 487%define m9 [esp+5*16] 488%define m10 [rsp+6*16] 489%define m11 [esp+7*16] 490%define m12 [rsp+8*16] 491 mova m8, m0 492 mova m9, m1 493 mova m10, m2 494 mova m11, m3 495 mova m12, m4 496%endif 497 498 ; build cf20,round in r2 499 ; build cf21-23,round*2 in m13 500 pxor m1, m1 501 movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 502 pcmpgtb m1, m0 503 punpcklbw m0, m1 504 pshufd m1, m0, q0000 505 pshufd m2, m0, q1111 506 mova [tmp+ 0], m1 507 mova [tmp+16], m2 508 psrldq m3, m0, 10 509 pinsrw m3, [base+round_vals+shiftq*2-10], 3 510 511%if ARCH_X86_64 512 SWAP 3, 13 513%else 514%define m13 [esp+9*16] 515 mova m13, m3 516%endif 517 518 pinsrw m0, [base+round_vals+shiftq*2-12], 5 519 pshufd m3, m0, q2222 520 mova [tmp+32], m3 521 522 DEFINE_ARGS buf, fg_data, h, x 523 sub bufq, 2*(82*73-(82*3+79)) 524 mov hd, 70 525.y_loop_ar3: 526 mov xq, -76 527 528.x_loop_ar3: 529 movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] 530 movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] 531 palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] 532 palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] 533 punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] 534 punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] 535 shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] 536 537 pmaddwd m0, m5 538 pmaddwd m2, m6 539 pmaddwd m3, m7 540 paddd m0, m2 541 paddd m0, m3 542 ; m0 = top line first 6 multiplied by cf, m1 = top line last entry 543 544 movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] 545 movd m3, [bufq+xq*2-82*4-6+16] 
; y=-2,x=[+5,+6] 546 punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] 547 palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] 548 palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] 549 punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] 550 punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 551 shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 552 553 pmaddwd m1, m8 554 pmaddwd m4, m9 555 pmaddwd m3, m10 556 pmaddwd m2, m11 557 paddd m1, m4 558 paddd m3, m2 559 paddd m0, m1 560 paddd m0, m3 561 ; m0 = top 2 lines multiplied by cf 562 563 movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] 564 movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] 565 palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] 566 palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] 567 punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] 568 punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] 569 shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] 570 punpcklwd m2, [base+pw_1] 571 572%if cpuflag(sse4) 573 pmaddwd m1, m12 574%else 575 pmaddwd m1, [tmp+48] 576%endif 577 pmaddwd m3, [tmp+ 0] 578 pmaddwd m4, [tmp+16] 579 pmaddwd m2, [tmp+32] 580 paddd m1, m3 581 paddd m4, m2 582 paddd m0, m1 583 paddd m0, m4 584 ; m0 = top 3 lines multiplied by cf plus rounding for downshift 585 586 movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] 587.x_loop_ar3_inner: 588 pmaddwd m2, m1, m13 589 pshufd m3, m2, q1111 590 paddd m2, m3 ; left+cur 591 paddd m2, m0 ; add top 592 psrldq m0, 4 593 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 594 packssdw m2, m2 595 pminsw m2, m15 596 pmaxsw m2, m14 597 pslldq m2, 4 598 psrldq m1, 2 599%if cpuflag(sse4) 600 pblendw m1, m2, 00000100b 601%else 602 pand m1, m12 603 pandn m3, m12, m2 604 por m1, m3 605%endif 606 ; overwrite a couple of pixels, should be ok 607 movq [bufq+xq*2-4], m1 608 inc xq 609 jz .x_loop_ar3_end 610 test xq, 3 611 jnz .x_loop_ar3_inner 612 jmp .x_loop_ar3 613 614.x_loop_ar3_end: 615 add bufq, 82*2 616 dec hd 617 jg .y_loop_ar3 618%if WIN64 619 mov rsp, r6 620%elif ARCH_X86_32 621%undef m8 
622%undef m9 623%undef m10 624%undef m11 625%undef m12 626%undef m13 627%undef m14 628%undef m15 629%endif 630 RET 631 632%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y 633INIT_XMM ssse3 634%if ARCH_X86_64 635cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg 636%define base r8-pb_mask 637 lea r8, [pb_mask] 638 movifnidn bdmaxd, bdmaxm 639 lea r6d, [bdmaxq+1] 640%else 641cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h 642%define base r2-$$ 643 LEA r2, $$ 644 mov fg_dataq, r2m 645 mov r6d, r4m 646 inc r6d 647%endif 648 movq m1, [base+rnd_next_upperbit_mask] 649 movq m4, [base+mul_bits] 650 movq m7, [base+hmul_bits] 651 mov r5d, [fg_dataq+FGData.grain_scale_shift] 652 shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc 653 sub r5, r6 654 SPLATW m6, [base+round+r5*2-2] 655 mova m5, [base+pb_mask] 656 SPLATW m0, [fg_dataq+FGData.seed] 657%if ARCH_X86_64 658 SPLATW m2, [base+pw_seed_xor+uvq*4] 659%else 660 mov r5d, r3m 661 SPLATW m2, [base+pw_seed_xor+r5*4] 662%endif 663 pxor m0, m2 664%if ARCH_X86_64 665 lea r6, [gaussian_sequence] 666%endif 667%if %2 668 mov hd, 73-35*%3 669 add bufq, 44*2 670.loop_y: 671 mov xq, -44 672%else 673 mov xq, -82*73 674 add bufq, 82*73*2 675%endif 676.loop_x: 677 pand m2, m0, m1 678 psrlw m3, m2, 10 679 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 680 pmullw m2, m4 ; bits 0x0f00 are set 681 pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 682 psllq m2, m3, 30 683 por m2, m3 684 psllq m3, m2, 15 685 por m2, m3 ; aggregate each bit into next seed's high bit 686 pmulhuw m3, m0, m7 687 por m2, m3 ; 4 next output seeds 688 pshuflw m0, m2, q3333 689 psrlw m2, 5 690%if ARCH_X86_64 691 vpgatherdw m3, m2, r6, r9, r10, 4, 2 692%else 693 vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 694%endif 695 paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 696 ; shifts by 0, which pmulhrsw does not support 697 pmulhrsw m3, m6 698 movq [bufq+xq*2], m3 699 add xq, 4 700 
jl .loop_x 701%if %2 702 add bufq, 82*2 703 dec hd 704 jg .loop_y 705%endif 706 707 ; auto-regression code 708 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 709 movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] 710 lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] 711 jmp r5 712 713.ar0: 714%if ARCH_X86_64 715 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 716%else 717 DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 718%assign stack_offset_old stack_offset 719 ALLOC_STACK -16*2 720 mov bufyq, r1m 721 mov uvd, r3m 722%endif 723 imul uvd, 28 724 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 725 movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 726 SPLATW m3, [base+hmul_bits+shiftq*2-10] 727%if ARCH_X86_64 728 sar bdmaxd, 1 729 SPLATW m1, bdmaxd ; max_gain 730%else 731 SPLATW m1, r4m 732 psraw m1, 1 733%endif 734 pcmpeqw m7, m7 735 pxor m7, m1 ; min_grain 736%if ARCH_X86_64 737 SWAP 1, 14 738 DEFINE_ARGS buf, bufy, h, x 739%else 740%define m14 [rsp+0*16] 741 mova m14, m1 742 DEFINE_ARGS buf, bufy, pic_reg, h, x 743%endif 744 pxor m5, m5 745 pcmpgtb m5, m4 746 punpcklbw m4, m5 747%if %2 748 SPLATW m6, [base+hmul_bits+2+%3*2] 749%endif 750 SPLATW m4, m4 751 pxor m5, m5 752%if %2 753%if !cpuflag(sse4) 754 pcmpeqw m2, m2 755 pslldq m2, 12 756%if ARCH_X86_64 757 SWAP 2, 12 758%else 759%define m12 [rsp+1*16] 760 mova m12, m2 761%endif 762%endif 763%endif 764%if %2 765 sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) 766%else 767 sub bufq, 2*(82*70-3) 768%endif 769 add bufyq, 2*(3+82*3) 770 mov hd, 70-35*%3 771.y_loop_ar0: 772 ; first 32 pixels 773 xor xd, xd 774.x_loop_ar0: 775 movu m0, [bufyq+xq*(2<<%2)] 776%if %2 777%if %3 778 movu m2, [bufyq+xq*4+82*2] 779 paddw m0, m2 780%endif 781 movu m1, [bufyq+xq*4 +16] 782%if %3 783 movu m2, [bufyq+xq*4+82*2+16] 784 paddw m1, m2 785%endif 786 phaddw m0, m1 787 pmulhrsw m0, m6 788%endif 789 punpckhwd m1, m0, m5 790 punpcklwd m0, m5 791 REPX {pmaddwd x, m4}, m0, m1 792 REPX {psrad x, 5}, m0, m1 793 packssdw m0, m1 794 pmulhrsw 
m0, m3 795 movu m1, [bufq+xq*2] 796 paddw m0, m1 797 pminsw m0, m14 798 pmaxsw m0, m7 799 cmp xd, 72-40*%2 800 je .end 801 movu [bufq+xq*2], m0 802 add xd, 8 803 jmp .x_loop_ar0 804 805 ; last 6/4 pixels 806.end: 807%if %2 808%if cpuflag(sse4) 809 pblendw m0, m1, 11000000b 810%else 811 pand m1, m12 812 pandn m2, m12, m0 813 por m0, m1, m2 814%endif 815 movu [bufq+xq*2], m0 816%else 817 movq [bufq+xq*2], m0 818%endif 819 820 add bufq, 82*2 821 add bufyq, 82*(2<<%3) 822 dec hd 823 jg .y_loop_ar0 824%if ARCH_X86_32 825%undef m12 826%undef m14 827%endif 828 RET 829 830.ar1: 831%if ARCH_X86_64 832 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x 833%else 834%assign stack_offset stack_offset_old 835%xdefine rstk rsp 836%assign stack_size_padded 0 837 DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 838 mov bufyq, r1m 839 mov uvd, r3m 840%endif 841 imul uvd, 28 842 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 843 movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 844%if WIN64 845 DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 846%if %2 847 lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] 848%else 849 lea bufq, [r0-2*(82*69+3)] 850%endif 851%else 852%if ARCH_X86_64 853 DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 854%else 855 DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 856%define hd dword r1m 857%define mind dword r3m 858%define maxd dword r4m 859%endif 860%if %2 861 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 862%else 863 sub bufq, 2*(82*69+3) 864%endif 865%endif 866%if ARCH_X86_64 867 mov shiftd, [r2+FGData.ar_coeff_shift] 868%else 869 mov shiftd, [r3+FGData.ar_coeff_shift] 870%endif 871 pxor m5, m5 872 pcmpgtb m5, m4 873 punpcklbw m4, m5 ; cf0-4 in words 874 pshuflw m4, m4, q2100 875 psrldq m4, 2 ; cf0-3,4 in words 876 pshufd m5, m4, q1111 877 pshufd m4, m4, q0000 878 movd m3, [base+round_vals+shiftq*2-12] ; rnd 879 pxor m6, m6 880 punpcklwd m3, m6 881%if %2 882 SPLATW m6, [base+hmul_bits+2+%3*2] 883%endif 884 
SPLATD m3, m3 885 add bufyq, 2*(79+82*3) 886 mov hd, 70-35*%3 887 sar maxd, 1 888%if ARCH_X86_64 889 mov mind, maxd 890 xor mind, -1 891%else 892 DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 893 mov r2, maxd 894 xor r2, -1 895 mov mind, r2 896%endif 897.y_loop_ar1: 898 mov xq, -(76>>%2) 899 movsx val3d, word [bufq+xq*2-2] 900.x_loop_ar1: 901 movu m0, [bufq+xq*2-82*2-2] ; top/left 902%if %2 903 movu m7, [bufyq+xq*4] 904%if %3 905 movu m1, [bufyq+xq*4+82*2] 906 phaddw m7, m1 907%else 908 phaddw m7, m7 909%endif 910%else 911 movq m7, [bufyq+xq*2] 912%endif 913 psrldq m2, m0, 2 ; top 914 psrldq m1, m0, 4 ; top/right 915 punpcklwd m0, m2 916%if %2 917%if %3 918 pshufd m2, m7, q3232 919 paddw m7, m2 920%endif 921 pmulhrsw m7, m6 922%endif 923 punpcklwd m1, m7 924 pmaddwd m0, m4 925 pmaddwd m1, m5 926 paddd m0, m1 927 paddd m0, m3 928.x_loop_ar1_inner: 929 movd val0d, m0 930 psrldq m0, 4 931 imul val3d, cf3d 932 add val3d, val0d 933 sar val3d, shiftb 934 movsx val0d, word [bufq+xq*2] 935 add val3d, val0d 936 cmp val3d, maxd 937 cmovg val3d, maxd 938 cmp val3d, mind 939 cmovl val3d, mind 940 mov word [bufq+xq*2], val3w 941 ; keep val3d in-place as left for next x iteration 942 inc xq 943 jz .x_loop_ar1_end 944 test xq, 3 945 jnz .x_loop_ar1_inner 946 jmp .x_loop_ar1 947 948.x_loop_ar1_end: 949 add bufq, 82*2 950 add bufyq, 82*2<<%3 951 dec hd 952 jg .y_loop_ar1 953%if ARCH_X86_32 954%undef maxd 955%undef mind 956%undef hd 957%endif 958 RET 959 960.ar2: 961%if ARCH_X86_64 962 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 963%else 964 DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift 965 ALLOC_STACK -16*8 966 mov bufyq, r1m 967 mov uvd, r3m 968%endif 969 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 970 imul uvd, 28 971%if ARCH_X86_64 972 sar bdmaxd, 1 973 SPLATW m5, bdmaxd ; max_grain 974%else 975 SPLATW m5, r4m 976 psraw m5, 1 977%endif 978 pcmpeqw m6, m6 979%if !cpuflag(sse4) 980 pcmpeqw m7, m7 981 psrldq m7, 14 982 pslldq m7, 2 983 pxor m7, m6 984%endif 985 pxor 
m6, m5 ; min_grain 986%if %2 && cpuflag(sse4) 987 SPLATW m7, [base+hmul_bits+2+%3*2] 988%endif 989 990%if ARCH_X86_64 991 SWAP 5, 13 992 SWAP 6, 14 993 SWAP 7, 15 994%else 995%define m13 [rsp+5*16] 996%define m14 [rsp+6*16] 997%define m15 [rsp+7*16] 998 mova m13, m5 999 mova m14, m6 1000 mova m15, m7 1001%endif 1002 1003 ; coef values 1004 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] 1005 pxor m1, m1 1006 pcmpgtb m1, m0 1007 punpckhbw m2, m0, m1 1008 punpcklbw m0, m1 1009 pinsrw m2, [base+round_vals-12+shiftq*2], 5 1010 1011 pshufd m6, m0, q0000 1012 pshufd m7, m0, q1111 1013 pshufd m1, m0, q3333 1014 pshufd m0, m0, q2222 1015 pshufd m3, m2, q1111 1016 pshufd m4, m2, q2222 1017 pshufd m2, m2, q0000 1018 1019%if ARCH_X86_64 1020 SWAP 0, 8 1021 SWAP 1, 9 1022 SWAP 2, 10 1023 SWAP 3, 11 1024 SWAP 4, 12 1025%else 1026%define m8 [rsp+0*16] 1027%define m9 [rsp+1*16] 1028%define m10 [rsp+2*16] 1029%define m11 [rsp+3*16] 1030%define m12 [rsp+4*16] 1031 mova m8, m0 1032 mova m9, m1 1033 mova m10, m2 1034 mova m11, m3 1035 mova m12, m4 1036%endif 1037 1038%if ARCH_X86_64 1039 DEFINE_ARGS buf, bufy, fg_data, h, x 1040%else 1041 DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x 1042%endif 1043%if %2 1044 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 1045%else 1046 sub bufq, 2*(82*69+3) 1047%endif 1048 add bufyq, 2*(79+82*3) 1049 mov hd, 70-35*%3 1050.y_loop_ar2: 1051 mov xq, -(76>>%2) 1052 1053.x_loop_ar2: 1054 movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] 1055 movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 1056 psrldq m4, m0, 2 ; y=-2,x=[-1,+5] 1057 psrldq m1, m0, 4 ; y=-2,x=[-0,+5] 1058 psrldq m3, m0, 6 ; y=-2,x=[+1,+5] 1059 psrldq m2, m0, 8 ; y=-2,x=[+2,+5] 1060 punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 1061 punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 1062 punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] 1063 pmaddwd m0, m6 1064 pmaddwd m1, m7 1065 pmaddwd m2, m8 1066 paddd m0, m1 1067 paddd m0, m2 1068 psrldq m3, m5, 2 ; y=-1,x=[-1,+5] 1069 psrldq m1, m5, 4 ; 
y=-1,x=[-0,+5] 1070 psrldq m4, m5, 6 ; y=-1,x=[+1,+5] 1071 psrldq m2, m5, 8 ; y=-1,x=[+2,+5] 1072 punpcklwd m3, m1 1073 punpcklwd m4, m2 1074 pmaddwd m3, m9 1075 pmaddwd m4, m10 1076 paddd m3, m4 1077 paddd m0, m3 1078 1079 ; luma component & rounding 1080%if %2 1081 movu m1, [bufyq+xq*4] 1082%if %3 1083 movu m2, [bufyq+xq*4+82*2] 1084 phaddw m1, m2 1085 pshufd m2, m1, q3232 1086 paddw m1, m2 1087%else 1088 phaddw m1, m1 1089%endif 1090%if cpuflag(sse4) 1091 pmulhrsw m1, m15 1092%elif %3 1093 pmulhrsw m1, [base+pw_8192] 1094%else 1095 pmulhrsw m1, [base+pw_16384] 1096%endif 1097%else 1098 movq m1, [bufyq+xq*2] 1099%endif 1100 punpcklwd m1, [base+pw_1] 1101 pmaddwd m1, m12 1102 paddd m0, m1 1103 1104 movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] 1105 pshufd m2, m1, q3321 1106 pxor m3, m3 1107 pcmpgtw m3, m2 1108 punpcklwd m2, m3 ; y=0,x=[0,3] in dword 1109.x_loop_ar2_inner: 1110 pmaddwd m3, m1, m11 1111 paddd m3, m0 1112 psrldq m0, 4 ; shift top to next pixel 1113 psrad m3, [fg_dataq+FGData.ar_coeff_shift] 1114 ; we do not need to packssdw since we only care about one value 1115 paddd m3, m2 1116 packssdw m3, m3 1117 pminsw m3, m13 1118 pmaxsw m3, m14 1119 psrldq m1, 2 1120 pslldq m3, 2 1121 psrldq m2, 4 1122%if cpuflag(sse4) 1123 pblendw m1, m3, 00000010b 1124%else 1125 pand m1, m15 1126 pandn m4, m15, m3 1127 por m1, m4 1128%endif 1129 ; overwrite previous pixel, should be ok 1130 movd [bufq+xq*2-2], m1 1131 inc xq 1132 jz .x_loop_ar2_end 1133 test xq, 3 1134 jnz .x_loop_ar2_inner 1135 jmp .x_loop_ar2 1136 1137.x_loop_ar2_end: 1138 add bufq, 82*2 1139 add bufyq, 82*2<<%3 1140 dec hd 1141 jg .y_loop_ar2 1142%if ARCH_X86_32 1143%undef m13 1144%undef m14 1145%undef m15 1146%endif 1147 RET 1148 1149.ar3: 1150%if ARCH_X86_64 1151 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 1152%if WIN64 1153 mov r6, rsp 1154 and rsp, ~15 1155 sub rsp, 96 1156 %define tmp rsp 1157%else 1158 %define tmp rsp+stack_offset-120 1159%endif 1160%else 1161 DEFINE_ARGS buf, bufy, pic_reg, fg_data, 
uv, shift 1162%assign stack_offset stack_offset_old 1163 ALLOC_STACK -16*14 1164 mov bufyq, r1m 1165 mov uvd, r3m 1166 %define tmp rsp 1167%endif 1168 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 1169 imul uvd, 28 1170 SPLATW m4, [base+round_vals-12+shiftq*2] 1171 pxor m5, m5 1172 pcmpgtw m5, m4 1173 punpcklwd m4, m5 1174%if ARCH_X86_64 1175 sar bdmaxd, 1 1176 SPLATW m6, bdmaxd ; max_grain 1177%else 1178 SPLATW m6, r4m 1179 psraw m6, 1 1180%endif 1181 pcmpeqw m7, m7 1182%if !cpuflag(sse4) 1183 pcmpeqw m3, m3 1184 psrldq m3, 14 1185 pslldq m3, 4 1186 pxor m3, m7 1187%endif 1188 pxor m7, m6 ; min_grain 1189%if %2 && cpuflag(sse4) 1190 SPLATW m3, [base+hmul_bits+2+%3*2] 1191%endif 1192 1193%if ARCH_X86_64 1194 SWAP 3, 11 1195 SWAP 4, 12 1196 SWAP 6, 14 1197 SWAP 7, 15 1198%else 1199%define m11 [rsp+ 9*16] 1200%define m12 [rsp+10*16] 1201%define m14 [rsp+12*16] 1202%define m15 [rsp+13*16] 1203 mova m11, m3 1204 mova m12, m4 1205 mova m14, m6 1206 mova m15, m7 1207%endif 1208 1209 ; cf from y=-3,x=-3 until y=-3,x=-2 1210 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] 1211 pxor m1, m1 1212 pcmpgtb m1, m0 1213 punpckhbw m2, m0, m1 1214 punpcklbw m0, m1 1215 pshufd m1, m0, q0000 1216 pshufd m3, m0, q1111 1217 pshufd m4, m0, q2222 1218 pshufd m0, m0, q3333 1219 pshufd m5, m2, q0000 1220 pshufd m6, m2, q1111 1221 mova [tmp+16*0], m1 1222 mova [tmp+16*1], m3 1223 mova [tmp+16*2], m4 1224 mova [tmp+16*3], m0 1225 mova [tmp+16*4], m5 1226 mova [tmp+16*5], m6 1227 pshufd m6, m2, q2222 1228 pshufd m7, m2, q3333 1229 1230 ; cf from y=-1,x=-1 to y=0,x=-1 + luma component 1231 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] 1232 pxor m1, m1 1233 pcmpgtb m1, m0 1234 punpckhbw m2, m0, m1 ; luma 1235 punpcklbw m0, m1 1236 pshufd m3, m0, q3232 1237 psrldq m5, m0, 10 1238 ; y=0,x=[-3 to -1] + "1.0" for current pixel 1239 pinsrw m5, [base+round_vals-10+shiftq*2], 3 1240 ; y=-1,x=[-1 to +2] 1241 pshufd m1, m0, q0000 1242 pshufd m0, m0, q1111 1243 ; y=-1,x=+3 + luma 1244 punpcklwd m3, m2 
; (review) Tail of the generate_grain_uv AR lag-3 path; the macro header and
; coefficient setup precede this chunk. Macro params 2/3 appear to be the
; horizontal/vertical chroma subsampling flags (420,1,1 / 422,1,0 / 444,0,0
; below) -- confirm against the macro definition above.
    pshufd          m3, m3, q0000

%if ARCH_X86_64
    ; 64-bit: plenty of xmm regs, just renumber the live coefficients
    SWAP             1, 8
    SWAP             0, 9
    SWAP             3, 10
    SWAP             5, 13
    DEFINE_ARGS buf, bufy, fg_data, h, x
%else
    ; 32-bit: only 8 xmm regs -- alias m8..m13 to stack spill slots
%define m8  [rsp+ 6*16]
%define m9  [rsp+ 7*16]
%define m10 [rsp+ 8*16]
%define m13 [rsp+11*16]
    mova            m8, m1
    mova            m9, m0
    mova            m10, m3
    mova            m13, m5
    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
%endif
%if %2
    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub            bufq, 2*(82*69+3)
%endif
    add           bufyq, 2*(79+82*3)
    mov              hd, 70-35*%3
; AR(3): each output pixel accumulates 3 rows of already-generated grain
; above it plus previously-written pixels to its left (serial inner loop).
.y_loop_ar3:
    mov              xq, -(76>>%2)

.x_loop_ar3:
    ; first line
    movu             m0, [bufq+xq*2-82*6-6+ 0]   ; y=-3,x=[-3,+4]
    movd             m1, [bufq+xq*2-82*6-6+16]   ; y=-3,x=[+5,+6]
    palignr          m2, m1, m0, 2               ; y=-3,x=[-2,+5]
    palignr          m1, m1, m0, 12              ; y=-3,x=[+3,+6]
    punpckhwd        m3, m0, m2                  ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd        m0, m2                      ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps           m2, m0, m3, q1032           ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]

    pmaddwd          m0, [tmp+0*16]
    pmaddwd          m2, [tmp+1*16]
    pmaddwd          m3, [tmp+2*16]
    paddd            m0, m2
    paddd            m0, m3                      ; first 6 x of top y

    ; second line [m0/1 are busy]
    movu             m2, [bufq+xq*2-82*4-6+ 0]   ; y=-2,x=[-3,+4]
    movd             m3, [bufq+xq*2-82*4-6+16]   ; y=-2,x=[+5,+6]
    punpcklwd        m1, m2                      ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
    palignr          m4, m3, m2, 2               ; y=-2,x=[-2,+5]
    palignr          m3, m3, m2, 4               ; y=-2,x=[-2,+5]
    punpckhwd        m5, m4, m3                  ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
    punpcklwd        m4, m3                      ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    shufps           m3, m4, m5, q1032           ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd          m1, [tmp+3*16]
    pmaddwd          m4, [tmp+4*16]
    pmaddwd          m3, [tmp+5*16]
    pmaddwd          m5, m6
    paddd            m1, m4
    paddd            m3, m5
    paddd            m0, m1
    paddd            m0, m3                      ; top 2 lines

    ; third line [m0 is busy] & luma + round
    movu             m1, [bufq+xq*2-82*2-6+ 0]   ; y=-1,x=[-3,+4]
    movd             m2, [bufq+xq*2-82*2-6+16]   ; y=-1,x=[+5,+6]
%if %2
    ; chroma: fold the co-located luma grain down to chroma resolution
    movu             m5, [bufyq+xq*4]
%if %3
    movu             m4, [bufyq+xq*4+82*2]
    phaddw           m5, m4
%else
    phaddw           m5, m5
%endif
%else
    movq             m5, [bufyq+xq*2]
%endif
    palignr          m3, m2, m1, 2               ; y=-1,x=[-2,+5]
    palignr          m2, m2, m1, 12              ; y=-1,x=[+3,+6]
%if %3
    pshufd           m4, m5, q3232
    paddw            m5, m4
%endif
%if %2
    ; average the 2 (or 4) summed luma samples back to one chroma sample
%if cpuflag(sse4)
    pmulhrsw         m5, m11
%elif %3
    pmulhrsw         m5, [base+pw_8192]
%else
    pmulhrsw         m5, [base+pw_16384]
%endif
%endif
    punpckhwd        m4, m1, m3                  ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd        m1, m3                      ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps           m3, m1, m4, q1032           ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    punpcklwd        m2, m5
    pmaddwd          m1, m7
    pmaddwd          m3, m8
    pmaddwd          m4, m9
    pmaddwd          m2, m10
    paddd            m1, m3
    paddd            m4, m2
    paddd            m0, m12                     ; += round
    paddd            m1, m4
    paddd            m0, m1

    movu             m1, [bufq+xq*2-6]           ; y=0,x=[-3,+4]
; serial part: the left neighbors were written by the previous iteration,
; so only 4 pixels are produced per pass (test xq, 3 below)
.x_loop_ar3_inner:
    pmaddwd          m2, m1, m13
    pshufd           m3, m2, q1111
    paddd            m2, m3                      ; left+cur
    paddd            m2, m0                      ; add top
    psrldq           m0, 4
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    packssdw         m2, m2
    pminsw           m2, m14
    pmaxsw           m2, m15
    pslldq           m2, 4
    psrldq           m1, 2
%if cpuflag(sse4)
    pblendw          m1, m2, 00000100b
%else
    pand             m1, m11
    pandn            m3, m11, m2
    por              m1, m3
%endif
    ; overwrite previous pixels, should be ok
    movq   [bufq+xq*2-4], m1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar3
%if WIN64
    mov             rsp, r6
%elif ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
    RET
%endmacro

generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

; SCRATCH src, dst, slot: keep a constant live under register pressure.
; x86-32: spill xmm reg 'src' to stack slot 'slot' and redefine the m'dst'
; alias to read from there; x86-64: just renumber the register via SWAP.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro

;---------------------------------------------------------------------------
; fgy_32x32xn_16bpc: apply scaled luma film grain to a row of 32-wide blocks
; (16bpc). Prologue splats scaling-shift multiplier, clip min/max, bdmax and
; grain clamp into m9-m15 (stack-aliased on x86-32 via SCRATCH), then
; dispatches between plain / h-overlap / v-overlap / hv-overlap loops.
;---------------------------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov              r0, r0m
    mov              r1, r2m
    mov              r2, r4m
    mov              r3, r6m
    mov              r4, r7m
    mov              r5, r8m

%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]

    mov             r0m, r0
    mov             r2m, r1
    mov             r4m, r2
    mov             r6m, r3
    mov             r7m, r4
    mov             r8m, r5
%else
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    mov            srcq, srcm
    mov        scalingq, r5m
    mov        fg_dataq, r3m
%if STACK_ALIGNMENT < mmsize
    mov              r6, r9m

%define r9m [rsp+8*mmsize+ 4*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]

    mov             r9m, r6
%endif
    LEA              r5, $$
%define base r5-$$
    mov             r5m, picptrq
%else
cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea              r8, [pb_mask]
%define base r8-pb_mask
%endif
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    SPLATW           m3, [base+mul_bits+r6*2-14]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
%if ARCH_X86_32
    DECLARE_REG_TMP 0, 3
%else
    DECLARE_REG_TMP 9, 10
%endif
    ; pick clip min/max from the 'min'/'max' tables, indexed by
    ; restricted-range flag and 10-vs-12 bpc (bdmax >> 11)
    mov             t0d, r9m                    ; bdmax
    sar             t0d, 11                     ; is_12bpc
    inc             t0d
    mov             t1d, r6d
    imul            t1d, t0d
    dec             t0d
    SPLATW           m5, [base+min+t1*2]
    lea             t0d, [t0d*3]
    lea             t0d, [r6d*2+t0d]
    SPLATW           m4, [base+max+t0*2]
    SPLATW           m2, r9m

    pcmpeqw          m1, m1
    psraw            m7, m2, 1                  ; max_grain
    pxor             m1, m7                     ; min_grain
    SPLATD           m6, [base+pd_16]

    ; park the constants: m9=min_grain m10=bdmax m11=scale m12=max
    ; m13=min m14=pd_16 m15=max_grain
    SCRATCH           1, 9, 0
    SCRATCH           2, 10, 1
    SCRATCH           3, 11, 2
    SCRATCH           4, 12, 3
    SCRATCH           5, 13, 4
    SCRATCH           6, 14, 5
    SCRATCH           7, 15, 6

    mova             m6, [base+pw_27_17_17_27]  ; for horizontal filter

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
    DECLARE_REG_TMP 0
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
    DECLARE_REG_TMP 7
%endif

    mov            sbyd, r8m
    movzx           t0d, byte [fg_dataq+FGData.overlap_flag]
    test            t0d, t0d
    jz .no_vertical_overlap
    test           sbyd, sbyd
    jnz .vertical_overlap
.no_vertical_overlap:
    mov       dword r8m, t0d                    ; r8m reused as flag word below

    ; derive the per-row pseudo-random seed from sby (AV1 grain seed update)
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul           seed, (173 << 24) | 37
%else
    imul           seed, sbyd, (173 << 24) | 37
%endif
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    ; iterate right-to-left offsets: src/dst addressed as base + negative w
    lea        src_bakq, [srcq+wq*2]
    mov            r9mp, src_bakq
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r4m, wq
%endif

.loop_x:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; advance the 16-bit LFSR-style seed (parity of masked bits -> new MSB)
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                    ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; split seed into a random (offx, offy) position inside the grain LUT
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747]    ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak
%endif

.loop_x_odd:
    movzx            hd, word r7m
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    pand             m0, m10, [srcq+ 0]
    pand             m1, m10, [srcq+16]         ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m4
    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw       m2, m0, scalingq-1, r11, r13, 8, 1, m4
    vpgatherdw       m3, m1, scalingq-1, r11, r13, 8, 1, m4
%endif
    REPX {psrlw x, 8}, m2, m3

    ; grain = grain_lut[offy+y][offx+x]
    movu             m4, [grain_lutq+offxyq*2]
    movu             m5, [grain_lutq+offxyq*2+16]

    ; noise = round2(scaling[src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova  [dstq+srcq+ 0], m0
    mova  [dstq+srcq+16], m1

    add            srcq, r2mp                   ; src += stride
    add      grain_lutq, 82*2
    dec              hd
    jg .loop_y

%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r9mp
    add            srcq, r4mp
    add            srcq, r4mp
%else
    mov        src_bakq, r9mp
    lea            srcq, [src_bakq+wq*2]
%endif
    ; bit 2 of r8m toggles odd/even 16-pixel column within a 32-pixel block
    btc       dword r8m, 2
    jc .next_blk
    add          offxyd, 16
    test      dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r12d, 16                     ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.next_blk:
    test      dword r8m, 1
    jz .loop_x

    ; r8m = sbym
    test      dword r8m, 2
    jnz .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
    ; x86-32 keeps left_offxy in a stack slot instead of a register
    add          offxyd, 16
    mov [rsp+8*mmsize+0*gprsize], offxyd
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
    mov            seed, r3m
%endif

    ; advance the seed (same update as .loop_x)
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                    ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy

    lea     left_offxyd, [offyd+16]             ; previous column's offy*stride+offx

    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747]    ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy
%endif

    mov              hd, dword r7m
    mov      grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m5, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+0*gprsize]
    movd             m4, [grain_lutq+r5*2]
%else
    movd             m4, [grain_lutq+left_offxyq*2]
%endif
    ; blend the 2 leftmost grain pixels with the previous column using the
    ; 27/17 weights in m6 (pw_27_17_17_27), then re-clamp to grain range
    punpcklwd        m4, m5
    pmaddwd          m4, m6
    paddd            m4, m14
    psrad            m4, 5
    packssdw         m4, m4
    pminsw           m4, m15
    pmaxsw           m4, m9
    shufps           m4, m5, q3210              ; splice blended lo dword back in

    ; src
    pand             m0, m10, [srcq+ 0]
    pand             m1, m10, [srcq+16]         ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m5
    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw       m2, m0, scalingq-1, r13, r14, 8, 1, m5
    vpgatherdw       m3, m1, scalingq-1, r13, r14, 8, 1, m5
%endif
    REPX {psrlw x, 8}, m2, m3

    ; noise = round2(scaling[src] * grain, scaling_shift)
    movu             m5, [grain_lutq+offxyq*2+16]
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova  [dstq+srcq+ 0], m0
    mova  [dstq+srcq+16], m1

    add            srcq, r2mp
    add      grain_lutq, 82*2
    dec              hd
    jg .loop_y_h_overlap

%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r9mp
    add            srcq, r4mp
    add            srcq, r4mp
%else
    mov        src_bakq, r9mp
    lea            srcq, [src_bakq+wq*2]
%endif
    or        dword r8m, 4
    add          offxyd, 16

    ; r8m = sbym
    test      dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r12d, 16                     ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET

.vertical_overlap:
    or              t0d, 2
    mov             r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
%endif

    ; build a packed (cur_seed << 16) | top_seed pair: the same byte-wise
    ; seed update as .no_vertical_overlap, done on two rows at once
    movzx          sbyd, sbyb
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul            t0d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             t0d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             t0d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, t0d
%if ARCH_X86_32
    xor            sbyd, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    xor            seed, sbyd                   ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    lea        src_bakq, [srcq+wq*2]
    mov            r9mp, src_bakq
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r4m, wq
%endif

.loop_x_v_overlap:
%if ARCH_X86_32
    mov              r5, r5m
    SPLATD           m7, [base+pw_27_17_17_27]
    mov            seed, r3m
%else
    SPLATD           m7, [pw_27_17_17_27]
%endif

    ; advance both 16-bit seeds of the packed pair in lockstep
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                         ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1                      ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; derive both LUT offsets at once from the packed seed pair
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy
%endif

    movzx    top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

.loop_x_odd_v_overlap:
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
    mov              hd, dword r7m
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; vertically blend current grain with the row above (weights in m7),
    ; then clamp back to [min_grain, max_grain]
    movu             m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+1*gprsize]
    movu             m2, [grain_lutq+r5*2]
%else
    movu             m2, [grain_lutq+top_offxyq*2]
%endif
    punpckhwd        m4, m2, m3
    punpcklwd        m2, m3
    REPX {pmaddwd x, m7}, m4, m2
    REPX {paddd x, m14}, m4, m2
    REPX {psrad x, 5}, m4, m2
    packssdw         m2, m4
    pminsw           m2, m15
    pmaxsw           m2, m9
    movu             m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m3, [grain_lutq+r5*2+16]
%else
    movu             m3, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd        m5, m3, m4
    punpcklwd        m3, m4
    REPX {pmaddwd x, m7}, m5, m3
    REPX {paddd x, m14}, m5, m3
    REPX {psrad x, 5}, m5, m3
    packssdw         m3, m5
    pminsw           m3, m15
    pmaxsw           m3, m9

    ; src
    pand             m0, m10, [srcq+ 0]         ; m0-1: src as word
    pand             m1, m10, [srcq+16]         ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw       m4, m0, scalingq-1, r11, r13, 8, 1, m5
%endif
    psrlw            m4, 8
    pmullw           m4, m11
    pmulhrsw         m4, m2
%if ARCH_X86_32
    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m2
%else
    vpgatherdw       m5, m1, scalingq-1, r11, r13, 8, 1, m2
%endif
    psrlw            m5, 8
    pmullw           m5, m11
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova  [dstq+srcq+ 0], m0
    mova  [dstq+srcq+16], m1

    add            srcq, r2mp
    add      grain_lutq, 82*2
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
    ; bit 16 of h toggles each row so exactly two rows take this path
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r9mp
    add            srcq, r4mp
    add            srcq, r4mp
%else
    mov        src_bakq, r9mp
    lea            srcq, [src_bakq+wq*2]
%endif
    btc       dword r8m, 2
    jc .next_blk_v
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    add          offxyd, 16
    jmp .loop_x_odd_v_overlap

.next_blk_v:
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

.loop_x_hv_overlap:
%if ARCH_X86_32
    ; x86-32: left/topleft offsets live in stack slots 0 and 2
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov              r0, [rsp+8*mmsize+1*gprsize]
    add              r3, 16
    add              r0, 16
    mov [rsp+8*mmsize+0*gprsize], r3            ; left_offxy
    mov [rsp+8*mmsize+2*gprsize], r0            ; topleft_offxy

    mov            seed, r3m
    xor              r0, r0
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; same packed dual-seed update as .loop_x_v_overlap
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                         ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1                      ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
%endif

    movzx    top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]

    movzx            hd, word r7m
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m2, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
    movu             m4, [grain_lutq+r0*2]
    movd             m5, [grain_lutq+r5*2]
    mov              r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    movd             m3, [grain_lutq+r5*2]
%else
    movu             m4, [grain_lutq+top_offxyq*2]
    movd             m5, [grain_lutq+left_offxyq*2]
    movd             m3, [grain_lutq+topleft_offxyq*2]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd        m5, m2
    punpcklwd        m3, m4
    REPX {pmaddwd x, m6}, m5, m3
    REPX {paddd x, m14}, m5, m3
    REPX {psrad x, 5}, m5, m3
    packssdw         m5, m3
    pminsw           m5, m15
    pmaxsw           m5, m9
    shufps           m3, m5, m2, q3210          ; cur row with blended lo dword
    shufps           m5, m4, q3232              ; top row with blended lo dword
    ; followed by v interpolation (top | cur -> cur)
    movu             m0, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m1, [grain_lutq+r0*2+16]
%else
    movu             m1, [grain_lutq+top_offxyq*2+16]
%endif
    punpcklwd        m2, m5, m3
    punpckhwd        m5, m3
    punpcklwd        m3, m1, m0
    punpckhwd        m1, m0
    REPX {pmaddwd x, m7}, m2, m5, m3, m1
    REPX {paddd x, m14}, m2, m5, m3, m1
    REPX {psrad x, 5}, m2, m5, m3, m1
    packssdw         m2, m5
    packssdw         m3, m1
    REPX {pminsw x, m15}, m2, m3
    REPX {pmaxsw x, m9}, m2, m3

    ; src
    pand             m0, m10, [srcq+ 0]
    pand             m1, m10, [srcq+16]         ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw       m4, m0, scalingq-1, r14, r10, 8, 1, m5
%endif
    psrlw            m4, 8
    pmullw           m4, m11
    pmulhrsw         m2, m4
%if ARCH_X86_32
    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw       m5, m1, scalingq-1, r14, r10, 8, 1, m4
%endif
    psrlw            m5, 8
    pmullw           m5, m11
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova  [dstq+srcq+ 0], m0
    mova  [dstq+srcq+16], m1

    add            srcq, r2mp
    add      grain_lutq, 82*2
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
    ; bit 16 of h toggles each row so exactly two rows take the hv path
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    or        dword r8m, 4
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov              r5, r5m
    add          offxyd, 16
    add dword [rsp+8*mmsize+1*gprsize], 16      ; top_offxy += 16
    mov            srcq, r9mp
    add            srcq, r4mp
    add            srcq, r4mp
%else
    add          offxyd, 16
    add      top_offxyd, 16
    mov        src_bakq, r9mp
    lea            srcq, [src_bakq+wq*2]
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET
%if ARCH_X86_32
    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
2207%endif 2208 2209%macro FGUV_FN 3 ; name, ss_hor, ss_ver 2210INIT_XMM ssse3 2211%if ARCH_X86_32 2212%if STACK_ALIGNMENT < mmsize 2213cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ 2214 tmp, src, scaling, h, fg_data, picptr, unused 2215 mov r0, r0m 2216 mov r1, r1m 2217 mov r2, r2m 2218 mov r4, r3m 2219 mov r3, r4m 2220 mov r5, r5m 2221%define r0m [rsp+8*mmsize+ 3*gprsize] 2222%define r1m [rsp+8*mmsize+ 4*gprsize] 2223%define r2m [rsp+8*mmsize+ 5*gprsize] 2224%define r3m [rsp+8*mmsize+ 6*gprsize] 2225%define r4m [rsp+8*mmsize+ 7*gprsize] 2226%define r5m [rsp+8*mmsize+ 8*gprsize] 2227 mov r0m, r0 2228 mov r2m, r2 2229 mov r4m, r3 2230 mov r5m, r5 2231 2232 mov r0, r6m 2233 mov r2, r7m 2234 mov r3, r8m 2235 mov r5, r9m 2236%define r6m [rsp+8*mmsize+ 9*gprsize] 2237%define r7m [rsp+8*mmsize+10*gprsize] 2238%define r8m [rsp+8*mmsize+11*gprsize] 2239%define r9m [rsp+8*mmsize+12*gprsize] 2240 mov r6m, r0 2241 mov r7m, r2 2242 mov r8m, r3 2243 mov r9m, r5 2244 2245 mov r2, r10m 2246 mov r3, r11m 2247 mov r5, r12m 2248 mov r0, r13m 2249%define r10m [rsp+8*mmsize+13*gprsize] 2250%define r11m [rsp+8*mmsize+14*gprsize] 2251%define r12m [rsp+8*mmsize+15*gprsize] 2252 mov r10m, r2 2253 mov r11m, r3 2254 mov r12m, r5 2255 2256 SPLATW m2, r13m 2257%else 2258cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ 2259 tmp, src, scaling, h, fg_data, picptr, unused 2260 mov srcq, srcm 2261 mov fg_dataq, r3m 2262%endif 2263 LEA r5, $$ 2264%define base r5-$$ 2265 2266 DECLARE_REG_TMP 0, 2, 3 2267%else 2268cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2269 grain_lut, h, sby, luma, lstride, uv_pl, is_id 2270%define base r8-pb_mask 2271 lea r8, [pb_mask] 2272 2273 DECLARE_REG_TMP 9, 10, 11 2274%endif 2275 mov r6d, [fg_dataq+FGData.scaling_shift] 2276 SPLATW m3, [base+mul_bits+r6*2-14] 2277 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2278%if STACK_ALIGNMENT >= mmsize 2279 mov t0d, r13m ; bdmax 
2280%endif 2281 sar t0d, 11 ; is_12bpc 2282 inc t0d 2283 mov t1d, r6d 2284 imul t1d, t0d 2285 dec t0d 2286 SPLATW m5, [base+min+t1*2] 2287 lea t1d, [t0d*3] 2288 mov t2d, r12m 2289 inc t2d 2290 imul r6d, t2d 2291 add t1d, r6d 2292 SPLATW m4, [base+max+t1*2] 2293%if STACK_ALIGNMENT >= mmsize 2294 SPLATW m2, r13m 2295%endif 2296 2297 SCRATCH 2, 10, 2 2298 SCRATCH 3, 11, 3 2299 SCRATCH 4, 12, 4 2300 SCRATCH 5, 13, 5 2301 2302%define mzero m7 2303 2304%if %3 2305 SPLATD m2, [base+pw_23_22] 2306%endif 2307 2308%if ARCH_X86_32 2309 mov scalingq, r5m 2310 mov r5m, r5 2311%else 2312 mov r13mp, strideq 2313%endif 2314 2315 pcmpeqw m0, m0 2316 psraw m1, m10, 1 2317 pxor m0, m1 2318 2319 SCRATCH 0, 8, 0 2320 SCRATCH 1, 9, 1 2321 2322 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2323 jne .csfl 2324 2325%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v 2326%if ARCH_X86_32 2327 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2328 2329 DECLARE_REG_TMP 0 2330%else 2331 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2332 2333 DECLARE_REG_TMP 9 2334%endif 2335 2336%if %1 2337 mov r6d, r11m 2338 SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] 2339 SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2340 punpcklwd m6, m1, m0 2341 SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] 2342 SPLATD m7, [base+pw_4+t0*4] 2343 pmullw m5, m7 2344%else 2345 SPLATD m6, [base+pd_16] 2346%if %2 2347 mova m5, [base+pw_23_22] 2348%else 2349 mova m5, [base+pw_27_17_17_27] 2350%endif 2351%endif 2352 2353 SCRATCH 6, 14, 6 2354 SCRATCH 5, 15, 7 2355 2356%if ARCH_X86_32 2357 DECLARE_REG_TMP 0 2358%else 2359 DECLARE_REG_TMP 7 2360%endif 2361 2362 mov sbyd, r8m 2363 mov t0d, [fg_dataq+FGData.overlap_flag] 2364 test t0d, t0d 2365 jz %%no_vertical_overlap 2366 test sbyd, sbyd 2367 jnz %%vertical_overlap 2368 2369%%no_vertical_overlap: 2370 mov r8m, t0d 2371%if ARCH_X86_32 2372 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap 2373 imul seed, (173 << 24) 
| 37 2374%else 2375 imul seed, sbyd, (173 << 24) | 37 2376%endif 2377 add seed, (105 << 24) | 178 2378 rol seed, 8 2379 movzx seed, seew 2380 xor seed, [fg_dataq+FGData.seed] 2381%if ARCH_X86_32 2382 mov r3m, seed 2383 2384 DEFINE_ARGS dst, src, scaling, see, w, picptr, luma 2385 2386 mov dstq, r0mp 2387 mov lumaq, r9mp 2388 mov wq, r4m 2389 lea r3, [srcq+wq*2] 2390 mov r1mp, r3 2391 lea r3, [dstq+wq*2] 2392 mov r11mp, r3 2393 lea r3, [lumaq+wq*(2<<%2)] 2394 mov r12mp, r3 2395%if %3 2396 shl r10mp, 1 2397%endif 2398%else 2399 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2400 unused2, unused3, see, unused4, unused5, unused6, luma, lstride 2401 2402 mov lstrideq, r10mp 2403%if %3 2404 add lstrideq, lstrideq 2405%endif 2406 mov lumaq, r9mp 2407 lea r10, [srcq+wq*2] 2408 lea r11, [dstq+wq*2] 2409 lea r12, [lumaq+wq*(2<<%2)] 2410 mov r10mp, r10 2411 mov r11mp, r11 2412 mov r12mp, r12 2413%endif 2414 neg wq 2415%if ARCH_X86_32 2416 mov r4mp, wq 2417%endif 2418 2419%%loop_x: 2420%if ARCH_X86_32 2421 mov seed, r3m 2422%endif 2423 2424 mov r6d, seed 2425 or seed, 0xEFF4 2426 shr r6d, 1 2427 test seeb, seeh 2428 lea seed, [r6+0x8000] 2429 cmovp seed, r6d ; updated seed 2430 2431%if ARCH_X86_32 2432 mov r3m, seed 2433 2434 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2435 2436 mov offxd, offyd 2437%else 2438 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2439 offx, offy, see, unused1, unused2, unused3, luma, lstride 2440 2441 mov offxd, seed 2442 mov offyd, seed 2443%endif 2444 ror offyd, 8 2445 shr offxd, 12 2446 and offyd, 0xf 2447 imul offyd, 164>>%3 2448 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2449 2450%if ARCH_X86_32 2451 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2452%else 2453 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2454 h, offxy, see, unused1, unused2, unused3, luma, lstride 2455%endif 2456 2457%if %2 == 0 2458%%loop_x_odd: 2459%endif 2460 mov hd, 
r7m 2461 mov grain_lutq, grain_lutmp 2462%%loop_y: 2463 ; src 2464 mova m0, [srcq] 2465 mova m1, [srcq+16] ; m0-1: src as word 2466 2467 ; luma_src 2468 pxor mzero, mzero 2469%if ARCH_X86_32 2470 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut 2471 2472 mov lumaq, r9m 2473%endif 2474 mova m4, [lumaq+ 0] 2475 mova m6, [lumaq+(16<<%2)] 2476%if %2 2477 phaddw m4, [lumaq+16] 2478 phaddw m6, [lumaq+48] 2479%endif 2480%if ARCH_X86_32 2481 add lumaq, r10mp 2482 mov r9m, lumaq 2483%endif 2484%if %2 2485 pavgw m4, mzero 2486 pavgw m6, mzero 2487%endif 2488 2489%if %1 2490 punpckhwd m3, m4, m0 2491 punpcklwd m4, m0 2492 punpckhwd m5, m6, m1 2493 punpcklwd m6, m1 ; { luma, chroma } 2494 REPX {pmaddwd x, m14}, m3, m4, m5, m6 2495 REPX {psrad x, 6}, m3, m4, m5, m6 2496 packssdw m4, m3 2497 packssdw m6, m5 2498 REPX {paddw x, m15}, m4, m6 2499 REPX {pmaxsw x, mzero}, m4, m6 2500 REPX {pminsw x, m10}, m4, m6 ; clip_pixel() 2501%else 2502 REPX {pand x, m10}, m4, m6 2503%endif 2504 2505 ; scaling[luma_src] 2506%if ARCH_X86_32 2507 vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 2508 vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 2509%else 2510 vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 2511 vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 2512%endif 2513 REPX {psrlw x, 8}, m3, m5 2514 2515 ; grain = grain_lut[offy+y][offx+x] 2516 movu m4, [grain_lutq+offxyq*2] 2517 movu m6, [grain_lutq+offxyq*2+16] 2518 2519 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2520 REPX {pmullw x, m11}, m3, m5 2521 pmulhrsw m4, m3 2522 pmulhrsw m6, m5 2523 2524 ; dst = clip_pixel(src, noise) 2525 paddw m0, m4 2526 paddw m1, m6 2527 pmaxsw m0, m13 2528 pmaxsw m1, m13 2529 pminsw m0, m12 2530 pminsw m1, m12 2531 movifnidn dstq, dstmp 2532 mova [dstq+ 0], m0 2533 mova [dstq+16], m1 2534 2535%if ARCH_X86_32 2536 add srcq, r2mp 2537 add dstq, r2mp 2538 mov dstmp, dstq 2539%else 2540 add srcq, r13mp 2541 add dstq, r13mp 2542 add lumaq, lstrideq 2543%endif 2544 add grain_lutq, 82*2 2545 dec hd 
2546 jg %%loop_y 2547 2548%if ARCH_X86_32 2549 DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma 2550 2551 mov wq, r4mp 2552%endif 2553 add wq, 16 2554 jge %%end 2555%if ARCH_X86_32 2556 mov srcq, r1mp 2557%else 2558 mov srcq, r10mp 2559%endif 2560 mov dstq, r11mp 2561 mov lumaq, r12mp 2562 lea srcq, [srcq+wq*2] 2563 lea dstq, [dstq+wq*2] 2564 lea lumaq, [lumaq+wq*(2<<%2)] 2565%if ARCH_X86_32 2566 mov r0m, dstq 2567 mov r9m, lumaq 2568 mov r4m, wq 2569%endif 2570%if %2 == 0 2571 btc dword r8m, 2 2572 jc %%next_blk 2573 add offxyd, 16 2574 test dword r8m, 2 2575 jz %%loop_x_odd 2576%if ARCH_X86_32 2577 add dword [rsp+8*mmsize+1*gprsize], 16 2578%else 2579 add r11d, 16 2580%endif 2581 jmp %%loop_x_odd_v_overlap 2582%%next_blk: 2583%endif 2584 test dword r8m, 1 2585 je %%loop_x 2586 2587 ; r8m = sbym 2588 test dword r8m, 2 2589 jnz %%loop_x_hv_overlap 2590 2591 ; horizontal overlap (without vertical overlap) 2592%%loop_x_h_overlap: 2593%if ARCH_X86_32 2594 add offxyd, 16 2595 mov [rsp+8*mmsize+0*gprsize], offxyd 2596 2597 DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut 2598 2599 mov seed, r3m 2600%endif 2601 mov r6d, seed 2602 or seed, 0xEFF4 2603 shr r6d, 1 2604 test seeb, seeh 2605 lea seed, [r6+0x8000] 2606 cmovp seed, r6d ; updated seed 2607 2608%if ARCH_X86_32 2609 mov r3m, seed 2610 2611 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2612 2613 mov offxd, offyd 2614%else 2615 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 2616 offx, offy, see, left_offxy, unused1, unused2, luma, lstride 2617 2618 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2619 mov offxd, seed 2620 mov offyd, seed 2621%endif 2622 ror offyd, 8 2623 shr offxd, 12 2624 and offyd, 0xf 2625 imul offyd, 164>>%3 2626 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2627 2628%if ARCH_X86_32 2629 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2630%else 2631 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, 
grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
%endif

    mov              hd, r7m              ; reload row count for this block
    mov      grain_lutq, grain_lutmp
; Row loop for blocks with horizontal (left-edge) overlap only:
; the leftmost grain sample of each row is blended with the previous
; block's right column, then noise = scaling[luma] * grain is added to src.
; Macro params (declared before this chunk): %1 selects the !csfl path
; (chroma derived from luma via the m14/m15 multipliers), %2/%3 are the
; horizontal/vertical chroma subsampling flags -- NOTE(review): inferred
; from usage below ((16<<%2) luma stride, 164>>%3 row stride); confirm
; against the macro header.
%%loop_y_h_overlap:
    mova             m0, [srcq]
    mova             m1, [srcq+16]

    ; luma_src
    pxor          mzero, mzero
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
    mov           lumaq, r9m
%endif
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+(16<<%2)]
%if %2
    ; horizontal subsampling: sum adjacent luma pairs...
    phaddw           m4, [lumaq+16]
    phaddw           m6, [lumaq+48]
%endif
%if ARCH_X86_32
    add           lumaq, r10mp            ; advance luma by its stride (r10mp)
    mov             r9m, lumaq
%endif
%if %2
    ; ...then pavgw with zero -> (x+1)>>1, completing the rounded average
    pavgw            m4, mzero
    pavgw            m6, mzero
%endif

%if %1
    ; !csfl: luma_src = clip((luma*mul_y + chroma*mul_uv) >> 6 + offset)
    punpckhwd        m3, m4, m0
    punpcklwd        m4, m0
    punpckhwd        m5, m6, m1
    punpcklwd        m6, m1               ; { luma, chroma }
    REPX {pmaddwd x, m14}, m3, m4, m5, m6
    REPX {psrad  x, 6}, m3, m4, m5, m6
    packssdw         m4, m3
    packssdw         m6, m5
    REPX {paddw  x, m15}, m4, m6
    REPX {pmaxsw x, mzero}, m4, m6
    REPX {pminsw x, m10}, m4, m6          ; clip_pixel()
%else
    REPX {pand   x, m10}, m4, m6          ; csfl: just mask to bitdepth range
%endif

    ; grain = grain_lut[offy+y][offx+x]
    movu             m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+0*gprsize]   ; left_offxy
    movd             m5, [grain_lutq+r5*2]
%else
    movd             m5, [grain_lutq+left_offxyq*2+ 0]
%endif
    punpcklwd        m5, m7               ; {left0, cur0}
%if %1
%if ARCH_X86_32
    mov              r5, r5m              ; restore PIC base pointer
%endif
    ; blend left column: 23/22 weights when subsampled, 27/17/17/27 otherwise
%if %2
    pmaddwd          m5, [PIC_ptr(pw_23_22)]
%else
    pmaddwd          m5, [PIC_ptr(pw_27_17_17_27)]
%endif
    paddd            m5, [PIC_ptr(pd_16)]
%else
    pmaddwd          m5, m15
    paddd            m5, m14
%endif
    psrad            m5, 5                ; round2(w0*left + w1*cur, 5)
    packssdw         m5, m5
    pmaxsw           m5, m8
    pminsw           m5, m9               ; clip blended grain to grain range
    shufps           m5, m7, q3210        ; splice blended sample back into row
    movu             m3, [grain_lutq+offxyq*2+16]

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5, 8, 1
    vpgatherdw       m4, m6, scalingq-1, r0, r5, 8, 1
%else
    vpgatherdw       m7, m4, scalingq-1, r2, r12, 8, 1
    vpgatherdw       m4, m6, scalingq-1, r2, r12, 8, 1
%endif
    REPX {psrlw  x, 8}, m7, m4            ; keep the high byte of each gather

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m4
    pmulhrsw         m5, m7
    pmulhrsw         m3, m4

    ; dst = clip_pixel(src, noise)
    paddw            m0, m5
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

%if ARCH_X86_32
    add            srcq, r2mp
    add            dstq, r2mp
    mov           dstmp, dstq
%else
    add            srcq, r13mp
    add            dstq, r13mp
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*2             ; grain_lut row stride = 82 samples
    dec              hd
    jg %%loop_y_h_overlap

    ; advance to the next 16-pixel column; w counts up from -width to 0
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
    mov              wq, r4mp
%endif
    add              wq, 16
    jge %%end
%if ARCH_X86_32
    mov            srcq, r1mp
%else
    mov            srcq, r10mp
%endif
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
    mov            r0mp, dstq
    mov            r9mp, lumaq
    mov             r4m, wq
%endif

%if %2
    ; r8m = sbym
    test      dword r8m, 2
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap
%else
    ; unsubsampled: 16-wide halves alternate; mark odd phase and pick the
    ; matching loop depending on whether vertical overlap is active
    or        dword r8m, 4
    add          offxyd, 16

    ; r8m = sbym
    test      dword r8m, 2
    jz %%loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r11d, 16               ; top_offxy += 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

%%end:
    RET

; Entry point for superblock rows with vertical (top-edge) overlap.
; Derives (cur_seed << 16) | top_seed for this sb row from the frame seed.
%%vertical_overlap:
    or              t0d, 2
    mov             r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, unused1, unused2, unused3, lstride
%endif

    movzx          sbyd, sbyb
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001

    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    ; mix sby into both 16-bit seed halves (row-dependent seeding)
    imul            t0d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             t0d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             t0d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, t0d
%if ARCH_X86_32
    xor            sbyd, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma

    ; stash seed and per-plane end pointers in stack/arg slots
    mov             r3m, seed
    mov            dstq, r0mp
    mov           lumaq, r9mp
    mov              wq, r4m
    lea              r3, [srcq+wq*2]
    mov            r1mp, r3
    lea              r3, [dstq+wq*2]
    mov           r11mp, r3
    lea              r3, [lumaq+wq*(2<<%2)]
    mov           r12mp, r3
%if %3
    shl           r10mp, 1                ; luma stride in bytes, doubled rows
%endif
%else
    xor            seed, sbyd             ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, unused3, unused4, unused5, luma, lstride

    mov        lstrideq, r10mp
%if %3
    add        lstrideq, lstrideq
%endif
    mov           lumaq, r9mp
    lea             r10, [srcq+wq*2]
    lea             r11, [dstq+wq*2]
    lea             r12, [lumaq+wq*(2<<%2)]
    mov           r10mp, r10
    mov           r11mp, r11
    mov           r12mp, r12
%endif
    neg              wq
%if ARCH_X86_32
    mov             r4m, wq
%endif

; Column loop, vertical overlap only. Updates the paired 16-bit LFSR seeds
; (top and current) in lockstep, then derives the two grain offsets.
%%loop_x_v_overlap:
%if ARCH_X86_32
    mov            seed, r3m
    xor             t0d, t0d
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                   ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                   ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1                ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, unused1, top_offxy, unused2, luma, lstride

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; extract 4-bit x/y offsets for both halves of the seed pair
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
%endif
    movzx   top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

%if %2 == 0
%%loop_x_odd_v_overlap:
%endif
%if %3 == 0
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]  ; row-0/1 vertical weights
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
; Row loop, vertical overlap: each grain sample is a weighted blend of the
; top block's grain and the current block's grain (weights in m2).
%%loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r0, [rsp+mmsize*8+gprsize*1]   ; top_offxy
    movu             m5, [grain_lutq+r0*2]
%else
    movu             m5, [grain_lutq+top_offxyq*2]
%endif
    punpckhwd        m7, m5, m3
    punpcklwd        m5, m3               ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5
%if %1
%if ARCH_X86_32
    mov              r5, r5m
%endif
    REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5
%endif
    REPX {psrad  x, 5}, m7, m5            ; round2(w0*top + w1*cur, 5)
    packssdw         m3, m5, m7
    pmaxsw           m3, m8
    pminsw           m3, m9

    ; grain = grain_lut[offy+y][offx+x]
    movu             m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m5, [grain_lutq+r0*2+16]
%else
    movu             m5, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd        m7, m5, m4
    punpcklwd        m5, m4               ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5
%endif
    REPX {psrad  x, 5}, m7, m5
    packssdw         m4, m5, m7
    pmaxsw           m4, m8
    pminsw           m4, m9

    ; src
    mova             m0, [srcq]
    mova             m1, [srcq+16]

    ; luma_src
    pxor          mzero, mzero
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut

    mov           lumaq, r9mp
%endif
    mova             m5, [lumaq+ 0]
    mova             m6, [lumaq+(16<<%2)]
%if %2
    phaddw           m5, [lumaq+16]
    phaddw           m6, [lumaq+48]
%endif
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
%if %2
    pavgw            m5, mzero
    pavgw            m6, mzero
%endif

%if %1
    punpckhwd        m7, m5, m0
    punpcklwd        m5, m0
    REPX {pmaddwd x, m14}, m7, m5
    REPX {psrad  x, 6}, m7, m5
    packssdw         m5, m7
    punpckhwd        m7, m6, m1
    punpcklwd        m6, m1               ; { luma, chroma }
    REPX {pmaddwd x, m14}, m7, m6
    REPX {psrad  x, 6}, m7, m6
    packssdw         m6, m7
    pxor          mzero, mzero            ; mzero was clobbered above
    REPX {paddw  x, m15}, m5, m6
    REPX {pmaxsw x, mzero}, m5, m6
    REPX {pminsw x, m10}, m5, m6          ; clip_pixel()
%else
    REPX {pand   x, m10}, m5, m6
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m5, scalingq-1, r0, r5, 8, 1
    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
%else
    vpgatherdw       m7, m5, scalingq-1, r10, r12, 8, 1
    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
%endif
    REPX {psrlw  x, 8}, m7, m5

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m5
    pmulhrsw         m3, m7
    pmulhrsw         m4, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

    dec              hw
    jle %%end_y_v_overlap
%if ARCH_X86_32
    add            srcq, r2mp
    add            dstq, r2mp
    mov           dstmp, dstq
%else
    add            srcq, r13mp
    add            dstq, r13mp
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*2
%if %3
    jmp %%loop_y                          ; 4:2:0: only one overlapped row pair
%else
    ; bit 16 of hd is a one-shot flag: first pass switches to the second
    ; weight pair (17/27) and repeats; once toggled back, continue plain loop
    btc              hd, 16
    jc %%loop_y
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
    jmp %%loop_y_v_overlap
%endif

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
%else
    mov            srcq, r10mp
%endif
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
    mov            r0mp, dstq
    mov            r9mp, lumaq
    mov             r4m, wq
%endif

%if %2
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
    btc       dword r8m, 2
    jc %%loop_x_hv_overlap
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r11d, 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

; Column loop for blocks overlapped on both edges (left + top). Needs four
; offsets: cur, left, top and topleft grain positions.
%%loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut

    mov             t0d, [rsp+mmsize*8+gprsize*1]   ; top_offxy
    add          offxyd, 16
    add             t0d, 16
    mov [rsp+mmsize*8+gprsize*0], offxyd            ; left_offxyd
    mov [rsp+mmsize*8+gprsize*2], t0d               ; topleft_offxyd

    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut

    mov            seed, r3m
    xor             t0d, t0d
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; same paired-seed update as %%loop_x_v_overlap
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                   ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                   ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1                ; updated (cur_seed << 16) | top_seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride

    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
%endif
    movzx   top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

%if %3 == 0
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
; Row loop, h+v overlap: first blend left/cur and topleft/top horizontally,
; then blend top/cur vertically, then apply scaled noise as usual.
%%loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+0*gprsize]   ; left_offxy
    mov              r0, [rsp+8*mmsize+1*gprsize]   ; top_offxy
    movd             m5, [grain_lutq+r5*2]
%else
    movd             m5, [grain_lutq+left_offxyq*2]
%endif
    movu             m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+2*gprsize]   ; topleft_offxy
    movu             m4, [grain_lutq+r0*2]
%if %2
    pinsrw           m5, [grain_lutq+r5*2], 2
%else
    movd             m3, [grain_lutq+r5*2]
%endif
%else
    movu             m4, [grain_lutq+top_offxyq*2]
%if %2
    pinsrw           m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
%else
    movd             m3, [grain_lutq+topleft_offxyq*2]
%endif
%endif
%if %2 == 0
    punpckldq        m5, m3
%endif
    punpckldq        m3, m7, m4           ; { cur0/1,top0/1,cur2/3,top2/3 }
    punpcklwd        m5, m3               ; { left/cur0,_/cur1,topleft/top0,_/top1 }
%if %1
%if ARCH_X86_32
    mov              r5, r5m
%endif
%if %2
    movddup          m0, [PIC_ptr(pw_23_22)]
%else
    movddup          m0, [PIC_ptr(pw_27_17_17_27)]
%endif
%else
    pshufd           m0, m15, q1010
%endif
    pmaddwd          m5, m0               ; horizontal blend (left & topleft)
%if %1
    paddd            m5, [PIC_ptr(pd_16)]
%else
    paddd            m5, m14
%endif
    psrad            m5, 5
    packssdw         m5, m5
    pmaxsw           m5, m8
    pminsw           m5, m9
    shufps           m5, m3, q3210        ; cur0/1,top0/1,cur2/3,top2/3
    shufps           m3, m5, m7, q3220    ; cur0-7 post-h_filter
    shufps           m5, m4, q3231        ; top0-7 post-h_filter

    punpckhwd        m7, m5, m3
    punpcklwd        m5, m3               ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m7, m5          ; vertical blend, left half
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
%else
    REPX {paddd x, m14}, m5, m7
%endif
    REPX {psrad  x, 5}, m5, m7
    packssdw         m3, m5, m7
    pmaxsw           m3, m8
    pminsw           m3, m9

    ; right half
    movu             m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m0, [grain_lutq+r0*2+16]
%else
    movu             m0, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd        m1, m0, m4
    punpcklwd        m0, m4               ; {top/cur interleaved}
    REPX {pmaddwd x, m2}, m1, m0
%if %1
    REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
%else
    REPX {paddd x, m14}, m1, m0
%endif
    REPX {psrad  x, 5}, m1, m0
    packssdw         m4, m0, m1
    pmaxsw           m4, m8
    pminsw           m4, m9

    ; src
    mova             m0, [srcq]
    mova             m1, [srcq+16]

    ; luma_src
    pxor          mzero, mzero
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut

    mov           lumaq, r9mp
%endif
    mova             m6, [lumaq+ 0]
    mova             m5, [lumaq+(16<<%2)]
%if %2
    phaddw           m6, [lumaq+16]
    phaddw           m5, [lumaq+48]
%endif
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
%if %2
    pavgw            m6, mzero
    pavgw            m5, mzero
%endif

%if %1
    punpckhwd        m7, m6, m0
    punpcklwd        m6, m0
    REPX {pmaddwd x, m14}, m7, m6
    REPX {psrad  x, 6}, m7, m6
    packssdw         m6, m7
    punpckhwd        m7, m5, m1
    punpcklwd        m5, m1               ; { luma, chroma }
    REPX {pmaddwd x, m14}, m7, m5
    REPX {psrad  x, 6}, m7, m5
    packssdw         m5, m7
    pxor          mzero, mzero            ; mzero was clobbered above
    REPX {paddw  x, m15}, m6, m5
    REPX {pmaxsw x, mzero}, m6, m5
    REPX {pminsw x, m10}, m6, m5          ; clip_pixel()
%else
    REPX {pand   x, m10}, m6, m5
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m6, scalingq-1, r0, r5, 8, 1
    vpgatherdw       m6, m5, scalingq-1, r0, r5, 8, 1
%else
%if %3 == 0
    ; register shortage :)
    push            r12
%endif
    vpgatherdw       m7, m6, scalingq-1, r2, r12, 8, 1
    vpgatherdw       m6, m5, scalingq-1, r2, r12, 8, 1
%if %3 == 0
    pop             r12
%endif
%endif
    REPX {psrlw  x, 8}, m7, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m6
    pmulhrsw         m3, m7
    pmulhrsw         m4, m6

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

%if ARCH_X86_32
    add            srcq, r2mp
    add            dstq, r2mp
    mov           dstmp, dstq
%else
    add            srcq, r13mp
    add            dstq, r13mp
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*2
    dec              hw
%if %3
    jg %%loop_y_h_overlap                 ; 4:2:0: rest of rows are h-overlap only
%else
    jle %%end_y_hv_overlap
    ; same bit-16 toggle as the v_overlap loop: swap in the 17/27 weights
    ; after the first pass, then fall back to h-overlap-only rows
    btc              hd, 16
    jc %%loop_y_h_overlap
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
    jmp %%loop_y_hv_overlap
%%end_y_hv_overlap:
%endif
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
%else
    mov            srcq, r10mp
%endif
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
    mov           dstmp, dstq
    mov            r9mp, lumaq
    mov             r4m, wq
%endif
%if %2
    jmp %%loop_x_hv_overlap
%else
    or        dword r8m, 4
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r11d, 16               ; top_offxy += 16
%endif
    jmp %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro

    ; instantiate both variants: %1=1 (!csfl) and %1=0 (csfl, entered at .csfl)
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif
%endmacro

; layout, sub-x, sub-y
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0