1; Copyright © 2021, VideoLAN and dav1d authors 2; Copyright © 2021, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 32 32pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 33rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 34pw_seed_xor: times 2 dw 0xb524 35 times 2 dw 0x49d8 36pd_16: dd 16 37pd_m65536: dd ~0xffff 38pb_1: times 4 db 1 39hmul_bits: dw 32768, 16384, 8192, 4096 40round: dw 2048, 1024, 512 41mul_bits: dw 256, 128, 64, 32, 16 42round_vals: dw 32, 64, 128, 256, 512, 1024 43max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 44min: dw 0, 16*4, 16*16 45pw_27_17_17_27: dw 27, 17, 17, 27 46; these two should be next to each other 47pw_4: times 2 dw 4 48pw_16: times 2 dw 16 49pw_23_22: dw 23, 22, 0, 32 50 51%macro JMP_TABLE 1-* 52 %xdefine %1_table %%table 53 %xdefine %%base %1_table 54 %xdefine %%prefix mangle(private_prefix %+ _%1) 55 %%table: 56 %rep %0 - 1 57 dd %%prefix %+ .ar%2 - %%base 58 %rotate 1 59 %endrep 60%endmacro 61 62JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 63JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 64JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 65JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 66 67struc FGData 68 .seed: resd 1 69 .num_y_points: resd 1 70 .y_points: resb 14 * 2 71 .chroma_scaling_from_luma: resd 1 72 .num_uv_points: resd 2 73 .uv_points: resb 2 * 10 * 2 74 .scaling_shift: resd 1 75 .ar_coeff_lag: resd 1 76 .ar_coeffs_y: resb 24 77 .ar_coeffs_uv: resb 2 * 28 ; includes padding 78 .ar_coeff_shift: resq 1 79 .grain_scale_shift: resd 1 80 .uv_mult: resd 2 81 .uv_luma_mult: resd 2 82 .uv_offset: resd 2 83 .overlap_flag: resd 1 84 .clip_to_restricted_range: resd 1 85endstruc 86 87cextern gaussian_sequence 88 89SECTION .text 90 91%macro REPX 2-* 92 %xdefine %%f(x) %1 93%rep %0 - 1 94 %rotate 1 95 %%f(%1) 96%endrep 97%endmacro 98 99%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) 100 101INIT_YMM avx2 102cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, 
bdmax 103 lea r4, [pb_mask] 104%define base r4-pb_mask 105 movq xm1, [base+rnd_next_upperbit_mask] 106 movq xm4, [base+mul_bits] 107 movq xm7, [base+hmul_bits] 108 mov r3d, [fg_dataq+FGData.grain_scale_shift] 109 lea r6d, [bdmaxq+1] 110 shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc 111 sub r3, r6 112 vpbroadcastw xm8, [base+round+r3*2-2] 113 mova xm5, [base+pb_mask] 114 vpbroadcastw xm0, [fg_dataq+FGData.seed] 115 vpbroadcastd xm9, [base+pd_m65536] 116 mov r3, -73*82*2 117 sub bufq, r3 118 lea r6, [gaussian_sequence] 119.loop: 120 pand xm2, xm0, xm1 121 psrlw xm3, xm2, 10 122 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 123 pmullw xm2, xm4 ; bits 0x0f00 are set 124 pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds 125 psllq xm6, xm2, 30 126 por xm2, xm6 127 psllq xm6, xm2, 15 128 por xm2, xm6 ; aggregate each bit into next seed's high bit 129 pmulhuw xm3, xm0, xm7 130 por xm2, xm3 ; 4 next output seeds 131 pshuflw xm0, xm2, q3333 132 psrlw xm2, 5 133 pmovzxwd xm3, xm2 134 mova xm6, xm9 135 vpgatherdd xm2, [r6+xm3*2], xm6 136 pandn xm2, xm9, xm2 137 packusdw xm2, xm2 138 paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 139 ; shifts by 0, which pmulhrsw does not support 140 pmulhrsw xm2, xm8 141 movq [bufq+r3], xm2 142 add r3, 4*2 143 jl .loop 144 145 ; auto-regression code 146 movsxd r3, [fg_dataq+FGData.ar_coeff_lag] 147 movsxd r3, [base+generate_grain_y_16bpc_avx2_table+r3*4] 148 lea r3, [r3+base+generate_grain_y_16bpc_avx2_table] 149 jmp r3 150 151.ar1: 152 DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 153 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 154 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 155 movd xm4, [fg_dataq+FGData.ar_coeffs_y] 156 DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 157 pinsrb xm4, [pb_1], 3 158 pmovsxbw xm4, xm4 159 pshufd xm5, xm4, q1111 160 pshufd xm4, xm4, q0000 161 vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd 162 sub bufq, 2*(82*73-(82*3+79)) 163 mov hd, 70 164 sar maxd, 1 165 mov 
mind, maxd 166 xor mind, -1 167.y_loop_ar1: 168 mov xq, -76 169 movsx val3d, word [bufq+xq*2-2] 170.x_loop_ar1: 171 movu xm0, [bufq+xq*2-82*2-2] ; top/left 172 psrldq xm2, xm0, 2 ; top 173 psrldq xm1, xm0, 4 ; top/right 174 punpcklwd xm0, xm2 175 punpcklwd xm1, xm3 176 pmaddwd xm0, xm4 177 pmaddwd xm1, xm5 178 paddd xm0, xm1 179.x_loop_ar1_inner: 180 movd val0d, xm0 181 psrldq xm0, 4 182 imul val3d, cf3d 183 add val3d, val0d 184 sarx val3d, val3d, shiftd 185 movsx val0d, word [bufq+xq*2] 186 add val3d, val0d 187 cmp val3d, maxd 188 cmovg val3d, maxd 189 cmp val3d, mind 190 cmovl val3d, mind 191 mov word [bufq+xq*2], val3w 192 ; keep val3d in-place as left for next x iteration 193 inc xq 194 jz .x_loop_ar1_end 195 test xq, 3 196 jnz .x_loop_ar1_inner 197 jmp .x_loop_ar1 198 199.x_loop_ar1_end: 200 add bufq, 82*2 201 dec hd 202 jg .y_loop_ar1 203.ar0: 204 RET 205 206.ar2: 207 DEFINE_ARGS buf, fg_data, bdmax, shift 208 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 209 vpbroadcastw xm14, [base+round_vals-12+shiftq*2] 210 movq xm8, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 211 vinserti128 m8, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 212 pxor m9, m9 213 punpcklwd xm14, xm9 214 pcmpgtb m9, m8 215 punpcklbw m8, m9 ; cf5-11,0-4 216 vpermq m9, m8, q3333 ; cf4 217 psrldq xm10, xm8, 6 ; cf8-11 218 vpblendw xm9, xm10, 11111110b ; cf4,9-11 219 pshufd m12, m8, q0000 ; cf[5,6], cf[0-1] 220 pshufd m11, m8, q1111 ; cf[7,8], cf[2-3] 221 pshufd xm13, xm9, q1111 ; cf[10,11] 222 pshufd xm10, xm9, q0000 ; cf[4,9] 223 sar bdmaxd, 1 224 movd xm15, bdmaxd 225 pcmpeqd xm7, xm7 226 vpbroadcastd xm15, xm15 ; max_grain 227 pxor xm7, xm15 ; min_grain 228 sub bufq, 2*(82*73-(82*3+79)) 229 DEFINE_ARGS buf, fg_data, h, x 230 mov hd, 70 231.y_loop_ar2: 232 mov xq, -76 233 234.x_loop_ar2: 235 movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 236 vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] 237 psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] 238 psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] 239 psrldq m3, m0, 6 ; 
y=-1/-2,x=[+1,+5] 240 241 vextracti128 xm4, m0, 1 ; y=-2,x=[-2,+5] 242 punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 243 punpckhwd xm4, xm0 ; y=-2/-1 interleaved, x=[+2,+5] 244 punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 245 246 pmaddwd m2, m11 247 pmaddwd m0, m12 248 pmaddwd xm4, xm10 249 250 paddd m0, m2 251 vextracti128 xm2, m0, 1 252 paddd xm4, xm0 253 paddd xm2, xm14 254 paddd xm2, xm4 255 256 movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] 257 pshufd xm4, xm0, q3321 258 pmovsxwd xm4, xm4 ; in dwords, y=0,x=[0,3] 259.x_loop_ar2_inner: 260 pmaddwd xm3, xm0, xm13 261 paddd xm3, xm2 262 psrldq xm2, 4 ; shift top to next pixel 263 psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 264 ; skip packssdw because we only care about one value 265 paddd xm3, xm4 266 pminsd xm3, xm15 267 pmaxsd xm3, xm7 268 pextrw [bufq+xq*2], xm3, 0 269 psrldq xm4, 4 270 pslldq xm3, 2 271 psrldq xm0, 2 272 vpblendw xm0, xm3, 0010b 273 inc xq 274 jz .x_loop_ar2_end 275 test xq, 3 276 jnz .x_loop_ar2_inner 277 jmp .x_loop_ar2 278 279.x_loop_ar2_end: 280 add bufq, 82*2 281 dec hd 282 jg .y_loop_ar2 283 RET 284 285.ar3: 286 DEFINE_ARGS buf, fg_data, bdmax, shift 287%if WIN64 288 mov r6, rsp 289 and rsp, ~31 290 sub rsp, 64 291 %define tmp rsp 292%elif STACK_ALIGNMENT < 32 293 mov r6, rsp 294 and r6, ~31 295 %define tmp r6-64 296%else 297 %define tmp rsp+stack_offset-88 298%endif 299 sar bdmaxd, 1 300 movd xm15, bdmaxd 301 pcmpeqd xm13, xm13 302 vpbroadcastd xm15, xm15 ; max_grain 303 pxor xm13, xm15 ; min_grain 304 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 305 vpbroadcastw m14, [base+round_vals+shiftq*2-12] 306 movq xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 307 movd xm1, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 308 pinsrb xm0, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 309 pinsrb xm1, [pb_1], 3 ; cf14-16,pb_1 310 movd xm2, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 311 vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 312 vinserti128 m1, 
[fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 313 punpcklbw m0, m0 ; sign-extension 314 punpcklbw m1, m1 ; sign-extension 315 punpcklbw xm2, xm2 316 REPX {psraw x, 8}, m0, m1, xm2 317 318 pshufd m8, m0, q0000 ; cf[0,1] | cf[7,8] 319 pshufd m9, m0, q1111 ; cf[2,3] | cf[9,10] 320 pshufd m10, m0, q2222 ; cf[4,5] | cf[11,12] 321 pshufd xm11, xm0, q3333 ; cf[6,13] 322 323 pshufd m3, m1, q0000 ; cf[14,15] | cf[17,18] 324 pshufd m4, m1, q1111 ; cf[16],pw_1 | cf[19,20] 325 mova [tmp+0*32], m3 326 mova [tmp+1*32], m4 327 328 paddw xm5, xm14, xm14 329 vpblendw xm12, xm2, xm5, 00001000b 330 331 DEFINE_ARGS buf, fg_data, h, x 332 sub bufq, 2*(82*73-(82*3+79)) 333 mov hd, 70 334.y_loop_ar3: 335 mov xq, -76 336 337.x_loop_ar3: 338 movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] 339 movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] 340 movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] 341 vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] 342 vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] 343 vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] 344 345 palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] 346 palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] 347 punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] 348 punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] 349 palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] 350 vextracti128 xm7, m1, 1 351 punpcklwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] 352 353 psrldq m3, m2, 2 354 psrldq m4, m2, 4 355 psrldq m7, m2, 6 356 vpblendd m7, m14, 00001111b ; rounding constant 357 punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] 358 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] 359 punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] 360 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] 361 362 pmaddwd m0, m8 363 pmaddwd m6, m9 364 pmaddwd m5, m10 365 pmaddwd xm1, xm11 366 pmaddwd m2, [tmp+0*32] 367 pmaddwd m4, [tmp+1*32] 368 369 paddd m0, m6 370 paddd m5, m2 371 paddd m0, m4 372 paddd m0, m5 373 vextracti128 xm4, m0, 1 374 paddd xm0, 
xm1 375 paddd xm0, xm4 376 377 movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] 378.x_loop_ar3_inner: 379 pmaddwd xm2, xm1, xm12 380 pshufd xm3, xm2, q1111 381 paddd xm2, xm3 ; left+cur 382 paddd xm2, xm0 ; add top 383 psrldq xm0, 4 384 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 385 ; skip packssdw because we only care about one value 386 pminsd xm2, xm15 387 pmaxsd xm2, xm13 388 pextrw [bufq+xq*2], xm2, 0 389 pslldq xm2, 4 390 psrldq xm1, 2 391 vpblendw xm1, xm2, 0100b 392 inc xq 393 jz .x_loop_ar3_end 394 test xq, 3 395 jnz .x_loop_ar3_inner 396 jmp .x_loop_ar3 397 398.x_loop_ar3_end: 399 add bufq, 82*2 400 dec hd 401 jg .y_loop_ar3 402%if WIN64 403 mov rsp, r6 404%endif 405 RET 406 407%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y 408INIT_XMM avx2 409cglobal generate_grain_uv_%1_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax 410%define base r8-pb_mask 411 lea r8, [pb_mask] 412 movifnidn bdmaxd, bdmaxm 413 movq xm1, [base+rnd_next_upperbit_mask] 414 movq xm4, [base+mul_bits] 415 movq xm7, [base+hmul_bits] 416 mov r5d, [fg_dataq+FGData.grain_scale_shift] 417 lea r6d, [bdmaxq+1] 418 shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc 419 sub r5, r6 420 vpbroadcastw xm8, [base+round+r5*2-2] 421 mova xm5, [base+pb_mask] 422 vpbroadcastw xm0, [fg_dataq+FGData.seed] 423 vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] 424 pxor xm0, xm9 425 vpbroadcastd xm9, [base+pd_m65536] 426 lea r6, [gaussian_sequence] 427%if %2 428 mov r7d, 73-35*%3 429 add bufq, 44*2 430.loop_y: 431 mov r5, -44 432%else 433 mov r5, -82*73 434 add bufq, 2*82*73 435%endif 436.loop_x: 437 pand xm2, xm0, xm1 438 psrlw xm3, xm2, 10 439 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 440 pmullw xm2, xm4 ; bits 0x0f00 are set 441 pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds 442 psllq xm6, xm2, 30 443 por xm2, xm6 444 psllq xm6, xm2, 15 445 por xm2, xm6 ; aggregate each bit into next seed's high bit 446 pmulhuw xm3, xm0, xm7 447 por xm2, xm3 ; 4 next output seeds 448 pshuflw xm0, xm2, q3333 449 psrlw xm2, 5 450 
pmovzxwd xm3, xm2 451 mova xm6, xm9 452 vpgatherdd xm2, [r6+xm3*2], xm6 453 pandn xm2, xm9, xm2 454 packusdw xm2, xm2 455 paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 456 ; shifts by 0, which pmulhrsw does not support 457 pmulhrsw xm2, xm8 458 movq [bufq+r5*2], xm2 459 add r5, 4 460 jl .loop_x 461%if %2 462 add bufq, 82*2 463 dec r7d 464 jg .loop_y 465%endif 466 467 ; auto-regression code 468 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 469 movsxd r5, [base+generate_grain_uv_%1_16bpc_avx2_table+r5*4] 470 lea r5, [r5+base+generate_grain_uv_%1_16bpc_avx2_table] 471 jmp r5 472 473.ar0: 474 INIT_YMM avx2 475 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 476 imul uvd, 28 477 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 478 movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 479 vpbroadcastw m3, [base+hmul_bits+shiftq*2-10] 480 sar bdmaxd, 1 481 movd xm14, bdmaxd 482 pcmpeqw m7, m7 483 vpbroadcastw m14, xm14 ; max_gain 484 pxor m7, m14 ; min_grain 485 DEFINE_ARGS buf, bufy, h, x 486 pmovsxbw xm4, xm4 487%if %2 488 vpbroadcastw m6, [hmul_bits+2+%3*2] 489%endif 490 vpbroadcastw m4, xm4 491 pxor m5, m5 492%if %2 493 sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) 494%else 495 sub bufq, 2*(82*70-3) 496%endif 497 add bufyq, 2*(3+82*3) 498 mov hd, 70-35*%3 499.y_loop_ar0: 500%if %2 501 ; first 32 pixels 502 movu xm8, [bufyq] 503 movu xm10, [bufyq+ 16] 504%if %3 505 movu xm9, [bufyq+82*2] 506 movu xm11, [bufyq+82*2+16] 507%endif 508 vinserti128 m8, [bufyq+ 32], 1 509 vinserti128 m10, [bufyq+ 48], 1 510%if %3 511 vinserti128 m9, [bufyq+82*2+32], 1 512 vinserti128 m11, [bufyq+82*2+48], 1 513 paddw m8, m9 514 paddw m10, m11 515%endif 516 phaddw m8, m10 517 movu xm10, [bufyq+ 64] 518 movu xm12, [bufyq+ 80] 519%if %3 520 movu xm11, [bufyq+82*2+64] 521 movu xm13, [bufyq+82*2+80] 522%endif 523 vinserti128 m10, [bufyq+ 96], 1 524 vinserti128 m12, [bufyq+ 112], 1 525%if %3 526 vinserti128 m11, [bufyq+82*2+96], 1 527 vinserti128 m13, [bufyq+82*2+112], 1 528 paddw m10, m11 529 paddw m12, 
m13 530%endif 531 phaddw m10, m12 532 pmulhrsw m8, m6 533 pmulhrsw m10, m6 534%else 535 xor xd, xd 536.x_loop_ar0: 537 movu m8, [bufyq+xq*2] 538 movu m10, [bufyq+xq*2+32] 539%endif 540 punpckhwd m9, m8, m5 541 punpcklwd m8, m5 542 punpckhwd m11, m10, m5 543 punpcklwd m10, m5 544 REPX {pmaddwd x, m4}, m8, m9, m10, m11 545 REPX {psrad x, 5}, m8, m9, m10, m11 546 packssdw m8, m9 547 packssdw m10, m11 548 REPX {pmulhrsw x, m3}, m8, m10 549%if %2 550 paddw m8, [bufq+ 0] 551 paddw m10, [bufq+32] 552%else 553 paddw m8, [bufq+xq*2+ 0] 554 paddw m10, [bufq+xq*2+32] 555%endif 556 pminsw m8, m14 557 pminsw m10, m14 558 pmaxsw m8, m7 559 pmaxsw m10, m7 560%if %2 561 movu [bufq+ 0], m8 562 movu [bufq+32], m10 563 564 ; last 6 pixels 565 movu xm8, [bufyq+32*4] 566 movu xm10, [bufyq+32*4+16] 567%if %3 568 paddw xm8, [bufyq+32*4+82*2] 569 paddw xm10, [bufyq+32*4+82*2+16] 570%endif 571 phaddw xm8, xm10 572 pmulhrsw xm8, xm6 573 punpckhwd xm9, xm8, xm5 574 punpcklwd xm8, xm5 575 REPX {pmaddwd x, xm4}, xm8, xm9 576 REPX {psrad x, 5}, xm8, xm9 577 packssdw xm8, xm9 578 pmulhrsw xm8, xm3 579 movu xm0, [bufq+32*2] 580 paddw xm8, xm0 581 pminsw xm8, xm14 582 pmaxsw xm8, xm7 583 vpblendw xm0, xm8, xm0, 11000000b 584 movu [bufq+32*2], xm0 585%else 586 movu [bufq+xq*2+ 0], m8 587 movu [bufq+xq*2+32], m10 588 add xd, 32 589 cmp xd, 64 590 jl .x_loop_ar0 591 592 ; last 12 pixels 593 movu m8, [bufyq+64*2] 594 punpckhwd m9, m8, m5 595 punpcklwd m8, m5 596 REPX {pmaddwd x, m4}, m8, m9 597 REPX {psrad x, 5}, m8, m9 598 packssdw m8, m9 599 pmulhrsw m8, m3 600 movu m0, [bufq+64*2] 601 paddw m8, m0 602 pminsw m8, m14 603 pmaxsw m8, m7 604 vpblendd m0, m8, m0, 11000000b 605 movu [bufq+64*2], m0 606%endif 607 608 add bufq, 82*2 609 add bufyq, 82*2<<%3 610 dec hd 611 jg .y_loop_ar0 612 RET 613 614.ar1: 615 INIT_XMM avx2 616 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift 617 imul uvd, 28 618 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 619 movsx cf3d, byte 
[fg_dataq+FGData.ar_coeffs_uv+uvq+3] 620 movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 621 pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 622 DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift 623 pmovsxbw xm4, xm4 624 pshufd xm5, xm4, q1111 625 pshufd xm4, xm4, q0000 626 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd 627 vpbroadcastw xm6, [hmul_bits+2+%3*2] 628 vpbroadcastd xm3, xm3 629%if %2 630 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 631%else 632 sub bufq, 2*(82*69+3) 633%endif 634 add bufyq, 2*(79+82*3) 635 mov hd, 70-35*%3 636 sar maxd, 1 637 mov mind, maxd 638 xor mind, -1 639.y_loop_ar1: 640 mov xq, -(76>>%2) 641 movsx val3d, word [bufq+xq*2-2] 642.x_loop_ar1: 643 movu xm0, [bufq+xq*2-82*2-2] ; top/left 644%if %2 645 movu xm8, [bufyq+xq*4] 646%else 647 movq xm8, [bufyq+xq*2] 648%endif 649 psrldq xm2, xm0, 2 ; top 650 psrldq xm1, xm0, 4 ; top/right 651%if %2 652%if %3 653 phaddw xm8, [bufyq+xq*4+82*2] 654 pshufd xm9, xm8, q3232 655 paddw xm8, xm9 656%else 657 phaddw xm8, xm8 658%endif 659 pmulhrsw xm8, xm6 660%endif 661 punpcklwd xm0, xm2 662 punpcklwd xm1, xm8 663 pmaddwd xm0, xm4 664 pmaddwd xm1, xm5 665 paddd xm0, xm1 666 paddd xm0, xm3 667.x_loop_ar1_inner: 668 movd val0d, xm0 669 psrldq xm0, 4 670 imul val3d, cf3d 671 add val3d, val0d 672 sarx val3d, val3d, shiftd 673 movsx val0d, word [bufq+xq*2] 674 add val3d, val0d 675 cmp val3d, maxd 676 cmovg val3d, maxd 677 cmp val3d, mind 678 cmovl val3d, mind 679 mov word [bufq+xq*2], val3w 680 ; keep val3d in-place as left for next x iteration 681 inc xq 682 jz .x_loop_ar1_end 683 test xq, 3 684 jnz .x_loop_ar1_inner 685 jmp .x_loop_ar1 686 687.x_loop_ar1_end: 688 add bufq, 82*2 689 add bufyq, 82*2<<%3 690 dec hd 691 jg .y_loop_ar1 692 RET 693 694 INIT_YMM avx2 695.ar2: 696 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 697 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 698 imul uvd, 28 699 sar bdmaxd, 1 700 movd xm6, bdmaxd 701 pcmpeqd xm5, xm5 702 vpbroadcastd xm6, xm6 ; max_grain 703 pxor 
xm5, xm6 ; min_grain 704%if %2 705 vpbroadcastw xm7, [base+hmul_bits+2+%3*2] 706%endif 707 vpbroadcastw xm15, [base+round_vals-12+shiftq*2] 708 709 movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+5] 710 pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 711 pinsrb xm0, [pb_1], 5 712 pinsrw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 713 movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] 714 pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+9], 13 715 pmovsxbw m0, xm0 716 717 pshufd xm13, xm0, q3333 718 pshufd m12, m0, q0000 719 pshufd m11, m0, q1111 720 pshufd m10, m0, q2222 721 722 DEFINE_ARGS buf, bufy, fg_data, h, x 723%if %2 724 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 725%else 726 sub bufq, 2*(82*69+3) 727%endif 728 add bufyq, 2*(79+82*3) 729 mov hd, 70-35*%3 730.y_loop_ar2: 731 mov xq, -(76>>%2) 732 733.x_loop_ar2: 734 movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] 735 vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] 736 psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] 737 psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] 738 psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] 739 740%if %2 741 movu xm8, [bufyq+xq*4] 742%if %3 743 paddw xm8, [bufyq+xq*4+82*2] 744%endif 745 phaddw xm8, xm8 746%else 747 movq xm8, [bufyq+xq*2] 748%endif 749 750 vinserti128 m4, xm0, 1 ; y=-1,x=[-2,+5] 751 punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] 752 punpckhwd m4, m0, m4 ; y=-2/-1 interleaved, x=[+2,+5] 753 punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] 754 755%if %2 756 pmulhrsw xm1, xm8, xm7 757 punpcklwd xm1, xm15 ; luma, round interleaved 758%else 759 punpcklwd xm1, xm8, xm15 760%endif 761 vpblendd m1, m1, m4, 11110000b 762 763 pmaddwd m2, m11 764 pmaddwd m0, m12 765 pmaddwd m1, m10 766 paddd m2, m0 767 paddd m2, m1 768 vextracti128 xm0, m2, 1 769 paddd xm2, xm0 770 771 movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] 772 pshufd xm4, xm0, q3321 773 pmovsxwd xm4, xm4 ; y=0,x=[0,3] in dword 774.x_loop_ar2_inner: 775 pmaddwd xm3, xm0, xm13 776 paddd xm3, xm2 777 psrldq xm2, 4 ; shift top to next pixel 778 
psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 779 ; we do not need to packssdw since we only care about one value 780 paddd xm3, xm4 781 pminsd xm3, xm6 782 pmaxsd xm3, xm5 783 pextrw [bufq+xq*2], xm3, 0 784 psrldq xm0, 2 785 pslldq xm3, 2 786 psrldq xm4, 4 787 vpblendw xm0, xm3, 00000010b 788 inc xq 789 jz .x_loop_ar2_end 790 test xq, 3 791 jnz .x_loop_ar2_inner 792 jmp .x_loop_ar2 793 794.x_loop_ar2_end: 795 add bufq, 82*2 796 add bufyq, 82*2<<%3 797 dec hd 798 jg .y_loop_ar2 799 RET 800 801.ar3: 802 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 803%if WIN64 804 mov r6, rsp 805 and rsp, ~31 806 sub rsp, 96 807 %define tmp rsp 808%elif STACK_ALIGNMENT < 32 809 mov r6, rsp 810 and r6, ~31 811 %define tmp r6-96 812%else 813 %define tmp rsp+stack_offset-120 814%endif 815 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 816 imul uvd, 28 817 vpbroadcastw xm14, [base+round_vals-12+shiftq*2] 818 sar bdmaxd, 1 819 movd xm15, bdmaxd 820 pcmpeqd xm13, xm13 821 vpbroadcastd xm15, xm15 ; max_grain 822 pxor xm13, xm15 ; min_grain 823%if %2 824 vpbroadcastw xm12, [base+hmul_bits+2+%3*2] 825%endif 826 827 movq xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] 828 pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma 829 movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] 830 pmovsxbw m0, xm0 831 832 pshufd m11, m0, q3333 833 pshufd m10, m0, q2222 834 pshufd m9, m0, q1111 835 pshufd m8, m0, q0000 836 837 movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] 838 pinsrb xm0, [pb_1], 3 839 pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 840 pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 841 pmovsxbw m0, xm0 842 843 pshufd m1, m0, q0000 844 pshufd m2, m0, q1111 845 mova [tmp+32*2], m11 846 pshufd xm11, xm0, q3232 847 mova [tmp+32*0], m1 848 mova [tmp+32*1], m2 849 pinsrw xm11, [base+round_vals-10+shiftq*2], 3 850 851 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 852%if %2 853 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 854%else 855 sub bufq, 2*(82*69+3) 856%endif 857 add bufyq, 
2*(79+82*3) 858 mov hd, 70-35*%3 859.y_loop_ar3: 860 mov xq, -(76>>%2) 861 862.x_loop_ar3: 863 movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] 864 movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] 865 movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] 866 vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] 867 vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] 868 vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] 869 870%if %2 871 movu xm7, [bufyq+xq*4] 872%if %3 873 paddw xm7, [bufyq+xq*4+82*2] 874%endif 875 phaddw xm7, xm7 876%else 877 movq xm7, [bufyq+xq*2] 878%endif 879 880 palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] 881 palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] 882 punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] 883 punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] 884 palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] 885%if %2 886 pmulhrsw xm7, xm12 887%endif 888 punpcklwd m1, m7 889 890 psrldq m3, m2, 2 891 psrldq m4, m2, 4 892 psrldq m7, m2, 6 893 vpblendd m7, m14, 00001111b ; rounding constant 894 punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] 895 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] 896 punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] 897 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] 898 899 pmaddwd m0, m8 900 pmaddwd m6, m9 901 pmaddwd m5, m10 902 pmaddwd m1, [tmp+32*2] 903 pmaddwd m2, [tmp+32*0] 904 pmaddwd m4, [tmp+32*1] 905 906 paddd m0, m6 907 paddd m5, m2 908 paddd m4, m1 909 paddd m0, m4 910 paddd m0, m5 911 vextracti128 xm4, m0, 1 912 paddd xm0, xm4 913 914 movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] 915.x_loop_ar3_inner: 916 pmaddwd xm2, xm1, xm11 917 pshufd xm3, xm2, q1111 918 paddd xm2, xm3 ; left+cur 919 paddd xm2, xm0 ; add top 920 psrldq xm0, 4 921 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 922 ; no need to packssdw since we only care about one value 923 pminsd xm2, xm15 924 pmaxsd xm2, xm13 925 pextrw [bufq+xq*2], xm2, 0 926 pslldq xm2, 4 927 psrldq xm1, 2 928 vpblendw xm1, xm2, 00000100b 929 inc xq 930 jz 
.x_loop_ar3_end 931 test xq, 3 932 jnz .x_loop_ar3_inner 933 jmp .x_loop_ar3 934 935.x_loop_ar3_end: 936 add bufq, 82*2 937 add bufyq, 82*2<<%3 938 dec hd 939 jg .y_loop_ar3 940%if WIN64 941 mov rsp, r6 942%endif 943 RET 944%endmacro 945 946generate_grain_uv_fn 420, 1, 1 947generate_grain_uv_fn 422, 1, 0 948generate_grain_uv_fn 444, 0, 0 949 950INIT_YMM avx2 951cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, grain_lut 952 mov r7d, [fg_dataq+FGData.scaling_shift] 953 lea r8, [pb_mask] 954%define base r8-pb_mask 955 vpbroadcastw m11, [base+mul_bits+r7*2-14] 956 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 957 mov r9d, r9m ; bdmax 958 sar r9d, 11 ; is_12bpc 959 shlx r10d, r6d, r9d 960 vpbroadcastw m13, [base+min+r10*2] 961 lea r9d, [r9d*3] 962 lea r9d, [r6d*2+r9d] 963 vpbroadcastw m12, [base+max+r9*2] 964 vpbroadcastw m10, r9m 965 pxor m2, m2 966 967 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ 968 sby, see 969 970 movifnidn sbyd, sbym 971 test sbyd, sbyd 972 setnz r7b 973 test r7b, byte [fg_dataq+FGData.overlap_flag] 974 jnz .vertical_overlap 975 976 imul seed, sbyd, (173 << 24) | 37 977 add seed, (105 << 24) | 178 978 rol seed, 8 979 movzx seed, seew 980 xor seed, [fg_dataq+FGData.seed] 981 982 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 983 unused1, unused2, see, src_bak 984 985 lea src_bakq, [srcq+wq*2] 986 neg wq 987 sub dstq, srcq 988 989.loop_x: 990 mov r6d, seed 991 or seed, 0xEFF4 992 shr r6d, 1 993 test seeb, seeh 994 lea seed, [r6+0x8000] 995 cmovp seed, r6d ; updated seed 996 997 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 998 offx, offy, see, src_bak 999 1000 mov offxd, seed 1001 rorx offyd, seed, 8 1002 shr offxd, 12 1003 and offyd, 0xf 1004 imul offyd, 164 1005 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1006 1007 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1008 h, offxy, see, src_bak 1009 1010 mov hd, hm 1011 mov grain_lutq, 
grain_lutmp 1012.loop_y: 1013 ; src 1014 pminuw m0, m10, [srcq+ 0] 1015 pminuw m1, m10, [srcq+32] ; m0-1: src as word 1016 punpckhwd m5, m0, m2 1017 punpcklwd m4, m0, m2 1018 punpckhwd m7, m1, m2 1019 punpcklwd m6, m1, m2 ; m4-7: src as dword 1020 1021 ; scaling[src] 1022 pcmpeqw m3, m3 1023 mova m9, m3 1024 vpgatherdd m8, [scalingq+m4-3], m3 1025 vpgatherdd m4, [scalingq+m5-3], m9 1026 pcmpeqw m3, m3 1027 mova m9, m3 1028 vpgatherdd m5, [scalingq+m6-3], m3 1029 vpgatherdd m6, [scalingq+m7-3], m9 1030 REPX {psrld x, 24}, m8, m4, m5, m6 1031 packssdw m8, m4 1032 packssdw m5, m6 1033 1034 ; grain = grain_lut[offy+y][offx+x] 1035 movu m9, [grain_lutq+offxyq*2] 1036 movu m3, [grain_lutq+offxyq*2+32] 1037 1038 ; noise = round2(scaling[src] * grain, scaling_shift) 1039 REPX {pmullw x, m11}, m8, m5 1040 pmulhrsw m9, m8 1041 pmulhrsw m3, m5 1042 1043 ; dst = clip_pixel(src, noise) 1044 paddw m0, m9 1045 paddw m1, m3 1046 pmaxsw m0, m13 1047 pmaxsw m1, m13 1048 pminsw m0, m12 1049 pminsw m1, m12 1050 mova [dstq+srcq+ 0], m0 1051 mova [dstq+srcq+32], m1 1052 1053 add srcq, strideq 1054 add grain_lutq, 82*2 1055 dec hd 1056 jg .loop_y 1057 1058 add wq, 32 1059 jge .end 1060 lea srcq, [src_bakq+wq*2] 1061 cmp byte [fg_dataq+FGData.overlap_flag], 0 1062 je .loop_x 1063 1064 ; r8m = sbym 1065 movq xm15, [pw_27_17_17_27] 1066 cmp dword r8m, 0 1067 jne .loop_x_hv_overlap 1068 1069 ; horizontal overlap (without vertical overlap) 1070 vpbroadcastd xm14, [pd_16] 1071.loop_x_h_overlap: 1072 mov r6d, seed 1073 or seed, 0xEFF4 1074 shr r6d, 1 1075 test seeb, seeh 1076 lea seed, [r6+0x8000] 1077 cmovp seed, r6d ; updated seed 1078 1079 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1080 offx, offy, see, src_bak, left_offxy 1081 1082 lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx 1083 mov offxd, seed 1084 rorx offyd, seed, 8 1085 shr offxd, 12 1086 and offyd, 0xf 1087 imul offyd, 164 1088 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1089 1090 
; ---------------------------------------------------------------------------
; Tail of the luma fgy_32x32xn function (entry point is above this chunk):
; the horizontal-, vertical- and horizontal+vertical-overlap row loops.
; Live registers established earlier (see function entry, not visible here):
;   m10 = bdmax, m11 = scaling multiplier, m12 = pixel max, m13 = pixel min,
;   m2 = zero -- NOTE(review): inferred from the parallel fguv setup below;
;   confirm against the function prologue.
; grain_lut rows are 82 entries (164 bytes) wide.
; ---------------------------------------------------------------------------
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; src
    pminuw           m0, m10, [srcq+ 0]
    pminuw           m1, m10, [srcq+32]       ; m0-1: src as word
    punpckhwd        m5, m0, m2
    punpcklwd        m4, m0, m2
    punpckhwd        m7, m1, m2
    punpcklwd        m6, m1, m2               ; m4-7: src as dword

    ; scaling[src]
    ; vpgatherdd consumes its mask register, so the all-ones mask is
    ; regenerated before every pair of gathers
    pcmpeqw          m3, m3
    mova             m9, m3
    vpgatherdd       m8, [scalingq+m4-3], m3
    vpgatherdd       m4, [scalingq+m5-3], m9
    pcmpeqw          m3, m3
    mova             m9, m3
    vpgatherdd       m5, [scalingq+m6-3], m3
    vpgatherdd       m6, [scalingq+m7-3], m9
    REPX {psrld x, 24}, m8, m4, m5, m6
    packssdw         m8, m4
    packssdw         m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    ; blend the first pixel with the previous column's grain (h overlap)
    movu             m9, [grain_lutq+offxyq*2]
    movd            xm7, [grain_lutq+left_offxyq*2]
    punpcklwd       xm7, xm9
    pmaddwd         xm7, xm15
    paddd           xm7, xm14
    psrad           xm7, 5
    packssdw        xm7, xm7
    vpblendd         m9, m7, 00000001b
    pcmpeqw          m3, m3
    psraw            m7, m10, 1               ; max_grain
    pxor             m3, m7                   ; min_grain
    pminsw           m9, m7
    pmaxsw           m9, m3
    movu             m3, [grain_lutq+offxyq*2+32]

    ; noise = round2(scaling[src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m8, m5
    pmulhrsw         m9, m8
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m9
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    mova [dstq+srcq+ 0], m0                   ; dstq holds dst-src delta here
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hd
    jg .loop_y_h_overlap

    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.end:
    RET

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see

    ; derive (cur_seed << 16) | top_seed from the frame seed and sby,
    ; computing both rows' seeds in parallel in packed 8-bit lanes
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd                 ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak

    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

    vpbroadcastd    m14, [pd_16]
.loop_x_v_overlap:
    vpbroadcastd    m15, [pw_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; update both 16-bit seeds (top and current row) in one pass
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                       ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                       ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1               ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; blend current grain row with the row above (weights in m15)
    movu             m3, [grain_lutq+offxyq*2]
    movu             m7, [grain_lutq+top_offxyq*2]
    punpckhwd        m9, m7, m3
    punpcklwd        m7, m3
    REPX {pmaddwd x, m15}, m9, m7
    REPX {paddd x, m14}, m9, m7
    REPX {psrad x, 5}, m9, m7
    packssdw         m7, m9
    pcmpeqw          m0, m0
    psraw            m1, m10, 1               ; max_grain
    pxor             m0, m1                   ; min_grain
    pminsw           m7, m1
    pmaxsw           m7, m0
    movu             m3, [grain_lutq+offxyq*2+32]
    movu             m8, [grain_lutq+top_offxyq*2+32]
    punpckhwd        m9, m8, m3
    punpcklwd        m8, m3
    REPX {pmaddwd x, m15}, m9, m8
    REPX {paddd x, m14}, m9, m8
    REPX {psrad x, 5}, m9, m8
    packssdw         m8, m9
    pminsw           m8, m1
    pmaxsw           m8, m0

    ; src
    pminuw           m0, m10, [srcq+ 0]       ; m0-1: src as word
    punpckhwd        m5, m0, m2
    punpcklwd        m4, m0, m2

    ; scaling[src]
    pcmpeqw          m3, m3
    mova             m9, m3
    vpgatherdd       m6, [scalingq+m4-3], m3
    vpgatherdd       m4, [scalingq+m5-3], m9
    REPX {psrld x, 24}, m6, m4
    packssdw         m6, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m6, m11
    pmulhrsw         m6, m7

    ; same for the other half
    pminuw           m1, m10, [srcq+32]       ; m0-1: src as word
    punpckhwd        m9, m1, m2
    punpcklwd        m4, m1, m2               ; m4-7: src as dword
    pcmpeqw          m3, m3
    mova             m7, m3
    vpgatherdd       m5, [scalingq+m4-3], m3
    vpgatherdd       m4, [scalingq+m9-3], m7
    REPX {psrld x, 24}, m5, m4
    packssdw         m5, m4

    pmullw           m5, m11
    pmulhrsw         m5, m8

    ; dst = clip_pixel(src, noise)
    paddw            m0, m6
    paddw            m1, m5
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    vpbroadcastd    m15, [pw_27_17_17_27+4]   ; swap weights for second v-overlap line
    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
    add              wq, 32
    jge .end_hv
    lea            srcq, [src_bakq+wq*2]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

    movq           xm15, [pw_27_17_17_27]
.loop_x_hv_overlap:
    vpbroadcastd     m8, [pw_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                       ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                       ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1               ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyq, [top_offxyq+32]
    lea     left_offxyq, [offyq+32]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq*2]
    movu             m0, [grain_lutq+offxyq*2+32]
    movu             m6, [grain_lutq+top_offxyq*2]
    movu             m1, [grain_lutq+top_offxyq*2+32]
    movd            xm4, [grain_lutq+left_offxyq*2]
    movd            xm7, [grain_lutq+topleft_offxyq*2]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd       xm4, xm3
    punpcklwd       xm7, xm6
    REPX {pmaddwd x, xm15}, xm4, xm7
    REPX {paddd x, xm14}, xm4, xm7
    REPX {psrad x, 5}, xm4, xm7
    REPX {packssdw x, x}, xm4, xm7
    pcmpeqw          m5, m5
    psraw            m9, m10, 1               ; max_grain
    pxor             m5, m9                   ; min_grain
    REPX {pminsw x, xm9}, xm4, xm7
    REPX {pmaxsw x, xm5}, xm4, xm7
    vpblendd         m3, m4, 00000001b
    vpblendd         m6, m7, 00000001b
    ; followed by v interpolation (top | cur -> cur)
    punpckhwd        m7, m6, m3
    punpcklwd        m6, m3
    punpckhwd        m3, m1, m0
    punpcklwd        m1, m0
    ; (continuation of fgy .loop_y_hv_overlap: v-interpolate top|cur pairs,
    ; weights in m8, then clamp to the grain range)
    REPX {pmaddwd x, m8}, m7, m6, m3, m1
    REPX {paddd x, m14}, m7, m6, m3, m1
    REPX {psrad x, 5}, m7, m6, m3, m1
    packssdw         m7, m6, m7
    packssdw         m3, m1, m3
    REPX {pminsw x, m9}, m7, m3
    REPX {pmaxsw x, m5}, m7, m3

    ; src
    pminuw           m0, m10, [srcq+ 0]
    pminuw           m1, m10, [srcq+32]       ; m0-1: src as word
    punpckhwd        m5, m0, m2
    punpcklwd        m4, m0, m2

    ; scaling[src]
    pcmpeqw          m9, m9
    vpgatherdd       m6, [scalingq+m4-3], m9
    pcmpeqw          m9, m9
    vpgatherdd       m4, [scalingq+m5-3], m9
    REPX {psrld x, 24}, m6, m4
    packssdw         m6, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m6, m11
    pmulhrsw         m7, m6

    ; other half
    punpckhwd        m5, m1, m2
    punpcklwd        m4, m1, m2               ; m4-7: src as dword

    ; scaling[src]
    pcmpeqw          m6, m6
    vpgatherdd       m9, [scalingq+m4-3], m6
    pcmpeqw          m6, m6
    vpgatherdd       m4, [scalingq+m5-3], m6
    REPX {psrld x, 24}, m9, m4
    packssdw         m9, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m9, m11
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m7
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    vpbroadcastd     m8, [pw_27_17_17_27+4]   ; swap weights for second v-overlap line
    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq*2]
    jl .loop_x_hv_overlap

.end_hv:
    RET

; ---------------------------------------------------------------------------
; Chroma film-grain application, 16bpc AVX2.
; FGUV_FN instantiates fguv_32x32xn_i<name>_16bpc for one chroma subsampling:
;   %1 = name (420/422/444), %2 = ss_hor, %3 = ss_ver
; Register setup shared by both loop variants below:
;   m10 = bdmax, m11 = scaling multiplier (from scaling_shift),
;   m12 = pixel max, m13 = pixel min, m2 = zero
; ---------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r8-pb_mask
    lea              r8, [pb_mask]
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, r13m                 ; bdmax
    sar             r9d, 11                   ; is_12bpc
    shlx           r10d, r6d, r9d
    vpbroadcastw    m13, [base+min+r10*2]
    lea            r10d, [r9d*3]
    mov            r11d, is_idm
    shlx            r6d, r6d, r11d
    add            r10d, r6d
    vpbroadcastw    m12, [base+max+r10*2]
    vpbroadcastw    m10, r13m
    pxor             m2, m2

    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

; The loop body is generated twice: once for independent uv scaling (%1 == 1,
; applies uv_mult/uv_luma_mult/uv_offset) and once for csfl (%1 == 0, scales
; straight from luma).
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

%if %1
    ; merged_pixel = (luma*uv_luma_mult + chroma*uv_mult >> 6) + uv_offset
    mov             r7d, r11m
    vpbroadcastw     m0, [fg_dataq+FGData.uv_mult+r7*4]
    vpbroadcastw     m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
    punpcklwd       m14, m1, m0
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
    vpbroadcastd     m9, [base+pw_4+r9*4]
    pmullw          m15, m9
%else
    vpbroadcastd    m14, [pd_16]
%if %2
    vpbroadcastq    m15, [pw_23_22]
%else
    vpbroadcastq    m15, [pw_27_17_17_27]
%endif
%endif

    movifnidn      sbyd, sbym
    test           sbyd, sbyd
    setnz           r7b
    test            r7b, byte [fg_dataq+FGData.overlap_flag]
    jnz %%vertical_overlap

    ; derive the per-row seed (no vertical overlap case)
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused2, unused3, see, unused4, unused5, unused6, luma, lstride

    mov           lumaq, r9mp
    mov        lstrideq, r10mp
    lea             r10, [srcq+wq*2]
    lea             r11, [dstq+wq*2]
    lea             r12, [lumaq+wq*(2<<%2)]
    mov           r10mp, r10                  ; stash row-end pointers for the
    mov           r11mp, r11                  ; per-column rewind below
    mov           r12mp, r12
    neg              wq

%%loop_x:
    ; advance the LFSR seed
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                  ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, unused1, unused2, unused3, luma, lstride

    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, unused1, unused2, unused3, luma, lstride

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y:
    ; src
    mova             m0, [srcq]
%if %2
    mova             m1, [srcq+strideq]       ; m0-1: src as word
%else
    mova             m1, [srcq+32]
%endif

    ; luma_src
%if %2
    ; horizontally subsampled: average pairs of luma pixels
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm7, [lumaq+lstrideq*0+16]
    vinserti128      m4, [lumaq+lstrideq*0+32], 1
    vinserti128      m7, [lumaq+lstrideq*0+48], 1
    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
    phaddw           m4, m7
    phaddw           m6, m8
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m6, [lumaq+32]
%endif

%if %1
    punpckhwd        m3, m4, m0
    punpcklwd        m4, m0
    punpckhwd        m5, m6, m1
    punpcklwd        m6, m1                   ; { luma, chroma }
    REPX {pmaddwd x, m14}, m3, m4, m5, m6
    REPX {psrad x, 6}, m3, m4, m5, m6
    packssdw         m4, m3
    packssdw         m6, m5
    REPX {paddw x, m15}, m4, m6
    REPX {pmaxsw x, m2}, m4, m6
    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
%else
    REPX {pminuw x, m10}, m4, m6
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                   ; m4-7: luma_src as dword

    ; scaling[luma_src]
    ; vpgatherdd consumes its mask; reload all-ones before each gather pair
    pcmpeqw          m3, m3
    mova             m9, m3
    vpgatherdd       m8, [scalingq+m4-3], m3
    vpgatherdd       m4, [scalingq+m5-3], m9
    pcmpeqw          m3, m3
    mova             m9, m3
    vpgatherdd       m5, [scalingq+m6-3], m3
    vpgatherdd       m6, [scalingq+m7-3], m9
    REPX {psrld x, 24}, m8, m4, m5, m6
    packssdw         m8, m4
    packssdw         m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu             m9, [grain_lutq+offxyq*2]
%if %2
    movu             m3, [grain_lutq+offxyq*2+82*2]
%else
    movu             m3, [grain_lutq+offxyq*2+32]
%endif

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m8, m5
    pmulhrsw         m9, m8
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m9
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1

    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    mova      [dstq+32], m1
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*(2<<%2)
%if %2
    sub              hb, 2                    ; 2 chroma rows per iteration when ss_hor
%else
    dec              hb
%endif
    jg %%loop_y

    add              wq, 32>>%2
    jge %%end
    mov            srcq, r10mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je %%loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                  ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, luma, lstride

    lea     left_offxyd, [offyd+(32>>%2)]     ; previous column's offy*stride+offx
    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, luma, lstride

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    mova             m0, [srcq]
%if %2
    mova             m1, [srcq+strideq]

    ; luma_src
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm7, [lumaq+lstrideq*0+16]
    vinserti128      m4, [lumaq+lstrideq*0+32], 1
    vinserti128      m7, [lumaq+lstrideq*0+48], 1
    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
    phaddw           m4, m7
    phaddw           m6, m8
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m1, [srcq+32]

    ; luma_src
    mova             m4, [lumaq]
    mova             m6, [lumaq+32]
%endif

%if %1
    punpckhwd        m3, m4, m0
    punpcklwd        m4, m0
    punpckhwd        m5, m6, m1
    punpcklwd        m6, m1                   ; { luma, chroma }
    REPX {pmaddwd x, m14}, m3, m4, m5, m6
    REPX {psrad x, 6}, m3, m4, m5, m6
    packssdw         m4, m3
    packssdw         m6, m5
    REPX {paddw x, m15}, m4, m6
    REPX {pmaxsw x, m2}, m4, m6
    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
%else
    REPX {pminuw x, m10}, m4, m6
%endif

    ; grain = grain_lut[offy+y][offx+x]
    movu             m9, [grain_lutq+offxyq*2]
%if %2
    movu             m3, [grain_lutq+offxyq*2+82*2]
%else
    movu             m3, [grain_lutq+offxyq*2+32]
%endif
    movd            xm5, [grain_lutq+left_offxyq*2+ 0]
%if %2
    pinsrw          xm5, [grain_lutq+left_offxyq*2+82*2], 2  ; {left0, left1}
    punpckldq       xm7, xm9, xm3             ; {cur0, cur1}
    punpcklwd       xm5, xm7                  ; {left0, cur0, left1, cur1}
%else
    punpcklwd       xm5, xm9
%endif
%if %1
%if %2
    vpbroadcastq    xm8, [pw_23_22]
%else
    movq            xm8, [pw_27_17_17_27]
%endif
    pmaddwd         xm5, xm8
    vpbroadcastd    xm8, [pd_16]
    paddd           xm5, xm8
%else
    pmaddwd         xm5, xm15
    paddd           xm5, xm14
%endif
    psrad           xm5, 5
    packssdw        xm5, xm5
    pcmpeqw         xm8, xm8
    psraw           xm7, xm10, 1              ; max_grain
    pxor            xm8, xm7                  ; min_grain
    pmaxsw          xm5, xm8
    pminsw          xm5, xm7
    vpblendd         m9, m9, m5, 00000001b
%if %2
    psrldq          xm5, 4
    vpblendd         m3, m3, m5, 00000001b
%endif

    ; scaling[luma_src]
    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    pcmpeqw          m7, m7
    vpgatherdd       m8, [scalingq+m4-3], m7
    pcmpeqw          m7, m7
    vpgatherdd       m4, [scalingq+m5-3], m7
    REPX {psrld x, 24}, m8, m4
    packssdw         m8, m4

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m8, m11
    pmulhrsw         m9, m8

    ; same for the other half
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                   ; m4-7: luma_src as dword
    pcmpeqw          m8, m8
    mova             m4, m8
    vpgatherdd       m5, [scalingq+m6-3], m8
    vpgatherdd       m6, [scalingq+m7-3], m4
    REPX {psrld x, 24}, m5, m6
    packssdw         m5, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m5, m11
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m9
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1

    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    mova      [dstq+32], m1

    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif

    add      grain_lutq, 82*(2<<%2)
%if %2
    sub              hb, 2
%else
    dec              hb
%endif
    jg %%loop_y_h_overlap

    add              wq, 32>>%2
    jge %%end
    mov            srcq, r10mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%end:
    RET

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, unused1, unused2, unused3, lstride

    ; derive (cur_seed << 16) | top_seed, both rows' seeds computed in
    ; parallel in packed 8-bit lanes
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd                 ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, unused3, unused4, unused5, luma, lstride

    mov           lumaq, r9mp
    mov        lstrideq, r10mp
    lea             r10, [srcq+wq*2]
    lea             r11, [dstq+wq*2]
    lea             r12, [lumaq+wq*(2<<%2)]
    mov           r10mp, r10
    mov           r11mp, r11
    mov           r12mp, r12
    neg              wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                       ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                       ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1               ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, unused1, top_offxy, unused2, luma, lstride

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, unused1, top_offxy, unused2, luma, lstride

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

%if %2 == 0
    ; unsubsampled: keep a pointer so the weights can step to the second
    ; v-overlap row via `add r10, 4` below
    lea             r10, [pw_27_17_17_27]
%endif
    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y_v_overlap:
    ; src
    mova             m0, [srcq]
%if %2
    mova             m1, [srcq+strideq]

    ; luma_src
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm7, [lumaq+lstrideq*0+16]
    vinserti128      m4, [lumaq+lstrideq*0+32], 1
    vinserti128      m7, [lumaq+lstrideq*0+48], 1
    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
    phaddw           m4, m7
    phaddw           m6, m8
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m1, [srcq+32]

    ; luma_src
    mova             m4, [lumaq]
    mova             m6, [lumaq+32]
%endif

%if %1
    punpckhwd        m3, m4, m0
    punpcklwd        m4, m0
    punpckhwd        m5, m6, m1
    punpcklwd        m6, m1                   ; { luma, chroma }
    REPX {pmaddwd x, m14}, m3, m4, m5, m6
    REPX {psrad x, 6}, m3, m4, m5, m6
    packssdw         m4, m3
    packssdw         m6, m5
    REPX {paddw x, m15}, m4, m6
    REPX {pmaxsw x, m2}, m4, m6
    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
%else
    REPX {pminuw x, m10}, m4, m6
%endif

    ; grain = grain_lut[offy+y][offx+x]
    movu             m9, [grain_lutq+offxyq*2]
    movu             m5, [grain_lutq+top_offxyq*2]
    punpckhwd        m7, m5, m9
    punpcklwd        m5, m9                   ; {top/cur interleaved}
%if %3
    vpbroadcastd     m3, [pw_23_22]
%elif %2
    vpbroadcastd     m3, [pw_27_17_17_27]
%else
    vpbroadcastd     m3, [r10]
%endif
    REPX {pmaddwd x, m3}, m7, m5
%if %1
    vpbroadcastd     m8, [pd_16]
    REPX {paddd x, m8}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5
%endif
    REPX {psrad x, 5}, m7, m5
    packssdw         m9, m5, m7
%if %2
    movu             m3, [grain_lutq+offxyq*2+82*2]
%else
    movu             m3, [grain_lutq+offxyq*2+32]
%endif
%if %3 == 0
    ; second row also gets a vertical blend when not vertically subsampled
%if %2
    movu             m5, [grain_lutq+top_offxyq*2+82*2]
%else
    movu             m5, [grain_lutq+top_offxyq*2+32]
%endif
    punpckhwd        m7, m5, m3
    punpcklwd        m5, m3                   ; {top/cur interleaved}
%if %2
    vpbroadcastd     m3, [pw_27_17_17_27+4]
%else
    vpbroadcastd     m3, [r10]
%endif
    REPX {pmaddwd x, m3}, m7, m5
%if %1
    REPX {paddd x, m8}, m7, m5
%else
    REPX {paddd x, m14}, m7, m5
%endif
    REPX {psrad x, 5}, m7, m5
    packssdw         m3, m5, m7
%endif ; %3 == 0
    pcmpeqw          m7, m7
    psraw            m5, m10, 1               ; max_grain
    pxor             m7, m5                   ; min_grain
%if %3
    pmaxsw           m9, m7
    pminsw           m9, m5
%else
    REPX {pmaxsw x, m7}, m9, m3
    REPX {pminsw x, m5}, m9, m3
%endif

    ; scaling[luma_src]
    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    pcmpeqw          m7, m7
    vpgatherdd       m8, [scalingq+m4-3], m7
    pcmpeqw          m7, m7
    vpgatherdd       m4, [scalingq+m5-3], m7
    REPX {psrld x, 24}, m8, m4
    packssdw         m8, m4

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m8, m11
    pmulhrsw         m9, m8

    ; scaling for the other half
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                   ; m4-7: luma_src as dword
    pcmpeqw          m8, m8
    mova             m4, m8
    vpgatherdd       m5, [scalingq+m6-3], m8
    vpgatherdd       m6, [scalingq+m7-3], m4
    REPX {psrld x, 24}, m5, m6
    packssdw         m5, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m5, m11
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m9
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1

    sub              hb, 2
%else
    mova      [dstq+32], m1
    dec              hb
%endif
    jle %%end_y_v_overlap
%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*(2<<%2)
%if %2
    jmp %%loop_y
%else
    ; first iteration clears bit 16 of h -> second v-overlap row with the
    ; advanced weights; afterwards fall through to the plain row loop
    btc              hd, 16
    jc %%loop_y
    add             r10, 4
    jmp %%loop_y_v_overlap
%endif

%%end_y_v_overlap:
    add              wq, 32>>%2
    jge %%end_hv
    mov            srcq, r10mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                       ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                       ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1               ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride

%if %2 == 0
    ; r12 is needed as topleft_offxy here, so the weight pointer lives in
    ; the r13 stack slot and is swapped in/out with xchg below
    lea             r12, [pw_27_17_17_27]
    mov           r13mp, r12
%endif
    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
    lea     left_offxyq, [offyq+(32>>%2)]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movd            xm5, [grain_lutq+left_offxyq*2]
%if %2
    pinsrw          xm5, [grain_lutq+left_offxyq*2+82*2], 2
%if %3
    vinserti128      m5, [grain_lutq+topleft_offxyq*2], 1  ; { left0, left1, top/left }
%else
    ; insert both top/left lines
    movd            xm9, [grain_lutq+topleft_offxyq*2+82*2]
    pinsrw          xm9, [grain_lutq+topleft_offxyq*2], 2
    vinserti128      m5, xm9, 1
%endif
%else
    pinsrd          xm5, [grain_lutq+topleft_offxyq*2], 1
%endif
    movu             m9, [grain_lutq+offxyq*2]
%if %2
    movu             m3, [grain_lutq+offxyq*2+82*2]
%else
    movu             m3, [grain_lutq+offxyq*2+32]
%endif
    movu             m8, [grain_lutq+top_offxyq*2]
%if %2
    punpckldq       xm7, xm9, xm3             ; { cur0, cur1 }
%if %3
    vinserti128      m7, xm8, 1               ; { cur0, cur1, top0 }
%else
    ; insert both top lines
    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    punpckldq       xm0, xm1, xm8
    vinserti128      m7, xm0, 1
%endif
%else
    movu             m1, [grain_lutq+top_offxyq*2+32]
    punpckldq       xm7, xm9, xm8
%endif
    punpcklwd        m5, m7                   ; { cur/left } interleaved
    ; horizontal blend of the left-column grain into cur (and top) rows
%if %2
%if %1
    vpbroadcastq     m0, [pw_23_22]
    pmaddwd          m5, m0
    vpbroadcastd     m0, [pd_16]
    paddd            m5, m0
%else
    pmaddwd          m5, m15
    paddd            m5, m14
%endif
    psrad            m5, 5
    vextracti128    xm0, m5, 1
    packssdw        xm5, xm0
%else
%if %1
    movddup         xm0, [pw_27_17_17_27]
    pmaddwd         xm5, xm0
    vpbroadcastd     m0, [pd_16]
    paddd           xm5, xm0
%else
    pmaddwd         xm5, xm15
    paddd           xm5, xm14
%endif
    psrad           xm5, 5
    packssdw        xm5, xm5
%endif
    pcmpeqw          m0, m0
    psraw            m7, m10, 1               ; max_grain
    pxor             m0, m7                   ; min_grain
    pminsw          xm5, xm7
    pmaxsw          xm5, xm0
    ; scatter the blended edge pixels back into their rows
    vpblendd         m9, m9, m5, 00000001b
%if %2
    psrldq          xm5, 4
    vpblendd         m3, m3, m5, 00000001b
%if %3 == 0
    psrldq          xm5, 4
    vpblendd         m1, m1, m5, 00000001b
%endif
%endif
    psrldq          xm5, 4
    vpblendd         m5, m8, m5, 00000001b

    ; vertical blend of top into cur
    punpckhwd        m8, m5, m9
    punpcklwd        m5, m9                   ; {top/cur interleaved}
%if %3
    vpbroadcastd     m9, [pw_23_22]
%elif %2
    vpbroadcastd     m9, [pw_27_17_17_27]
%else
    xchg            r12, r13mp
    vpbroadcastd     m9, [r12]
%endif
    REPX {pmaddwd x, m9}, m8, m5
%if %1
    vpbroadcastd     m4, [pd_16]
    REPX {paddd x, m4}, m8, m5
%else
    REPX {paddd x, m14}, m8, m5
%endif
    REPX {psrad x, 5}, m8, m5
    packssdw         m9, m5, m8
%if %3
    pminsw           m9, m7
    pmaxsw           m9, m0
%else
    punpckhwd        m8, m1, m3
    punpcklwd        m1, m3                   ; {top/cur interleaved}
%if %2
    vpbroadcastd     m3, [pw_27_17_17_27+4]
%else
    vpbroadcastd     m3, [r12]
    xchg            r12, r13mp
%endif
    REPX {pmaddwd x, m3}, m8, m1
%if %1
    REPX {paddd x, m4}, m8, m1
%else
    REPX {paddd x, m14}, m8, m1
%endif
    REPX {psrad x, 5}, m8, m1
    packssdw         m3, m1, m8
    REPX {pminsw x, m7}, m9, m3
    REPX {pmaxsw x, m0}, m9, m3
%endif

    ; src
    mova             m0, [srcq]
%if %2
    mova             m1, [srcq+strideq]
%else
    mova             m1, [srcq+32]
%endif

    ; luma_src
%if %2
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm7, [lumaq+lstrideq*0+16]
    vinserti128      m4, [lumaq+lstrideq*0+32], 1
    vinserti128      m7, [lumaq+lstrideq*0+48], 1
    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
    phaddw           m4, m7
    phaddw           m6, m8
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m6, [lumaq+32]
%endif

%if %1
    punpckhwd        m8, m4, m0
    punpcklwd        m4, m0
    punpckhwd        m5, m6, m1
    punpcklwd        m6, m1                   ; { luma, chroma }
    REPX {pmaddwd x, m14}, m8, m4, m5, m6
    REPX {psrad x, 6}, m8, m4, m5, m6
    packssdw         m4, m8
    packssdw         m6, m5
    REPX {paddw x, m15}, m4, m6
    REPX {pmaxsw x, m2}, m4, m6
    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
%else
    REPX {pminuw x, m10}, m4, m6
%endif

    ; scaling[luma_src]
    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    pcmpeqw          m7, m7
    vpgatherdd       m8, [scalingq+m4-3], m7
    pcmpeqw          m7, m7
    vpgatherdd       m4, [scalingq+m5-3], m7
    REPX {psrld x, 24}, m8, m4
    packssdw         m8, m4

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m8, m11
    pmulhrsw         m9, m8

    ; same for the other half
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                   ; m4-7: luma_src as dword
    pcmpeqw          m8, m8
    mova             m4, m8
    vpgatherdd       m5, [scalingq+m6-3], m8
    vpgatherdd       m6, [scalingq+m7-3], m4
    REPX {psrld x, 24}, m5, m6
    packssdw         m5, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m5, m11
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m9
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    mova         [dstq], m0
%if %2
    mova [dstq+strideq], m1

    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    mova      [dstq+32], m1

    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*(2<<%2)
%if %2
    sub              hb, 2
    jg %%loop_y_h_overlap
%else
    dec              hb
    jle %%end_y_hv_overlap
    ; 2 rows of v-overlap, then h-overlap only for the remaining rows
    btc              hd, 16
    jc %%loop_y_h_overlap
    add           r13mp, 4
    jmp %%loop_y_hv_overlap
%endif

%%end_y_hv_overlap:
    add              wq, 32>>%2
    jge %%end_hv
    mov            srcq, r10mp
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
    jmp %%loop_x_hv_overlap

%%end_hv:
    RET
%endmacro

    ; not-csfl variant: chroma scaled via uv_mult/uv_luma_mult/uv_offset
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    ; csfl variant: chroma scaling taken directly from luma
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro

FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0
%endif ; ARCH_X86_64