; Copyright © 2021-2022, VideoLAN and dav1d authors
; Copyright © 2021-2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

; AVX2 film-grain synthesis for 16 bpc (10/12-bit) content.
; NASM syntax, x86inc register abstraction; FGData layout comes from
; x86/filmgrain_common.asm.

SECTION_RODATA 16
pb_mask:            db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
gen_shufA:          db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
gen_shufB:          db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_27_17_17_27:     dw 27, 17, 17, 27
pw_23_22:           dw 23, 22, 0, 32
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
gen_ar0_shift: times 4 db 128
               times 4 db 64
               times 4 db 32
               times 4 db 16
pd_16:               dd 16
pd_m65536:           dd -65536
pb_1:        times 4 db 1
grain_max:   times 2 dw  511
             times 2 dw 2047
grain_min:   times 2 dw  -512
             times 2 dw -2048
fg_max:      times 2 dw 1023
             times 2 dw 4095
             times 2 dw  960
             times 2 dw 3840
             times 2 dw  940
             times 2 dw 3760
fg_min:      times 2 dw    0
             times 2 dw   64
             times 2 dw  256
uv_offset_mul:       dd  256
                     dd 1024
hmul_bits:           dw 32768, 16384, 8192, 4096
round:               dw 2048, 1024, 512
mul_bits:            dw 256, 128, 64, 32, 16, 8
round_vals:          dw 32, 64, 128, 256, 512, 1024
pb_8_9_0_1:          db 8, 9, 0, 1

; Build a table of dword offsets to the .ar0-.ar3 entry points of a grain
; generator, relative to the table itself (used as an indirect jump table).
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3

SECTION .text

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

INIT_YMM avx2
cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
%define base r4-generate_grain_y_16bpc_avx2_table
    lea              r4, [generate_grain_y_16bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    mov              r3, -73*82*2
    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
    lea             r7d, [bdmaxq+1]
    movq            xm4, [base+mul_bits]
    shr             r7d, 11             ; 0 for 10bpc, 2 for 12bpc
    movq            xm5, [base+hmul_bits]
    sub              r6, r7
    mova            xm6, [base+pb_mask]
    sub            bufq, r3
    vpbroadcastw    xm7, [base+round+r6*2-2]
    lea              r6, [gaussian_sequence]
    movsxd           r5, [r4+r5*4]
.loop:
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    por             xm3, xm2            ; 4 next output seeds
    pshuflw         xm0, xm3, q3333
    psrlw           xm3, 5
    pand            xm2, xm0, xm1
    movq             r7, xm3
    psrlw           xm3, xm2, 10
    por             xm2, xm3
    pmullw          xm2, xm4
    pmulhuw         xm0, xm5
    movzx           r8d, r7w
    pshufb          xm3, xm6, xm2
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm0, xm2
    movd            xm2, [r6+r8*2]
    rorx             r8, r7, 32
    por             xm3, xm0
    shr             r7d, 16
    pinsrw          xm2, [r6+r7*2], 1
    pshuflw         xm0, xm3, q3333
    movzx           r7d, r8w
    psrlw           xm3, 5
    pinsrw          xm2, [r6+r7*2], 2
    shr             r8d, 16
    movq             r7, xm3
    pinsrw          xm2, [r6+r8*2], 3
    movzx           r8d, r7w
    pinsrw          xm2, [r6+r8*2], 4
    rorx             r8, r7, 32
    shr             r7d, 16
    pinsrw          xm2, [r6+r7*2], 5
    movzx           r7d, r8w
    pinsrw          xm2, [r6+r7*2], 6
    shr             r8d, 16
    pinsrw          xm2, [r6+r8*2], 7
    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
    pmulhrsw        xm2, xm7            ; shifts by 0, which pmulhrsw does not support
    mova      [bufq+r3], xm2
    add              r3, 8*2
    jl .loop

    ; auto-regression code
    add              r5, r4
    jmp              r5

.ar1:
    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
    pinsrb          xm4, [base+pb_1], 3
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12] ; rnd
    sub            bufq, 2*(82*73-(82*3+79))
    mov              hd, 70
    sar            maxd, 1
    mov            mind, maxd
    xor            mind, -1
.y_loop_ar1:
    mov              xq, -76
    movsx         val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
    psrldq          xm2, xm0, 2             ; top
    psrldq          xm1, xm0, 4             ; top/right
    punpcklwd       xm0, xm2
    punpcklwd       xm1, xm3
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
.x_loop_ar1_inner:
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sarx          val3d, val3d, shiftd
    movsx         val0d, word [bufq+xq*2]
    add           val3d, val0d
    cmp           val3d, maxd
    cmovg         val3d, maxd
    cmp           val3d, mind
    cmovl         val3d, mind
    mov  word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add            bufq, 82*2
    dec              hd
    jg .y_loop_ar1
.ar0:
    RET

.ar2:
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movq            xm0, [fg_dataq+FGData.ar_coeffs_y+5]    ; cf5-11
    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
    vpbroadcastw   xm10, [base+round_vals-12+shiftq*2]
    pxor             m1, m1
    punpcklwd      xm10, xm1
    pcmpgtb          m1, m0
    punpcklbw        m0, m1        ; cf5-11,0-4
    vpermq           m1, m0, q3333 ; cf4
    vbroadcasti128  m11, [base+gen_shufA]
    pshufd           m6, m0, q0000 ; cf[5,6], cf[0-1]
    vbroadcasti128  m12, [base+gen_shufB]
    pshufd           m7, m0, q1111 ; cf[7,8], cf[2-3]
    punpckhwd       xm1, xm0
    pshufhw         xm9, xm0, q2121
    pshufd          xm8, xm1, q0000 ; cf[4,9]
    sar          bdmaxd, 1
    punpckhqdq      xm9, xm9        ; cf[10,11]
    movd            xm4, bdmaxd     ; max_grain
    pcmpeqd         xm5, xm5
    sub            bufq, 2*(82*73-(82*3+79))
    pxor            xm5, xm4        ; min_grain
    DEFINE_ARGS buf, fg_data, h, x
    mov              hd, 70
.y_loop_ar2:
    mov              xq, -76
.x_loop_ar2:
    vbroadcasti128   m2, [bufq+xq*2-82*4-4]        ; y=-2,x=[-2,+5]
    vinserti128      m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
    pshufb           m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    pmaddwd          m0, m6
    punpckhwd       xm2, xm1      ; y=-2/-1 interleaved, x=[+2,+5]
    pshufb           m1, m12      ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd          m1, m7
    pmaddwd         xm2, xm8
    paddd            m0, m1
    vextracti128    xm1, m0, 1
    paddd           xm0, xm10
    paddd           xm2, xm0
    movu            xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
    paddd           xm2, xm1
    pmovsxwd        xm1, [bufq+xq*2]   ; in dwords, y=0,x=[0,3]
.x_loop_ar2_inner:
    pmaddwd         xm3, xm9, xm0
    psrldq          xm0, 2
    paddd           xm3, xm2
    psrldq          xm2, 4 ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; skip packssdw because we only care about one value
    paddd           xm3, xm1
    pminsd          xm3, xm4
    psrldq          xm1, 4
    pmaxsd          xm3, xm5
    pextrw  [bufq+xq*2], xm3, 0
    punpcklwd       xm3, xm3
    pblendw         xm0, xm3, 0010b
    inc              xq
    jz .x_loop_ar2_end
    test             xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add            bufq, 82*2
    dec              hd
    jg .y_loop_ar2
    RET

.ar3:
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    sar          bdmaxd, 1
    movq            xm7, [fg_dataq+FGData.ar_coeffs_y+ 0]    ; cf0-6
    movd            xm0, [fg_dataq+FGData.ar_coeffs_y+14]    ; cf14-16
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
    pinsrb          xm0, [base+pb_1], 3                      ; cf14-16,pb_1
    movd            xm1, [fg_dataq+FGData.ar_coeffs_y+21]    ; cf21-23
    vinserti128      m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
    vpbroadcastw   xm11, [base+round_vals+shiftq*2-12]
    movd           xm12, bdmaxd ; max_grain
    punpcklbw        m7, m7      ; sign-extension
    punpcklbw        m0, m0      ; sign-extension
    punpcklbw       xm1, xm1
    REPX {psraw x, 8}, m7, m0, xm1
    pshufd           m4, m7, q0000   ; cf[0,1] | cf[7,8]
    pshufd           m5, m7, q1111   ; cf[2,3] | cf[9,10]
    pshufd           m6, m7, q2222   ; cf[4,5] | cf[11,12]
    pshufd          xm7, xm7, q3333  ; cf[6,13]
    pshufd           m8, m0, q0000   ; cf[14,15] | cf[17,18]
    pshufd           m9, m0, q1111   ; cf[16],pw_1 | cf[19,20]
    paddw           xm0, xm11, xm11
    pcmpeqd        xm13, xm13
    pblendw        xm10, xm1, xm0, 00001000b
    pxor           xm13, xm12 ; min_grain
    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 2*(82*73-(82*3+79))
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76
.x_loop_ar3:
    movu            xm0, [bufq+xq*2-82*6-6+ 0]    ; y=-3,x=[-3,+4]
    vinserti128      m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
    movq            xm1, [bufq+xq*2-82*6-6+16]    ; y=-3,x=[+5,+8]
    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
    palignr          m3, m1, m0, 2     ; y=-3/-2,x=[-2,+5]
    palignr          m1, m0, 12        ; y=-3/-2,x=[+3,+6]
    punpckhwd        m2, m0, m3        ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd        m0, m3            ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps           m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    pmaddwd          m0, m4
    pmaddwd          m2, m6
    pmaddwd          m3, m5
    paddd            m0, m2
    movu            xm2, [bufq+xq*2-82*2-6+ 0]    ; y=-1,x=[-3,+4]
    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
    paddd            m0, m3
    psrldq           m3, m2, 2
    punpcklwd        m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    pmaddwd          m3, m8     ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
    paddd            m0, m3
    psrldq           m3, m2, 4
    psrldq           m2, 6
    vpblendd         m2, m11, 0x0f ; rounding constant
    punpcklwd        m3, m2        ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    pmaddwd          m3, m9        ; x=[+2/+3,+3/+4,+4/+5,+5,+6]
    vextracti128    xm2, m1, 1
    punpcklwd       xm1, xm2
    pmaddwd         xm1, xm7       ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
    paddd            m0, m3
    vextracti128    xm2, m0, 1
    paddd           xm0, xm1
    movu            xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
    paddd           xm0, xm2
.x_loop_ar3_inner:
    pmaddwd         xm2, xm1, xm10
    pshuflw         xm3, xm2, q1032
    paddd           xm2, xm0 ; add top
    paddd           xm2, xm3 ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; skip packssdw because we only care about one value
    pminsd          xm2, xm12
    pmaxsd          xm2, xm13
    pextrw  [bufq+xq*2], xm2, 0
    pslldq          xm2, 4
    psrldq          xm1, 2
    pblendw         xm1, xm2, 0100b
    inc              xq
    jz .x_loop_ar3_end
    test             xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add            bufq, 82*2
    dec              hd
    jg .y_loop_ar3
    RET

; Chroma grain generator, instantiated per subsampling mode.
%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
%define base r8-generate_grain_uv_%1_16bpc_avx2_table
    lea              r8, [generate_grain_uv_%1_16bpc_avx2_table]
    movifnidn    bdmaxd, bdmaxm
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    lea             r6d, [bdmaxq+1]
    movq            xm4, [base+mul_bits]
    shr             r6d, 11 ; 0 for 10bpc, 2 for 12bpc
    movq            xm5, [base+hmul_bits]
    sub              r5, r6
    mova            xm6, [base+pb_mask]
    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
    vpbroadcastw    xm7, [base+round+r5*2-2]
    pxor            xm0, xm2
    lea              r6, [gaussian_sequence]
%if %2
    mov             r7d, 73-35*%3
    add            bufq, 44*2
.loop_y:
    mov              r5, -44*2
%else
    mov              r5, -82*73*2
    sub            bufq, r5
%endif
.loop_x:
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3        ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4        ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2   ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0        ; aggregate each bit into next seed's high bit
    por             xm2, xm3        ; 4 next output seeds
    pshuflw         xm0, xm2, q3333
    psrlw           xm2, 5
    movq            r10, xm2
    movzx           r9d, r10w
    movd            xm2, [r6+r9*2]
    rorx             r9, r10, 32
    shr            r10d, 16
    pinsrw          xm2, [r6+r10*2], 1
    movzx          r10d, r9w
    pinsrw          xm2, [r6+r10*2], 2
    shr             r9d, 16
    pinsrw          xm2, [r6+r9*2], 3
    paddw           xm2, xm2        ; otherwise bpc=12 w/ grain_scale_shift=0
    pmulhrsw        xm2, xm7        ; shifts by 0, which pmulhrsw does not support
    movq      [bufq+r5], xm2
    add              r5, 8
    jl .loop_x
%if %2
    add            bufq, 82*2
    dec             r7d
    jg .loop_y
%endif

    ; auto-regression code
    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r6, [r8+r6*4]
    add              r6, r8
    jmp              r6

INIT_YMM avx2
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    imul            uvd, 28
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    vpbroadcastb     m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    sar          bdmaxd, 1
    vpbroadcastd     m4, [base+gen_ar0_shift-24+shiftq*4]
    movd            xm6, bdmaxd
    pcmpeqw          m7, m7
    pmaddubsw        m4, m0   ; ar_coeff << (14 - shift)
    vpbroadcastw     m6, xm6  ; max_gain
    pxor             m7, m6   ; min_grain
    DEFINE_ARGS buf, bufy, h, x
%if %2
    vpbroadcastw     m5, [base+hmul_bits+2+%3*2]
    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
%else
    sub            bufq, 2*(82*70-3)
%endif
    add           bufyq, 2*(3+82*3)
    mov              hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels
    movu            xm0, [bufyq+16*0]
    vinserti128      m0, [bufyq+16*2], 1
    movu            xm1, [bufyq+16*1]
    vinserti128      m1, [bufyq+16*3], 1
%if %3
    movu            xm2, [bufyq+82*2+16*0]
    vinserti128      m2, [bufyq+82*2+16*2], 1
    movu            xm3, [bufyq+82*2+16*1]
    vinserti128      m3, [bufyq+82*2+16*3], 1
    paddw            m0, m2
    paddw            m1, m3
%endif
    phaddw           m0, m1
    movu            xm1, [bufyq+16*4]
    vinserti128      m1, [bufyq+16*6], 1
    movu            xm2, [bufyq+16*5]
    vinserti128      m2, [bufyq+16*7], 1
%if %3
    movu            xm3, [bufyq+82*2+16*4]
    vinserti128      m3, [bufyq+82*2+16*6], 1
    paddw            m1, m3
    movu            xm3, [bufyq+82*2+16*5]
    vinserti128      m3, [bufyq+82*2+16*7], 1
    paddw            m2, m3
%endif
    phaddw           m1, m2
    pmulhrsw         m0, m5
    pmulhrsw         m1, m5
%else
    xor              xd, xd
.x_loop_ar0:
    movu             m0, [bufyq+xq*2]
    movu             m1, [bufyq+xq*2+32]
%endif
    paddw            m0, m0
    paddw            m1, m1
    pmulhrsw         m0, m4
    pmulhrsw         m1, m4
%if %2
    paddw            m0, [bufq+ 0]
    paddw            m1, [bufq+32]
%else
    paddw            m0, [bufq+xq*2+ 0]
    paddw            m1, [bufq+xq*2+32]
%endif
    pminsw           m0, m6
    pminsw           m1, m6
    pmaxsw           m0, m7
    pmaxsw           m1, m7
%if %2
    movu      [bufq+ 0], m0
    movu      [bufq+32], m1

    ; last 6 pixels
    movu            xm0, [bufyq+32*4]
    movu            xm1, [bufyq+32*4+16]
%if %3
    paddw           xm0, [bufyq+32*4+82*2]
    paddw           xm1, [bufyq+32*4+82*2+16]
%endif
    phaddw          xm0, xm1
    movu            xm1, [bufq+32*2]
    pmulhrsw        xm0, xm5
    paddw           xm0, xm0
    pmulhrsw        xm0, xm4
    paddw           xm0, xm1
    pminsw          xm0, xm6
    pmaxsw          xm0, xm7
    vpblendd        xm0, xm1, 0x08
    movu    [bufq+32*2], xm0
%else
    movu [bufq+xq*2+ 0], m0
    movu [bufq+xq*2+32], m1
    add              xd, 32
    cmp              xd, 64
    jl .x_loop_ar0

    ; last 12 pixels
    movu             m0, [bufyq+64*2]
    movu             m1, [bufq+64*2]
    paddw            m0, m0
    pmulhrsw         m0, m4
    paddw            m0, m1
    pminsw           m0, m6
    pmaxsw           m0, m7
    vpblendd         m0, m1, 0xc0
    movu    [bufq+64*2], m0
%endif
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar0
    RET

INIT_XMM avx2
.ar1:
    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
    imul            uvd, 28
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    pmovsxwd        xm3, [base+round_vals+shiftq*2-12] ; rnd
    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
    vpbroadcastd    xm3, xm3
%if %2
    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub            bufq, 2*(82*69+3)
%endif
    add           bufyq, 2*(79+82*3)
    mov              hd, 70-35*%3
    sar            maxd, 1
    mov            mind, maxd
    xor            mind, -1
.y_loop_ar1:
    mov              xq, -(76>>%2)
    movsx         val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
%if %2
    movu            xm2, [bufyq+xq*4]
%else
    movq            xm2, [bufyq+xq*2]
%endif
%if %2
%if %3
    phaddw          xm2, [bufyq+xq*4+82*2]
    punpckhqdq      xm1, xm2, xm2
    paddw           xm2, xm1
%else
    phaddw          xm2, xm2
%endif
    pmulhrsw        xm2, xm6
%endif
    psrldq          xm1, xm0, 4 ; top/right
    punpcklwd       xm1, xm2
    psrldq          xm2, xm0, 2 ; top
    punpcklwd       xm0, xm2
    pmaddwd         xm1, xm5
    pmaddwd         xm0, xm4
    paddd           xm1, xm3
    paddd           xm0, xm1
.x_loop_ar1_inner:
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sarx          val3d, val3d, shiftd
    movsx         val0d, word [bufq+xq*2]
    add           val3d, val0d
    cmp           val3d, maxd
    cmovg         val3d, maxd
    cmp           val3d, mind
    cmovl         val3d, mind
    mov  word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar1
    RET

INIT_YMM avx2
.ar2:
%if WIN64
    ; xmm6 and xmm7 already saved
    %assign xmm_regs_used 13 + %2
    %assign stack_size_padded 136
    SUB             rsp, stack_size_padded
    movaps   [rsp+16*2], xmm8
    movaps   [rsp+16*3], xmm9
    movaps   [rsp+16*4], xmm10
    movaps   [rsp+16*5], xmm11
    movaps   [rsp+16*6], xmm12
%if %2
    movaps   [rsp+16*7], xmm13
%endif
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vbroadcasti128  m10, [base+gen_shufA]
    sar          bdmaxd, 1
    vbroadcasti128  m11, [base+gen_shufB]
    movd            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
    pinsrb          xm7, [base+pb_1], 5
    pinsrw          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
    pmovsxbw         m7, xm7
    movd            xm8, bdmaxd ; max_grain
    pshufd           m4, m7, q0000
    vpbroadcastw   xm12, [base+round_vals-12+shiftq*2]
    pshufd           m5, m7, q1111
    pcmpeqd         xm9, xm9
    pshufd           m6, m7, q2222
    pxor            xm9, xm8 ; min_grain
    pshufd          xm7, xm7, q3333
    DEFINE_ARGS buf, bufy, fg_data, h, x
%if %2
    vpbroadcastw   xm13, [base+hmul_bits+2+%3*2]
    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub            bufq, 2*(82*69+3)
%endif
    add           bufyq, 2*(79+82*3)
    mov              hd, 70-35*%3
.y_loop_ar2:
    mov              xq, -(76>>%2)
.x_loop_ar2:
    vbroadcasti128   m3, [bufq+xq*2-82*2-4]        ; y=-1,x=[-2,+5]
    vinserti128      m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
    pshufb           m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    pmaddwd          m0, m4
    pshufb           m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd          m1, m5
    punpckhwd        m2, m3      ; y=-2/-1 interleaved, x=[+2,+5]
%if %2
    movu            xm3, [bufyq+xq*4]
%if %3
    paddw           xm3, [bufyq+xq*4+82*2]
%endif
    phaddw          xm3, xm3
    pmulhrsw        xm3, xm13
%else
    movq            xm3, [bufyq+xq*2]
%endif
    punpcklwd       xm3, xm12 ; luma, round interleaved
    vpblendd         m2, m3, 0x0f
    pmaddwd          m2, m6
    paddd            m1, m0
    movu            xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
    paddd            m2, m1
    vextracti128    xm1, m2, 1
    paddd           xm2, xm1
    pshufd          xm1, xm0, q3321
    pmovsxwd        xm1, xm1 ; y=0,x=[0,3] in dword
.x_loop_ar2_inner:
    pmaddwd         xm3, xm7, xm0
    paddd           xm3, xm2
    psrldq          xm2, 4 ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; we do not need to packssdw since we only care about one value
    paddd           xm3, xm1
    psrldq          xm1, 4
    pminsd          xm3, xm8
    pmaxsd          xm3, xm9
    pextrw  [bufq+xq*2], xm3, 0
    psrldq          xm0, 2
    pslldq          xm3, 2
    pblendw         xm0, xm3, 00000010b
    inc              xq
    jz .x_loop_ar2_end
    test             xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar2
    RET

.ar3:
%if WIN64
    ; xmm6 and xmm7 already saved
    %assign stack_offset 32
    %assign xmm_regs_used 14 + %2
    %assign stack_size_padded 152
    SUB             rsp, stack_size_padded
    movaps   [rsp+16*2], xmm8
    movaps   [rsp+16*3], xmm9
    movaps   [rsp+16*4], xmm10
    movaps   [rsp+16*5], xmm11
    movaps   [rsp+16*6], xmm12
    movaps   [rsp+16*7], xmm13
%if %2
    movaps   [rsp+16*8], xmm14
%endif
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vpbroadcastw   xm11, [base+round_vals-12+shiftq*2]
    sar          bdmaxd, 1
    movq            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
    pmovsxbw         m7, xm7
%if %2
    vpbroadcastw   xm14, [base+hmul_bits+2+%3*2]
%endif
    pshufd           m4, m7, q0000
    pshufd           m5, m7, q1111
    pshufd           m6, m7, q2222
    pshufd           m7, m7, q3333
    movd            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
    pinsrb          xm0, [base+pb_1], 3
    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
    pmovsxbw         m0, xm0
    movd           xm12, bdmaxd ; max_grain
    pshufd           m8, m0, q0000
    pshufd           m9, m0, q1111
    pcmpeqd        xm13, xm13
    punpckhqdq     xm10, xm0, xm0
    pxor           xm13, xm12 ; min_grain
    pinsrw         xm10, [base+round_vals-10+shiftq*2], 3
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub            bufq, 2*(82*69+3)
%endif
    add           bufyq, 2*(79+82*3)
    mov              hd, 70-35*%3
.y_loop_ar3:
    mov              xq, -(76>>%2)
.x_loop_ar3:
    movu            xm2, [bufq+xq*2-82*6-6+ 0]    ; y=-3,x=[-3,+4]
    vinserti128      m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
    movq            xm1, [bufq+xq*2-82*6-6+16]    ; y=-3,x=[+5,+8]
    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
    palignr          m3, m1, m2, 2     ; y=-3/-2,x=[-2,+5]
    palignr          m1, m2, 12        ; y=-3/-2,x=[+3,+6]
    punpcklwd        m0, m2, m3        ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    punpckhwd        m2, m3            ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    shufps           m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    pmaddwd          m0, m4
    pmaddwd          m2, m6
    pmaddwd          m3, m5
    paddd            m0, m2
    paddd            m0, m3
    movu            xm2, [bufq+xq*2-82*2-6+ 0]    ; y=-1,x=[-3,+4]
    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
%if %2
    movu            xm3, [bufyq+xq*4]
%if %3
    paddw           xm3, [bufyq+xq*4+82*2]
%endif
    phaddw          xm3, xm3
    pmulhrsw        xm3, xm14
%else
    movq            xm3, [bufyq+xq*2]
%endif
    punpcklwd        m1, m3
    pmaddwd          m1, m7
    paddd            m0, m1
    psrldq           m1, m2, 4
    psrldq           m3, m2, 6
    vpblendd         m3, m11, 0x0f ; rounding constant
    punpcklwd        m1, m3        ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    pmaddwd          m1, m9        ; x=[+2/+3,+3/+4,+4/+5,+5,+6]
    psrldq           m3, m2, 2
    punpcklwd        m2, m3        ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    pmaddwd          m2, m8        ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
    paddd            m0, m1
    movu            xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
    paddd            m0, m2
    vextracti128    xm2, m0, 1
    paddd           xm0, xm2
.x_loop_ar3_inner:
    pmaddwd         xm2, xm1, xm10
    pshuflw         xm3, xm2, q1032
    paddd           xm2, xm0 ; add top
    paddd           xm2, xm3 ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    psrldq          xm1, 2
    ; no need to packssdw since we only care about one value
    pminsd          xm2, xm12
    pmaxsd          xm2, xm13
    pextrw  [bufq+xq*2], xm2, 0
    pslldq          xm2, 4
    pblendw         xm1, xm2, 00000100b
    inc              xq
    jz .x_loop_ar3_end
    test             xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add            bufq, 82*2
    add           bufyq, 82*2<<%3
    dec              hd
    jg .y_loop_ar3
    RET
%endmacro

cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, unused, sby, see
%define base r11-grain_min
    lea             r11, [grain_min]
    mov             r6d, r9m ; bdmax
    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov            sbyd, sbym
    vpbroadcastd     m8, r9m
    shr             r6d, 11 ; is_12bpc
    vpbroadcastd     m9, [base+grain_min+r6*4]
    shlx           r10d, r9d, r6d
    vpbroadcastd    m10, [base+grain_max+r6*4]
    lea             r9d, [r6+r9*4]
    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
    vpbroadcastd    m12, [base+fg_min+r10*4]
    vpbroadcastd    m13, [base+fg_max+r9*4]
    test           sbyd, sbyd
    setnz           r7b
    vpbroadcastd    m14, [base+pd_16]
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz .vertical_overlap

    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

.loop_x:
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y:
    ; scaling[src]
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m4, m2, 0x55
    psrld            m2, m1, 16
    mova             m9, m6
    pand             m2, m8
    vpgatherdd       m5, [scalingq+m2-2], m6
    pblendw          m5, m3, 0x55

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m4, [grain_lutq+offxyq*2]
    pmulhrsw         m5, [grain_lutq+offxyq*2+32]

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hd
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je .loop_x
    movq            xm7, [pw_27_17_17_27]
    cmp       dword r8m, 0 ; sby
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy

    lea     left_offxyd, [offyq+32] ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    ; scaling[src]
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m4, m2, 0x55
    psrld            m2, m1, 16
    mova             m9, m6
    pand             m2, m8
    vpgatherdd       m5, [scalingq+m2-2], m6
    pblendw          m5, m3, 0x55

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq*2]
    movd            xm6, [grain_lutq+left_offxyq*2]
    punpcklwd       xm6, xm3
    pmaddwd         xm6, xm7
    paddd           xm6, xm14
    psrad           xm6, 5
    packssdw        xm6, xm6
    pmaxsw          xm6, xm9
    pminsw          xm6, xm10
    vpblendd         m3, m6, 0x01

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m4, m3
    pmulhrsw         m5, [grain_lutq+offxyq*2+32]

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hd
    jg .loop_y_h_overlap
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    cmp       dword r8m, 0 ; sby
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see, src_bak

    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd ; (cur_seed << 16) | top_seed

    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

.loop_x_v_overlap:
    vpbroadcastd    m15, [pw_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_v_overlap:
    ; scaling[src]
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m9, m6
    pand             m4, m8
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu             m6, [grain_lutq+offxyq*2]
    movu             m5, [grain_lutq+top_offxyq*2]
    punpcklwd        m4, m5, m6
    punpckhwd        m5, m6
    pmaddwd          m4, m15
    pmaddwd          m5, m15
    movu             m7, [grain_lutq+offxyq*2+32]
    movu             m6, [grain_lutq+top_offxyq*2+32]
    paddd            m4, m14
    paddd            m5, m14
    psrad            m4, 5
    psrad            m5, 5
    packssdw         m4, m5
    punpcklwd        m5, m6, m7
    punpckhwd        m6, m7
    pmaddwd          m5, m15
    pmaddwd          m6, m15
    paddd            m5, m14
    paddd            m6, m14
    psrad            m5, 5
    psrad            m6, 5
    packssdw         m5, m6
    pmaxsw           m4, m9
    pmaxsw           m5, m9
    pminsw           m4, m10
    pminsw           m5, m10

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m11
    pmaddubsw        m3, m11
    paddw            m2, m2
    paddw            m3, m3
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hb
    jz .end_y_v_overlap
    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add              hd, 0x80000000
    jnc .loop_y_v_overlap
    jmp .loop_y
.end_y_v_overlap:
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

.loop_x_hv_overlap:
    vpbroadcastd    m15, [pw_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    lea topleft_offxyd, [top_offxyq+32]
    lea     left_offxyd, [offyq+32]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_hv_overlap:
    ; scaling[src]
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m9, m6
    pand             m4, m8
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu             m7, [grain_lutq+offxyq*2]
    movd            xm6, [grain_lutq+left_offxyq*2]
    movu             m5, [grain_lutq+top_offxyq*2]
    movd            xm4, [grain_lutq+topleft_offxyq*2]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd       xm6, xm7
    punpcklwd       xm4, xm5
    punpcklqdq      xm6, xm4
    movddup         xm4, [pw_27_17_17_27]
    pmaddwd         xm6, xm4
    paddd           xm6, xm14
    psrad           xm6, 5
    packssdw        xm6, xm6
    pmaxsw          xm6, xm9
    pminsw          xm6, xm10
    pshuflw         xm4, xm6, q1032
    vpblendd         m6, m7, 0xfe
    vpblendd         m4, m5, 0xfe
    ; followed by v interpolation (top | cur -> cur)
    punpckhwd        m5, m7
    pmaddwd          m5, m15
    punpcklwd        m4, m6
    pmaddwd          m4, m15
    movu             m7, [grain_lutq+offxyq*2+32]
    movu             m6, [grain_lutq+top_offxyq*2+32]
    paddd            m5, m14
    paddd            m4, m14
    psrad            m5, 5
    psrad            m4, 5
    packssdw         m4, m5
    punpcklwd        m5, m6, m7
    punpckhwd        m6, m7
    pmaddwd          m5, m15
    pmaddwd          m6, m15
    paddd            m5, m14
    paddd            m6, m14
    psrad            m5, 5
    psrad            m6, 5
    packssdw         m5, m6
    pmaxsw           m4, m9
    pmaxsw           m5, m9
    pminsw           m4, m10
    pminsw           m5, m10

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m11
    pmaddubsw        m3, m11
    paddw            m2, m2
    paddw            m3, m3
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hb
    jz .end_y_hv_overlap
    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add              hd, 0x80000000
    jnc .loop_y_hv_overlap
    movq            xm7, [pw_27_17_17_27]
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq*2]
    jl .loop_x_hv_overlap
.end:
    RET

%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r12-grain_min
    lea             r12, [grain_min]
    mov             r9d, r13m ; bdmax
1346 mov r7d, [fg_dataq+FGData.scaling_shift] 1347 mov r11d, is_idm 1348 mov sbyd, sbym 1349 vpbroadcastw m11, [base+mul_bits+r7*2-12] 1350 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 1351 shr r9d, 11 ; is_12bpc 1352 vpbroadcastd m8, [base+grain_min+r9*4] 1353 shlx r10d, r6d, r9d 1354 vpbroadcastd m9, [base+grain_max+r9*4] 1355 vpbroadcastw m10, r13m 1356 shlx r6d, r6d, r11d 1357 vpbroadcastd m12, [base+fg_min+r10*4] 1358 lea r6d, [r9+r6*2] 1359 vpbroadcastd m13, [base+fg_max+r6*4] 1360 test sbyd, sbyd 1361 setnz r7b 1362 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 1363 jne .csfl 1364 1365%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 1366 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1367 unused, sby, see, overlap 1368 1369%if %1 1370 mov r6d, r11m 1371 vpbroadcastd m0, [base+pb_8_9_0_1] 1372 vpbroadcastd m1, [base+uv_offset_mul+r9*4] 1373 vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] 1374 vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] 1375 pshufb m14, m0 ; { uv_luma_mult, uv_mult } 1376 pmaddwd m15, m1 1377%else 1378%if %2 1379 vpbroadcastq m15, [base+pw_23_22] 1380%else 1381 vpbroadcastq m15, [base+pw_27_17_17_27] 1382%endif 1383 vpbroadcastd m14, [base+pd_16] 1384%endif 1385 test r7b, [fg_dataq+FGData.overlap_flag] 1386 jnz %%vertical_overlap 1387 1388 imul seed, sbyd, (173 << 24) | 37 1389 add seed, (105 << 24) | 178 1390 rorx seed, seed, 24 1391 movzx seed, seew 1392 xor seed, [fg_dataq+FGData.seed] 1393 1394 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1395 unused2, unused3, see, unused4, unused5, unused6, luma, lstride 1396 1397 mov lumaq, r9mp 1398 mov lstrideq, r10mp 1399 lea r10, [srcq+wq*2] 1400 lea r11, [dstq+wq*2] 1401 lea r12, [lumaq+wq*(2<<%2)] 1402 mov r9mp, r10 1403 mov r11mp, r11 1404 mov r12mp, r12 1405 neg wq 1406 1407%%loop_x: 1408 rorx r6, seeq, 1 1409 or seed, 0xEFF4 1410 test seeb, seeh 1411 lea seed, [r6+0x8000] 1412 cmovp seed, r6d ; updated seed 1413 1414 
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1415 offx, offy, see, unused1, unused2, unused3, luma, lstride 1416 1417 rorx offyd, seed, 8 1418 rorx offxq, seeq, 12 1419 and offyd, 0xf 1420 imul offyd, 164>>%3 1421 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx 1422 1423 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1424 h, offxy, see, unused1, unused2, unused3, luma, lstride 1425 1426 mov grain_lutq, grain_lutmp 1427 mov hd, hm 1428%%loop_y: 1429 ; luma_src 1430%if %2 1431 mova xm2, [lumaq+lstrideq*0+ 0] 1432 vinserti128 m2, [lumaq+lstrideq*0+32], 1 1433 mova xm4, [lumaq+lstrideq*0+16] 1434 vinserti128 m4, [lumaq+lstrideq*0+48], 1 1435 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 1436 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 1437 mova xm5, [lumaq+lstrideq*(1<<%3)+16] 1438 vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 1439 phaddw m2, m4 1440 phaddw m3, m5 1441 pxor m4, m4 1442 pavgw m2, m4 1443 pavgw m3, m4 1444%elif %1 1445 mova m2, [lumaq+ 0] 1446 mova m3, [lumaq+32] 1447%endif 1448%if %1 1449 mova m0, [srcq] 1450%if %2 1451 mova m1, [srcq+strideq] 1452%else 1453 mova m1, [srcq+32] 1454%endif 1455 punpckhwd m4, m2, m0 1456 punpcklwd m2, m0 1457 punpckhwd m5, m3, m1 1458 punpcklwd m3, m1 ; { luma, chroma } 1459 REPX {pmaddwd x, m14}, m4, m2, m5, m3 1460 REPX {paddd x, m15}, m4, m2, m5, m3 1461 REPX {psrad x, 6 }, m4, m2, m5, m3 1462 packusdw m2, m4 1463 packusdw m3, m5 1464 pminuw m2, m10 1465 pminuw m3, m10 ; clip_pixel() 1466%elif %2 1467 pand m2, m10 1468 pand m3, m10 1469%else 1470 pand m2, m10, [lumaq+ 0] 1471 pand m3, m10, [lumaq+32] 1472%endif 1473 1474 ; scaling[luma_src] 1475 vpbroadcastd m7, [pd_m65536] 1476 pandn m4, m7, m2 1477 mova m6, m7 1478 vpgatherdd m5, [scalingq+m4-0], m7 1479 psrld m2, 16 1480 mova m7, m6 1481 vpgatherdd m4, [scalingq+m2-2], m6 1482 pblendw m4, m5, 0x55 1483 pandn m5, m7, m3 1484 mova m6, m7 1485 vpgatherdd m2, [scalingq+m5-0], m7 1486 psrld m3, 16 1487 
vpgatherdd m5, [scalingq+m3-2], m6 1488 pblendw m5, m2, 0x55 1489 1490 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1491 pmaddubsw m4, m11 1492 pmaddubsw m5, m11 1493 paddw m4, m4 1494 paddw m5, m5 1495 pmulhrsw m4, [grain_lutq+offxyq*2] 1496%if %2 1497 pmulhrsw m5, [grain_lutq+offxyq*2+82*2] 1498%else 1499 pmulhrsw m5, [grain_lutq+offxyq*2+32] 1500%endif 1501 1502 ; dst = clip_pixel(src, noise) 1503%if %1 1504 paddw m0, m4 1505 paddw m1, m5 1506%else 1507 paddw m0, m4, [srcq] 1508%if %2 1509 paddw m1, m5, [srcq+strideq] 1510%else 1511 paddw m1, m5, [srcq+32] 1512%endif 1513%endif 1514 pmaxsw m0, m12 1515 pmaxsw m1, m12 1516 pminsw m0, m13 1517 pminsw m1, m13 1518 mova [dstq], m0 1519%if %2 1520 mova [dstq+strideq], m1 1521 lea srcq, [srcq+strideq*2] 1522 lea dstq, [dstq+strideq*2] 1523 lea lumaq, [lumaq+lstrideq*(2<<%3)] 1524%else 1525 mova [dstq+32], m1 1526 add srcq, strideq 1527 add dstq, strideq 1528 add lumaq, lstrideq 1529%endif 1530 add grain_lutq, 82*(2<<%2) 1531%if %2 1532 sub hb, 2 1533%else 1534 dec hb 1535%endif 1536 jg %%loop_y 1537 add wq, 32>>%2 1538 jge .end 1539 mov srcq, r9mp 1540 mov dstq, r11mp 1541 mov lumaq, r12mp 1542 lea srcq, [srcq+wq*2] 1543 lea dstq, [dstq+wq*2] 1544 lea lumaq, [lumaq+wq*(2<<%2)] 1545 cmp byte [fg_dataq+FGData.overlap_flag], 0 1546 je %%loop_x 1547 cmp dword r8m, 0 ; sby 1548 jne %%loop_x_hv_overlap 1549 1550 ; horizontal overlap (without vertical overlap) 1551%%loop_x_h_overlap: 1552 rorx r6, seeq, 1 1553 or seed, 0xEFF4 1554 test seeb, seeh 1555 lea seed, [r6+0x8000] 1556 cmovp seed, r6d ; updated seed 1557 1558 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1559 offx, offy, see, left_offxy, unused1, unused2, luma, lstride 1560 1561 lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx 1562 rorx offyd, seed, 8 1563 rorx offxq, seeq, 12 1564 and offyd, 0xf 1565 imul offyd, 164>>%3 1566 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 1567 1568 
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1569 h, offxy, see, left_offxy, unused1, unused2, luma, lstride 1570 1571 mov grain_lutq, grain_lutmp 1572 mov hd, hm 1573%%loop_y_h_overlap: 1574 ; luma_src 1575%if %2 1576 mova xm2, [lumaq+lstrideq*0+ 0] 1577 vinserti128 m2, [lumaq+lstrideq*0+32], 1 1578 mova xm4, [lumaq+lstrideq*0+16] 1579 vinserti128 m4, [lumaq+lstrideq*0+48], 1 1580 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 1581 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 1582 mova xm5, [lumaq+lstrideq*(1<<%3)+16] 1583 vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 1584 phaddw m2, m4 1585 phaddw m3, m5 1586 pxor m4, m4 1587 pavgw m2, m4 1588 pavgw m3, m4 1589%elif %1 1590 mova m2, [lumaq] 1591 mova m3, [lumaq+32] 1592%endif 1593%if %1 1594 mova m0, [srcq] 1595%if %2 1596 mova m1, [srcq+strideq] 1597%else 1598 mova m1, [srcq+32] 1599%endif 1600 punpckhwd m4, m2, m0 1601 punpcklwd m2, m0 1602 punpckhwd m5, m3, m1 1603 punpcklwd m3, m1 ; { luma, chroma } 1604 REPX {pmaddwd x, m14}, m4, m2, m5, m3 1605 REPX {paddd x, m15}, m4, m2, m5, m3 1606 REPX {psrad x, 6 }, m4, m2, m5, m3 1607 packusdw m2, m4 1608 packusdw m3, m5 1609 pminuw m2, m10 ; clip_pixel() 1610 pminuw m3, m10 1611%elif %2 1612 pand m2, m10 1613 pand m3, m10 1614%else 1615 pand m2, m10, [lumaq+ 0] 1616 pand m3, m10, [lumaq+32] 1617%endif 1618 1619 ; scaling[luma_src] 1620 vpbroadcastd m7, [pd_m65536] 1621 pandn m4, m7, m2 1622 mova m6, m7 1623 vpgatherdd m5, [scalingq+m4-0], m7 1624 psrld m2, 16 1625 mova m7, m6 1626 vpgatherdd m4, [scalingq+m2-2], m6 1627 pblendw m4, m5, 0x55 1628 pandn m5, m7, m3 1629 mova m6, m7 1630 vpgatherdd m2, [scalingq+m5-0], m7 1631 psrld m3, 16 1632 vpgatherdd m5, [scalingq+m3-2], m6 1633 pblendw m5, m2, 0x55 1634 1635 ; grain = grain_lut[offy+y][offx+x] 1636 movu m2, [grain_lutq+offxyq*2] 1637%if %2 1638 movu m3, [grain_lutq+offxyq*2+82*2] 1639%else 1640 movu m3, [grain_lutq+offxyq*2+32] 1641%endif 1642 movd xm6, [grain_lutq+left_offxyq*2] 1643%if %2 1644 pinsrw 
xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} 1645 punpckldq xm7, xm2, xm3 ; {cur0, cur1} 1646 punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} 1647%else 1648 punpcklwd xm6, xm2 1649%endif 1650%if %1 1651%if %2 1652 vpbroadcastq xm7, [pw_23_22] 1653%else 1654 movq xm7, [pw_27_17_17_27] 1655%endif 1656 pmaddwd xm6, xm7 1657 vpbroadcastd xm7, [pd_16] 1658 paddd xm6, xm7 1659%else 1660 pmaddwd xm6, xm15 1661 paddd xm6, xm14 1662%endif 1663 psrad xm6, 5 1664 packssdw xm6, xm6 1665 pmaxsw xm6, xm8 1666 pminsw xm6, xm9 1667 vpblendd m2, m6, 0x01 1668%if %2 1669 pshuflw xm6, xm6, q1032 1670 vpblendd m3, m6, 0x01 1671%endif 1672 1673 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1674 pmaddubsw m4, m11 1675 pmaddubsw m5, m11 1676 paddw m4, m4 1677 paddw m5, m5 1678 pmulhrsw m2, m4 1679 pmulhrsw m3, m5 1680 1681 ; dst = clip_pixel(src, noise) 1682%if %1 1683 paddw m0, m2 1684 paddw m1, m3 1685%else 1686 paddw m0, m2, [srcq] 1687%if %2 1688 paddw m1, m3, [srcq+strideq] 1689%else 1690 paddw m1, m3, [srcq+32] 1691%endif 1692%endif 1693 pmaxsw m0, m12 1694 pmaxsw m1, m12 1695 pminsw m0, m13 1696 pminsw m1, m13 1697 mova [dstq], m0 1698%if %2 1699 mova [dstq+strideq], m1 1700 lea srcq, [srcq+strideq*2] 1701 lea dstq, [dstq+strideq*2] 1702 lea lumaq, [lumaq+lstrideq*(2<<%3)] 1703%else 1704 mova [dstq+32], m1 1705 add srcq, strideq 1706 add dstq, strideq 1707 add lumaq, r10mp 1708%endif 1709 add grain_lutq, 82*(2<<%2) 1710%if %2 1711 sub hb, 2 1712%else 1713 dec hb 1714%endif 1715 jg %%loop_y_h_overlap 1716 add wq, 32>>%2 1717 jge .end 1718 mov srcq, r9mp 1719 mov dstq, r11mp 1720 mov lumaq, r12mp 1721 lea srcq, [srcq+wq*2] 1722 lea dstq, [dstq+wq*2] 1723 lea lumaq, [lumaq+wq*(2<<%2)] 1724 cmp dword r8m, 0 ; sby 1725 jne %%loop_x_hv_overlap 1726 jmp %%loop_x_h_overlap 1727 1728%%vertical_overlap: 1729 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ 1730 sby, see, unused1, unused2, unused3, lstride 1731 1732 movzx sbyd, sbyb 1733 imul 
seed, [fg_dataq+FGData.seed], 0x00010001 1734 imul r7d, sbyd, 173 * 0x00010001 1735 imul sbyd, 37 * 0x01000100 1736 add r7d, (105 << 16) | 188 1737 add sbyd, (178 << 24) | (141 << 8) 1738 and r7d, 0x00ff00ff 1739 and sbyd, 0xff00ff00 1740 xor seed, r7d 1741 xor seed, sbyd ; (cur_seed << 16) | top_seed 1742 1743 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1744 offx, offy, see, unused1, top_offxy, unused2, luma, lstride 1745 1746 mov lumaq, r9mp 1747 mov lstrideq, r10mp 1748 lea r10, [srcq+wq*2] 1749 lea r11, [dstq+wq*2] 1750 lea r12, [lumaq+wq*(2<<%2)] 1751 mov r9mp, r10 1752 mov r11mp, r11 1753 mov r12mp, r12 1754 neg wq 1755 1756%%loop_x_v_overlap: 1757 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1758 mov r6d, seed 1759 or seed, 0xeff4eff4 1760 test seeb, seeh 1761 setp r7b ; parity of top_seed 1762 shr seed, 16 1763 shl r7d, 16 1764 test seeb, seeh 1765 setp r7b ; parity of cur_seed 1766 or r6d, 0x00010001 1767 xor r7d, r6d 1768 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1769 1770 rorx offyd, seed, 8 1771 rorx offxd, seed, 12 1772 and offyd, 0xf000f 1773 and offxd, 0xf000f 1774 imul offyd, 164>>%3 1775 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1776 lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 1777 1778 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1779 h, offxy, see, unused1, top_offxy, unused2, luma, lstride 1780 1781 mov grain_lutq, grain_lutmp 1782 mov hd, hm 1783 movzx top_offxyd, offxyw 1784 shr offxyd, 16 1785%if %2 == 0 1786 lea r10, [pw_27_17_17_27] 1787%endif 1788%%loop_y_v_overlap: 1789 ; luma_src 1790%if %2 1791 mova xm2, [lumaq+lstrideq*0+ 0] 1792 vinserti128 m2, [lumaq+lstrideq*0+32], 1 1793 mova xm4, [lumaq+lstrideq*0+16] 1794 vinserti128 m4, [lumaq+lstrideq*0+48], 1 1795 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 1796 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 1797 mova xm5, [lumaq+lstrideq*(1<<%3)+16] 1798 vinserti128 m5, 
[lumaq+lstrideq*(1<<%3)+48], 1 1799 phaddw m2, m4 1800 phaddw m3, m5 1801 pxor m4, m4 1802 pavgw m2, m4 1803 pavgw m3, m4 1804%elif %1 1805 mova m2, [lumaq] 1806 mova m3, [lumaq+32] 1807%endif 1808%if %1 1809 mova m0, [srcq] 1810%if %2 1811 mova m1, [srcq+strideq] 1812%else 1813 mova m1, [srcq+32] 1814%endif 1815 punpckhwd m4, m2, m0 1816 punpcklwd m2, m0 1817 punpckhwd m5, m3, m1 1818 punpcklwd m3, m1 ; { luma, chroma } 1819 REPX {pmaddwd x, m14}, m4, m2, m5, m3 1820 REPX {paddd x, m15}, m4, m2, m5, m3 1821 REPX {psrad x, 6 }, m4, m2, m5, m3 1822 packusdw m2, m4 1823 packusdw m3, m5 1824 pminuw m2, m10 ; clip_pixel() 1825 pminuw m3, m10 1826%elif %2 1827 pand m2, m10 1828 pand m3, m10 1829%else 1830 pand m2, m10, [lumaq+ 0] 1831 pand m3, m10, [lumaq+32] 1832%endif 1833 1834 ; scaling[luma_src] 1835 vpbroadcastd m7, [pd_m65536] 1836 pandn m4, m7, m2 1837 mova m6, m7 1838 vpgatherdd m5, [scalingq+m4-0], m7 1839 psrld m2, 16 1840 mova m7, m6 1841 vpgatherdd m4, [scalingq+m2-2], m6 1842 pblendw m4, m5, 0x55 1843 pandn m5, m7, m3 1844 mova m6, m7 1845 vpgatherdd m2, [scalingq+m5-0], m7 1846 psrld m3, 16 1847 vpgatherdd m5, [scalingq+m3-2], m6 1848 pblendw m5, m2, 0x55 1849 1850 ; grain = grain_lut[offy+y][offx+x] 1851 movu m6, [grain_lutq+offxyq*2] 1852 movu m3, [grain_lutq+top_offxyq*2] 1853 punpcklwd m2, m3, m6 1854 punpckhwd m3, m6 ; { top, cur } 1855%if %3 1856 vpbroadcastd m0, [pw_23_22] 1857%elif %2 1858 vpbroadcastd m0, [pw_27_17_17_27] 1859%else 1860 vpbroadcastd m0, [r10] 1861%endif 1862 REPX {pmaddwd x, m0}, m2, m3 1863%if %1 1864 vpbroadcastd m1, [pd_16] 1865 REPX {paddd x, m1}, m2, m3 1866%else 1867 REPX {paddd x, m14}, m2, m3 1868%endif 1869 REPX {psrad x, 5}, m2, m3 1870 packssdw m2, m3 1871%if %2 1872 movu m3, [grain_lutq+offxyq*2+82*2] 1873%else 1874 movu m3, [grain_lutq+offxyq*2+32] 1875%endif 1876%if %3 1877 pmaxsw m2, m8 1878 pminsw m2, m9 1879%else 1880%if %2 1881 movu m7, [grain_lutq+top_offxyq*2+82*2] 1882 punpckhwd m6, m3, m7 ; { cur, top } 1883 
punpcklwd m3, m7 1884%else 1885 movu m7, [grain_lutq+top_offxyq*2+32] 1886 punpckhwd m6, m7, m3 1887 punpcklwd m3, m7, m3 ; { top, cur } 1888%endif 1889 pmaddwd m6, m0 1890 pmaddwd m3, m0 1891%if %1 1892 paddd m6, m1 1893 paddd m3, m1 1894%else 1895 paddd m6, m14 1896 paddd m3, m14 1897%endif 1898 psrad m6, 5 1899 psrad m3, 5 1900 packssdw m3, m6 1901 pmaxsw m2, m8 1902 pmaxsw m3, m8 1903 pminsw m2, m9 1904 pminsw m3, m9 1905%endif 1906 1907 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 1908 pmaddubsw m4, m11 1909 pmaddubsw m5, m11 1910 paddw m4, m4 1911 paddw m5, m5 1912 pmulhrsw m2, m4 1913 pmulhrsw m3, m5 1914 1915 ; dst = clip_pixel(src, noise) 1916 paddw m0, m2, [srcq] 1917%if %2 1918 paddw m1, m3, [srcq+strideq] 1919%else 1920 paddw m1, m3, [srcq+32] 1921%endif 1922 pmaxsw m0, m12 1923 pmaxsw m1, m12 1924 pminsw m0, m13 1925 pminsw m1, m13 1926 mova [dstq], m0 1927%if %2 1928 mova [dstq+strideq], m1 1929 sub hb, 2 1930%else 1931 mova [dstq+32], m1 1932 dec hb 1933%endif 1934 jle %%end_y_v_overlap 1935%if %2 1936 lea srcq, [srcq+strideq*2] 1937 lea dstq, [dstq+strideq*2] 1938 lea lumaq, [lumaq+lstrideq*(2<<%3)] 1939%else 1940 add srcq, strideq 1941 add dstq, strideq 1942 add lumaq, lstrideq 1943%endif 1944 add grain_lutq, 82*(2<<%2) 1945%if %2 1946 jmp %%loop_y 1947%else 1948 add hd, 0x80000000 1949 jc %%loop_y 1950 add r10, 4 1951 jmp %%loop_y_v_overlap 1952%endif 1953%%end_y_v_overlap: 1954 add wq, 32>>%2 1955 jge .end 1956 mov srcq, r9mp 1957 mov dstq, r11mp 1958 mov lumaq, r12mp 1959 lea srcq, [srcq+wq*2] 1960 lea dstq, [dstq+wq*2] 1961 lea lumaq, [lumaq+wq*(2<<%2)] 1962 1963 ; since fg_dataq.overlap is guaranteed to be set, we never jump 1964 ; back to .loop_x_v_overlap, and instead always fall-through to 1965 ; h+v overlap 1966%%loop_x_hv_overlap: 1967 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1968 mov r6d, seed 1969 or seed, 0xeff4eff4 1970 test seeb, seeh 1971 setp r7b ; parity of top_seed 1972 shr seed, 16 1973 
shl r7d, 16 1974 test seeb, seeh 1975 setp r7b ; parity of cur_seed 1976 or r6d, 0x00010001 1977 xor r7d, r6d 1978 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1979 1980 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1981 offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride 1982 1983%if %2 == 0 1984 lea r14, [pw_27_17_17_27] 1985%endif 1986 lea topleft_offxyq, [top_offxyq+(32>>%2)] 1987 lea left_offxyq, [offyq+(32>>%2)] 1988 rorx offyd, seed, 8 1989 rorx offxd, seed, 12 1990 and offyd, 0xf000f 1991 and offxd, 0xf000f 1992 imul offyd, 164>>%3 1993 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1994 lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 1995 1996 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1997 h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride 1998 1999 mov grain_lutq, grain_lutmp 2000 mov hd, hm 2001 movzx top_offxyd, offxyw 2002 shr offxyd, 16 2003%%loop_y_hv_overlap: 2004 ; luma_src 2005%if %2 2006 mova xm2, [lumaq+lstrideq*0+ 0] 2007 vinserti128 m2, [lumaq+lstrideq*0+32], 1 2008 mova xm4, [lumaq+lstrideq*0+16] 2009 vinserti128 m4, [lumaq+lstrideq*0+48], 1 2010 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 2011 vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 2012 mova xm5, [lumaq+lstrideq*(1<<%3)+16] 2013 vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 2014 phaddw m2, m4 2015 phaddw m3, m5 2016 pxor m4, m4 2017 pavgw m2, m4 2018 pavgw m3, m4 2019%elif %1 2020 mova m2, [lumaq] 2021 mova m3, [lumaq+32] 2022%endif 2023%if %1 2024 mova m0, [srcq] 2025%if %2 2026 mova m1, [srcq+strideq] 2027%else 2028 mova m1, [srcq+32] 2029%endif 2030 punpckhwd m4, m2, m0 2031 punpcklwd m2, m0 2032 punpckhwd m5, m3, m1 2033 punpcklwd m3, m1 ; { luma, chroma } 2034 REPX {pmaddwd x, m14}, m4, m2, m5, m3 2035 REPX {paddd x, m15}, m4, m2, m5, m3 2036 REPX {psrad x, 6 }, m4, m2, m5, m3 2037 packusdw m2, m4 2038 packusdw m3, m5 2039 pminuw m2, m10 ; clip_pixel() 2040 
pminuw m3, m10 2041%elif %2 2042 pand m2, m10 2043 pand m3, m10 2044%else 2045 pand m2, m10, [lumaq+ 0] 2046 pand m3, m10, [lumaq+32] 2047%endif 2048 2049 ; scaling[luma_src] 2050 vpbroadcastd m7, [pd_m65536] 2051 pandn m4, m7, m2 2052 mova m6, m7 2053 vpgatherdd m5, [scalingq+m4-0], m7 2054 psrld m2, 16 2055 mova m7, m6 2056 vpgatherdd m4, [scalingq+m2-2], m6 2057 pblendw m4, m5, 0x55 2058 pandn m5, m7, m3 2059 mova m6, m7 2060 vpgatherdd m2, [scalingq+m5-0], m7 2061 psrld m3, 16 2062 vpgatherdd m5, [scalingq+m3-2], m6 2063 pblendw m5, m2, 0x55 2064 2065 ; grain = grain_lut[offy+y][offx+x] 2066 movu m0, [grain_lutq+offxyq*2] 2067 movd xm2, [grain_lutq+left_offxyq*2] 2068 movu m6, [grain_lutq+top_offxyq*2] 2069%if %2 2070 pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 2071 movu m3, [grain_lutq+offxyq*2+82*2] 2072 punpckldq xm1, xm0, xm3 ; { cur0, cur1 } 2073%if %3 2074 vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } 2075 vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } 2076%else 2077 vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 2078 vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] 2079 vpblendd m2, m7, 0x20 2080 movd xm7, [grain_lutq+top_offxyq*2+82*2] 2081 punpckldq xm7, xm6 2082 vinserti128 m1, xm7, 1 2083 movu m7, [grain_lutq+top_offxyq*2+82*2] 2084%endif 2085 punpcklwd m2, m1 ; { cur, left } 2086%if %1 2087 vpbroadcastq m1, [pw_23_22] 2088 pmaddwd m2, m1 2089 vpbroadcastd m1, [pd_16] 2090 paddd m2, m1 2091 psrad m2, 5 2092 packssdw m2, m2 2093 vpermq m2, m2, q3120 2094%else 2095 pmaddwd m2, m15 2096 paddd m2, m14 2097 psrad m2, 5 2098 vextracti128 xm1, m2, 1 2099 packssdw xm2, xm1 2100%endif 2101%else 2102 pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 2103 movu m3, [grain_lutq+offxyq*2+32] 2104 movu m7, [grain_lutq+top_offxyq*2+32] 2105 punpckldq xm1, xm0, xm6 2106 punpcklwd xm2, xm1 ; { cur, left } 2107%if %1 2108 movddup xm1, [pw_27_17_17_27] 2109 pmaddwd xm2, xm1 2110 vpbroadcastd m1, [pd_16] 2111 paddd 
xm2, xm1 2112%else 2113 pmaddwd xm2, xm15 2114 paddd xm2, xm14 2115%endif 2116 psrad xm2, 5 2117 packssdw xm2, xm2 2118%endif 2119 pmaxsw xm2, xm8 2120 pminsw xm2, xm9 2121 vpblendd m0, m2, 0x01 2122%if %2 2123 pshufd xm2, xm2, q0321 2124 vpblendd m3, m2, 0x01 2125%if %3 == 0 2126 pshufd xm2, xm2, q0321 2127 vpblendd m7, m2, 0x01 2128%endif 2129%endif 2130 pshuflw xm2, xm2, q1032 2131 vpblendd m2, m6, 0xfe 2132 punpckhwd m6, m0 ; { top, cur } 2133 punpcklwd m2, m0 2134%if %3 2135 vpbroadcastd m0, [pw_23_22] 2136%elif %2 2137 vpbroadcastd m0, [pw_27_17_17_27] 2138%else 2139 vpbroadcastd m0, [r14] 2140%endif 2141 pmaddwd m6, m0 2142 pmaddwd m2, m0 2143%if %1 2144 paddd m6, m1 2145 paddd m2, m1 2146%else 2147 paddd m6, m14 2148 paddd m2, m14 2149%endif 2150 psrad m6, 5 2151 psrad m2, 5 2152 packssdw m2, m6 2153 2154%if %3 2155 pmaxsw m2, m8 2156 pminsw m2, m9 2157%else 2158%if %2 2159 punpckhwd m6, m3, m7 2160 punpcklwd m3, m7 ; { cur, top } 2161%else 2162 punpckhwd m6, m7, m3 2163 punpcklwd m3, m7, m3 ; { top, cur } 2164%endif 2165 REPX {pmaddwd x, m0}, m6, m3 2166%if %1 2167 REPX {paddd x, m1}, m6, m3 2168%else 2169 REPX {paddd x, m14}, m6, m3 2170%endif 2171 REPX {psrad x, 5}, m6, m3 2172 packssdw m3, m6 2173 pmaxsw m2, m8 2174 pmaxsw m3, m8 2175 pminsw m2, m9 2176 pminsw m3, m9 2177%endif 2178 2179 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2180 pmaddubsw m4, m11 2181 pmaddubsw m5, m11 2182 paddw m4, m4 2183 paddw m5, m5 2184 pmulhrsw m2, m4 2185 pmulhrsw m3, m5 2186 2187 ; dst = clip_pixel(src, noise) 2188 paddw m0, m2, [srcq] 2189%if %2 2190 paddw m1, m3, [srcq+strideq] 2191%else 2192 paddw m1, m3, [srcq+32] 2193%endif 2194 pmaxsw m0, m12 2195 pmaxsw m1, m12 2196 pminsw m0, m13 2197 pminsw m1, m13 2198 mova [dstq], m0 2199%if %2 2200 mova [dstq+strideq], m1 2201 lea srcq, [srcq+strideq*2] 2202 lea dstq, [dstq+strideq*2] 2203 lea lumaq, [lumaq+lstrideq*(2<<%3)] 2204%else 2205 mova [dstq+32], m1 2206 add srcq, strideq 2207 add dstq, strideq 2208 
add lumaq, r10mp 2209%endif 2210 add grain_lutq, 82*(2<<%2) 2211%if %2 2212 sub hb, 2 2213 jg %%loop_y_h_overlap 2214%else 2215 dec hb 2216 jle %%end_y_hv_overlap 2217 add hd, 0x80000000 2218 jc %%loop_y_h_overlap 2219 add r14, 4 2220 jmp %%loop_y_hv_overlap 2221%endif 2222%%end_y_hv_overlap: 2223 add wq, 32>>%2 2224 jge .end 2225 mov srcq, r9mp 2226 mov dstq, r11mp 2227 mov lumaq, r12mp 2228 lea srcq, [srcq+wq*2] 2229 lea dstq, [dstq+wq*2] 2230 lea lumaq, [lumaq+wq*(2<<%2)] 2231 jmp %%loop_x_hv_overlap 2232%endmacro 2233 2234 %%FGUV_32x32xN_LOOP 1, %2, %3 2235.csfl: 2236 %%FGUV_32x32xN_LOOP 0, %2, %3 2237.end: 2238 RET 2239%endmacro 2240 2241GEN_GRAIN_UV_FN 420, 1, 1 2242FGUV_FN 420, 1, 1 2243GEN_GRAIN_UV_FN 422, 1, 0 2244FGUV_FN 422, 1, 0 2245GEN_GRAIN_UV_FN 444, 0, 0 2246FGUV_FN 444, 0, 0 2247 2248%endif ; ARCH_X86_64 2249