1; Copyright © 2019-2022, VideoLAN and dav1d authors 2; Copyright © 2019-2022, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28%include "x86/filmgrain_common.asm" 29 30%if ARCH_X86_64 31 32SECTION_RODATA 32 33pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 34gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 35gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 36gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 37gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 38gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 39; note: the order of (some of) the following constants matter 40pb_27_17: times 2 db 27, 17 41byte_blend: db 0, 0, 0, -1 42pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 43pb_17_27: times 2 db 17, 27 44pb_1: times 4 db 1 45pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32 46next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 47pw_seed_xor: times 2 dw 0xb524 48 times 2 dw 0x49d8 49fg_min: times 4 db 0 50 times 4 db 16 51fg_max: times 4 db 255 52 times 4 db 240 53 times 4 db 235 54pd_m65536: dd -65536 55pw_8: times 2 dw 8 56pw_1024: times 2 dw 1024 57hmul_bits: dw 32768, 16384, 8192, 4096 58round: dw 2048, 1024, 512 59mul_bits: dw 256, 128, 64, 32, 16 60round_vals: dw 32, 64, 128, 256, 512 61pw_1: dw 1 62 63%macro JMP_TABLE 2-* 64 %1_8bpc_%2_table: 65 %xdefine %%base %1_8bpc_%2_table 66 %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) 67 %rep %0 - 2 68 dd %%prefix %+ .ar%3 - %%base 69 %rotate 1 70 %endrep 71%endmacro 72 73JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 74JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 75JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 76JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 77 78SECTION .text 79 80INIT_YMM avx2 81cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data 82%define base r4-generate_grain_y_8bpc_avx2_table 83 lea r4, [generate_grain_y_8bpc_avx2_table] 84 vpbroadcastw xm0, [fg_dataq+FGData.seed] 85 mov r6d, [fg_dataq+FGData.grain_scale_shift] 86 movq xm1, 
[base+next_upperbit_mask]                   ; operand of the "movq xm1, ..." begun on the previous source line
    movsxd          r5, [fg_dataq+FGData.ar_coeff_lag]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mov             r7, -73*82              ; negative byte count of the 82x73 grain buffer
    mova            xm6, [base+pb_mask]
    sub             bufq, r7                ; point bufq past the buffer; write via negative offset r7
    vpbroadcastw    xm7, [base+round+r6*2]  ; rounding constant selected by grain_scale_shift
    lea             r6, [gaussian_sequence]
    movsxd          r5, [r4+r5*4]           ; jump-table displacement for .ar<ar_coeff_lag>
.loop:
    ; Advance 4 LFSR seeds in parallel (tap bits 0xf/0x1e/0x3c/0x78), then a second
    ; batch of 4, producing 8 11-bit gaussian_sequence indices per iteration.
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3                ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4                ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2           ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0                ; aggregate each bit into next seed's high bit
    por             xm3, xm2                ; 4 next output seeds
    pshuflw         xm0, xm3, q3333
    psrlw           xm3, 5                  ; first 4 indices into gaussian_sequence
    pand            xm2, xm0, xm1           ; second 4-seed round, interleaved with the
    movq            r2, xm3                 ; scalar table gathers below to hide latency
    psrlw           xm3, xm2, 10
    por             xm2, xm3
    pmullw          xm2, xm4
    pmulhuw         xm0, xm5
    movzx           r3d, r2w
    pshufb          xm3, xm6, xm2
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm0, xm2
    movd            xm2, [r6+r3*2]          ; gather gaussian_sequence[idx], one word per lane
    rorx            r3, r2, 32
    por             xm3, xm0
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 1
    pshuflw         xm0, xm3, q3333
    movzx           r2d, r3w
    psrlw           xm3, 5
    pinsrw          xm2, [r6+r2*2], 2
    shr             r3d, 16
    movq            r2, xm3
    pinsrw          xm2, [r6+r3*2], 3
    movzx           r3d, r2w
    pinsrw          xm2, [r6+r3*2], 4
    rorx            r3, r2, 32
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 5
    movzx           r2d, r3w
    pinsrw          xm2, [r6+r2*2], 6
    shr             r3d, 16
    pinsrw          xm2, [r6+r3*2], 7
    pmulhrsw        xm2, xm7                ; apply grain_scale_shift rounding
    packsswb        xm2, xm2
    movq            [bufq+r7], xm2          ; store 8 grain bytes
    add             r7, 8
    jl              .loop

    ; auto-regression code
    add             r5, r4                  ; dispatch to .ar0/.ar1/.ar2/.ar3
    jmp             r5

.ar1:
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx           cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]  ; serial "left" tap, kept in a GPR
    movd            xm5, [fg_dataq+FGData.ar_coeffs_y]
    mova            xm2, [base+gen_shufC]
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    pinsrb          xm5, [base+pb_1], 3
    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12] ; rnd
pmovsxbw        xm5, xm5                    ; widen coefficients (cf0-2 plus a 1 in lane 3) to words
    pshufd          xm4, xm5, q0000
    pshufd          xm5, xm5, q1111
    sub             bufq, 82*73-(82*3+79)   ; rewind to the first AR output row (addressed via negative x)
    mov             hd, 70                  ; 73 rows minus the 3 top rows of pure noise
    mov             mind, -128              ; clamp bounds for signed 8-bit grain
    mov             maxd, 127
.y_loop_ar1:
    mov             xq, -76
    movsx           val3d, byte [bufq+xq-1] ; seed the serial "left" value from the previous pixel
.x_loop_ar1:
    ; Vector part: top-row taps + rounding for 4 output pixels at once.
    pmovsxbw        xm1, [bufq+xq-82-3]
    pshufb          xm0, xm1, xm2
    punpckhwd       xm1, xm3                ; interleave top samples with the rounding constant
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1                ; 4 partial sums (top taps + rnd)
.x_loop_ar1_inner:
    ; Scalar part: the left tap is serially dependent, so finish each pixel in GPRs.
    movd            val0d, xm0
    psrldq          xm0, 4                  ; consume one partial sum per pixel
    imul            val3d, cf3d             ; left * cf3
    add             val3d, val0d
    movsx           val0d, byte [bufq+xq]   ; noise value already in the buffer
    sarx            val3d, val3d, shiftd    ; >> ar_coeff_shift
    add             val3d, val0d
    cmp             val3d, maxd
    cmovns          val3d, maxd             ; min(val3, 127)
    cmp             val3d, mind
    cmovs           val3d, mind             ; max(val3, -128)
    mov             [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz              .x_loop_ar1_end
    test            xb, 3
    jnz             .x_loop_ar1_inner
    jmp             .x_loop_ar1             ; every 4th pixel, recompute the vector part
.x_loop_ar1_end:
    add             bufq, 82
    dec             hd
    jg              .y_loop_ar1
.ar0:
    RET

.ar2:
%if WIN64
    ; xmm6 and xmm7 already saved
    %assign xmm_regs_used 16
    %assign stack_size_padded 168
    SUB             rsp, stack_size_padded
    movaps          [rsp+16*2], xmm8
    movaps          [rsp+16*3], xmm9
    movaps          [rsp+16*4], xmm10
    movaps          [rsp+16*5], xmm11
    movaps          [rsp+16*6], xmm12
    movaps          [rsp+16*7], xmm13
    movaps          [rsp+16*8], xmm14
    movaps          [rsp+16*9], xmm15
%endif
    DEFINE_ARGS buf, fg_data, h, x
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    vpbroadcastd    xm10, [base+round_vals-14+r6*2]
    movd            xm11, [base+byte_blend+1]
    pmovsxbw        xm9, xm9
    pshufd          xm4, xm7, q0000
    mova            xm12, [base+gen_shufA]
    pshufd          xm5, xm7, q3333
    mova            xm13, [base+gen_shufB]
    pshufd          xm6, xm7, q1111
    mova            xm14, [base+gen_shufC]
    pshufd          xm7, xm7, q2222
    mova            xm15, [base+gen_shufD]
    pshufd          xm8, xm9, q0000
    psrld           xm10, 16
    pshufd          xm9, xm9, q1111
    sub             bufq, 82*73-(82*3+79)
    mov             hd, 70
.y_loop_ar2:
    mov             xq, -76
242.x_loop_ar2: 243 pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 244 pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 245 pshufb xm2, xm0, xm12 246 pmaddwd xm2, xm4 247 pshufb xm3, xm1, xm13 248 pmaddwd xm3, xm5 249 paddd xm2, xm3 250 pshufb xm3, xm0, xm14 251 pmaddwd xm3, xm6 252 punpckhqdq xm0, xm0 253 punpcklwd xm0, xm1 254 pmaddwd xm0, xm7 255 pshufb xm1, xm15 256 pmaddwd xm1, xm8 257 paddd xm2, xm10 258 paddd xm2, xm3 259 paddd xm0, xm1 260 paddd xm2, xm0 261 movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] 262.x_loop_ar2_inner: 263 pmovsxbw xm1, xm0 264 pmaddwd xm3, xm9, xm1 265 psrldq xm1, 4 ; y=0,x=0 266 paddd xm3, xm2 267 psrldq xm2, 4 ; shift top to next pixel 268 psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 269 ; don't packssdw since we only care about one value 270 paddw xm3, xm1 271 packsswb xm3, xm3 272 pextrb [bufq+xq], xm3, 0 273 pslldq xm3, 2 274 vpblendvb xm0, xm3, xm11 275 psrldq xm0, 1 276 inc xq 277 jz .x_loop_ar2_end 278 test xb, 3 279 jnz .x_loop_ar2_inner 280 jmp .x_loop_ar2 281.x_loop_ar2_end: 282 add bufq, 82 283 dec hd 284 jg .y_loop_ar2 285 RET 286 287INIT_YMM avx2 288.ar3: 289%if WIN64 290 ; xmm6 and xmm7 already saved 291 %assign stack_offset 16 292 ALLOC_STACK 16*14 293 %assign stack_size stack_size - 16*4 294 %assign xmm_regs_used 12 295 movaps [rsp+16*12], xmm8 296 movaps [rsp+16*13], xmm9 297 movaps [rsp+16*14], xmm10 298 movaps [rsp+16*15], xmm11 299%else 300 ALLOC_STACK 16*12 301%endif 302 mov r6d, [fg_dataq+FGData.ar_coeff_shift] 303 movd xm11, [base+byte_blend] 304 pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 305 pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 306 pshufd m0, m1, q0000 307 mova [rsp+16* 0], m0 308 pshufd m0, m1, q1111 309 mova [rsp+16* 2], m0 310 pshufd m0, m1, q2222 311 mova [rsp+16* 4], m0 312 pshufd m1, m1, q3333 313 mova [rsp+16* 6], m1 314 pshufd xm0, xm2, q0000 315 mova [rsp+16* 8], xm0 316 pshufd xm0, xm2, q1111 317 mova [rsp+16* 9], xm0 318 psrldq xm7, xm2, 10 319 mova m8, [base+gen_shufA] 320 
pinsrw xm2, [base+pw_1], 5 321 mova m9, [base+gen_shufC] 322 pshufd xm2, xm2, q2222 323 movu m10, [base+gen_shufE] 324 vpbroadcastw xm6, [base+round_vals-12+r6*2] 325 pinsrw xm7, [base+round_vals+r6*2-10], 3 326 mova [rsp+16*10], xm2 327 DEFINE_ARGS buf, fg_data, h, x 328 sub bufq, 82*73-(82*3+79) 329 mov hd, 70 330.y_loop_ar3: 331 mov xq, -76 332.x_loop_ar3: 333 movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 334 vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12] 335 movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 336 punpcklbw m3, m5, m5 337 punpckhwd m5, m4 338 psraw m3, 8 339 punpcklbw m5, m5 340 psraw m5, 8 341 punpcklbw xm4, xm4 342 psraw xm4, 8 343 pshufb m0, m3, m8 344 pmaddwd m0, [rsp+16*0] 345 pshufb m1, m3, m9 346 pmaddwd m1, [rsp+16*2] 347 shufps m2, m3, m5, q1032 348 paddd m0, m1 349 pshufb m1, m2, m8 350 vperm2i128 m3, m4, 0x21 351 pmaddwd m1, [rsp+16*4] 352 shufps xm2, xm3, q1021 353 vpblendd m2, m3, 0xf0 354 pshufb m2, m10 355 paddd m0, m1 356 pmaddwd m2, [rsp+16*6] 357 pshufb xm1, xm4, xm9 358 pmaddwd xm1, [rsp+16*8] 359 shufps xm4, xm5, q1132 360 paddd m0, m2 361 pshufb xm2, xm4, xm8 362 pshufd xm4, xm4, q2121 363 pmaddwd xm2, [rsp+16*9] 364 punpcklwd xm4, xm6 365 pmaddwd xm4, [rsp+16*10] 366 vextracti128 xm3, m0, 1 367 paddd xm0, xm1 368 movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] 369 paddd xm2, xm4 370 paddd xm0, xm2 371 paddd xm0, xm3 372.x_loop_ar3_inner: 373 pmovsxbw xm2, xm1 374 pmaddwd xm2, xm7 375 pshufd xm3, xm2, q1111 376 paddd xm2, xm0 ; add top 377 paddd xm2, xm3 ; left+cur 378 psrldq xm0, 4 379 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 380 ; don't packssdw since we only care about one value 381 packsswb xm2, xm2 382 pextrb [bufq+xq], xm2, 0 383 pslldq xm2, 3 384 vpblendvb xm1, xm2, xm11 385 psrldq xm1, 1 386 inc xq 387 jz .x_loop_ar3_end 388 test xb, 3 389 jnz .x_loop_ar3_inner 390 jmp .x_loop_ar3 391.x_loop_ar3_end: 392 add bufq, 82 393 dec hd 394 jg .y_loop_ar3 395 RET 396 397%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y 398INIT_XMM avx2 
399cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv 400%define base r4-generate_grain_uv_%1_8bpc_avx2_table 401 lea r4, [generate_grain_uv_%1_8bpc_avx2_table] 402 vpbroadcastw xm0, [fg_dataq+FGData.seed] 403 mov r6d, [fg_dataq+FGData.grain_scale_shift] 404 movq xm1, [base+next_upperbit_mask] 405 movq xm4, [base+mul_bits] 406 movq xm5, [base+hmul_bits] 407 mova xm6, [base+pb_mask] 408 vpbroadcastw xm7, [base+round+r6*2] 409 vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] 410 pxor xm0, xm2 411 lea r6, [gaussian_sequence] 412%if %2 413 mov r7d, 73-35*%3 414 add bufq, 44 415.loop_y: 416 mov r5, -44 417%else 418 mov r5, -73*82 419 sub bufq, r5 420%endif 421.loop: 422 pand xm2, xm0, xm1 423 psrlw xm3, xm2, 10 424 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 425 pmullw xm2, xm4 ; bits 0x0f00 are set 426 pmulhuw xm0, xm5 427 pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds 428 psllq xm2, xm3, 30 429 por xm2, xm3 430 psllq xm3, xm2, 15 431 por xm2, xm0 ; aggregate each bit into next seed's high bit 432 por xm2, xm3 ; 4 next output seeds 433 pshuflw xm0, xm2, q3333 434 psrlw xm2, 5 435 movq r8, xm2 436 movzx r9d, r8w 437 movd xm2, [r6+r9*2] 438 rorx r9, r8, 32 439 shr r8d, 16 440 pinsrw xm2, [r6+r8*2], 1 441 movzx r8d, r9w 442 pinsrw xm2, [r6+r8*2], 2 443 shr r9d, 16 444 pinsrw xm2, [r6+r9*2], 3 445 pmulhrsw xm2, xm7 446 packsswb xm2, xm2 447 movd [bufq+r5], xm2 448 add r5, 4 449 jl .loop 450%if %2 451 add bufq, 82 452 dec r7d 453 jg .loop_y 454%endif 455 456 ; auto-regression code 457 movsxd r6, [fg_dataq+FGData.ar_coeff_lag] 458 movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4] 459 add r6, r4 460 jmp r6 461 462INIT_YMM avx2 463.ar0: 464 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 465 imul uvd, 28 466 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 467 movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq] 468 movd xm3, [base+hmul_bits+shiftq*2] 469 DEFINE_ARGS buf, bufy, h 470 pmovsxbw xm2, xm2 471%if %2 472 vpbroadcastd m7, [base+pb_1] 473 
vpbroadcastw m6, [base+hmul_bits+2+%3*2] 474%endif 475 vpbroadcastw m2, xm2 476 vpbroadcastw m3, xm3 477 pxor m12, m12 478%if %2 479 sub bufq, 82*(73-35*%3)+82-(82*3+41) 480%else 481 sub bufq, 82*70-3 482%endif 483 add bufyq, 3+82*3 484 mov hd, 70-35*%3 485.y_loop_ar0: 486%if %2 487 ; first 32 pixels 488 movu xm4, [bufyq] 489 vinserti128 m4, [bufyq+32], 1 490%if %3 491 movu xm0, [bufyq+82] 492 vinserti128 m0, [bufyq+82+32], 1 493%endif 494 movu xm5, [bufyq+16] 495 vinserti128 m5, [bufyq+48], 1 496%if %3 497 movu xm1, [bufyq+82+16] 498 vinserti128 m1, [bufyq+82+48], 1 499%endif 500 pmaddubsw m4, m7, m4 501%if %3 502 pmaddubsw m0, m7, m0 503%endif 504 pmaddubsw m5, m7, m5 505%if %3 506 pmaddubsw m1, m7, m1 507 paddw m4, m0 508 paddw m5, m1 509%endif 510 pmulhrsw m4, m6 511 pmulhrsw m5, m6 512%else 513 xor r3d, r3d 514 ; first 32x2 pixels 515.x_loop_ar0: 516 movu m4, [bufyq+r3] 517 pcmpgtb m0, m12, m4 518 punpckhbw m5, m4, m0 519 punpcklbw m4, m0 520%endif 521 pmullw m4, m2 522 pmullw m5, m2 523 pmulhrsw m4, m3 524 pmulhrsw m5, m3 525%if %2 526 movu m1, [bufq] 527%else 528 movu m1, [bufq+r3] 529%endif 530 pcmpgtb m8, m12, m1 531 punpcklbw m0, m1, m8 532 punpckhbw m1, m8 533 paddw m0, m4 534 paddw m1, m5 535 packsswb m0, m1 536%if %2 537 movu [bufq], m0 538%else 539 movu [bufq+r3], m0 540 add r3d, 32 541 cmp r3d, 64 542 jl .x_loop_ar0 543%endif 544 545 ; last 6/12 pixels 546 movu xm4, [bufyq+32*2] 547%if %2 548%if %3 549 movu xm5, [bufyq+32*2+82] 550%endif 551 pmaddubsw xm4, xm7, xm4 552%if %3 553 pmaddubsw xm5, xm7, xm5 554 paddw xm4, xm5 555%endif 556 movq xm0, [bufq+32] 557 pmulhrsw xm4, xm6 558 pmullw xm4, xm2 559 pmulhrsw xm4, xm3 560 pcmpgtb xm5, xm12, xm0 561 punpcklbw xm5, xm0, xm5 562 paddw xm4, xm5 563 packsswb xm4, xm4 564 pblendw xm0, xm4, xm0, 1000b 565 movq [bufq+32], xm0 566%else 567 movu xm0, [bufq+64] 568 pcmpgtb xm1, xm12, xm4 569 punpckhbw xm5, xm4, xm1 570 punpcklbw xm4, xm1 571 pmullw xm5, xm2 572 pmullw xm4, xm2 573 vpblendd xm1, xm3, xm12, 0x0c 
574 pmulhrsw xm5, xm1 575 pmulhrsw xm4, xm3 576 pcmpgtb xm1, xm12, xm0 577 punpckhbw xm8, xm0, xm1 578 punpcklbw xm0, xm1 579 paddw xm5, xm8 580 paddw xm0, xm4 581 packsswb xm0, xm5 582 movu [bufq+64], xm0 583%endif 584 add bufq, 82 585 add bufyq, 82<<%3 586 dec hd 587 jg .y_loop_ar0 588 RET 589 590INIT_XMM avx2 591.ar1: 592 DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift 593 imul uvd, 28 594 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 595 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 596 movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 597 pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 598 DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift 599 pmovsxbw xm4, xm4 600 pshufd xm5, xm4, q1111 601 pshufd xm4, xm4, q0000 602 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd 603%if %2 604 vpbroadcastd xm7, [base+pb_1] 605 vpbroadcastw xm6, [base+hmul_bits+2+%3*2] 606%endif 607 vpbroadcastd xm3, xm3 608%if %2 609 sub bufq, 82*(73-35*%3)+44-(82*3+41) 610%else 611 sub bufq, 82*70-(82-3) 612%endif 613 add bufyq, 79+82*3 614 mov hd, 70-35*%3 615 mov mind, -128 616 mov maxd, 127 617.y_loop_ar1: 618 mov xq, -(76>>%2) 619 movsx val3d, byte [bufq+xq-1] 620.x_loop_ar1: 621 pmovsxbw xm0, [bufq+xq-82-1] ; top/left 622%if %2 623 movq xm8, [bufyq+xq*2] 624%if %3 625 movq xm9, [bufyq+xq*2+82] 626%endif 627%endif 628 psrldq xm2, xm0, 2 ; top 629 psrldq xm1, xm0, 4 ; top/right 630%if %2 631 pmaddubsw xm8, xm7, xm8 632%if %3 633 pmaddubsw xm9, xm7, xm9 634 paddw xm8, xm9 635%endif 636 pmulhrsw xm8, xm6 637%else 638 pmovsxbw xm8, [bufyq+xq] 639%endif 640 punpcklwd xm0, xm2 641 punpcklwd xm1, xm8 642 pmaddwd xm0, xm4 643 pmaddwd xm1, xm5 644 paddd xm0, xm1 645 paddd xm0, xm3 646.x_loop_ar1_inner: 647 movd val0d, xm0 648 psrldq xm0, 4 649 imul val3d, cf3d 650 add val3d, val0d 651 sarx val3d, val3d, shiftd 652 movsx val0d, byte [bufq+xq] 653 add val3d, val0d 654 cmp val3d, maxd 655 cmovns val3d, maxd 656 cmp val3d, mind 657 cmovs val3d, mind 658 mov byte 
[bufq+xq], val3b                            ; operand of the "mov byte ..." begun on the previous source line
    ; keep val3d in-place as left for next x iteration
    inc             xq
    jz              .x_loop_ar1_end
    test            xq, 3
    jnz             .x_loop_ar1_inner
    jmp             .x_loop_ar1

.x_loop_ar1_end:
    add             bufq, 82
    add             bufyq, 82<<%3           ; luma advances 2 rows when vertically subsampled
    dec             hd
    jg              .y_loop_ar1
    RET

.ar2:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov             shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28                 ; per-plane stride of the ar_coeffs_uv array
    vpbroadcastw    xm13, [base+round_vals-12+shiftq*2]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
    pinsrw          xm0, [base+pw_1], 5
%if %2
    vpbroadcastw    xm12, [base+hmul_bits+2+%3*2]
    vpbroadcastd    xm11, [base+pb_1]
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd          xm4, xm7, q0000
    pshufd          xm5, xm7, q3333
    pshufd          xm6, xm7, q1111
    pshufd          xm7, xm7, q2222
    pshufd          xm8, xm0, q0000
    pshufd          xm9, xm0, q1111
    pshufd          xm10, xm0, q2222
%if %2
    sub             bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub             bufq, 82*70-(82-3)
%endif
    add             bufyq, 79+82*3
    mov             hd, 70-35*%3
.y_loop_ar2:
    mov             xq, -(76>>%2)
.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, [base+gen_shufA]
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, [base+gen_shufB]
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, [base+gen_shufC]
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0                ; y=-2,x=[+2,+5]
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, [base+gen_shufD]   ; was [gen_shufD]: use base-relative addressing
                                            ; like every sibling constant load in this loop
    pmaddwd         xm1, xm8
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0

%if %2
    movq            xm0, [bufyq+xq*2]
%if %3
    movq            xm3, [bufyq+xq*2+82]
%endif
    pmaddubsw       xm0, xm11, xm0          ; sum 2 (or, with %3, 4) luma pixels per chroma pixel
%if %3
    pmaddubsw       xm3, xm11, xm3
    paddw           xm0, xm3
%endif
    pmulhrsw        xm0, xm12               ; normalize the luma sum
%else
    pmovsxbw        xm0, [bufyq+xq]
%endif
    punpcklwd       xm0, xm13               ; pair luma with the rounding constant
    pmaddwd         xm0, xm10               ; luma*cf12 + rnd
    paddd           xm2, xm0

    movq            xm0, [bufq+xq-2]        ;
y=0,x=[-2,+5] 741.x_loop_ar2_inner: 742 pmovsxbw xm0, xm0 743 pmaddwd xm3, xm0, xm9 744 psrldq xm0, 2 745 paddd xm3, xm2 746 psrldq xm2, 4 ; shift top to next pixel 747 psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 748 pslldq xm3, 2 749 paddw xm3, xm0 750 pblendw xm0, xm3, 00000010b 751 packsswb xm0, xm0 752 pextrb [bufq+xq], xm0, 1 753 inc xq 754 jz .x_loop_ar2_end 755 test xb, 3 756 jnz .x_loop_ar2_inner 757 jmp .x_loop_ar2 758 759.x_loop_ar2_end: 760 add bufq, 82 761 add bufyq, 82<<%3 762 dec hd 763 jg .y_loop_ar2 764 RET 765 766INIT_YMM avx2 767.ar3: 768 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 769 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 770 imul uvd, 28 771 pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 772 pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 773 vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] 774 movd xm13, [base+round_vals-10+shiftq*2] 775 vpbroadcastd xm14, [base+round_vals-14+shiftq*2] 776 pshufd m6, m0, q0000 777 pshufd m7, m0, q1111 778 pshufd m8, m0, q2222 779 pshufd m9, m0, q3333 780 pshufd xm10, xm1, q0000 781 pshufd xm11, xm1, q1111 782 pshufhw xm12, xm1, q0000 783 psraw xm2, 8 784 palignr xm13, xm1, 10 785 punpckhwd xm12, xm2 ; interleave luma cf 786 psrld xm14, 16 787 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 788%if %2 789 vpbroadcastw xm15, [base+hmul_bits+2+%3*2] 790 sub bufq, 82*(73-35*%3)+44-(82*3+41) 791%else 792 sub bufq, 82*70-(82-3) 793%endif 794 add bufyq, 79+82*3 795 mov hd, 70-35*%3 796.y_loop_ar3: 797 mov xq, -(76>>%2) 798.x_loop_ar3: 799 vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12 800 palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12] 801 vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 802 vpblendd m3, m1, 0x0f 803 pxor m0, m0 804 pcmpgtb m2, m0, m3 805 pcmpgtb m0, m4 806 punpcklbw m1, m3, m2 807 punpckhbw m3, m2 808 punpcklbw m2, m4, m0 809 punpckhbw xm4, xm0 810 pshufb m0, m1, [base+gen_shufA] 811 pmaddwd m0, m6 812 pshufb m5, m1, 
[base+gen_shufC] 813 pmaddwd m5, m7 814 shufps m1, m3, q1032 815 paddd m0, m5 816 pshufb m5, m1, [base+gen_shufA] 817 pmaddwd m5, m8 818 shufps xm1, xm3, q2121 819 vpblendd m1, m2, 0xf0 820 pshufb m1, [base+gen_shufE] 821 pmaddwd m1, m9 822 paddd m0, m5 823 pshufb xm3, xm2, [base+gen_shufC] 824 paddd m0, m1 825 pmaddwd xm3, xm10 826 palignr xm1, xm4, xm2, 2 827 punpckhwd xm1, xm2, xm1 828 pmaddwd xm1, xm11 829 palignr xm4, xm2, 12 830 paddd xm3, xm1 831%if %2 832 vpbroadcastd xm5, [base+pb_1] 833 movq xm1, [bufyq+xq*2] 834 pmaddubsw xm1, xm5, xm1 835%if %3 836 movq xm2, [bufyq+xq*2+82] 837 pmaddubsw xm5, xm2 838 paddw xm1, xm5 839%endif 840 pmulhrsw xm1, xm15 841%else 842 pmovsxbw xm1, [bufyq+xq] 843%endif 844 punpcklwd xm4, xm1 845 pmaddwd xm4, xm12 846 movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] 847 vextracti128 xm2, m0, 1 848 paddd xm0, xm14 849 paddd xm3, xm4 850 paddd xm0, xm3 851 paddd xm0, xm2 852.x_loop_ar3_inner: 853 pmovsxbw xm1, xm1 854 pmaddwd xm2, xm13, xm1 855 pshuflw xm3, xm2, q1032 856 paddd xm2, xm0 ; add top 857 paddd xm2, xm3 ; left+cur 858 psrldq xm0, 4 859 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 860 psrldq xm1, 2 861 ; don't packssdw, we only care about one value 862 punpckldq xm2, xm2 863 pblendw xm1, xm2, 0100b 864 packsswb xm1, xm1 865 pextrb [bufq+xq], xm1, 2 866 inc xq 867 jz .x_loop_ar3_end 868 test xb, 3 869 jnz .x_loop_ar3_inner 870 jmp .x_loop_ar3 871.x_loop_ar3_end: 872 add bufq, 82 873 add bufyq, 82<<%3 874 dec hd 875 jg .y_loop_ar3 876 RET 877%endmacro 878 879INIT_YMM avx2 880cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ 881 grain_lut, h, sby, see, overlap 882%define base r9-pd_m65536 883 lea r9, [pd_m65536] 884 mov r6d, [fg_dataq+FGData.scaling_shift] 885 mov r7d, [fg_dataq+FGData.clip_to_restricted_range] 886 mov sbyd, sbym 887 mov overlapd, [fg_dataq+FGData.overlap_flag] 888 vpbroadcastd m8, [base+pd_m65536] 889 vpbroadcastw m9, [base+mul_bits+r6*2-14] 890 vpbroadcastd m10, [base+fg_min+r7*4] 891 
vpbroadcastd m11, [base+fg_max+r7*8] 892 vpbroadcastd m12, [base+pw_1024] 893 movq xm13, [base+pb_27_17_17_27] 894 test sbyd, sbyd 895 setnz r7b 896 pxor m7, m7 897 test r7b, overlapb 898 jnz .vertical_overlap 899 900 imul seed, sbyd, (173 << 24) | 37 901 add seed, (105 << 24) | 178 902 rorx seed, seed, 24 903 movzx seed, seew 904 xor seed, [fg_dataq+FGData.seed] 905 906 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 907 offx, offy, see, overlap 908 909 lea src_bakq, [srcq+wq] 910 neg wq 911 sub dstq, srcq 912 913.loop_x: 914 rorx r6, seeq, 1 915 or seed, 0xEFF4 916 test seeb, seeh 917 lea seed, [r6+0x8000] 918 cmovp seed, r6d ; updated seed 919 920 rorx offyd, seed, 8 921 rorx offxq, seeq, 12 922 and offyd, 0xf 923 imul offyd, 164 924 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx 925 926 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 927 h, offxy, see, overlap 928 929 mov hd, hm 930 mov grain_lutq, grain_lutmp 931.loop_y: 932 ; src 933 mova m2, [srcq] 934 punpcklbw m0, m2, m7 935 punpckhbw m1, m2, m7 936 937 ; scaling[src] 938 pandn m4, m8, m0 939 mova m6, m8 940 vpgatherdd m2, [scalingq+m4-0], m8 941 psrld m3, m0, 16 942 mova m8, m6 943 vpgatherdd m4, [scalingq+m3-2], m6 944 pandn m5, m8, m1 945 mova m6, m8 946 vpgatherdd m3, [scalingq+m5-0], m8 947 pblendw m2, m4, 0xaa 948 psrld m4, m1, 16 949 mova m8, m6 950 vpgatherdd m5, [scalingq+m4-2], m6 951 pblendw m3, m5, 0xaa 952 953 ; grain = grain_lut[offy+y][offx+x] 954 movu m5, [grain_lutq+offxyq] 955 punpcklbw m4, m5, m7 956 punpckhbw m5, m7 957 958 ; noise = round2(scaling[src] * grain, scaling_shift) 959 pmaddubsw m2, m4 960 pmaddubsw m3, m5 961 pmulhrsw m2, m9 962 pmulhrsw m3, m9 963 964 ; dst = clip_pixel(src, noise) 965 paddw m0, m2 966 paddw m1, m3 967 packuswb m0, m1 968 pmaxub m0, m10 969 pminub m0, m11 970 mova [dstq+srcq], m0 971 972 add srcq, strideq 973 add grain_lutq, 82 974 dec hd 975 jg .loop_y 976 977 add wq, 32 978 jge .end 979 lea srcq, [src_bakq+wq] 980 test 
overlapd, overlapd 981 jz .loop_x 982 983 ; r8m = sbym 984 cmp dword r8m, 0 985 jne .loop_x_hv_overlap 986 987 ; horizontal overlap (without vertical overlap) 988.loop_x_h_overlap: 989 rorx r6, seeq, 1 990 or seed, 0xEFF4 991 test seeb, seeh 992 lea seed, [r6+0x8000] 993 cmovp seed, r6d ; updated seed 994 995 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 996 offx, offy, see, left_offxy 997 998 lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx 999 rorx offyd, seed, 8 1000 rorx offxq, seeq, 12 1001 and offyd, 0xf 1002 imul offyd, 164 1003 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx 1004 1005 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1006 h, offxy, see, left_offxy 1007 1008 mov grain_lutq, grain_lutmp 1009 mov hd, hm 1010.loop_y_h_overlap: 1011 ; src 1012 mova m2, [srcq] 1013 punpcklbw m0, m2, m7 1014 punpckhbw m1, m2, m7 1015 1016 ; scaling[src] 1017 pandn m4, m8, m0 1018 mova m6, m8 1019 vpgatherdd m2, [scalingq+m4-0], m8 1020 psrld m3, m0, 16 1021 mova m8, m6 1022 vpgatherdd m4, [scalingq+m3-2], m6 1023 pandn m5, m8, m1 1024 mova m6, m8 1025 vpgatherdd m3, [scalingq+m5-0], m8 1026 pblendw m2, m4, 0xaa 1027 psrld m4, m1, 16 1028 mova m8, m6 1029 vpgatherdd m5, [scalingq+m4-2], m6 1030 pblendw m3, m5, 0xaa 1031 1032 ; grain = grain_lut[offy+y][offx+x] 1033 movu m5, [grain_lutq+offxyq] 1034 movd xm4, [grain_lutq+left_offxyq] 1035 punpcklbw xm4, xm5 1036 pmaddubsw xm4, xm13, xm4 1037 pmulhrsw xm4, xm12 1038 packsswb xm4, xm4 1039 vpblendd m4, m5, 0xfe 1040 punpckhbw m5, m7 1041 punpcklbw m4, m7 1042 1043 ; noise = round2(scaling[src] * grain, scaling_shift) 1044 pmaddubsw m2, m4 1045 pmaddubsw m3, m5 1046 pmulhrsw m2, m9 1047 pmulhrsw m3, m9 1048 1049 ; dst = clip_pixel(src, noise) 1050 paddw m0, m2 1051 paddw m1, m3 1052 packuswb m0, m1 1053 pmaxub m0, m10 1054 pminub m0, m11 1055 mova [dstq+srcq], m0 1056 1057 add srcq, strideq 1058 add grain_lutq, 82 1059 dec hd 1060 jg .loop_y_h_overlap 1061 1062 add wq, 
32 1063 jge .end 1064 lea srcq, [src_bakq+wq] 1065 1066 ; r8m = sbym 1067 cmp dword r8m, 0 1068 jne .loop_x_hv_overlap 1069 jmp .loop_x_h_overlap 1070 1071.vertical_overlap: 1072 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 1073 unused, sby, see, overlap 1074 1075 movzx sbyd, sbyb 1076 imul seed, [fg_dataq+FGData.seed], 0x00010001 1077 imul r7d, sbyd, 173 * 0x00010001 1078 imul sbyd, 37 * 0x01000100 1079 add r7d, (105 << 16) | 188 1080 add sbyd, (178 << 24) | (141 << 8) 1081 and r7d, 0x00ff00ff 1082 and sbyd, 0xff00ff00 1083 xor seed, r7d 1084 xor seed, sbyd ; (cur_seed << 16) | top_seed 1085 1086 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1087 offx, offy, see, overlap 1088 1089 lea src_bakq, [srcq+wq] 1090 neg wq 1091 sub dstq, srcq 1092 1093.loop_x_v_overlap: 1094 vpbroadcastd m14, [pb_27_17] 1095 1096 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1097 mov r6d, seed 1098 or seed, 0xeff4eff4 1099 test seeb, seeh 1100 setp r7b ; parity of top_seed 1101 shr seed, 16 1102 shl r7d, 16 1103 test seeb, seeh 1104 setp r7b ; parity of cur_seed 1105 or r6d, 0x00010001 1106 xor r7d, r6d 1107 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1108 1109 rorx offyd, seed, 8 1110 rorx offxd, seed, 12 1111 and offyd, 0xf000f 1112 and offxd, 0xf000f 1113 imul offyd, 164 1114 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1115 lea offyd, [offyq+offxq*2+0x10001*747+32*82] 1116 1117 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1118 h, offxy, see, overlap, top_offxy 1119 1120 mov grain_lutq, grain_lutmp 1121 mov hd, hm 1122 movzx top_offxyd, offxyw 1123 shr offxyd, 16 1124.loop_y_v_overlap: 1125 ; src 1126 mova m2, [srcq] 1127 punpcklbw m0, m2, m7 1128 punpckhbw m1, m2, m7 1129 1130 ; scaling[src] 1131 pandn m4, m8, m0 1132 mova m6, m8 1133 vpgatherdd m2, [scalingq+m4-0], m8 1134 psrld m3, m0, 16 1135 mova m8, m6 1136 vpgatherdd m4, [scalingq+m3-2], m6 1137 pandn m5, m8, m1 1138 mova m6, m8 
1139 vpgatherdd m3, [scalingq+m5-0], m8 1140 pblendw m2, m4, 0xaa 1141 psrld m4, m1, 16 1142 mova m8, m6 1143 vpgatherdd m5, [scalingq+m4-2], m6 1144 pblendw m3, m5, 0xaa 1145 1146 ; grain = grain_lut[offy+y][offx+x] 1147 movu m6, [grain_lutq+offxyq] 1148 movu m4, [grain_lutq+top_offxyq] 1149 punpcklbw m5, m4, m6 1150 punpckhbw m4, m6 1151 pmaddubsw m5, m14, m5 1152 pmaddubsw m4, m14, m4 1153 pmulhrsw m5, m12 1154 pmulhrsw m4, m12 1155 packsswb m5, m4 1156 punpcklbw m4, m5, m7 1157 punpckhbw m5, m7 1158 1159 ; noise = round2(scaling[src] * grain, scaling_shift) 1160 pmaddubsw m2, m4 1161 pmaddubsw m3, m5 1162 pmulhrsw m2, m9 1163 pmulhrsw m3, m9 1164 1165 ; dst = clip_pixel(src, noise) 1166 paddw m0, m2 1167 paddw m1, m3 1168 packuswb m0, m1 1169 pmaxub m0, m10 1170 pminub m0, m11 1171 mova [dstq+srcq], m0 1172 1173 add srcq, strideq 1174 add grain_lutq, 82 1175 dec hb 1176 jz .end_y_v_overlap 1177 vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line 1178 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1179 ; remaining (up to) 30 lines 1180 add hd, 0x80000000 1181 jnc .loop_y_v_overlap 1182 jmp .loop_y 1183.end_y_v_overlap: 1184 add wq, 32 1185 jge .end 1186 lea srcq, [src_bakq+wq] 1187 1188 ; since fg_dataq.overlap is guaranteed to be set, we never jump 1189 ; back to .loop_x_v_overlap, and instead always fall-through to 1190 ; h+v overlap 1191.loop_x_hv_overlap: 1192 vpbroadcastd m14, [pb_27_17] 1193 1194 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1195 mov r6d, seed 1196 or seed, 0xeff4eff4 1197 test seeb, seeh 1198 setp r7b ; parity of top_seed 1199 shr seed, 16 1200 shl r7d, 16 1201 test seeb, seeh 1202 setp r7b ; parity of cur_seed 1203 or r6d, 0x00010001 1204 xor r7d, r6d 1205 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1206 1207 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1208 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1209 1210 lea topleft_offxyd, 
[top_offxyq+32] 1211 lea left_offxyd, [offyq+32] 1212 rorx offyd, seed, 8 1213 rorx offxd, seed, 12 1214 and offyd, 0xf000f 1215 and offxd, 0xf000f 1216 imul offyd, 164 1217 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1218 lea offyd, [offyq+offxq*2+0x10001*747+32*82] 1219 1220 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1221 h, offxy, see, left_offxy, top_offxy, topleft_offxy 1222 1223 mov grain_lutq, grain_lutmp 1224 mov hd, hm 1225 movzx top_offxyd, offxyw 1226 shr offxyd, 16 1227.loop_y_hv_overlap: 1228 ; src 1229 mova m2, [srcq] 1230 punpcklbw m0, m2, m7 1231 punpckhbw m1, m2, m7 1232 1233 ; scaling[src] 1234 pandn m4, m8, m0 1235 mova m6, m8 1236 vpgatherdd m2, [scalingq+m4-0], m8 1237 psrld m3, m0, 16 1238 mova m8, m6 1239 vpgatherdd m4, [scalingq+m3-2], m6 1240 pandn m5, m8, m1 1241 mova m6, m8 1242 vpgatherdd m3, [scalingq+m5-0], m8 1243 pblendw m2, m4, 0xaa 1244 psrld m4, m1, 16 1245 mova m8, m6 1246 vpgatherdd m5, [scalingq+m4-2], m6 1247 pblendw m3, m5, 0xaa 1248 1249 ; grain = grain_lut[offy+y][offx+x] 1250 movu m6, [grain_lutq+offxyq] 1251 movd xm7, [grain_lutq+left_offxyq] 1252 movu m4, [grain_lutq+top_offxyq] 1253 movd xm5, [grain_lutq+topleft_offxyq] 1254 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1255 punpcklbw xm7, xm6 1256 punpcklbw xm5, xm4 1257 pmaddubsw xm7, xm13, xm7 1258 pmaddubsw xm5, xm13, xm5 1259 pmulhrsw xm7, xm12 1260 pmulhrsw xm5, xm12 1261 packsswb xm7, xm7 1262 packsswb xm5, xm5 1263 vpblendd m7, m6, 0xfe 1264 vpblendd m5, m4, 0xfe 1265 ; followed by v interpolation (top | cur -> cur) 1266 punpckhbw m4, m6 1267 punpcklbw m5, m7 1268 pmaddubsw m4, m14, m4 1269 pmaddubsw m5, m14, m5 1270 pmulhrsw m4, m12 1271 pmulhrsw m5, m12 1272 pxor m7, m7 1273 packsswb m5, m4 1274 punpcklbw m4, m5, m7 1275 punpckhbw m5, m7 1276 1277 ; noise = round2(scaling[src] * grain, scaling_shift) 1278 pmaddubsw m2, m4 1279 pmaddubsw m3, m5 1280 pmulhrsw m2, m9 1281 pmulhrsw m3, m9 1282 1283 ; dst 
    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82         ; grain_lut rows are 82 bytes apart
    dec              hb
    jz .end_y_hv_overlap
    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add              hd, 0x80000000
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq]
    jl .loop_x_hv_overlap
.end:
    RET

;-----------------------------------------------------------------------------
; Chroma film-grain application, generated per chroma layout.
; %1 = layout name (420/422/444), %2 = ss_hor (horizontal subsampling),
; %3 = ss_ver (vertical subsampling).
; Args (from cglobal): dst, src, stride, fg_data, w, scaling, grain_lut,
;                      h, sby, luma, overlap, uv_pl, is_id.
; The body is emitted twice via %%FGUV_32x32xN_LOOP: once for independent
; chroma scaling and once (.csfl) for chroma-scaling-from-luma.
;-----------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                          grain_lut, h, sby, luma, overlap, uv_pl, is_id
%define base r11-pd_m65536
    lea             r11, [pd_m65536]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    ; persistent register map for the whole function:
    ;   m7  = zero
    ;   m8  = 0xffff0000 per-dword mask (pd_m65536), gather index extraction
    ;   m9  = scaling-shift multiplier (mul_bits, indexed by scaling_shift)
    ;   m10 = clip minimum (fg_min, selected by clip_to_restricted_range)
    ;   m11 = clip maximum (fg_max, selected by clip range and is_id)
    ;   m12 = pw_1024, rounding constant for overlap blending (pmulhrsw)
    vpbroadcastd     m8, [base+pd_m65536]
    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
    vpbroadcastd    m10, [base+fg_min+r7*4]
    shlx            r7d, r7d, r9d
    vpbroadcastd    m11, [base+fg_max+r7*4]
    vpbroadcastd    m12, [base+pw_1024]
    pxor             m7, m7
    test           sbyd, sbyd
    setnz           r7b                 ; r7b = "not the top superblock row"
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, sby, see, overlap, uv_pl
%if %1
    ; independent chroma scaling: m14 = {uv_luma_mult, uv_mult} interleaved,
    ; m15 = uv_offset, applied after the pmaddubsw blend below
    mov             r6d, uv_plm
    vpbroadcastd     m0, [base+pw_8]
    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    pshufb          m14, m0 ; uv_luma_mult, uv_mult
%elif %2
    vpbroadcastq    m15, [base+pb_23_22]
%else
    vpbroadcastq   xm15, [base+pb_27_17_17_27]
%endif
%if %3
    vpbroadcastw    m13, [base+pb_23_22]
%elif %2
    pshufd          m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
%endif
    ; overlap is only taken when this is not the topmost superblock row
    test            r7b, overlapb
    jnz %%vertical_overlap

    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, unused5, lstride

    mov           lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]
    mov           r11mp, r12            ; stash end-of-row pointers; restored
    mov           r12mp, r13            ; after each column strip
    mov        lstrideq, r10mp
    neg              wq

%%loop_x:
    ; advance the grain pseudo-random seed; the parity test selects the
    ; feedback variant (cmovp) — LFSR-style update
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y:
    ; src
%if %2
    ; subsampled layouts: average pairs of luma pixels down to chroma
    ; resolution — pmaddubsw with pb_1 sums horizontal pairs, pavgw with
    ; zero computes (x+1)>>1
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd     m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128      m1, [srcq+strideq], 1
    pmaddubsw        m3, m2
    pmaddubsw        m0, m2
    pavgw            m3, m7
    pavgw            m0, m7
%else
    mova             m2, [lumaq]
    mova             m1, [srcq]
%endif
%if %1
%if %2
    packuswb         m2, m3, m0 ; luma
%endif
    punpckhbw        m3, m2, m1
    punpcklbw        m2, m1 ; { luma, chroma }
    pmaddubsw        m3, m14
    pmaddubsw        m2, m14
    psraw            m3, 6
    psraw            m2, 6
    paddw            m3, m15
    paddw            m2, m15
    packuswb         m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw        m3, m2, m7
    punpckhbw        m0, m2, m7
%endif

    ; scaling[luma_src]
    ; vpgatherdd clobbers its mask operand, so the constant mask m8 is
    ; saved in m6 and restored around every gather; pandn with m8
    ; (0xffff0000) isolates even 16-bit lanes as dword indices, psrld the
    ; odd ones (the -0/-2 displacements re-align the byte fetched)
    pandn            m4, m8, m3
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m0
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    psrld            m0, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m0-2], m6
    pblendw          m2, m4, 0xaa
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu            xm5, [grain_lutq+offxyq+ 0]
    vinserti128      m5, [grain_lutq+offxyq+82], 1
%else
    movu             m5, [grain_lutq+offxyq]
%endif
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; unpack chroma_source
    punpcklbw        m0, m1, m7
    punpckhbw        m1, m7

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    ; subsampled: two chroma rows were processed per iteration
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
    sub              hb, 1+%2
    jg %%loop_y

    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq
    test       overlapd, overlapd
    jz %%loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    lea     left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y_h_overlap:
    ; src
%if %2
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd     m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128      m1, [srcq+strideq], 1
    pmaddubsw        m3, m2
    pmaddubsw        m0, m2
    pavgw            m3, m7
    pavgw            m0, m7
%else
    mova             m2, [lumaq]
    mova             m1, [srcq]
%endif
%if %1
%if %2
    packuswb         m2, m3, m0 ; luma
%endif
    punpckhbw        m3, m2, m1
    punpcklbw        m2, m1 ; { luma, chroma }
    pmaddubsw        m3, m14
    pmaddubsw        m2, m14
    psraw            m3, 6
    psraw            m2, 6
    paddw            m3, m15
    paddw            m2, m15
    packuswb         m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw        m3, m2, m7
    punpckhbw        m0, m2, m7
%endif

    ; scaling[luma_src]
    ; (same gather-mask save/restore dance as in %%loop_y above)
    pandn            m4, m8, m3
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m0
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    psrld            m0, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m0-2], m6
    pblendw          m2, m4, 0xaa
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; blend the left column's grain into the first pixel(s) of this
    ; column's grain, then pack/clamp back to bytes
%if %2
    movu            xm5, [grain_lutq+offxyq+ 0]
    vinserti128      m5, [grain_lutq+offxyq+82], 1
    movd            xm4, [grain_lutq+left_offxyq+ 0]
    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
    punpcklbw        m4, m5
%if %1
    vpbroadcastq     m0, [pb_23_22]
    pmaddubsw        m4, m0, m4
%else
    pmaddubsw        m4, m15, m4
%endif
    pmulhrsw         m4, m12
    packsswb         m4, m4
    vpblendd         m4, m5, 0xee
%else
    movu             m5, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm5
%if %1
    movq            xm0, [pb_27_17_17_27]
    pmaddubsw       xm4, xm0, xm4
%else
    pmaddubsw       xm4, xm15, xm4
%endif
    pmulhrsw        xm4, xm12
    packsswb        xm4, xm4
    vpblendd         m4, m5, 0xfe
%endif
    punpckhbw        m5, m7
    punpcklbw        m4, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; unpack chroma_source
    punpcklbw        m0, m1, m7
    punpckhbw        m1, m7

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*(1+%2)
    sub              hb, 1+%2
    jg %%loop_y_h_overlap

    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, overlap, unused1, unused2, lstride

    ; derive both this row's seed and the row above's seed in one register:
    ; the 0x00010001 multiplies replicate values into both 16-bit halves
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused1, unused2, see, overlap, unused3, unused4, lstride

    mov           lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]
    mov           r11mp, r12
    mov           r12mp, r13
    mov        lstrideq, r10mp
    neg              wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if %2 == 0
    vpbroadcastd    m13, [pb_27_17]
%endif
%%loop_y_v_overlap:
    ; src
%if %2
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd     m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128      m1, [srcq+strideq], 1
    pmaddubsw        m3, m2
    pmaddubsw        m0, m2
    pavgw            m3, m7
    pavgw            m0, m7
%else
    mova             m2, [lumaq]
    mova             m1, [srcq]
%endif
%if %1
%if %2
    packuswb         m2, m3, m0 ; luma
%endif
    punpckhbw        m3, m2, m1
    punpcklbw        m2, m1 ; { luma, chroma }
    pmaddubsw        m3, m14
    pmaddubsw        m2, m14
    psraw            m3, 6
    psraw            m2, 6
    paddw            m3, m15
    paddw            m2, m15
    packuswb         m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw        m3, m2, m7
    punpckhbw        m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn            m4, m8, m3
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m0
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    psrld            m0, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m0-2], m6
    pblendw          m2, m4, 0xaa
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
    ; no vertical subsampling: blend full rows of current and top grain
%if %2
    movu            xm0, [grain_lutq+offxyq]
    vinserti128      m0, [grain_lutq+offxyq+82], 1
    movu            xm4, [grain_lutq+top_offxyq]
    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
%else
    movu             m0, [grain_lutq+offxyq]
    movu             m4, [grain_lutq+top_offxyq]
%endif
    punpcklbw        m5, m4, m0
    punpckhbw        m4, m0
    pmaddubsw        m5, m13, m5
    pmaddubsw        m4, m13, m4
    pmulhrsw         m5, m12
    pmulhrsw         m4, m12
    packsswb         m5, m4
%else
    movq            xm4, [grain_lutq+offxyq]
    vinserti128      m4, [grain_lutq+offxyq+8], 1
    movq            xm5, [grain_lutq+top_offxyq]
    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
    punpcklbw        m5, m4
    pmaddubsw        m5, m13, m5
    pmulhrsw         m5, m12
    vextracti128    xm4, m5, 1
    packsswb        xm5, xm4
    ; only interpolate first line, insert second line unmodified
    vinserti128      m5, [grain_lutq+offxyq+82], 1
%endif
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; unpack chroma_source
    punpcklbw        m0, m1, m7
    punpckhbw        m1, m7

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

    sub              hb, 1+%2
    jle %%end_y_v_overlap
%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
%if %2 == 0
    ; unsubsampled: second v-overlap row uses the swapped 17/27 weights;
    ; the 0x80000000 add/jnc pair runs exactly one more overlap row
    vpbroadcastd    m13, [pb_17_27]
    add              hd, 0x80000000
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y

%%end_y_v_overlap:
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
    lea     left_offxyd, [offyq+(32>>%2)]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if %2 == 0
    vpbroadcastd    m13, [pb_27_17]
%endif
%%loop_y_hv_overlap:
    ; src
%if %2
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd     m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128      m1, [srcq+strideq], 1
    pmaddubsw        m3, m2
    pmaddubsw        m0, m2
    pavgw            m3, m7
    pavgw            m0, m7
%else
    mova             m2, [lumaq]
    mova             m1, [srcq]
%endif
%if %1
%if %2
    packuswb         m2, m3, m0 ; luma
%endif
    punpckhbw        m3, m2, m1
    punpcklbw        m2, m1 ; { luma, chroma }
    pmaddubsw        m3, m14
    pmaddubsw        m2, m14
    psraw            m3, 6
    psraw            m2, 6
    paddw            m3, m15
    paddw            m2, m15
    packuswb         m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw        m3, m2, m7
    punpckhbw        m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn            m4, m8, m3
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m0
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    psrld            m0, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m0-2], m6
    pblendw          m2, m4, 0xaa
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu            xm4, [grain_lutq+offxyq]
    vinserti128      m4, [grain_lutq+offxyq+82], 1
    movd            xm0, [grain_lutq+left_offxyq]
    vinserti128      m0, [grain_lutq+left_offxyq+82], 1
    movd            xm6, [grain_lutq+topleft_offxyq]
%if %3
    movq            xm5, [grain_lutq+top_offxyq]
    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
%else
    vinserti128      m6, [grain_lutq+topleft_offxyq+82], 1
    movu            xm5, [grain_lutq+top_offxyq]
    vinserti128      m5, [grain_lutq+top_offxyq+82], 1
%endif

    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m0, m4
%if %3
    punpcklbw       xm6, xm5
%else
    punpcklbw        m6, m5
%endif
    punpcklqdq       m0, m6
%if %1
    vpbroadcastq     m6, [pb_23_22]
    pmaddubsw        m0, m6, m0
%else
    pmaddubsw        m0, m15, m0
%endif
    pmulhrsw         m0, m12
    packsswb         m0, m0
    vpblendd         m4, m0, 0x11
%if %3
    pshuflw         xm0, xm0, q1032
    vpblendd         m5, m0, 0x01
%else
    pshuflw          m0, m0, q1032
    vpblendd         m5, m0, 0x11
%endif
%else
    movu             m4, [grain_lutq+offxyq]
    movd            xm0, [grain_lutq+left_offxyq]
    movu             m5, [grain_lutq+top_offxyq]
    movd            xm6, [grain_lutq+topleft_offxyq]
    punpcklbw       xm0, xm4
    punpcklbw       xm6, xm5
    punpcklqdq      xm0, xm6
%if %1
    vpbroadcastq    xm6, [pb_27_17_17_27]
    pmaddubsw       xm0, xm6, xm0
%else
    pmaddubsw       xm0, xm15, xm0
%endif
    pmulhrsw        xm0, xm12
    packsswb        xm0, xm0
    vpblendd         m4, m0, 0x01
    pshuflw         xm0, xm0, q1032
    vpblendd         m5, m0, 0x01
%endif

    ; followed by v interpolation (top | cur -> cur)
%if %3
    vpermq           m0, m4, q3120
    punpcklbw        m5, m0
    pmaddubsw        m5, m13, m5
    pmulhrsw         m5, m12
    vextracti128    xm0, m5, 1
    packsswb        xm5, xm0
    vpblendd         m5, m4, 0xf0
%else
    punpckhbw        m0, m5, m4
    punpcklbw        m5, m4
    pmaddubsw        m4, m13, m0
    pmaddubsw        m5, m13, m5
    pmulhrsw         m4, m12
    pmulhrsw         m5, m12
    packsswb         m5, m4
%endif
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; unpack chroma source
    punpcklbw        m0, m1, m7
    punpckhbw        m1, m7

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
    sub              hb, 1+%2
%if %2
    ; subsampled: only the first iteration needs v-overlap; remaining rows
    ; continue with h-overlap only
    jg %%loop_y_h_overlap
%else
    je %%end_y_hv_overlap
    vpbroadcastd    m13, [pb_17_27] ; swap weights for second v-overlap row
    add              hd, 0x80000000
    jnc %%loop_y_hv_overlap
    jmp %%loop_y_h_overlap
%endif

%%end_y_hv_overlap:
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq
    jmp %%loop_x_hv_overlap
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

GEN_GRAIN_UV_FN 420, 1, 1
FGUV_FN         420, 1, 1
GEN_GRAIN_UV_FN 422, 1, 0
FGUV_FN         422, 1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN         444, 0, 0

%endif ; ARCH_X86_64