1; Copyright © 2019, VideoLAN and dav1d authors 2; Copyright © 2019, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 32 32pb_8x_27_17_8x_17_27: times 8 db 27, 17 33 times 8 db 17, 27 34pw_1024: times 16 dw 1024 35pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 36rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 37byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 38pw_seed_xor: times 2 dw 0xb524 39 times 2 dw 0x49d8 40pd_m65536: dd ~0xffff 41pb_23_22: times 2 db 23, 22 42pb_1: times 4 db 1 43hmul_bits: dw 32768, 16384, 8192, 4096 44round: dw 2048, 1024, 512 45mul_bits: dw 256, 128, 64, 32, 16 46round_vals: dw 32, 64, 128, 256, 512 47max: dw 255, 240, 235 48min: dw 0, 16 49pb_27_17_17_27: db 27, 17, 17, 27 50pw_1: dw 1 51 52%macro JMP_TABLE 1-* 53 %xdefine %1_table %%table 54 %xdefine %%base %1_table 55 %xdefine %%prefix mangle(private_prefix %+ _%1) 56 %%table: 57 %rep %0 - 1 58 dd %%prefix %+ .ar%2 - %%base 59 %rotate 1 60 %endrep 61%endmacro 62 63ALIGN 4 64JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 65JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 66JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 67JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 68 69struc FGData 70 .seed: resd 1 71 .num_y_points: resd 1 72 .y_points: resb 14 * 2 73 .chroma_scaling_from_luma: resd 1 74 .num_uv_points: resd 2 75 .uv_points: resb 2 * 10 * 2 76 .scaling_shift: resd 1 77 .ar_coeff_lag: resd 1 78 .ar_coeffs_y: resb 24 79 .ar_coeffs_uv: resb 2 * 28 ; includes padding 80 .ar_coeff_shift: resq 1 81 .grain_scale_shift: resd 1 82 .uv_mult: resd 2 83 .uv_luma_mult: resd 2 84 .uv_offset: resd 2 85 .overlap_flag: resd 1 86 .clip_to_restricted_range: resd 1 87endstruc 88 89cextern gaussian_sequence 90 91SECTION .text 92 93INIT_XMM avx2 94cglobal generate_grain_y, 2, 9, 16, buf, fg_data 95 lea r4, [pb_mask] 96%define base r4-pb_mask 97 movq xm1, [base+rnd_next_upperbit_mask] 98 movq xm4, [base+mul_bits] 99 movq xm7, [base+hmul_bits] 100 mov r2d, 
[fg_dataq+FGData.grain_scale_shift] 101 vpbroadcastw xm8, [base+round+r2*2] 102 mova xm5, [base+pb_mask] 103 vpbroadcastw xm0, [fg_dataq+FGData.seed] 104 vpbroadcastd xm9, [base+pd_m65536] 105 mov r2, -73*82 106 sub bufq, r2 107 lea r3, [gaussian_sequence] 108.loop: 109 pand xm2, xm0, xm1 110 psrlw xm3, xm2, 10 111 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 112 pmullw xm2, xm4 ; bits 0x0f00 are set 113 pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds 114 psllq xm6, xm2, 30 115 por xm2, xm6 116 psllq xm6, xm2, 15 117 por xm2, xm6 ; aggregate each bit into next seed's high bit 118 pmulhuw xm3, xm0, xm7 119 por xm2, xm3 ; 4 next output seeds 120 pshuflw xm0, xm2, q3333 121 psrlw xm2, 5 122 pmovzxwd xm3, xm2 123 mova xm6, xm9 124 vpgatherdd xm2, [r3+xm3*2], xm6 125 pandn xm2, xm9, xm2 126 packusdw xm2, xm2 127 pmulhrsw xm2, xm8 128 packsswb xm2, xm2 129 movd [bufq+r2], xm2 130 add r2, 4 131 jl .loop 132 133 ; auto-regression code 134 movsxd r2, [fg_dataq+FGData.ar_coeff_lag] 135 movsxd r2, [base+generate_grain_y_avx2_table+r2*4] 136 lea r2, [r2+base+generate_grain_y_avx2_table] 137 jmp r2 138 139.ar1: 140 DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 141 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 142 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 143 movd xm4, [fg_dataq+FGData.ar_coeffs_y] 144 DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 145 pinsrb xm4, [pb_1], 3 146 pmovsxbw xm4, xm4 147 pshufd xm5, xm4, q1111 148 pshufd xm4, xm4, q0000 149 vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd 150 sub bufq, 82*73-(82*3+79) 151 mov hd, 70 152 mov mind, -128 153 mov maxd, 127 154.y_loop_ar1: 155 mov xq, -76 156 movsx val3d, byte [bufq+xq-1] 157.x_loop_ar1: 158 pmovsxbw xm0, [bufq+xq-82-1] ; top/left 159 pmovsxbw xm2, [bufq+xq-82+0] ; top 160 pmovsxbw xm1, [bufq+xq-82+1] ; top/right 161 punpcklwd xm0, xm2 162 punpcklwd xm1, xm3 163 pmaddwd xm0, xm4 164 pmaddwd xm1, xm5 165 paddd xm0, xm1 166.x_loop_ar1_inner: 167 movd val0d, 
xm0 168 psrldq xm0, 4 169 imul val3d, cf3d 170 add val3d, val0d 171%if WIN64 172 sarx val3d, val3d, shiftd 173%else 174 sar val3d, shiftb 175%endif 176 movsx val0d, byte [bufq+xq] 177 add val3d, val0d 178 cmp val3d, maxd 179 cmovns val3d, maxd 180 cmp val3d, mind 181 cmovs val3d, mind 182 mov byte [bufq+xq], val3b 183 ; keep val3d in-place as left for next x iteration 184 inc xq 185 jz .x_loop_ar1_end 186 test xq, 3 187 jnz .x_loop_ar1_inner 188 jmp .x_loop_ar1 189 190.x_loop_ar1_end: 191 add bufq, 82 192 dec hd 193 jg .y_loop_ar1 194.ar0: 195 RET 196 197.ar2: 198 DEFINE_ARGS buf, fg_data, shift 199 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 200 vpbroadcastw xm14, [base+round_vals-12+shiftq*2] 201 movq xm15, [base+byte_blend+1] 202 pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 203 movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 204 pmovsxbw xm9, xm9 205 DEFINE_ARGS buf, fg_data, h, x 206 pshufd xm12, xm9, q0000 207 pshufd xm13, xm9, q1111 208 pshufd xm11, xm8, q3333 209 pshufd xm10, xm8, q2222 210 pshufd xm9, xm8, q1111 211 pshufd xm8, xm8, q0000 212 pmovzxwd xm14, xm14 213 sub bufq, 82*73-(82*3+79) 214 mov hd, 70 215.y_loop_ar2: 216 mov xq, -76 217 218.x_loop_ar2: 219 pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 220 pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 221 psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] 222 psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] 223 psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] 224 punpcklwd xm2, xm0, xm2 225 punpcklwd xm3, xm4 226 pmaddwd xm2, xm8 227 pmaddwd xm3, xm11 228 paddd xm2, xm3 229 230 psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] 231 psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] 232 psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] 233 punpcklwd xm4, xm5 234 punpcklwd xm6, xm1 235 psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] 236 psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] 237 punpcklwd xm7, xm1 238 pmaddwd xm4, xm9 239 pmaddwd xm6, xm10 240 pmaddwd xm7, xm12 241 paddd xm4, xm6 242 paddd xm2, xm7 243 paddd xm2, xm4 244 paddd xm2, xm14 245 246 movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] 
.x_loop_ar2_inner:
    ; serial part of the lag-2 filter: each output pixel depends on the two
    ; just-written left neighbors, so pixels are produced one at a time
    ; (up to 4 per reload of the y=0 row in xm0)
    pmovsxbw        xm1, xm0
    pmaddwd         xm3, xm1, xm13
    paddd           xm3, xm2
    psrldq          xm1, 4                  ; y=0,x=0
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw           xm3, xm1
    packsswb        xm3, xm3
    pextrb    [bufq+xq], xm3, 0
    ; merge the new pixel back into xm0 so it acts as a left tap next round
    pslldq          xm3, 2
    pand            xm3, xm15
    pandn           xm0, xm15, xm0
    por             xm0, xm3
    psrldq          xm0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar2
    RET

.ar3:
    ; lag-3 AR filter: 24 coefficients; the pre-shuffled coefficient pairs are
    ; spilled to 12 stack slots because they exceed the XMM register budget
    DEFINE_ARGS buf, fg_data, shift
%if WIN64
    SUB             rsp, 16*12
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
%else
    ALLOC_STACK  16*12
%endif
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
    movq           xm15, [base+byte_blend]
    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7
    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15
    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
    pshufd          xm9, xm0, q1111
    pshufd         xm10, xm0, q2222
    pshufd         xm11, xm0, q3333
    pshufd          xm0, xm0, q0000
    pshufd          xm6, xm1, q1111
    pshufd          xm7, xm1, q2222
    pshufd          xm8, xm1, q3333
    pshufd          xm1, xm1, q0000
    pshufd          xm3, xm2, q1111
    psrldq         xm13, xm2, 10
    pinsrw          xm2, [pw_1], 5
    pshufd          xm4, xm2, q2222
    pshufd          xm2, xm2, q0000
    pinsrw         xm13, [base+round_vals+shiftq*2-10], 3 ; cf22/23 + rounding for inner loop
    mova    [rsp+ 0*16], xm0
    mova    [rsp+ 1*16], xm9
    mova    [rsp+ 2*16], xm10
    mova    [rsp+ 3*16], xm11
    mova    [rsp+ 4*16], xm1
    mova    [rsp+ 5*16], xm6
    mova    [rsp+ 6*16], xm7
    mova    [rsp+ 7*16], xm8
    mova    [rsp+ 8*16], xm2
    mova    [rsp+ 9*16], xm3
    mova    [rsp+10*16], xm4
    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 82*73-(82*3+79)
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76

.x_loop_ar3:
    ; sign-extend the three rows above into word lanes (low/high halves)
    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    pxor            xm3, xm3
    pcmpgtb         xm6, xm3, xm2
    pcmpgtb         xm5, xm3, xm1
    pcmpgtb         xm4, xm3, xm0
    punpckhbw       xm3, xm0, xm4
    punpcklbw       xm0, xm4
    punpckhbw       xm4, xm1, xm5
    punpcklbw       xm1, xm5
    punpckhbw       xm5, xm2, xm6
    punpcklbw       xm2, xm6

    ; y=-3 row taps (7 offsets), madd'ed against the spilled coefficients
    psrldq          xm6, xm0, 2
    psrldq          xm7, xm0, 4
    psrldq          xm8, xm0, 6
    psrldq          xm9, xm0, 8
    palignr        xm10, xm3, xm0, 10
    palignr        xm11, xm3, xm0, 12

    punpcklwd       xm0, xm6
    punpcklwd       xm7, xm8
    punpcklwd       xm9, xm10
    punpcklwd      xm11, xm1
    pmaddwd         xm0, [rsp+ 0*16]
    pmaddwd         xm7, [rsp+ 1*16]
    pmaddwd         xm9, [rsp+ 2*16]
    pmaddwd        xm11, [rsp+ 3*16]
    paddd           xm0, xm7
    paddd           xm9, xm11
    paddd           xm0, xm9

    ; y=-2 row taps
    psrldq          xm6, xm1, 2
    psrldq          xm7, xm1, 4
    psrldq          xm8, xm1, 6
    psrldq          xm9, xm1, 8
    palignr        xm10, xm4, xm1, 10
    palignr        xm11, xm4, xm1, 12
    psrldq         xm12, xm2, 2

    punpcklwd       xm6, xm7
    punpcklwd       xm8, xm9
    punpcklwd      xm10, xm11
    punpcklwd      xm12, xm2, xm12
    pmaddwd         xm6, [rsp+ 4*16]
    pmaddwd         xm8, [rsp+ 5*16]
    pmaddwd        xm10, [rsp+ 6*16]
    pmaddwd        xm12, [rsp+ 7*16]
    paddd           xm6, xm8
    paddd          xm10, xm12
    paddd           xm6, xm10
    paddd           xm0, xm6

    ; y=-1 row taps (+ rounding constant paired in via xm14)
    psrldq          xm6, xm2, 4
    psrldq          xm7, xm2, 6
    psrldq          xm8, xm2, 8
    palignr         xm9, xm5, xm2, 10
    palignr         xm5, xm5, xm2, 12

    punpcklwd       xm6, xm7
    punpcklwd       xm8, xm9
    punpcklwd       xm5, xm14
    pmaddwd         xm6, [rsp+ 8*16]
    pmaddwd         xm8, [rsp+ 9*16]
    pmaddwd         xm5, [rsp+10*16]
    paddd           xm0, xm6
    paddd           xm8, xm5
    paddd           xm0, xm8                ; xm0 = all top-row contributions

    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    ; serial left-neighbor part: one pixel per iteration, 4 per reload of xm1
    pmovsxbw        xm2, xm1
    pmaddwd         xm2, xm13
    pshufd          xm3, xm2, q1111
    paddd           xm2, xm3                ; left+cur
    paddd           xm2, xm0                ; add top
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb        xm2, xm2
    pextrb    [bufq+xq], xm2, 0
    ; merge the new pixel into xm1 so it becomes a left tap next iteration
    pslldq          xm2, 3
    pand            xm2, xm15
    pandn           xm1, xm15, xm1
    por             xm1, xm2
    psrldq          xm1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar3
    RET

;-----------------------------------------------------------------------------
; Chroma grain generation, instantiated for 420/422/444.
; %1 = subsampling name, %2 = ss_x, %3 = ss_y. The chroma seed is derived from
; the frame seed xored with a per-plane constant (pw_seed_xor[uv]); the AR
; filters additionally take taps from the luma grain plane (bufy), downsampled
; with pmaddubsw/pmulhrsw when the plane is subsampled.
;-----------------------------------------------------------------------------
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
    lea              r4, [pb_mask]
%define base r4-pb_mask
    movq            xm1, [base+rnd_next_upperbit_mask]
    movq            xm4, [base+mul_bits]
    movq            xm7, [base+hmul_bits]
    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    vpbroadcastw    xm8, [base+round+r5*2]
    mova            xm5, [base+pb_mask]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    vpbroadcastw    xm9, [base+pw_seed_xor+uvq*4]
    pxor            xm0, xm9                 ; per-plane chroma seed
    vpbroadcastd    xm9, [base+pd_m65536]
    lea              r6, [gaussian_sequence]
%if %2
    ; subsampled width: fill 44 pixels per row, 73-35*ss_y rows
    mov             r7d, 73-35*%3
    add            bufq, 44
.loop_y:
    mov              r5, -44
.loop_x:
%else
    mov              r5, -73*82
    sub            bufq, r5
.loop:
%endif
    ; identical 4-lane seed update + gaussian_sequence gather as in the luma path
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
    psllq           xm6, xm2, 30
    por             xm2, xm6
    psllq           xm6, xm2, 15
    por             xm2, xm6            ; aggregate each bit into next seed's high bit
    pmulhuw         xm3, xm0, xm7
    por             xm2, xm3            ; 4 next output seeds
    pshuflw         xm0, xm2, q3333
    psrlw           xm2, 5
    pmovzxwd        xm3, xm2
    mova            xm6, xm9
    vpgatherdd      xm2, [r6+xm3*2], xm6
    pandn           xm2, xm9, xm2
    packusdw        xm2, xm2
    pmulhrsw        xm2, xm8
    packsswb        xm2, xm2
    movd      [bufq+r5], xm2
    add              r5, 4
%if %2
    jl .loop_x
    add            bufq, 82
    dec             r7d
    jg .loop_y
%else
    jl .loop
%endif

    ; auto-regression code: dispatch on ar_coeff_lag (0..3)
    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r5, [base+generate_grain_uv_%1_avx2_table+r5*4]
    lea              r5, [r5+base+generate_grain_uv_%1_avx2_table]
    jmp              r5

.ar0:
    ; lag 0: chroma grain = grain + (luma_tap * cf) >> shift, whole rows at a
    ; time with 256-bit vectors; luma is (optionally) box-downsampled first
    INIT_YMM avx2
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    imul            uvd, 28                  ; uv plane stride into ar_coeffs_uv
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd            xm3, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h
    pmovsxbw        xm4, xm4
%if %2
    vpbroadcastd     m7, [pb_1]
    vpbroadcastw     m6, [hmul_bits+2+%3*2]
%endif
    vpbroadcastw     m4, xm4
    vpbroadcastw     m3, xm3
    pxor            m12, m12
%if %2
    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub            bufq, 82*70-3
%endif
    add           bufyq, 3+82*3
    mov              hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels: average pairs of luma bytes (pmaddubsw with pb_1),
    ; plus the row below when ss_y, then scale with pmulhrsw
    movu            xm8, [bufyq]
%if %3
    movu            xm9, [bufyq+82]
%endif
    movu           xm10, [bufyq+16]
%if %3
    movu           xm11, [bufyq+82+16]
%endif
    vinserti128      m8, [bufyq+32], 1
%if %3
    vinserti128      m9, [bufyq+82+32], 1
%endif
    vinserti128     m10, [bufyq+48], 1
%if %3
    vinserti128     m11, [bufyq+82+48], 1
%endif
    pmaddubsw        m8, m7, m8
%if %3
    pmaddubsw        m9, m7, m9
%endif
    pmaddubsw       m10, m7, m10
%if %3
    pmaddubsw       m11, m7, m11
    paddw            m8, m9
    paddw           m10, m11
%endif
    pmulhrsw         m8, m6
    pmulhrsw        m10, m6
%else
    xor             r3d, r3d
    ; first 32x2 pixels
.x_loop_ar0:
    movu             m8, [bufyq+r3]
    pcmpgtb          m9, m12, m8
    punpckhbw       m10, m8, m9
    punpcklbw        m8, m9
%endif
    pmullw           m8, m4
    pmullw          m10, m4
    pmulhrsw         m8, m3
    pmulhrsw        m10, m3
%if %2
    movu             m0, [bufq]
%else
    movu             m0, [bufq+r3]
%endif
    pcmpgtb          m1, m12, m0
    punpckhbw        m9, m0, m1
    punpcklbw        m0, m1
    paddw            m0, m8
    paddw            m9, m10
    packsswb         m0, m9
%if %2
    movu         [bufq], m0
%else
    movu      [bufq+r3], m0
    add             r3d, 32
    cmp             r3d, 64
    jl .x_loop_ar0
%endif

    ; last 6/12 pixels
    movu            xm8, [bufyq+32*2]
%if %2
%if %3
    movu            xm9, [bufyq+32*2+82]
%endif
    pmaddubsw       xm8, xm7, xm8
%if %3
    pmaddubsw       xm9, xm7, xm9
    paddw           xm8, xm9
%endif
    pmulhrsw        xm8, xm6
    pmullw          xm8, xm4
    pmulhrsw        xm8, xm3
    movq            xm0, [bufq+32]
    pcmpgtb         xm9, xm12, xm0
    punpcklbw       xm9, xm0, xm9
    paddw           xm8, xm9
    packsswb        xm8, xm8
    vpblendw        xm0, xm8, xm0, 1000b    ; keep last word of dst untouched
    movq      [bufq+32], xm0
%else
    pcmpgtb         xm9, xm12, xm8
    punpckhbw      xm10, xm8, xm9
    punpcklbw       xm8, xm9
    pmullw         xm10, xm4
    pmullw          xm8, xm4
    pmulhrsw       xm10, xm3
    pmulhrsw        xm8, xm3
    movu            xm0, [bufq+64]
    pcmpgtb         xm9, xm12, xm0
    punpcklbw       xm1, xm0, xm9
    punpckhbw       xm9, xm0, xm9
    paddw           xm1, xm8
    paddw           xm9, xm10
    packsswb        xm1, xm9
    vpblendw        xm0, xm1, xm0, 11000000b ; keep last 2 words of dst untouched
    movu      [bufq+64], xm0
%endif

    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar0
    RET

.ar1:
    ; lag-1 AR filter with an extra luma tap (cf4); same scalar left-neighbor
    ; recurrence structure as the luma .ar1
    INIT_XMM avx2
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    imul            uvd, 28
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 ; luma coefficient
    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    pmovsxwd        xm3, [base+round_vals+shiftq*2-12] ; rnd
%if %2
    vpbroadcastd    xm7, [pb_1]
    vpbroadcastw    xm6, [hmul_bits+2+%3*2]
%endif
    vpbroadcastd    xm3, xm3
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
    mov            mind, -128
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -(76>>%2)
    movsx         val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
%if %2
    movq            xm8, [bufyq+xq*2]
%if %3
    movq            xm9, [bufyq+xq*2+82]
%endif
%endif
    psrldq          xm2, xm0, 2             ; top
    psrldq          xm1, xm0, 4             ; top/right
%if %2
    ; downsample the co-located luma samples to match chroma resolution
    pmaddubsw       xm8, xm7, xm8
%if %3
    pmaddubsw       xm9, xm7, xm9
    paddw           xm8, xm9
%endif
    pmulhrsw        xm8, xm6
%else
    pmovsxbw        xm8, [bufyq+xq]
%endif
    punpcklwd       xm0, xm2
    punpcklwd       xm1, xm8
    ; accumulate top taps + luma tap + rounding for 4 pixels
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
    paddd           xm0, xm3
.x_loop_ar1_inner:
    ; scalar left-neighbor recurrence, one pixel at a time
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sarx          val3d, val3d, shiftd
    movsx         val0d, byte [bufq+xq]
    add           val3d, val0d
    ; clamp to [min, max] = [-128, 127]
    cmp           val3d, maxd
    cmovns        val3d, maxd
    cmp           val3d, mind
    cmovs         val3d, mind
    mov  byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar1
    RET

.ar2:
    ; lag-2 AR filter with a luma tap (cf12, paired with the rounding constant
    ; via xm15/xm14); vector part mirrors the luma .ar2
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vpbroadcastw   xm15, [base+round_vals-12+shiftq*2]
    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
    pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
    pinsrw          xm9, [base+pw_1], 5
%if %2
    vpbroadcastw    xm7, [base+hmul_bits+2+%3*2]
    vpbroadcastd    xm6, [base+pb_1]
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd         xm12, xm9, q0000
    pshufd         xm13, xm9, q1111
    pshufd         xm14, xm9, q2222
    pshufd         xm11, xm8, q3333
    pshufd         xm10, xm8, q2222
    pshufd          xm9, xm8, q1111
    pshufd          xm8, xm8, q0000
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
.y_loop_ar2:
    mov              xq, -(76>>%2)

.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
    punpcklwd       xm2, xm0, xm2
    punpcklwd       xm3, xm4
    pmaddwd         xm2, xm8
    pmaddwd         xm3, xm11
    paddd           xm2, xm3

    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
    psrldq          xm0, 8                  ; y=-2,x=[+2,+5]
    punpcklwd       xm4, xm5
    punpcklwd       xm0, xm1
    psrldq          xm3, xm1, 6             ; y=-1,x=[+1,+5]
    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
    punpcklwd       xm3, xm1
    pmaddwd         xm4, xm9
    pmaddwd         xm0, xm10
    pmaddwd         xm3, xm12
    paddd           xm4, xm0
    paddd           xm2, xm3
    paddd           xm2, xm4

%if %2
    ; downsampled luma tap (+82 row when ss_y)
    movq            xm0, [bufyq+xq*2]
%if %3
    movq            xm3, [bufyq+xq*2+82]
%endif
    pmaddubsw       xm0, xm6, xm0
%if %3
    pmaddubsw       xm3, xm6, xm3
    paddw           xm0, xm3
%endif
    pmulhrsw        xm0, xm7
%else
    pmovsxbw        xm0, [bufyq+xq]
%endif
    punpcklwd       xm0, xm15               ; pair luma tap with rounding const
    pmaddwd         xm0, xm14
    paddd           xm2, xm0

    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; serial left-neighbor part, one pixel per iteration
    pmovsxbw        xm0, xm0
    pmaddwd         xm3, xm0, xm13
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; blend the new pixel back into position 1 of xm0 so it feeds the next tap
    pslldq          xm3, 2
    psrldq          xm0, 2
    paddw           xm3, xm0
    vpblendw        xm0, xm3, 00000010b
    packsswb        xm0, xm0
    pextrb    [bufq+xq], xm0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar2
    RET

.ar3:
    ; lag-3 AR filter with a luma tap (cf24): coefficients are pre-shuffled
    ; into 12 stack slots as in the luma .ar3, with cf24 interleaved into the
    ; q2222 slot and the rounding constant folded into slot 11
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    SUB             rsp, 16*12
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
    pmovsxbw        xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
    pshufd          xm9, xm0, q1111
    pshufd         xm10, xm0, q2222
    pshufd         xm11, xm0, q3333
    pshufd          xm0, xm0, q0000
    pshufd          xm6, xm1, q1111
    pshufd          xm7, xm1, q2222
    pshufd          xm8, xm1, q3333
    pshufd          xm1, xm1, q0000
    pshufd          xm3, xm2, q1111
    pshufd          xm4, xm2, q2222
    vpbroadcastw    xm5, xm5
    vpblendw        xm4, xm5, 10101010b     ; interleave luma cf
    psrldq          xm5, xm2, 10
    pshufd          xm2, xm2, q0000
    pinsrw          xm5, [base+round_vals+shiftq*2-10], 3
    pmovzxwd       xm14, xm14
    mova    [rsp+ 0*16], xm0
    mova    [rsp+ 1*16], xm9
    mova    [rsp+ 2*16], xm10
    mova    [rsp+ 3*16], xm11
    mova    [rsp+ 4*16], xm1
    mova    [rsp+ 5*16], xm6
    mova    [rsp+ 6*16], xm7
    mova    [rsp+ 7*16], xm8
    mova    [rsp+ 8*16], xm2
    mova    [rsp+ 9*16], xm3
    mova    [rsp+10*16], xm4
    mova    [rsp+11*16], xm5
%if %2
    vpbroadcastd   xm13, [base+pb_1]
    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
.y_loop_ar3:
    mov              xq, -(76>>%2)

.x_loop_ar3:
    ; sign-extend the three rows above into word lanes (low/high halves)
    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    pxor            xm3, xm3
    pcmpgtb         xm6, xm3, xm2
    pcmpgtb         xm5, xm3, xm1
    pcmpgtb         xm4, xm3, xm0
    punpckhbw       xm3, xm0, xm4
    punpcklbw       xm0, xm4
    punpckhbw       xm4, xm1, xm5
    punpcklbw       xm1, xm5
    punpckhbw       xm5, xm2, xm6
    punpcklbw       xm2, xm6

    ; y=-3 row taps
    psrldq          xm6, xm0, 2
    psrldq          xm7, xm0, 4
    psrldq          xm8, xm0, 6
    psrldq          xm9, xm0, 8
    palignr        xm10, xm3, xm0, 10
    palignr        xm11, xm3, xm0, 12

    punpcklwd       xm0, xm6
    punpcklwd       xm7, xm8
    punpcklwd       xm9, xm10
    punpcklwd      xm11, xm1
    pmaddwd         xm0, [rsp+ 0*16]
    pmaddwd         xm7, [rsp+ 1*16]
    pmaddwd         xm9, [rsp+ 2*16]
    pmaddwd        xm11, [rsp+ 3*16]
    paddd           xm0, xm7
    paddd           xm9, xm11
    paddd           xm0, xm9

    ; y=-2 row taps
    psrldq          xm6, xm1, 2
    psrldq          xm7, xm1, 4
    psrldq          xm8, xm1, 6
    psrldq          xm9, xm1, 8
    palignr        xm10, xm4, xm1, 10
    palignr        xm11, xm4, xm1, 12
    psrldq         xm12, xm2, 2

    punpcklwd       xm6, xm7
    punpcklwd       xm8, xm9
    punpcklwd      xm10, xm11
    punpcklwd      xm12, xm2, xm12
    pmaddwd         xm6, [rsp+ 4*16]
    pmaddwd         xm8, [rsp+ 5*16]
    pmaddwd        xm10, [rsp+ 6*16]
    pmaddwd        xm12, [rsp+ 7*16]
    paddd           xm6, xm8
    paddd          xm10, xm12
    paddd           xm6, xm10
    paddd           xm0, xm6

    ; y=-1 row taps
    psrldq          xm6, xm2, 4
    psrldq          xm7, xm2, 6
    psrldq          xm8, xm2, 8
    palignr         xm9, xm5, xm2, 10
    palignr         xm5, xm5, xm2, 12

%if %2
    ; downsampled luma tap for this chroma position
    movq            xm1, [bufyq+xq*2]
%if %3
    movq            xm2, [bufyq+xq*2+82]
%endif
    pmaddubsw       xm1, xm13, xm1
%if %3
    pmaddubsw       xm2, xm13, xm2
    paddw           xm1, xm2
%endif
    pmulhrsw        xm1, xm15
%else
    pmovsxbw        xm1, [bufyq+xq]
%endif

    punpcklwd       xm6, xm7
    punpcklwd       xm8, xm9
    punpcklwd       xm5, xm1                ; pair last tap with the luma tap
    pmaddwd         xm6, [rsp+ 8*16]
    pmaddwd         xm8, [rsp+ 9*16]
    pmaddwd         xm5, [rsp+10*16]
    paddd           xm0, xm6
    paddd           xm8, xm5
    paddd           xm0, xm8
    paddd           xm0, xm14               ; xm0 = top contributions + rounding

    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    ; serial left-neighbor part, one pixel per iteration
    pmovsxbw        xm1, xm1
    pmaddwd         xm2, xm1, [rsp+16*11]
    pshufd          xm3, xm2, q1111
    paddd           xm2, xm3                ; left+cur
    paddd           xm2, xm0                ; add top
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    ; blend the new pixel into word 3 of xm1 so it feeds the next left tap
    pslldq          xm2, 6
    vpblendw        xm1, xm2, 1000b
    packsswb        xm1, xm1
    pextrb    [bufq+xq], xm1, 3
    psrldq          xm1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar3
    RET
%endmacro

generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

;-----------------------------------------------------------------------------
; fgy_32x32xn: apply luma film grain to a 32-pixel-wide column strip.
; noise = round2(scaling[src] * grain_lut[offy+y][offx+x], scaling_shift),
; dst = clip(src + noise); per-block grain offsets are derived from a running
; seed, with optional horizontal/vertical overlap blending at block seams.
; NOTE(review): this function continues beyond the visible part of the file.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    pcmpeqw         m10, m10
    psrld           m10, 24                 ; m10 = 0x000000ff mask per dword
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    lea              r8, [pb_mask]
%define base r8-pb_mask
    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    vpbroadcastw    m12, [base+max+r7*4]
    vpbroadcastw    m13, [base+min+r7*2]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    movifnidn      sbyd, sbym
    test           sbyd, sbyd
    setnz           r7b
    test            r7b, overlapb
    jnz .vertical_overlap                   ; rows below the first need v-overlap

    ; derive the row seed from sby and the frame seed
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq               ; dst addressed as [dstq+srcq]

.loop_x:
    ; advance the seed and split it into grain_lut x/y offsets for this block
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word
    punpckhwd        m5, m0, m2
    punpcklwd        m4, m0, m2
    punpckhwd        m7, m1, m2
    punpcklwd        m6, m1, m2             ; m4-7: src as dword

    ; scaling[src]: gather one byte per pixel (masked to 8 bits via m10)
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m8
pmullw m3, m5 1070 pmulhrsw m2, m11 1071 pmulhrsw m3, m11 1072 1073 ; dst = clip_pixel(src, noise) 1074 paddw m0, m2 1075 paddw m1, m3 1076 pmaxsw m0, m13 1077 pmaxsw m1, m13 1078 pminsw m0, m12 1079 pminsw m1, m12 1080 packuswb m0, m1 1081 mova [dstq+srcq], m0 1082 1083 add srcq, strideq 1084 add grain_lutq, 82 1085 dec hd 1086 jg .loop_y 1087 1088 add wq, 32 1089 jge .end 1090 lea srcq, [src_bakq+wq] 1091 test overlapd, overlapd 1092 jz .loop_x 1093 1094 ; r8m = sbym 1095 movd xm15, [pb_27_17_17_27] 1096 cmp dword r8m, 0 1097 jne .loop_x_hv_overlap 1098 1099 ; horizontal overlap (without vertical overlap) 1100 movd xm14, [pw_1024] 1101.loop_x_h_overlap: 1102 mov r6d, seed 1103 or seed, 0xEFF4 1104 shr r6d, 1 1105 test seeb, seeh 1106 lea seed, [r6+0x8000] 1107 cmovp seed, r6d ; updated seed 1108 1109 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1110 offx, offy, see, left_offxy 1111 1112 lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx 1113 mov offxd, seed 1114 rorx offyd, seed, 8 1115 shr offxd, 12 1116 and offyd, 0xf 1117 imul offyd, 164 1118 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1119 1120 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1121 h, offxy, see, left_offxy 1122 1123 mov hd, hm 1124 mov grain_lutq, grain_lutmp 1125.loop_y_h_overlap: 1126 ; src 1127 mova m0, [srcq] 1128 pxor m2, m2 1129 punpckhbw m1, m0, m2 1130 punpcklbw m0, m2 ; m0-1: src as word 1131 punpckhwd m5, m0, m2 1132 punpcklwd m4, m0, m2 1133 punpckhwd m7, m1, m2 1134 punpcklwd m6, m1, m2 ; m4-7: src as dword 1135 1136 ; scaling[src] 1137 pcmpeqw m3, m3 1138 pcmpeqw m9, m9 1139 vpgatherdd m8, [scalingq+m4], m3 1140 vpgatherdd m4, [scalingq+m5], m9 1141 pcmpeqw m3, m3 1142 pcmpeqw m9, m9 1143 vpgatherdd m5, [scalingq+m6], m3 1144 vpgatherdd m6, [scalingq+m7], m9 1145 pand m8, m10 1146 pand m4, m10 1147 pand m5, m10 1148 pand m6, m10 1149 packusdw m8, m4 1150 packusdw m5, m6 1151 1152 ; grain = grain_lut[offy+y][offx+x] 1153 movu 
m3, [grain_lutq+offxyq] 1154 movd xm4, [grain_lutq+left_offxyq] 1155 punpcklbw xm4, xm3 1156 pmaddubsw xm4, xm15, xm4 1157 pmulhrsw xm4, xm14 1158 packsswb xm4, xm4 1159 vpblendw xm4, xm3, 11111110b 1160 vpblendd m3, m4, 00001111b 1161 pcmpgtb m7, m2, m3 1162 punpcklbw m2, m3, m7 1163 punpckhbw m3, m7 1164 1165 ; noise = round2(scaling[src] * grain, scaling_shift) 1166 pmullw m2, m8 1167 pmullw m3, m5 1168 pmulhrsw m2, m11 1169 pmulhrsw m3, m11 1170 1171 ; dst = clip_pixel(src, noise) 1172 paddw m0, m2 1173 paddw m1, m3 1174 pmaxsw m0, m13 1175 pmaxsw m1, m13 1176 pminsw m0, m12 1177 pminsw m1, m12 1178 packuswb m0, m1 1179 mova [dstq+srcq], m0 1180 1181 add srcq, strideq 1182 add grain_lutq, 82 1183 dec hd 1184 jg .loop_y_h_overlap 1185 1186 add wq, 32 1187 jge .end 1188 lea srcq, [src_bakq+wq] 1189 1190 ; r8m = sbym 1191 cmp dword r8m, 0 1192 jne .loop_x_hv_overlap 1193 jmp .loop_x_h_overlap 1194 1195.end: 1196 RET 1197 1198.vertical_overlap: 1199 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 1200 1201 movzx sbyd, sbyb 1202 imul seed, [fg_dataq+FGData.seed], 0x00010001 1203 imul r7d, sbyd, 173 * 0x00010001 1204 imul sbyd, 37 * 0x01000100 1205 add r7d, (105 << 16) | 188 1206 add sbyd, (178 << 24) | (141 << 8) 1207 and r7d, 0x00ff00ff 1208 and sbyd, 0xff00ff00 1209 xor seed, r7d 1210 xor seed, sbyd ; (cur_seed << 16) | top_seed 1211 1212 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1213 unused1, unused2, see, overlap 1214 1215 lea src_bakq, [srcq+wq] 1216 neg wq 1217 sub dstq, srcq 1218 1219 vpbroadcastd m14, [pw_1024] 1220.loop_x_v_overlap: 1221 vpbroadcastw m15, [pb_27_17_17_27] 1222 1223 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1224 mov r6d, seed 1225 or seed, 0xeff4eff4 1226 test seeb, seeh 1227 setp r7b ; parity of top_seed 1228 shr seed, 16 1229 shl r7d, 16 1230 test seeb, seeh 1231 setp r7b ; parity of cur_seed 1232 or r6d, 0x00010001 1233 xor r7d, r6d 1234 rorx seed, r7d, 
1 ; updated (cur_seed << 16) | top_seed 1235 1236 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1237 offx, offy, see, overlap, top_offxy 1238 1239 rorx offyd, seed, 8 1240 rorx offxd, seed, 12 1241 and offyd, 0xf000f 1242 and offxd, 0xf000f 1243 imul offyd, 164 1244 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1245 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1246 1247 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1248 h, offxy, see, overlap, top_offxy 1249 1250 movzx top_offxyd, offxyw 1251 shr offxyd, 16 1252 1253 mov hd, hm 1254 mov grain_lutq, grain_lutmp 1255.loop_y_v_overlap: 1256 ; src 1257 mova m0, [srcq] 1258 pxor m2, m2 1259 punpckhbw m1, m0, m2 1260 punpcklbw m0, m2 ; m0-1: src as word 1261 punpckhwd m5, m0, m2 1262 punpcklwd m4, m0, m2 1263 punpckhwd m7, m1, m2 1264 punpcklwd m6, m1, m2 ; m4-7: src as dword 1265 1266 ; scaling[src] 1267 pcmpeqw m3, m3 1268 pcmpeqw m9, m9 1269 vpgatherdd m8, [scalingq+m4], m3 1270 vpgatherdd m4, [scalingq+m5], m9 1271 pcmpeqw m3, m3 1272 pcmpeqw m9, m9 1273 vpgatherdd m5, [scalingq+m6], m3 1274 vpgatherdd m6, [scalingq+m7], m9 1275 pand m8, m10 1276 pand m4, m10 1277 pand m5, m10 1278 pand m6, m10 1279 packusdw m8, m4 1280 packusdw m5, m6 1281 1282 ; grain = grain_lut[offy+y][offx+x] 1283 movu m3, [grain_lutq+offxyq] 1284 movu m4, [grain_lutq+top_offxyq] 1285 punpckhbw m6, m4, m3 1286 punpcklbw m4, m3 1287 pmaddubsw m6, m15, m6 1288 pmaddubsw m4, m15, m4 1289 pmulhrsw m6, m14 1290 pmulhrsw m4, m14 1291 packsswb m3, m4, m6 1292 pcmpgtb m7, m2, m3 1293 punpcklbw m2, m3, m7 1294 punpckhbw m3, m7 1295 1296 ; noise = round2(scaling[src] * grain, scaling_shift) 1297 pmullw m2, m8 1298 pmullw m3, m5 1299 pmulhrsw m2, m11 1300 pmulhrsw m3, m11 1301 1302 ; dst = clip_pixel(src, noise) 1303 paddw m0, m2 1304 paddw m1, m3 1305 pmaxsw m0, m13 1306 pmaxsw m1, m13 1307 pminsw m0, m12 1308 pminsw m1, m12 1309 packuswb m0, m1 1310 mova [dstq+srcq], m0 1311 1312 vpbroadcastw m15, 
[pb_27_17_17_27+2] ; swap weights for second v-overlap line 1313 add srcq, strideq 1314 add grain_lutq, 82 1315 dec hw 1316 jz .end_y_v_overlap 1317 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1318 ; remaining (up to) 30 lines 1319 btc hd, 16 1320 jnc .loop_y_v_overlap 1321 jmp .loop_y 1322 1323.end_y_v_overlap: 1324 add wq, 32 1325 jge .end_hv 1326 lea srcq, [src_bakq+wq] 1327 1328 ; since fg_dataq.overlap is guaranteed to be set, we never jump 1329 ; back to .loop_x_v_overlap, and instead always fall-through to 1330 ; h+v overlap 1331 1332 movd xm15, [pb_27_17_17_27] 1333.loop_x_hv_overlap: 1334 vpbroadcastw m8, [pb_27_17_17_27] 1335 1336 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1337 mov r6d, seed 1338 or seed, 0xeff4eff4 1339 test seeb, seeh 1340 setp r7b ; parity of top_seed 1341 shr seed, 16 1342 shl r7d, 16 1343 test seeb, seeh 1344 setp r7b ; parity of cur_seed 1345 or r6d, 0x00010001 1346 xor r7d, r6d 1347 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1348 1349 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1350 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1351 1352 lea topleft_offxyq, [top_offxyq+32] 1353 lea left_offxyq, [offyq+32] 1354 rorx offyd, seed, 8 1355 rorx offxd, seed, 12 1356 and offyd, 0xf000f 1357 and offxd, 0xf000f 1358 imul offyd, 164 1359 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1360 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1361 1362 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1363 h, offxy, see, left_offxy, top_offxy, topleft_offxy 1364 1365 movzx top_offxyd, offxyw 1366 shr offxyd, 16 1367 1368 mov hd, hm 1369 mov grain_lutq, grain_lutmp 1370.loop_y_hv_overlap: 1371 ; src 1372 mova m0, [srcq] 1373 pxor m2, m2 1374 punpckhbw m1, m0, m2 1375 punpcklbw m0, m2 ; m0-1: src as word 1376 punpckhwd m5, m0, m2 1377 punpcklwd m4, m0, m2 1378 punpckhwd m7, m1, m2 1379 punpcklwd m6, m1, m2 ; m4-7: src as dword 1380 1381 ; 
scaling[src] 1382 pcmpeqw m3, m3 1383 ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel 1384 vpgatherdd m9, [scalingq+m4], m3 1385 pcmpeqw m3, m3 1386 vpgatherdd m4, [scalingq+m5], m3 1387 pcmpeqw m3, m3 1388 vpgatherdd m5, [scalingq+m6], m3 1389 pcmpeqw m3, m3 1390 vpgatherdd m6, [scalingq+m7], m3 1391 pand m9, m10 1392 pand m4, m10 1393 pand m5, m10 1394 pand m6, m10 1395 packusdw m9, m4 1396 packusdw m5, m6 1397 1398 ; grain = grain_lut[offy+y][offx+x] 1399 movu m3, [grain_lutq+offxyq] 1400 movu m6, [grain_lutq+top_offxyq] 1401 movd xm4, [grain_lutq+left_offxyq] 1402 movd xm7, [grain_lutq+topleft_offxyq] 1403 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1404 punpcklbw xm4, xm3 1405 punpcklbw xm7, xm6 1406 pmaddubsw xm4, xm15, xm4 1407 pmaddubsw xm7, xm15, xm7 1408 pmulhrsw xm4, xm14 1409 pmulhrsw xm7, xm14 1410 packsswb xm4, xm4 1411 packsswb xm7, xm7 1412 vpblendw xm4, xm3, 11111110b 1413 vpblendw xm7, xm6, 11111110b 1414 vpblendd m3, m4, 00001111b 1415 vpblendd m6, m7, 00001111b 1416 ; followed by v interpolation (top | cur -> cur) 1417 punpckhbw m7, m6, m3 1418 punpcklbw m6, m3 1419 pmaddubsw m7, m8, m7 1420 pmaddubsw m6, m8, m6 1421 pmulhrsw m7, m14 1422 pmulhrsw m6, m14 1423 packsswb m3, m6, m7 1424 pcmpgtb m7, m2, m3 1425 punpcklbw m2, m3, m7 1426 punpckhbw m3, m7 1427 1428 ; noise = round2(scaling[src] * grain, scaling_shift) 1429 pmullw m2, m9 1430 pmullw m3, m5 1431 pmulhrsw m2, m11 1432 pmulhrsw m3, m11 1433 1434 ; dst = clip_pixel(src, noise) 1435 paddw m0, m2 1436 paddw m1, m3 1437 pmaxsw m0, m13 1438 pmaxsw m1, m13 1439 pminsw m0, m12 1440 pminsw m1, m12 1441 packuswb m0, m1 1442 mova [dstq+srcq], m0 1443 1444 vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line 1445 add srcq, strideq 1446 add grain_lutq, 82 1447 dec hw 1448 jz .end_y_hv_overlap 1449 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1450 ; remaining (up to) 30 lines 1451 
    ; second of the two v-overlap rows toggles bit 16 of h so the loop
    ; falls through to the plain h-overlap loop afterwards
    btc              hd, 16
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq]
    jl .loop_x_hv_overlap                   ; flags from the add: more columns while wq < 0

.end_hv:
    RET

;------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Instantiates fguv_32x32xn_i<name> (AVX2), which applies film grain to a
; 32xN-sized chroma block, using the co-located luma plane to index the
; scaling LUT (and optionally blending luma/chroma first).
;   %1 = chroma layout suffix for the symbol name (420 / 422 / 444)
;   %2 = ss_hor: 1 if chroma is subsampled horizontally
;   %3 = ss_ver: 1 if chroma is subsampled vertically
;
; Registers set up here and relied on by all loops below:
;   m10 = 0x000000ff replicated per dword (masks vpgatherdd byte results)
;   m11 = broadcast word from mul_bits, selected by FGData.scaling_shift
;         (rounding multiplier for the pmulhrsw noise scaling)
;   m12 = broadcast max pixel value (from max: 255/240/235; index is the
;         clip flag shifted left by is_id, so is_id picks 235 over 240)
;   m13 = broadcast min pixel value (from min: 0 or 16, by clip flag)
;------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, luma, lstride, uv_pl, is_id
    pcmpeqw         m10, m10
    psrld           m10, 24                 ; m10 = dword 0x000000ff
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    lea              r8, [pb_mask]
%define base r8-pb_mask
    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, dword is_idm
    vpbroadcastw    m13, [base+min+r7*2]
    shlx            r7d, r7d, r9d           ; clip<<is_id: selects 235 for is_id content
    vpbroadcastw    m12, [base+max+r7*2]

    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

; %%FGUV_32x32xN_LOOP not-csfl, ss_hor, ss_ver
; The actual row/column loop, instantiated twice per function:
;   %1 == 1: "not chroma-scaling-from-luma" path; m14 holds the interleaved
;            {uv_luma_mult, uv_mult} byte pair and m15 the uv_offset, so the
;            LUT index is a luma/chroma blend.
;   %1 == 0: csfl path; m14 = pw_1024 pmulhrsw rounding constant and
;            m15 = edge-overlap filter weights (pb_23_22 / pb_27_17_17_27).
; grain_lut rows are 82 bytes apart (stride 82 throughout).
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

%if %1
    mov             r7d, dword r11m         ; uv_pl: which chroma plane (0/1)
    vpbroadcastb     m0, [fg_dataq+FGData.uv_mult+r7*4]
    vpbroadcastb     m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
    punpcklbw       m14, m1, m0             ; m14 = {luma_mult, uv_mult} pairs for pmaddubsw
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
%else
    vpbroadcastd    m14, [pw_1024]
%if %2
    vpbroadcastd    m15, [pb_23_22]
%else
    vpbroadcastd   xm15, [pb_27_17_17_27]
%endif
%endif

    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    movifnidn      sbyd, sbym
    test           sbyd, sbyd
    setnz           r7b
    test            r7b, overlapb           ; overlap only applies below the first sb row
    jnz %%vertical_overlap

    ; per-superblock-row seed derivation (no vertical overlap case)
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, unused5, lstride

    ; save end-of-row pointers in stack arg slots; wq counts up from -w to 0
    mov           lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]  ; luma advances 2x horizontally when ss_hor
    mov           r11mp, r12
    mov           r12mp, r13
    mov        lstrideq, r10mp
    neg              wq

%%loop_x:
    ; LFSR-style seed update; parity of the two tested bytes selects the variant
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y:
    ; src
%if %2
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7                 ; sum adjacent luma byte pairs -> words
    pmaddubsw        m6, m7
    pavgw            m4, m2                 ; (sum+1)>>1: rounded horizontal downsample
    pavgw            m6, m2
%else
    pxor             m2, m2
    mova             m4, [lumaq]
    mova             m0, [srcq]
%endif

%if %1
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14                ; luma*luma_mult + chroma*uv_mult
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src]
    ; vpgatherdd consumes (zeroes) its mask register, hence the pcmpeqw reloads
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10                ; keep only the low byte of each gathered dword
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu            xm3, [grain_lutq+offxyq+ 0]
    vinserti128      m3, [grain_lutq+offxyq+82], 1
%else
    movu             m3, [grain_lutq+offxyq]
%endif
    pcmpgtb          m7, m2, m3             ; sign-extend the int8 grain to words
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    ; subsampled: two chroma rows were processed per iteration
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
    sub              hb, 1+%2
    jg %%loop_y

    add              wq, 32>>%2
    jge %%end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq
    test       overlapd, overlapd
    jz %%loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    lea     left_offxyd, [offyd+(32>>%2)]  ; previous column's offy*stride+offx
    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    ; src
%if %2
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
    pxor             m2, m2
%endif

%if %1
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src]
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
%if %2
%if %1
    vpbroadcastd     m6, [pb_23_22]         ; FIXME
%endif
    movu            xm3, [grain_lutq+offxyq+ 0]
    movd            xm4, [grain_lutq+left_offxyq+ 0]
    vinserti128      m3, [grain_lutq+offxyq+82], 1
    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
    punpcklbw        m4, m3
%if %1
    pmaddubsw        m4, m6, m4
    pmulhrsw         m4, [pw_1024]
%else
    pmaddubsw        m4, m15, m4
    pmulhrsw         m4, m14
%endif
    packsswb         m4, m4
    pcmpeqw          m6, m6                 ; FIXME
    psrldq           m6, 15                 ; FIXME
    vpblendvb        m3, m3, m4, m6         ; blend filtered pixel into column 0 only
%else
%if %1
    vpbroadcastd    xm6, [pb_27_17_17_27]
%endif
    movu             m3, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm3
%if %1
    pmaddubsw       xm4, xm6, xm4
    pmulhrsw        xm4, [pw_1024]
%else
    pmaddubsw       xm4, xm15, xm4
    pmulhrsw        xm4, xm14
%endif
    packsswb        xm4, xm4
    pcmpeqw         xm6, xm6
    psrldq          xm6, 14                 ; mask covering the two overlapped columns
    vpblendvb        m3, m3, m4, m6
%endif
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*(1+%2)
    sub              hb, 1+%2
    jg %%loop_y_h_overlap

    add              wq, 32>>%2
    jge %%end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%end:
    RET

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, overlap, unused1, unused2, lstride

    ; derive a combined (cur_seed << 16) | top_seed for the v-overlap rows
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused1, unused2, see, overlap, unused3, unused4, lstride

    mov           lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*(1+%2)]
    mov           r11mp, r12
    mov           r12mp, r13
    mov        lstrideq, r10mp
    neg              wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    ; both halves of the packed seed are processed at once (0xf000f masks)
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%if %2 == 0
    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27] ; first-row vertical weights
%endif
%%loop_y_v_overlap:
    ; src
%if %2
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
    pxor             m2, m2
%endif

%if %1
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src]
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

%if %2
    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word
%endif

    ; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
    ; no vertical subsampling: blend both grain rows against the top block
%if %2
    mova             m6, [pb_8x_27_17_8x_17_27]
    movu            xm3, [grain_lutq+offxyq]
    movu            xm4, [grain_lutq+top_offxyq]
    vinserti128      m3, [grain_lutq+offxyq+82], 1
    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
%else
    movu             m3, [grain_lutq+offxyq]
    movu             m4, [grain_lutq+top_offxyq]
%endif
    punpckhbw        m9, m4, m3
    punpcklbw        m4, m3
%if %2
    pmaddubsw        m9, m6, m9
    pmaddubsw        m4, m6, m4
%else
    pmaddubsw        m9, m1, m9
    pmaddubsw        m4, m1, m4
%endif
%if %1
    pmulhrsw         m9, [pw_1024]
    pmulhrsw         m4, [pw_1024]
%else
    pmulhrsw         m9, m14
    pmulhrsw         m4, m14
%endif
    packsswb         m3, m4, m9
%else
%if %1
    vpbroadcastd     m6, [pb_23_22]
%endif
    movq            xm3, [grain_lutq+offxyq]
    movq            xm4, [grain_lutq+top_offxyq]
    vinserti128      m3, [grain_lutq+offxyq+8], 1
    vinserti128      m4, [grain_lutq+top_offxyq+8], 1
    punpcklbw        m4, m3
%if %1
    pmaddubsw        m4, m6, m4
    pmulhrsw         m4, [pw_1024]
%else
    pmaddubsw        m4, m15, m4
    pmulhrsw         m4, m14
%endif
    packsswb         m4, m4
    vpermq           m4, m4, q3120
    ; only interpolate first line, insert second line unmodified
    vinserti128      m3, m4, [grain_lutq+offxyq+82], 1
%endif
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
%if %2
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    ; source was not unpacked earlier in this path; do it now
    pxor             m6, m6
    punpckhbw        m9, m0, m6
    punpcklbw        m0, m6                 ; m0-1: src as word

    paddw            m0, m2
    paddw            m9, m3
    pmaxsw           m0, m13
    pmaxsw           m9, m13
    pminsw           m0, m12
    pminsw           m9, m12
    packuswb         m0, m9
    mova         [dstq], m0
%endif

    sub              hb, 1+%2
    jl %%end_y_v_overlap
%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
%if %2 == 0
    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16] ; second-row weights (17,27)
    btc              hd, 16
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y

%%end_y_v_overlap:
    add              wq, 32>>%2
    jge %%end_hv
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
    lea     left_offxyq, [offyq+(32>>%2)]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%if %2 == 0
    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
%endif
%%loop_y_hv_overlap:
    ; src
%if %2
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
    pxor             m2, m2
%endif

%if %1
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: src as dword

    ; scaling[src]
    pcmpeqw          m9, m9
    pcmpeqw          m3, m3
    vpgatherdd       m8, [scalingq+m4], m9
    vpgatherdd       m4, [scalingq+m5], m3
    pcmpeqw          m9, m9
    pcmpeqw          m3, m3
    vpgatherdd       m5, [scalingq+m6], m9
    vpgatherdd       m6, [scalingq+m7], m3
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

%if %2
    ; unpack chroma source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word
%endif

    ; grain = grain_lut[offy+y][offx+x]
%if %1
%if %2
    vpbroadcastd     m9, [pb_23_22]
%else
    vpbroadcastd    xm9, [pb_27_17_17_27]
%endif
%endif

%if %2
    movu            xm3, [grain_lutq+offxyq]
%if %3
    movq            xm6, [grain_lutq+top_offxyq]
%else
    movu            xm6, [grain_lutq+top_offxyq]
%endif
    vinserti128      m3, [grain_lutq+offxyq+82], 1
%if %3
    vinserti128      m6, [grain_lutq+top_offxyq+8], 1
%else
    vinserti128      m6, [grain_lutq+top_offxyq+82], 1
%endif
%else
    movu             m3, [grain_lutq+offxyq]
    movu             m6, [grain_lutq+top_offxyq]
%endif
    movd            xm4, [grain_lutq+left_offxyq]
    movd            xm7, [grain_lutq+topleft_offxyq]
%if %2
    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
%if %3 == 0
    vinserti128      m7, [grain_lutq+topleft_offxyq+82], 1
%endif
%endif

    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
%if %2
    punpcklbw        m4, m3
%if %3
    punpcklbw       xm7, xm6
%else
    punpcklbw        m7, m6
%endif
    punpcklwd        m4, m7
%if %1
    pmaddubsw        m4, m9, m4
    pmulhrsw         m4, [pw_1024]
%else
    pmaddubsw        m4, m15, m4
    pmulhrsw         m4, m14
%endif
    packsswb         m4, m4
    pcmpeqw          m9, m9                 ; this is kind of ugly
    psrldq           m9, 15
    vpblendvb        m3, m3, m4, m9
    psrldq           m4, 1
%if %3
    shufpd           m9, m9, m9, 1110b      ; clear upper lane
%endif
    vpblendvb        m6, m6, m4, m9
%else
    punpcklbw       xm4, xm3
    punpcklbw       xm7, xm6
    punpckldq       xm4, xm7
%if %1
    pmaddubsw       xm4, xm9, xm4
    pmulhrsw        xm4, [pw_1024]
%else
    pmaddubsw       xm4, xm15, xm4
    pmulhrsw        xm4, xm14
%endif
    packsswb        xm4, xm4
    pcmpeqw         xm9, xm9                ; this is kind of ugly
    psrldq          xm9, 14
    vpblendvb        m3, m3, m4, m9
    psrldq          xm4, 2
    vpblendvb        m6, m6, m4, m9
%endif

    ; followed by v interpolation (top | cur -> cur)
%if %3
    vpermq           m9, m3, q3120
    punpcklbw        m6, m9
%if %1
    vpbroadcastd     m9, [pb_23_22]
    pmaddubsw        m6, m9, m6
    pmulhrsw         m6, [pw_1024]
%else
    pmaddubsw        m6, m15, m6
    pmulhrsw         m6, m14
%endif
    packsswb         m6, m6
    vpermq           m6, m6, q3120
    vpblendd         m3, m3, m6, 00001111b
%else
    punpckhbw        m9, m6, m3
    punpcklbw        m6, m3
%if %2
    mova             m3, [pb_8x_27_17_8x_17_27]
    pmaddubsw        m9, m3, m9
    pmaddubsw        m6, m3, m6
%else
    pmaddubsw        m9, m1, m9
    pmaddubsw        m6, m1, m6
%endif
%if %1
    pmulhrsw         m9, [pw_1024]
    pmulhrsw         m6, [pw_1024]
%else
    pmulhrsw         m9, m14
    pmulhrsw         m6, m14
%endif
    packsswb         m3, m6, m9
%endif
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
%if %2
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    pxor             m6, m6
    punpckhbw        m9, m0, m6
    punpcklbw        m0, m6                 ; m0-1: src as word
    paddw            m0, m2
    paddw            m9, m3
    pmaxsw           m0, m13
    pmaxsw           m9, m13
    pminsw           m0, m12
    pminsw           m9, m12
    packuswb         m0, m9
    mova         [dstq], m0
%endif

%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
    sub              hb, 1+%2
%if %2
    jg %%loop_y_h_overlap
%else
    je %%end_y_hv_overlap
    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
    btc              hd, 16
    jnc %%loop_y_hv_overlap
    jmp %%loop_y_h_overlap
%endif

%%end_y_hv_overlap:
    add              wq, 32>>%2
    jge %%end_hv
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq
    jmp %%loop_x_hv_overlap

%%end_hv:
    RET
%endmacro

    ; instantiate the loop twice: non-csfl body, then the csfl body
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro

FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64