; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "ext/x86/x86inc.asm" 27 28%if ARCH_X86_64 29 30SECTION_RODATA 32 31pw_1024: times 16 dw 1024 32pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 33rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 34byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 35pw_seed_xor: times 2 dw 0xb524 36 times 2 dw 0x49d8 37pd_m65536: dd ~0xffff 38pb_23_22: times 2 db 23, 22 39pb_1: times 4 db 1 40hmul_bits: dw 32768, 16384, 8192, 4096 41round: dw 2048, 1024, 512 42mul_bits: dw 256, 128, 64, 32, 16 43round_vals: dw 32, 64, 128, 256, 512 44max: dw 255, 240, 235 45min: dw 0, 16 46pb_27_17_17_27: db 27, 17, 17, 27 47pw_1: dw 1 48 49%macro JMP_TABLE 1-* 50 %xdefine %1_table %%table 51 %xdefine %%base %1_table 52 %xdefine %%prefix mangle(private_prefix %+ _%1) 53 %%table: 54 %rep %0 - 1 55 dd %%prefix %+ .ar%2 - %%base 56 %rotate 1 57 %endrep 58%endmacro 59 60ALIGN 4 61JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 62JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 63 64struc FGData 65 .seed: resd 1 66 .num_y_points: resd 1 67 .y_points: resb 14 * 2 68 .chroma_scaling_from_luma: resd 1 69 .num_uv_points: resd 2 70 .uv_points: resb 2 * 10 * 2 71 .scaling_shift: resd 1 72 .ar_coeff_lag: resd 1 73 .ar_coeffs_y: resb 24 74 .ar_coeffs_uv: resb 2 * 28 ; includes padding 75 .ar_coeff_shift: resq 1 76 .grain_scale_shift: resd 1 77 .uv_mult: resd 2 78 .uv_luma_mult: resd 2 79 .uv_offset: resd 2 80 .overlap_flag: resd 1 81 .clip_to_restricted_range: resd 1 82endstruc 83 84cextern gaussian_sequence 85 86SECTION .text 87 88INIT_XMM avx2 89cglobal generate_grain_y, 2, 9, 16, buf, fg_data 90 lea r4, [pb_mask] 91%define base r4-pb_mask 92 movq xm1, [base+rnd_next_upperbit_mask] 93 movq xm4, [base+mul_bits] 94 movq xm7, [base+hmul_bits] 95 mov r2d, [fg_dataq+FGData.grain_scale_shift] 96 vpbroadcastw xm8, [base+round+r2*2] 97 mova xm5, [base+pb_mask] 98 vpbroadcastw xm0, [fg_dataq+FGData.seed] 99 vpbroadcastd xm9, [base+pd_m65536] 100 mov r2, -73*82 101 sub bufq, 
r2 102 lea r3, [gaussian_sequence] 103.loop: 104 pand xm2, xm0, xm1 105 psrlw xm3, xm2, 10 106 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 107 pmullw xm2, xm4 ; bits 0x0f00 are set 108 pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds 109 psllq xm6, xm2, 30 110 por xm2, xm6 111 psllq xm6, xm2, 15 112 por xm2, xm6 ; aggregate each bit into next seed's high bit 113 pmulhuw xm3, xm0, xm7 114 por xm2, xm3 ; 4 next output seeds 115 pshuflw xm0, xm2, q3333 116 psrlw xm2, 5 117 pmovzxwd xm3, xm2 118 mova xm6, xm9 119 vpgatherdd xm2, [r3+xm3*2], xm6 120 pandn xm2, xm9, xm2 121 packusdw xm2, xm2 122 pmulhrsw xm2, xm8 123 packsswb xm2, xm2 124 movd [bufq+r2], xm2 125 add r2, 4 126 jl .loop 127 128 ; auto-regression code 129 movsxd r2, [fg_dataq+FGData.ar_coeff_lag] 130 movsxd r2, [base+generate_grain_y_avx2_table+r2*4] 131 lea r2, [r2+base+generate_grain_y_avx2_table] 132 jmp r2 133 134.ar1: 135 DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 136 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 137 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 138 movd xm4, [fg_dataq+FGData.ar_coeffs_y] 139 DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 140 pinsrb xm4, [pb_1], 3 141 pmovsxbw xm4, xm4 142 pshufd xm5, xm4, q1111 143 pshufd xm4, xm4, q0000 144 vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd 145 sub bufq, 82*73-(82*3+79) 146 mov hd, 70 147 mov mind, -128 148 mov maxd, 127 149.y_loop_ar1: 150 mov xq, -76 151 movsx val3d, byte [bufq+xq-1] 152.x_loop_ar1: 153 pmovsxbw xm0, [bufq+xq-82-1] ; top/left 154 pmovsxbw xm2, [bufq+xq-82+0] ; top 155 pmovsxbw xm1, [bufq+xq-82+1] ; top/right 156 punpcklwd xm0, xm2 157 punpcklwd xm1, xm3 158 pmaddwd xm0, xm4 159 pmaddwd xm1, xm5 160 paddd xm0, xm1 161.x_loop_ar1_inner: 162 movd val0d, xm0 163 psrldq xm0, 4 164 imul val3d, cf3d 165 add val3d, val0d 166%if WIN64 167 sarx val3d, val3d, shiftd 168%else 169 sar val3d, shiftb 170%endif 171 movsx val0d, byte [bufq+xq] 172 add val3d, val0d 173 cmp val3d, maxd 
174 cmovns val3d, maxd 175 cmp val3d, mind 176 cmovs val3d, mind 177 mov byte [bufq+xq], val3b 178 ; keep val3d in-place as left for next x iteration 179 inc xq 180 jz .x_loop_ar1_end 181 test xq, 3 182 jnz .x_loop_ar1_inner 183 jmp .x_loop_ar1 184 185.x_loop_ar1_end: 186 add bufq, 82 187 dec hd 188 jg .y_loop_ar1 189.ar0: 190 RET 191 192.ar2: 193 DEFINE_ARGS buf, fg_data, shift 194 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 195 vpbroadcastw xm14, [base+round_vals-12+shiftq*2] 196 movq xm15, [base+byte_blend+1] 197 pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 198 movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 199 pmovsxbw xm9, xm9 200 DEFINE_ARGS buf, fg_data, h, x 201 pshufd xm12, xm9, q0000 202 pshufd xm13, xm9, q1111 203 pshufd xm11, xm8, q3333 204 pshufd xm10, xm8, q2222 205 pshufd xm9, xm8, q1111 206 pshufd xm8, xm8, q0000 207 pmovzxwd xm14, xm14 208 sub bufq, 82*73-(82*3+79) 209 mov hd, 70 210.y_loop_ar2: 211 mov xq, -76 212 213.x_loop_ar2: 214 pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 215 pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 216 psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] 217 psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] 218 psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] 219 punpcklwd xm2, xm0, xm2 220 punpcklwd xm3, xm4 221 pmaddwd xm2, xm8 222 pmaddwd xm3, xm11 223 paddd xm2, xm3 224 225 psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] 226 psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] 227 psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] 228 punpcklwd xm4, xm5 229 punpcklwd xm6, xm1 230 psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] 231 psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] 232 punpcklwd xm7, xm1 233 pmaddwd xm4, xm9 234 pmaddwd xm6, xm10 235 pmaddwd xm7, xm12 236 paddd xm4, xm6 237 paddd xm2, xm7 238 paddd xm2, xm4 239 paddd xm2, xm14 240 241 movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] 242.x_loop_ar2_inner: 243 pmovsxbw xm1, xm0 244 pmaddwd xm3, xm1, xm13 245 paddd xm3, xm2 246 psrldq xm1, 4 ; y=0,x=0 247 psrldq xm2, 4 ; shift top to next pixel 248 psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 249 ; don't 
packssdw since we only care about one value 250 paddw xm3, xm1 251 packsswb xm3, xm3 252 pextrb [bufq+xq], xm3, 0 253 pslldq xm3, 2 254 pand xm3, xm15 255 pandn xm0, xm15, xm0 256 por xm0, xm3 257 psrldq xm0, 1 258 inc xq 259 jz .x_loop_ar2_end 260 test xq, 3 261 jnz .x_loop_ar2_inner 262 jmp .x_loop_ar2 263 264.x_loop_ar2_end: 265 add bufq, 82 266 dec hd 267 jg .y_loop_ar2 268 RET 269 270.ar3: 271 DEFINE_ARGS buf, fg_data, shift 272%if WIN64 273 SUB rsp, 16*12 274%assign stack_size_padded (stack_size_padded+16*12) 275%assign stack_size (stack_size+16*12) 276%else 277 ALLOC_STACK 16*12 278%endif 279 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 280 vpbroadcastw xm14, [base+round_vals-12+shiftq*2] 281 movq xm15, [base+byte_blend] 282 pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7 283 pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15 284 pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 285 pshufd xm9, xm0, q1111 286 pshufd xm10, xm0, q2222 287 pshufd xm11, xm0, q3333 288 pshufd xm0, xm0, q0000 289 pshufd xm6, xm1, q1111 290 pshufd xm7, xm1, q2222 291 pshufd xm8, xm1, q3333 292 pshufd xm1, xm1, q0000 293 pshufd xm3, xm2, q1111 294 psrldq xm13, xm2, 10 295 pinsrw xm2, [pw_1], 5 296 pshufd xm4, xm2, q2222 297 pshufd xm2, xm2, q0000 298 pinsrw xm13, [base+round_vals+shiftq*2-10], 3 299 mova [rsp+ 0*16], xm0 300 mova [rsp+ 1*16], xm9 301 mova [rsp+ 2*16], xm10 302 mova [rsp+ 3*16], xm11 303 mova [rsp+ 4*16], xm1 304 mova [rsp+ 5*16], xm6 305 mova [rsp+ 6*16], xm7 306 mova [rsp+ 7*16], xm8 307 mova [rsp+ 8*16], xm2 308 mova [rsp+ 9*16], xm3 309 mova [rsp+10*16], xm4 310 DEFINE_ARGS buf, fg_data, h, x 311 sub bufq, 82*73-(82*3+79) 312 mov hd, 70 313.y_loop_ar3: 314 mov xq, -76 315 316.x_loop_ar3: 317 movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 318 movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 319 movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 320 pxor xm3, xm3 321 pcmpgtb xm6, xm3, xm2 322 pcmpgtb xm5, xm3, xm1 323 pcmpgtb xm4, xm3, xm0 324 punpckhbw xm3, 
xm0, xm4 325 punpcklbw xm0, xm4 326 punpckhbw xm4, xm1, xm5 327 punpcklbw xm1, xm5 328 punpckhbw xm5, xm2, xm6 329 punpcklbw xm2, xm6 330 331 psrldq xm6, xm0, 2 332 psrldq xm7, xm0, 4 333 psrldq xm8, xm0, 6 334 psrldq xm9, xm0, 8 335 palignr xm10, xm3, xm0, 10 336 palignr xm11, xm3, xm0, 12 337 338 punpcklwd xm0, xm6 339 punpcklwd xm7, xm8 340 punpcklwd xm9, xm10 341 punpcklwd xm11, xm1 342 pmaddwd xm0, [rsp+ 0*16] 343 pmaddwd xm7, [rsp+ 1*16] 344 pmaddwd xm9, [rsp+ 2*16] 345 pmaddwd xm11, [rsp+ 3*16] 346 paddd xm0, xm7 347 paddd xm9, xm11 348 paddd xm0, xm9 349 350 psrldq xm6, xm1, 2 351 psrldq xm7, xm1, 4 352 psrldq xm8, xm1, 6 353 psrldq xm9, xm1, 8 354 palignr xm10, xm4, xm1, 10 355 palignr xm11, xm4, xm1, 12 356 psrldq xm12, xm2, 2 357 358 punpcklwd xm6, xm7 359 punpcklwd xm8, xm9 360 punpcklwd xm10, xm11 361 punpcklwd xm12, xm2, xm12 362 pmaddwd xm6, [rsp+ 4*16] 363 pmaddwd xm8, [rsp+ 5*16] 364 pmaddwd xm10, [rsp+ 6*16] 365 pmaddwd xm12, [rsp+ 7*16] 366 paddd xm6, xm8 367 paddd xm10, xm12 368 paddd xm6, xm10 369 paddd xm0, xm6 370 371 psrldq xm6, xm2, 4 372 psrldq xm7, xm2, 6 373 psrldq xm8, xm2, 8 374 palignr xm9, xm5, xm2, 10 375 palignr xm5, xm5, xm2, 12 376 377 punpcklwd xm6, xm7 378 punpcklwd xm8, xm9 379 punpcklwd xm5, xm14 380 pmaddwd xm6, [rsp+ 8*16] 381 pmaddwd xm8, [rsp+ 9*16] 382 pmaddwd xm5, [rsp+10*16] 383 paddd xm0, xm6 384 paddd xm8, xm5 385 paddd xm0, xm8 386 387 movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] 388.x_loop_ar3_inner: 389 pmovsxbw xm2, xm1 390 pmaddwd xm2, xm13 391 pshufd xm3, xm2, q1111 392 paddd xm2, xm3 ; left+cur 393 paddd xm2, xm0 ; add top 394 psrldq xm0, 4 395 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 396 ; don't packssdw since we only care about one value 397 packsswb xm2, xm2 398 pextrb [bufq+xq], xm2, 0 399 pslldq xm2, 3 400 pand xm2, xm15 401 pandn xm1, xm15, xm1 402 por xm1, xm2 403 psrldq xm1, 1 404 inc xq 405 jz .x_loop_ar3_end 406 test xq, 3 407 jnz .x_loop_ar3_inner 408 jmp .x_loop_ar3 409 410.x_loop_ar3_end: 411 add 
bufq, 82 412 dec hd 413 jg .y_loop_ar3 414 RET 415 416INIT_XMM avx2 417cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv 418 lea r4, [pb_mask] 419%define base r4-pb_mask 420 movq xm1, [base+rnd_next_upperbit_mask] 421 movq xm4, [base+mul_bits] 422 movq xm7, [base+hmul_bits] 423 mov r5d, [fg_dataq+FGData.grain_scale_shift] 424 vpbroadcastw xm8, [base+round+r5*2] 425 mova xm5, [base+pb_mask] 426 vpbroadcastw xm0, [fg_dataq+FGData.seed] 427 vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] 428 pxor xm0, xm9 429 vpbroadcastd xm9, [base+pd_m65536] 430 lea r6, [gaussian_sequence] 431 mov r7d, 38 432 add bufq, 44 433.loop_y: 434 mov r5, -44 435.loop_x: 436 pand xm2, xm0, xm1 437 psrlw xm3, xm2, 10 438 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 439 pmullw xm2, xm4 ; bits 0x0f00 are set 440 pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds 441 psllq xm6, xm2, 30 442 por xm2, xm6 443 psllq xm6, xm2, 15 444 por xm2, xm6 ; aggregate each bit into next seed's high bit 445 pmulhuw xm3, xm0, xm7 446 por xm2, xm3 ; 4 next output seeds 447 pshuflw xm0, xm2, q3333 448 psrlw xm2, 5 449 pmovzxwd xm3, xm2 450 mova xm6, xm9 451 vpgatherdd xm2, [r6+xm3*2], xm6 452 pandn xm2, xm9, xm2 453 packusdw xm2, xm2 454 pmulhrsw xm2, xm8 455 packsswb xm2, xm2 456 movd [bufq+r5], xm2 457 add r5, 4 458 jl .loop_x 459 add bufq, 82 460 dec r7d 461 jg .loop_y 462 463 ; auto-regression code 464 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 465 movsxd r5, [base+generate_grain_uv_420_avx2_table+r5*4] 466 lea r5, [r5+base+generate_grain_uv_420_avx2_table] 467 jmp r5 468 469.ar0: 470 INIT_YMM avx2 471 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 472 imul uvd, 28 473 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 474 movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 475 movd xm3, [base+hmul_bits+shiftq*2] 476 DEFINE_ARGS buf, bufy, h 477 pmovsxbw xm4, xm4 478 vpbroadcastd m7, [pb_1] 479 vpbroadcastw m6, [hmul_bits+4] 480 vpbroadcastw m4, xm4 481 vpbroadcastw m3, xm3 482 sub bufq, 
82*38+82-(82*3+41) 483 add bufyq, 3+82*3 484 mov hd, 35 485.y_loop_ar0: 486 ; first 32 pixels 487 movu xm8, [bufyq] 488 movu xm9, [bufyq+82] 489 movu xm10, [bufyq+16] 490 movu xm11, [bufyq+82+16] 491 vinserti128 m8, [bufyq+32], 1 492 vinserti128 m9, [bufyq+82+32], 1 493 vinserti128 m10, [bufyq+48], 1 494 vinserti128 m11, [bufyq+82+48], 1 495 pmaddubsw m8, m7, m8 496 pmaddubsw m9, m7, m9 497 pmaddubsw m10, m7, m10 498 pmaddubsw m11, m7, m11 499 paddw m8, m9 500 paddw m10, m11 501 pmulhrsw m8, m6 502 pmulhrsw m10, m6 503 pmullw m8, m4 504 pmullw m10, m4 505 pmulhrsw m8, m3 506 pmulhrsw m10, m3 507 packsswb m8, m10 508 movu m0, [bufq] 509 punpckhbw m1, m0, m8 510 punpcklbw m0, m8 511 pmaddubsw m1, m7, m1 512 pmaddubsw m0, m7, m0 513 packsswb m0, m1 514 movu [bufq], m0 515 516 ; last 6 pixels 517 movu xm8, [bufyq+32*2] 518 movu xm9, [bufyq+32*2+82] 519 pmaddubsw xm8, xm7, xm8 520 pmaddubsw xm9, xm7, xm9 521 paddw xm8, xm9 522 pmulhrsw xm8, xm6 523 pmullw xm8, xm4 524 pmulhrsw xm8, xm3 525 packsswb xm8, xm8 526 movq xm0, [bufq+32] 527 punpcklbw xm8, xm0 528 pmaddubsw xm8, xm7, xm8 529 packsswb xm8, xm8 530 vpblendw xm0, xm8, xm0, 1000b 531 movq [bufq+32], xm0 532 533 add bufq, 82 534 add bufyq, 82*2 535 dec hd 536 jg .y_loop_ar0 537 RET 538 539.ar1: 540 INIT_XMM avx2 541 DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift 542 imul uvd, 28 543 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 544 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 545 movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] 546 pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 547 DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift 548 pmovsxbw xm4, xm4 549 pshufd xm5, xm4, q1111 550 pshufd xm4, xm4, q0000 551 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd 552 vpbroadcastd xm7, [pb_1] 553 vpbroadcastw xm6, [hmul_bits+4] 554 vpbroadcastd xm3, xm3 555 sub bufq, 82*38+44-(82*3+41) 556 add bufyq, 79+82*3 557 mov hd, 35 558 mov mind, -128 559 mov maxd, 127 560.y_loop_ar1: 561 
mov xq, -38 562 movsx val3d, byte [bufq+xq-1] 563.x_loop_ar1: 564 pmovsxbw xm0, [bufq+xq-82-1] ; top/left 565 movq xm8, [bufyq+xq*2] 566 movq xm9, [bufyq+xq*2+82] 567 psrldq xm2, xm0, 2 ; top 568 psrldq xm1, xm0, 4 ; top/right 569 pmaddubsw xm8, xm7, xm8 570 pmaddubsw xm9, xm7, xm9 571 paddw xm8, xm9 572 pmulhrsw xm8, xm6 573 punpcklwd xm0, xm2 574 punpcklwd xm1, xm8 575 pmaddwd xm0, xm4 576 pmaddwd xm1, xm5 577 paddd xm0, xm1 578 paddd xm0, xm3 579.x_loop_ar1_inner: 580 movd val0d, xm0 581 psrldq xm0, 4 582 imul val3d, cf3d 583 add val3d, val0d 584 sarx val3d, val3d, shiftd 585 movsx val0d, byte [bufq+xq] 586 add val3d, val0d 587 cmp val3d, maxd 588 cmovns val3d, maxd 589 cmp val3d, mind 590 cmovs val3d, mind 591 mov byte [bufq+xq], val3b 592 ; keep val3d in-place as left for next x iteration 593 inc xq 594 jz .x_loop_ar1_end 595 test xq, 3 596 jnz .x_loop_ar1_inner 597 jmp .x_loop_ar1 598 599.x_loop_ar1_end: 600 add bufq, 82 601 add bufyq, 82*2 602 dec hd 603 jg .y_loop_ar1 604 RET 605 606.ar2: 607 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 608 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 609 imul uvd, 28 610 vpbroadcastw xm15, [base+round_vals-12+shiftq*2] 611 pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 612 pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 613 pinsrw xm9, [base+pw_1], 5 614 vpbroadcastw xm7, [base+hmul_bits+4] 615 vpbroadcastd xm6, [base+pb_1] 616 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 617 pshufd xm12, xm9, q0000 618 pshufd xm13, xm9, q1111 619 pshufd xm14, xm9, q2222 620 pshufd xm11, xm8, q3333 621 pshufd xm10, xm8, q2222 622 pshufd xm9, xm8, q1111 623 pshufd xm8, xm8, q0000 624 sub bufq, 82*38+44-(82*3+41) 625 add bufyq, 79+82*3 626 mov hd, 35 627.y_loop_ar2: 628 mov xq, -38 629 630.x_loop_ar2: 631 pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 632 pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 633 psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] 634 psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] 635 psrldq xm4, xm1, 4 ; 
y=-1,x=[+0,+5] 636 punpcklwd xm2, xm0, xm2 637 punpcklwd xm3, xm4 638 pmaddwd xm2, xm8 639 pmaddwd xm3, xm11 640 paddd xm2, xm3 641 642 psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] 643 psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] 644 psrldq xm0, 8 ; y=-2,x=[+2,+5] 645 punpcklwd xm4, xm5 646 punpcklwd xm0, xm1 647 psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5] 648 psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] 649 punpcklwd xm3, xm1 650 pmaddwd xm4, xm9 651 pmaddwd xm0, xm10 652 pmaddwd xm3, xm12 653 paddd xm4, xm0 654 paddd xm2, xm3 655 paddd xm2, xm4 656 657 movq xm0, [bufyq+xq*2] 658 movq xm3, [bufyq+xq*2+82] 659 pmaddubsw xm0, xm6, xm0 660 pmaddubsw xm3, xm6, xm3 661 paddw xm0, xm3 662 pmulhrsw xm0, xm7 663 punpcklwd xm0, xm15 664 pmaddwd xm0, xm14 665 paddd xm2, xm0 666 667 movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] 668.x_loop_ar2_inner: 669 pmovsxbw xm0, xm0 670 pmaddwd xm3, xm0, xm13 671 paddd xm3, xm2 672 psrldq xm2, 4 ; shift top to next pixel 673 psrad xm3, [fg_dataq+FGData.ar_coeff_shift] 674 pslldq xm3, 2 675 psrldq xm0, 2 676 paddw xm3, xm0 677 vpblendw xm0, xm3, 00000010b 678 packsswb xm0, xm0 679 pextrb [bufq+xq], xm0, 1 680 inc xq 681 jz .x_loop_ar2_end 682 test xq, 3 683 jnz .x_loop_ar2_inner 684 jmp .x_loop_ar2 685 686.x_loop_ar2_end: 687 add bufq, 82 688 add bufyq, 82*2 689 dec hd 690 jg .y_loop_ar2 691 RET 692 693.ar3: 694 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 695 SUB rsp, 16*12 696%assign stack_size_padded (stack_size_padded+16*12) 697%assign stack_size (stack_size+16*12) 698 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 699 imul uvd, 28 700 vpbroadcastw xm14, [base+round_vals-12+shiftq*2] 701 pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7 702 pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15 703 pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 704 pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] 705 pshufd xm9, xm0, q1111 706 pshufd xm10, xm0, q2222 707 pshufd xm11, xm0, q3333 708 pshufd xm0, xm0, q0000 709 pshufd xm6, xm1, 
q1111 710 pshufd xm7, xm1, q2222 711 pshufd xm8, xm1, q3333 712 pshufd xm1, xm1, q0000 713 pshufd xm3, xm2, q1111 714 pshufd xm4, xm2, q2222 715 vpbroadcastw xm5, xm5 716 vpblendw xm4, xm5, 10101010b ; interleave luma cf 717 psrldq xm5, xm2, 10 718 pshufd xm2, xm2, q0000 719 pinsrw xm5, [base+round_vals+shiftq*2-10], 3 720 pmovzxwd xm14, xm14 721 mova [rsp+ 0*16], xm0 722 mova [rsp+ 1*16], xm9 723 mova [rsp+ 2*16], xm10 724 mova [rsp+ 3*16], xm11 725 mova [rsp+ 4*16], xm1 726 mova [rsp+ 5*16], xm6 727 mova [rsp+ 6*16], xm7 728 mova [rsp+ 7*16], xm8 729 mova [rsp+ 8*16], xm2 730 mova [rsp+ 9*16], xm3 731 mova [rsp+10*16], xm4 732 mova [rsp+11*16], xm5 733 vpbroadcastd xm13, [base+pb_1] 734 vpbroadcastw xm15, [base+hmul_bits+4] 735 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 736 sub bufq, 82*38+44-(82*3+41) 737 add bufyq, 79+82*3 738 mov hd, 35 739.y_loop_ar3: 740 mov xq, -38 741 742.x_loop_ar3: 743 movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 744 movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 745 movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 746 pxor xm3, xm3 747 pcmpgtb xm6, xm3, xm2 748 pcmpgtb xm5, xm3, xm1 749 pcmpgtb xm4, xm3, xm0 750 punpckhbw xm3, xm0, xm4 751 punpcklbw xm0, xm4 752 punpckhbw xm4, xm1, xm5 753 punpcklbw xm1, xm5 754 punpckhbw xm5, xm2, xm6 755 punpcklbw xm2, xm6 756 757 psrldq xm6, xm0, 2 758 psrldq xm7, xm0, 4 759 psrldq xm8, xm0, 6 760 psrldq xm9, xm0, 8 761 palignr xm10, xm3, xm0, 10 762 palignr xm11, xm3, xm0, 12 763 764 punpcklwd xm0, xm6 765 punpcklwd xm7, xm8 766 punpcklwd xm9, xm10 767 punpcklwd xm11, xm1 768 pmaddwd xm0, [rsp+ 0*16] 769 pmaddwd xm7, [rsp+ 1*16] 770 pmaddwd xm9, [rsp+ 2*16] 771 pmaddwd xm11, [rsp+ 3*16] 772 paddd xm0, xm7 773 paddd xm9, xm11 774 paddd xm0, xm9 775 776 psrldq xm6, xm1, 2 777 psrldq xm7, xm1, 4 778 psrldq xm8, xm1, 6 779 psrldq xm9, xm1, 8 780 palignr xm10, xm4, xm1, 10 781 palignr xm11, xm4, xm1, 12 782 psrldq xm12, xm2, 2 783 784 punpcklwd xm6, xm7 785 punpcklwd xm8, xm9 786 punpcklwd xm10, xm11 787 
punpcklwd xm12, xm2, xm12 788 pmaddwd xm6, [rsp+ 4*16] 789 pmaddwd xm8, [rsp+ 5*16] 790 pmaddwd xm10, [rsp+ 6*16] 791 pmaddwd xm12, [rsp+ 7*16] 792 paddd xm6, xm8 793 paddd xm10, xm12 794 paddd xm6, xm10 795 paddd xm0, xm6 796 797 psrldq xm6, xm2, 4 798 psrldq xm7, xm2, 6 799 psrldq xm8, xm2, 8 800 palignr xm9, xm5, xm2, 10 801 palignr xm5, xm5, xm2, 12 802 803 movq xm1, [bufyq+xq*2] 804 movq xm2, [bufyq+xq*2+82] 805 pmaddubsw xm1, xm13, xm1 806 pmaddubsw xm2, xm13, xm2 807 paddw xm1, xm2 808 pmulhrsw xm1, xm15 809 810 punpcklwd xm6, xm7 811 punpcklwd xm8, xm9 812 punpcklwd xm5, xm1 813 pmaddwd xm6, [rsp+ 8*16] 814 pmaddwd xm8, [rsp+ 9*16] 815 pmaddwd xm5, [rsp+10*16] 816 paddd xm0, xm6 817 paddd xm8, xm5 818 paddd xm0, xm8 819 paddd xm0, xm14 820 821 movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] 822.x_loop_ar3_inner: 823 pmovsxbw xm1, xm1 824 pmaddwd xm2, xm1, [rsp+16*11] 825 pshufd xm3, xm2, q1111 826 paddd xm2, xm3 ; left+cur 827 paddd xm2, xm0 ; add top 828 psrldq xm0, 4 829 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] 830 ; don't packssdw, we only care about one value 831 pslldq xm2, 6 832 vpblendw xm1, xm2, 1000b 833 packsswb xm1, xm1 834 pextrb [bufq+xq], xm1, 3 835 psrldq xm1, 1 836 inc xq 837 jz .x_loop_ar3_end 838 test xq, 3 839 jnz .x_loop_ar3_inner 840 jmp .x_loop_ar3 841 842.x_loop_ar3_end: 843 add bufq, 82 844 add bufyq, 82*2 845 dec hd 846 jg .y_loop_ar3 847 RET 848 849INIT_YMM avx2 850cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut 851 pcmpeqw m10, m10 852 psrld m10, 24 853 mov r7d, [fg_dataq+FGData.scaling_shift] 854 lea r8, [pb_mask] 855%define base r8-pb_mask 856 vpbroadcastw m11, [base+mul_bits+r7*2-14] 857 mov r7d, [fg_dataq+FGData.clip_to_restricted_range] 858 vpbroadcastw m12, [base+max+r7*4] 859 vpbroadcastw m13, [base+min+r7*2] 860 861 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 862 863 mov overlapd, [fg_dataq+FGData.overlap_flag] 864 movifnidn sbyd, sbym 865 test sbyd, 
sbyd 866 setnz r7b 867 test r7b, overlapb 868 jnz .vertical_overlap 869 870 imul seed, sbyd, (173 << 24) | 37 871 add seed, (105 << 24) | 178 872 rol seed, 8 873 movzx seed, seew 874 xor seed, [fg_dataq+FGData.seed] 875 876 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 877 unused1, unused2, see, overlap 878 879 lea src_bakq, [srcq+wq] 880 neg wq 881 sub dstq, srcq 882 883.loop_x: 884 mov r6d, seed 885 or seed, 0xEFF4 886 shr r6d, 1 887 test seeb, seeh 888 lea seed, [r6+0x8000] 889 cmovp seed, r6d ; updated seed 890 891 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 892 offx, offy, see, overlap 893 894 mov offxd, seed 895 rorx offyd, seed, 8 896 shr offxd, 12 897 and offyd, 0xf 898 imul offyd, 164 899 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 900 901 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 902 h, offxy, see, overlap 903 904 mov hd, hm 905 mov grain_lutq, grain_lutmp 906.loop_y: 907 ; src 908 mova m0, [srcq] 909 pxor m2, m2 910 punpckhbw m1, m0, m2 911 punpcklbw m0, m2 ; m0-1: src as word 912 punpckhwd m5, m0, m2 913 punpcklwd m4, m0, m2 914 punpckhwd m7, m1, m2 915 punpcklwd m6, m1, m2 ; m4-7: src as dword 916 917 ; scaling[src] 918 pcmpeqw m3, m3 919 pcmpeqw m9, m9 920 vpgatherdd m8, [scalingq+m4], m3 921 vpgatherdd m4, [scalingq+m5], m9 922 pcmpeqw m3, m3 923 pcmpeqw m9, m9 924 vpgatherdd m5, [scalingq+m6], m3 925 vpgatherdd m6, [scalingq+m7], m9 926 pand m8, m10 927 pand m4, m10 928 pand m5, m10 929 pand m6, m10 930 packusdw m8, m4 931 packusdw m5, m6 932 933 ; grain = grain_lut[offy+y][offx+x] 934 movu m3, [grain_lutq+offxyq] 935 pcmpgtb m7, m2, m3 936 punpcklbw m2, m3, m7 937 punpckhbw m3, m7 938 939 ; noise = round2(scaling[src] * grain, scaling_shift) 940 pmullw m2, m8 941 pmullw m3, m5 942 pmulhrsw m2, m11 943 pmulhrsw m3, m11 944 945 ; dst = clip_pixel(src, noise) 946 paddw m0, m2 947 paddw m1, m3 948 pmaxsw m0, m13 949 pmaxsw m1, m13 950 pminsw m0, m12 951 pminsw m1, m12 952 packuswb m0, m1 
953 mova [dstq+srcq], m0 954 955 add srcq, strideq 956 add grain_lutq, 82 957 dec hd 958 jg .loop_y 959 960 add wq, 32 961 jge .end 962 lea srcq, [src_bakq+wq] 963 test overlapd, overlapd 964 jz .loop_x 965 966 ; r8m = sbym 967 movd xm15, [pb_27_17_17_27] 968 cmp dword r8m, 0 969 jne .loop_x_hv_overlap 970 971 ; horizontal overlap (without vertical overlap) 972 movd xm14, [pw_1024] 973.loop_x_h_overlap: 974 mov r6d, seed 975 or seed, 0xEFF4 976 shr r6d, 1 977 test seeb, seeh 978 lea seed, [r6+0x8000] 979 cmovp seed, r6d ; updated seed 980 981 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 982 offx, offy, see, left_offxy 983 984 lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx 985 mov offxd, seed 986 rorx offyd, seed, 8 987 shr offxd, 12 988 and offyd, 0xf 989 imul offyd, 164 990 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 991 992 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 993 h, offxy, see, left_offxy 994 995 mov hd, hm 996 mov grain_lutq, grain_lutmp 997.loop_y_h_overlap: 998 ; src 999 mova m0, [srcq] 1000 pxor m2, m2 1001 punpckhbw m1, m0, m2 1002 punpcklbw m0, m2 ; m0-1: src as word 1003 punpckhwd m5, m0, m2 1004 punpcklwd m4, m0, m2 1005 punpckhwd m7, m1, m2 1006 punpcklwd m6, m1, m2 ; m4-7: src as dword 1007 1008 ; scaling[src] 1009 pcmpeqw m3, m3 1010 pcmpeqw m9, m9 1011 vpgatherdd m8, [scalingq+m4], m3 1012 vpgatherdd m4, [scalingq+m5], m9 1013 pcmpeqw m3, m3 1014 pcmpeqw m9, m9 1015 vpgatherdd m5, [scalingq+m6], m3 1016 vpgatherdd m6, [scalingq+m7], m9 1017 pand m8, m10 1018 pand m4, m10 1019 pand m5, m10 1020 pand m6, m10 1021 packusdw m8, m4 1022 packusdw m5, m6 1023 1024 ; grain = grain_lut[offy+y][offx+x] 1025 movu m3, [grain_lutq+offxyq] 1026 movd xm4, [grain_lutq+left_offxyq] 1027 punpcklbw xm4, xm3 1028 pmaddubsw xm4, xm15, xm4 1029 pmulhrsw xm4, xm14 1030 packsswb xm4, xm4 1031 vpblendw xm4, xm3, 11111110b 1032 vpblendd m3, m4, 00001111b 1033 pcmpgtb m7, m2, m3 1034 punpcklbw m2, m3, m7 
1035 punpckhbw m3, m7 1036 1037 ; noise = round2(scaling[src] * grain, scaling_shift) 1038 pmullw m2, m8 1039 pmullw m3, m5 1040 pmulhrsw m2, m11 1041 pmulhrsw m3, m11 1042 1043 ; dst = clip_pixel(src, noise) 1044 paddw m0, m2 1045 paddw m1, m3 1046 pmaxsw m0, m13 1047 pmaxsw m1, m13 1048 pminsw m0, m12 1049 pminsw m1, m12 1050 packuswb m0, m1 1051 mova [dstq+srcq], m0 1052 1053 add srcq, strideq 1054 add grain_lutq, 82 1055 dec hd 1056 jg .loop_y_h_overlap 1057 1058 add wq, 32 1059 jge .end 1060 lea srcq, [src_bakq+wq] 1061 1062 ; r8m = sbym 1063 cmp dword r8m, 0 1064 jne .loop_x_hv_overlap 1065 jmp .loop_x_h_overlap 1066 1067.end: 1068 RET 1069 1070.vertical_overlap: 1071 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 1072 1073 movzx sbyd, sbyb 1074 imul seed, [fg_dataq+FGData.seed], 0x00010001 1075 imul r7d, sbyd, 173 * 0x00010001 1076 imul sbyd, 37 * 0x01000100 1077 add r7d, (105 << 16) | 188 1078 add sbyd, (178 << 24) | (141 << 8) 1079 and r7d, 0x00ff00ff 1080 and sbyd, 0xff00ff00 1081 xor seed, r7d 1082 xor seed, sbyd ; (cur_seed << 16) | top_seed 1083 1084 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1085 unused1, unused2, see, overlap 1086 1087 lea src_bakq, [srcq+wq] 1088 neg wq 1089 sub dstq, srcq 1090 1091 vpbroadcastd m14, [pw_1024] 1092.loop_x_v_overlap: 1093 vpbroadcastw m15, [pb_27_17_17_27] 1094 1095 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1096 mov r6d, seed 1097 or seed, 0xeff4eff4 1098 test seeb, seeh 1099 setp r7b ; parity of top_seed 1100 shr seed, 16 1101 shl r7d, 16 1102 test seeb, seeh 1103 setp r7b ; parity of cur_seed 1104 or r6d, 0x00010001 1105 xor r7d, r6d 1106 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1107 1108 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1109 offx, offy, see, overlap, top_offxy 1110 1111 rorx offyd, seed, 8 1112 rorx offxd, seed, 12 1113 and offyd, 0xf000f 1114 and offxd, 0xf000f 1115 imul offyd, 164 
1116 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1117 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1118 1119 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1120 h, offxy, see, overlap, top_offxy 1121 1122 movzx top_offxyd, offxyw 1123 shr offxyd, 16 1124 1125 mov hd, hm 1126 mov grain_lutq, grain_lutmp 1127.loop_y_v_overlap: 1128 ; src 1129 mova m0, [srcq] 1130 pxor m2, m2 1131 punpckhbw m1, m0, m2 1132 punpcklbw m0, m2 ; m0-1: src as word 1133 punpckhwd m5, m0, m2 1134 punpcklwd m4, m0, m2 1135 punpckhwd m7, m1, m2 1136 punpcklwd m6, m1, m2 ; m4-7: src as dword 1137 1138 ; scaling[src] 1139 pcmpeqw m3, m3 1140 pcmpeqw m9, m9 1141 vpgatherdd m8, [scalingq+m4], m3 1142 vpgatherdd m4, [scalingq+m5], m9 1143 pcmpeqw m3, m3 1144 pcmpeqw m9, m9 1145 vpgatherdd m5, [scalingq+m6], m3 1146 vpgatherdd m6, [scalingq+m7], m9 1147 pand m8, m10 1148 pand m4, m10 1149 pand m5, m10 1150 pand m6, m10 1151 packusdw m8, m4 1152 packusdw m5, m6 1153 1154 ; grain = grain_lut[offy+y][offx+x] 1155 movu m3, [grain_lutq+offxyq] 1156 movu m4, [grain_lutq+top_offxyq] 1157 punpckhbw m6, m4, m3 1158 punpcklbw m4, m3 1159 pmaddubsw m6, m15, m6 1160 pmaddubsw m4, m15, m4 1161 pmulhrsw m6, m14 1162 pmulhrsw m4, m14 1163 packsswb m3, m4, m6 1164 pcmpgtb m7, m2, m3 1165 punpcklbw m2, m3, m7 1166 punpckhbw m3, m7 1167 1168 ; noise = round2(scaling[src] * grain, scaling_shift) 1169 pmullw m2, m8 1170 pmullw m3, m5 1171 pmulhrsw m2, m11 1172 pmulhrsw m3, m11 1173 1174 ; dst = clip_pixel(src, noise) 1175 paddw m0, m2 1176 paddw m1, m3 1177 pmaxsw m0, m13 1178 pmaxsw m1, m13 1179 pminsw m0, m12 1180 pminsw m1, m12 1181 packuswb m0, m1 1182 mova [dstq+srcq], m0 1183 1184 vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line 1185 add srcq, strideq 1186 add grain_lutq, 82 1187 dec hw 1188 jz .end_y_v_overlap 1189 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1190 ; remaining (up to) 30 lines 1191 xor hd, 0x10000 1192 test hd, 
0x10000 1193 jnz .loop_y_v_overlap 1194 jmp .loop_y 1195 1196.end_y_v_overlap: 1197 add wq, 32 1198 jge .end_hv 1199 lea srcq, [src_bakq+wq] 1200 1201 ; since fg_dataq.overlap is guaranteed to be set, we never jump 1202 ; back to .loop_x_v_overlap, and instead always fall-through to 1203 ; h+v overlap 1204 1205 movd xm15, [pb_27_17_17_27] 1206.loop_x_hv_overlap: 1207 vpbroadcastw m8, [pb_27_17_17_27] 1208 1209 ; we assume from the block above that bits 8-15 of r7d are zero'ed 1210 mov r6d, seed 1211 or seed, 0xeff4eff4 1212 test seeb, seeh 1213 setp r7b ; parity of top_seed 1214 shr seed, 16 1215 shl r7d, 16 1216 test seeb, seeh 1217 setp r7b ; parity of cur_seed 1218 or r6d, 0x00010001 1219 xor r7d, r6d 1220 rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed 1221 1222 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1223 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1224 1225 lea topleft_offxyq, [top_offxyq+32] 1226 lea left_offxyq, [offyq+32] 1227 rorx offyd, seed, 8 1228 rorx offxd, seed, 12 1229 and offyd, 0xf000f 1230 and offxd, 0xf000f 1231 imul offyd, 164 1232 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1233 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1234 1235 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1236 h, offxy, see, left_offxy, top_offxy, topleft_offxy 1237 1238 movzx top_offxyd, offxyw 1239 shr offxyd, 16 1240 1241 mov hd, hm 1242 mov grain_lutq, grain_lutmp 1243.loop_y_hv_overlap: 1244 ; src 1245 mova m0, [srcq] 1246 pxor m2, m2 1247 punpckhbw m1, m0, m2 1248 punpcklbw m0, m2 ; m0-1: src as word 1249 punpckhwd m5, m0, m2 1250 punpcklwd m4, m0, m2 1251 punpckhwd m7, m1, m2 1252 punpcklwd m6, m1, m2 ; m4-7: src as dword 1253 1254 ; scaling[src] 1255 pcmpeqw m3, m3 1256 ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel 1257 vpgatherdd m9, [scalingq+m4], m3 1258 pcmpeqw m3, m3 1259 vpgatherdd m4, [scalingq+m5], m3 1260 pcmpeqw m3, m3 1261 vpgatherdd m5, 
[scalingq+m6], m3
    pcmpeqw          m3, m3                 ; vpgatherdd consumes its mask; reset to all-1s before each gather
    vpgatherdd       m6, [scalingq+m7], m3
    pand             m9, m10                ; keep low byte of each gathered dword (scaling values are 8-bit)
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m9, m4
    packusdw         m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
    movu             m6, [grain_lutq+top_offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    movd            xm7, [grain_lutq+topleft_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       xm4, xm3
    punpcklbw       xm7, xm6
    pmaddubsw       xm4, xm15, xm4          ; xm15 = 27/17 horizontal overlap weights
    pmaddubsw       xm7, xm15, xm7
    pmulhrsw        xm4, xm14               ; xm14 = pw_1024 rounding constant (1 << (15-5))
    pmulhrsw        xm7, xm14
    packsswb        xm4, xm4
    packsswb        xm7, xm7
    vpblendw        xm4, xm3, 11111110b     ; only byte 0 is the blended left-edge sample
    vpblendw        xm7, xm6, 11111110b
    vpblendd         m3, m4, 00001111b
    vpblendd         m6, m7, 00001111b
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m7, m6, m3
    punpcklbw        m6, m3
    pmaddubsw        m7, m8, m7             ; m8 = 27/17 (first line) or 17/27 (second line) vertical weights
    pmaddubsw        m6, m8, m6
    pmulhrsw         m7, m14
    pmulhrsw         m6, m14
    packsswb         m3, m6, m7
    pcmpgtb          m7, m2, m3             ; m2 = 0 here; build sign mask to widen signed grain bytes
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m9
    pmullw           m3, m5
    pmulhrsw         m2, m11                ; m11 = 1 << (15 - scaling_shift) rounding multiplier
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13                ; m13/m12 = clip min/max (full vs restricted range)
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova    [dstq+srcq], m0                 ; dstq holds dst-src delta, so dst = [dstq+srcq]

    vpbroadcastw     m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
    add            srcq, strideq
    add      grain_lutq, 82                 ; grain_lut rows have a stride of 82 bytes
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    xor              hd, 0x10000            ; bit 16 of hd counts the two v-overlap lines
    test             hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq]      ; lea preserves flags; jl below tests the add above
    jl .loop_x_hv_overlap

.end_hv:
    RET

;---------------------------------------------------------------------------------------
; fguv_32x32xn_i420(dst, src, stride, fg_data, w, scaling, grain_lut,
;                   h, sby, luma, lstride, uv_pl, is_id)
; Apply AV1 film grain to one row of 4:2:0 chroma superblocks (AVX2).
; NOTE(review): argument semantics inferred from the cglobal arg list and usage
; below — confirm against the C prototype in the film grain template.
;
; Constant register roles set up here (live across the whole function):
;   m10 = 0x000000ff per-dword mask (extract low byte of gathered scaling values)
;   m11 = pmulhrsw rounding constant for scaling_shift
;   m13 = clip minimum, m12 = clip maximum (max index = clip_to_restricted_range
;         shifted left by is_id, selecting 235 vs 240 for restricted range)
;---------------------------------------------------------------------------------------
cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
    pcmpeqw         m10, m10
    psrld           m10, 24                 ; m10 = dword 0xff
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    lea              r8, [pb_mask]
%define base r8-pb_mask
    vpbroadcastw    m11, [base+mul_bits+r7*2-14] ; scaling_shift is in [8..11]; mul_bits[shift-8]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, dword is_idm
    vpbroadcastw    m13, [base+min+r7*2]
    shlx            r7d, r7d, r9d           ; restricted-range max differs for identity matrix (is_id)
    vpbroadcastw    m12, [base+max+r7*2]

    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

; FGUV_32x32xN_LOOP %1
;   %1 == 1: "not-csfl" path — chroma is mixed with luma via uv_luma_mult/uv_mult
;            and offset by uv_offset before indexing the scaling LUT
;   %1 == 0: chroma-scaling-from-luma path — averaged luma indexes the LUT directly
%macro FGUV_32x32xN_LOOP 1 ; not-csfl
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

%if %1
    mov             r7d, dword r11m         ; r11m = uv_pl (plane index 0/1)
    vpbroadcastb     m0, [fg_dataq+FGData.uv_mult+r7*4]
    vpbroadcastb     m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
    punpcklbw       m14, m1, m0             ; m14 = interleaved { luma_mult, uv_mult } for pmaddubsw
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
%else
    vpbroadcastd    m14, [pw_1024]          ; overlap rounding constant
    vpbroadcastd    m15, [pb_23_22]         ; 23/22 chroma overlap weights
%endif

    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    movifnidn      sbyd, sbym
    test           sbyd, sbyd
    setnz           r7b
    test            r7b, overlapb           ; vertical overlap only when sby != 0 AND overlap enabled
    jnz %%vertical_overlap

    ; derive the per-superblock-row seed from sby and the frame seed
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, unused5, lstride

    mov           lumaq, r9mp
    lea             r12, [srcq+wq]          ; stash end-of-row pointers; w counts up from -w to 0
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*2]       ; luma advances at 2x chroma width (4:2:0)
    mov           r11mp, r12
    mov           r12mp, r13
    mov        lstrideq, r10mp
    neg              wq

%%loop_x:
    ; advance the LFSR-style grain seed (see AV1 spec, get_random_number)
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 82
    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx (+498 skips the grain border)

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y:
    ; src: average 2x2 luma down to chroma resolution, two chroma rows per iteration
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
    vinserti128      m6, [lumaq+lstrideq*2+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7                 ; horizontal pairwise add of luma bytes
    pmaddubsw        m6, m7
    pavgw            m4, m2                 ; (sum + 1) >> 1
    pavgw            m6, m2

%if %1
    packuswb         m4, m6                 ; luma
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14                ; luma*luma_mult + chroma*uv_mult
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15                ; + uv_offset
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src] — masks must be re-materialized before every gather
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    movu            xm3, [grain_lutq+offxyq+ 0]
    vinserti128      m3, [grain_lutq+offxyq+82], 1
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7             ; sign-extend grain bytes to words
    punpckhbw        m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1

    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*4]
    add      grain_lutq, 82*2
    sub              hb, 2
    jg %%loop_y

    add              wq, 16
    jge %%end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*2]
    add            srcq, wq
    add            dstq, wq
    test       overlapd, overlapd
    jz %%loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 82
    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    ; src (same 2x2 luma downsample as %%loop_y)
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
    vinserti128      m6, [lumaq+lstrideq*2+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2

%if %1
    packuswb         m4, m6                 ; luma
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src]
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
%if %1
    vpbroadcastd     m6, [pb_23_22]         ; FIXME
%endif
    movu            xm3, [grain_lutq+offxyq+ 0]
    movd            xm4, [grain_lutq+left_offxyq+ 0]
    vinserti128      m3, [grain_lutq+offxyq+82], 1
    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
    punpcklbw        m4, m3
%if %1
    pmaddubsw        m4, m6, m4             ; m14/m15 hold csfl constants here, so reload weights
    pmulhrsw         m4, [pw_1024]
%else
    pmaddubsw        m4, m15, m4
    pmulhrsw         m4, m14
%endif
    packsswb         m4, m4
    pcmpeqw          m6, m6                 ; FIXME
    psrldq           m6, 15                 ; FIXME
    vpblendvb        m3, m3, m4, m6         ; replace only the left-edge byte of each row
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1

    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*4]
    add      grain_lutq, 82*2
    sub              hb, 2
    jg %%loop_y_h_overlap

    add              wq, 16
    jge %%end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*2]
    add            srcq, wq
    add            dstq, wq

    ; r8m = sbym
    cmp       dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%end:
    RET

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, overlap, unused1, unused2, lstride

    ; compute both the current and the top row's seed in one 32-bit register:
    ; (cur_seed << 16) | top_seed
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused1, unused2, see, overlap, unused3, unused4, lstride

    mov           lumaq, r9mp
    lea             r12, [srcq+wq]
    lea             r13, [dstq+wq]
    lea             r14, [lumaq+wq*2]
    mov           r11mp, r12
    mov           r12mp, r13
    mov        lstrideq, r10mp
    neg              wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f            ; cur and top offsets processed in parallel (packed words)
    and           offxd, 0xf000f
    imul          offyd, 82
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq+0x10001*498+16*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y_v_overlap:
    ; src
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
    vinserti128      m6, [lumaq+lstrideq*2+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2

%if %1
    packuswb         m4, m6                 ; luma
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src]
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
%if %1
    vpbroadcastd     m6, [pb_23_22]
%endif
    movq            xm3, [grain_lutq+offxyq]
    movq            xm4, [grain_lutq+top_offxyq]
    vinserti128      m3, [grain_lutq+offxyq+8], 1
    vinserti128      m4, [grain_lutq+top_offxyq+8], 1
    punpcklbw        m4, m3                 ; interleave top | cur grain for weighted blend
%if %1
    pmaddubsw        m4, m6, m4
    pmulhrsw         m4, [pw_1024]
%else
    pmaddubsw        m4, m15, m4
    pmulhrsw         m4, m14
%endif
    packsswb         m4, m4
    vpermq           m4, m4, q3120
    ; only interpolate first line, insert second line unmodified
    vinserti128      m3, m4, [grain_lutq+offxyq+82], 1
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1

    ; 4:2:0 chroma has a single v-overlap line, handled above; continue
    ; with the plain y loop for the remaining rows
    sub              hb, 2
    jl %%end_y_v_overlap
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*4]
    add      grain_lutq, 82*2
    jmp %%loop_y

%%end_y_v_overlap:
    add              wq, 16
    jge %%end_hv
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*2]
    add            srcq, wq
    add            dstq, wq

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 82
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq+0x10001*498+16*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
%%loop_y_hv_overlap:
    ; src
    mova            xm4, [lumaq+lstrideq*0+ 0]
    mova            xm6, [lumaq+lstrideq*0+16]
    mova            xm0, [srcq]
    vpbroadcastd     m7, [pb_1]
    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
    vinserti128      m6, [lumaq+lstrideq*2+16], 1
    vinserti128      m0, [srcq+strideq], 1
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2

%if %1
    packuswb         m4, m6                 ; luma
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    punpckhwd        m5, m4, m2
    punpcklwd        m4, m2
    punpckhwd        m7, m6, m2
    punpcklwd        m6, m2                 ; m4-7: src as dword

    ; scaling[src]
    pcmpeqw          m9, m9
    pcmpeqw          m3, m3
    vpgatherdd       m8, [scalingq+m4], m9
    vpgatherdd       m4, [scalingq+m5], m3
    pcmpeqw          m9, m9
    pcmpeqw          m3, m3
    vpgatherdd       m5, [scalingq+m6], m9
    vpgatherdd       m6, [scalingq+m7], m3
    pand             m8, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6

    ; unpack chroma source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
%if %1
    vpbroadcastd     m9, [pb_23_22]
%endif
    movu            xm3, [grain_lutq+offxyq]
    movq            xm6, [grain_lutq+top_offxyq]
    vinserti128      m3, [grain_lutq+offxyq+82], 1
    vinserti128      m6, [grain_lutq+top_offxyq+8], 1
    movd            xm4, [grain_lutq+left_offxyq]
    movd            xm7, [grain_lutq+topleft_offxyq]
    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m4, m3
    punpcklbw       xm7, xm6
%if %1
    pmaddubsw        m4, m9, m4
    pmaddubsw       xm7, xm9, xm7
    pmulhrsw         m4, [pw_1024]
    pmulhrsw        xm7, [pw_1024]
%else
    pmaddubsw        m4, m15, m4
    pmaddubsw       xm7, xm15, xm7
    pmulhrsw         m4, m14
    pmulhrsw        xm7, xm14
%endif
    packsswb         m4, m4
    packsswb        xm7, xm7
    pcmpeqw          m9, m9                 ; this is kind of ugly
    psrldq           m9, 15                 ; m9 = byte-0-only blend mask for the left edge
    vpblendvb        m3, m3, m4, m9
    shufpd           m9, m9, m9, 1110b
    vpblendvb        m6, m6, m7, m9
    vpermq           m9, m3, q3120
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw        m6, m9
%if %1
    vpbroadcastd     m9, [pb_23_22]         ; m9 was consumed as a blend mask above; reload weights
    pmaddubsw        m6, m9, m6
    pmulhrsw         m6, [pw_1024]
%else
    pmaddubsw        m6, m15, m6
    pmulhrsw         m6, m14
%endif
    packsswb         m6, m6
    vpermq           m6, m6, q3120
    vpblendd         m3, m3, m6, 00001111b  ; only the first chroma row gets v-overlap
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m8
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1

    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*4]
    add      grain_lutq, 82*2
    sub              hb, 2
    ; only the first iteration needs v-overlap; remaining rows use h-only overlap
    jg %%loop_y_h_overlap

%%end_y_hv_overlap:
    add              wq, 16
    jge %%end_hv
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*2]
    add            srcq, wq
    add            dstq, wq
    jmp %%loop_x_hv_overlap

%%end_hv:
    RET
%endmacro

    FGUV_32x32xN_LOOP 1
.csfl:
    FGUV_32x32xN_LOOP 0

%endif ; ARCH_X86_64