; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29SECTION_RODATA 30 31pw_1024: times 8 dw 1024 32pb_27_17: times 8 db 27, 17 33pb_17_27: times 8 db 17, 27 34pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 35rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 36byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 37pw_seed_xor: times 2 dw 0xb524 38 times 2 dw 0x49d8 39pb_23_22: times 2 db 23, 22 40pb_1: times 4 db 1 41hmul_bits: dw 32768, 16384, 8192, 4096 42round: dw 2048, 1024, 512 43mul_bits: dw 256, 128, 64, 32, 16 44round_vals: dw 32, 64, 128, 256, 512 45max: dw 255, 240, 235 46min: dw 0, 16 47pw_1: dw 1 48 49%define pb_27_17_17_27 pb_17_27 - 2 50 51%macro JMP_TABLE 1-* 52 %xdefine %1_table %%table 53 %xdefine %%base %1_table 54 %xdefine %%prefix mangle(private_prefix %+ _%1) 55 %%table: 56 %rep %0 - 1 57 dd %%prefix %+ .ar%2 - %%base 58 %rotate 1 59 %endrep 60%endmacro 61 62JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3 63JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3 64JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3 65JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3 66 67struc FGData 68 .seed: resd 1 69 .num_y_points: resd 1 70 .y_points: resb 14 * 2 71 .chroma_scaling_from_luma: resd 1 72 .num_uv_points: resd 2 73 .uv_points: resb 2 * 10 * 2 74 .scaling_shift: resd 1 75 .ar_coeff_lag: resd 1 76 .ar_coeffs_y: resb 24 77 .ar_coeffs_uv: resb 2 * 28 ; includes padding 78 .ar_coeff_shift: resq 1 79 .grain_scale_shift: resd 1 80 .uv_mult: resd 2 81 .uv_luma_mult: resd 2 82 .uv_offset: resd 2 83 .overlap_flag: resd 1 84 .clip_to_restricted_range: resd 1 85endstruc 86 87cextern gaussian_sequence 88 89SECTION .text 90 91%macro SCRATCH 3 92%if ARCH_X86_32 93 mova [rsp+%3*mmsize], m%1 94%define m%2 [rsp+%3*mmsize] 95%else 96 SWAP %1, %2 97%endif 98%endmacro 99 100INIT_XMM ssse3 101cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data 102 LEA r4, $$ 103%define base r4-$$ 104 movq m1, 
[base+rnd_next_upperbit_mask] 105 movq m4, [base+mul_bits] 106 movq m7, [base+hmul_bits] 107 mov r2d, [fg_dataq+FGData.grain_scale_shift] 108 movd m2, [base+round+r2*2] 109 movd m0, [fg_dataq+FGData.seed] 110 mova m5, [base+pb_mask] 111 pshuflw m2, m2, q0000 112 pshuflw m0, m0, q0000 113 mov r2, -73*82 114 sub bufq, r2 115 lea r3, [base+gaussian_sequence] 116.loop: 117 pand m6, m0, m1 118 psrlw m3, m6, 10 119 por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 120 pmullw m6, m4 ; bits 0x0f00 are set 121 pshufb m3, m5, m6 ; set 15th bit for next 4 seeds 122 psllq m6, m3, 30 123 por m3, m6 124 psllq m6, m3, 15 125 por m3, m6 ; aggregate each bit into next seed's high bit 126 pmulhuw m6, m0, m7 127 por m3, m6 ; 4 next output seeds 128 pshuflw m0, m3, q3333 129 psrlw m3, 5 130%if ARCH_X86_64 131 movq r6, m3 132 mov r8, r6 133 movzx r5d, r6w 134 shr r6d, 16 135 shr r8, 32 136 movzx r7, r8w 137 shr r8, 16 138 139 movd m6, [r3+r5*2] 140 pinsrw m6, [r3+r6*2], 1 141 pinsrw m6, [r3+r7*2], 2 142 pinsrw m6, [r3+r8*2], 3 143%else 144 movd r6, m3 145 pshuflw m3, m3, q3232 146 movzx r5, r6w 147 shr r6, 16 148 149 movd m6, [r3+r5*2] 150 pinsrw m6, [r3+r6*2], 1 151 152 movd r6, m3 153 movzx r5, r6w 154 shr r6, 16 155 156 pinsrw m6, [r3+r5*2], 2 157 pinsrw m6, [r3+r6*2], 3 158%endif 159 pmulhrsw m6, m2 160 packsswb m6, m6 161 movd [bufq+r2], m6 162 add r2, 4 163 jl .loop 164 165 ; auto-regression code 166 movsxd r2, [fg_dataq+FGData.ar_coeff_lag] 167 movsxd r2, [base+generate_grain_y_ssse3_table+r2*4] 168 lea r2, [r2+base+generate_grain_y_ssse3_table] 169 jmp r2 170 171.ar1: 172%if ARCH_X86_32 173 DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max 174%elif WIN64 175 DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 176 mov bufq, r0 177%else 178 DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 179%endif 180 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] 181 movd m4, [fg_dataq+FGData.ar_coeffs_y] 182 mov ecx, [fg_dataq+FGData.ar_coeff_shift] 183%if 
ARCH_X86_32 184 mov r1m, cf3d 185 DEFINE_ARGS buf, shift, val3, min, max, x, val0 186%define hd r0mp 187%define cf3d r1mp 188%elif WIN64 189 DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 190%else 191 DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 192%endif 193 pxor m6, m6 194 pcmpgtb m7, m6, m4 195 punpcklbw m4, m7 196 pinsrw m4, [base+pw_1], 3 197 pshufd m5, m4, q1111 198 pshufd m4, m4, q0000 199 movd m3, [base+round_vals+shiftq*2-12] ; rnd 200 pshuflw m3, m3, q0000 201 sub bufq, 82*73-(82*3+79) 202 mov hd, 70 203 mov mind, -128 204 mov maxd, 127 205.y_loop_ar1: 206 mov xq, -76 207 movsx val3d, byte [bufq+xq-1] 208.x_loop_ar1: 209 movq m0, [bufq+xq-82-1] ; top/left 210 pcmpgtb m7, m6, m0 211 punpcklbw m0, m7 212 psrldq m2, m0, 2 ; top 213 psrldq m1, m0, 4 ; top/right 214 punpcklwd m0, m2 215 punpcklwd m1, m3 216 pmaddwd m0, m4 217 pmaddwd m1, m5 218 paddd m0, m1 219.x_loop_ar1_inner: 220 movd val0d, m0 221 psrldq m0, 4 222 imul val3d, cf3d 223 add val3d, val0d 224 sar val3d, shiftb 225 movsx val0d, byte [bufq+xq] 226 add val3d, val0d 227 cmp val3d, maxd 228 cmovns val3d, maxd 229 cmp val3d, mind 230 cmovs val3d, mind 231 mov byte [bufq+xq], val3b 232 ; keep val3d in-place as left for next x iteration 233 inc xq 234 jz .x_loop_ar1_end 235 test xq, 3 236 jnz .x_loop_ar1_inner 237 jmp .x_loop_ar1 238 239.x_loop_ar1_end: 240 add bufq, 82 241 dec hd 242 jg .y_loop_ar1 243.ar0: 244 RET 245 246.ar2: 247%if ARCH_X86_32 248%assign stack_offset_old stack_offset 249 ALLOC_STACK -16*8 250%endif 251 DEFINE_ARGS buf, fg_data, shift 252 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 253 movd m6, [base+round_vals-12+shiftq*2] 254 movd m7, [base+byte_blend+1] 255 SCRATCH 7, 15, 7 256 movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 257 movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 258 pxor m7, m7 259 pshuflw m6, m6, q0000 260 punpcklwd m6, m7 261 pcmpgtb m4, m7, m0 262 pcmpgtb m5, m7, m1 263 punpcklbw m0, m4 264 punpcklbw m1, m5 265 DEFINE_ARGS buf, fg_data, 
h, x 266 pshufd m4, m1, q0000 267 pshufd m5, m1, q1111 268 pshufd m3, m0, q3333 269 pshufd m2, m0, q2222 270 pshufd m1, m0, q1111 271 pshufd m0, m0, q0000 272 SCRATCH 0, 8, 0 273 SCRATCH 1, 9, 1 274 SCRATCH 2, 10, 2 275 SCRATCH 3, 11, 3 276 SCRATCH 4, 12, 4 277 SCRATCH 5, 13, 5 278 SCRATCH 6, 14, 6 279 sub bufq, 82*73-(82*3+79) 280 mov hd, 70 281.y_loop_ar2: 282 mov xq, -76 283 284.x_loop_ar2: 285 movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 286 movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 287 pcmpgtb m2, m7, m0 288 punpckhbw m1, m0, m2 289 punpcklbw m0, m2 290 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] 291 psrldq m3, m1, 2 ; y=-1,x=[-1,+5] 292 psrldq m4, m1, 4 ; y=-1,x=[+0,+5] 293 punpcklwd m2, m0, m5 294 punpcklwd m3, m4 295 pmaddwd m2, m8 296 pmaddwd m3, m11 297 paddd m2, m3 298 299 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] 300 psrldq m5, m0, 6 ; y=-2,x=[+1,+5] 301 psrldq m6, m0, 8 ; y=-2,x=[+2,+5] 302 punpcklwd m4, m5 303 punpcklwd m6, m1 304 psrldq m5, m1, 6 ; y=-1,x=[+1,+5] 305 psrldq m1, m1, 8 ; y=-1,x=[+2,+5] 306 punpcklwd m5, m1 307 pmaddwd m4, m9 308 pmaddwd m6, m10 309 pmaddwd m5, m12 310 paddd m4, m6 311 paddd m2, m5 312 paddd m2, m4 313 paddd m2, m14 314 315 movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] 316.x_loop_ar2_inner: 317 pcmpgtb m4, m7, m0 318 punpcklbw m1, m0, m4 319 pmaddwd m3, m1, m13 320 paddd m3, m2 321 psrldq m1, 4 ; y=0,x=0 322 psrldq m2, 4 ; shift top to next pixel 323 psrad m3, [fg_dataq+FGData.ar_coeff_shift] 324 ; don't packssdw since we only care about one value 325 paddw m3, m1 326 packsswb m3, m3 327 pslldq m3, 2 328 pand m3, m15 329 pandn m1, m15, m0 330 por m0, m1, m3 331 psrldq m0, 1 332 ; overwrite 2 pixels, but that's ok 333 movd [bufq+xq-1], m0 334 inc xq 335 jz .x_loop_ar2_end 336 test xq, 3 337 jnz .x_loop_ar2_inner 338 jmp .x_loop_ar2 339 340.x_loop_ar2_end: 341 add bufq, 82 342 dec hd 343 jg .y_loop_ar2 344 RET 345 346.ar3: 347 DEFINE_ARGS buf, fg_data, shift 348%if ARCH_X86_32 349%assign stack_offset stack_offset_old 350 ALLOC_STACK -16*14 
351%elif WIN64 352 SUB rsp, 16*6 353%assign stack_size_padded (stack_size_padded+16*6) 354%assign stack_size (stack_size+16*6) 355%else 356 ALLOC_STACK -16*6 357%endif 358 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 359 movd m6, [base+round_vals-12+shiftq*2] 360 movd m7, [base+byte_blend] 361 movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 362 movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 363 pxor m3, m3 364 pcmpgtb m4, m3, m0 365 pcmpgtb m3, m2 366 pshuflw m6, m6, q0000 367 SCRATCH 6, 14, 12 368 SCRATCH 7, 15, 13 369 punpckhbw m1, m0, m4 370 punpcklbw m0, m4 371 punpcklbw m2, m3 372 pshufd m3, m0, q1111 373 pshufd m4, m0, q2222 374 pshufd m5, m0, q3333 375 pshufd m0, m0, q0000 376 mova [rsp+ 0*16], m0 377 mova [rsp+ 1*16], m3 378 mova [rsp+ 2*16], m4 379 mova [rsp+ 3*16], m5 380 pshufd m6, m1, q1111 381 pshufd m7, m1, q2222 382 pshufd m5, m1, q3333 383 pshufd m1, m1, q0000 384 pshufd m3, m2, q1111 385 psrldq m0, m2, 10 386 pinsrw m2, [base+pw_1], 5 387 pshufd m4, m2, q2222 388 pshufd m2, m2, q0000 389 pinsrw m0, [base+round_vals+shiftq*2-10], 3 390 mova [rsp+ 4*16], m1 391 mova [rsp+ 5*16], m6 392 SCRATCH 7, 8, 6 393 SCRATCH 5, 9, 7 394 SCRATCH 2, 10, 8 395 SCRATCH 3, 11, 9 396 SCRATCH 4, 12, 10 397 SCRATCH 0, 13, 11 398 DEFINE_ARGS buf, fg_data, h, x 399 sub bufq, 82*73-(82*3+79) 400 mov hd, 70 401.y_loop_ar3: 402 mov xq, -76 403 404.x_loop_ar3: 405 movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 406 pxor m3, m3 407 pcmpgtb m3, m0 408 punpckhbw m2, m0, m3 409 punpcklbw m0, m3 410 411 psrldq m5, m0, 2 412 psrldq m6, m0, 4 413 psrldq m7, m0, 6 414 punpcklwd m4, m0, m5 415 punpcklwd m6, m7 416 pmaddwd m4, [rsp+ 0*16] 417 pmaddwd m6, [rsp+ 1*16] 418 paddd m4, m6 419 420 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 421 pxor m5, m5 422 pcmpgtb m5, m1 423 punpckhbw m3, m1, m5 424 punpcklbw m1, m5 425 palignr m6, m2, m0, 10 426 palignr m7, m2, m0, 12 427 psrldq m0, 8 428 punpcklwd m0, m6 429 punpcklwd m7, m1 430 pmaddwd m0, [rsp+ 2*16] 431 pmaddwd m7, [rsp+ 3*16] 
432 paddd m0, m7 433 paddd m0, m4 434 435 psrldq m4, m1, 2 436 psrldq m5, m1, 4 437 psrldq m6, m1, 6 438 psrldq m7, m1, 8 439 punpcklwd m4, m5 440 punpcklwd m6, m7 441 pmaddwd m4, [rsp+ 4*16] 442 pmaddwd m6, [rsp+ 5*16] 443 paddd m4, m6 444 paddd m0, m4 445 446 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 447 pxor m7, m7 448 pcmpgtb m7, m2 449 punpckhbw m5, m2, m7 450 punpcklbw m2, m7 451 palignr m7, m3, m1, 10 452 palignr m3, m1, 12 453 psrldq m1, m2, 2 454 punpcklwd m7, m3 455 punpcklwd m3, m2, m1 456 pmaddwd m7, m8 457 pmaddwd m3, m9 458 paddd m7, m3 459 paddd m0, m7 460 461 psrldq m6, m2, 4 462 psrldq m1, m2, 6 463 psrldq m3, m2, 8 464 palignr m4, m5, m2, 10 465 palignr m5, m5, m2, 12 466 467 punpcklwd m6, m1 468 punpcklwd m3, m4 469 punpcklwd m5, m14 470 pmaddwd m6, m10 471 pmaddwd m3, m11 472 pmaddwd m5, m12 473 paddd m0, m6 474 paddd m3, m5 475 paddd m0, m3 476 477 movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] 478.x_loop_ar3_inner: 479 pxor m5, m5 480 pcmpgtb m5, m1 481 punpcklbw m2, m1, m5 482 pmaddwd m2, m13 483 pshufd m3, m2, q1111 484 paddd m2, m3 ; left+cur 485 paddd m2, m0 ; add top 486 psrldq m0, 4 487 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 488 ; don't packssdw since we only care about one value 489 packsswb m2, m2 490 pslldq m2, 3 491 pand m2, m15 492 pandn m3, m15, m1 493 por m1, m2, m3 494 movd [bufq+xq-3], m1 495 psrldq m1, 1 496 inc xq 497 jz .x_loop_ar3_end 498 test xq, 3 499 jnz .x_loop_ar3_inner 500 jmp .x_loop_ar3 501 502.x_loop_ar3_end: 503 add bufq, 82 504 dec hd 505 jg .y_loop_ar3 506 RET 507 508%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y 509INIT_XMM ssse3 510cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv 511 movifnidn r2, r2mp 512 movifnidn r3, r3mp 513 LEA r4, $$ 514%define base r4-$$ 515 movq m1, [base+rnd_next_upperbit_mask] 516 movq m4, [base+mul_bits] 517 movq m7, [base+hmul_bits] 518 mov r5d, [fg_dataq+FGData.grain_scale_shift] 519 movd m6, [base+round+r5*2] 520 mova m5, [base+pb_mask] 521 movd 
m0, [fg_dataq+FGData.seed] 522 movd m2, [base+pw_seed_xor+uvq*4] 523 pxor m0, m2 524 pshuflw m6, m6, q0000 525 pshuflw m0, m0, q0000 526 lea r6, [base+gaussian_sequence] 527%if %2 528%if ARCH_X86_64 529 mov r7d, 73-35*%3 530%else 531 mov r3mp, 73-35*%3 532%endif 533 add bufq, 44 534.loop_y: 535 mov r5, -44 536.loop_x: 537%else 538 mov r5, -82*73 539 sub bufq, r5 540.loop: 541%endif 542 pand m2, m0, m1 543 psrlw m3, m2, 10 544 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set 545 pmullw m2, m4 ; bits 0x0f00 are set 546 pshufb m3, m5, m2 ; set 15th bit for next 4 seeds 547 psllq m2, m3, 30 548 por m3, m2 549 psllq m2, m3, 15 550 por m3, m2 ; aggregate each bit into next seed's high bit 551 pmulhuw m2, m0, m7 552 por m2, m3 ; 4 next output seeds 553 pshuflw m0, m2, q3333 554 psrlw m2, 5 555%if ARCH_X86_64 556 movd r9d, m2 557 pshuflw m2, m2, q3232 558 movzx r8, r9w 559 shr r9, 16 560 561 movd m3, [r6+r8*2] 562 pinsrw m3, [r6+r9*2], 1 563 564 movd r9d, m2 565 movzx r8, r9w 566 shr r9, 16 567 568 pinsrw m3, [r6+r8*2], 2 569 pinsrw m3, [r6+r9*2], 3 570%else 571 movd r2, m2 572 pshuflw m2, m2, q3232 573 movzx r1, r2w 574 shr r2, 16 575 576 movd m3, [r6+r1*2] 577 pinsrw m3, [r6+r2*2], 1 578 579 movd r2, m2 580 movzx r1, r2w 581 shr r2, 16 582 583 pinsrw m3, [r6+r1*2], 2 584 pinsrw m3, [r6+r2*2], 3 585%endif 586 pmulhrsw m3, m6 587 packsswb m3, m3 588 movd [bufq+r5], m3 589 add r5, 4 590%if %2 591 jl .loop_x 592 add bufq, 82 593%if ARCH_X86_64 594 dec r7d 595%else 596 dec r3mp 597%endif 598 jg .loop_y 599%else 600 jl .loop 601%endif 602 603%if ARCH_X86_32 604 mov r2, r2mp 605%endif 606 607 ; auto-regression code 608 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] 609 movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4] 610 lea r5, [r5+base+generate_grain_uv_%1_ssse3_table] 611 jmp r5 612 613.ar0: 614 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 615 movifnidn bufyq, bufymp 616%if ARCH_X86_32 617%assign stack_offset_old stack_offset 618 ALLOC_STACK -2*16 619%endif 620 imul 
uvd, 28 621 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 622 movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] 623 movd m4, [base+hmul_bits+shiftq*2] 624 DEFINE_ARGS buf, bufy, h, x 625 pxor m0, m0 626 pcmpgtb m0, m5 627 punpcklbw m5, m0 628 movd m7, [base+pb_1] 629%if %2 630 movd m6, [base+hmul_bits+2+%3*2] 631%endif 632 pshuflw m5, m5, q0000 633 pshuflw m4, m4, q0000 634 pshufd m7, m7, q0000 635%if %2 636 pshuflw m6, m6, q0000 637%endif 638 punpcklqdq m5, m5 639 punpcklqdq m4, m4 640%if %2 641 punpcklqdq m6, m6 642%endif 643 pcmpeqw m1, m1 644 pslldq m1, 12>>%2 645 SCRATCH 1, 8, 0 646 SCRATCH 4, 9, 1 647%if %2 648 sub bufq, 82*(73-35*%3)+82-(82*3+41) 649%else 650 sub bufq, 82*70-3 651%endif 652 add bufyq, 3+82*3 653 mov hd, 70-35*%3 654.y_loop_ar0: 655 xor xd, xd 656.x_loop_ar0: 657 ; first 32 pixels 658%if %2 659 movu m1, [bufyq+xq*2] 660%if %3 661 movu m2, [bufyq+xq*2+82] 662%endif 663 movu m3, [bufyq+xq*2+16] 664%if %3 665 movu m4, [bufyq+xq*2+82+16] 666%endif 667 pmaddubsw m0, m7, m1 668%if %3 669 pmaddubsw m1, m7, m2 670%endif 671 pmaddubsw m2, m7, m3 672%if %3 673 pmaddubsw m3, m7, m4 674 paddw m0, m1 675 paddw m2, m3 676%endif 677 pmulhrsw m0, m6 678 pmulhrsw m2, m6 679%else 680 movu m0, [bufyq+xq] 681 pxor m6, m6 682 pcmpgtb m6, m0 683 punpckhbw m2, m0, m6 684 punpcklbw m0, m6 685%endif 686 pmullw m0, m5 687 pmullw m2, m5 688 pmulhrsw m0, m9 689 pmulhrsw m2, m9 690 movu m1, [bufq+xq] 691 pxor m4, m4 692 pcmpgtb m4, m1 693 punpckhbw m3, m1, m4 694%if %2 695 punpcklbw m1, m4 696 paddw m2, m3 697 paddw m0, m1 698%else 699 punpcklbw m6, m1, m4 700 paddw m2, m3 701 paddw m0, m6 702%endif 703 packsswb m0, m2 704%if %2 705 movu [bufq+xq], m0 706 add xd, 16 707 cmp xd, 32 708 jl .x_loop_ar0 709 710 ; last 6/12 pixels 711 movu m1, [bufyq+xq*(1+%2)] 712%if %3 713 movu m2, [bufyq+xq*2+82] 714%endif 715 pmaddubsw m0, m7, m1 716%if %3 717 pmaddubsw m1, m7, m2 718 paddw m0, m1 719%endif 720 pmulhrsw m0, m6 721 pmullw m0, m5 722 pmulhrsw m0, m9 723 movq m1, [bufq+xq] 724 pxor 
m4, m4 725 pcmpgtb m4, m1 726 punpcklbw m2, m1, m4 727 paddw m0, m2 728 packsswb m0, m0 729 pandn m2, m8, m0 730 pand m1, m8 731 por m2, m1 732 movq [bufq+xq], m2 733%else 734 add xd, 16 735 cmp xd, 80 736 je .y_loop_final_ar0 737 movu [bufq+xq-16], m0 738 jmp .x_loop_ar0 739.y_loop_final_ar0: 740 pandn m2, m8, m0 741 pand m1, m8 742 por m2, m1 743 movu [bufq+xq-16], m2 744%endif 745 746 add bufq, 82 747 add bufyq, 82<<%3 748 dec hd 749 jg .y_loop_ar0 750 RET 751 752.ar1: 753%if ARCH_X86_32 754%assign stack_offset stack_offset_old 755%assign stack_size_padded 0 756%xdefine rstk rsp 757%endif 758 DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x 759 imul uvd, 28 760 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] 761 movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] 762 pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 763%if ARCH_X86_32 764 mov r3mp, cf3d 765 DEFINE_ARGS buf, shift, fg_data, val3, min, max, x 766%elif WIN64 767 DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x 768 mov bufq, r0 769%else 770 DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x 771%endif 772 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 773 movd m3, [base+round_vals+shiftq*2-12] ; rnd 774%if %2 775 movd m7, [base+pb_1] 776 movd m6, [base+hmul_bits+2+%3*2] 777%endif 778 psrldq m4, 1 779%if ARCH_X86_32 780 DEFINE_ARGS buf, shift, val0, val3, min, max, x 781%elif WIN64 782 DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 783%else 784 DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 785%endif 786 pxor m5, m5 787 punpcklwd m3, m5 788%if %2 789 punpcklwd m6, m6 790%endif 791 pcmpgtb m5, m4 792 punpcklbw m4, m5 793 pshufd m5, m4, q1111 794 pshufd m4, m4, q0000 795 pshufd m3, m3, q0000 796%if %2 797 pshufd m7, m7, q0000 798 pshufd m6, m6, q0000 799 sub bufq, 82*(73-35*%3)+44-(82*3+41) 800%else 801 sub bufq, 82*69+3 802%endif 803%if ARCH_X86_32 804 add r1mp, 79+82*3 805 mov r0mp, 70-35*%3 806%else 807 add bufyq, 79+82*3 808 mov hd, 
70-35*%3 809%endif 810 mov mind, -128 811 mov maxd, 127 812.y_loop_ar1: 813 mov xq, -(76>>%2) 814 movsx val3d, byte [bufq+xq-1] 815.x_loop_ar1: 816%if %2 817%if ARCH_X86_32 818 mov r2, r1mp 819 movq m0, [r2+xq*2] 820%if %3 821 movq m1, [r2+xq*2+82] 822%endif 823%else 824 movq m0, [bufyq+xq*2] 825%if %3 826 movq m1, [bufyq+xq*2+82] 827%endif 828%endif 829 pmaddubsw m2, m7, m0 830%if %3 831 pmaddubsw m0, m7, m1 832 paddw m2, m0 833%endif 834 pmulhrsw m2, m6 835%else 836%if ARCH_X86_32 837 mov r2, r1mp 838 movd m2, [r2+xq] 839%else 840 movd m2, [bufyq+xq] 841%endif 842 pxor m0, m0 843 pcmpgtb m0, m2 844 punpcklbw m2, m0 845%endif 846 847 movq m0, [bufq+xq-82-1] ; top/left 848 pxor m1, m1 849 pcmpgtb m1, m0 850 punpcklbw m0, m1 851 psrldq m1, m0, 4 ; top/right 852 punpcklwd m1, m2 853 psrldq m2, m0, 2 ; top 854 punpcklwd m0, m2 855 pmaddwd m0, m4 856 pmaddwd m1, m5 857 paddd m0, m1 858 paddd m0, m3 859.x_loop_ar1_inner: 860 movd val0d, m0 861 psrldq m0, 4 862%if ARCH_X86_32 863 imul val3d, r3mp 864%else 865 imul val3d, cf3d 866%endif 867 add val3d, val0d 868 sar val3d, shiftb 869 movsx val0d, byte [bufq+xq] 870 add val3d, val0d 871 cmp val3d, maxd 872 cmovns val3d, maxd 873 cmp val3d, mind 874 cmovs val3d, mind 875 mov byte [bufq+xq], val3b 876 ; keep val3d in-place as left for next x iteration 877 inc xq 878 jz .x_loop_ar1_end 879 test xq, 3 880 jnz .x_loop_ar1_inner 881 jmp .x_loop_ar1 882 883.x_loop_ar1_end: 884 add bufq, 82 885%if ARCH_X86_32 886 add r1mp, 82<<%3 887 dec r0mp 888%else 889 add bufyq, 82<<%3 890 dec hd 891%endif 892 jg .y_loop_ar1 893 RET 894 895.ar2: 896%if ARCH_X86_32 897%assign stack_offset stack_offset_old 898%assign stack_size_padded 0 899%xdefine rstk rsp 900 ALLOC_STACK -8*16 901%endif 902 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 903 movifnidn bufyq, bufymp 904 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 905 imul uvd, 28 906 movd m7, [base+round_vals-12+shiftq*2] 907 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 908 pxor m2, 
m2 909 pcmpgtb m2, m0 910 punpckhbw m1, m0, m2 911 punpcklbw m0, m2 912 pinsrw m1, [base+pw_1], 5 913 punpcklwd m7, m7 914 pshufd m7, m7, q0000 915 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 916 pshufd m4, m1, q0000 917 pshufd m5, m1, q1111 918 pshufd m6, m1, q2222 919 pshufd m3, m0, q3333 920 pshufd m2, m0, q2222 921 pshufd m1, m0, q1111 922 pshufd m0, m0, q0000 923 SCRATCH 0, 8, 0 924 SCRATCH 1, 9, 1 925 SCRATCH 2, 10, 2 926 SCRATCH 3, 11, 3 927 SCRATCH 4, 12, 4 928 SCRATCH 5, 13, 5 929 SCRATCH 6, 14, 6 930 SCRATCH 7, 15, 7 931%if %2 932 movd m7, [base+hmul_bits+2+%3*2] 933 movd m6, [base+pb_1] 934 punpcklwd m7, m7 935 pshufd m6, m6, q0000 936 pshufd m7, m7, q0000 937 sub bufq, 82*(73-35*%3)+44-(82*3+41) 938%else 939 sub bufq, 82*69+3 940%endif 941 add bufyq, 79+82*3 942 mov hd, 70-35*%3 943.y_loop_ar2: 944 mov xq, -(76>>%2) 945 946.x_loop_ar2: 947 pxor m2, m2 948 movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] 949 movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] 950 pcmpgtb m2, m0 951 punpckhbw m1, m0, m2 952 punpcklbw m0, m2 953 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] 954 psrldq m3, m1, 2 ; y=-1,x=[-1,+5] 955 psrldq m4, m1, 4 ; y=-1,x=[+0,+5] 956 punpcklwd m2, m0, m5 957 punpcklwd m3, m4 958 pmaddwd m2, m8 959 pmaddwd m3, m11 960 paddd m2, m3 961 962 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] 963 psrldq m5, m0, 6 ; y=-2,x=[+1,+5] 964 psrldq m0, 8 ; y=-2,x=[+2,+5] 965 punpcklwd m4, m5 966 punpcklwd m0, m1 967 psrldq m3, m1, 6 ; y=-1,x=[+1,+5] 968 psrldq m1, m1, 8 ; y=-1,x=[+2,+5] 969 punpcklwd m3, m1 970 pmaddwd m4, m9 971 pmaddwd m0, m10 972 pmaddwd m3, m12 973 paddd m4, m0 974 paddd m2, m3 975 paddd m2, m4 976 977%if %2 978 movq m1, [bufyq+xq*2] 979%if %3 980 movq m3, [bufyq+xq*2+82] 981%endif 982 pmaddubsw m0, m6, m1 983%if %3 984 pmaddubsw m1, m6, m3 985 paddw m0, m1 986%endif 987 pmulhrsw m0, m7 988%else 989 movd m0, [bufyq+xq] 990 pxor m1, m1 991 pcmpgtb m1, m0 992 punpcklbw m0, m1 993%endif 994 punpcklwd m0, m15 995 pmaddwd m0, m14 996 paddd m2, m0 997 998 movq m0, [bufq+xq-2] 
; y=0,x=[-2,+5] 999 pxor m4, m4 1000 movd m5, [base+byte_blend+1] 1001 punpcklbw m5, m5 1002.x_loop_ar2_inner: 1003 pcmpgtb m1, m4, m0 1004 punpcklbw m0, m1 1005 pmaddwd m3, m0, m13 1006 paddd m3, m2 1007 psrldq m2, 4 ; shift top to next pixel 1008 psrad m3, [fg_dataq+FGData.ar_coeff_shift] 1009 pslldq m3, 4 1010 pand m3, m5 1011 paddw m0, m3 1012 packsswb m0, m0 1013 movd [bufq+xq-2], m0 1014 psrldq m0, 1 1015 inc xq 1016 jz .x_loop_ar2_end 1017 test xq, 3 1018 jnz .x_loop_ar2_inner 1019 jmp .x_loop_ar2 1020 1021.x_loop_ar2_end: 1022 add bufq, 82 1023 add bufyq, 82<<%3 1024 dec hd 1025 jg .y_loop_ar2 1026 RET 1027 1028.ar3: 1029%if ARCH_X86_32 1030%assign stack_offset stack_offset_old 1031%assign stack_size_padded 0 1032%xdefine rstk rsp 1033%endif 1034 DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift 1035 movifnidn bufyq, bufymp 1036%if ARCH_X86_32 1037 ALLOC_STACK -15*16 1038%else 1039 SUB rsp, 16*7 1040%assign stack_size_padded (stack_size_padded+16*7) 1041%assign stack_size (stack_size+16*7) 1042%endif 1043 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] 1044 imul uvd, 28 1045 1046 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 1047 pxor m3, m3 1048 pcmpgtb m3, m0 1049 punpckhbw m1, m0, m3 1050 punpcklbw m0, m3 1051 pshufd m2, m0, q1111 1052 pshufd m3, m0, q2222 1053 pshufd m4, m0, q3333 1054 pshufd m0, m0, q0000 1055 pshufd m5, m1, q1111 1056 pshufd m6, m1, q2222 1057 pshufd m7, m1, q3333 1058 pshufd m1, m1, q0000 1059 mova [rsp+ 0*16], m0 1060 mova [rsp+ 1*16], m2 1061 mova [rsp+ 2*16], m3 1062 mova [rsp+ 3*16], m4 1063 mova [rsp+ 4*16], m1 1064 mova [rsp+ 5*16], m5 1065 mova [rsp+ 6*16], m6 1066 SCRATCH 7, 8, 7 1067 1068 movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] 1069 pxor m4, m4 1070 pcmpgtb m4, m2 1071 punpckhbw m5, m2, m4 1072 punpcklbw m2, m4 1073 pshufd m4, m2, q3232 1074 punpcklwd m3, m4, m5 1075 pshuflw m5, m4, q3321 1076 pshufd m4, m3, q0000 1077 pshufd m3, m2, q1111 1078 pshufd m2, m2, q0000 1079 pinsrw m5, 
[base+round_vals+shiftq*2-10], 3 1080 SCRATCH 2, 9, 8 1081 SCRATCH 3, 10, 9 1082 SCRATCH 4, 11, 10 1083 SCRATCH 5, 12, 11 1084 1085 movd m2, [base+round_vals-12+shiftq*2] 1086%if %2 1087 movd m1, [base+pb_1] 1088 movd m3, [base+hmul_bits+2+%3*2] 1089%endif 1090 pxor m0, m0 1091 punpcklwd m2, m0 1092%if %2 1093 punpcklwd m3, m3 1094%endif 1095 pshufd m2, m2, q0000 1096%if %2 1097 pshufd m1, m1, q0000 1098 pshufd m3, m3, q0000 1099 SCRATCH 1, 13, 12 1100%endif 1101 SCRATCH 2, 14, 13 1102%if %2 1103 SCRATCH 3, 15, 14 1104%endif 1105 1106 DEFINE_ARGS buf, bufy, fg_data, h, unused, x 1107%if %2 1108 sub bufq, 82*(73-35*%3)+44-(82*3+41) 1109%else 1110 sub bufq, 82*69+3 1111%endif 1112 add bufyq, 79+82*3 1113 mov hd, 70-35*%3 1114.y_loop_ar3: 1115 mov xq, -(76>>%2) 1116 1117.x_loop_ar3: 1118 movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] 1119 pxor m4, m4 1120 pcmpgtb m4, m0 1121 punpckhbw m3, m0, m4 1122 punpcklbw m0, m4 1123 1124 psrldq m5, m0, 2 1125 psrldq m6, m0, 4 1126 psrldq m7, m0, 6 1127 punpcklwd m4, m0, m5 1128 punpcklwd m6, m7 1129 pmaddwd m4, [rsp+ 0*16] 1130 pmaddwd m6, [rsp+ 1*16] 1131 paddd m4, m6 1132 1133 palignr m2, m3, m0, 10 1134 palignr m3, m0, 12 1135 psrldq m0, 8 1136 1137 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] 1138 pxor m6, m6 1139 pcmpgtb m6, m1 1140 punpckhbw m5, m1, m6 1141 punpcklbw m1, m6 1142 1143 punpcklwd m0, m2 1144 punpcklwd m3, m1 1145 pmaddwd m0, [rsp+ 2*16] 1146 pmaddwd m3, [rsp+ 3*16] 1147 paddd m0, m3 1148 paddd m0, m4 1149 1150 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] 1151 pxor m7, m7 1152 pcmpgtb m7, m2 1153 punpckhbw m6, m2, m7 1154 punpcklbw m2, m7 1155 1156 palignr m3, m5, m1, 10 1157 palignr m5, m1, 12 1158 psrldq m4, m2, 2 1159 1160 punpcklwd m3, m5 1161 punpcklwd m5, m2, m4 1162 pmaddwd m3, [rsp+ 6*16] 1163 pmaddwd m5, m8 1164 paddd m3, m5 1165 paddd m0, m3 1166 1167 psrldq m3, m1, 2 1168 psrldq m4, m1, 4 1169 psrldq m5, m1, 6 1170 psrldq m1, 8 1171 1172 punpcklwd m3, m4 1173 punpcklwd m5, m1 1174 pmaddwd m3, [rsp+ 4*16] 
1175 pmaddwd m5, [rsp+ 5*16] 1176 paddd m3, m5 1177 paddd m0, m3 1178 1179%if %2 1180 movq m1, [bufyq+xq*2] 1181%if %3 1182 movq m3, [bufyq+xq*2+82] 1183%endif 1184 pmaddubsw m7, m13, m1 1185%if %3 1186 pmaddubsw m5, m13, m3 1187 paddw m7, m5 1188%endif 1189 pmulhrsw m7, m15 1190%else 1191 movd m7, [bufyq+xq] 1192 pxor m1, m1 1193 pcmpgtb m1, m7 1194 punpcklbw m7, m1 1195%endif 1196 1197 psrldq m1, m2, 4 1198 psrldq m3, m2, 6 1199 palignr m4, m6, m2, 10 1200 palignr m6, m2, 12 1201 psrldq m2, 8 1202 1203 punpcklwd m1, m3 1204 punpcklwd m2, m4 1205 punpcklwd m6, m7 1206 pmaddwd m1, m9 1207 pmaddwd m2, m10 1208 pmaddwd m6, m11 1209 paddd m1, m2 1210 paddd m0, m6 1211 paddd m0, m1 1212 paddd m0, m14 1213 1214 movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] 1215 pxor m4, m4 1216 movd m5, [base+byte_blend] 1217.x_loop_ar3_inner: 1218 pcmpgtb m2, m4, m1 1219 punpcklbw m3, m1, m2 1220 pmaddwd m2, m3, m12 1221 pshufd m3, m2, q1111 1222 paddd m2, m3 ; left+cur 1223 paddd m2, m0 ; add top 1224 psrldq m0, 4 1225 psrad m2, [fg_dataq+FGData.ar_coeff_shift] 1226 ; don't packssdw, we only care about one value 1227 packsswb m2, m2 1228 pandn m3, m5, m1 1229 pslld m2, 24 1230 pand m2, m5 1231 por m1, m2, m3 1232 movd [bufq+xq-3], m1 1233 psrldq m1, 1 1234 inc xq 1235 jz .x_loop_ar3_end 1236 test xq, 3 1237 jnz .x_loop_ar3_inner 1238 jmp .x_loop_ar3 1239 1240.x_loop_ar3_end: 1241 add bufq, 82 1242 add bufyq, 82<<%3 1243 dec hd 1244 jg .y_loop_ar3 1245 RET 1246%endmacro 1247 1248generate_grain_uv_fn 420, 1, 1 1249generate_grain_uv_fn 422, 1, 0 1250generate_grain_uv_fn 444, 0, 0 1251 1252%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg 1253%assign %%idx 0 1254%define %%tmp %2 1255%if %0 == 6 1256%define %%tmp %6 1257%endif 1258%rep 4 1259%if %%idx == 0 1260 movd %5 %+ d, %2 1261 pshuflw %%tmp, %2, q3232 1262%else 1263 movd %5 %+ d, %%tmp 1264%if %%idx == 2 1265 punpckhqdq %%tmp, %%tmp 1266%elif %%idx == 4 1267 psrlq %%tmp, 32 1268%endif 1269%endif 1270 movzx %4 %+ d, %5 %+ w 1271 
    shr              %5 %+ d, 16
%if %%idx == 0
    movd              %1, [%3+%4]
%else
    pinsrw            %1, [%3+%4], %%idx + 0
%endif
    pinsrw            %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
;
; Applies synthesized luma film grain to a row of blocks:
;   dst = clip(src + round2(scaling[src] * grain_lut[offy+y][offx+x],
;                           fg_data->scaling_shift))
; walking the image in 16-pixel-wide column steps (w counts down via
; 'add wq, 16' below) and h rows per column. Per-block grain offsets are
; derived from an LFSR-style seed update; FGData.overlap_flag and sby
; select the left/top overlap blending paths.
;
; Register/stack conventions differ by arch:
;  - x86-64: all state lives in registers (see DEFINE_ARGS lists).
;  - x86-32: only 7 GPRs are available, so arguments and spilled values
;    are kept in stack slots; when STACK_ALIGNMENT < mmsize, r0m..r8m are
;    re-%defined to point at re-aligned copies made just below.
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov              r0, r0m
    mov              r1, r2m
    mov              r2, r4m
    mov              r3, r6m
    mov              r4, r7m
    mov              r5, r8m

    mov [rsp+6*mmsize+ 3*gprsize], r0
    mov [rsp+6*mmsize+ 5*gprsize], r1
    mov [rsp+6*mmsize+ 7*gprsize], r2
    mov [rsp+6*mmsize+ 9*gprsize], r3
    mov [rsp+6*mmsize+10*gprsize], r4
    mov [rsp+6*mmsize+11*gprsize], r5
%else
cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    mov            srcq, srcm
    mov        fg_dataq, r3m
    mov        scalingq, r5m
%if STACK_ALIGNMENT < mmsize
    ; from here on, rNm refers to the aligned copies stored above
%define r0m [rsp+6*mmsize+ 3*gprsize]
%define r1m [rsp+6*mmsize+ 4*gprsize]
%define r2m [rsp+6*mmsize+ 5*gprsize]
%define r3m [rsp+6*mmsize+ 6*gprsize]
%define r4m [rsp+6*mmsize+ 7*gprsize]
%define r5m [rsp+6*mmsize+ 8*gprsize]
%define r6m [rsp+6*mmsize+ 9*gprsize]
%define r7m [rsp+6*mmsize+10*gprsize]
%define r8m [rsp+6*mmsize+11*gprsize]
%endif
    LEA              r5, pb_mask
%define base r5-pb_mask
    mov             r5m, picptrq
%else
cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea              r7, [pb_mask]
%define base r7-pb_mask
%endif
    ; broadcast per-frame constants:
    ;   m11 = pmulhrsw factor implementing round2(x, scaling_shift)
    ;   m12/m13 = clip max/min (full vs. restricted range)
    ;   m10 = low-word mask (0x00ff x8) used for overlap byte blending
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    movd             m3, [base+mul_bits+r6*2-14]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    pcmpeqw          m2, m2
    psrldq           m2, 14
    movd             m4, [base+max+r6*4]
    movd             m5, [base+min+r6*2]
    punpcklwd        m3, m3
    punpcklwd        m4, m4
    punpcklwd        m5, m5
    pshufd           m3, m3, q0000
    pshufd           m4, m4, q0000
    pshufd           m5, m5, q0000
    ; on x86-32, SCRATCH spills m2..m5 to stack slots aliased as m10..m13;
    ; on x86-64 it is a plain register SWAP
    SCRATCH           2, 10, 0
    SCRATCH           3, 11, 1
    SCRATCH           4, 12, 2
    SCRATCH           5, 13, 3

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif

    mov            sbyd, r8m
    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
    test       overlapd, overlapd
    jz .no_vertical_overlap
    ; overlap blend constants: m14 = pw_1024 rounding, m15 = 27/17 weights
    mova             m6, [base+pw_1024]
    movd             m7, [base+pb_27_17_17_27]
    SCRATCH           6, 14, 4
    SCRATCH           7, 15, 5
    test           sbyd, sbyd
    jnz .vertical_overlap                   ; rows below the first sb row also blend with top
    ; fall-through

.no_vertical_overlap:
    mov             r8m, overlapd
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul           seed, (173 << 24) | 37
%else
    imul           seed, sbyd, (173 << 24) | 37
%endif
    ; per-row seed derivation from sby and FGData.seed
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, unused3
%endif

    ; iterate columns right-to-zero: src points at srcq+wq with wq negative,
    ; and dst is kept as an offset relative to src (dstmp -= srcq)
    lea        src_bakq, [srcq+wq]
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov             r4m, wq
    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
%endif

.loop_x:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; LFSR-style seed update: feedback bit = parity(seeb & seeh),
    ; selected into bit 15 via the parity flag and cmovp
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, unused

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; extract 4-bit offy / offx fields from the seed and form the
    ; grain_lut byte offset (grain rows are 82 bytes apart)
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, unused
%endif

.loop_x_odd:
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src] -- emulated gather, one byte per pixel, masked to 8 bits
    %if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq, r0, r5, m3
    vpgatherdw       m5, m1, scalingq, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq, r12, r13, m3
    vpgatherdw       m5, m1, scalingq, r12, r13, m3
%endif
    pcmpeqw          m3, m3
    psrlw            m3, 8
    pand             m4, m3
    pand             m5, m3

    ; grain = grain_lut[offy+y][offx+x]  (sign-extend s8 grain to words)
    movu             m3, [grain_lutq+offxyq]
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m4
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0                 ; dstq is an offset relative to srcq

    add            srcq, r2mp               ; next picture row
    add      grain_lutq, 82                 ; next grain row
    dec              hd
    jg .loop_y

%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
%else
    lea            srcq, [src_bakq+wq]
%endif
    ; r8m bit 2 toggles between the two 16-px halves of a 32-px block
    btc       dword r8m, 2
    jc .next_blk

    add          offxyd, 16
    test      dword r8m, 2                  ; r8m & 2 = have_top_overlap
    jz .loop_x_odd

%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add            r11d, 16                 ; top_offxyd
%endif
    jnz .loop_x_odd_v_overlap

.next_blk:
    test      dword r8m, 1
    jz .loop_x

    test      dword r8m, 2
    jnz .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3

    add          offxyd, 16                 ; left_offxyd
    mov [rsp+6*mmsize+0*gprsize], offxyd

    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3

    mov            seed, r3m
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
%endif

    ; same LFSR seed update as .loop_x
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq, r0, r5, m3
    vpgatherdw       m5, m1, scalingq, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq, r12, r13, m3
    vpgatherdw       m5, m1, scalingq, r12, r13, m3
%endif
    pcmpeqw          m3, m3
    psrlw            m3, 8
    pand             m4, m3
    pand             m5, m3

    ; grain = grain_lut[offy+y][offx+x]
    ; blend leftmost 2 grain pixels with previous column's grain
    ; (pmaddubsw with 27/17 weights, pmulhrsw by pw_1024 to round),
    ; then merge via the m10 byte mask
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+6*mmsize+0*gprsize]
    movd             m7, [grain_lutq+r5]
%else
    movd             m7, [grain_lutq+left_offxyq]
%endif
    punpcklbw        m7, m3
    pmaddubsw        m6, m15, m7
    pmulhrsw         m6, m14
    packsswb         m6, m6
    pand             m6, m10
    pandn            m7, m10, m3
    por              m6, m7
    pcmpgtb          m2, m6
    punpcklbw        m7, m6, m2
    punpckhbw        m6, m2

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m7, m4
    pmullw           m6, m5
    pmulhrsw         m7, m11
    pmulhrsw         m6, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m7
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hd
    jg .loop_y_h_overlap

%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r1m
    add            srcq, r4m
%else
    lea            srcq, [src_bakq+wq]
%endif
    xor       dword r8m, 4
    add          offxyd, 16

    ; since this half-block had left-overlap, the next does not
    test      dword r8m, 2                  ; have_top_overlap
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add            r11d, 16                 ; top_offxyd
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET

.vertical_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif

    or         overlapd, 2                  ; top_overlap: overlap & 2
    mov             r8m, overlapd
    movzx          sbyd, sbyb
    ; derive two seeds at once, packed as (cur_seed << 16) | top_seed,
    ; by running the scalar derivation on both halves in parallel
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul           tmpd, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add            tmpd, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and            tmpd, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, tmpd
%if ARCH_X86_32
    xor            sbyd, seed               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3
%endif

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov             r4m, wq
    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%endif

.loop_x_v_overlap:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
    ; because of the 'and tmpd, 0x00ff00ff' above
    ; dual LFSR update: advance cur_seed and top_seed independently,
    ; recombining the two parity feedback bits via setp/shl/xor
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, unused, top_offxy

    mov           offyd, seed
    mov           offxd, seed
%endif

    ; extract packed offy/offx pairs for both cur and top blocks
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, unused, top_offxy
%endif

    movzx    top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+6*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

.loop_x_odd_v_overlap:
    ; m8 (64-bit) / [rsp+5*mmsize+8] (32-bit) holds the current
    ; top-blend weight row, starting at pb_27_17
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov   [rsp+5*mmsize+8], r5
%else
    mova             m8, [pb_27_17]
%endif
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq, r0, r5, m3
    vpgatherdw       m5, m1, scalingq, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq, r12, r13, m3
    vpgatherdw       m5, m1, scalingq, r12, r13, m3
%endif
    pcmpeqw          m3, m3
    psrlw            m3, 8
    pand             m4, m3
    pand             m5, m3

    ; grain = grain_lut[offy+y][offx+x]
    ; vertical blend of the full row with the top block's grain row
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+6*mmsize+1*gprsize]
    movu             m7, [grain_lutq+r5]
%else
    movu             m7, [grain_lutq+top_offxyq]
%endif
    punpckhbw        m6, m7, m3
    punpcklbw        m7, m3
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+8]
    pmaddubsw        m3, [r5], m6
    pmaddubsw        m6, [r5], m7
%else
    pmaddubsw        m3, m8, m6
    pmaddubsw        m6, m8, m7
%endif
    pmulhrsw         m3, m14
    pmulhrsw         m6, m14
    packsswb         m6, m3
    pcmpgtb          m7, m2, m6
    punpcklbw        m2, m6, m7
    punpckhbw        m6, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m4
    pmullw           m6, m5
    pmulhrsw         m2, m11
    pmulhrsw         m6, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    ; advance blend weights: 27/17 for row 0, 17/27 for row 1
%if ARCH_X86_32
    add dword [rsp+5*mmsize+8], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
%else
    lea            srcq, [src_bakq+wq]
%endif
    btc       dword r8m, 2
    jc .loop_x_hv_overlap
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap

.loop_x_hv_overlap:
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov   [rsp+5*mmsize+8], r5

    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak

    ; left/topleft offsets are the previous column's cur/top offsets + 16
    mov              r5, [rsp+6*mmsize+1*gprsize]
    mov              r4, offxyd
    add              r5, 16
    add              r4, 16
    mov [rsp+6*mmsize+2*gprsize], r5        ; topleft_offxy
    mov [rsp+6*mmsize+0*gprsize], r4        ; left_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak

    xor            tmpd, tmpd
    mov            seed, r3m
%else
    mova             m8, [pb_27_17]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; dual LFSR update, same as .loop_x_v_overlap
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    lea topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut

    movzx            r5, offxyw             ; top_offxy
    mov [rsp+6*mmsize+1*gprsize], r5
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    movzx    top_offxyd, offxyw
%endif
    shr          offxyd, 16

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+6*mmsize+1*gprsize] ; top_offxy
    mov              r0, [rsp+6*mmsize+0*gprsize] ; left_offxy
    movu             m6, [grain_lutq+r5]
    mov              r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy
    movd             m4, [grain_lutq+r0]
    movd             m7, [grain_lutq+r5]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
    movd             m7, [grain_lutq+topleft_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m4, m3
    punpcklbw        m7, m6
    pmaddubsw        m2, m15, m4
    pmaddubsw        m4, m15, m7
    pmulhrsw         m2, m14
    pmulhrsw         m4, m14
    packsswb         m2, m2
    packsswb         m4, m4
    pand             m2, m10
    pand             m4, m10
    pandn            m7, m10, m3
    pandn            m3, m10, m6
    por              m7, m2
    por              m3, m4
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m4, m3, m7
    punpcklbw        m3, m7
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+8]
    pmaddubsw        m7, [r5], m4
    pmaddubsw        m4, [r5], m3
%else
    pmaddubsw        m7, m8, m4
    pmaddubsw        m4, m8, m3
%endif
    pmulhrsw         m7, m14
    pmulhrsw         m4, m14
    packsswb         m4, m7
    pxor             m2, m2
    pcmpgtb          m7, m2, m4
    punpcklbw        m3, m4, m7
    punpckhbw        m4, m7

    ; src
    mova             m0, [srcq]
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m5, m0, scalingq, r0, r5, m7
    vpgatherdw       m6, m1, scalingq, r0, r5, m7
%else
    vpgatherdw       m5, m0, scalingq, r13, r14, m7
    vpgatherdw       m6, m1, scalingq, r13, r14, m7
%endif
    pcmpeqw          m7, m7
    psrlw            m7, 8
    pand             m5, m7
    pand             m6, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m3, m5
    pmullw           m4, m6
    pmulhrsw         m3, m11
    pmulhrsw         m4, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    ; advance top-blend weights for the second overlap row
%if ARCH_X86_32
    add dword [rsp+5*mmsize+8], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1m
    add            srcq, r4m
%else
    lea            srcq, [src_bakq+wq]
%endif
    xor       dword r8m, 4
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET

%macro FGUV_FN 3 ; name, ss_hor, ss_ver
2076INIT_XMM ssse3 2077%if ARCH_X86_32 2078; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, 2079; sby, luma, lstride, uv_pl, is_id) 2080%if STACK_ALIGNMENT < mmsize 2081DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 2082cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ 2083 tmp, src, scaling, h, fg_data, picptr, unused 2084 mov r0, r0m 2085 mov r1, r2m 2086 mov r2, r4m 2087 mov r3, r6m 2088 mov r4, r7m 2089 mov [rsp+8*mmsize+3*gprsize], r0 2090 mov [rsp+8*mmsize+5*gprsize], r1 2091 mov [rsp+8*mmsize+7*gprsize], r2 2092 mov [rsp+8*mmsize+9*gprsize], r3 2093 mov [rsp+8*mmsize+10*gprsize], r4 2094 2095 mov r0, r8m 2096 mov r1, r9m 2097 mov r2, r10m 2098 mov r4, r11m 2099 mov r3, r12m 2100 mov [rsp+8*mmsize+11*gprsize], r0 2101 mov [rsp+8*mmsize+12*gprsize], r1 2102 mov [rsp+8*mmsize+13*gprsize], r2 2103 mov [rsp+8*mmsize+14*gprsize], r4 2104%else 2105cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ 2106 tmp, src, scaling, h, fg_data, picptr, unused 2107%endif 2108 mov srcq, srcm 2109 mov fg_dataq, r3m 2110 mov scalingq, r5m 2111%if STACK_ALIGNMENT < mmsize 2112%define r0m [rsp+8*mmsize+ 3*gprsize] 2113%define r1m [rsp+8*mmsize+ 4*gprsize] 2114%define r2m [rsp+8*mmsize+ 5*gprsize] 2115%define r3m [rsp+8*mmsize+ 6*gprsize] 2116%define r4m [rsp+8*mmsize+ 7*gprsize] 2117%define r5m [rsp+8*mmsize+ 8*gprsize] 2118%define r6m [rsp+8*mmsize+ 9*gprsize] 2119%define r7m [rsp+8*mmsize+10*gprsize] 2120%define r8m [rsp+8*mmsize+11*gprsize] 2121%define r9m [rsp+8*mmsize+12*gprsize] 2122%define r10m [rsp+8*mmsize+13*gprsize] 2123%define r11m [rsp+8*mmsize+14*gprsize] 2124%define r12m [rsp+8*mmsize+15*gprsize] 2125%endif 2126 LEA r5, pb_mask 2127%define base r5-pb_mask 2128 mov r5m, r5 2129%else 2130cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2131 grain_lut, tmp, sby, luma, lstride, uv_pl, is_id 2132 lea r8, [pb_mask] 2133%define base r8-pb_mask 2134%endif 2135 mov r6d, 
[fg_dataq+FGData.scaling_shift] 2136 pcmpeqw m2, m2 2137 movd m3, [base+mul_bits+r6*2-14] 2138 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2139 lea tmpd, [r6d*2] 2140%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize 2141 test r3, r3 2142%else 2143 cmp dword r12m, 0 ; is_idm 2144%endif 2145 movd m5, [base+min+r6*2] 2146 cmovne r6d, tmpd 2147 movd m4, [base+max+r6*2] 2148 psrldq m2, 14+%2 2149 punpcklwd m3, m3 2150 punpcklwd m5, m5 2151 punpcklwd m4, m4 2152 pshufd m3, m3, q0000 2153 pshufd m5, m5, q0000 2154 pshufd m4, m4, q0000 2155 SCRATCH 2, 10, 0 2156 SCRATCH 3, 11, 1 2157 SCRATCH 4, 12, 2 2158 SCRATCH 5, 13, 3 2159 2160 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2161 jne .csfl 2162 2163%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 2164%if ARCH_X86_32 2165 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2166%else 2167 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2168%endif 2169 2170%if %1 2171 mov r6d, dword r11m 2172 movd m0, [fg_dataq+FGData.uv_mult+r6*4] 2173 movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2174 punpcklbw m6, m1, m0 2175 movd m7, [fg_dataq+FGData.uv_offset+r6*4] 2176 punpcklwd m6, m6 2177 punpcklwd m7, m7 2178 pshufd m6, m6, q0000 2179 pshufd m7, m7, q0000 2180 SCRATCH 6, 14, 4 2181 SCRATCH 7, 15, 5 2182%endif 2183 2184 mov sbyd, r8m 2185 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 2186 test overlapd, overlapd 2187 jz %%no_vertical_overlap 2188%if ARCH_X86_32 2189%if %2 2190 movd m1, [base+pb_23_22] 2191%else 2192 movd m1, [base+pb_27_17_17_27] 2193%endif 2194 mova m0, [base+pw_1024] 2195%else 2196%if %2 2197 movd m1, [pb_23_22] 2198%else 2199 movd m1, [pb_27_17_17_27] 2200%endif 2201 mova m0, [pw_1024] 2202%endif 2203 pshufd m1, m1, q0000 2204 SCRATCH 0, 8, 6 2205 SCRATCH 1, 9, 7 2206 test sbyd, sbyd 2207 jnz %%vertical_overlap 2208 ; fall-through 2209 2210%%no_vertical_overlap: 2211 mov r8m, overlapd 2212%if ARCH_X86_32 2213 DEFINE_ARGS 
dst, src, scaling, see, fg_data, picptr, overlap 2214 imul seed, (173 << 24) | 37 2215%else 2216 imul seed, sbyd, (173 << 24) | 37 2217%endif 2218 add seed, (105 << 24) | 178 2219 rol seed, 8 2220 movzx seed, seew 2221 xor seed, [fg_dataq+FGData.seed] 2222 2223%if ARCH_X86_32 2224 mov r3m, seed 2225 2226 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2227%define luma_bakq lumaq 2228 2229 mov wq, r4m 2230%if %3 2231 shl r10mp, 1 2232%endif 2233%else 2234 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2235 unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak 2236 2237 mov lstrideq, r10mp 2238%endif 2239 2240 mov lumaq, r9mp 2241 lea src_bakq, [srcq+wq] 2242 lea luma_bakq, [lumaq+wq*(1+%2)] 2243 neg wq 2244 sub r0mp, srcq 2245%if ARCH_X86_32 2246 mov r1m, src_bakq 2247 mov r11m, luma_bakq 2248 mov r4m, wq 2249 2250 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2251%else 2252 mov r11mp, src_bakq 2253 mov r12mp, strideq 2254%endif 2255 2256%%loop_x: 2257%if ARCH_X86_32 2258 mov seed, r3m 2259%endif 2260 mov r6d, seed 2261 or seed, 0xEFF4 2262 shr r6d, 1 2263 test seeb, seeh 2264 lea seed, [r6+0x8000] 2265 cmovp seed, r6d ; updated seed 2266%if ARCH_X86_32 2267 mov r3m, seed 2268 2269 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2270 2271 mov offxd, offyd 2272%else 2273 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2274 offx, offy, see, overlap, unused1, unused2, lstride 2275 2276 mov offyd, seed 2277 mov offxd, seed 2278%endif 2279 ror offyd, 8 2280 shr offxd, 12 2281 and offyd, 0xf 2282 imul offyd, 164>>%3 2283 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx 2284 2285%if ARCH_X86_32 2286 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2287%else 2288 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2289 h, offxy, see, overlap, unused1, unused2, lstride, luma_bak 2290%endif 2291 2292%%loop_x_odd: 2293 mov hd, r7m 2294 mov grain_lutq, 
grain_lutmp 2295%%loop_y: 2296 ; src 2297%if ARCH_X86_32 2298 mov lumaq, r9mp 2299%endif 2300%if %2 2301 mova m4, [lumaq+ 0] 2302 mova m6, [lumaq+16] 2303 mova m0, [srcq] 2304%if ARCH_X86_32 2305 add lumaq, r10mp 2306 mov r9mp, lumaq 2307 mov r5, r5m 2308 movd m7, [base+pb_1] 2309%else 2310 movd m7, [pb_1] 2311%endif 2312 pshufd m7, m7, q0000 2313 pxor m2, m2 2314 pmaddubsw m4, m7 2315 pmaddubsw m6, m7 2316 pavgw m4, m2 2317 pavgw m6, m2 2318%else 2319 mova m4, [lumaq] 2320 mova m0, [srcq] 2321%if ARCH_X86_32 2322 add lumaq, r10mp 2323 mov r9mp, lumaq 2324%endif 2325 pxor m2, m2 2326%endif 2327 2328%if %1 2329%if %2 2330 packuswb m4, m6 ; luma 2331%endif 2332 punpckhbw m6, m4, m0 2333 punpcklbw m4, m0 ; { luma, chroma } 2334 pmaddubsw m6, m14 2335 pmaddubsw m4, m14 2336 psraw m6, 6 2337 psraw m4, 6 2338 paddw m6, m15 2339 paddw m4, m15 2340 packuswb m4, m6 ; pack+unpack = clip 2341 punpckhbw m6, m4, m2 2342 punpcklbw m4, m2 2343%elif %2 == 0 2344 punpckhbw m6, m4, m2 2345 punpcklbw m4, m2 2346%endif 2347 2348 ; scaling[luma_src] 2349%if ARCH_X86_32 2350 vpgatherdw m7, m4, scalingq, r0, r5 2351 vpgatherdw m5, m6, scalingq, r0, r5 2352%else 2353 vpgatherdw m7, m4, scalingq, r12, r2 2354 vpgatherdw m5, m6, scalingq, r12, r2 2355%endif 2356 pcmpeqw m1, m1 2357 psrlw m1, 8 2358 pand m7, m1 2359 pand m5, m1 2360 2361 ; unpack chroma_source 2362 punpckhbw m1, m0, m2 2363 punpcklbw m0, m2 ; m0-1: src as word 2364 2365 ; grain = grain_lut[offy+y][offx+x] 2366 movu m3, [grain_lutq+offxyq+ 0] 2367 pcmpgtb m6, m2, m3 2368 punpcklbw m2, m3, m6 2369 punpckhbw m3, m6 2370 2371 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2372 pmullw m2, m7 2373 pmullw m3, m5 2374 pmulhrsw m2, m11 2375 pmulhrsw m3, m11 2376 2377%if ARCH_X86_32 2378 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2379%endif 2380 2381 ; dst = clip_pixel(src, noise) 2382 paddw m0, m2 2383 paddw m1, m3 2384 pmaxsw m0, m13 2385 pmaxsw m1, m13 2386 pminsw m0, m12 2387 pminsw m1, m12 2388 
packuswb m0, m1 2389 movifnidn dstq, dstmp 2390 mova [dstq+srcq], m0 2391 2392%if ARCH_X86_32 2393 add srcq, r2mp 2394 ; we already incremented lumaq above 2395%else 2396 add srcq, r12mp 2397%if %3 2398 lea lumaq, [lumaq+lstrideq*2] 2399%else 2400 add lumaq, lstrideq 2401%endif 2402%endif 2403 add grain_lutq, 82 2404 dec hw 2405 jg %%loop_y 2406 2407%if ARCH_X86_32 2408 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2409 2410 mov wq, r4m 2411%endif 2412 add wq, 16 2413 jge %%end 2414%if ARCH_X86_32 2415 mov srcq, r1mp 2416 mov lumaq, r11mp 2417%else 2418 mov srcq, r11mp 2419%endif 2420 lea lumaq, [luma_bakq+wq*(1+%2)] 2421 add srcq, wq 2422%if ARCH_X86_32 2423 mov r4m, wq 2424 mov r9m, lumaq 2425%endif 2426%if %2 == 0 2427 ; adjust top_offxy 2428%if ARCH_X86_32 2429 add dword [rsp+8*mmsize+1*gprsize], 16 2430%else 2431 add r11d, 16 2432%endif 2433 add offxyd, 16 2434 btc dword r8m, 2 2435 jc %%loop_x_even 2436 test dword r8m, 2 2437 jz %%loop_x_odd 2438 jmp %%loop_x_odd_v_overlap 2439%%loop_x_even: 2440%endif 2441 test dword r8m, 1 2442 jz %%loop_x 2443 2444 ; r8m = sbym 2445 test dword r8m, 2 2446 jne %%loop_x_hv_overlap 2447 2448 ; horizontal overlap (without vertical overlap) 2449%%loop_x_h_overlap: 2450%if ARCH_X86_32 2451%if %2 2452 lea r6, [offxyd+16] 2453 mov [rsp+8*mmsize+0*gprsize], r6 2454%else 2455 mov [rsp+8*mmsize+0*gprsize], offxyd 2456%endif 2457 2458 DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut 2459 2460 mov seed, r3m 2461%else 2462 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2463 offx, offy, see, left_offxy, unused1, unused2, lstride 2464 2465%if %2 2466 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2467%else 2468 mov left_offxyd, offyd 2469%endif 2470%endif 2471 mov r6d, seed 2472 or seed, 0xEFF4 2473 shr r6d, 1 2474 test seeb, seeh 2475 lea seed, [r6+0x8000] 2476 cmovp seed, r6d ; updated seed 2477 2478%if ARCH_X86_32 2479 mov r3m, seed 2480 2481 DEFINE_ARGS luma, src, scaling, offy, 
w, picptr, offx 2482 2483 mov offxd, offyd 2484%else 2485 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2486 offx, offy, see, left_offxy, unused1, unused2, lstride 2487 2488 mov offyd, seed 2489 mov offxd, seed 2490%endif 2491 ror offyd, 8 2492 shr offxd, 12 2493 and offyd, 0xf 2494 imul offyd, 164>>%3 2495 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2496 2497%if ARCH_X86_32 2498 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2499%else 2500 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2501 h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak 2502%endif 2503 2504 mov hd, r7m 2505 mov grain_lutq, grain_lutmp 2506%%loop_y_h_overlap: 2507 ; src 2508%if ARCH_X86_32 2509 mov lumaq, r9mp 2510%endif 2511%if %2 2512 mova m4, [lumaq+ 0] 2513 mova m6, [lumaq+16] 2514 mova m0, [srcq] 2515%if ARCH_X86_32 2516 add lumaq, r10mp 2517 mov r9mp, lumaq 2518 mov r5, r5m 2519 movd m7, [base+pb_1] 2520%else 2521 movd m7, [pb_1] 2522%endif 2523 pshufd m7, m7, q0000 2524 pxor m2, m2 2525 pmaddubsw m4, m7 2526 pmaddubsw m6, m7 2527 pavgw m4, m2 2528 pavgw m6, m2 2529%else 2530 mova m4, [lumaq] 2531 mova m0, [srcq] 2532%if ARCH_X86_32 2533 add lumaq, r10mp 2534 mov r9mp, lumaq 2535%endif 2536 pxor m2, m2 2537%endif 2538 2539%if %1 2540%if %2 2541 packuswb m4, m6 ; luma 2542%endif 2543 punpckhbw m6, m4, m0 2544 punpcklbw m4, m0 ; { luma, chroma } 2545 pmaddubsw m6, m14 2546 pmaddubsw m4, m14 2547 psraw m6, 6 2548 psraw m4, 6 2549 paddw m6, m15 2550 paddw m4, m15 2551 packuswb m4, m6 ; pack+unpack = clip 2552 punpckhbw m6, m4, m2 2553 punpcklbw m4, m2 2554%elif %2 == 0 2555 punpckhbw m6, m4, m2 2556 punpcklbw m4, m2 2557%endif 2558 2559 ; scaling[luma_src] 2560%if ARCH_X86_32 2561 vpgatherdw m7, m4, scalingq, r0, r5 2562 vpgatherdw m5, m6, scalingq, r0, r5 2563%else 2564 vpgatherdw m7, m4, scalingq, r12, r2 2565 vpgatherdw m5, m6, scalingq, r12, r2 2566%endif 2567 pcmpeqw m1, m1 2568 psrlw m1, 8 2569 pand 
m7, m1 2570 pand m5, m1 2571 2572 ; unpack chroma_source 2573 punpckhbw m1, m0, m2 2574 punpcklbw m0, m2 ; m0-1: src as word 2575 2576 ; grain = grain_lut[offy+y][offx+x] 2577 movu m3, [grain_lutq+offxyq+ 0] 2578%if ARCH_X86_32 2579 mov r0, [rsp+8*mmsize+0*gprsize] 2580 movd m4, [grain_lutq+r0+ 0] 2581%else 2582 movd m4, [grain_lutq+left_offxyq+ 0] 2583%endif 2584 punpcklbw m2, m4, m3 2585 pmaddubsw m4, m9, m2 2586 pmulhrsw m4, m8 2587 packsswb m4, m4 2588 pand m4, m10 2589 pandn m2, m10, m3 2590 por m3, m4, m2 2591 pxor m4, m4 2592 pcmpgtb m4, m3 2593 punpcklbw m2, m3, m4 2594 punpckhbw m3, m4 2595 2596 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2597 pmullw m2, m7 2598 pmullw m3, m5 2599 pmulhrsw m2, m11 2600 pmulhrsw m3, m11 2601 2602%if ARCH_X86_32 2603 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2604%endif 2605 2606 ; dst = clip_pixel(src, noise) 2607 paddw m0, m2 2608 paddw m1, m3 2609 pmaxsw m0, m13 2610 pmaxsw m1, m13 2611 pminsw m0, m12 2612 pminsw m1, m12 2613 packuswb m0, m1 2614 movifnidn dstq, dstmp 2615 mova [dstq+srcq], m0 2616 2617%if ARCH_X86_32 2618 add srcq, r2mp 2619 ; lumaq has already been incremented above 2620%else 2621 add srcq, r12mp 2622%if %3 2623 lea lumaq, [lumaq+lstrideq*2] 2624%else 2625 add lumaq, lstrideq 2626%endif 2627%endif 2628 add grain_lutq, 82 2629 dec hw 2630 jg %%loop_y_h_overlap 2631 2632%if ARCH_X86_32 2633 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2634 2635 mov wq, r4m 2636%endif 2637 add wq, 16 2638 jge %%end 2639%if ARCH_X86_32 2640 mov srcq, r1mp 2641 mov lumaq, r11mp 2642%else 2643 mov srcq, r11mp 2644%endif 2645 lea lumaq, [luma_bakq+wq*(1+%2)] 2646 add srcq, wq 2647%if ARCH_X86_32 2648 mov r4m, wq 2649 mov r9m, lumaq 2650%endif 2651%if %2 == 0 2652 xor dword r8m, 4 2653 ; adjust top_offxyd 2654%if ARCH_X86_32 2655 add dword [rsp+8*mmsize+1*gprsize], 16 2656%else 2657 add r11d, 16 2658%endif 2659 add offxyd, 16 2660%endif 2661 2662 ; r8m = sbym 2663 test dword r8m, 2 
;-----------------------------------------------------------------------------
; Tail of %%FGUV_32x32xN_LOOP (chroma film-grain application, SSSE3).
; Macro parameters (from the FGUV_FN instantiations at the bottom of the file:
; 420 -> 1,1,1 / 422 -> 1,1,0 / 444 -> 1,0,0 after FGUV_FN prepends csfl):
;   %1 = apply uv_mult/uv_offset luma blend before the scaling lookup
;        (the .csfl instantiation passes 0 and indexes scaling[] by luma only)
;   %2 = horizontal chroma subsampling (ss_hor), %3 = vertical (ss_ver)
; NOTE(review): this fragment starts mid-macro; m8-m15 (rounding constant,
; overlap coefficients, byte mask, scaling-shift multiplier, uv_mult/uv_offset,
; min/max clip words) are presumed loaded in the part of the macro above this
; fragment - confirm against the full file.
;-----------------------------------------------------------------------------
; Dispatch at the end of the h-overlap x-loop above: with horizontal
; subsampling each 32x32 block is two 16-wide passes, so fall into the
; hv-overlap loop; otherwise alternate between the odd/even column loops.
%if %2
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap
%else
    jne %%loop_x_odd_v_overlap
    jmp %%loop_x_odd
%endif

%%end:
    RET

; Entry for rows with top-edge (vertical) overlap. Derives the per-row grain
; seed pair for the current and the above block row from fg_data->seed and
; the superblock row index, packed as (cur_seed << 16) | top_seed.
%%vertical_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif

    or             overlapd, 2                  ; top_overlap: overlap & 2
    mov                 r8m, overlapd
    movzx              sbyd, sbyb
%if ARCH_X86_32
    imul                 r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul               seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    ; Hash sby and sby+1 into the two 16-bit seed halves simultaneously by
    ; multiplying with the constants replicated into both halves of a dword.
    imul               tmpd, sbyd, 173 * 0x00010001
    imul               sbyd, 37 * 0x01000100
    add                tmpd, (105 << 16) | 188
    add                sbyd, (178 << 24) | (141 << 8)
    and                tmpd, 0x00ff00ff
    and                sbyd, 0xff00ff00
    xor                seed, tmpd
%if ARCH_X86_32
    xor                sbyd, seed               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak

    mov                 r3m, seed
    mov                  wq, r4m
%if %3
    shl               r10mp, 1
%endif
%else
    xor                seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak

    mov            lstrideq, r10mp
%endif

    ; Walk pointers to the row end and iterate with a negative w offset so a
    ; single `add wq, 16 / jge` terminates the column loop.
    mov               lumaq, r9mp
    lea            src_bakq, [srcq+wq]
    lea           luma_bakq, [lumaq+wq*(1+%2)]  ; luma advances 2x per chroma px if ss_hor
    neg                  wq
    sub                r0mp, srcq               ; dst kept as delta from src
%if ARCH_X86_32
    mov                 r1m, src_bakq
    mov                r11m, luma_bakq
    mov                 r4m, wq

    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
    mov               r11mp, src_bakq
    mov               r12mp, strideq
%endif

%%loop_x_v_overlap:
%if ARCH_X86_32
    mov                seed, r3m
    xor                tmpd, tmpd
%endif
    ; Advance both 16-bit seeds one LFSR step at once: the parity of the
    ; masked bits (computed via test/setp) becomes the bit rotated in.
    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
    mov                 r6d, seed
    or                 seed, 0xeff4eff4
    test               seeb, seeh
    setp               tmpb                     ; parity of top_seed
    shr                seed, 16
    shl                tmpd, 16
    test               seeb, seeh
    setp               tmpb                     ; parity of cur_seed
    or                  r6d, 0x00010001
    xor                tmpd, r6d
    mov                seed, tmpd
    ror                seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov                 r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov               offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    mov               offxd, seed
    mov               offyd, seed
%endif
    ; Extract 4-bit x/y grain offsets for both seeds and fold them into a
    ; single byte offset into the 82-byte-wide grain LUT rows.
    ror               offyd, 8
    ror               offxd, 12
    and               offyd, 0xf000f
    and               offxd, 0xf000f
    imul              offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea               offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
%endif

    movzx        top_offxyd, offxyw
    shr              offxyd, 16
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd    ; spill: no free GPR on x86_32

    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif

%%loop_x_odd_v_overlap:
    mov                  hd, r7m
    mov          grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov                  r5, r5m
    mova                 m1, [base+pb_27_17]    ; first-row vertical blend weights
%else
    mova                 m1, [pb_27_17]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
    mov               lumaq, r9mp
%endif
%if %2
    ; ss_hor: average each horizontal luma pair down to chroma resolution:
    ; pmaddubsw with pb_1 sums pairs, pavgw against 0 gives (a+b+1)>>1.
    mova                 m4, [lumaq+ 0]
    mova                 m6, [lumaq+16]
    mova                 m0, [srcq]
%if ARCH_X86_32
    add               lumaq, r10mp
    mov                r9mp, lumaq
    mov                  r5, r5m
    movd                 m7, [base+pb_1]
%else
    movd                 m7, [pb_1]
%endif
    pshufd               m7, m7, q0000
    pxor                 m2, m2
    pmaddubsw            m4, m7
    pmaddubsw            m6, m7
    pavgw                m4, m2
    pavgw                m6, m2
%else
    mova                 m4, [lumaq]
    mova                 m0, [srcq]
%if ARCH_X86_32
    add               lumaq, r10mp
    mov                r9mp, lumaq
%endif
    pxor                 m2, m2
%endif

%if %1
    ; Blend luma with chroma via uv_mult/uv_offset (m14/m15 - set up above
    ; this fragment) to form the scaling-table index.
%if %2
    packuswb             m4, m6                 ; luma
%endif
    punpckhbw            m6, m4, m0
    punpcklbw            m4, m0                 ; { luma, chroma }
    pmaddubsw            m6, m14
    pmaddubsw            m4, m14
    psraw                m6, 6
    psraw                m4, 6
    paddw                m6, m15
    paddw                m4, m15
    packuswb             m4, m6                 ; pack+unpack = clip
    punpckhbw            m6, m4, m2
    punpcklbw            m4, m2
%elif %2 == 0
    punpckhbw            m6, m4, m2
    punpcklbw            m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw           m7, m4, scalingq, r0, r5
    vpgatherdw           m5, m6, scalingq, r0, r5
%else
    vpgatherdw           m7, m4, scalingq, r12, r2
    vpgatherdw           m5, m6, scalingq, r12, r2
%endif
    ; The emulated gather loads words; keep only the low byte of each.
    pcmpeqw              m4, m4
    psrlw                m4, 8
    pand                 m7, m4
    pand                 m5, m4

    ; grain = grain_lut[offy+y][offx+x]
    movu                 m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov                  r0, [rsp+8*mmsize+1*gprsize]
    movu                 m4, [grain_lutq+r0]
%else
    movu                 m4, [grain_lutq+top_offxyq]
%endif
    ; Vertical overlap: weighted blend of top-block and current grain with
    ; 27/17 (or 23/22 in m9 when ss_ver) coefficients, rounded by pmulhrsw.
    punpckhbw            m6, m4, m3
    punpcklbw            m4, m3
%if %3
    pmaddubsw            m2, m9, m6
    pmaddubsw            m3, m9, m4
%else
    pmaddubsw            m2, m1, m6
    pmaddubsw            m3, m1, m4
%endif
    pmulhrsw             m2, m8
    pmulhrsw             m3, m8
    packsswb             m3, m2
    pxor                 m6, m6
    pcmpgtb              m6, m3
    punpcklbw            m2, m3, m6             ; sign-extend grain bytes to words
    punpckhbw            m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw               m2, m7
    pmullw               m3, m5
    pmulhrsw             m2, m11
    pmulhrsw             m3, m11

    ; unpack chroma_source
    pxor                 m4, m4
    punpckhbw            m6, m0, m4
    punpcklbw            m0, m4                 ; m0-1: src as word

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw                m0, m2
    paddw                m6, m3
    pmaxsw               m0, m13
    pmaxsw               m6, m13
    pminsw               m0, m12
    pminsw               m6, m12
    packuswb             m0, m6
    movifnidn          dstq, dstmp
    mova        [dstq+srcq], m0                 ; dst was pre-biased by -src above

    dec                  hw
    je %%end_y_v_overlap
%if ARCH_X86_32
    add                srcq, r2mp
    ; lumaq has already been incremented above
%else
    add                srcq, r12mp
%if %3
    lea               lumaq, [lumaq+lstrideq*2]
%else
    add               lumaq, lstrideq
%endif
%endif
    add          grain_lutq, 82                 ; grain LUT stride is 82 bytes
%if %3 == 0
    ; Without vertical subsampling the top overlap spans two rows: toggle
    ; bit 16 of h as a one-shot flag and rerun with the 17/27 weights.
    btc                  hd, 16
%if ARCH_X86_32
    mov                  r5, r5m
    mova                 m1, [base+pb_17_27]
%else
    mova                 m1, [pb_17_27]
%endif
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov                  wq, r4m
%endif
    add                  wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov                srcq, r1mp
    mov               lumaq, r11mp
%else
    mov                srcq, r11mp
%endif
    lea               lumaq, [luma_bakq+wq*(1+%2)]
    add                srcq, wq
%if ARCH_X86_32
    mov                 r4m, wq
    mov                 r9m, lumaq
%endif

%if %2
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
    ; 444: alternate odd/even 16-px columns within a 32-px grain block;
    ; bit 2 of the overlap flags tracks which half we are on.
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add          top_offxyd, 16
%endif
    add              offxyd, 16
    btc           dword r8m, 2
    jnc %%loop_x_odd_v_overlap
%endif

; Blocks with both a left and a top neighbour: blend horizontally first,
; then vertically, so the top-left corner is handled consistently.
%%loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

    mov                  r6, [rsp+8*mmsize+1*gprsize]
%if %2
    lea                  r0, [r3d+16]
    add                  r6, 16
    mov [rsp+8*mmsize+0*gprsize], r0            ; left_offxy
%else
    mov [rsp+8*mmsize+0*gprsize], r3            ; left_offxy
%endif
    mov [rsp+8*mmsize+2*gprsize], r6            ; topleft_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

    mov                seed, r3m
    xor                tmpd, tmpd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

%if %2
    lea      topleft_offxyq, [top_offxyq+16]
    lea         left_offxyq, [offxyq+16]
%else
    mov      topleft_offxyq, top_offxyq
    mov         left_offxyq, offxyq
%endif

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; Same dual 16-bit LFSR seed update as in %%loop_x_v_overlap.
    mov                 r6d, seed
    or                 seed, 0xeff4eff4
    test               seeb, seeh
    setp               tmpb                     ; parity of top_seed
    shr                seed, 16
    shl                tmpd, 16
    test               seeb, seeh
    setp               tmpb                     ; parity of cur_seed
    or                  r6d, 0x00010001
    xor                tmpd, r6d
    mov                seed, tmpd
    ror                seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov                 r3m, seed

    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx

    mov               offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov               offxd, seed
    mov               offyd, seed
%endif
    ror               offyd, 8
    ror               offxd, 12
    and               offyd, 0xf000f
    and               offxd, 0xf000f
    imul              offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea               offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif

    movzx        top_offxyd, offxyw
    shr              offxyd, 16
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd
%endif

    mov                  hd, r7m
    mov          grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov                  r5, r5m
    mova                 m3, [base+pb_27_17]    ; first-row vertical blend weights
%else
    mova                 m3, [pb_27_17]
%endif
%%loop_y_hv_overlap:
    ; src
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov               lumaq, r9mp
%endif
%if %2
    ; ss_hor: pairwise-average luma to chroma resolution (see v-overlap loop).
    mova                 m4, [lumaq+ 0]
    mova                 m6, [lumaq+16]
    mova                 m0, [srcq]
%if ARCH_X86_32
    add               lumaq, r10mp
    mov                r9mp, lumaq
    mov                  r5, r5m
    movd                 m7, [base+pb_1]
%else
    movd                 m7, [pb_1]
%endif
    pshufd               m7, m7, q0000
    pxor                 m2, m2
    pmaddubsw            m4, m7
    pmaddubsw            m6, m7
    pavgw                m4, m2
    pavgw                m6, m2
%else
    mova                 m4, [lumaq]
    mova                 m0, [srcq]
%if ARCH_X86_32
    add               lumaq, r10mp
    mov                r9mp, lumaq
%endif
    pxor                 m2, m2
%endif

%if %1
    ; uv_mult/uv_offset blend for the scaling index (see v-overlap loop).
%if %2
    packuswb             m4, m6                 ; luma
%endif
    punpckhbw            m6, m4, m0
    punpcklbw            m4, m0                 ; { luma, chroma }
    pmaddubsw            m6, m14
    pmaddubsw            m4, m14
    psraw                m6, 6
    psraw                m4, 6
    paddw                m6, m15
    paddw                m4, m15
    packuswb             m4, m6                 ; pack+unpack = clip
    punpckhbw            m6, m4, m2
    punpcklbw            m4, m2
%elif %2 == 0
    punpckhbw            m6, m4, m2
    punpcklbw            m4, m2
%endif

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw           m7, m4, scalingq, r0, r5
    vpgatherdw           m5, m6, scalingq, r0, r5
%else
    movd                 m1, [grain_lutq+topleft_offxyq]
%if %3
    vpgatherdw           m7, m4, scalingq, r2, r12
    vpgatherdw           m5, m6, scalingq, r2, r12
%else
    vpgatherdw           m7, m4, scalingq, r2, r13
    vpgatherdw           m5, m6, scalingq, r2, r13
%endif
%endif
    pcmpeqw              m2, m2
    psrlw                m2, 8
    pand                 m7, m2
    pand                 m5, m2

    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov                  r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    mov                  r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
    movd                 m1, [grain_lutq+r0]
    mov                  r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
    movu                 m2, [grain_lutq+offxyq]
%if ARCH_X86_32
    movu                 m6, [grain_lutq+r5]
    movd                 m4, [grain_lutq+r0]
%else
    movu                 m6, [grain_lutq+top_offxyq]
    movd                 m4, [grain_lutq+left_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw            m1, m6
    punpcklbw            m4, m2
%if %2
    punpcklwd            m4, m1
%else
    punpckldq            m4, m1
%endif
    pmaddubsw            m1, m9, m4
    pmulhrsw             m1, m8
    packsswb             m1, m1
    ; Splice the blended edge pixel(s) back into the top/current grain rows
    ; via the byte mask in m10, leaving the remaining lanes untouched.
    pandn                m4, m10, m2
    pandn                m2, m10, m6
    psrldq               m6, m1, 2-%2
    pand                 m1, m10
    pand                 m6, m10
    por                  m4, m1
    por                  m2, m6
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw            m1, m2, m4
    punpcklbw            m2, m4
%if %3
    pmaddubsw            m4, m9, m1
    pmaddubsw            m1, m9, m2
%else
    pmaddubsw            m4, m3, m1
    pmaddubsw            m1, m3, m2
%endif
    pmulhrsw             m4, m8
    pmulhrsw             m1, m8
    packsswb             m1, m4
    pxor                 m4, m4
    pcmpgtb              m4, m1
    punpcklbw            m2, m1, m4             ; sign-extend grain bytes to words
    punpckhbw            m1, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw               m2, m7
    pmullw               m1, m5
    pmulhrsw             m2, m11
    pmulhrsw             m1, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; unpack chroma source
    pxor                 m4, m4
    punpckhbw            m5, m0, m4
    punpcklbw            m0, m4                 ; m0-1: src as word

    ; dst = clip_pixel(src, noise)
    paddw                m0, m2
    paddw                m5, m1
    pmaxsw               m0, m13
    pmaxsw               m5, m13
    pminsw               m0, m12
    pminsw               m5, m12
    packuswb             m0, m5
    movifnidn          dstq, dstmp
    mova        [dstq+srcq], m0

%if ARCH_X86_32
    add                srcq, r2mp
    ; lumaq has been adjusted above already
%else
    add                srcq, r12mp
%if %3
    lea               lumaq, [lumaq+lstrideq*(1+%2)]
%else
    add               lumaq, r10mp
%endif
%endif
    add          grain_lutq, 82
    dec                  hw
%if %3
    ; ss_ver: the overlap region is a single row; remaining rows use the
    ; plain h-overlap loop.
    jg %%loop_y_h_overlap
%else
    jle %%end_y_hv_overlap
    ; Second overlap row with swapped 17/27 weights (bit-16-of-h trick).
%if ARCH_X86_32
    mov                  r5, r5m
    mova                 m3, [base+pb_17_27]
%else
    mova                 m3, [pb_17_27]
%endif
    btc                  hd, 16
    jnc %%loop_y_hv_overlap
%if ARCH_X86_64
    mov            lstrideq, r10mp
%endif
    jmp %%loop_y_h_overlap
%%end_y_hv_overlap:
%if ARCH_X86_64
    mov            lstrideq, r10mp
%endif
%endif

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov                  wq, r4m
%endif
    add                  wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov                srcq, r1mp
    mov               lumaq, r11mp
%else
    mov                srcq, r11mp
%endif
    lea               lumaq, [luma_bakq+wq*(1+%2)]
    add                srcq, wq
%if ARCH_X86_32
    mov                 r4m, wq
    mov                 r9m, lumaq
%endif
%if %2
    jmp %%loop_x_hv_overlap
%else
    ; 444: move to the other 16-px half of the 32-px block (flag bit 2).
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add          top_offxyd, 16
%endif
    add              offxyd, 16
    xor           dword r8m, 4
    jmp %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro

    ; Two instantiations of the pixel loop: %1=1 applies the uv_mult/uv_offset
    ; luma blend; the .csfl entry (chroma scaling from luma) skips it.
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro

; Instantiate fguv_32x32xn for each chroma layout: FGUV_FN ss, ss_hor, ss_ver.
FGUV_FN 420, 1, 1

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 422, 1, 0

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 444, 0, 0