1; Copyright © 2019-2021, VideoLAN and dav1d authors 2; Copyright © 2019, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; Constant tables for grain generation: rounding/multiplier tables indexed by
; the various shift parameters, blend masks, and LFSR helper masks.
pw_1024: times 8 dw 1024
pb_27_17_17_27: db 27, 17, 17, 27
                times 6 db 0, 32
pb_23_22_h: db 23, 22
            times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1

; Emit a table of dword offsets (relative to the table itself) to the .ar0,
; .ar1, ... entry points of function %1_8bpc_%2; the AR dispatch below indexes
; this table by FGData.ar_coeff_lag.
%macro JMP_TABLE 2-*
    %xdefine %1_8bpc_%2_table %%table
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3

; Film grain parameter block; NOTE(review): field order/sizes presumably
; mirror the C-side film grain data struct — confirm against the C headers.
struc FGData
    .seed:                      resd 1
    .num_y_points:              resd 1
    .y_points:                  resb 14 * 2
    .chroma_scaling_from_luma:  resd 1
    .num_uv_points:             resd 2
    .uv_points:                 resb 2 * 10 * 2
    .scaling_shift:             resd 1
    .ar_coeff_lag:              resd 1
    .ar_coeffs_y:               resb 24
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    .ar_coeff_shift:            resq 1
    .grain_scale_shift:         resd 1
    .uv_mult:                   resd 2
    .uv_luma_mult:              resd 2
    .uv_offset:                 resd 2
    .overlap_flag:              resd 1
    .clip_to_restricted_range:  resd 1
endstruc

cextern gaussian_sequence

SECTION .text

; REPX {expr with x}, a, b, ... — apply the expression to each argument in turn.
%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro

%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif

; SCRATCH src, dst, slot:
; x86-32 has only 8 XMM registers, so spill m%1 to stack slot %3 and alias
; m%2 to that memory location; on x86-64 (16 regs) just swap the registers.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; generate_grain_y_8bpc(buf, fg_data)
; Fills the luma grain buffer (73 rows with stride 82, per the 82*73 constants
; below) with Gaussian noise: a 4-lane LFSR produces seeds used to index
; gaussian_sequence, scaled by grain_scale_shift. Then jumps through the
; JMP_TABLE entry selected by ar_coeff_lag to apply the auto-regressive
; filter of that order (.ar0 = none).
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
    LEA      r4, $$
%define base r4-$$
    movq     m1, [base+rnd_next_upperbit_mask]
    movq     m4, [base+mul_bits]
    movq     m7, [base+hmul_bits]
    mov      r2d, [fg_dataq+FGData.grain_scale_shift]
    movd     m2, [base+round+r2*2]
    movd     m0, [fg_dataq+FGData.seed]
    mova     m5, [base+pb_mask]
    pshuflw  m2, m2, q0000
    pshuflw  m0, m0, q0000
    mov      r2, -73*82                 ; negative offset; bufq biased so r2 counts up to 0
    sub      bufq, r2
    lea      r3, [base+gaussian_sequence]
.loop:
    ; advance the LFSR and emit 4 grain bytes per iteration
    pand     m6, m0, m1
    psrlw    m3, m6, 10
    por      m6, m3                     ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw   m6, m4                     ; bits 0x0f00 are set
    pshufb   m3, m5, m6                 ; set 15th bit for next 4 seeds
    psllq    m6, m3, 30
    por      m3, m6
    psllq    m6, m3, 15
    por      m3, m6                     ; aggregate each bit into next seed's high bit
    pmulhuw  m6, m0, m7
    por      m3, m6                     ; 4 next output seeds
    pshuflw  m0, m3, q3333
    psrlw    m3, 5
%if ARCH_X86_64
    movq     r6, m3
    mov      r8, r6
    movzx    r5d, r6w
    shr      r6d, 16
    shr      r8, 32
    movzx    r7, r8w
    shr      r8, 16

    movd     m6, [r3+r5*2]
    pinsrw   m6, [r3+r6*2], 1
    pinsrw   m6, [r3+r7*2], 2
    pinsrw   m6, [r3+r8*2], 3
%else
    movd     r6, m3
    pshuflw  m3, m3, q3232
    movzx    r5, r6w
    shr      r6, 16

    movd     m6, [r3+r5*2]
    pinsrw   m6, [r3+r6*2], 1

    movd     r6, m3
    movzx    r5, r6w
    shr      r6, 16

    pinsrw   m6, [r3+r5*2], 2
    pinsrw   m6, [r3+r6*2], 3
%endif
    pmulhrsw m6, m2                     ; scale by grain_scale_shift rounding value
    packsswb m6, m6
    movd     [bufq+r2], m6
    add      r2, 4
    jl .loop

    ; auto-regression code
    movsxd   r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd   r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
    lea      r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
    jmp      r2

.ar1:
%if ARCH_X86_32
    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    mov      bufq, r0
%else
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
    movsx    cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd     m4, [fg_dataq+FGData.ar_coeffs_y]
    mov      ecx, [fg_dataq+FGData.ar_coeff_shift]
%if ARCH_X86_32
    mov      r1m, cf3d
    DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
    pxor     m6, m6
    pcmpgtb  m7, m6, m4
    punpcklbw m4, m7                    ; sign-extend the 4 AR coefficients to words
    pinsrw   m4, [base+pw_1], 3
    pshufd   m5, m4, q1111
    pshufd   m4, m4, q0000
    movd     m3, [base+round_vals+shiftq*2-12] ; rnd
    pshuflw  m3, m3, q0000
    sub      bufq, 82*73-(82*3+79)      ; start 3 rows + left border into the buffer
    mov      hd, 70
    mov      mind, -128
    mov      maxd, 127
.y_loop_ar1:
    mov      xq, -76
    movsx    val3d, byte [bufq+xq-1]
.x_loop_ar1:
    ; vectorized top/left neighbour MACs; 4 pixels of top contribution per pass
    movq     m0, [bufq+xq-82-1]         ; top/left
    pcmpgtb  m7, m6, m0
    punpcklbw m0, m7
    psrldq   m2, m0, 2                  ; top
    psrldq   m1, m0, 4                  ; top/right
    punpcklwd m0, m2
    punpcklwd m1, m3
    pmaddwd  m0, m4
    pmaddwd  m1, m5
    paddd    m0, m1
.x_loop_ar1_inner:
    ; scalar tail: the left-neighbour term is serially dependent on the pixel
    ; just written, so it is accumulated in val3d one pixel at a time
    movd     val0d, m0
    psrldq   m0, 4
    imul     val3d, cf3d
    add      val3d, val0d
    sar      val3d, shiftb
    movsx    val0d, byte [bufq+xq]
    add      val3d, val0d
    cmp      val3d, maxd
    cmovns   val3d, maxd
    cmp      val3d, mind
    cmovs    val3d, mind
    mov      byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc      xq
    jz .x_loop_ar1_end
    test     xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add      bufq, 82
    dec      hd
    jg .y_loop_ar1
.ar0:
    ; lag 0: no AR filtering for luma, the noise buffer is the result
    RET

.ar2:
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*8
%endif
    DEFINE_ARGS buf, fg_data, shift
    mov      shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd     m6, [base+round_vals-12+shiftq*2]
    movd     m7, [base+byte_blend+1]
    SCRATCH  7, 15, 7
    movq     m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
    movd     m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
    pxor     m7, m7
    pshuflw  m6, m6, q0000
    punpcklwd m6, m7
    pcmpgtb  m4, m7, m0
    pcmpgtb  m5, m7, m1
    punpcklbw m0, m4                    ; sign-extend coefficients to words
    punpcklbw m1, m5
    DEFINE_ARGS buf, fg_data, h, x
    ; broadcast each coefficient pair into its own register (m8-m13 via SCRATCH)
    pshufd   m4, m1, q0000
    pshufd   m5, m1, q1111
    pshufd   m3, m0, q3333
    pshufd   m2, m0, q2222
    pshufd   m1, m0, q1111
    pshufd   m0, m0, q0000
    SCRATCH  0, 8, 0
    SCRATCH  1, 9, 1
    SCRATCH  2, 10, 2
    SCRATCH  3, 11, 3
    SCRATCH  4, 12, 4
    SCRATCH  5, 13, 5
    SCRATCH  6, 14, 6
    sub      bufq, 82*73-(82*3+79)
    mov      hd, 70
.y_loop_ar2:
    mov      xq, -76

.x_loop_ar2:
    ; accumulate the two-row neighbourhood (y=-2 and y=-1) for 4 output pixels
    movq     m0, [bufq+xq-82*2-2]       ; y=-2,x=[-2,+5]
    movhps   m0, [bufq+xq-82*1-2]       ; y=-1,x=[-2,+5]
    pcmpgtb  m2, m7, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    psrldq   m5, m0, 2                  ; y=-2,x=[-1,+5]
    psrldq   m3, m1, 2                  ; y=-1,x=[-1,+5]
    psrldq   m4, m1, 4                  ; y=-1,x=[+0,+5]
    punpcklwd m2, m0, m5
    punpcklwd m3, m4
    pmaddwd  m2, m8
    pmaddwd  m3, m11
    paddd    m2, m3

    psrldq   m4, m0, 4                  ; y=-2,x=[+0,+5]
    psrldq   m5, m0, 6                  ; y=-2,x=[+1,+5]
    psrldq   m6, m0, 8                  ; y=-2,x=[+2,+5]
    punpcklwd m4, m5
    punpcklwd m6, m1
    psrldq   m5, m1, 6                  ; y=-1,x=[+1,+5]
    psrldq   m1, m1, 8                  ; y=-1,x=[+2,+5]
    punpcklwd m5, m1
    pmaddwd  m4, m9
    pmaddwd  m6, m10
    pmaddwd  m5, m12
    paddd    m4, m6
    paddd    m2, m5
    paddd    m2, m4
    paddd    m2, m14                    ; + rounding constant

    movq     m0, [bufq+xq-2]            ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; serial part: current-row (left-neighbour) terms depend on just-written pixels
    pcmpgtb  m4, m7, m0
    punpcklbw m1, m0, m4
    pmaddwd  m3, m1, m13
    paddd    m3, m2
    psrldq   m1, 4                      ; y=0,x=0
    psrldq   m2, 4                      ; shift top to next pixel
    psrad    m3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw    m3, m1
    packsswb m3, m3
    pslldq   m3, 2
    pand     m3, m15                    ; blend the new byte into the pixel window
    pandn    m1, m15, m0
    por      m0, m1, m3
    psrldq   m0, 1
    ; overwrite 2 pixels, but that's ok
    movd     [bufq+xq-1], m0
    inc      xq
    jz .x_loop_ar2_end
    test     xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add      bufq, 82
    dec      hd
    jg .y_loop_ar2
    RET

.ar3:
    DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
%assign stack_offset stack_offset_old
    ALLOC_STACK -16*14
%elif WIN64
    SUB      rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
    ALLOC_STACK -16*6
%endif
    mov      shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd     m6, [base+round_vals-12+shiftq*2]
    movd     m7, [base+byte_blend]
    movu     m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
    movq     m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
    pxor     m3, m3
    pcmpgtb  m4, m3, m0
    pcmpgtb  m3, m2
    pshuflw  m6, m6, q0000
    SCRATCH  6, 14, 12
    SCRATCH  7, 15, 13
    punpckhbw m1, m0, m4                ; sign-extend all 24 coefficients to words
    punpcklbw m0, m4
    punpcklbw m2, m3
    ; coefficient pairs that don't fit in registers are kept in stack slots 0-5
    pshufd   m3, m0, q1111
    pshufd   m4, m0, q2222
    pshufd   m5, m0, q3333
    pshufd   m0, m0, q0000
    mova     [rsp+ 0*16], m0
    mova     [rsp+ 1*16], m3
    mova     [rsp+ 2*16], m4
    mova     [rsp+ 3*16], m5
    pshufd   m6, m1, q1111
    pshufd   m7, m1, q2222
    pshufd   m5, m1, q3333
    pshufd   m1, m1, q0000
    pshufd   m3, m2, q1111
    psrldq   m0, m2, 10
    pinsrw   m2, [base+pw_1], 5
    pshufd   m4, m2, q2222
    pshufd   m2, m2, q0000
    pinsrw   m0, [base+round_vals+shiftq*2-10], 3
    mova     [rsp+ 4*16], m1
    mova     [rsp+ 5*16], m6
    SCRATCH  7, 8, 6
    SCRATCH  5, 9, 7
    SCRATCH  2, 10, 8
    SCRATCH  3, 11, 9
    SCRATCH  4, 12, 10
    SCRATCH  0, 13, 11
    DEFINE_ARGS buf, fg_data, h, x
    sub      bufq, 82*73-(82*3+79)
    mov      hd, 70
.y_loop_ar3:
    mov      xq, -76

.x_loop_ar3:
    ; accumulate the three-row neighbourhood (y=-3..-1, x=-3..+3) for 4 pixels
    movu     m0, [bufq+xq-82*3-3]       ; y=-3,x=[-3,+12]
    pxor     m3, m3
    pcmpgtb  m3, m0
    punpckhbw m2, m0, m3
    punpcklbw m0, m3

    psrldq   m5, m0, 2
    psrldq   m6, m0, 4
    psrldq   m7, m0, 6
    punpcklwd m4, m0, m5
    punpcklwd m6, m7
    pmaddwd  m4, [rsp+ 0*16]
    pmaddwd  m6, [rsp+ 1*16]
    paddd    m4, m6

    movu     m1, [bufq+xq-82*2-3]       ; y=-2,x=[-3,+12]
    pxor     m5, m5
    pcmpgtb  m5, m1
    punpckhbw m3, m1, m5
    punpcklbw m1, m5
    palignr  m6, m2, m0, 10
    palignr  m7, m2, m0, 12
    psrldq   m0, 8
    punpcklwd m0, m6
    punpcklwd m7, m1
    pmaddwd  m0, [rsp+ 2*16]
    pmaddwd  m7, [rsp+ 3*16]
    paddd    m0, m7
    paddd    m0, m4

    psrldq   m4, m1, 2
    psrldq   m5, m1, 4
    psrldq   m6, m1, 6
    psrldq   m7, m1, 8
    punpcklwd m4, m5
    punpcklwd m6, m7
    pmaddwd  m4, [rsp+ 4*16]
    pmaddwd  m6, [rsp+ 5*16]
    paddd    m4, m6
    paddd    m0, m4

    movu     m2, [bufq+xq-82*1-3]       ; y=-1,x=[-3,+12]
    pxor     m7, m7
    pcmpgtb  m7, m2
    punpckhbw m5, m2, m7
    punpcklbw m2, m7
    palignr  m7, m3, m1, 10
    palignr  m3, m1, 12
    psrldq   m1, m2, 2
    punpcklwd m7, m3
    punpcklwd m3, m2, m1
    pmaddwd  m7, m8
    pmaddwd  m3, m9
    paddd    m7, m3
    paddd    m0, m7

    psrldq   m6, m2, 4
    psrldq   m1, m2, 6
    psrldq   m3, m2, 8
    palignr  m4, m5, m2, 10
    palignr  m5, m5, m2, 12

    punpcklwd m6, m1
    punpcklwd m3, m4
    punpcklwd m5, m14
    pmaddwd  m6, m10
    pmaddwd  m3, m11
    pmaddwd  m5, m12
    paddd    m0, m6
    paddd    m3, m5
    paddd    m0, m3

    movq     m1, [bufq+xq-3]            ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    ; serial part: three left-neighbour terms folded via pmaddwd + pshufd/paddd
    pxor     m5, m5
    pcmpgtb  m5, m1
    punpcklbw m2, m1, m5
    pmaddwd  m2, m13
    pshufd   m3, m2, q1111
    paddd    m2, m3                     ; left+cur
    paddd    m2, m0                     ; add top
    psrldq   m0, 4
    psrad    m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb m2, m2
    pslldq   m2, 3
    pand     m2, m15
    pandn    m3, m15, m1
    por      m1, m2, m3
    movd     [bufq+xq-3], m1
    psrldq   m1, 1
    inc      xq
    jz .x_loop_ar3_end
    test     xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add      bufq, 82
    dec      hd
    jg .y_loop_ar3
    RET

; Generates the chroma grain functions for all three subsampling layouts
; (instantiated below for 420/422/444); %2 = ss_x, %3 = ss_y.
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
    movifnidn r2, r2mp
    movifnidn r3, r3mp
    LEA      r4, $$
%define base r4-$$
    ; same 4-lane LFSR noise fill as the luma function, but the seed is
    ; xor'ed with a per-plane constant and the buffer is subsampled when %2/%3
    movq     m1, [base+rnd_next_upperbit_mask]
    movq     m4, [base+mul_bits]
    movq     m7, [base+hmul_bits]
    mov      r5d, [fg_dataq+FGData.grain_scale_shift]
    movd     m6, [base+round+r5*2]
    mova     m5, [base+pb_mask]
    movd     m0, [fg_dataq+FGData.seed]
    movd     m2, [base+pw_seed_xor+uvq*4]
    pxor     m0, m2
    pshuflw  m6, m6, q0000
    pshuflw  m0, m0, q0000
    lea      r6, [base+gaussian_sequence]
%if %2
%if ARCH_X86_64
    mov      r7d, 73-35*%3
%else
    mov      r3mp, 73-35*%3
%endif
    add      bufq, 44
.loop_y:
    mov      r5, -44
.loop_x:
%else
    mov      r5, -82*73
    sub      bufq, r5
.loop:
%endif
    pand     m2, m0, m1
    psrlw    m3, m2, 10
    por      m2, m3                     ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw   m2, m4                     ; bits 0x0f00 are set
    pshufb   m3, m5, m2                 ; set 15th bit for next 4 seeds
    psllq    m2, m3, 30
    por      m3, m2
    psllq    m2, m3, 15
    por      m3, m2                     ; aggregate each bit into next seed's high bit
    pmulhuw  m2, m0, m7
    por      m2, m3                     ; 4 next output seeds
    pshuflw  m0, m2, q3333
    psrlw    m2, 5
%if ARCH_X86_64
    movd     r9d, m2
    pshuflw  m2, m2, q3232
    movzx    r8, r9w
    shr      r9, 16

    movd     m3, [r6+r8*2]
    pinsrw   m3, [r6+r9*2], 1

    movd     r9d, m2
    movzx    r8, r9w
    shr      r9, 16

    pinsrw   m3, [r6+r8*2], 2
    pinsrw   m3, [r6+r9*2], 3
%else
    movd     r2, m2
    pshuflw  m2, m2, q3232
    movzx    r1, r2w
    shr      r2, 16

    movd     m3, [r6+r1*2]
    pinsrw   m3, [r6+r2*2], 1

    movd     r2, m2
    movzx    r1, r2w
    shr      r2, 16

    pinsrw   m3, [r6+r1*2], 2
    pinsrw   m3, [r6+r2*2], 3
%endif
    pmulhrsw m3, m6
    packsswb m3, m3
    movd     [bufq+r5], m3
    add      r5, 4
%if %2
    jl .loop_x
    add      bufq, 82
%if ARCH_X86_64
    dec      r7d
%else
    dec      r3mp
%endif
    jg .loop_y
%else
    jl .loop
%endif

%if ARCH_X86_32
    mov      r2, r2mp
%endif

    ; auto-regression code
    movsxd   r5, [fg_dataq+FGData.ar_coeff_lag]
    movsxd   r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
    lea      r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
    jmp      r5

.ar0:
    ; lag 0: chroma grain = noise + coefficient * (optionally 2x2/2x1-averaged)
    ; co-located luma grain from bufy
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -2*16
%endif
    imul     uvd, 28                    ; select the ar_coeffs_uv[uv] plane
    mov      shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd     m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd     m4, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h, x
    pxor     m0, m0
    pcmpgtb  m0, m5
    punpcklbw m5, m0                    ; sign-extend the luma coefficient
    movd     m7, [base+pb_1]
%if %2
    movd     m6, [base+hmul_bits+2+%3*2]
%endif
    pshuflw  m5, m5, q0000
    pshuflw  m4, m4, q0000
    pshufd   m7, m7, q0000
%if %2
    pshuflw  m6, m6, q0000
%endif
    punpcklqdq m5, m5
    punpcklqdq m4, m4
%if %2
    punpcklqdq m6, m6
%endif
    pcmpeqw  m1, m1
    pslldq   m1, 12>>%2                 ; mask for the row-tail partial store
    SCRATCH  1, 8, 0
    SCRATCH  4, 9, 1
%if %2
    sub      bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub      bufq, 82*70-3
%endif
    add      bufyq, 3+82*3
    mov      hd, 70-35*%3
.y_loop_ar0:
    xor      xd, xd
.x_loop_ar0:
    ; first 32 pixels
%if %2
    movu     m1, [bufyq+xq*2]
%if %3
    movu     m2, [bufyq+xq*2+82]
%endif
    movu     m3, [bufyq+xq*2+16]
%if %3
    movu     m4, [bufyq+xq*2+82+16]
%endif
    pmaddubsw m0, m7, m1                ; horizontal pair-sum of luma samples
%if %3
    pmaddubsw m1, m7, m2
%endif
    pmaddubsw m2, m7, m3
%if %3
    pmaddubsw m3, m7, m4
    paddw    m0, m1                     ; + vertical sum when ss_y
    paddw    m2, m3
%endif
    pmulhrsw m0, m6                     ; round the subsample average
    pmulhrsw m2, m6
%else
    movu     m0, [bufyq+xq]
    pxor     m6, m6
    pcmpgtb  m6, m0
    punpckhbw m2, m0, m6
    punpcklbw m0, m6
%endif
    pmullw   m0, m5
    pmullw   m2, m5
    pmulhrsw m0, m9                     ; >> ar_coeff_shift (via hmul_bits)
    pmulhrsw m2, m9
    movu     m1, [bufq+xq]
    pxor     m4, m4
    pcmpgtb  m4, m1
    punpckhbw m3, m1, m4
%if %2
    punpcklbw m1, m4
    paddw    m2, m3
    paddw    m0, m1
%else
    punpcklbw m6, m1, m4
    paddw    m2, m3
    paddw    m0, m6
%endif
    packsswb m0, m2
%if %2
    movu     [bufq+xq], m0
    add      xd, 16
    cmp      xd, 32
    jl .x_loop_ar0

    ; last 6/12 pixels
    movu     m1, [bufyq+xq*(1+%2)]
%if %3
    movu     m2, [bufyq+xq*2+82]
%endif
    pmaddubsw m0, m7, m1
%if %3
    pmaddubsw m1, m7, m2
    paddw    m0, m1
%endif
    pmulhrsw m0, m6
    pmullw   m0, m5
    pmulhrsw m0, m9
    movq     m1, [bufq+xq]
    pxor     m4, m4
    pcmpgtb  m4, m1
    punpcklbw m2, m1, m4
    paddw    m0, m2
    packsswb m0, m0
    pandn    m2, m8, m0                 ; keep original bytes under the tail mask
    pand     m1, m8
    por      m2, m1
    movq     [bufq+xq], m2
%else
    add      xd, 16
    cmp      xd, 80
    je .y_loop_final_ar0
    movu     [bufq+xq-16], m0
    jmp .x_loop_ar0
.y_loop_final_ar0:
    pandn    m2, m8, m0
    pand     m1, m8
    por      m2, m1
    movu     [bufq+xq-16], m2
%endif

    add      bufq, 82
    add      bufyq, 82<<%3
    dec      hd
    jg .y_loop_ar0
    RET

.ar1:
    ; lag 1: like luma .ar1 but with a 4th coefficient applied to the
    ; (subsampled) luma grain sample
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
    imul     uvd, 28
    movsx    cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd     m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
    pinsrw   m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
%if ARCH_X86_32
    mov      r3mp, cf3d
    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
    mov      bufq, r0
%else
    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
%endif
    mov      shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd     m3, [base+round_vals+shiftq*2-12] ; rnd
%if %2
    movd     m7, [base+pb_1]
    movd     m6, [base+hmul_bits+2+%3*2]
%endif
    psrldq   m4, 1
%if ARCH_X86_32
    DEFINE_ARGS buf, shift, val0, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
%else
    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
%endif
    pxor     m5, m5
    punpcklwd m3, m5
%if %2
    punpcklwd m6, m6
%endif
    pcmpgtb  m5, m4
    punpcklbw m4, m5
    pshufd   m5, m4, q1111
    pshufd   m4, m4, q0000
    pshufd   m3, m3, q0000
%if %2
    pshufd   m7, m7, q0000
    pshufd   m6, m6, q0000
    sub      bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub      bufq, 82*69+3
%endif
%if ARCH_X86_32
    add      r1mp, 79+82*3
    mov      r0mp, 70-35*%3
%else
    add      bufyq, 79+82*3
    mov      hd, 70-35*%3
%endif
    mov      mind, -128
    mov      maxd, 127
.y_loop_ar1:
    mov      xq, -(76>>%2)
    movsx    val3d, byte [bufq+xq-1]
.x_loop_ar1:
%if %2
    ; gather and average the co-located luma grain (ss_x and optional ss_y)
%if ARCH_X86_32
    mov      r2, r1mp
    movq     m0, [r2+xq*2]
%if %3
    movq     m1, [r2+xq*2+82]
%endif
%else
    movq     m0, [bufyq+xq*2]
%if %3
    movq     m1, [bufyq+xq*2+82]
%endif
%endif
    pmaddubsw m2, m7, m0
%if %3
    pmaddubsw m0, m7, m1
    paddw    m2, m0
%endif
    pmulhrsw m2, m6
%else
%if ARCH_X86_32
    mov      r2, r1mp
    movd     m2, [r2+xq]
%else
    movd     m2, [bufyq+xq]
%endif
    pxor     m0, m0
    pcmpgtb  m0, m2
    punpcklbw m2, m0
%endif

    movq     m0, [bufq+xq-82-1]         ; top/left
    pxor     m1, m1
    pcmpgtb  m1, m0
    punpcklbw m0, m1
    psrldq   m1, m0, 4                  ; top/right
    punpcklwd m1, m2
    psrldq   m2, m0, 2                  ; top
    punpcklwd m0, m2
    pmaddwd  m0, m4
    pmaddwd  m1, m5
    paddd    m0, m1
    paddd    m0, m3
.x_loop_ar1_inner:
    ; scalar tail, serially dependent on the previously written pixel
    movd     val0d, m0
    psrldq   m0, 4
%if ARCH_X86_32
    imul     val3d, r3mp
%else
    imul     val3d, cf3d
%endif
    add      val3d, val0d
    sar      val3d, shiftb
    movsx    val0d, byte [bufq+xq]
    add      val3d, val0d
    cmp      val3d, maxd
    cmovns   val3d, maxd
    cmp      val3d, mind
    cmovs    val3d, mind
    mov      byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc      xq
    jz .x_loop_ar1_end
    test     xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add      bufq, 82
%if ARCH_X86_32
    add      r1mp, 82<<%3
    dec      r0mp
%else
    add      bufyq, 82<<%3
    dec      hd
%endif
    jg .y_loop_ar1
    RET

.ar2:
    ; lag 2: same neighbourhood as luma .ar2 plus the luma-grain term (m14/m15)
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
    ALLOC_STACK -8*16
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
    mov      shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul     uvd, 28
    movd     m7, [base+round_vals-12+shiftq*2]
    movu     m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12
    pxor     m2, m2
    pcmpgtb  m2, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    pinsrw   m1, [base+pw_1], 5
    punpcklwd m7, m7
    pshufd   m7, m7, q0000
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd   m4, m1, q0000
    pshufd   m5, m1, q1111
    pshufd   m6, m1, q2222
    pshufd   m3, m0, q3333
    pshufd   m2, m0, q2222
    pshufd   m1, m0, q1111
    pshufd   m0, m0, q0000
    SCRATCH  0, 8, 0
    SCRATCH  1, 9, 1
    SCRATCH  2, 10, 2
    SCRATCH  3, 11, 3
    SCRATCH  4, 12, 4
    SCRATCH  5, 13, 5
    SCRATCH  6, 14, 6
    SCRATCH  7, 15, 7
%if %2
    movd     m7, [base+hmul_bits+2+%3*2]
    movd     m6, [base+pb_1]
    punpcklwd m7, m7
    pshufd   m6, m6, q0000
    pshufd   m7, m7, q0000
    sub      bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub      bufq, 82*69+3
%endif
    add      bufyq, 79+82*3
    mov      hd, 70-35*%3
.y_loop_ar2:
    mov      xq, -(76>>%2)

.x_loop_ar2:
    pxor     m2, m2
    movq     m0, [bufq+xq-82*2-2]       ; y=-2,x=[-2,+5]
    movhps   m0, [bufq+xq-82*1-2]       ; y=-1,x=[-2,+5]
    pcmpgtb  m2, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    psrldq   m5, m0, 2                  ; y=-2,x=[-1,+5]
    psrldq   m3, m1, 2                  ; y=-1,x=[-1,+5]
    psrldq   m4, m1, 4                  ; y=-1,x=[+0,+5]
    punpcklwd m2, m0, m5
    punpcklwd m3, m4
    pmaddwd  m2, m8
    pmaddwd  m3, m11
    paddd    m2, m3

    psrldq   m4, m0, 4                  ; y=-2,x=[+0,+5]
    psrldq   m5, m0, 6                  ; y=-2,x=[+1,+5]
    psrldq   m0, 8                      ; y=-2,x=[+2,+5]
    punpcklwd m4, m5
    punpcklwd m0, m1
    psrldq   m3, m1, 6                  ; y=-1,x=[+1,+5]
    psrldq   m1, m1, 8                  ; y=-1,x=[+2,+5]
    punpcklwd m3, m1
    pmaddwd  m4, m9
    pmaddwd  m0, m10
    pmaddwd  m3, m12
    paddd    m4, m0
    paddd    m2, m3
    paddd    m2, m4

%if %2
    ; luma-grain contribution (subsample-averaged as in .ar0)
    movq     m1, [bufyq+xq*2]
%if %3
    movq     m3, [bufyq+xq*2+82]
%endif
    pmaddubsw m0, m6, m1
%if %3
    pmaddubsw m1, m6, m3
    paddw    m0, m1
%endif
    pmulhrsw m0, m7
%else
    movd     m0, [bufyq+xq]
    pxor     m1, m1
    pcmpgtb  m1, m0
    punpcklbw m0, m1
%endif
    punpcklwd m0, m15                   ; pair luma sample with the rounding term
    pmaddwd  m0, m14
    paddd    m2, m0

    movq     m0, [bufq+xq-2]            ; y=0,x=[-2,+5]
    pxor     m4, m4
    movd     m5, [base+byte_blend+1]
    punpcklbw m5, m5
.x_loop_ar2_inner:
    pcmpgtb  m1, m4, m0
    punpcklbw m0, m1
    pmaddwd  m3, m0, m13
    paddd    m3, m2
    psrldq   m2, 4                      ; shift top to next pixel
    psrad    m3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq   m3, 4
    pand     m3, m5
    paddw    m0, m3
    packsswb m0, m0
    movd     [bufq+xq-2], m0
    psrldq   m0, 1
    inc      xq
    jz .x_loop_ar2_end
    test     xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add      bufq, 82
    add      bufyq, 82<<%3
    dec      hd
    jg .y_loop_ar2
    RET

.ar3:
    ; lag 3: same neighbourhood as luma .ar3 plus the luma-grain term
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
%if ARCH_X86_32
    ALLOC_STACK -15*16
%else
    SUB      rsp, 16*7
%assign stack_size_padded (stack_size_padded+16*7)
%assign stack_size (stack_size+16*7)
%endif
    mov      shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul     uvd, 28

    movu     m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
    pxor     m3, m3
    pcmpgtb  m3, m0
    punpckhbw m1, m0, m3
    punpcklbw m0, m3
    pshufd   m2, m0, q1111
    pshufd   m3, m0, q2222
    pshufd   m4, m0, q3333
    pshufd   m0, m0, q0000
    pshufd   m5, m1, q1111
    pshufd   m6, m1, q2222
    pshufd   m7, m1, q3333
    pshufd   m1, m1, q0000
    mova     [rsp+ 0*16], m0
    mova     [rsp+ 1*16], m2
    mova     [rsp+ 2*16], m3
    mova     [rsp+ 3*16], m4
    mova     [rsp+ 4*16], m1
    mova     [rsp+ 5*16], m5
    mova     [rsp+ 6*16], m6
    SCRATCH  7, 8, 7

    movu     m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
    pxor     m4, m4
    pcmpgtb  m4, m2
    punpckhbw m5, m2, m4
    punpcklbw m2, m4
    pshufd   m4, m2, q3232
    punpcklwd m3, m4, m5
    pshuflw  m5, m4, q3321
    pshufd   m4, m3, q0000
    pshufd   m3, m2, q1111
    pshufd   m2, m2, q0000
    pinsrw   m5, [base+round_vals+shiftq*2-10], 3
    SCRATCH  2, 9, 8
    SCRATCH  3, 10, 9
    SCRATCH  4, 11, 10
    SCRATCH  5, 12, 11

    movd     m2, [base+round_vals-12+shiftq*2]
%if %2
    movd     m1, [base+pb_1]
    movd     m3, [base+hmul_bits+2+%3*2]
%endif
    pxor     m0, m0
    punpcklwd m2, m0
%if %2
    punpcklwd m3, m3
%endif
    pshufd   m2, m2, q0000
%if %2
    pshufd   m1, m1, q0000
    pshufd   m3, m3, q0000
    SCRATCH  1, 13, 12
%endif
    SCRATCH  2, 14, 13
%if %2
    SCRATCH  3, 15, 14
%endif

    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub      bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub      bufq, 82*69+3
%endif
    add      bufyq, 79+82*3
    mov      hd, 70-35*%3
.y_loop_ar3:
    mov      xq, -(76>>%2)

.x_loop_ar3:
    movu     m0, [bufq+xq-82*3-3]       ; y=-3,x=[-3,+12]
    pxor     m4, m4
    pcmpgtb  m4, m0
    punpckhbw m3, m0, m4
    punpcklbw m0, m4

    psrldq   m5, m0, 2
    psrldq   m6, m0, 4
    psrldq   m7, m0, 6
    punpcklwd m4, m0, m5
    punpcklwd m6, m7
    pmaddwd  m4, [rsp+ 0*16]
    pmaddwd  m6, [rsp+ 1*16]
    paddd    m4, m6

    palignr  m2, m3, m0, 10
    palignr  m3, m0, 12
    psrldq   m0, 8

    movu     m1, [bufq+xq-82*2-3]       ; y=-2,x=[-3,+12]
    pxor     m6, m6
    pcmpgtb  m6, m1
    punpckhbw m5, m1, m6
    punpcklbw m1, m6

    punpcklwd m0, m2
    punpcklwd m3, m1
    pmaddwd  m0, [rsp+ 2*16]
    pmaddwd  m3, [rsp+ 3*16]
    paddd    m0, m3
    paddd    m0, m4

    movu     m2, [bufq+xq-82*1-3]       ; y=-1,x=[-3,+12]
    pxor     m7, m7
    pcmpgtb  m7, m2
    punpckhbw m6, m2, m7
    punpcklbw m2, m7

    palignr  m3, m5, m1, 10
    palignr  m5, m1, 12
    psrldq   m4, m2, 2

    punpcklwd m3, m5
    punpcklwd m5, m2, m4
    pmaddwd  m3, [rsp+ 6*16]
    pmaddwd  m5, m8
    paddd    m3, m5
    paddd    m0, m3

    psrldq   m3, m1, 2
    psrldq   m4, m1, 4
    psrldq   m5, m1, 6
    psrldq   m1, 8

    punpcklwd m3, m4
    punpcklwd m5, m1
    pmaddwd  m3, [rsp+ 4*16]
    pmaddwd  m5, [rsp+ 5*16]
    paddd    m3, m5
    paddd    m0, m3

%if %2
    ; luma-grain contribution (subsample-averaged as in .ar0)
    movq     m1, [bufyq+xq*2]
%if %3
    movq     m3, [bufyq+xq*2+82]
%endif
    pmaddubsw m7, m13, m1
%if %3
    pmaddubsw m5, m13, m3
    paddw    m7, m5
%endif
    pmulhrsw m7, m15
%else
    movd     m7, [bufyq+xq]
    pxor     m1, m1
    pcmpgtb  m1, m7
    punpcklbw m7, m1
%endif

    psrldq   m1, m2, 4
    psrldq   m3, m2, 6
    palignr  m4, m6, m2, 10
    palignr  m6, m2, 12
    psrldq   m2, 8

    punpcklwd m1, m3
    punpcklwd m2, m4
    punpcklwd m6, m7
    pmaddwd  m1, m9
    pmaddwd  m2, m10
    pmaddwd  m6, m11
    paddd    m1, m2
    paddd    m0, m6
    paddd    m0, m1
    paddd    m0, m14                    ; + rounding constant

    movq     m1, [bufq+xq-3]            ; y=0,x=[-3,+4]
    pxor     m4, m4
    movd     m5, [base+byte_blend]
.x_loop_ar3_inner:
    pcmpgtb  m2, m4, m1
    punpcklbw m3, m1, m2
    pmaddwd  m2, m3, m12
    pshufd   m3, m2, q1111
    paddd    m2, m3                     ; left+cur
    paddd    m2, m0                     ; add top
    psrldq   m0, 4
    psrad    m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    packsswb m2, m2
    pandn    m3, m5, m1
    pslld    m2, 24
    pand     m2, m5
    por      m1, m2, m3
    movd     [bufq+xq-3], m1
    psrldq   m1, 1
    inc      xq
    jz .x_loop_ar3_end
    test     xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add      bufq, 82
    add      bufyq, 82<<%3
    dec      hd
    jg .y_loop_ar3
    RET
%endmacro

generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

; Emulated word gather (SSSE3 has no gather instruction): pull 8 word indices
; out of xmm %2 via GPRs %4/%5 and insert table words from %3 into %1.
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
    movd     %5 %+ d, %2
    pshuflw  %%tmp, %2, q3232
%else
    movd     %5 %+ d, %%tmp
%if %%idx == 2
    punpckhqdq %%tmp, %%tmp
%elif %%idx == 4
    psrlq    %%tmp, 32
%endif
%endif
    movzx    %4 %+ d, %5 %+ w
    shr      %5 %+ d, 16

%if %%idx == 0
    movd     %1, [%3+%4]
%else
    pinsrw   %1, [%3+%4], %%idx + 0
%endif
    pinsrw   %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov      r0, r0m
    mov      r1, r2m
    mov      r2, r4m
    mov      r3, r6m
    mov      r4, r7m
    mov      r5, r8m

    mov      [rsp+5*mmsize+ 4*gprsize], r0
    mov      [rsp+5*mmsize+ 6*gprsize], r1
    mov      [rsp+5*mmsize+ 8*gprsize], r2
    mov      [rsp+5*mmsize+10*gprsize], r3
    mov      [rsp+5*mmsize+11*gprsize], r4
    mov      [rsp+5*mmsize+12*gprsize], r5
%else
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    mov      srcq, srcm
    mov      fg_dataq, r3m
    mov      scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+5*mmsize+ 4*gprsize]
%define r1m [rsp+5*mmsize+ 5*gprsize]
%define r2m [rsp+5*mmsize+ 6*gprsize]
%define r3m [rsp+5*mmsize+ 7*gprsize]
%define r4m [rsp+5*mmsize+ 8*gprsize]
%define r5m [rsp+5*mmsize+ 9*gprsize]
%define r6m [rsp+5*mmsize+10*gprsize]
%define r7m [rsp+5*mmsize+11*gprsize]
%define r8m [rsp+5*mmsize+12*gprsize]
%endif
    LEA      r5, pb_mask
%define base r5-pb_mask
    mov      r5m, picptrq
%else
cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea      r7, [pb_mask]
%define base r7-pb_mask
%endif
    ; load per-frame scaling/clipping constants into m11-m13
    mov      r6d, [fg_dataq+FGData.scaling_shift]
    movd     m3, [base+mul_bits+r6*2-14]
    mov      r6d, [fg_dataq+FGData.clip_to_restricted_range]
    movd     m4, [base+max+r6*4]
    movd     m5, [base+min+r6*2]
    punpcklwd m3, m3
    punpcklwd m4, m4
    punpcklwd m5, m5
    pshufd   m3, m3, q0000
    pshufd   m4, m4, q0000
    pshufd   m5, m5, q0000
    SCRATCH  3, 11, 0
    SCRATCH  4, 12, 1
    SCRATCH  5, 13, 2

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif

    mov      sbyd, r8m
    mov      overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
    test     overlapd, overlapd
    jz .no_vertical_overlap
    mova     m6, [base+pw_1024]
    mova     m7, [base+pb_27_17_17_27]
    SCRATCH  6, 14, 3
    SCRATCH  7, 15, 4
    test     sbyd, sbyd
    jnz .vertical_overlap
    ; fall-through

.no_vertical_overlap:
    mov      r8m, overlapd
    ; derive the per-row grain seed from sby and the frame seed
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul     seed, (173 << 24) | 37
%else
    imul     seed, sbyd, (173 << 24) | 37
%endif
    add      seed, (105 << 24) | 178
    rol      seed, 8
    movzx    seed, seew
    xor      seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov      r3m, seed
    mov      wq, r4m
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, unused3
%endif

    lea      src_bakq, [srcq+wq]
    neg      wq
    sub      dstmp, srcq                ; keep dst as an offset from src
%if ARCH_X86_32
    mov      r1m, src_bakq
    mov      r4m, wq
    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
%endif

.loop_x:
%if ARCH_X86_32
    mov      seed, r3m
%endif
    ; advance the seed and extract the grain_lut x/y offsets from it
    mov      r6d, seed
    or       seed, 0xEFF4
    shr      r6d, 1
    test     seeb, seeh
    lea      seed, [r6+0x8000]
    cmovp    seed, r6d                  ; updated seed
%if ARCH_X86_32
    mov      r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov      offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, unused

    mov      offyd, seed
    mov      offxd, seed
%endif
    ror      offyd, 8
    shr      offxd, 12
    and      offyd, 0xf
    imul     offyd, 164
    lea      offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, unused
%endif

.loop_x_odd:
    mov      hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova     m0, [srcq]
    pxor     m2, m2
    punpckhbw m1, m0, m2
    punpcklbw m0, m2                    ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw m4, m0, scalingq-1, r0, r5, m3
    vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
    vpgatherdw m4, m0, scalingq-1, r12, r13, m3
    vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
    REPX {psrlw x, 8}, m4, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu     m3, [grain_lutq+offxyq]
    pcmpgtb  m7, m2, m3
    punpcklbw m2, m3, m7
    punpckhbw m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw   m2, m4
    pmullw   m3, m5
    pmulhrsw m2, m11
    pmulhrsw m3, m11

    ; dst = clip_pixel(src, noise)
    paddw    m0, m2
    paddw    m1, m3
    pmaxsw   m0, m13
    pmaxsw   m1, m13
    pminsw   m0, m12
    pminsw   m1, m12
    packuswb m0, m1
    movifnidn dstq, dstmp
    mova     [dstq+srcq], m0

    add      srcq, r2mp
    add      grain_lutq, 82
    dec      hd
    jg .loop_y

; NOTE(review): the source chunk is truncated here, mid-function; the
; remainder of fgy_32x32xn_8bpc (overlap handling etc.) is outside this view.
%if
ARCH_X86_32 1498 add r4mp, 16 1499%else 1500 add wq, 16 1501%endif 1502 jge .end 1503%if ARCH_X86_32 1504 mov srcq, r1mp 1505 add srcq, r4mp 1506%else 1507 lea srcq, [src_bakq+wq] 1508%endif 1509 btc dword r8m, 2 1510 jc .next_blk 1511 1512 add offxyd, 16 1513 test dword r8m, 2 ; r8m & 2 = have_top_overlap 1514 jz .loop_x_odd 1515 1516%if ARCH_X86_32 1517 add dword [rsp+5*mmsize+1*gprsize], 16 1518%else 1519 add r11d, 16 ; top_offxyd 1520%endif 1521 jnz .loop_x_odd_v_overlap 1522 1523.next_blk: 1524 test dword r8m, 1 1525 jz .loop_x 1526 1527 test dword r8m, 2 1528 jnz .loop_x_hv_overlap 1529 1530 ; horizontal overlap (without vertical overlap) 1531.loop_x_h_overlap: 1532%if ARCH_X86_32 1533 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1534 ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1535 DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 1536 1537 add offxyd, 16 ; left_offxyd 1538 mov [rsp+5*mmsize+0*gprsize], offxyd 1539 1540 DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1541 1542 mov seed, r3m 1543%else 1544 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1545 offx, offy, see, left_offxy 1546 1547 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 1548%endif 1549 1550 mov r6d, seed 1551 or seed, 0xEFF4 1552 shr r6d, 1 1553 test seeb, seeh 1554 lea seed, [r6+0x8000] 1555 cmovp seed, r6d ; updated seed 1556 1557%if ARCH_X86_32 1558 mov r3m, seed 1559 1560 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1561 1562 mov offxd, offyd 1563%else 1564 mov offyd, seed 1565 mov offxd, seed 1566%endif 1567 ror offyd, 8 1568 shr offxd, 12 1569 and offyd, 0xf 1570 imul offyd, 164 1571 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1572 1573%if ARCH_X86_32 1574 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1575%else 1576 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1577 h, offxy, see, left_offxy 1578%endif 1579 1580 mov hd, r7m 1581 mov grain_lutq, 
grain_lutmp 1582.loop_y_h_overlap: 1583 ; src 1584 mova m0, [srcq] 1585 pxor m2, m2 1586 punpckhbw m1, m0, m2 1587 punpcklbw m0, m2 ; m0-1: src as word 1588 1589 ; scaling[src] 1590%if ARCH_X86_32 1591 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1592 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1593%else 1594 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1595 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1596%endif 1597 REPX {psrlw x, 8}, m4, m5 1598 1599 ; grain = grain_lut[offy+y][offx+x] 1600 movu m3, [grain_lutq+offxyq] 1601%if ARCH_X86_32 1602 mov r5, [rsp+5*mmsize+0*gprsize] 1603 movd m7, [grain_lutq+r5] 1604%else 1605 movd m7, [grain_lutq+left_offxyq] 1606%endif 1607 punpcklbw m7, m3 1608 pmaddubsw m6, m15, m7 1609 pmulhrsw m6, m14 1610 packsswb m6, m6 1611 shufps m6, m3, q3210 1612 pcmpgtb m2, m6 1613 punpcklbw m7, m6, m2 1614 punpckhbw m6, m2 1615 1616 ; noise = round2(scaling[src] * grain, scaling_shift) 1617 pmullw m7, m4 1618 pmullw m6, m5 1619 pmulhrsw m7, m11 1620 pmulhrsw m6, m11 1621 1622 ; dst = clip_pixel(src, noise) 1623 paddw m0, m7 1624 paddw m1, m6 1625 pmaxsw m0, m13 1626 pmaxsw m1, m13 1627 pminsw m0, m12 1628 pminsw m1, m12 1629 packuswb m0, m1 1630 movifnidn dstq, dstmp 1631 mova [dstq+srcq], m0 1632 1633 add srcq, r2mp 1634 add grain_lutq, 82 1635 dec hd 1636 jg .loop_y_h_overlap 1637 1638%if ARCH_X86_32 1639 add r4mp, 16 1640%else 1641 add wq, 16 1642%endif 1643 jge .end 1644%if ARCH_X86_32 1645 mov srcq, r1m 1646 add srcq, r4m 1647%else 1648 lea srcq, [src_bakq+wq] 1649%endif 1650 xor dword r8m, 4 1651 add offxyd, 16 1652 1653 ; since this half-block had left-overlap, the next does not 1654 test dword r8m, 2 ; have_top_overlap 1655 jz .loop_x_odd 1656%if ARCH_X86_32 1657 add dword [rsp+5*mmsize+1*gprsize], 16 1658%else 1659 add r11d, 16 ; top_offxyd 1660%endif 1661 jmp .loop_x_odd_v_overlap 1662 1663.end: 1664 RET 1665 1666.vertical_overlap: 1667%if ARCH_X86_32 1668 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1669%else 1670 
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 1671%endif 1672 1673 or overlapd, 2 ; top_overlap: overlap & 2 1674 mov r8m, overlapd 1675 movzx sbyd, sbyb 1676%if ARCH_X86_32 1677 imul r4, [fg_dataq+FGData.seed], 0x00010001 1678 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 1679%else 1680 imul seed, [fg_dataq+FGData.seed], 0x00010001 1681%endif 1682 imul tmpd, sbyd, 173 * 0x00010001 1683 imul sbyd, 37 * 0x01000100 1684 add tmpd, (105 << 16) | 188 1685 add sbyd, (178 << 24) | (141 << 8) 1686 and tmpd, 0x00ff00ff 1687 and sbyd, 0xff00ff00 1688 xor seed, tmpd 1689%if ARCH_X86_32 1690 xor sbyd, seed ; (cur_seed << 16) | top_seed 1691 1692 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1693 1694 mov r3m, seed 1695 mov wq, r4m 1696%else 1697 xor seed, sbyd ; (cur_seed << 16) | top_seed 1698 1699 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1700 tmp, unused2, see, unused3 1701%endif 1702 1703 lea src_bakq, [srcq+wq] 1704 neg wq 1705 sub dstmp, srcq 1706%if ARCH_X86_32 1707 mov r1m, src_bakq 1708 mov r4m, wq 1709 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 1710%endif 1711 1712.loop_x_v_overlap: 1713%if ARCH_X86_32 1714 mov seed, r3m 1715%endif 1716 ; we assume from the block above that bits 8-15 of tmpd are zero'ed, 1717 ; because of the 'and tmpd, 0x00ff00ff' above 1718 mov r6d, seed 1719 or seed, 0xeff4eff4 1720 test seeb, seeh 1721 setp tmpb ; parity of top_seed 1722 shr seed, 16 1723 shl tmpd, 16 1724 test seeb, seeh 1725 setp tmpb ; parity of cur_seed 1726 or r6d, 0x00010001 1727 xor tmpd, r6d 1728 mov seed, tmpd 1729 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1730 1731%if ARCH_X86_32 1732 mov r3m, seed 1733 1734 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1735 1736 mov offxd, offyd 1737%else 1738 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1739 offx, offy, see, unused, top_offxy 1740 1741 mov offyd, seed 1742 mov offxd, seed 
1743%endif 1744 1745 ror offyd, 8 1746 ror offxd, 12 1747 and offyd, 0xf000f 1748 and offxd, 0xf000f 1749 imul offyd, 164 1750 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1751 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1752 1753%if ARCH_X86_32 1754 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 1755%else 1756 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1757 h, offxy, see, unused, top_offxy 1758%endif 1759 1760 movzx top_offxyd, offxyw 1761%if ARCH_X86_32 1762 mov [rsp+5*mmsize+1*gprsize], top_offxyd 1763 1764 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1765%endif 1766 shr offxyd, 16 1767 1768.loop_x_odd_v_overlap: 1769%if ARCH_X86_32 1770 mov r5, r5m 1771 lea r5, [base+pb_27_17] 1772 mov [rsp+5*mmsize+12], r5 1773%else 1774 mova m8, [pb_27_17] 1775%endif 1776 mov hd, r7m 1777 mov grain_lutq, grain_lutmp 1778.loop_y_v_overlap: 1779 ; src 1780 mova m0, [srcq] 1781 pxor m2, m2 1782 punpckhbw m1, m0, m2 1783 punpcklbw m0, m2 ; m0-1: src as word 1784 1785 ; scaling[src] 1786%if ARCH_X86_32 1787 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1788 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1789%else 1790 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1791 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1792%endif 1793 REPX {psrlw x, 8}, m4, m5 1794 1795 ; grain = grain_lut[offy+y][offx+x] 1796 movu m3, [grain_lutq+offxyq] 1797%if ARCH_X86_32 1798 mov r5, [rsp+5*mmsize+1*gprsize] 1799 movu m7, [grain_lutq+r5] 1800%else 1801 movu m7, [grain_lutq+top_offxyq] 1802%endif 1803 punpckhbw m6, m7, m3 1804 punpcklbw m7, m3 1805%if ARCH_X86_32 1806 mov r5, [rsp+5*mmsize+12] 1807 pmaddubsw m3, [r5], m6 1808 pmaddubsw m6, [r5], m7 1809%else 1810 pmaddubsw m3, m8, m6 1811 pmaddubsw m6, m8, m7 1812%endif 1813 pmulhrsw m3, m14 1814 pmulhrsw m6, m14 1815 packsswb m6, m3 1816 pcmpgtb m7, m2, m6 1817 punpcklbw m2, m6, m7 1818 punpckhbw m6, m7 1819 1820 ; noise = round2(scaling[src] * grain, scaling_shift) 1821 pmullw m2, m4 1822 pmullw 
m6, m5 1823 pmulhrsw m2, m11 1824 pmulhrsw m6, m11 1825 1826 ; dst = clip_pixel(src, noise) 1827 paddw m0, m2 1828 paddw m1, m6 1829 pmaxsw m0, m13 1830 pmaxsw m1, m13 1831 pminsw m0, m12 1832 pminsw m1, m12 1833 packuswb m0, m1 1834 movifnidn dstq, dstmp 1835 mova [dstq+srcq], m0 1836 1837%if ARCH_X86_32 1838 add dword [rsp+5*mmsize+12], mmsize 1839%else 1840 mova m8, [pb_17_27] 1841%endif 1842 add srcq, r2mp 1843 add grain_lutq, 82 1844 dec hw 1845 jz .end_y_v_overlap 1846 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1847 ; remaining (up to) 30 lines 1848 btc hd, 16 1849 jnc .loop_y_v_overlap 1850 jmp .loop_y 1851 1852.end_y_v_overlap: 1853%if ARCH_X86_32 1854 add r4mp, 16 1855%else 1856 add wq, 16 1857%endif 1858 jge .end_hv 1859%if ARCH_X86_32 1860 mov srcq, r1mp 1861 add srcq, r4mp 1862%else 1863 lea srcq, [src_bakq+wq] 1864%endif 1865 btc dword r8m, 2 1866 jc .loop_x_hv_overlap 1867 add offxyd, 16 1868%if ARCH_X86_32 1869 add dword [rsp+5*mmsize+1*gprsize], 16 1870%else 1871 add top_offxyd, 16 1872%endif 1873 jmp .loop_x_odd_v_overlap 1874 1875.loop_x_hv_overlap: 1876%if ARCH_X86_32 1877 mov r5, r5m 1878 lea r5, [base+pb_27_17] 1879 mov [rsp+5*mmsize+12], r5 1880 1881 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak 1882 1883 mov r5, [rsp+5*mmsize+1*gprsize] 1884 mov r4, offxyd 1885 add r5, 16 1886 add r4, 16 1887 mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy 1888 mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy 1889 1890 DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak 1891 1892 xor tmpd, tmpd 1893 mov seed, r3m 1894%else 1895 mova m8, [pb_27_17] 1896 1897 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1898 tmp, unused2, see, unused3 1899 1900 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 1901%endif 1902 mov r6d, seed 1903 or seed, 0xeff4eff4 1904 test seeb, seeh 1905 setp tmpb ; parity of top_seed 1906 shr seed, 16 1907 shl tmpd, 16 1908 test seeb, seeh 1909 setp tmpb ; parity of 
cur_seed 1910 or r6d, 0x00010001 1911 xor tmpd, r6d 1912 mov seed, tmpd 1913 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1914 1915%if ARCH_X86_32 1916 mov r3m, seed 1917 1918 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1919 1920 mov offxd, offyd 1921%else 1922 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1923 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1924 1925 lea topleft_offxyq, [top_offxyq+16] 1926 lea left_offxyq, [offyq+16] 1927 mov offyd, seed 1928 mov offxd, seed 1929%endif 1930 ror offyd, 8 1931 ror offxd, 12 1932 and offyd, 0xf000f 1933 and offxd, 0xf000f 1934 imul offyd, 164 1935 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1936 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1937 1938%if ARCH_X86_32 1939 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1940 1941 movzx r5, offxyw ; top_offxy 1942 mov [rsp+5*mmsize+1*gprsize], r5 1943%else 1944 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1945 h, offxy, see, left_offxy, top_offxy, topleft_offxy 1946 1947 movzx top_offxyd, offxyw 1948%endif 1949 shr offxyd, 16 1950 1951 mov hd, r7m 1952 mov grain_lutq, grain_lutmp 1953.loop_y_hv_overlap: 1954 ; grain = grain_lut[offy+y][offx+x] 1955 movu m3, [grain_lutq+offxyq] 1956%if ARCH_X86_32 1957 mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy 1958 mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy 1959 movu m6, [grain_lutq+r5] 1960 mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy 1961 movd m4, [grain_lutq+r0] 1962 movd m7, [grain_lutq+r5] 1963%else 1964 movu m6, [grain_lutq+top_offxyq] 1965 movd m4, [grain_lutq+left_offxyq] 1966 movd m7, [grain_lutq+topleft_offxyq] 1967%endif 1968 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1969 punpcklbw m4, m3 1970 punpcklbw m7, m6 1971 pmaddubsw m2, m15, m4 1972 pmaddubsw m4, m15, m7 1973 pmulhrsw m2, m14 1974 pmulhrsw m4, m14 1975 packsswb m2, m2 1976 packsswb m4, m4 1977 shufps m2, m3, q3210 1978 shufps m4, m6, q3210 
1979 ; followed by v interpolation (top | cur -> cur) 1980 punpcklbw m3, m4, m2 1981 punpckhbw m4, m2 1982%if ARCH_X86_32 1983 mov r5, [rsp+5*mmsize+12] 1984 pmaddubsw m7, [r5], m4 1985 pmaddubsw m4, [r5], m3 1986%else 1987 pmaddubsw m7, m8, m4 1988 pmaddubsw m4, m8, m3 1989%endif 1990 pmulhrsw m7, m14 1991 pmulhrsw m4, m14 1992 packsswb m4, m7 1993 pxor m2, m2 1994 pcmpgtb m7, m2, m4 1995 punpcklbw m3, m4, m7 1996 punpckhbw m4, m7 1997 1998 ; src 1999 mova m0, [srcq] 2000 punpckhbw m1, m0, m2 2001 punpcklbw m0, m2 ; m0-1: src as word 2002 2003 ; scaling[src] 2004%if ARCH_X86_32 2005 vpgatherdw m5, m0, scalingq-1, r0, r5, m7 2006 vpgatherdw m6, m1, scalingq-1, r0, r5, m7 2007%else 2008 vpgatherdw m5, m0, scalingq-1, r13, r14, m7 2009 vpgatherdw m6, m1, scalingq-1, r13, r14, m7 2010%endif 2011 REPX {psrlw x, 8}, m5, m6 2012 2013 ; noise = round2(scaling[src] * grain, scaling_shift) 2014 pmullw m3, m5 2015 pmullw m4, m6 2016 pmulhrsw m3, m11 2017 pmulhrsw m4, m11 2018 2019 ; dst = clip_pixel(src, noise) 2020 paddw m0, m3 2021 paddw m1, m4 2022 pmaxsw m0, m13 2023 pmaxsw m1, m13 2024 pminsw m0, m12 2025 pminsw m1, m12 2026 packuswb m0, m1 2027 movifnidn dstq, dstmp 2028 mova [dstq+srcq], m0 2029 2030%if ARCH_X86_32 2031 add dword [rsp+5*mmsize+12], mmsize 2032%else 2033 mova m8, [pb_17_27] 2034%endif 2035 add srcq, r2mp 2036 add grain_lutq, 82 2037 dec hw 2038 jz .end_y_hv_overlap 2039 ; 2 lines get vertical overlap, then fall back to non-overlap code for 2040 ; remaining (up to) 30 lines 2041 btc hd, 16 2042 jnc .loop_y_hv_overlap 2043 jmp .loop_y_h_overlap 2044 2045.end_y_hv_overlap: 2046%if ARCH_X86_32 2047 add r4mp, 16 2048%else 2049 add wq, 16 2050%endif 2051 jge .end_hv 2052%if ARCH_X86_32 2053 mov srcq, r1m 2054 add srcq, r4m 2055%else 2056 lea srcq, [src_bakq+wq] 2057%endif 2058 xor dword r8m, 4 2059 add offxyd, 16 2060%if ARCH_X86_32 2061 add dword [rsp+5*mmsize+1*gprsize], 16 2062%else 2063 add top_offxyd, 16 2064%endif 2065 jmp .loop_x_odd_v_overlap 2066 
2067.end_hv: 2068 RET 2069 2070%macro FGUV_FN 3 ; name, ss_hor, ss_ver 2071INIT_XMM ssse3 2072%if ARCH_X86_32 2073; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, 2074; sby, luma, lstride, uv_pl, is_id) 2075%if STACK_ALIGNMENT < mmsize 2076DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 2077cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ 2078 tmp, src, scaling, h, fg_data, picptr, unused 2079 mov r0, r0m 2080 mov r1, r2m 2081 mov r2, r4m 2082 mov r3, r6m 2083 mov r4, r7m 2084 mov [rsp+7*mmsize+3*gprsize], r0 2085 mov [rsp+7*mmsize+5*gprsize], r1 2086 mov [rsp+7*mmsize+7*gprsize], r2 2087 mov [rsp+7*mmsize+9*gprsize], r3 2088 mov [rsp+7*mmsize+10*gprsize], r4 2089 2090 mov r0, r8m 2091 mov r1, r9m 2092 mov r2, r10m 2093 mov r4, r11m 2094 mov r3, r12m 2095 mov [rsp+7*mmsize+11*gprsize], r0 2096 mov [rsp+7*mmsize+12*gprsize], r1 2097 mov [rsp+7*mmsize+13*gprsize], r2 2098 mov [rsp+7*mmsize+14*gprsize], r4 2099%else 2100cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ 2101 tmp, src, scaling, h, fg_data, picptr, unused 2102%endif 2103 mov srcq, srcm 2104 mov fg_dataq, r3m 2105 mov scalingq, r5m 2106%if STACK_ALIGNMENT < mmsize 2107%define r0m [rsp+7*mmsize+ 3*gprsize] 2108%define r1m [rsp+7*mmsize+ 4*gprsize] 2109%define r2m [rsp+7*mmsize+ 5*gprsize] 2110%define r3m [rsp+7*mmsize+ 6*gprsize] 2111%define r4m [rsp+7*mmsize+ 7*gprsize] 2112%define r5m [rsp+7*mmsize+ 8*gprsize] 2113%define r6m [rsp+7*mmsize+ 9*gprsize] 2114%define r7m [rsp+7*mmsize+10*gprsize] 2115%define r8m [rsp+7*mmsize+11*gprsize] 2116%define r9m [rsp+7*mmsize+12*gprsize] 2117%define r10m [rsp+7*mmsize+13*gprsize] 2118%define r11m [rsp+7*mmsize+14*gprsize] 2119%define r12m [rsp+7*mmsize+15*gprsize] 2120%endif 2121 LEA r5, pb_mask 2122%define base r5-pb_mask 2123 mov r5m, r5 2124%else 2125cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2126 grain_lut, tmp, sby, luma, lstride, uv_pl, is_id 2127 lea 
r8, [pb_mask] 2128%define base r8-pb_mask 2129%endif 2130 mov r6d, [fg_dataq+FGData.scaling_shift] 2131 movd m3, [base+mul_bits+r6*2-14] 2132 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2133 lea tmpd, [r6d*2] 2134%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize 2135 test r3, r3 2136%else 2137 cmp dword r12m, 0 ; is_idm 2138%endif 2139 movd m5, [base+min+r6*2] 2140 cmovne r6d, tmpd 2141 movd m4, [base+max+r6*2] 2142 punpcklwd m3, m3 2143 punpcklwd m5, m5 2144 punpcklwd m4, m4 2145 pshufd m3, m3, q0000 2146 pshufd m5, m5, q0000 2147 pshufd m4, m4, q0000 2148 SCRATCH 3, 11, 0 2149 SCRATCH 4, 12, 1 2150 SCRATCH 5, 13, 2 2151 2152 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2153 jne .csfl 2154 2155%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 2156%if ARCH_X86_32 2157 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2158%else 2159 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2160%endif 2161 2162%if %1 2163 mov r6d, dword r11m 2164 movd m0, [fg_dataq+FGData.uv_mult+r6*4] 2165 movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2166 punpcklbw m6, m1, m0 2167 movd m7, [fg_dataq+FGData.uv_offset+r6*4] 2168 punpcklwd m6, m6 2169 punpcklwd m7, m7 2170 pshufd m6, m6, q0000 2171 pshufd m7, m7, q0000 2172 SCRATCH 6, 14, 3 2173 SCRATCH 7, 15, 4 2174%endif 2175 2176 mov sbyd, r8m 2177 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 2178 test overlapd, overlapd 2179 jz %%no_vertical_overlap 2180%if ARCH_X86_32 2181%if %2 2182 mova m1, [base+pb_23_22_h] 2183%else 2184 mova m1, [base+pb_27_17_17_27] 2185%endif 2186 mova m0, [base+pw_1024] 2187%else 2188%if %2 2189 mova m1, [pb_23_22_h] 2190%else 2191 mova m1, [pb_27_17_17_27] 2192%endif 2193 mova m0, [pw_1024] 2194%endif 2195 SCRATCH 0, 8, 5 2196 SCRATCH 1, 9, 6 2197 test sbyd, sbyd 2198 jnz %%vertical_overlap 2199 ; fall-through 2200 2201%%no_vertical_overlap: 2202 mov r8m, overlapd 2203%if ARCH_X86_32 2204 DEFINE_ARGS dst, src, scaling, 
see, fg_data, picptr, overlap 2205 imul seed, (173 << 24) | 37 2206%else 2207 imul seed, sbyd, (173 << 24) | 37 2208%endif 2209 add seed, (105 << 24) | 178 2210 rol seed, 8 2211 movzx seed, seew 2212 xor seed, [fg_dataq+FGData.seed] 2213 2214%if ARCH_X86_32 2215 mov r3m, seed 2216 2217 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2218%define luma_bakq lumaq 2219 2220 mov wq, r4m 2221%if %3 2222 shl r10mp, 1 2223%endif 2224%else 2225 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2226 unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak 2227 2228 mov lstrideq, r10mp 2229%endif 2230 2231 mov lumaq, r9mp 2232 lea src_bakq, [srcq+wq] 2233 lea luma_bakq, [lumaq+wq*(1+%2)] 2234 neg wq 2235 sub r0mp, srcq 2236%if ARCH_X86_32 2237 mov r1m, src_bakq 2238 mov r11m, luma_bakq 2239 mov r4m, wq 2240 2241 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2242%else 2243 mov r11mp, src_bakq 2244 mov r12mp, strideq 2245%endif 2246 2247%%loop_x: 2248%if ARCH_X86_32 2249 mov seed, r3m 2250%endif 2251 mov r6d, seed 2252 or seed, 0xEFF4 2253 shr r6d, 1 2254 test seeb, seeh 2255 lea seed, [r6+0x8000] 2256 cmovp seed, r6d ; updated seed 2257%if ARCH_X86_32 2258 mov r3m, seed 2259 2260 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2261 2262 mov offxd, offyd 2263%else 2264 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2265 offx, offy, see, overlap, unused1, unused2, lstride 2266 2267 mov offyd, seed 2268 mov offxd, seed 2269%endif 2270 ror offyd, 8 2271 shr offxd, 12 2272 and offyd, 0xf 2273 imul offyd, 164>>%3 2274 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx 2275 2276%if ARCH_X86_32 2277 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2278%else 2279 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2280 h, offxy, see, overlap, unused1, unused2, lstride, luma_bak 2281%endif 2282 2283%%loop_x_odd: 2284 mov hd, r7m 2285 mov grain_lutq, grain_lutmp 2286%%loop_y: 
2287 ; src 2288%if ARCH_X86_32 2289 mov lumaq, r9mp 2290%endif 2291%if %2 2292 mova m4, [lumaq+ 0] 2293 mova m6, [lumaq+16] 2294 mova m0, [srcq] 2295%if ARCH_X86_32 2296 add lumaq, r10mp 2297 mov r9mp, lumaq 2298 mov r5, r5m 2299 movd m7, [base+pb_1] 2300%else 2301 movd m7, [pb_1] 2302%endif 2303 pshufd m7, m7, q0000 2304 pxor m2, m2 2305 pmaddubsw m4, m7 2306 pmaddubsw m6, m7 2307 pavgw m4, m2 2308 pavgw m6, m2 2309%else 2310 mova m4, [lumaq] 2311 mova m0, [srcq] 2312%if ARCH_X86_32 2313 add lumaq, r10mp 2314 mov r9mp, lumaq 2315%endif 2316 pxor m2, m2 2317%endif 2318 2319%if %1 2320%if %2 2321 packuswb m4, m6 ; luma 2322%endif 2323 punpckhbw m6, m4, m0 2324 punpcklbw m4, m0 ; { luma, chroma } 2325 pmaddubsw m6, m14 2326 pmaddubsw m4, m14 2327 psraw m6, 6 2328 psraw m4, 6 2329 paddw m6, m15 2330 paddw m4, m15 2331 packuswb m4, m6 ; pack+unpack = clip 2332 punpckhbw m6, m4, m2 2333 punpcklbw m4, m2 2334%elif %2 == 0 2335 punpckhbw m6, m4, m2 2336 punpcklbw m4, m2 2337%endif 2338 2339 ; scaling[luma_src] 2340%if ARCH_X86_32 2341 vpgatherdw m7, m4, scalingq-1, r0, r5 2342 vpgatherdw m5, m6, scalingq-1, r0, r5 2343%else 2344 vpgatherdw m7, m4, scalingq-1, r12, r2 2345 vpgatherdw m5, m6, scalingq-1, r12, r2 2346%endif 2347 REPX {psrlw x, 8}, m7, m5 2348 2349 ; unpack chroma_source 2350 punpckhbw m1, m0, m2 2351 punpcklbw m0, m2 ; m0-1: src as word 2352 2353 ; grain = grain_lut[offy+y][offx+x] 2354 movu m3, [grain_lutq+offxyq+ 0] 2355 pcmpgtb m6, m2, m3 2356 punpcklbw m2, m3, m6 2357 punpckhbw m3, m6 2358 2359 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2360 pmullw m2, m7 2361 pmullw m3, m5 2362 pmulhrsw m2, m11 2363 pmulhrsw m3, m11 2364 2365%if ARCH_X86_32 2366 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2367%endif 2368 2369 ; dst = clip_pixel(src, noise) 2370 paddw m0, m2 2371 paddw m1, m3 2372 pmaxsw m0, m13 2373 pmaxsw m1, m13 2374 pminsw m0, m12 2375 pminsw m1, m12 2376 packuswb m0, m1 2377 movifnidn dstq, dstmp 2378 mova [dstq+srcq], 
m0 2379 2380%if ARCH_X86_32 2381 add srcq, r2mp 2382 ; we already incremented lumaq above 2383%else 2384 add srcq, r12mp 2385%if %3 2386 lea lumaq, [lumaq+lstrideq*2] 2387%else 2388 add lumaq, lstrideq 2389%endif 2390%endif 2391 add grain_lutq, 82 2392 dec hw 2393 jg %%loop_y 2394 2395%if ARCH_X86_32 2396 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2397 2398 mov wq, r4m 2399%endif 2400 add wq, 16 2401 jge %%end 2402%if ARCH_X86_32 2403 mov srcq, r1mp 2404 mov lumaq, r11mp 2405%else 2406 mov srcq, r11mp 2407%endif 2408 lea lumaq, [luma_bakq+wq*(1+%2)] 2409 add srcq, wq 2410%if ARCH_X86_32 2411 mov r4m, wq 2412 mov r9m, lumaq 2413%endif 2414%if %2 == 0 2415 ; adjust top_offxy 2416%if ARCH_X86_32 2417 add dword [rsp+7*mmsize+1*gprsize], 16 2418%else 2419 add r11d, 16 2420%endif 2421 add offxyd, 16 2422 btc dword r8m, 2 2423 jc %%loop_x_even 2424 test dword r8m, 2 2425 jz %%loop_x_odd 2426 jmp %%loop_x_odd_v_overlap 2427%%loop_x_even: 2428%endif 2429 test dword r8m, 1 2430 jz %%loop_x 2431 2432 ; r8m = sbym 2433 test dword r8m, 2 2434 jne %%loop_x_hv_overlap 2435 2436 ; horizontal overlap (without vertical overlap) 2437%%loop_x_h_overlap: 2438%if ARCH_X86_32 2439%if %2 2440 lea r6, [offxyd+16] 2441 mov [rsp+7*mmsize+0*gprsize], r6 2442%else 2443 mov [rsp+7*mmsize+0*gprsize], offxyd 2444%endif 2445 2446 DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut 2447 2448 mov seed, r3m 2449%else 2450 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2451 offx, offy, see, left_offxy, unused1, unused2, lstride 2452 2453%if %2 2454 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2455%else 2456 mov left_offxyd, offyd 2457%endif 2458%endif 2459 mov r6d, seed 2460 or seed, 0xEFF4 2461 shr r6d, 1 2462 test seeb, seeh 2463 lea seed, [r6+0x8000] 2464 cmovp seed, r6d ; updated seed 2465 2466%if ARCH_X86_32 2467 mov r3m, seed 2468 2469 DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx 2470 2471 mov offxd, offyd 2472%else 2473 
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2474 offx, offy, see, left_offxy, unused1, unused2, lstride 2475 2476 mov offyd, seed 2477 mov offxd, seed 2478%endif 2479 ror offyd, 8 2480 shr offxd, 12 2481 and offyd, 0xf 2482 imul offyd, 164>>%3 2483 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2484 2485%if ARCH_X86_32 2486 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2487%else 2488 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2489 h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak 2490%endif 2491 2492 mov hd, r7m 2493 mov grain_lutq, grain_lutmp 2494%%loop_y_h_overlap: 2495 ; src 2496%if ARCH_X86_32 2497 mov lumaq, r9mp 2498%endif 2499%if %2 2500 mova m4, [lumaq+ 0] 2501 mova m6, [lumaq+16] 2502 mova m0, [srcq] 2503%if ARCH_X86_32 2504 add lumaq, r10mp 2505 mov r9mp, lumaq 2506 mov r5, r5m 2507 movd m7, [base+pb_1] 2508%else 2509 movd m7, [pb_1] 2510%endif 2511 pshufd m7, m7, q0000 2512 pxor m2, m2 2513 pmaddubsw m4, m7 2514 pmaddubsw m6, m7 2515 pavgw m4, m2 2516 pavgw m6, m2 2517%else 2518 mova m4, [lumaq] 2519 mova m0, [srcq] 2520%if ARCH_X86_32 2521 add lumaq, r10mp 2522 mov r9mp, lumaq 2523%endif 2524 pxor m2, m2 2525%endif 2526 2527%if %1 2528%if %2 2529 packuswb m4, m6 ; luma 2530%endif 2531 punpckhbw m6, m4, m0 2532 punpcklbw m4, m0 ; { luma, chroma } 2533 pmaddubsw m6, m14 2534 pmaddubsw m4, m14 2535 psraw m6, 6 2536 psraw m4, 6 2537 paddw m6, m15 2538 paddw m4, m15 2539 packuswb m4, m6 ; pack+unpack = clip 2540 punpckhbw m6, m4, m2 2541 punpcklbw m4, m2 2542%elif %2 == 0 2543 punpckhbw m6, m4, m2 2544 punpcklbw m4, m2 2545%endif 2546 2547 ; scaling[luma_src] 2548%if ARCH_X86_32 2549 vpgatherdw m7, m4, scalingq-1, r0, r5 2550 vpgatherdw m5, m6, scalingq-1, r0, r5 2551%else 2552 vpgatherdw m7, m4, scalingq-1, r12, r2 2553 vpgatherdw m5, m6, scalingq-1, r12, r2 2554%endif 2555 REPX {psrlw x, 8}, m7, m5 2556 2557 ; unpack chroma_source 2558 punpckhbw m1, m0, m2 2559 
punpcklbw m0, m2 ; m0-1: src as word 2560 2561 ; grain = grain_lut[offy+y][offx+x] 2562 movu m4, [grain_lutq+offxyq+ 0] 2563%if ARCH_X86_32 2564 mov r0, [rsp+7*mmsize+0*gprsize] 2565 movd m2, [grain_lutq+r0+ 0] 2566%else 2567 movd m2, [grain_lutq+left_offxyq+ 0] 2568%endif 2569 punpcklbw m2, m4 2570 pmaddubsw m3, m9, m2 2571 pmulhrsw m3, m8 2572 packsswb m3, m3 2573 shufps m3, m4, q3210 2574 pxor m4, m4 2575 pcmpgtb m4, m3 2576 punpcklbw m2, m3, m4 2577 punpckhbw m3, m4 2578 2579 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2580 pmullw m2, m7 2581 pmullw m3, m5 2582 pmulhrsw m2, m11 2583 pmulhrsw m3, m11 2584 2585%if ARCH_X86_32 2586 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2587%endif 2588 2589 ; dst = clip_pixel(src, noise) 2590 paddw m0, m2 2591 paddw m1, m3 2592 pmaxsw m0, m13 2593 pmaxsw m1, m13 2594 pminsw m0, m12 2595 pminsw m1, m12 2596 packuswb m0, m1 2597 movifnidn dstq, dstmp 2598 mova [dstq+srcq], m0 2599 2600%if ARCH_X86_32 2601 add srcq, r2mp 2602 ; lumaq has already been incremented above 2603%else 2604 add srcq, r12mp 2605%if %3 2606 lea lumaq, [lumaq+lstrideq*2] 2607%else 2608 add lumaq, lstrideq 2609%endif 2610%endif 2611 add grain_lutq, 82 2612 dec hw 2613 jg %%loop_y_h_overlap 2614 2615%if ARCH_X86_32 2616 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2617 2618 mov wq, r4m 2619%endif 2620 add wq, 16 2621 jge %%end 2622%if ARCH_X86_32 2623 mov srcq, r1mp 2624 mov lumaq, r11mp 2625%else 2626 mov srcq, r11mp 2627%endif 2628 lea lumaq, [luma_bakq+wq*(1+%2)] 2629 add srcq, wq 2630%if ARCH_X86_32 2631 mov r4m, wq 2632 mov r9m, lumaq 2633%endif 2634%if %2 == 0 2635 xor dword r8m, 4 2636 ; adjust top_offxyd 2637%if ARCH_X86_32 2638 add dword [rsp+7*mmsize+1*gprsize], 16 2639%else 2640 add r11d, 16 2641%endif 2642 add offxyd, 16 2643%endif 2644 2645 ; r8m = sbym 2646 test dword r8m, 2 2647%if %2 2648 jne %%loop_x_hv_overlap 2649 jmp %%loop_x_h_overlap 2650%else 2651 jne %%loop_x_odd_v_overlap 2652 jmp 
%%loop_x_odd 2653%endif 2654 2655%%end: 2656 RET 2657 2658%%vertical_overlap: 2659%if ARCH_X86_32 2660 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2661%else 2662 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 2663%endif 2664 2665 or overlapd, 2 ; top_overlap: overlap & 2 2666 mov r8m, overlapd 2667 movzx sbyd, sbyb 2668%if ARCH_X86_32 2669 imul r4, [fg_dataq+FGData.seed], 0x00010001 2670 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 2671%else 2672 imul seed, [fg_dataq+FGData.seed], 0x00010001 2673%endif 2674 imul tmpd, sbyd, 173 * 0x00010001 2675 imul sbyd, 37 * 0x01000100 2676 add tmpd, (105 << 16) | 188 2677 add sbyd, (178 << 24) | (141 << 8) 2678 and tmpd, 0x00ff00ff 2679 and sbyd, 0xff00ff00 2680 xor seed, tmpd 2681%if ARCH_X86_32 2682 xor sbyd, seed ; (cur_seed << 16) | top_seed 2683 2684 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2685 2686 mov r3m, seed 2687 mov wq, r4m 2688%if %3 2689 shl r10mp, 1 2690%endif 2691%else 2692 xor seed, sbyd ; (cur_seed << 16) | top_seed 2693 2694 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2695 tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak 2696 2697 mov lstrideq, r10mp 2698%endif 2699 2700 mov lumaq, r9mp 2701 lea src_bakq, [srcq+wq] 2702 lea luma_bakq, [lumaq+wq*(1+%2)] 2703 neg wq 2704 sub r0mp, srcq 2705%if ARCH_X86_32 2706 mov r1m, src_bakq 2707 mov r11m, luma_bakq 2708 mov r4m, wq 2709 2710 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2711%else 2712 mov r11mp, src_bakq 2713 mov r12mp, strideq 2714%endif 2715 2716%%loop_x_v_overlap: 2717%if ARCH_X86_32 2718 mov seed, r3m 2719 xor tmpd, tmpd 2720%endif 2721 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 2722 mov r6d, seed 2723 or seed, 0xeff4eff4 2724 test seeb, seeh 2725 setp tmpb ; parity of top_seed 2726 shr seed, 16 2727 shl tmpd, 16 2728 test seeb, seeh 2729 setp tmpb ; parity of cur_seed 2730 or r6d, 0x00010001 2731 xor 
tmpd, r6d 2732 mov seed, tmpd 2733 ror seed, 1 ; updated (cur_seed << 16) | top_seed 2734 2735%if ARCH_X86_32 2736 mov r3m, seed 2737 2738 DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx 2739 2740 mov offxd, offyd 2741%else 2742 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2743 offx, offy, see, overlap, top_offxy, unused, lstride 2744 2745 mov offxd, seed 2746 mov offyd, seed 2747%endif 2748 ror offyd, 8 2749 ror offxd, 12 2750 and offyd, 0xf000f 2751 and offxd, 0xf000f 2752 imul offyd, 164>>%3 2753 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2754 lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 2755 2756%if ARCH_X86_32 2757 DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy 2758%else 2759 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2760 h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak 2761%endif 2762 2763 movzx top_offxyd, offxyw 2764 shr offxyd, 16 2765%if ARCH_X86_32 2766 mov [rsp+7*mmsize+1*gprsize], top_offxyd 2767 2768 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2769%endif 2770 2771%%loop_x_odd_v_overlap: 2772 mov hd, r7m 2773 mov grain_lutq, grain_lutmp 2774%if ARCH_X86_32 2775 mov r5, r5m 2776%endif 2777%if %3 2778 mova m1, [PIC_ptr(pb_23_22)] 2779%else 2780 mova m1, [PIC_ptr(pb_27_17)] 2781%endif 2782%%loop_y_v_overlap: 2783%if ARCH_X86_32 2784 mov lumaq, r9mp 2785%endif 2786%if %2 2787 mova m4, [lumaq+ 0] 2788 mova m6, [lumaq+16] 2789 mova m0, [srcq] 2790%if ARCH_X86_32 2791 add lumaq, r10mp 2792 mov r9mp, lumaq 2793 mov r5, r5m 2794 movd m7, [base+pb_1] 2795%else 2796 movd m7, [pb_1] 2797%endif 2798 pshufd m7, m7, q0000 2799 pxor m2, m2 2800 pmaddubsw m4, m7 2801 pmaddubsw m6, m7 2802 pavgw m4, m2 2803 pavgw m6, m2 2804%else 2805 mova m4, [lumaq] 2806 mova m0, [srcq] 2807%if ARCH_X86_32 2808 add lumaq, r10mp 2809 mov r9mp, lumaq 2810%endif 2811 pxor m2, m2 2812%endif 2813 2814%if %1 2815%if %2 2816 packuswb m4, m6 ; luma 2817%endif 2818 
punpckhbw m6, m4, m0 2819 punpcklbw m4, m0 ; { luma, chroma } 2820 pmaddubsw m6, m14 2821 pmaddubsw m4, m14 2822 psraw m6, 6 2823 psraw m4, 6 2824 paddw m6, m15 2825 paddw m4, m15 2826 packuswb m4, m6 ; pack+unpack = clip 2827 punpckhbw m6, m4, m2 2828 punpcklbw m4, m2 2829%elif %2 == 0 2830 punpckhbw m6, m4, m2 2831 punpcklbw m4, m2 2832%endif 2833 2834 ; scaling[luma_src] 2835%if ARCH_X86_32 2836 vpgatherdw m7, m4, scalingq-1, r0, r5 2837 vpgatherdw m5, m6, scalingq-1, r0, r5 2838%else 2839 vpgatherdw m7, m4, scalingq-1, r12, r2 2840 vpgatherdw m5, m6, scalingq-1, r12, r2 2841%endif 2842 REPX {psrlw x, 8}, m7, m5 2843 2844 ; grain = grain_lut[offy+y][offx+x] 2845 movu m3, [grain_lutq+offxyq] 2846%if ARCH_X86_32 2847 mov r0, [rsp+7*mmsize+1*gprsize] 2848 movu m4, [grain_lutq+r0] 2849%else 2850 movu m4, [grain_lutq+top_offxyq] 2851%endif 2852 punpckhbw m6, m4, m3 2853 punpcklbw m4, m3 2854 pmaddubsw m2, m1, m6 2855 pmaddubsw m3, m1, m4 2856 pmulhrsw m2, m8 2857 pmulhrsw m3, m8 2858 packsswb m3, m2 2859 pxor m6, m6 2860 pcmpgtb m6, m3 2861 punpcklbw m2, m3, m6 2862 punpckhbw m3, m6 2863 2864 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2865 pmullw m2, m7 2866 pmullw m3, m5 2867 pmulhrsw m2, m11 2868 pmulhrsw m3, m11 2869 2870 ; unpack chroma_source 2871 pxor m4, m4 2872 punpckhbw m6, m0, m4 2873 punpcklbw m0, m4 ; m0-1: src as word 2874 2875%if ARCH_X86_32 2876 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2877%endif 2878 2879 ; dst = clip_pixel(src, noise) 2880 paddw m0, m2 2881 paddw m6, m3 2882 pmaxsw m0, m13 2883 pmaxsw m6, m13 2884 pminsw m0, m12 2885 pminsw m6, m12 2886 packuswb m0, m6 2887 movifnidn dstq, dstmp 2888 mova [dstq+srcq], m0 2889 2890 dec hw 2891 je %%end_y_v_overlap 2892%if ARCH_X86_32 2893 add srcq, r2mp 2894 ; lumaq has already been incremented above 2895%else 2896 add srcq, r12mp 2897%if %3 2898 lea lumaq, [lumaq+lstrideq*2] 2899%else 2900 add lumaq, lstrideq 2901%endif 2902%endif 2903 add grain_lutq, 82 2904%if %3 
== 0 2905 btc hd, 16 2906%if ARCH_X86_32 2907 mov r5, r5m 2908%endif 2909 mova m1, [PIC_ptr(pb_17_27)] 2910 jnc %%loop_y_v_overlap 2911%endif 2912 jmp %%loop_y 2913 2914%%end_y_v_overlap: 2915%if ARCH_X86_32 2916 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2917 2918 mov wq, r4m 2919%endif 2920 add wq, 16 2921 jge %%end_hv 2922%if ARCH_X86_32 2923 mov srcq, r1mp 2924 mov lumaq, r11mp 2925%else 2926 mov srcq, r11mp 2927%endif 2928 lea lumaq, [luma_bakq+wq*(1+%2)] 2929 add srcq, wq 2930%if ARCH_X86_32 2931 mov r4m, wq 2932 mov r9m, lumaq 2933%endif 2934 2935%if %2 2936 ; since fg_dataq.overlap is guaranteed to be set, we never jump 2937 ; back to .loop_x_v_overlap, and instead always fall-through to 2938 ; h+v overlap 2939%else 2940%if ARCH_X86_32 2941 add dword [rsp+7*mmsize+1*gprsize], 16 2942%else 2943 add top_offxyd, 16 2944%endif 2945 add offxyd, 16 2946 btc dword r8m, 2 2947 jnc %%loop_x_odd_v_overlap 2948%endif 2949 2950%%loop_x_hv_overlap: 2951%if ARCH_X86_32 2952 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused 2953 2954 mov r6, [rsp+7*mmsize+1*gprsize] 2955%if %2 2956 lea r0, [r3d+16] 2957 add r6, 16 2958 mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy 2959%else 2960 mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy 2961%endif 2962 mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy 2963 2964 DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused 2965 2966 mov seed, r3m 2967 xor tmpd, tmpd 2968%else 2969 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2970 tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride 2971 2972%if %2 2973 lea topleft_offxyq, [top_offxyq+16] 2974 lea left_offxyq, [offxyq+16] 2975%else 2976 mov topleft_offxyq, top_offxyq 2977 mov left_offxyq, offxyq 2978%endif 2979 2980 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 2981%endif 2982 mov r6d, seed 2983 or seed, 0xeff4eff4 2984 test seeb, seeh 2985 setp tmpb ; parity of top_seed 2986 shr seed, 16 2987 shl tmpd, 16 2988 test seeb, 
seeh 2989 setp tmpb ; parity of cur_seed 2990 or r6d, 0x00010001 2991 xor tmpd, r6d 2992 mov seed, tmpd 2993 ror seed, 1 ; updated (cur_seed << 16) | top_seed 2994 2995%if ARCH_X86_32 2996 mov r3m, seed 2997 2998 DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx 2999 3000 mov offxd, offyd 3001%else 3002 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 3003 offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride 3004 3005 mov offxd, seed 3006 mov offyd, seed 3007%endif 3008 ror offyd, 8 3009 ror offxd, 12 3010 and offyd, 0xf000f 3011 and offxd, 0xf000f 3012 imul offyd, 164>>%3 3013 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 3014 lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 3015 3016%if ARCH_X86_32 3017 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 3018%else 3019 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 3020 h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak 3021%endif 3022 3023 movzx top_offxyd, offxyw 3024 shr offxyd, 16 3025%if ARCH_X86_32 3026 mov [rsp+7*mmsize+1*gprsize], top_offxyd 3027%endif 3028 3029 mov hd, r7m 3030 mov grain_lutq, grain_lutmp 3031%if ARCH_X86_32 3032 mov r5, r5m 3033%endif 3034%if %3 3035 mova m3, [PIC_ptr(pb_23_22)] 3036%else 3037 mova m3, [PIC_ptr(pb_27_17)] 3038%endif 3039%%loop_y_hv_overlap: 3040 ; grain = grain_lut[offy+y][offx+x] 3041%if ARCH_X86_32 3042 mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy 3043 mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy 3044 movd m1, [grain_lutq+r0] 3045 mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy 3046%else 3047 movd m1, [grain_lutq+topleft_offxyq] 3048%endif 3049 movu m2, [grain_lutq+offxyq] 3050%if ARCH_X86_32 3051 movu m6, [grain_lutq+r5] 3052 movd m4, [grain_lutq+r0] 3053%else 3054 movu m6, [grain_lutq+top_offxyq] 3055 movd m4, [grain_lutq+left_offxyq] 3056%endif 3057 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 3058 punpcklbw m1, m6 3059 
punpcklbw m4, m2 3060 pmaddubsw m0, m9, m1 3061 pmaddubsw m1, m9, m4 3062 REPX {pmulhrsw x, m8}, m0, m1 3063 packsswb m0, m1 3064 shufps m4, m0, m2, q3232 3065 shufps m0, m6, q3210 3066 ; followed by v interpolation (top | cur -> cur) 3067 punpcklbw m2, m0, m4 3068 punpckhbw m0, m4 3069 pmaddubsw m4, m3, m0 3070 pmaddubsw m1, m3, m2 3071 pmulhrsw m4, m8 3072 pmulhrsw m1, m8 3073 packsswb m1, m4 3074 3075 ; src 3076%if ARCH_X86_32 3077 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 3078 3079 mov lumaq, r9mp 3080%endif 3081%if %2 3082 mova m4, [lumaq+ 0] 3083 mova m6, [lumaq+16] 3084 mova m0, [srcq] 3085%if ARCH_X86_32 3086 add lumaq, r10mp 3087 mov r9mp, lumaq 3088 mov r5, r5m 3089 movd m7, [base+pb_1] 3090%else 3091 movd m7, [pb_1] 3092%endif 3093 pshufd m7, m7, q0000 3094 pxor m2, m2 3095 pmaddubsw m4, m7 3096 pmaddubsw m6, m7 3097 pavgw m4, m2 3098 pavgw m6, m2 3099%else 3100 mova m4, [lumaq] 3101 mova m0, [srcq] 3102%if ARCH_X86_32 3103 add lumaq, r10mp 3104 mov r9mp, lumaq 3105%endif 3106 pxor m2, m2 3107%endif 3108 3109%if %1 3110%if %2 3111 packuswb m4, m6 ; luma 3112%endif 3113 punpckhbw m6, m4, m0 3114 punpcklbw m4, m0 ; { luma, chroma } 3115 pmaddubsw m6, m14 3116 pmaddubsw m4, m14 3117 psraw m6, 6 3118 psraw m4, 6 3119 paddw m6, m15 3120 paddw m4, m15 3121 packuswb m4, m6 ; pack+unpack = clip 3122 punpckhbw m6, m4, m2 3123 punpcklbw m4, m2 3124%elif %2 == 0 3125 punpckhbw m6, m4, m2 3126 punpcklbw m4, m2 3127%endif 3128 3129 ; scaling[src] 3130%if ARCH_X86_32 3131 vpgatherdw m7, m4, scalingq-1, r0, r5 3132 vpgatherdw m5, m6, scalingq-1, r0, r5 3133%else 3134%if %3 3135 vpgatherdw m7, m4, scalingq-1, r2, r12 3136 vpgatherdw m5, m6, scalingq-1, r2, r12 3137%else 3138 vpgatherdw m7, m4, scalingq-1, r2, r13 3139 vpgatherdw m5, m6, scalingq-1, r2, r13 3140%endif 3141%endif 3142 REPX {psrlw x, 8}, m7, m5 3143 3144 ; unpack grain 3145 pxor m4, m4 3146 pcmpgtb m4, m1 3147 punpcklbw m2, m1, m4 3148 punpckhbw m1, m4 3149 3150 ; noise = 
round2(scaling[src] * grain, scaling_shift) 3151 pmullw m2, m7 3152 pmullw m1, m5 3153 pmulhrsw m2, m11 3154 pmulhrsw m1, m11 3155 3156%if ARCH_X86_32 3157 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 3158%endif 3159 3160 ; unpack chroma source 3161 pxor m4, m4 3162 punpckhbw m5, m0, m4 3163 punpcklbw m0, m4 ; m0-1: src as word 3164 3165 ; dst = clip_pixel(src, noise) 3166 paddw m0, m2 3167 paddw m5, m1 3168 pmaxsw m0, m13 3169 pmaxsw m5, m13 3170 pminsw m0, m12 3171 pminsw m5, m12 3172 packuswb m0, m5 3173 movifnidn dstq, dstmp 3174 mova [dstq+srcq], m0 3175 3176%if ARCH_X86_32 3177 add srcq, r2mp 3178 ; lumaq has been adjusted above already 3179%else 3180 add srcq, r12mp 3181%if %3 3182 lea lumaq, [lumaq+lstrideq*(1+%2)] 3183%else 3184 add lumaq, r10mp 3185%endif 3186%endif 3187 add grain_lutq, 82 3188 dec hw 3189%if %3 3190 jg %%loop_y_h_overlap 3191%else 3192 jle %%end_y_hv_overlap 3193%if ARCH_X86_32 3194 mov r5, r5m 3195%endif 3196 mova m3, [PIC_ptr(pb_17_27)] 3197 btc hd, 16 3198 jnc %%loop_y_hv_overlap 3199%if ARCH_X86_64 3200 mov lstrideq, r10mp 3201%endif 3202 jmp %%loop_y_h_overlap 3203%%end_y_hv_overlap: 3204%if ARCH_X86_64 3205 mov lstrideq, r10mp 3206%endif 3207%endif 3208 3209%if ARCH_X86_32 3210 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 3211 3212 mov wq, r4m 3213%endif 3214 add wq, 16 3215 jge %%end_hv 3216%if ARCH_X86_32 3217 mov srcq, r1mp 3218 mov lumaq, r11mp 3219%else 3220 mov srcq, r11mp 3221%endif 3222 lea lumaq, [luma_bakq+wq*(1+%2)] 3223 add srcq, wq 3224%if ARCH_X86_32 3225 mov r4m, wq 3226 mov r9m, lumaq 3227%endif 3228%if %2 3229 jmp %%loop_x_hv_overlap 3230%else 3231%if ARCH_X86_32 3232 add dword [rsp+7*mmsize+1*gprsize], 16 3233%else 3234 add top_offxyd, 16 3235%endif 3236 add offxyd, 16 3237 xor dword r8m, 4 3238 jmp %%loop_x_odd_v_overlap 3239%endif 3240 3241%%end_hv: 3242 RET 3243%endmacro 3244 3245 %%FGUV_32x32xN_LOOP 1, %2, %3 3246.csfl: 3247 %%FGUV_32x32xN_LOOP 0, %2, %3 3248%endmacro 3249 
; Instantiate the 8bpc chroma (UV) film-grain functions, one per chroma
; subsampling layout. The three FGUV_FN arguments look like
; (layout, ss_hor, ss_ver) -- TODO confirm against the FGUV_FN macro
; definition earlier in this file. The values are consistent with that
; reading: 4:2:0 subsamples both axes, 4:2:2 only horizontally, and
; 4:4:4 not at all (matching the (1+%2) width scaling and the %3-based
; vertical stride stepping seen inside the loop macro above).
FGUV_FN 420, 1, 1

; When the incoming stack cannot be assumed aligned to the SIMD register
; size, re-establish the stack-slot argument accessors before the next
; instantiation. NOTE(review): DECLARE_ARG presumably defines the
; rNm/rNmp-style memory accessors for arguments 0-12 -- verify against
; its definition (x86inc.asm / earlier in this file).
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 422, 1, 0

; Same re-declaration as above before the 4:4:4 variant.
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 444, 0, 0