1; Copyright © 2019-2021, VideoLAN and dav1d authors 2; Copyright © 2019, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

SECTION_RODATA

; Blend weights and rounding constants used by the film-grain kernels.
pw_1024: times 8 dw 1024
pb_27_17_17_27: db 27, 17, 17, 27
        times 6 db 0, 32
pb_23_22_h: db 23, 22
        times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
        times 2 dw 0x49d8
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1

; Emits a jump table of dd offsets (relative to the table itself) to the
; .ar0/.ar1/... labels of the named 8bpc function; indexed by ar_coeff_lag.
%macro JMP_TABLE 2-*
    %xdefine %1_8bpc_%2_table %%table
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y,      ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3

SECTION .text

%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif

; SCRATCH n, m, slot: on x86-32 (only 8 xmm regs) spill m<n> to a stack slot
; and alias m<m> to that memory; on x86-64 simply swap the register numbers.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro

; Generate the 73x82 luma grain buffer: a PRNG draws 4 seeds at a time and
; gathers from gaussian_sequence, then an optional AR filter (lag 1-3,
; dispatched through the jump table) is applied in-place.
INIT_XMM ssse3
cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
    LEA r4, $$
%define base r4-$$
    movq m1, [base+rnd_next_upperbit_mask]
    movq m4, [base+mul_bits]
    movq m7, [base+hmul_bits]
    mov r2d, [fg_dataq+FGData.grain_scale_shift]
    movd m2, [base+round+r2*2]
    movd m0, [fg_dataq+FGData.seed]
    mova m5, [base+pb_mask]
    pshuflw m2, m2, q0000
    pshuflw m0, m0, q0000
    mov r2, -73*82                  ; negative index over the whole 73x82 buffer
    sub bufq, r2
    lea r3, [base+gaussian_sequence]
.loop:
    ; advance the LFSR-style PRNG by 4 steps in parallel (one per word lane)
    pand m6, m0, m1
    psrlw m3, m6, 10
    por m6, m3                      ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw m6, m4                   ; bits 0x0f00 are set
    pshufb m3, m5, m6               ; set 15th bit for next 4 seeds
    psllq m6, m3, 30
    por m3, m6
    psllq m6, m3, 15
    por m3, m6                      ; aggregate each bit into next seed's high bit
    pmulhuw m6, m0, m7
    por m3, m6                      ; 4 next output seeds
    pshuflw m0, m3, q3333
    psrlw m3, 5
    ; gather 4 words from gaussian_sequence using the 11-bit seeds
%if ARCH_X86_64
    movq r6, m3
    mov r8, r6
    movzx r5d, r6w
    shr r6d, 16
    shr r8, 32
    movzx r7, r8w
    shr r8, 16

    movd m6, [r3+r5*2]
    pinsrw m6, [r3+r6*2], 1
    pinsrw m6, [r3+r7*2], 2
    pinsrw m6, [r3+r8*2], 3
%else
    movd r6, m3
    pshuflw m3, m3, q3232
    movzx r5, r6w
    shr r6, 16

    movd m6, [r3+r5*2]
    pinsrw m6, [r3+r6*2], 1

    movd r6, m3
    movzx r5, r6w
    shr r6, 16

    pinsrw m6, [r3+r5*2], 2
    pinsrw m6, [r3+r6*2], 3
%endif
    pmulhrsw m6, m2                 ; apply grain_scale_shift rounding
    packsswb m6, m6
    movd [bufq+r2], m6              ; store 4 grain bytes
    add r2, 4
    jl .loop

    ; auto-regression code
    movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
    lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
    jmp r2

; lag-1 AR filter: each pixel = clip(grain + ((top-left*c0 + top*c1 +
; top-right*c2 + left*c3 + rnd) >> shift)), processed serially in x since
; each output is the "left" input of the next.
.ar1:
%if ARCH_X86_32
    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    mov bufq, r0
%else
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd m4, [fg_dataq+FGData.ar_coeffs_y]
    mov ecx, [fg_dataq+FGData.ar_coeff_shift]
%if ARCH_X86_32
    mov r1m, cf3d
    DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
    pxor m6, m6
    pcmpgtb m7, m6, m4
    punpcklbw m4, m7                ; sign-extend coefficients to words
    pinsrw m4, [base+pw_1], 3
    pshufd m5, m4, q1111
    pshufd m4, m4, q0000
    movd m3, [base+round_vals+shiftq*2-12] ; rnd
    pshuflw m3, m3, q0000
    sub bufq, 82*73-(82*3+79)       ; skip the 3-row/3-col border
    mov hd, 70
    mov mind, -128
    mov maxd, 127
.y_loop_ar1:
    mov xq, -76
    movsx val3d, byte [bufq+xq-1]
.x_loop_ar1:
    ; compute 4 partial sums (top row contributions) in parallel
    movq m0, [bufq+xq-82-1]         ; top/left
    pcmpgtb m7, m6, m0
    punpcklbw m0, m7
    psrldq m2, m0, 2                ; top
    psrldq m1, m0, 4                ; top/right
    punpcklwd m0, m2
    punpcklwd m1, m3
    pmaddwd m0, m4
    pmaddwd m1, m5
    paddd m0, m1
.x_loop_ar1_inner:
    ; serial part: add left*cf3, shift, add grain, clip to [-128,127]
    movd val0d, m0
    psrldq m0, 4
    imul val3d, cf3d
    add val3d, val0d
    sar val3d, shiftb
    movsx val0d, byte [bufq+xq]
    add val3d, val0d
    cmp val3d, maxd
    cmovns val3d, maxd
    cmp val3d, mind
    cmovs val3d, mind
    mov byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar1
.ar0:                               ; lag 0: no AR filtering, grain is final
    RET

; lag-2 AR filter: 12 coefficients over the two rows above plus the two
; pixels to the left; top-row sums are vectorized, left pixels serial.
.ar2:
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*8
%endif
    DEFINE_ARGS buf, fg_data, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m6, [base+round_vals-12+shiftq*2]
    movd m7, [base+byte_blend+1]
    SCRATCH 7, 15, 7
    movq m0, [fg_dataq+FGData.ar_coeffs_y+0]  ; cf0-7
    movd m1, [fg_dataq+FGData.ar_coeffs_y+8]  ; cf8-11
    pxor m7, m7
    pshuflw m6, m6, q0000
    punpcklwd m6, m7
    pcmpgtb m4, m7, m0
    pcmpgtb m5, m7, m1
    punpcklbw m0, m4                ; sign-extend coefficients to words
    punpcklbw m1, m5
    DEFINE_ARGS buf, fg_data, h, x
    ; broadcast each coefficient pair into its own register/slot
    pshufd m4, m1, q0000
    pshufd m5, m1, q1111
    pshufd m3, m0, q3333
    pshufd m2, m0, q2222
    pshufd m1, m0, q1111
    pshufd m0, m0, q0000
    SCRATCH 0, 8, 0
    SCRATCH 1, 9, 1
    SCRATCH 2, 10, 2
    SCRATCH 3, 11, 3
    SCRATCH 4, 12, 4
    SCRATCH 5, 13, 5
    SCRATCH 6, 14, 6
    sub bufq, 82*73-(82*3+79)       ; skip the 3-row/3-col border
    ; (continuation of generate_grain_y_8bpc .ar2: per-pixel loops)
    mov hd, 70
.y_loop_ar2:
    mov xq, -76

.x_loop_ar2:
    ; vectorized sums of the two rows above the current pixel
    movq m0, [bufq+xq-82*2-2]       ; y=-2,x=[-2,+5]
    movhps m0, [bufq+xq-82*1-2]     ; y=-1,x=[-2,+5]
    pcmpgtb m2, m7, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    psrldq m5, m0, 2                ; y=-2,x=[-1,+5]
    psrldq m3, m1, 2                ; y=-1,x=[-1,+5]
    psrldq m4, m1, 4                ; y=-1,x=[+0,+5]
    punpcklwd m2, m0, m5
    punpcklwd m3, m4
    pmaddwd m2, m8
    pmaddwd m3, m11
    paddd m2, m3

    psrldq m4, m0, 4                ; y=-2,x=[+0,+5]
    psrldq m5, m0, 6                ; y=-2,x=[+1,+5]
    psrldq m6, m0, 8                ; y=-2,x=[+2,+5]
    punpcklwd m4, m5
    punpcklwd m6, m1
    psrldq m5, m1, 6                ; y=-1,x=[+1,+5]
    psrldq m1, m1, 8                ; y=-1,x=[+2,+5]
    punpcklwd m5, m1
    pmaddwd m4, m9
    pmaddwd m6, m10
    pmaddwd m5, m12
    paddd m4, m6
    paddd m2, m5
    paddd m2, m4
    paddd m2, m14                   ; + rounding constant

    movq m0, [bufq+xq-2]            ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; serial part: the two left neighbors feed the next output
    pcmpgtb m4, m7, m0
    punpcklbw m1, m0, m4
    pmaddwd m3, m1, m13
    paddd m3, m2
    psrldq m1, 4                    ; y=0,x=0
    psrldq m2, 4                    ; shift top to next pixel
    psrad m3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw m3, m1
    packsswb m3, m3
    pslldq m3, 2
    pand m3, m15
    pandn m1, m15, m0
    por m0, m1, m3
    psrldq m0, 1
    ; overwrite 2 pixels, but that's ok
    movd [bufq+xq-1], m0
    inc xq
    jz .x_loop_ar2_end
    test xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar2
    RET

; lag-3 AR filter: 24 coefficients over the three rows above plus the three
; pixels to the left; row sums vectorized, left pixels serial.
.ar3:
    DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
%assign stack_offset stack_offset_old
    ALLOC_STACK -16*14
%elif WIN64
    SUB rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
    ALLOC_STACK -16*6
%endif
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m6, [base+round_vals-12+shiftq*2]
    movd m7, [base+byte_blend]
    movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0]  ; cf0-15
    movq m2, [fg_dataq+FGData.ar_coeffs_y+16]  ; cf16-23
    pxor m3, m3
    pcmpgtb m4, m3, m0
    pcmpgtb m3, m2
    pshuflw m6, m6, q0000
    SCRATCH 6, 14, 12
    SCRATCH 7, 15, 13
    punpckhbw m1, m0, m4            ; sign-extend coefficients to words
    punpcklbw m0, m4
    punpcklbw m2, m3
    ; broadcast coefficient pairs; first 6 go to stack slots
    pshufd m3, m0, q1111
    pshufd m4, m0, q2222
    pshufd m5, m0, q3333
    pshufd m0, m0, q0000
    mova [rsp+ 0*16], m0
    mova [rsp+ 1*16], m3
    mova [rsp+ 2*16], m4
    mova [rsp+ 3*16], m5
    pshufd m6, m1, q1111
    pshufd m7, m1, q2222
    pshufd m5, m1, q3333
    pshufd m1, m1, q0000
    pshufd m3, m2, q1111
    psrldq m0, m2, 10
    pinsrw m2, [base+pw_1], 5
    pshufd m4, m2, q2222
    pshufd m2, m2, q0000
    pinsrw m0, [base+round_vals+shiftq*2-10], 3
    mova [rsp+ 4*16], m1
    mova [rsp+ 5*16], m6
    SCRATCH 7, 8, 6
    SCRATCH 5, 9, 7
    SCRATCH 2, 10, 8
    SCRATCH 3, 11, 9
    SCRATCH 4, 12, 10
    SCRATCH 0, 13, 11
    DEFINE_ARGS buf, fg_data, h, x
    sub bufq, 82*73-(82*3+79)       ; skip the 3-row/3-col border
    mov hd, 70
.y_loop_ar3:
    mov xq, -76

.x_loop_ar3:
    ; row y=-3: 7 coefficient pairs
    movu m0, [bufq+xq-82*3-3]       ; y=-3,x=[-3,+12]
    pxor m3, m3
    pcmpgtb m3, m0
    punpckhbw m2, m0, m3
    punpcklbw m0, m3

    psrldq m5, m0, 2
    psrldq m6, m0, 4
    psrldq m7, m0, 6
    punpcklwd m4, m0, m5
    punpcklwd m6, m7
    pmaddwd m4, [rsp+ 0*16]
    pmaddwd m6, [rsp+ 1*16]
    paddd m4, m6

    ; row y=-2
    movu m1, [bufq+xq-82*2-3]       ; y=-2,x=[-3,+12]
    pxor m5, m5
    pcmpgtb m5, m1
    punpckhbw m3, m1, m5
    punpcklbw m1, m5
    palignr m6, m2, m0, 10
    palignr m7, m2, m0, 12
    psrldq m0, 8
    punpcklwd m0, m6
    punpcklwd m7, m1
    pmaddwd m0, [rsp+ 2*16]
    pmaddwd m7, [rsp+ 3*16]
    paddd m0, m7
    paddd m0, m4

    psrldq m4, m1, 2
    psrldq m5, m1, 4
    psrldq m6, m1, 6
    psrldq m7, m1, 8
    punpcklwd m4, m5
    punpcklwd m6, m7
    pmaddwd m4, [rsp+ 4*16]
    pmaddwd m6, [rsp+ 5*16]
    paddd m4, m6
    paddd m0, m4

    ; row y=-1
    movu m2, [bufq+xq-82*1-3]       ; y=-1,x=[-3,+12]
    pxor m7, m7
    pcmpgtb m7, m2
    punpckhbw m5, m2, m7
    punpcklbw m2, m7
    palignr m7, m3, m1, 10
    palignr m3, m1, 12
    psrldq m1, m2, 2
    punpcklwd m7, m3
    punpcklwd m3, m2, m1
    pmaddwd m7, m8
    pmaddwd m3, m9
    paddd m7, m3
    paddd m0, m7

    psrldq m6, m2, 4
    psrldq m1, m2, 6
    psrldq m3, m2, 8
    palignr m4, m5, m2, 10
    palignr m5, m5, m2, 12

    punpcklwd m6, m1
    punpcklwd m3, m4
    punpcklwd m5, m14
    pmaddwd m6, m10
    pmaddwd m3, m11
    pmaddwd m5, m12
    paddd m0, m6
    paddd m3, m5
    paddd m0, m3

    movq m1, [bufq+xq-3]            ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    ; serial part: three left neighbors, then shift/add/clip via packsswb
    pxor m5, m5
    pcmpgtb m5, m1
    punpcklbw m2, m1, m5
    pmaddwd m2, m13
    pshufd m3, m2, q1111
    paddd m2, m3                    ; left+cur
    paddd m2, m0                    ; add top
    psrldq m0, 4
    psrad m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb m2, m2
    pslldq m2, 3
    pand m2, m15
    pandn m3, m15, m1
    por m1, m2, m3
    movd [bufq+xq-3], m1
    psrldq m1, 1
    inc xq
    jz .x_loop_ar3_end
    test xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar3
    RET

; Chroma grain generation, parameterized by subsampling:
;   %1 = ss_name (420/422/444), %2 = ss_x, %3 = ss_y.
; Same structure as luma: PRNG + gaussian_sequence gather, then AR filter;
; the chroma AR filters additionally mix in the co-located luma grain.
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
    movifnidn r2, r2mp
    movifnidn r3, r3mp
    LEA r4, $$
%define base r4-$$
    movq m1, [base+rnd_next_upperbit_mask]
    movq m4, [base+mul_bits]
    movq m7, [base+hmul_bits]
    mov r5d, [fg_dataq+FGData.grain_scale_shift]
    movd m6, [base+round+r5*2]
    mova m5, [base+pb_mask]
    movd m0, [fg_dataq+FGData.seed]
    movd m2, [base+pw_seed_xor+uvq*4]
    pxor m0, m2                     ; per-plane seed tweak
    pshuflw m6, m6, q0000
    pshuflw m0, m0, q0000
    lea r6, [base+gaussian_sequence]
%if %2
%if ARCH_X86_64
    mov r7d, 73-35*%3               ; rows of the (subsampled) grain buffer
%else
    mov r3mp, 73-35*%3
%endif
    add bufq, 44
.loop_y:
    mov r5, -44
.loop_x:
%else
    ; (continuation of generate_grain_uv_fn: 444 path iterates the full buffer)
    mov r5, -82*73
    sub bufq, r5
.loop:
%endif
    ; PRNG step, 4 seeds in parallel (same scheme as generate_grain_y)
    pand m2, m0, m1
    psrlw m3, m2, 10
    por m2, m3                      ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw m2, m4                   ; bits 0x0f00 are set
    pshufb m3, m5, m2               ; set 15th bit for next 4 seeds
    psllq m2, m3, 30
    por m3, m2
    psllq m2, m3, 15
    por m3, m2                      ; aggregate each bit into next seed's high bit
    pmulhuw m2, m0, m7
    por m2, m3                      ; 4 next output seeds
    pshuflw m0, m2, q3333
    psrlw m2, 5
    ; gather 4 words from gaussian_sequence
%if ARCH_X86_64
    movd r9d, m2
    pshuflw m2, m2, q3232
    movzx r8, r9w
    shr r9, 16

    movd m3, [r6+r8*2]
    pinsrw m3, [r6+r9*2], 1

    movd r9d, m2
    movzx r8, r9w
    shr r9, 16

    pinsrw m3, [r6+r8*2], 2
    pinsrw m3, [r6+r9*2], 3
%else
    movd r2, m2
    pshuflw m2, m2, q3232
    movzx r1, r2w
    shr r2, 16

    movd m3, [r6+r1*2]
    pinsrw m3, [r6+r2*2], 1

    movd r2, m2
    movzx r1, r2w
    shr r2, 16

    pinsrw m3, [r6+r1*2], 2
    pinsrw m3, [r6+r2*2], 3
%endif
    pmulhrsw m3, m6                 ; apply grain_scale_shift rounding
    packsswb m3, m3
    movd [bufq+r5], m3
    add r5, 4
%if %2
    jl .loop_x
    add bufq, 82                    ; next row (stride is 82 even when subsampled)
%if ARCH_X86_64
    dec r7d
%else
    dec r3mp
%endif
    jg .loop_y
%else
    jl .loop
%endif

%if ARCH_X86_32
    mov r2, r2mp
%endif

    ; auto-regression code
    movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
    movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
    lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
    jmp r5

; lag 0 for chroma: no spatial AR, but the co-located (and, when subsampled,
; box-filtered) luma grain scaled by ar_coeffs_uv[24] is added in.
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -2*16
%endif
    imul uvd, 28                    ; sizeof per-plane coefficient block
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd m4, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h, x
    pxor m0, m0
    pcmpgtb m0, m5
    punpcklbw m5, m0                ; sign-extend luma coefficient to words
    movd m7, [base+pb_1]
%if %2
    movd m6, [base+hmul_bits+2+%3*2]
%endif
    pshuflw m5, m5, q0000
    pshuflw m4, m4, q0000
    pshufd m7, m7, q0000
%if %2
    pshuflw m6, m6, q0000
%endif
    punpcklqdq m5, m5
    punpcklqdq m4, m4
%if %2
    punpcklqdq m6, m6
%endif
    pcmpeqw m1, m1
    pslldq m1, 12>>%2               ; tail-pixel keep mask for the row end
    SCRATCH 1, 8, 0
    SCRATCH 4, 9, 1
%if %2
    sub bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub bufq, 82*70-3
%endif
    add bufyq, 3+82*3
    mov hd, 70-35*%3
.y_loop_ar0:
    xor xd, xd
.x_loop_ar0:
    ; first 32 pixels
%if %2
    ; subsampled: average 2 (or 2x2) luma samples per chroma sample
    movu m1, [bufyq+xq*2]
%if %3
    movu m2, [bufyq+xq*2+82]
%endif
    movu m3, [bufyq+xq*2+16]
%if %3
    movu m4, [bufyq+xq*2+82+16]
%endif
    pmaddubsw m0, m7, m1
%if %3
    pmaddubsw m1, m7, m2
%endif
    pmaddubsw m2, m7, m3
%if %3
    pmaddubsw m3, m7, m4
    paddw m0, m1
    paddw m2, m3
%endif
    pmulhrsw m0, m6
    pmulhrsw m2, m6
%else
    movu m0, [bufyq+xq]
    pxor m6, m6
    pcmpgtb m6, m0
    punpckhbw m2, m0, m6
    punpcklbw m0, m6
%endif
    pmullw m0, m5                   ; luma grain * coefficient
    pmullw m2, m5
    pmulhrsw m0, m9                 ; >> ar_coeff_shift (rounded)
    pmulhrsw m2, m9
    movu m1, [bufq+xq]
    pxor m4, m4
    pcmpgtb m4, m1
    punpckhbw m3, m1, m4
%if %2
    punpcklbw m1, m4
    paddw m2, m3
    paddw m0, m1
%else
    punpcklbw m6, m1, m4
    paddw m2, m3
    paddw m0, m6
%endif
    packsswb m0, m2
%if %2
    movu [bufq+xq], m0
    add xd, 16
    cmp xd, 32
    jl .x_loop_ar0

    ; last 6/12 pixels
    movu m1, [bufyq+xq*(1+%2)]
%if %3
    movu m2, [bufyq+xq*2+82]
%endif
    pmaddubsw m0, m7, m1
%if %3
    pmaddubsw m1, m7, m2
    paddw m0, m1
%endif
    pmulhrsw m0, m6
    pmullw m0, m5
    pmulhrsw m0, m9
    movq m1, [bufq+xq]
    pxor m4, m4
    pcmpgtb m4, m1
    punpcklbw m2, m1, m4
    paddw m0, m2
    packsswb m0, m0
    pandn m2, m8, m0                ; blend: keep original bytes past row end
    pand m1, m8
    por m2, m1
    movq [bufq+xq], m2
%else
    add xd, 16
    cmp xd, 80
    je .y_loop_final_ar0
    movu [bufq+xq-16], m0
    jmp .x_loop_ar0
.y_loop_final_ar0:
    pandn m2, m8, m0                ; blend: keep original bytes past row end
    ; (continuation of generate_grain_uv_fn .ar0: final-column blend + row step)
    pand m1, m8
    por m2, m1
    movu [bufq+xq-16], m2
%endif

    add bufq, 82
    add bufyq, 82<<%3
    dec hd
    jg .y_loop_ar0
    RET

; lag-1 chroma AR filter: like luma .ar1 plus a luma-grain term
; (ar_coeffs_uv[4]) folded into the vectorized top-row sum.
.ar1:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
    imul uvd, 28
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
    pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
%if ARCH_X86_32
    mov r3mp, cf3d
    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
    mov bufq, r0
%else
    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
%endif
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m3, [base+round_vals+shiftq*2-12] ; rnd
%if %2
    movd m7, [base+pb_1]
    movd m6, [base+hmul_bits+2+%3*2]
%endif
    psrldq m4, 1                    ; drop the -1 alignment byte
%if ARCH_X86_32
    DEFINE_ARGS buf, shift, val0, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
%else
    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
%endif
    pxor m5, m5
    punpcklwd m3, m5
%if %2
    punpcklwd m6, m6
%endif
    pcmpgtb m5, m4
    punpcklbw m4, m5                ; sign-extend coefficients to words
    pshufd m5, m4, q1111
    pshufd m4, m4, q0000
    pshufd m3, m3, q0000
%if %2
    pshufd m7, m7, q0000
    pshufd m6, m6, q0000
    sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub bufq, 82*69+3
%endif
%if ARCH_X86_32
    add r1mp, 79+82*3
    mov r0mp, 70-35*%3
%else
    add bufyq, 79+82*3
    mov hd, 70-35*%3
%endif
    mov mind, -128
    mov maxd, 127
.y_loop_ar1:
    mov xq, -(76>>%2)
    movsx val3d, byte [bufq+xq-1]
.x_loop_ar1:
    ; fetch (and when subsampled, box-filter) the co-located luma grain
%if %2
%if ARCH_X86_32
    mov r2, r1mp
    movq m0, [r2+xq*2]
%if %3
    movq m1, [r2+xq*2+82]
%endif
%else
    movq m0, [bufyq+xq*2]
%if %3
    movq m1, [bufyq+xq*2+82]
%endif
%endif
    pmaddubsw m2, m7, m0
%if %3
    pmaddubsw m0, m7, m1
    paddw m2, m0
%endif
    pmulhrsw m2, m6
%else
%if ARCH_X86_32
    mov r2, r1mp
    movd m2, [r2+xq]
%else
    movd m2, [bufyq+xq]
%endif
    pxor m0, m0
    pcmpgtb m0, m2
    punpcklbw m2, m0
%endif

    ; vectorized top-row contributions (top-left, top, top-right, luma)
    movq m0, [bufq+xq-82-1]         ; top/left
    pxor m1, m1
    pcmpgtb m1, m0
    punpcklbw m0, m1
    psrldq m1, m0, 4                ; top/right
    punpcklwd m1, m2
    psrldq m2, m0, 2                ; top
    punpcklwd m0, m2
    pmaddwd m0, m4
    pmaddwd m1, m5
    paddd m0, m1
    paddd m0, m3
.x_loop_ar1_inner:
    ; serial part: add left*cf3, shift, add grain, clip to [-128,127]
    movd val0d, m0
    psrldq m0, 4
%if ARCH_X86_32
    imul val3d, r3mp
%else
    imul val3d, cf3d
%endif
    add val3d, val0d
    sar val3d, shiftb
    movsx val0d, byte [bufq+xq]
    add val3d, val0d
    cmp val3d, maxd
    cmovns val3d, maxd
    cmp val3d, mind
    cmovs val3d, mind
    mov byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add bufq, 82
%if ARCH_X86_32
    add r1mp, 82<<%3
    dec r0mp
%else
    add bufyq, 82<<%3
    dec hd
%endif
    jg .y_loop_ar1
    RET

; lag-2 chroma AR filter: luma .ar2 plus a luma-grain term (coefficient 12).
.ar2:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
    ALLOC_STACK -8*16
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28
    movd m7, [base+round_vals-12+shiftq*2]
    movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]  ; cf0-12
    pxor m2, m2
    pcmpgtb m2, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2                ; sign-extend coefficients to words
    pinsrw m1, [base+pw_1], 5
    punpcklwd m7, m7
    pshufd m7, m7, q0000
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    ; broadcast each coefficient pair into its own register/slot
    pshufd m4, m1, q0000
    pshufd m5, m1, q1111
    pshufd m6, m1, q2222
    pshufd m3, m0, q3333
    pshufd m2, m0, q2222
    pshufd m1, m0, q1111
    pshufd m0, m0, q0000
    SCRATCH 0, 8, 0
    SCRATCH 1, 9, 1
    SCRATCH 2, 10, 2
    SCRATCH 3, 11, 3
    SCRATCH 4, 12, 4
    SCRATCH 5, 13, 5
    SCRATCH 6, 14, 6
    SCRATCH 7, 15, 7
%if %2
    movd m7, [base+hmul_bits+2+%3*2]
    movd m6, [base+pb_1]
    punpcklwd m7, m7
    pshufd m6, m6, q0000
    pshufd m7, m7, q0000
    sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub bufq, 82*69+3
%endif
    add bufyq, 79+82*3
    mov hd, 70-35*%3
.y_loop_ar2:
    mov xq, -(76>>%2)

.x_loop_ar2:
    ; vectorized sums of the two rows above the current pixel
    pxor m2, m2
    movq m0, [bufq+xq-82*2-2]       ; y=-2,x=[-2,+5]
    movhps m0, [bufq+xq-82*1-2]     ; y=-1,x=[-2,+5]
    pcmpgtb m2, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    psrldq m5, m0, 2                ; y=-2,x=[-1,+5]
    psrldq m3, m1, 2                ; y=-1,x=[-1,+5]
    psrldq m4, m1, 4                ; y=-1,x=[+0,+5]
    punpcklwd m2, m0, m5
    punpcklwd m3, m4
    pmaddwd m2, m8
    pmaddwd m3, m11
    paddd m2, m3

    psrldq m4, m0, 4                ; y=-2,x=[+0,+5]
    psrldq m5, m0, 6                ; y=-2,x=[+1,+5]
    psrldq m0, 8                    ; y=-2,x=[+2,+5]
    punpcklwd m4, m5
    punpcklwd m0, m1
    psrldq m3, m1, 6                ; y=-1,x=[+1,+5]
    psrldq m1, m1, 8                ; y=-1,x=[+2,+5]
    punpcklwd m3, m1
    pmaddwd m4, m9
    pmaddwd m0, m10
    pmaddwd m3, m12
    paddd m4, m0
    paddd m2, m3
    paddd m2, m4

    ; luma-grain contribution (box-filtered when subsampled)
%if %2
    movq m1, [bufyq+xq*2]
%if %3
    movq m3, [bufyq+xq*2+82]
%endif
    pmaddubsw m0, m6, m1
%if %3
    pmaddubsw m1, m6, m3
    paddw m0, m1
%endif
    pmulhrsw m0, m7
%else
    movd m0, [bufyq+xq]
    pxor m1, m1
    pcmpgtb m1, m0
    punpcklbw m0, m1
%endif
    punpcklwd m0, m15               ; pair luma with the rounding constant
    pmaddwd m0, m14
    paddd m2, m0

    movq m0, [bufq+xq-2]            ; y=0,x=[-2,+5]
    pxor m4, m4
    movd m5, [base+byte_blend+1]
    punpcklbw m5, m5
.x_loop_ar2_inner:
    ; serial part: the two left neighbors feed the next output
    pcmpgtb m1, m4, m0
    punpcklbw m0, m1
    pmaddwd m3, m0, m13
    paddd m3, m2
    psrldq m2, 4                    ; shift top to next pixel
    psrad m3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq m3, 4
    pand m3, m5
    paddw m0, m3
    packsswb m0, m0
    movd [bufq+xq-2], m0
    psrldq m0, 1
    inc xq
    jz .x_loop_ar2_end
    test xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add bufq, 82
    add bufyq, 82<<%3
    dec hd
    jg .y_loop_ar2
    RET

; lag-3 chroma AR filter: luma .ar3 plus a luma-grain term (coefficient 24).
.ar3:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
%if ARCH_X86_32
    ALLOC_STACK -15*16
%else
    SUB rsp, 16*7
%assign stack_size_padded (stack_size_padded+16*7)
%assign stack_size (stack_size+16*7)
%endif
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28

    movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]  ; cf0-15
    pxor m3, m3
    pcmpgtb m3, m0
    punpckhbw m1, m0, m3
    punpcklbw m0, m3                ; sign-extend coefficients to words
    ; broadcast coefficient pairs; first 7 go to stack slots
    pshufd m2, m0, q1111
    pshufd m3, m0, q2222
    pshufd m4, m0, q3333
    pshufd m0, m0, q0000
    pshufd m5, m1, q1111
    pshufd m6, m1, q2222
    pshufd m7, m1, q3333
    pshufd m1, m1, q0000
    mova [rsp+ 0*16], m0
    mova [rsp+ 1*16], m2
    mova [rsp+ 2*16], m3
    mova [rsp+ 3*16], m4
    mova [rsp+ 4*16], m1
    mova [rsp+ 5*16], m5
    mova [rsp+ 6*16], m6
    SCRATCH 7, 8, 7

    movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]  ; cf16-24 [24=luma]
    pxor m4, m4
    pcmpgtb m4, m2
    punpckhbw m5, m2, m4
    punpcklbw m2, m4
    pshufd m4, m2, q3232
    punpcklwd m3, m4, m5
    pshuflw m5, m4, q3321
    pshufd m4, m3, q0000
    pshufd m3, m2, q1111
    pshufd m2, m2, q0000
    pinsrw m5, [base+round_vals+shiftq*2-10], 3
    SCRATCH 2, 9, 8
    SCRATCH 3, 10, 9
    SCRATCH 4, 11, 10
    SCRATCH 5, 12, 11

    movd m2, [base+round_vals-12+shiftq*2]
%if %2
    movd m1, [base+pb_1]
    movd m3, [base+hmul_bits+2+%3*2]
%endif
    pxor m0, m0
    punpcklwd m2, m0
%if %2
    punpcklwd m3, m3
    ; (continuation of generate_grain_uv_fn .ar3: broadcast remaining constants)
%endif
    pshufd m2, m2, q0000
%if %2
    pshufd m1, m1, q0000
    pshufd m3, m3, q0000
    SCRATCH 1, 13, 12
%endif
    SCRATCH 2, 14, 13
%if %2
    SCRATCH 3, 15, 14
%endif

    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub bufq, 82*69+3
%endif
    add bufyq, 79+82*3
    mov hd, 70-35*%3
.y_loop_ar3:
    mov xq, -(76>>%2)

.x_loop_ar3:
    ; row y=-3: 7 coefficient pairs
    movu m0, [bufq+xq-82*3-3]       ; y=-3,x=[-3,+12]
    pxor m4, m4
    pcmpgtb m4, m0
    punpckhbw m3, m0, m4
    punpcklbw m0, m4

    psrldq m5, m0, 2
    psrldq m6, m0, 4
    psrldq m7, m0, 6
    punpcklwd m4, m0, m5
    punpcklwd m6, m7
    pmaddwd m4, [rsp+ 0*16]
    pmaddwd m6, [rsp+ 1*16]
    paddd m4, m6

    palignr m2, m3, m0, 10
    palignr m3, m0, 12
    psrldq m0, 8

    ; row y=-2
    movu m1, [bufq+xq-82*2-3]       ; y=-2,x=[-3,+12]
    pxor m6, m6
    pcmpgtb m6, m1
    punpckhbw m5, m1, m6
    punpcklbw m1, m6

    punpcklwd m0, m2
    punpcklwd m3, m1
    pmaddwd m0, [rsp+ 2*16]
    pmaddwd m3, [rsp+ 3*16]
    paddd m0, m3
    paddd m0, m4

    ; row y=-1
    movu m2, [bufq+xq-82*1-3]       ; y=-1,x=[-3,+12]
    pxor m7, m7
    pcmpgtb m7, m2
    punpckhbw m6, m2, m7
    punpcklbw m2, m7

    palignr m3, m5, m1, 10
    palignr m5, m1, 12
    psrldq m4, m2, 2

    punpcklwd m3, m5
    punpcklwd m5, m2, m4
    pmaddwd m3, [rsp+ 6*16]
    pmaddwd m5, m8
    paddd m3, m5
    paddd m0, m3

    psrldq m3, m1, 2
    psrldq m4, m1, 4
    psrldq m5, m1, 6
    psrldq m1, 8

    punpcklwd m3, m4
    punpcklwd m5, m1
    pmaddwd m3, [rsp+ 4*16]
    pmaddwd m5, [rsp+ 5*16]
    paddd m3, m5
    paddd m0, m3

    ; luma-grain contribution (box-filtered when subsampled)
%if %2
    movq m1, [bufyq+xq*2]
%if %3
    movq m3, [bufyq+xq*2+82]
%endif
    pmaddubsw m7, m13, m1
%if %3
    pmaddubsw m5, m13, m3
    paddw m7, m5
%endif
    pmulhrsw m7, m15
%else
    movd m7, [bufyq+xq]
    pxor m1, m1
    pcmpgtb m1, m7
    punpcklbw m7, m1
%endif

    psrldq m1, m2, 4
    psrldq m3, m2, 6
    palignr m4, m6, m2, 10
    palignr m6, m2, 12
    psrldq m2, 8

    punpcklwd m1, m3
    punpcklwd m2, m4
    punpcklwd m6, m7                ; pair y=-1 tail with luma term
    pmaddwd m1, m9
    pmaddwd m2, m10
    pmaddwd m6, m11
    paddd m1, m2
    paddd m0, m6
    paddd m0, m1
    paddd m0, m14                   ; + rounding constant

    movq m1, [bufq+xq-3]            ; y=0,x=[-3,+4]
    pxor m4, m4
    movd m5, [base+byte_blend]
.x_loop_ar3_inner:
    ; serial part: three left neighbors, then shift/add/clip via packsswb
    pcmpgtb m2, m4, m1
    punpcklbw m3, m1, m2
    pmaddwd m2, m3, m12
    pshufd m3, m2, q1111
    paddd m2, m3                    ; left+cur
    paddd m2, m0                    ; add top
    psrldq m0, 4
    psrad m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    packsswb m2, m2
    pandn m3, m5, m1
    pslld m2, 24
    pand m2, m5
    por m1, m2, m3
    movd [bufq+xq-3], m1
    psrldq m1, 1
    inc xq
    jz .x_loop_ar3_end
    test xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82
    add bufyq, 82<<%3
    dec hd
    jg .y_loop_ar3
    RET
%endmacro

generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

; Emulated word gather (pre-AVX2): pulls 8 words from %3 indexed by the
; word lanes of %2. %4/%5 are GPR scratch; optional %6 preserves %2.
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
    movd %5 %+ d, %2
    pshuflw %%tmp, %2, q3232
%else
    movd %5 %+ d, %%tmp
%if %%idx == 2
    punpckhqdq %%tmp, %%tmp
%elif %%idx == 4
    psrlq %%tmp, 32
%endif
%endif
    movzx %4 %+ d, %5 %+ w
    shr %5 %+ d, 16

%if %%idx == 0
    movd %1, [%3+%4]
%else
    pinsrw %1, [%3+%4], %%idx + 0
%endif
    pinsrw %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
%if ARCH_X86_32
1273%if STACK_ALIGNMENT < mmsize 1274cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ 1275 dst, src, scaling, unused1, fg_data, picptr, unused2 1276 ; copy stack arguments to new position post-alignment, so that we 1277 ; don't have to keep the old stack location in a separate register 1278 mov r0, r0m 1279 mov r1, r2m 1280 mov r2, r4m 1281 mov r3, r6m 1282 mov r4, r7m 1283 mov r5, r8m 1284 1285 mov [rsp+5*mmsize+ 4*gprsize], r0 1286 mov [rsp+5*mmsize+ 6*gprsize], r1 1287 mov [rsp+5*mmsize+ 8*gprsize], r2 1288 mov [rsp+5*mmsize+10*gprsize], r3 1289 mov [rsp+5*mmsize+11*gprsize], r4 1290 mov [rsp+5*mmsize+12*gprsize], r5 1291%else 1292cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ 1293 dst, src, scaling, unused1, fg_data, picptr, unused2 1294%endif 1295 mov srcq, srcm 1296 mov fg_dataq, r3m 1297 mov scalingq, r5m 1298%if STACK_ALIGNMENT < mmsize 1299%define r0m [rsp+5*mmsize+ 4*gprsize] 1300%define r1m [rsp+5*mmsize+ 5*gprsize] 1301%define r2m [rsp+5*mmsize+ 6*gprsize] 1302%define r3m [rsp+5*mmsize+ 7*gprsize] 1303%define r4m [rsp+5*mmsize+ 8*gprsize] 1304%define r5m [rsp+5*mmsize+ 9*gprsize] 1305%define r6m [rsp+5*mmsize+10*gprsize] 1306%define r7m [rsp+5*mmsize+11*gprsize] 1307%define r8m [rsp+5*mmsize+12*gprsize] 1308%endif 1309 LEA r5, pb_mask 1310%define base r5-pb_mask 1311 mov r5m, picptrq 1312%else 1313cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut 1314 lea r7, [pb_mask] 1315%define base r7-pb_mask 1316%endif 1317 mov r6d, [fg_dataq+FGData.scaling_shift] 1318 movd m3, [base+mul_bits+r6*2-14] 1319 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 1320 movd m4, [base+max+r6*4] 1321 movd m5, [base+min+r6*2] 1322 punpcklwd m3, m3 1323 punpcklwd m4, m4 1324 punpcklwd m5, m5 1325 pshufd m3, m3, q0000 1326 pshufd m4, m4, q0000 1327 pshufd m5, m5, q0000 1328 SCRATCH 3, 11, 0 1329 SCRATCH 4, 12, 1 1330 SCRATCH 5, 13, 2 1331 1332%if ARCH_X86_32 1333 DEFINE_ARGS dst, src, scaling, sby, 
fg_data, picptr, overlap 1334%else 1335 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 1336%endif 1337 1338 mov sbyd, r8m 1339 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 1340 test overlapd, overlapd 1341 jz .no_vertical_overlap 1342 mova m6, [base+pw_1024] 1343 mova m7, [base+pb_27_17_17_27] 1344 SCRATCH 6, 14, 3 1345 SCRATCH 7, 15, 4 1346 test sbyd, sbyd 1347 jnz .vertical_overlap 1348 ; fall-through 1349 1350.no_vertical_overlap: 1351 mov r8m, overlapd 1352%if ARCH_X86_32 1353 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused 1354 imul seed, (173 << 24) | 37 1355%else 1356 imul seed, sbyd, (173 << 24) | 37 1357%endif 1358 add seed, (105 << 24) | 178 1359 rol seed, 8 1360 movzx seed, seew 1361 xor seed, [fg_dataq+FGData.seed] 1362 1363%if ARCH_X86_32 1364 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1365 1366 mov r3m, seed 1367 mov wq, r4m 1368%else 1369 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1370 unused1, unused2, see, unused3 1371%endif 1372 1373 lea src_bakq, [srcq+wq] 1374 neg wq 1375 sub dstmp, srcq 1376%if ARCH_X86_32 1377 mov r1m, src_bakq 1378 mov r4m, wq 1379 DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1380%endif 1381 1382.loop_x: 1383%if ARCH_X86_32 1384 mov seed, r3m 1385%endif 1386 mov r6d, seed 1387 or seed, 0xEFF4 1388 shr r6d, 1 1389 test seeb, seeh 1390 lea seed, [r6+0x8000] 1391 cmovp seed, r6d ; updated seed 1392%if ARCH_X86_32 1393 mov r3m, seed 1394 1395 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1396 1397 mov offxd, offyd 1398%else 1399 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1400 offx, offy, see, unused 1401 1402 mov offyd, seed 1403 mov offxd, seed 1404%endif 1405 ror offyd, 8 1406 shr offxd, 12 1407 and offyd, 0xf 1408 imul offyd, 164 1409 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1410 1411%if ARCH_X86_32 1412 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, 
r5m=picptr, 1413 ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1414 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1415%else 1416 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1417 h, offxy, see, unused 1418%endif 1419 1420.loop_x_odd: 1421 mov hd, r7m 1422 mov grain_lutq, grain_lutmp 1423.loop_y: 1424 ; src 1425 mova m0, [srcq] 1426 pxor m2, m2 1427 punpckhbw m1, m0, m2 1428 punpcklbw m0, m2 ; m0-1: src as word 1429 1430 ; scaling[src] 1431%if ARCH_X86_32 1432 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1433 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1434%else 1435 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1436 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1437%endif 1438 REPX {psrlw x, 8}, m4, m5 1439 1440 ; grain = grain_lut[offy+y][offx+x] 1441 movu m3, [grain_lutq+offxyq] 1442 pcmpgtb m7, m2, m3 1443 punpcklbw m2, m3, m7 1444 punpckhbw m3, m7 1445 1446 ; noise = round2(scaling[src] * grain, scaling_shift) 1447 pmullw m2, m4 1448 pmullw m3, m5 1449 pmulhrsw m2, m11 1450 pmulhrsw m3, m11 1451 1452 ; dst = clip_pixel(src, noise) 1453 paddw m0, m2 1454 paddw m1, m3 1455 pmaxsw m0, m13 1456 pmaxsw m1, m13 1457 pminsw m0, m12 1458 pminsw m1, m12 1459 packuswb m0, m1 1460 movifnidn dstq, dstmp 1461 mova [dstq+srcq], m0 1462 1463 add srcq, r2mp 1464 add grain_lutq, 82 1465 dec hd 1466 jg .loop_y 1467 1468%if ARCH_X86_32 1469 add r4mp, 16 1470%else 1471 add wq, 16 1472%endif 1473 jge .end 1474%if ARCH_X86_32 1475 mov srcq, r1mp 1476 add srcq, r4mp 1477%else 1478 lea srcq, [src_bakq+wq] 1479%endif 1480 btc dword r8m, 2 1481 jc .next_blk 1482 1483 add offxyd, 16 1484 test dword r8m, 2 ; r8m & 2 = have_top_overlap 1485 jz .loop_x_odd 1486 1487%if ARCH_X86_32 1488 add dword [rsp+5*mmsize+1*gprsize], 16 1489%else 1490 add r11d, 16 ; top_offxyd 1491%endif 1492 jnz .loop_x_odd_v_overlap 1493 1494.next_blk: 1495 test dword r8m, 1 1496 jz .loop_x 1497 1498 test dword r8m, 2 1499 jnz .loop_x_hv_overlap 1500 1501 ; horizontal overlap (without vertical 
overlap) 1502.loop_x_h_overlap: 1503%if ARCH_X86_32 1504 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1505 ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1506 DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 1507 1508 add offxyd, 16 ; left_offxyd 1509 mov [rsp+5*mmsize+0*gprsize], offxyd 1510 1511 DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1512 1513 mov seed, r3m 1514%else 1515 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1516 offx, offy, see, left_offxy 1517 1518 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 1519%endif 1520 1521 mov r6d, seed 1522 or seed, 0xEFF4 1523 shr r6d, 1 1524 test seeb, seeh 1525 lea seed, [r6+0x8000] 1526 cmovp seed, r6d ; updated seed 1527 1528%if ARCH_X86_32 1529 mov r3m, seed 1530 1531 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1532 1533 mov offxd, offyd 1534%else 1535 mov offyd, seed 1536 mov offxd, seed 1537%endif 1538 ror offyd, 8 1539 shr offxd, 12 1540 and offyd, 0xf 1541 imul offyd, 164 1542 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1543 1544%if ARCH_X86_32 1545 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1546%else 1547 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1548 h, offxy, see, left_offxy 1549%endif 1550 1551 mov hd, r7m 1552 mov grain_lutq, grain_lutmp 1553.loop_y_h_overlap: 1554 ; src 1555 mova m0, [srcq] 1556 pxor m2, m2 1557 punpckhbw m1, m0, m2 1558 punpcklbw m0, m2 ; m0-1: src as word 1559 1560 ; scaling[src] 1561%if ARCH_X86_32 1562 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1563 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1564%else 1565 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1566 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1567%endif 1568 REPX {psrlw x, 8}, m4, m5 1569 1570 ; grain = grain_lut[offy+y][offx+x] 1571 movu m3, [grain_lutq+offxyq] 1572%if ARCH_X86_32 1573 mov r5, [rsp+5*mmsize+0*gprsize] 1574 movd m7, [grain_lutq+r5] 1575%else 1576 movd m7, 
[grain_lutq+left_offxyq] 1577%endif 1578 punpcklbw m7, m3 1579 pmaddubsw m6, m15, m7 1580 pmulhrsw m6, m14 1581 packsswb m6, m6 1582 shufps m6, m3, q3210 1583 pcmpgtb m2, m6 1584 punpcklbw m7, m6, m2 1585 punpckhbw m6, m2 1586 1587 ; noise = round2(scaling[src] * grain, scaling_shift) 1588 pmullw m7, m4 1589 pmullw m6, m5 1590 pmulhrsw m7, m11 1591 pmulhrsw m6, m11 1592 1593 ; dst = clip_pixel(src, noise) 1594 paddw m0, m7 1595 paddw m1, m6 1596 pmaxsw m0, m13 1597 pmaxsw m1, m13 1598 pminsw m0, m12 1599 pminsw m1, m12 1600 packuswb m0, m1 1601 movifnidn dstq, dstmp 1602 mova [dstq+srcq], m0 1603 1604 add srcq, r2mp 1605 add grain_lutq, 82 1606 dec hd 1607 jg .loop_y_h_overlap 1608 1609%if ARCH_X86_32 1610 add r4mp, 16 1611%else 1612 add wq, 16 1613%endif 1614 jge .end 1615%if ARCH_X86_32 1616 mov srcq, r1m 1617 add srcq, r4m 1618%else 1619 lea srcq, [src_bakq+wq] 1620%endif 1621 xor dword r8m, 4 1622 add offxyd, 16 1623 1624 ; since this half-block had left-overlap, the next does not 1625 test dword r8m, 2 ; have_top_overlap 1626 jz .loop_x_odd 1627%if ARCH_X86_32 1628 add dword [rsp+5*mmsize+1*gprsize], 16 1629%else 1630 add r11d, 16 ; top_offxyd 1631%endif 1632 jmp .loop_x_odd_v_overlap 1633 1634.end: 1635 RET 1636 1637.vertical_overlap: 1638%if ARCH_X86_32 1639 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1640%else 1641 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 1642%endif 1643 1644 or overlapd, 2 ; top_overlap: overlap & 2 1645 mov r8m, overlapd 1646 movzx sbyd, sbyb 1647%if ARCH_X86_32 1648 imul r4, [fg_dataq+FGData.seed], 0x00010001 1649 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 1650%else 1651 imul seed, [fg_dataq+FGData.seed], 0x00010001 1652%endif 1653 imul tmpd, sbyd, 173 * 0x00010001 1654 imul sbyd, 37 * 0x01000100 1655 add tmpd, (105 << 16) | 188 1656 add sbyd, (178 << 24) | (141 << 8) 1657 and tmpd, 0x00ff00ff 1658 and sbyd, 0xff00ff00 1659 xor seed, tmpd 1660%if ARCH_X86_32 1661 
xor sbyd, seed ; (cur_seed << 16) | top_seed 1662 1663 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1664 1665 mov r3m, seed 1666 mov wq, r4m 1667%else 1668 xor seed, sbyd ; (cur_seed << 16) | top_seed 1669 1670 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1671 tmp, unused2, see, unused3 1672%endif 1673 1674 lea src_bakq, [srcq+wq] 1675 neg wq 1676 sub dstmp, srcq 1677%if ARCH_X86_32 1678 mov r1m, src_bakq 1679 mov r4m, wq 1680 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 1681%endif 1682 1683.loop_x_v_overlap: 1684%if ARCH_X86_32 1685 mov seed, r3m 1686%endif 1687 ; we assume from the block above that bits 8-15 of tmpd are zero'ed, 1688 ; because of the 'and tmpd, 0x00ff00ff' above 1689 mov r6d, seed 1690 or seed, 0xeff4eff4 1691 test seeb, seeh 1692 setp tmpb ; parity of top_seed 1693 shr seed, 16 1694 shl tmpd, 16 1695 test seeb, seeh 1696 setp tmpb ; parity of cur_seed 1697 or r6d, 0x00010001 1698 xor tmpd, r6d 1699 mov seed, tmpd 1700 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1701 1702%if ARCH_X86_32 1703 mov r3m, seed 1704 1705 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1706 1707 mov offxd, offyd 1708%else 1709 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1710 offx, offy, see, unused, top_offxy 1711 1712 mov offyd, seed 1713 mov offxd, seed 1714%endif 1715 1716 ror offyd, 8 1717 ror offxd, 12 1718 and offyd, 0xf000f 1719 and offxd, 0xf000f 1720 imul offyd, 164 1721 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1722 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1723 1724%if ARCH_X86_32 1725 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 1726%else 1727 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1728 h, offxy, see, unused, top_offxy 1729%endif 1730 1731 movzx top_offxyd, offxyw 1732%if ARCH_X86_32 1733 mov [rsp+5*mmsize+1*gprsize], top_offxyd 1734 1735 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1736%endif 1737 shr 
offxyd, 16 1738 1739.loop_x_odd_v_overlap: 1740%if ARCH_X86_32 1741 mov r5, r5m 1742 lea r5, [base+pb_27_17] 1743 mov [rsp+5*mmsize+12], r5 1744%else 1745 mova m8, [pb_27_17] 1746%endif 1747 mov hd, r7m 1748 mov grain_lutq, grain_lutmp 1749.loop_y_v_overlap: 1750 ; src 1751 mova m0, [srcq] 1752 pxor m2, m2 1753 punpckhbw m1, m0, m2 1754 punpcklbw m0, m2 ; m0-1: src as word 1755 1756 ; scaling[src] 1757%if ARCH_X86_32 1758 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 1759 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 1760%else 1761 vpgatherdw m4, m0, scalingq-1, r12, r13, m3 1762 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 1763%endif 1764 REPX {psrlw x, 8}, m4, m5 1765 1766 ; grain = grain_lut[offy+y][offx+x] 1767 movu m3, [grain_lutq+offxyq] 1768%if ARCH_X86_32 1769 mov r5, [rsp+5*mmsize+1*gprsize] 1770 movu m7, [grain_lutq+r5] 1771%else 1772 movu m7, [grain_lutq+top_offxyq] 1773%endif 1774 punpckhbw m6, m7, m3 1775 punpcklbw m7, m3 1776%if ARCH_X86_32 1777 mov r5, [rsp+5*mmsize+12] 1778 pmaddubsw m3, [r5], m6 1779 pmaddubsw m6, [r5], m7 1780%else 1781 pmaddubsw m3, m8, m6 1782 pmaddubsw m6, m8, m7 1783%endif 1784 pmulhrsw m3, m14 1785 pmulhrsw m6, m14 1786 packsswb m6, m3 1787 pcmpgtb m7, m2, m6 1788 punpcklbw m2, m6, m7 1789 punpckhbw m6, m7 1790 1791 ; noise = round2(scaling[src] * grain, scaling_shift) 1792 pmullw m2, m4 1793 pmullw m6, m5 1794 pmulhrsw m2, m11 1795 pmulhrsw m6, m11 1796 1797 ; dst = clip_pixel(src, noise) 1798 paddw m0, m2 1799 paddw m1, m6 1800 pmaxsw m0, m13 1801 pmaxsw m1, m13 1802 pminsw m0, m12 1803 pminsw m1, m12 1804 packuswb m0, m1 1805 movifnidn dstq, dstmp 1806 mova [dstq+srcq], m0 1807 1808%if ARCH_X86_32 1809 add dword [rsp+5*mmsize+12], mmsize 1810%else 1811 mova m8, [pb_17_27] 1812%endif 1813 add srcq, r2mp 1814 add grain_lutq, 82 1815 dec hw 1816 jz .end_y_v_overlap 1817 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1818 ; remaining (up to) 30 lines 1819 btc hd, 16 1820 jnc .loop_y_v_overlap 1821 jmp .loop_y 
1822 1823.end_y_v_overlap: 1824%if ARCH_X86_32 1825 add r4mp, 16 1826%else 1827 add wq, 16 1828%endif 1829 jge .end_hv 1830%if ARCH_X86_32 1831 mov srcq, r1mp 1832 add srcq, r4mp 1833%else 1834 lea srcq, [src_bakq+wq] 1835%endif 1836 btc dword r8m, 2 1837 jc .loop_x_hv_overlap 1838 add offxyd, 16 1839%if ARCH_X86_32 1840 add dword [rsp+5*mmsize+1*gprsize], 16 1841%else 1842 add top_offxyd, 16 1843%endif 1844 jmp .loop_x_odd_v_overlap 1845 1846.loop_x_hv_overlap: 1847%if ARCH_X86_32 1848 mov r5, r5m 1849 lea r5, [base+pb_27_17] 1850 mov [rsp+5*mmsize+12], r5 1851 1852 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak 1853 1854 mov r5, [rsp+5*mmsize+1*gprsize] 1855 mov r4, offxyd 1856 add r5, 16 1857 add r4, 16 1858 mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy 1859 mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy 1860 1861 DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak 1862 1863 xor tmpd, tmpd 1864 mov seed, r3m 1865%else 1866 mova m8, [pb_27_17] 1867 1868 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1869 tmp, unused2, see, unused3 1870 1871 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 1872%endif 1873 mov r6d, seed 1874 or seed, 0xeff4eff4 1875 test seeb, seeh 1876 setp tmpb ; parity of top_seed 1877 shr seed, 16 1878 shl tmpd, 16 1879 test seeb, seeh 1880 setp tmpb ; parity of cur_seed 1881 or r6d, 0x00010001 1882 xor tmpd, r6d 1883 mov seed, tmpd 1884 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1885 1886%if ARCH_X86_32 1887 mov r3m, seed 1888 1889 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1890 1891 mov offxd, offyd 1892%else 1893 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1894 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1895 1896 lea topleft_offxyq, [top_offxyq+16] 1897 lea left_offxyq, [offyq+16] 1898 mov offyd, seed 1899 mov offxd, seed 1900%endif 1901 ror offyd, 8 1902 ror offxd, 12 1903 and offyd, 0xf000f 1904 and offxd, 0xf000f 1905 imul offyd, 164 
1906 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1907 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1908 1909%if ARCH_X86_32 1910 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1911 1912 movzx r5, offxyw ; top_offxy 1913 mov [rsp+5*mmsize+1*gprsize], r5 1914%else 1915 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1916 h, offxy, see, left_offxy, top_offxy, topleft_offxy 1917 1918 movzx top_offxyd, offxyw 1919%endif 1920 shr offxyd, 16 1921 1922 mov hd, r7m 1923 mov grain_lutq, grain_lutmp 1924.loop_y_hv_overlap: 1925 ; grain = grain_lut[offy+y][offx+x] 1926 movu m3, [grain_lutq+offxyq] 1927%if ARCH_X86_32 1928 mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy 1929 mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy 1930 movu m6, [grain_lutq+r5] 1931 mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy 1932 movd m4, [grain_lutq+r0] 1933 movd m7, [grain_lutq+r5] 1934%else 1935 movu m6, [grain_lutq+top_offxyq] 1936 movd m4, [grain_lutq+left_offxyq] 1937 movd m7, [grain_lutq+topleft_offxyq] 1938%endif 1939 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1940 punpcklbw m4, m3 1941 punpcklbw m7, m6 1942 pmaddubsw m2, m15, m4 1943 pmaddubsw m4, m15, m7 1944 pmulhrsw m2, m14 1945 pmulhrsw m4, m14 1946 packsswb m2, m2 1947 packsswb m4, m4 1948 shufps m2, m3, q3210 1949 shufps m4, m6, q3210 1950 ; followed by v interpolation (top | cur -> cur) 1951 punpcklbw m3, m4, m2 1952 punpckhbw m4, m2 1953%if ARCH_X86_32 1954 mov r5, [rsp+5*mmsize+12] 1955 pmaddubsw m7, [r5], m4 1956 pmaddubsw m4, [r5], m3 1957%else 1958 pmaddubsw m7, m8, m4 1959 pmaddubsw m4, m8, m3 1960%endif 1961 pmulhrsw m7, m14 1962 pmulhrsw m4, m14 1963 packsswb m4, m7 1964 pxor m2, m2 1965 pcmpgtb m7, m2, m4 1966 punpcklbw m3, m4, m7 1967 punpckhbw m4, m7 1968 1969 ; src 1970 mova m0, [srcq] 1971 punpckhbw m1, m0, m2 1972 punpcklbw m0, m2 ; m0-1: src as word 1973 1974 ; scaling[src] 1975%if ARCH_X86_32 1976 vpgatherdw m5, m0, scalingq-1, r0, r5, m7 1977 
vpgatherdw m6, m1, scalingq-1, r0, r5, m7 1978%else 1979 vpgatherdw m5, m0, scalingq-1, r13, r14, m7 1980 vpgatherdw m6, m1, scalingq-1, r13, r14, m7 1981%endif 1982 REPX {psrlw x, 8}, m5, m6 1983 1984 ; noise = round2(scaling[src] * grain, scaling_shift) 1985 pmullw m3, m5 1986 pmullw m4, m6 1987 pmulhrsw m3, m11 1988 pmulhrsw m4, m11 1989 1990 ; dst = clip_pixel(src, noise) 1991 paddw m0, m3 1992 paddw m1, m4 1993 pmaxsw m0, m13 1994 pmaxsw m1, m13 1995 pminsw m0, m12 1996 pminsw m1, m12 1997 packuswb m0, m1 1998 movifnidn dstq, dstmp 1999 mova [dstq+srcq], m0 2000 2001%if ARCH_X86_32 2002 add dword [rsp+5*mmsize+12], mmsize 2003%else 2004 mova m8, [pb_17_27] 2005%endif 2006 add srcq, r2mp 2007 add grain_lutq, 82 2008 dec hw 2009 jz .end_y_hv_overlap 2010 ; 2 lines get vertical overlap, then fall back to non-overlap code for 2011 ; remaining (up to) 30 lines 2012 btc hd, 16 2013 jnc .loop_y_hv_overlap 2014 jmp .loop_y_h_overlap 2015 2016.end_y_hv_overlap: 2017%if ARCH_X86_32 2018 add r4mp, 16 2019%else 2020 add wq, 16 2021%endif 2022 jge .end_hv 2023%if ARCH_X86_32 2024 mov srcq, r1m 2025 add srcq, r4m 2026%else 2027 lea srcq, [src_bakq+wq] 2028%endif 2029 xor dword r8m, 4 2030 add offxyd, 16 2031%if ARCH_X86_32 2032 add dword [rsp+5*mmsize+1*gprsize], 16 2033%else 2034 add top_offxyd, 16 2035%endif 2036 jmp .loop_x_odd_v_overlap 2037 2038.end_hv: 2039 RET 2040 2041%macro FGUV_FN 3 ; name, ss_hor, ss_ver 2042INIT_XMM ssse3 2043%if ARCH_X86_32 2044; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, 2045; sby, luma, lstride, uv_pl, is_id) 2046%if STACK_ALIGNMENT < mmsize 2047DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 2048cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ 2049 tmp, src, scaling, h, fg_data, picptr, unused 2050 mov r0, r0m 2051 mov r1, r2m 2052 mov r2, r4m 2053 mov r3, r6m 2054 mov r4, r7m 2055 mov [rsp+7*mmsize+3*gprsize], r0 2056 mov [rsp+7*mmsize+5*gprsize], r1 2057 mov [rsp+7*mmsize+7*gprsize], 
r2 2058 mov [rsp+7*mmsize+9*gprsize], r3 2059 mov [rsp+7*mmsize+10*gprsize], r4 2060 2061 mov r0, r8m 2062 mov r1, r9m 2063 mov r2, r10m 2064 mov r4, r11m 2065 mov r3, r12m 2066 mov [rsp+7*mmsize+11*gprsize], r0 2067 mov [rsp+7*mmsize+12*gprsize], r1 2068 mov [rsp+7*mmsize+13*gprsize], r2 2069 mov [rsp+7*mmsize+14*gprsize], r4 2070%else 2071cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ 2072 tmp, src, scaling, h, fg_data, picptr, unused 2073%endif 2074 mov srcq, srcm 2075 mov fg_dataq, r3m 2076 mov scalingq, r5m 2077%if STACK_ALIGNMENT < mmsize 2078%define r0m [rsp+7*mmsize+ 3*gprsize] 2079%define r1m [rsp+7*mmsize+ 4*gprsize] 2080%define r2m [rsp+7*mmsize+ 5*gprsize] 2081%define r3m [rsp+7*mmsize+ 6*gprsize] 2082%define r4m [rsp+7*mmsize+ 7*gprsize] 2083%define r5m [rsp+7*mmsize+ 8*gprsize] 2084%define r6m [rsp+7*mmsize+ 9*gprsize] 2085%define r7m [rsp+7*mmsize+10*gprsize] 2086%define r8m [rsp+7*mmsize+11*gprsize] 2087%define r9m [rsp+7*mmsize+12*gprsize] 2088%define r10m [rsp+7*mmsize+13*gprsize] 2089%define r11m [rsp+7*mmsize+14*gprsize] 2090%define r12m [rsp+7*mmsize+15*gprsize] 2091%endif 2092 LEA r5, pb_mask 2093%define base r5-pb_mask 2094 mov r5m, r5 2095%else 2096cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2097 grain_lut, tmp, sby, luma, lstride, uv_pl, is_id 2098 lea r8, [pb_mask] 2099%define base r8-pb_mask 2100%endif 2101 mov r6d, [fg_dataq+FGData.scaling_shift] 2102 movd m3, [base+mul_bits+r6*2-14] 2103 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2104 lea tmpd, [r6d*2] 2105%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize 2106 test r3, r3 2107%else 2108 cmp dword r12m, 0 ; is_idm 2109%endif 2110 movd m5, [base+min+r6*2] 2111 cmovne r6d, tmpd 2112 movd m4, [base+max+r6*2] 2113 punpcklwd m3, m3 2114 punpcklwd m5, m5 2115 punpcklwd m4, m4 2116 pshufd m3, m3, q0000 2117 pshufd m5, m5, q0000 2118 pshufd m4, m4, q0000 2119 SCRATCH 3, 11, 0 2120 SCRATCH 4, 12, 1 2121 SCRATCH 5, 13, 2 2122 2123 
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2124 jne .csfl 2125 2126%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver 2127%if ARCH_X86_32 2128 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2129%else 2130 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2131%endif 2132 2133%if %1 2134 mov r6d, dword r11m 2135 movd m0, [fg_dataq+FGData.uv_mult+r6*4] 2136 movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2137 punpcklbw m6, m1, m0 2138 movd m7, [fg_dataq+FGData.uv_offset+r6*4] 2139 punpcklwd m6, m6 2140 punpcklwd m7, m7 2141 pshufd m6, m6, q0000 2142 pshufd m7, m7, q0000 2143 SCRATCH 6, 14, 3 2144 SCRATCH 7, 15, 4 2145%endif 2146 2147 mov sbyd, r8m 2148 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 2149 test overlapd, overlapd 2150 jz %%no_vertical_overlap 2151%if ARCH_X86_32 2152%if %2 2153 mova m1, [base+pb_23_22_h] 2154%else 2155 mova m1, [base+pb_27_17_17_27] 2156%endif 2157 mova m0, [base+pw_1024] 2158%else 2159%if %2 2160 mova m1, [pb_23_22_h] 2161%else 2162 mova m1, [pb_27_17_17_27] 2163%endif 2164 mova m0, [pw_1024] 2165%endif 2166 SCRATCH 0, 8, 5 2167 SCRATCH 1, 9, 6 2168 test sbyd, sbyd 2169 jnz %%vertical_overlap 2170 ; fall-through 2171 2172%%no_vertical_overlap: 2173 mov r8m, overlapd 2174%if ARCH_X86_32 2175 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap 2176 imul seed, (173 << 24) | 37 2177%else 2178 imul seed, sbyd, (173 << 24) | 37 2179%endif 2180 add seed, (105 << 24) | 178 2181 rol seed, 8 2182 movzx seed, seew 2183 xor seed, [fg_dataq+FGData.seed] 2184 2185%if ARCH_X86_32 2186 mov r3m, seed 2187 2188 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2189%define luma_bakq lumaq 2190 2191 mov wq, r4m 2192%if %3 2193 shl r10mp, 1 2194%endif 2195%else 2196 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2197 unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak 2198 2199 mov lstrideq, r10mp 2200%endif 2201 2202 mov 
lumaq, r9mp 2203 lea src_bakq, [srcq+wq] 2204 lea luma_bakq, [lumaq+wq*(1+%2)] 2205 neg wq 2206 sub r0mp, srcq 2207%if ARCH_X86_32 2208 mov r1m, src_bakq 2209 mov r11m, luma_bakq 2210 mov r4m, wq 2211 2212 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2213%else 2214 mov r11mp, src_bakq 2215 mov r12mp, strideq 2216%endif 2217 2218%%loop_x: 2219%if ARCH_X86_32 2220 mov seed, r3m 2221%endif 2222 mov r6d, seed 2223 or seed, 0xEFF4 2224 shr r6d, 1 2225 test seeb, seeh 2226 lea seed, [r6+0x8000] 2227 cmovp seed, r6d ; updated seed 2228%if ARCH_X86_32 2229 mov r3m, seed 2230 2231 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2232 2233 mov offxd, offyd 2234%else 2235 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2236 offx, offy, see, overlap, unused1, unused2, lstride 2237 2238 mov offyd, seed 2239 mov offxd, seed 2240%endif 2241 ror offyd, 8 2242 shr offxd, 12 2243 and offyd, 0xf 2244 imul offyd, 164>>%3 2245 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx 2246 2247%if ARCH_X86_32 2248 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2249%else 2250 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2251 h, offxy, see, overlap, unused1, unused2, lstride, luma_bak 2252%endif 2253 2254%%loop_x_odd: 2255 mov hd, r7m 2256 mov grain_lutq, grain_lutmp 2257%%loop_y: 2258 ; src 2259%if ARCH_X86_32 2260 mov lumaq, r9mp 2261%endif 2262%if %2 2263 mova m4, [lumaq+ 0] 2264 mova m6, [lumaq+16] 2265 mova m0, [srcq] 2266%if ARCH_X86_32 2267 add lumaq, r10mp 2268 mov r9mp, lumaq 2269 mov r5, r5m 2270 movd m7, [base+pb_1] 2271%else 2272 movd m7, [pb_1] 2273%endif 2274 pshufd m7, m7, q0000 2275 pxor m2, m2 2276 pmaddubsw m4, m7 2277 pmaddubsw m6, m7 2278 pavgw m4, m2 2279 pavgw m6, m2 2280%else 2281 mova m4, [lumaq] 2282 mova m0, [srcq] 2283%if ARCH_X86_32 2284 add lumaq, r10mp 2285 mov r9mp, lumaq 2286%endif 2287 pxor m2, m2 2288%endif 2289 2290%if %1 2291%if %2 2292 packuswb m4, m6 ; luma 2293%endif 
2294 punpckhbw m6, m4, m0 2295 punpcklbw m4, m0 ; { luma, chroma } 2296 pmaddubsw m6, m14 2297 pmaddubsw m4, m14 2298 psraw m6, 6 2299 psraw m4, 6 2300 paddw m6, m15 2301 paddw m4, m15 2302 packuswb m4, m6 ; pack+unpack = clip 2303 punpckhbw m6, m4, m2 2304 punpcklbw m4, m2 2305%elif %2 == 0 2306 punpckhbw m6, m4, m2 2307 punpcklbw m4, m2 2308%endif 2309 2310 ; scaling[luma_src] 2311%if ARCH_X86_32 2312 vpgatherdw m7, m4, scalingq-1, r0, r5 2313 vpgatherdw m5, m6, scalingq-1, r0, r5 2314%else 2315 vpgatherdw m7, m4, scalingq-1, r12, r2 2316 vpgatherdw m5, m6, scalingq-1, r12, r2 2317%endif 2318 REPX {psrlw x, 8}, m7, m5 2319 2320 ; unpack chroma_source 2321 punpckhbw m1, m0, m2 2322 punpcklbw m0, m2 ; m0-1: src as word 2323 2324 ; grain = grain_lut[offy+y][offx+x] 2325 movu m3, [grain_lutq+offxyq+ 0] 2326 pcmpgtb m6, m2, m3 2327 punpcklbw m2, m3, m6 2328 punpckhbw m3, m6 2329 2330 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2331 pmullw m2, m7 2332 pmullw m3, m5 2333 pmulhrsw m2, m11 2334 pmulhrsw m3, m11 2335 2336%if ARCH_X86_32 2337 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2338%endif 2339 2340 ; dst = clip_pixel(src, noise) 2341 paddw m0, m2 2342 paddw m1, m3 2343 pmaxsw m0, m13 2344 pmaxsw m1, m13 2345 pminsw m0, m12 2346 pminsw m1, m12 2347 packuswb m0, m1 2348 movifnidn dstq, dstmp 2349 mova [dstq+srcq], m0 2350 2351%if ARCH_X86_32 2352 add srcq, r2mp 2353 ; we already incremented lumaq above 2354%else 2355 add srcq, r12mp 2356%if %3 2357 lea lumaq, [lumaq+lstrideq*2] 2358%else 2359 add lumaq, lstrideq 2360%endif 2361%endif 2362 add grain_lutq, 82 2363 dec hw 2364 jg %%loop_y 2365 2366%if ARCH_X86_32 2367 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2368 2369 mov wq, r4m 2370%endif 2371 add wq, 16 2372 jge %%end 2373%if ARCH_X86_32 2374 mov srcq, r1mp 2375 mov lumaq, r11mp 2376%else 2377 mov srcq, r11mp 2378%endif 2379 lea lumaq, [luma_bakq+wq*(1+%2)] 2380 add srcq, wq 2381%if ARCH_X86_32 2382 mov r4m, wq 2383 mov 
r9m, lumaq 2384%endif 2385%if %2 == 0 2386 ; adjust top_offxy 2387%if ARCH_X86_32 2388 add dword [rsp+7*mmsize+1*gprsize], 16 2389%else 2390 add r11d, 16 2391%endif 2392 add offxyd, 16 2393 btc dword r8m, 2 2394 jc %%loop_x_even 2395 test dword r8m, 2 2396 jz %%loop_x_odd 2397 jmp %%loop_x_odd_v_overlap 2398%%loop_x_even: 2399%endif 2400 test dword r8m, 1 2401 jz %%loop_x 2402 2403 ; r8m = sbym 2404 test dword r8m, 2 2405 jne %%loop_x_hv_overlap 2406 2407 ; horizontal overlap (without vertical overlap) 2408%%loop_x_h_overlap: 2409%if ARCH_X86_32 2410%if %2 2411 lea r6, [offxyd+16] 2412 mov [rsp+7*mmsize+0*gprsize], r6 2413%else 2414 mov [rsp+7*mmsize+0*gprsize], offxyd 2415%endif 2416 2417 DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut 2418 2419 mov seed, r3m 2420%else 2421 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2422 offx, offy, see, left_offxy, unused1, unused2, lstride 2423 2424%if %2 2425 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2426%else 2427 mov left_offxyd, offyd 2428%endif 2429%endif 2430 mov r6d, seed 2431 or seed, 0xEFF4 2432 shr r6d, 1 2433 test seeb, seeh 2434 lea seed, [r6+0x8000] 2435 cmovp seed, r6d ; updated seed 2436 2437%if ARCH_X86_32 2438 mov r3m, seed 2439 2440 DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx 2441 2442 mov offxd, offyd 2443%else 2444 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2445 offx, offy, see, left_offxy, unused1, unused2, lstride 2446 2447 mov offyd, seed 2448 mov offxd, seed 2449%endif 2450 ror offyd, 8 2451 shr offxd, 12 2452 and offyd, 0xf 2453 imul offyd, 164>>%3 2454 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx 2455 2456%if ARCH_X86_32 2457 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2458%else 2459 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2460 h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak 2461%endif 2462 2463 mov hd, r7m 2464 mov grain_lutq, grain_lutmp 
2465%%loop_y_h_overlap: 2466 ; src 2467%if ARCH_X86_32 2468 mov lumaq, r9mp 2469%endif 2470%if %2 2471 mova m4, [lumaq+ 0] 2472 mova m6, [lumaq+16] 2473 mova m0, [srcq] 2474%if ARCH_X86_32 2475 add lumaq, r10mp 2476 mov r9mp, lumaq 2477 mov r5, r5m 2478 movd m7, [base+pb_1] 2479%else 2480 movd m7, [pb_1] 2481%endif 2482 pshufd m7, m7, q0000 2483 pxor m2, m2 2484 pmaddubsw m4, m7 2485 pmaddubsw m6, m7 2486 pavgw m4, m2 2487 pavgw m6, m2 2488%else 2489 mova m4, [lumaq] 2490 mova m0, [srcq] 2491%if ARCH_X86_32 2492 add lumaq, r10mp 2493 mov r9mp, lumaq 2494%endif 2495 pxor m2, m2 2496%endif 2497 2498%if %1 2499%if %2 2500 packuswb m4, m6 ; luma 2501%endif 2502 punpckhbw m6, m4, m0 2503 punpcklbw m4, m0 ; { luma, chroma } 2504 pmaddubsw m6, m14 2505 pmaddubsw m4, m14 2506 psraw m6, 6 2507 psraw m4, 6 2508 paddw m6, m15 2509 paddw m4, m15 2510 packuswb m4, m6 ; pack+unpack = clip 2511 punpckhbw m6, m4, m2 2512 punpcklbw m4, m2 2513%elif %2 == 0 2514 punpckhbw m6, m4, m2 2515 punpcklbw m4, m2 2516%endif 2517 2518 ; scaling[luma_src] 2519%if ARCH_X86_32 2520 vpgatherdw m7, m4, scalingq-1, r0, r5 2521 vpgatherdw m5, m6, scalingq-1, r0, r5 2522%else 2523 vpgatherdw m7, m4, scalingq-1, r12, r2 2524 vpgatherdw m5, m6, scalingq-1, r12, r2 2525%endif 2526 REPX {psrlw x, 8}, m7, m5 2527 2528 ; unpack chroma_source 2529 punpckhbw m1, m0, m2 2530 punpcklbw m0, m2 ; m0-1: src as word 2531 2532 ; grain = grain_lut[offy+y][offx+x] 2533 movu m4, [grain_lutq+offxyq+ 0] 2534%if ARCH_X86_32 2535 mov r0, [rsp+7*mmsize+0*gprsize] 2536 movd m2, [grain_lutq+r0+ 0] 2537%else 2538 movd m2, [grain_lutq+left_offxyq+ 0] 2539%endif 2540 punpcklbw m2, m4 2541 pmaddubsw m3, m9, m2 2542 pmulhrsw m3, m8 2543 packsswb m3, m3 2544 shufps m3, m4, q3210 2545 pxor m4, m4 2546 pcmpgtb m4, m3 2547 punpcklbw m2, m3, m4 2548 punpckhbw m3, m4 2549 2550 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2551 pmullw m2, m7 2552 pmullw m3, m5 2553 pmulhrsw m2, m11 2554 pmulhrsw m3, m11 2555 2556%if 
ARCH_X86_32 2557 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2558%endif 2559 2560 ; dst = clip_pixel(src, noise) 2561 paddw m0, m2 2562 paddw m1, m3 2563 pmaxsw m0, m13 2564 pmaxsw m1, m13 2565 pminsw m0, m12 2566 pminsw m1, m12 2567 packuswb m0, m1 2568 movifnidn dstq, dstmp 2569 mova [dstq+srcq], m0 2570 2571%if ARCH_X86_32 2572 add srcq, r2mp 2573 ; lumaq has already been incremented above 2574%else 2575 add srcq, r12mp 2576%if %3 2577 lea lumaq, [lumaq+lstrideq*2] 2578%else 2579 add lumaq, lstrideq 2580%endif 2581%endif 2582 add grain_lutq, 82 2583 dec hw 2584 jg %%loop_y_h_overlap 2585 2586%if ARCH_X86_32 2587 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2588 2589 mov wq, r4m 2590%endif 2591 add wq, 16 2592 jge %%end 2593%if ARCH_X86_32 2594 mov srcq, r1mp 2595 mov lumaq, r11mp 2596%else 2597 mov srcq, r11mp 2598%endif 2599 lea lumaq, [luma_bakq+wq*(1+%2)] 2600 add srcq, wq 2601%if ARCH_X86_32 2602 mov r4m, wq 2603 mov r9m, lumaq 2604%endif 2605%if %2 == 0 2606 xor dword r8m, 4 2607 ; adjust top_offxyd 2608%if ARCH_X86_32 2609 add dword [rsp+7*mmsize+1*gprsize], 16 2610%else 2611 add r11d, 16 2612%endif 2613 add offxyd, 16 2614%endif 2615 2616 ; r8m = sbym 2617 test dword r8m, 2 2618%if %2 2619 jne %%loop_x_hv_overlap 2620 jmp %%loop_x_h_overlap 2621%else 2622 jne %%loop_x_odd_v_overlap 2623 jmp %%loop_x_odd 2624%endif 2625 2626%%end: 2627 RET 2628 2629%%vertical_overlap: 2630%if ARCH_X86_32 2631 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2632%else 2633 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 2634%endif 2635 2636 or overlapd, 2 ; top_overlap: overlap & 2 2637 mov r8m, overlapd 2638 movzx sbyd, sbyb 2639%if ARCH_X86_32 2640 imul r4, [fg_dataq+FGData.seed], 0x00010001 2641 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 2642%else 2643 imul seed, [fg_dataq+FGData.seed], 0x00010001 2644%endif 2645 imul tmpd, sbyd, 173 * 0x00010001 2646 imul sbyd, 37 * 0x01000100 
2647 add tmpd, (105 << 16) | 188 2648 add sbyd, (178 << 24) | (141 << 8) 2649 and tmpd, 0x00ff00ff 2650 and sbyd, 0xff00ff00 2651 xor seed, tmpd 2652%if ARCH_X86_32 2653 xor sbyd, seed ; (cur_seed << 16) | top_seed 2654 2655 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2656 2657 mov r3m, seed 2658 mov wq, r4m 2659%if %3 2660 shl r10mp, 1 2661%endif 2662%else 2663 xor seed, sbyd ; (cur_seed << 16) | top_seed 2664 2665 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2666 tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak 2667 2668 mov lstrideq, r10mp 2669%endif 2670 2671 mov lumaq, r9mp 2672 lea src_bakq, [srcq+wq] 2673 lea luma_bakq, [lumaq+wq*(1+%2)] 2674 neg wq 2675 sub r0mp, srcq 2676%if ARCH_X86_32 2677 mov r1m, src_bakq 2678 mov r11m, luma_bakq 2679 mov r4m, wq 2680 2681 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2682%else 2683 mov r11mp, src_bakq 2684 mov r12mp, strideq 2685%endif 2686 2687%%loop_x_v_overlap: 2688%if ARCH_X86_32 2689 mov seed, r3m 2690 xor tmpd, tmpd 2691%endif 2692 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 2693 mov r6d, seed 2694 or seed, 0xeff4eff4 2695 test seeb, seeh 2696 setp tmpb ; parity of top_seed 2697 shr seed, 16 2698 shl tmpd, 16 2699 test seeb, seeh 2700 setp tmpb ; parity of cur_seed 2701 or r6d, 0x00010001 2702 xor tmpd, r6d 2703 mov seed, tmpd 2704 ror seed, 1 ; updated (cur_seed << 16) | top_seed 2705 2706%if ARCH_X86_32 2707 mov r3m, seed 2708 2709 DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx 2710 2711 mov offxd, offyd 2712%else 2713 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2714 offx, offy, see, overlap, top_offxy, unused, lstride 2715 2716 mov offxd, seed 2717 mov offyd, seed 2718%endif 2719 ror offyd, 8 2720 ror offxd, 12 2721 and offyd, 0xf000f 2722 and offxd, 0xf000f 2723 imul offyd, 164>>%3 2724 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2725 lea offyq, 
[offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 2726 2727%if ARCH_X86_32 2728 DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy 2729%else 2730 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2731 h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak 2732%endif 2733 2734 movzx top_offxyd, offxyw 2735 shr offxyd, 16 2736%if ARCH_X86_32 2737 mov [rsp+7*mmsize+1*gprsize], top_offxyd 2738 2739 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2740%endif 2741 2742%%loop_x_odd_v_overlap: 2743 mov hd, r7m 2744 mov grain_lutq, grain_lutmp 2745%if ARCH_X86_32 2746 mov r5, r5m 2747%endif 2748%if %3 2749 mova m1, [PIC_ptr(pb_23_22)] 2750%else 2751 mova m1, [PIC_ptr(pb_27_17)] 2752%endif 2753%%loop_y_v_overlap: 2754%if ARCH_X86_32 2755 mov lumaq, r9mp 2756%endif 2757%if %2 2758 mova m4, [lumaq+ 0] 2759 mova m6, [lumaq+16] 2760 mova m0, [srcq] 2761%if ARCH_X86_32 2762 add lumaq, r10mp 2763 mov r9mp, lumaq 2764 mov r5, r5m 2765 movd m7, [base+pb_1] 2766%else 2767 movd m7, [pb_1] 2768%endif 2769 pshufd m7, m7, q0000 2770 pxor m2, m2 2771 pmaddubsw m4, m7 2772 pmaddubsw m6, m7 2773 pavgw m4, m2 2774 pavgw m6, m2 2775%else 2776 mova m4, [lumaq] 2777 mova m0, [srcq] 2778%if ARCH_X86_32 2779 add lumaq, r10mp 2780 mov r9mp, lumaq 2781%endif 2782 pxor m2, m2 2783%endif 2784 2785%if %1 2786%if %2 2787 packuswb m4, m6 ; luma 2788%endif 2789 punpckhbw m6, m4, m0 2790 punpcklbw m4, m0 ; { luma, chroma } 2791 pmaddubsw m6, m14 2792 pmaddubsw m4, m14 2793 psraw m6, 6 2794 psraw m4, 6 2795 paddw m6, m15 2796 paddw m4, m15 2797 packuswb m4, m6 ; pack+unpack = clip 2798 punpckhbw m6, m4, m2 2799 punpcklbw m4, m2 2800%elif %2 == 0 2801 punpckhbw m6, m4, m2 2802 punpcklbw m4, m2 2803%endif 2804 2805 ; scaling[luma_src] 2806%if ARCH_X86_32 2807 vpgatherdw m7, m4, scalingq-1, r0, r5 2808 vpgatherdw m5, m6, scalingq-1, r0, r5 2809%else 2810 vpgatherdw m7, m4, scalingq-1, r12, r2 2811 vpgatherdw m5, m6, scalingq-1, r12, r2 2812%endif 2813 REPX 
{psrlw x, 8}, m7, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu            m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov             r0, [rsp+7*mmsize+1*gprsize]   ; top_offxy
    movu            m4, [grain_lutq+r0]
%else
    movu            m4, [grain_lutq+top_offxyq]
%endif
    ; blend top and current grain rows with the weights in m1
    punpckhbw       m6, m4, m3
    punpcklbw       m4, m3
    pmaddubsw       m2, m1, m6
    pmaddubsw       m3, m1, m4
    pmulhrsw        m2, m8
    pmulhrsw        m3, m8
    packsswb        m3, m2
    ; sign-extend blended grain bytes back to words
    pxor            m6, m6
    pcmpgtb         m6, m3
    punpcklbw       m2, m3, m6
    punpckhbw       m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw          m2, m7
    pmullw          m3, m5
    pmulhrsw        m2, m11
    pmulhrsw        m3, m11

    ; unpack chroma_source
    pxor            m4, m4
    punpckhbw       m6, m0, m4
    punpcklbw       m0, m4                 ; m0-1: src as word

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m6, m3
    pmaxsw          m0, m13
    pmaxsw          m6, m13
    pminsw          m0, m12
    pminsw          m6, m12
    packuswb        m0, m6
    movifnidn       dstq, dstmp
    mova            [dstq+srcq], m0

    dec             hw
    je              %%end_y_v_overlap
%if ARCH_X86_32
    add             srcq, r2mp
    ; lumaq has already been incremented above
%else
    add             srcq, r12mp
%if %3
    lea             lumaq, [lumaq+lstrideq*2]
%else
    add             lumaq, lstrideq
%endif
%endif
    add             grain_lutq, 82         ; next grain_lut row (stride 82)
%if %3 == 0
    ; unsubsampled-vertical case: only the first two rows blend with the
    ; top block; bit 16 of hd counts them, swapping 27/17 -> 17/27
    btc             hd, 16
%if ARCH_X86_32
    mov             r5, r5m
%endif
    mova            m1, [PIC_ptr(pb_17_27)]
    jnc             %%loop_y_v_overlap
%endif
    jmp             %%loop_y

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov             wq, r4m
%endif
    add             wq, 16
    jge             %%end_hv
%if ARCH_X86_32
    mov             srcq, r1mp
    mov             lumaq, r11mp
%else
    mov             srcq, r11mp
%endif
    ; rewind luma/src to the start of the next 16-pixel column
    lea             lumaq, [luma_bakq+wq*(1+%2)]
    add             srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif

%if %2
    ; since
fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
%if ARCH_X86_32
    add             dword [rsp+7*mmsize+1*gprsize], 16
%else
    add             top_offxyd, 16
%endif
    add             offxyd, 16
    ; bit 2 of r8m tracks odd/even 16-pixel column phase for %2 == 0
    btc             dword r8m, 2
    jnc             %%loop_x_odd_v_overlap
%endif

%%loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

    ; derive left/topleft offsets from the previous column's offsets
    mov             r6, [rsp+7*mmsize+1*gprsize]
%if %2
    lea             r0, [r3d+16]
    add             r6, 16
    mov             [rsp+7*mmsize+0*gprsize], r0   ; left_offxy
%else
    mov             [rsp+7*mmsize+0*gprsize], r3   ; left_offxy
%endif
    mov             [rsp+7*mmsize+2*gprsize], r6   ; topleft_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

    mov             seed, r3m
    xor             tmpd, tmpd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

%if %2
    lea             topleft_offxyq, [top_offxyq+16]
    lea             left_offxyq, [offxyq+16]
%else
    mov             topleft_offxyq, top_offxyq
    mov             left_offxyq, offxyq
%endif

    ; we assume from the block above that bits 8-15 of tmpd are zeroed
%endif
    ; advance both 16-bit LFSR seeds (top half = top block's seed,
    ; bottom half = current block's seed) by one step in parallel
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            tmpb                   ; parity of top_seed
    shr             seed, 16
    shl             tmpd, 16
    test            seeb, seeh
    setp            tmpb                   ; parity of cur_seed
    or              r6d, 0x00010001
    xor             tmpd, r6d
    mov             seed, tmpd
    ror             seed, 1                ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx

    mov             offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov             offxd, seed
    mov             offyd, seed
%endif
    ; extract two 4-bit grain offsets (cur and top) from each seed half
    ror             offyd, 8
    ror             offxd, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd,
164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif

    ; split packed (cur_offxy << 16) | top_offxy
    movzx           top_offxyd, offxyw
    shr             offxyd, 16
%if ARCH_X86_32
    mov             [rsp+7*mmsize+1*gprsize], top_offxyd
%endif

    mov             hd, r7m
    mov             grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov             r5, r5m
%endif
    ; m3 = vertical blend weights (23/22 when vertically subsampled)
%if %3
    mova            m3, [PIC_ptr(pb_23_22)]
%else
    mova            m3, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov             r0, [rsp+7*mmsize+2*gprsize]   ; topleft_offxy
    mov             r5, [rsp+7*mmsize+1*gprsize]   ; top_offxy
    movd            m1, [grain_lutq+r0]
    mov             r0, [rsp+7*mmsize+0*gprsize]   ; left_offxy
%else
    movd            m1, [grain_lutq+topleft_offxyq]
%endif
    movu            m2, [grain_lutq+offxyq]
%if ARCH_X86_32
    movu            m6, [grain_lutq+r5]
    movd            m4, [grain_lutq+r0]
%else
    movu            m6, [grain_lutq+top_offxyq]
    movd            m4, [grain_lutq+left_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m1, m6
    punpcklbw       m4, m2
    pmaddubsw       m0, m9, m1
    pmaddubsw       m1, m9, m4
    REPX            {pmulhrsw x, m8}, m0, m1
    packsswb        m0, m1
    ; splice the h-blended edge pixels back into the full rows
    shufps          m4, m0, m2, q3232
    shufps          m0, m6, q3210
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw       m2, m0, m4
    punpckhbw       m0, m4
    pmaddubsw       m4, m3, m0
    pmaddubsw       m1, m3, m2
    pmulhrsw        m4, m8
    pmulhrsw        m1, m8
    packsswb        m1, m4                 ; m1 = fully blended grain bytes

    ; src
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov             lumaq, r9mp
%endif
%if %2
    ; horizontally subsampled: 32 luma bytes pair-averaged to 16 words
    mova            m4, [lumaq+ 0]
    mova            m6, [lumaq+16]
    mova            m0, [srcq]
%if ARCH_X86_32
    add             lumaq, r10mp
    mov             r9mp, lumaq
    mov             r5, r5m
    movd            m7, [base+pb_1]
%else
    movd            m7, [pb_1]
%endif
    pshufd          m7, m7, q0000
    pxor            m2, m2
    ; sum adjacent luma byte pairs, then (sum+1)>>1 via pavgw with zero
    pmaddubsw       m4, m7
    pmaddubsw       m6, m7
    pavgw           m4, m2
    pavgw           m6, m2
%else
    ; unsubsampled: luma maps 1:1 to chroma
    mova            m4, [lumaq]
    mova            m0, [srcq]
%if ARCH_X86_32
    add             lumaq, r10mp
    mov             r9mp, lumaq
%endif
    pxor            m2, m2
%endif

%if %1
    ; chroma-scaling-from-luma with multiplier/offset (m14/m15) and clip
%if %2
    packuswb        m4, m6                 ; luma
%endif
    punpckhbw       m6, m4, m0
    punpcklbw       m4, m0                 ; { luma, chroma }
    pmaddubsw       m6, m14
    pmaddubsw       m4, m14
    psraw           m6, 6
    psraw           m4, 6
    paddw           m6, m15
    paddw           m4, m15
    packuswb        m4, m6                 ; pack+unpack = clip
    punpckhbw       m6, m4, m2
    punpcklbw       m4, m2
%elif %2 == 0
    punpckhbw       m6, m4, m2
    punpcklbw       m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw      m7, m4, scalingq-1, r0, r5
    vpgatherdw      m5, m6, scalingq-1, r0, r5
%else
%if %3
    vpgatherdw      m7, m4, scalingq-1, r2, r12
    vpgatherdw      m5, m6, scalingq-1, r2, r12
%else
    vpgatherdw      m7, m4, scalingq-1, r2, r13
    vpgatherdw      m5, m6, scalingq-1, r2, r13
%endif
%endif
    REPX            {psrlw x, 8}, m7, m5

    ; unpack grain (sign-extend the blended bytes in m1 to words)
    pxor            m4, m4
    pcmpgtb         m4, m1
    punpcklbw       m2, m1, m4
    punpckhbw       m1, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw          m2, m7
    pmullw          m1, m5
    pmulhrsw        m2, m11
    pmulhrsw        m1, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; unpack chroma source
    pxor            m4, m4
    punpckhbw       m5, m0, m4
    punpcklbw       m0, m4                 ; m0-1: src as word

    ; dst = clip_pixel(src, noise)
    paddw           m0, m2
    paddw           m5, m1
    pmaxsw          m0, m13
    pmaxsw          m5, m13
    pminsw          m0, m12
    pminsw          m5, m12
    packuswb        m0, m5
    movifnidn       dstq, dstmp
    mova            [dstq+srcq], m0

%if ARCH_X86_32
    add             srcq,
r2mp
    ; lumaq has been adjusted above already
%else
    add             srcq, r12mp
%if %3
    lea             lumaq, [lumaq+lstrideq*(1+%2)]
%else
    add             lumaq, r10mp
%endif
%endif
    add             grain_lutq, 82         ; next grain_lut row (stride 82)
    dec             hw
%if %3
    ; vertically subsampled: only the first row blends with the top
    ; block, remaining rows use plain h-overlap
    jg              %%loop_y_h_overlap
%else
    jle             %%end_y_hv_overlap
%if ARCH_X86_32
    mov             r5, r5m
%endif
    ; second overlap row swaps the weights to 17/27 (bit 16 of hd
    ; counts the two rows, as in the v-overlap loop)
    mova            m3, [PIC_ptr(pb_17_27)]
    btc             hd, 16
    jnc             %%loop_y_hv_overlap
%if ARCH_X86_64
    mov             lstrideq, r10mp
%endif
    jmp             %%loop_y_h_overlap
%%end_y_hv_overlap:
%if ARCH_X86_64
    mov             lstrideq, r10mp
%endif
%endif

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov             wq, r4m
%endif
    add             wq, 16
    jge             %%end_hv
%if ARCH_X86_32
    mov             srcq, r1mp
    mov             lumaq, r11mp
%else
    mov             srcq, r11mp
%endif
    ; rewind luma/src to the start of the next 16-pixel column
    lea             lumaq, [luma_bakq+wq*(1+%2)]
    add             srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif
%if %2
    jmp             %%loop_x_hv_overlap
%else
%if ARCH_X86_32
    add             dword [rsp+7*mmsize+1*gprsize], 16
%else
    add             top_offxyd, 16
%endif
    add             offxyd, 16
    xor             dword r8m, 4           ; toggle odd/even column phase
    jmp             %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro

    ; instantiate the loop twice: with and without chroma-scaling-from-luma
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro

FGUV_FN 420, 1, 1

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 422, 1, 0

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 444, 0, 0