1; Copyright © 2019, VideoLAN and dav1d authors 2; Copyright © 2019, Two Orioles, LLC 3; All rights reserved. 4; 5; Redistribution and use in source and binary forms, with or without 6; modification, are permitted provided that the following conditions are met: 7; 8; 1. Redistributions of source code must retain the above copyright notice, this 9; list of conditions and the following disclaimer. 10; 11; 2. Redistributions in binary form must reproduce the above copyright notice, 12; this list of conditions and the following disclaimer in the documentation 13; and/or other materials provided with the distribution. 14; 15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "ext/x86/x86inc.asm"

; AV1 film grain synthesis, 8 bpc, SSSE3.
; This part of the file generates the grain templates: a luma grain plane
; (73 rows, 82-byte stride) and 4:2:0 chroma grain planes (38 rows of 44
; pixels, same 82-byte stride), each filled from the shared
; gaussian_sequence table via a 16-bit PRNG, then optionally filtered by
; an auto-regressive (AR) filter of lag 0-3 selected by
; FGData.ar_coeff_lag.

SECTION_RODATA

pw_1024: times 8 dw 1024
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
; per-lane select mask used by the seed-update pshufb
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
; per-plane seed xor constants, indexed by uv plane (u, then v)
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_23_22: times 2 db 23, 22
pb_1: times 4 db 1
; multiplier tables indexed by a shift amount (used with pmulhrsw/pmullw)
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
; AR rounding constants, indexed by FGData.ar_coeff_shift
round_vals: dw 32, 64, 128, 256, 512
; clipping bounds: full range, then limited-range variants
max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1

; pb_17_27 - 2 reads the last two bytes of pb_27_17 (27,17) followed by
; the start of pb_17_27 (17,27), yielding the 27,17,17,27 pattern without
; storing a separate table.
%define pb_27_17_17_27 pb_17_27 - 2

; JMP_TABLE name, idx...
; Emits <name>_table: a table of 32-bit offsets (relative to the table
; itself) to the function's .ar<idx> local labels; used to dispatch on
; FGData.ar_coeff_lag.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3

; Layout of the film-grain parameter struct passed in by the caller.
; Field order/sizes must match the C-side definition — do not reorder.
struc FGData
    .seed:                      resd 1
    .num_y_points:              resd 1
    .y_points:                  resb 14 * 2
    .chroma_scaling_from_luma:  resd 1
    .num_uv_points:             resd 2
    .uv_points:                 resb 2 * 10 * 2
    .scaling_shift:             resd 1
    .ar_coeff_lag:              resd 1
    .ar_coeffs_y:               resb 24
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    .ar_coeff_shift:            resq 1
    .grain_scale_shift:         resd 1
    .uv_mult:                   resd 2
    .uv_luma_mult:              resd 2
    .uv_offset:                 resd 2
    .overlap_flag:              resd 1
    .clip_to_restricted_range:  resd 1
endstruc

cextern gaussian_sequence

SECTION .text

; SCRATCH reg, alias, slot
; x86-32 has only 8 XMM registers: spill m<reg> to stack slot <slot> and
; redefine m<alias> as that memory operand. On x86-64 (16 regs) just swap
; the register numbers so m<alias> is a real register.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro

;------------------------------------------------------------------------
; generate_grain_y(buf, fg_data)
; Fills the 73-row, 82-byte-stride luma grain buffer with pmulhrsw-scaled
; entries of gaussian_sequence, driven by four parallel 16-bit seeds
; derived from FGData.seed, then tail-jumps to the .ar<lag> AR routine
; chosen by FGData.ar_coeff_lag (.ar0 = no filtering, just RET).
;------------------------------------------------------------------------
INIT_XMM ssse3
cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
    LEA r4, $$
%define base r4-$$                    ; PIC base for all RODATA references
    movq m1, [base+rnd_next_upperbit_mask]
    movq m4, [base+mul_bits]
    movq m7, [base+hmul_bits]
    mov r2d, [fg_dataq+FGData.grain_scale_shift]
    movd m2, [base+round+r2*2]        ; rounding multiplier for this shift
    movd m0, [fg_dataq+FGData.seed]
    mova m5, [base+pb_mask]
    pshuflw m2, m2, q0000
    pshuflw m0, m0, q0000
    mov r2, -73*82                    ; negative byte count; buf is filled
    sub bufq, r2                      ; back-to-front via [bufq+r2]
    lea r3, [base+gaussian_sequence]
.loop:
    ; advance the PRNG: compute the next 4 seeds in parallel in m3/m0
    pand m6, m0, m1
    psrlw m3, m6, 10
    por m6, m3                        ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw m6, m4                     ; bits 0x0f00 are set
    pshufb m3, m5, m6                 ; set 15th bit for next 4 seeds
    psllq m6, m3, 30
    por m3, m6
    psllq m6, m3, 15
    por m3, m6                        ; aggregate each bit into next seed's high bit
    pmulhuw m6, m0, m7
    por m3, m6                        ; 4 next output seeds
    pshuflw m0, m3, q3333
    psrlw m3, 5                       ; 11-bit indices into gaussian_sequence
%if ARCH_X86_64
    movq r6, m3
    mov r8, r6
    movzx r5d, r6w
    shr r6d, 16
    shr r8, 32
    movzx r7, r8w
    shr r8, 16

    movd m6, [r3+r5*2]
    pinsrw m6, [r3+r6*2], 1
    pinsrw m6, [r3+r7*2], 2
    pinsrw m6, [r3+r8*2], 3
%else
    ; 32-bit: only two scratch GPRs, so extract the 4 indices in two pairs
    movd r6, m3
    pshuflw m3, m3, q3232
    movzx r5, r6w
    shr r6, 16

    movd m6, [r3+r5*2]
    pinsrw m6, [r3+r6*2], 1

    movd r6, m3
    movzx r5, r6w
    shr r6, 16

    pinsrw m6, [r3+r5*2], 2
    pinsrw m6, [r3+r6*2], 3
%endif
    pmulhrsw m6, m2                   ; apply grain_scale_shift rounding
    packsswb m6, m6
    movd [bufq+r2], m6                ; store 4 grain bytes
    add r2, 4
    jl .loop

    ; auto-regression code
    movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd r2, [base+generate_grain_y_ssse3_table+r2*4]
    lea r2, [r2+base+generate_grain_y_ssse3_table]
    jmp r2

; lag-1 AR: scalar inner loop; ar_coeffs_y[0..2] are applied (via pmaddwd)
; to the row above, ar_coeffs_y[3] to the running left value; result is
; rounded, shifted by ar_coeff_shift and clamped to [-128,127].
.ar1:
%if ARCH_X86_32
    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    mov bufq, r0
%else
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd m4, [fg_dataq+FGData.ar_coeffs_y]
    mov ecx, [fg_dataq+FGData.ar_coeff_shift]
%if ARCH_X86_32
    ; keep h and cf3 in stack slots; 32-bit mode is register-starved
    mov r1m, cf3d
    DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
    pxor m6, m6
    pcmpgtb m7, m6, m4
    punpcklbw m4, m7                  ; sign-extend coeff bytes to words
    pinsrw m4, [base+pw_1], 3         ; lane 3 = 1, multiplies the rnd word
    pshufd m5, m4, q1111
    pshufd m4, m4, q0000
    movd m3, [base+round_vals+shiftq*2-12] ; rnd
    pshuflw m3, m3, q0000
    sub bufq, 82*73-(82*3+79)         ; point at first AR-filtered pixel
    mov hd, 70
    mov mind, -128
    mov maxd, 127
.y_loop_ar1:
    mov xq, -76
    movsx val3d, byte [bufq+xq-1]     ; running "left" value
.x_loop_ar1:
    ; vector part: 4 pixels' worth of top-row contributions at once
    movq m0, [bufq+xq-82-1]           ; top/left
    pcmpgtb m7, m6, m0
    punpcklbw m0, m7
    psrldq m2, m0, 2                  ; top
    psrldq m1, m0, 4                  ; top/right
    punpcklwd m0, m2
    punpcklwd m1, m3
    pmaddwd m0, m4
    pmaddwd m1, m5
    paddd m0, m1
.x_loop_ar1_inner:
    ; scalar part: serial left-neighbour dependency, one pixel at a time
    movd val0d, m0
    psrldq m0, 4
    imul val3d, cf3d
    add val3d, val0d
    sar val3d, shiftb
    movsx val0d, byte [bufq+xq]
    add val3d, val0d
    cmp val3d, maxd
    cmovns val3d, maxd
    cmp val3d, mind
    cmovs val3d, mind
    mov byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xq, 3                        ; refill the vector part every 4 px
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar1
.ar0:                                 ; lag 0: nothing to filter
    RET

; lag-2 AR: 12 coefficients over the two rows above (x offsets -2..+2)
; plus the two left neighbours on the current row (serial inner loop).
.ar2:
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*8
%endif
    DEFINE_ARGS buf, fg_data, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m6, [base+round_vals-12+shiftq*2]
    movd m7, [base+byte_blend+1]
    SCRATCH 7, 15, 7
    movq m0, [fg_dataq+FGData.ar_coeffs_y+0]  ; cf0-7
    movd m1, [fg_dataq+FGData.ar_coeffs_y+8]  ; cf8-11
    pxor m7, m7
    pshuflw m6, m6, q0000
    punpcklwd m6, m7
    pcmpgtb m4, m7, m0
    pcmpgtb m5, m7, m1
    punpcklbw m0, m4                  ; sign-extend coeffs to words
    punpcklbw m1, m5
    DEFINE_ARGS buf, fg_data, h, x
    ; broadcast coefficient pairs; m8-m14 (SCRATCHed) hold them for the loop
    pshufd m4, m1, q0000
    pshufd m5, m1, q1111
    pshufd m3, m0, q3333
    pshufd m2, m0, q2222
    pshufd m1, m0, q1111
    pshufd m0, m0, q0000
    SCRATCH 0, 8, 0
    SCRATCH 1, 9, 1
    SCRATCH 2, 10, 2
    SCRATCH 3, 11, 3
    SCRATCH 4, 12, 4
    SCRATCH 5, 13, 5
    SCRATCH 6, 14, 6
    sub bufq, 82*73-(82*3+79)
    mov hd, 70
.y_loop_ar2:
    mov xq, -76

.x_loop_ar2:
    ; accumulate the two-rows-above contribution for 4 pixels into m2
    movq m0, [bufq+xq-82*2-2]         ; y=-2,x=[-2,+5]
    movhps m0, [bufq+xq-82*1-2]       ; y=-1,x=[-2,+5]
    pcmpgtb m2, m7, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    psrldq m5, m0, 2                  ; y=-2,x=[-1,+5]
    psrldq m3, m1, 2                  ; y=-1,x=[-1,+5]
    psrldq m4, m1, 4                  ; y=-1,x=[+0,+5]
    punpcklwd m2, m0, m5
    punpcklwd m3, m4
    pmaddwd m2, m8
    pmaddwd m3, m11
    paddd m2, m3

    psrldq m4, m0, 4                  ; y=-2,x=[+0,+5]
    psrldq m5, m0, 6                  ; y=-2,x=[+1,+5]
    psrldq m6, m0, 8                  ; y=-2,x=[+2,+5]
    punpcklwd m4, m5
    punpcklwd m6, m1
    psrldq m5, m1, 6                  ; y=-1,x=[+1,+5]
    psrldq m1, m1, 8                  ; y=-1,x=[+2,+5]
    punpcklwd m5, m1
    pmaddwd m4, m9
    pmaddwd m6, m10
    pmaddwd m5, m12
    paddd m4, m6
    paddd m2, m5
    paddd m2, m4
    paddd m2, m14                     ; + rounding constant

    movq m0, [bufq+xq-2]              ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; serial part: current-row left neighbours, one pixel per iteration
    pcmpgtb m4, m7, m0
    punpcklbw m1, m0, m4
    pmaddwd m3, m1, m13
    paddd m3, m2
    psrldq m1, 4                      ; y=0,x=0
    psrldq m2, 4                      ; shift top to next pixel
    psrad m3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw m3, m1
    packsswb m3, m3
    pslldq m3, 2
    pand m3, m15                      ; blend the new pixel into m0 via mask
    pandn m1, m15, m0
    por m0, m1, m3
    psrldq m0, 1
    ; overwrite 2 pixels, but that's ok
    movd [bufq+xq-1], m0
    inc xq
    jz .x_loop_ar2_end
    test xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar2
    RET

; lag-3 AR: 24 coefficients over the three rows above (x offsets -3..+3)
; plus three left neighbours; row coefficients live in stack slots
; [rsp+0..5*16] and SCRATCH regs m8-m13.
.ar3:
    DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
%assign stack_offset stack_offset_old
    ALLOC_STACK -16*14
%elif WIN64
    SUB rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
    ALLOC_STACK -16*6
%endif
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m6, [base+round_vals-12+shiftq*2]
    movd m7, [base+byte_blend]
    movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0]  ; cf0-15
    movq m2, [fg_dataq+FGData.ar_coeffs_y+16]  ; cf16-23
    pxor m3, m3
    pcmpgtb m4, m3, m0
    pcmpgtb m3, m2
    pshuflw m6, m6, q0000
    SCRATCH 6, 14, 12
    SCRATCH 7, 15, 13
    punpckhbw m1, m0, m4              ; sign-extend all coeffs to words
    punpcklbw m0, m4
    punpcklbw m2, m3
    pshufd m3, m0, q1111
    pshufd m4, m0, q2222
    pshufd m5, m0, q3333
    pshufd m0, m0, q0000
    mova [rsp+ 0*16], m0
    mova [rsp+ 1*16], m3
    mova [rsp+ 2*16], m4
    mova [rsp+ 3*16], m5
    pshufd m6, m1, q1111
    pshufd m7, m1, q2222
    pshufd m5, m1, q3333
    pshufd m1, m1, q0000
    pshufd m3, m2, q1111
    psrldq m0, m2, 10
    pinsrw m2, [base+pw_1], 5         ; 1-lane multiplies the rnd word
    pshufd m4, m2, q2222
    pshufd m2, m2, q0000
    pinsrw m0, [base+round_vals+shiftq*2-10], 3
    mova [rsp+ 4*16], m1
    mova [rsp+ 5*16], m6
    SCRATCH 7, 8, 6
    SCRATCH 5, 9, 7
    SCRATCH 2, 10, 8
    SCRATCH 3, 11, 9
    SCRATCH 4, 12, 10
    SCRATCH 0, 13, 11
    DEFINE_ARGS buf, fg_data, h, x
    sub bufq, 82*73-(82*3+79)
    mov hd, 70
.y_loop_ar3:
    mov xq, -76

.x_loop_ar3:
    ; row y=-3 contribution
    movu m0, [bufq+xq-82*3-3]         ; y=-3,x=[-3,+12]
    pxor m3, m3
    pcmpgtb m3, m0
    punpckhbw m2, m0, m3
    punpcklbw m0, m3

    psrldq m5, m0, 2
    psrldq m6, m0, 4
    psrldq m7, m0, 6
    punpcklwd m4, m0, m5
    punpcklwd m6, m7
    pmaddwd m4, [rsp+ 0*16]
    pmaddwd m6, [rsp+ 1*16]
    paddd m4, m6

    ; row y=-2 contribution
    movu m1, [bufq+xq-82*2-3]         ; y=-2,x=[-3,+12]
    pxor m5, m5
    pcmpgtb m5, m1
    punpckhbw m3, m1, m5
    punpcklbw m1, m5
    palignr m6, m2, m0, 10
    palignr m7, m2, m0, 12
    psrldq m0, 8
    punpcklwd m0, m6
    punpcklwd m7, m1
    pmaddwd m0, [rsp+ 2*16]
    pmaddwd m7, [rsp+ 3*16]
    paddd m0, m7
    paddd m0, m4

    psrldq m4, m1, 2
    psrldq m5, m1, 4
    psrldq m6, m1, 6
    psrldq m7, m1, 8
    punpcklwd m4, m5
    punpcklwd m6, m7
    pmaddwd m4, [rsp+ 4*16]
    pmaddwd m6, [rsp+ 5*16]
    paddd m4, m6
    paddd m0, m4

    ; row y=-1 contribution
    movu m2, [bufq+xq-82*1-3]         ; y=-1,x=[-3,+12]
    pxor m7, m7
    pcmpgtb m7, m2
    punpckhbw m5, m2, m7
    punpcklbw m2, m7
    palignr m7, m3, m1, 10
    palignr m3, m1, 12
    psrldq m1, m2, 2
    punpcklwd m7, m3
    punpcklwd m3, m2, m1
    pmaddwd m7, m8
    pmaddwd m3, m9
    paddd m7, m3
    paddd m0, m7

    psrldq m6, m2, 4
    psrldq m1, m2, 6
    psrldq m3, m2, 8
    palignr m4, m5, m2, 10
    palignr m5, m5, m2, 12

    punpcklwd m6, m1
    punpcklwd m3, m4
    punpcklwd m5, m14
    pmaddwd m6, m10
    pmaddwd m3, m11
    pmaddwd m5, m12
    paddd m0, m6
    paddd m3, m5
    paddd m0, m3

    movq m1, [bufq+xq-3]              ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    ; serial part: three left neighbours on the current row
    pxor m5, m5
    pcmpgtb m5, m1
    punpcklbw m2, m1, m5
    pmaddwd m2, m13
    pshufd m3, m2, q1111
    paddd m2, m3                      ; left+cur
    paddd m2, m0                      ; add top
    psrldq m0, 4
    psrad m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb m2, m2
    pslldq m2, 3
    pand m2, m15                      ; blend new pixel into m1 via mask
    pandn m3, m15, m1
    por m1, m2, m3
    movd [bufq+xq-3], m1
    psrldq m1, 1
    inc xq
    jz .x_loop_ar3_end
    test xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar3
    RET

;------------------------------------------------------------------------
; generate_grain_uv_420(buf, bufy, fg_data, uv)
; Fills one 4:2:0 chroma grain plane (38 rows, 44 columns, 82-byte
; stride). The PRNG seed is FGData.seed xor'ed with pw_seed_xor[uv],
; then the plane is filled from gaussian_sequence exactly as for luma
; and the .ar<lag> routine runs. The chroma AR filters additionally mix
; in the collocated luma grain from bufy: 2x2 blocks are averaged with
; pmaddubsw against pb_1 plus a row add, then pmulhrsw by hmul_bits[2].
;------------------------------------------------------------------------
INIT_XMM ssse3
cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
    movifnidn r2, r2mp
    movifnidn r3, r3mp
    LEA r4, $$
%define base r4-$$                    ; PIC base for all RODATA references
    movq m1, [base+rnd_next_upperbit_mask]
    movq m4, [base+mul_bits]
    movq m7, [base+hmul_bits]
    mov r5d, [fg_dataq+FGData.grain_scale_shift]
    movd m6, [base+round+r5*2]
    mova m5, [base+pb_mask]
    movd m0, [fg_dataq+FGData.seed]
    movd m2, [base+pw_seed_xor+uvq*4] ; per-plane (u/v) seed perturbation
    pxor m0, m2
    pshuflw m6, m6, q0000
    pshuflw m0, m0, q0000
    lea r6, [base+gaussian_sequence]
%if ARCH_X86_64
    mov r7d, 38                       ; row counter (38 chroma rows)
%else
    mov r3mp, 38                      ; 32-bit: keep the counter in memory
%endif
    add bufq, 44
.loop_y:
    mov r5, -44                       ; 44 bytes per row, filled via [bufq+r5]
.loop_x:
    ; same 4-seed PRNG update as in generate_grain_y
    pand m2, m0, m1
    psrlw m3, m2, 10
    por m2, m3                        ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw m2, m4                     ; bits 0x0f00 are set
    pshufb m3, m5, m2                 ; set 15th bit for next 4 seeds
    psllq m2, m3, 30
    por m3, m2
    psllq m2, m3, 15
    por m3, m2                        ; aggregate each bit into next seed's high bit
    pmulhuw m2, m0, m7
    por m2, m3                        ; 4 next output seeds
    pshuflw m0, m2, q3333
    psrlw m2, 5
%if ARCH_X86_64
    movd r9d, m2
    pshuflw m2, m2, q3232
    movzx r8, r9w
    shr r9, 16

    movd m3, [r6+r8*2]
    pinsrw m3, [r6+r9*2], 1

    movd r9d, m2
    movzx r8, r9w
    shr r9, 16

    pinsrw m3, [r6+r8*2], 2
    pinsrw m3, [r6+r9*2], 3
%else
    ; 32-bit: r1/r2 are free here (restored from stack args afterwards)
    movd r2, m2
    pshuflw m2, m2, q3232
    movzx r1, r2w
    shr r2, 16

    movd m3, [r6+r1*2]
    pinsrw m3, [r6+r2*2], 1

    movd r2, m2
    movzx r1, r2w
    shr r2, 16

    pinsrw m3, [r6+r1*2], 2
    pinsrw m3, [r6+r2*2], 3
%endif
    pmulhrsw m3, m6
    packsswb m3, m3
    movd [bufq+r5], m3
    add r5, 4
    jl .loop_x
    add bufq, 82
%if ARCH_X86_64
    dec r7d
%else
    dec r3mp
%endif
    jg .loop_y

%if ARCH_X86_32
    mov r2, r2mp                      ; reload fg_data (r2 was clobbered above)
%endif

    ; auto-regression code
    movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
    movsxd r5, [base+generate_grain_uv_420_ssse3_table+r5*4]
    lea r5, [r5+base+generate_grain_uv_420_ssse3_table]
    jmp r5

; lag 0: chroma grain is just the downscaled luma grain times
; ar_coeffs_uv[uv][0], rounded by ar_coeff_shift.
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -2*16
%endif
    imul uvd, 28                      ; stride of the per-plane coeff arrays
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd m4, [base+hmul_bits+shiftq*2]
    movd m1, [base+byte_blend]
    DEFINE_ARGS buf, bufy, h
    pxor m0, m0
    pcmpgtb m0, m5
    punpcklbw m5, m0                  ; sign-extend the coefficient
    movd m7, [base+pb_1]
    movd m6, [base+hmul_bits+4]
    pshuflw m5, m5, q0000
    pshuflw m4, m4, q0000
    pshufd m7, m7, q0000
    pshuflw m6, m6, q0000
    punpcklqdq m5, m5
    punpcklqdq m4, m4
    punpcklqdq m6, m6
    punpcklbw m1, m1
    SCRATCH 1, 8, 0
    SCRATCH 4, 9, 1
    sub bufq, 82*38+82-(82*3+41)
    add bufyq, 3+82*3
    mov hd, 35
.y_loop_ar0:
    ; first 32 pixels
    ; average 2x2 luma blocks: pmaddubsw vs pb_1 sums horizontal pairs,
    ; the row add + pmulhrsw(hmul_bits[2]) completes the /4 with rounding
    movu m1, [bufyq]
    movu m2, [bufyq+82]
    movu m3, [bufyq+16]
    movu m4, [bufyq+82+16]
    pmaddubsw m0, m7, m1
    pmaddubsw m1, m7, m2
    pmaddubsw m2, m7, m3
    pmaddubsw m3, m7, m4
    paddw m0, m1
    paddw m2, m3
    pmulhrsw m0, m6
    pmulhrsw m2, m6
    pmullw m0, m5                     ; * luma coefficient
    pmullw m2, m5
    pmulhrsw m0, m9                   ; >> ar_coeff_shift (rounded)
    pmulhrsw m2, m9
    packsswb m0, m2
    movu m1, [bufq]
    punpckhbw m2, m0, m1
    punpcklbw m0, m1
    pmaddubsw m1, m7, m2              ; add luma-derived grain to chroma grain
    pmaddubsw m2, m7, m0
    packsswb m2, m1
    movu [bufq], m2
    add bufyq, 32
    add bufq, 16
    ; bit 16 of hd is a two-pass toggle: the 16-pixel body above runs
    ; twice per row (32 pixels) before falling through to the tail
    xor hd, 0x10000
    test hd, 0x10000
    jnz .y_loop_ar0

    ; last 6 pixels
    movu m1, [bufyq]
    movu m2, [bufyq+82]
    pmaddubsw m0, m7, m1
    pmaddubsw m1, m7, m2
    paddw m0, m1
    pmulhrsw m0, m6
    pmullw m0, m5
    pmulhrsw m0, m9
    packsswb m0, m0
    movq m1, [bufq]
    punpcklbw m0, m1
    pmaddubsw m2, m7, m0
    packsswb m2, m2
    pandn m0, m8, m2                  ; byte_blend mask: keep bytes past x=6
    pand m1, m8
    por m0, m1
    movq [bufq], m0

    add bufq, 82-32
    add bufyq, 82*2-64
    dec hd
    jg .y_loop_ar0
    RET

; lag-1 AR for chroma: like luma .ar1 but with the averaged luma sample
; as an extra tap (ar_coeffs_uv[uv][4]).
.ar1:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
    imul uvd, 28
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
    pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
%if ARCH_X86_32
    mov r3mp, cf3d
    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
    mov bufq, r0
%else
    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
%endif
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m3, [base+round_vals+shiftq*2-12] ; rnd
    movd m7, [base+pb_1]
    movd m6, [base+hmul_bits+4]
    psrldq m4, 1                      ; drop the stray byte loaded at uvq-1
%if ARCH_X86_32
    DEFINE_ARGS buf, shift, val0, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
%else
    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
%endif
    pxor m5, m5
    punpcklwd m3, m5
    punpcklwd m6, m6
    pcmpgtb m5, m4
    punpcklbw m4, m5                  ; sign-extend coeffs to words
    pshufd m5, m4, q1111
    pshufd m4, m4, q0000
    pshufd m3, m3, q0000
    pshufd m7, m7, q0000
    pshufd m6, m6, q0000
    sub bufq, 82*38+44-(82*3+41)
%if ARCH_X86_32
    add r1mp, 79+82*3                 ; bufy kept in a stack slot on x86-32
    mov r0mp, 35
%else
    add bufyq, 79+82*3
    mov hd, 35
%endif
    mov mind, -128
    mov maxd, 127
.y_loop_ar1:
    mov xq, -38
    movsx val3d, byte [bufq+xq-1]     ; running "left" value
.x_loop_ar1:
    ; average 2x2 collocated luma grain for the luma tap
%if ARCH_X86_32
    mov r2, r1mp
    movq m0, [r2+xq*2]
    movq m1, [r2+xq*2+82]
%else
    movq m0, [bufyq+xq*2]
    movq m1, [bufyq+xq*2+82]
%endif
    pmaddubsw m2, m7, m0
    pmaddubsw m0, m7, m1
    paddw m2, m0
    pmulhrsw m2, m6

    movq m0, [bufq+xq-82-1]           ; top/left
    pxor m1, m1
    pcmpgtb m1, m0
    punpcklbw m0, m1
    psrldq m1, m0, 4                  ; top/right
    punpcklwd m1, m2
    psrldq m2, m0, 2                  ; top
    punpcklwd m0, m2
    pmaddwd m0, m4
    pmaddwd m1, m5
    paddd m0, m1
    paddd m0, m3
.x_loop_ar1_inner:
    ; scalar serial part, identical structure to luma .ar1
    movd val0d, m0
    psrldq m0, 4
%if ARCH_X86_32
    imul val3d, r3mp
%else
    imul val3d, cf3d
%endif
    add val3d, val0d
    sar val3d, shiftb
    movsx val0d, byte [bufq+xq]
    add val3d, val0d
    cmp val3d, maxd
    cmovns val3d, maxd
    cmp val3d, mind
    cmovs val3d, mind
    mov byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add bufq, 82
%if ARCH_X86_32
    add r1mp, 82*2
    dec r0mp
%else
    add bufyq, 82*2
    dec hd
%endif
    jg .y_loop_ar1
    RET

; lag-2 AR for chroma: luma .ar2 plus the averaged-luma tap
; (coefficient broadcast in m14, rnd in m15).
.ar2:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
    ALLOC_STACK -8*16
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28
    movd m7, [base+round_vals-12+shiftq*2]
    movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12
    pxor m2, m2
    pcmpgtb m2, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    pinsrw m1, [base+pw_1], 5         ; 1-lane multiplies the rnd word
    punpcklwd m7, m7
    pshufd m7, m7, q0000
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd m4, m1, q0000
    pshufd m5, m1, q1111
    pshufd m6, m1, q2222
    pshufd m3, m0, q3333
    pshufd m2, m0, q2222
    pshufd m1, m0, q1111
    pshufd m0, m0, q0000
    SCRATCH 0, 8, 0
    SCRATCH 1, 9, 1
    SCRATCH 2, 10, 2
    SCRATCH 3, 11, 3
    SCRATCH 4, 12, 4
    SCRATCH 5, 13, 5
    SCRATCH 6, 14, 6
    SCRATCH 7, 15, 7
    movd m7, [base+hmul_bits+4]
    movd m6, [base+pb_1]
    punpcklwd m7, m7
    pshufd m6, m6, q0000
    pshufd m7, m7, q0000
    sub bufq, 82*38+44-(82*3+41)
    add bufyq, 79+82*3
    mov hd, 35
.y_loop_ar2:
    mov xq, -38

.x_loop_ar2:
    ; two-rows-above contribution, same layout as luma .ar2
    pxor m2, m2
    movq m0, [bufq+xq-82*2-2]         ; y=-2,x=[-2,+5]
    movhps m0, [bufq+xq-82*1-2]       ; y=-1,x=[-2,+5]
    pcmpgtb m2, m0
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    psrldq m5, m0, 2                  ; y=-2,x=[-1,+5]
    psrldq m3, m1, 2                  ; y=-1,x=[-1,+5]
    psrldq m4, m1, 4                  ; y=-1,x=[+0,+5]
    punpcklwd m2, m0, m5
    punpcklwd m3, m4
    pmaddwd m2, m8
    pmaddwd m3, m11
    paddd m2, m3

    psrldq m4, m0, 4                  ; y=-2,x=[+0,+5]
    psrldq m5, m0, 6                  ; y=-2,x=[+1,+5]
    psrldq m0, 8                      ; y=-2,x=[+2,+5]
    punpcklwd m4, m5
    punpcklwd m0, m1
    psrldq m3, m1, 6                  ; y=-1,x=[+1,+5]
    psrldq m1, m1, 8                  ; y=-1,x=[+2,+5]
    punpcklwd m3, m1
    pmaddwd m4, m9
    pmaddwd m0, m10
    pmaddwd m3, m12
    paddd m4, m0
    paddd m2, m3
    paddd m2, m4

    ; averaged luma tap (m14 = coeff broadcast, m15 = rnd)
    movq m0, [bufyq+xq*2]
    movq m3, [bufyq+xq*2+82]
    pmaddubsw m1, m6, m0
    pmaddubsw m0, m6, m3
    paddw m0, m1
    pmulhrsw m0, m7
    punpcklwd m0, m15
    pmaddwd m0, m14
    paddd m2, m0

    movq m0, [bufq+xq-2]              ; y=0,x=[-2,+5]
    pxor m4, m4
    movd m5, [base+byte_blend+1]
    punpcklbw m5, m5
.x_loop_ar2_inner:
    ; serial part: current-row left neighbours
    pcmpgtb m1, m4, m0
    punpcklbw m0, m1
    pmaddwd m3, m0, m13
    paddd m3, m2
    psrldq m2, 4                      ; shift top to next pixel
    psrad m3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq m3, 4
    pand m3, m5                       ; insert new pixel word via mask
    paddw m0, m3
    packsswb m0, m0
    movd [bufq+xq-2], m0
    psrldq m0, 1
    inc xq
    jz .x_loop_ar2_end
    test xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add bufq, 82
    add bufyq, 82*2
    dec hd
    jg .y_loop_ar2
    RET

; lag-3 AR for chroma: luma .ar3 plus the averaged-luma tap (coefficient
; 24); row coefficients in [rsp+0..6*16] and SCRATCH regs m8-m15.
.ar3:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn bufyq, bufymp
%if ARCH_X86_32
    ALLOC_STACK -15*16
%else
    SUB rsp, 16*7
%assign stack_size_padded (stack_size_padded+16*7)
%assign stack_size (stack_size+16*7)
%endif
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28

    movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
    pxor m3, m3
    pcmpgtb m3, m0
    punpckhbw m1, m0, m3              ; sign-extend coeffs to words
    punpcklbw m0, m3
    pshufd m2, m0, q1111
    pshufd m3, m0, q2222
    pshufd m4, m0, q3333
    pshufd m0, m0, q0000
    pshufd m5, m1, q1111
    pshufd m6, m1, q2222
    pshufd m7, m1, q3333
    pshufd m1, m1, q0000
    mova [rsp+ 0*16], m0
    mova [rsp+ 1*16], m2
    mova [rsp+ 2*16], m3
    mova [rsp+ 3*16], m4
    mova [rsp+ 4*16], m1
    mova [rsp+ 5*16], m5
    mova [rsp+ 6*16], m6
    SCRATCH 7, 8, 7

    movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
    pxor m4, m4
    pcmpgtb m4, m2
    punpckhbw m5, m2, m4
    punpcklbw m2, m4
    pshufd m4, m2, q3232
    punpcklwd m3, m4, m5
    pshuflw m5, m4, q3321
    pshufd m4, m3, q0000
    pshufd m3, m2, q1111
    pshufd m2, m2, q0000
    pinsrw m5, [base+round_vals+shiftq*2-10], 3
    SCRATCH 2, 9, 8
    SCRATCH 3, 10, 9
    SCRATCH 4, 11, 10
    SCRATCH 5, 12, 11

    movd m2, [base+round_vals-12+shiftq*2]
    movd m1, [base+pb_1]
    movd m3, [base+hmul_bits+4]
    pxor m0, m0
    punpcklwd m2, m0
    punpcklwd m3, m3
    pshufd m2, m2, q0000
    pshufd m1, m1, q0000
    pshufd m3, m3, q0000
    SCRATCH 1, 13, 12
    SCRATCH 2, 14, 13
    SCRATCH 3, 15, 14

    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    sub bufq, 82*38+44-(82*3+41)
    add bufyq, 79+82*3
    mov hd, 35
.y_loop_ar3:
    mov xq, -38

.x_loop_ar3:
    ; row y=-3 contribution
    movu m0, [bufq+xq-82*3-3]         ; y=-3,x=[-3,+12]
    pxor m4, m4
    pcmpgtb m4, m0
    punpckhbw m3, m0, m4
    punpcklbw m0, m4

    psrldq m5, m0, 2
    psrldq m6, m0, 4
    psrldq m7, m0, 6
    punpcklwd m4, m0, m5
    punpcklwd m6, m7
    pmaddwd m4, [rsp+ 0*16]
    pmaddwd m6, [rsp+ 1*16]
    paddd m4, m6

    palignr m2, m3, m0, 10
    palignr m3, m0, 12
    psrldq m0, 8

    ; row y=-2 contribution
    movu m1, [bufq+xq-82*2-3]         ; y=-2,x=[-3,+12]
    pxor m6, m6
    pcmpgtb m6, m1
    punpckhbw m5, m1, m6
    punpcklbw m1, m6

    punpcklwd m0, m2
    punpcklwd m3, m1
    pmaddwd m0, [rsp+ 2*16]
    pmaddwd m3, [rsp+ 3*16]
    paddd m0, m3
    paddd m0, m4

    ; row y=-1 contribution
    movu m2, [bufq+xq-82*1-3]         ; y=-1,x=[-3,+12]
    pxor m7, m7
    pcmpgtb m7, m2
    punpckhbw m6, m2, m7
    punpcklbw m2, m7

    palignr m3, m5, m1, 10
    palignr m5, m1, 12
    psrldq m4, m2, 2

    punpcklwd m3, m5
    punpcklwd m5, m2, m4
    pmaddwd m3, [rsp+ 6*16]
    pmaddwd m5, m8
    paddd m3, m5
    paddd m0, m3

    psrldq m3, m1, 2
    psrldq m4, m1, 4
    psrldq m5, m1, 6
    psrldq m1, 8

    punpcklwd m3, m4
    punpcklwd m5, m1
    pmaddwd m3, [rsp+ 4*16]
    pmaddwd m5, [rsp+ 5*16]
    paddd m3, m5
    paddd m0, m3

    ; averaged luma tap (m13 = pb_1, m15 = hmul_bits[2] broadcast)
    movq m1, [bufyq+xq*2]
    movq m3, [bufyq+xq*2+82]
    pmaddubsw m5, m13, m1
    pmaddubsw m7, m13, m3
    paddw m7, m5
    pmulhrsw m7, m15

    psrldq m1, m2, 4
    psrldq m3, m2, 6
    palignr m4, m6, m2, 10
    palignr m6, m2, 12
    psrldq m2, 8

    punpcklwd m1, m3
    punpcklwd m2, m4
    punpcklwd m6, m7
    pmaddwd m1, m9
    pmaddwd m2, m10
    pmaddwd m6, m11
    paddd m1, m2
    paddd m0, m6
    paddd m0, m1
    paddd m0, m14                     ; + rounding constant

    movq m1, [bufq+xq-3]              ; y=0,x=[-3,+4]
    pxor m4, m4
    movd m5, [base+byte_blend]
.x_loop_ar3_inner:
    ; serial part: three left neighbours on the current row
    pcmpgtb m2, m4, m1
    punpcklbw m3, m1, m2
    pmaddwd m2, m3, m12
    pshufd m3, m2, q1111
    paddd m2, m3                      ; left+cur
    paddd m2, m0                      ; add top
    psrldq m0, 4
    psrad m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    packsswb m2, m2
    pandn m3, m5, m1                  ; blend new pixel byte into m1 via mask
    pslld m2, 24
    pand m2, m5
    por m1, m2, m3
    movd [bufq+xq-3], m1
    psrldq m1, 1
    inc xq
    jz .x_loop_ar3_end
    test xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82
    add bufyq, 82*2
    dec hd
    jg .y_loop_ar3
    RET

; vpgatherdw dst, src, base, tmp_gpr[x2], tmp_xmm_reg
; Emulated gather for pre-AVX2 hardware: for each of the 8 word lanes of
; %2, loads from table %3 at that (unscaled) index into the corresponding
; word lane of %1 (movd for lane 0, pinsrw for the rest). NOTE(review):
; for a byte-entry table, the upper byte of each loaded word is a stray
; neighbouring table byte — callers are expected to mask it off.
; If the optional 6th arg is given, %2 is preserved and %6 is clobbered
; instead; otherwise %2 itself is destroyed as scratch.
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
    movd %5 %+ d, %2
    pshuflw %%tmp, %2, q3232
%else
    movd %5 %+ d, %%tmp
%if %%idx == 2
    punpckhqdq %%tmp, %%tmp
%elif %%idx == 4
    psrlq %%tmp, 32
%endif
%endif
    movzx %4 %+ d, %5 %+ w
    shr %5 %+ d, 16

%if %%idx == 0
    movd %1, [%3+%4]
%else
    pinsrw %1, [%3+%4], %%idx + 0
%endif
    pinsrw %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

INIT_XMM ssse3
;
fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) 1151%if ARCH_X86_32 1152%if STACK_ALIGNMENT < mmsize 1153cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ 1154 dst, src, scaling, unused1, fg_data, picptr, unused2 1155 ; copy stack arguments to new position post-alignment, so that we 1156 ; don't have to keep the old stack location in a separate register 1157 mov r0, r0m 1158 mov r1, r2m 1159 mov r2, r4m 1160 mov r3, r6m 1161 mov r4, r7m 1162 mov r5, r8m 1163 1164 mov [rsp+6*mmsize+ 3*gprsize], r0 1165 mov [rsp+6*mmsize+ 5*gprsize], r1 1166 mov [rsp+6*mmsize+ 7*gprsize], r2 1167 mov [rsp+6*mmsize+ 9*gprsize], r3 1168 mov [rsp+6*mmsize+10*gprsize], r4 1169 mov [rsp+6*mmsize+11*gprsize], r5 1170%else 1171cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ 1172 dst, src, scaling, unused1, fg_data, picptr, unused2 1173%endif 1174 mov srcq, srcm 1175 mov fg_dataq, r3m 1176 mov scalingq, r5m 1177%if STACK_ALIGNMENT < mmsize 1178%define r0m [rsp+6*mmsize+ 3*gprsize] 1179%define r1m [rsp+6*mmsize+ 4*gprsize] 1180%define r2m [rsp+6*mmsize+ 5*gprsize] 1181%define r3m [rsp+6*mmsize+ 6*gprsize] 1182%define r4m [rsp+6*mmsize+ 7*gprsize] 1183%define r5m [rsp+6*mmsize+ 8*gprsize] 1184%define r6m [rsp+6*mmsize+ 9*gprsize] 1185%define r7m [rsp+6*mmsize+10*gprsize] 1186%define r8m [rsp+6*mmsize+11*gprsize] 1187%endif 1188 LEA r5, pb_mask 1189%define base r5-pb_mask 1190 mov r5m, picptrq 1191%else 1192cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut 1193 lea r7, [pb_mask] 1194%define base r7-pb_mask 1195%endif 1196 mov r6d, [fg_dataq+FGData.scaling_shift] 1197 movd m3, [base+mul_bits+r6*2-14] 1198 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 1199 pcmpeqw m2, m2 1200 psrldq m2, 14 1201 movd m4, [base+max+r6*4] 1202 movd m5, [base+min+r6*2] 1203 punpcklwd m3, m3 1204 punpcklwd m4, m4 1205 punpcklwd m5, m5 1206 pshufd m3, m3, q0000 1207 pshufd m4, m4, q0000 1208 pshufd m5, m5, q0000 1209 
SCRATCH 2, 10, 0 1210 SCRATCH 3, 11, 1 1211 SCRATCH 4, 12, 2 1212 SCRATCH 5, 13, 3 1213 1214%if ARCH_X86_32 1215 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1216%else 1217 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 1218%endif 1219 1220 mov sbyd, r8m 1221 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 1222 test overlapd, overlapd 1223 jz .no_vertical_overlap 1224 mova m6, [base+pw_1024] 1225 movd m7, [base+pb_27_17_17_27] 1226 SCRATCH 6, 14, 4 1227 SCRATCH 7, 15, 5 1228 test sbyd, sbyd 1229 jnz .vertical_overlap 1230 ; fall-through 1231 1232.no_vertical_overlap: 1233 mov r8m, overlapd 1234%if ARCH_X86_32 1235 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused 1236 imul seed, (173 << 24) | 37 1237%else 1238 imul seed, sbyd, (173 << 24) | 37 1239%endif 1240 add seed, (105 << 24) | 178 1241 rol seed, 8 1242 movzx seed, seew 1243 xor seed, [fg_dataq+FGData.seed] 1244 1245%if ARCH_X86_32 1246 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1247 1248 mov r3m, seed 1249 mov wq, r4m 1250%else 1251 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1252 unused1, unused2, see, unused3 1253%endif 1254 1255 lea src_bakq, [srcq+wq] 1256 neg wq 1257 sub dstmp, srcq 1258%if ARCH_X86_32 1259 mov r1m, src_bakq 1260 mov r4m, wq 1261 DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1262%endif 1263 1264.loop_x: 1265%if ARCH_X86_32 1266 mov seed, r3m 1267%endif 1268 mov r6d, seed 1269 or seed, 0xEFF4 1270 shr r6d, 1 1271 test seeb, seeh 1272 lea seed, [r6+0x8000] 1273 cmovp seed, r6d ; updated seed 1274%if ARCH_X86_32 1275 mov r3m, seed 1276 1277 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1278 1279 mov offxd, offyd 1280%else 1281 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1282 offx, offy, see, unused 1283 1284 mov offyd, seed 1285 mov offxd, seed 1286%endif 1287 ror offyd, 8 1288 shr offxd, 12 1289 and offyd, 0xf 1290 
imul offyd, 164 1291 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1292 1293%if ARCH_X86_32 1294 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1295 ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1296 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1297%else 1298 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1299 h, offxy, see, unused 1300%endif 1301 1302.loop_x_odd: 1303 mov hd, r7m 1304 mov grain_lutq, grain_lutmp 1305.loop_y: 1306 ; src 1307 mova m0, [srcq] 1308 pxor m2, m2 1309 punpckhbw m1, m0, m2 1310 punpcklbw m0, m2 ; m0-1: src as word 1311 1312 ; scaling[src] 1313%if ARCH_X86_32 1314 vpgatherdw m4, m0, scalingq, r0, r5, m3 1315 vpgatherdw m5, m1, scalingq, r0, r5, m3 1316%else 1317 vpgatherdw m4, m0, scalingq, r12, r13, m3 1318 vpgatherdw m5, m1, scalingq, r12, r13, m3 1319%endif 1320 pcmpeqw m3, m3 1321 psrlw m3, 8 1322 pand m4, m3 1323 pand m5, m3 1324 1325 ; grain = grain_lut[offy+y][offx+x] 1326 movu m3, [grain_lutq+offxyq] 1327 pcmpgtb m7, m2, m3 1328 punpcklbw m2, m3, m7 1329 punpckhbw m3, m7 1330 1331 ; noise = round2(scaling[src] * grain, scaling_shift) 1332 pmullw m2, m4 1333 pmullw m3, m5 1334 pmulhrsw m2, m11 1335 pmulhrsw m3, m11 1336 1337 ; dst = clip_pixel(src, noise) 1338 paddw m0, m2 1339 paddw m1, m3 1340 pmaxsw m0, m13 1341 pmaxsw m1, m13 1342 pminsw m0, m12 1343 pminsw m1, m12 1344 packuswb m0, m1 1345 movifnidn dstq, dstmp 1346 mova [dstq+srcq], m0 1347 1348 add srcq, r2mp 1349 add grain_lutq, 82 1350 dec hd 1351 jg .loop_y 1352 1353%if ARCH_X86_32 1354 add r4mp, 16 1355%else 1356 add wq, 16 1357%endif 1358 jge .end 1359%if ARCH_X86_32 1360 mov srcq, r1mp 1361 add srcq, r4mp 1362 xor r8mp, 4 1363 test r8mp, 4 1364%else 1365 lea srcq, [src_bakq+wq] 1366 test srcq, 16 ; this relies on buffer alignment... 
1367%endif 1368 jz .next_blk 1369 1370 add offxyd, 16 1371 test dword r8m, 2 ; r8m & 2 = have_top_overlap 1372 jz .loop_x_odd 1373 1374%if ARCH_X86_32 1375 add dword [rsp+6*mmsize+1*gprsize], 16 1376%else 1377 add r11d, 16 ; top_offxyd 1378%endif 1379 jnz .loop_x_odd_v_overlap 1380 1381.next_blk: 1382 test dword r8m, 1 1383 jz .loop_x 1384 1385 test dword r8m, 2 1386 jnz .loop_x_hv_overlap 1387 1388 ; horizontal overlap (without vertical overlap) 1389.loop_x_h_overlap: 1390%if ARCH_X86_32 1391 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, 1392 ; r6m=grain_lut, r7m=h, r8m=overlap_v|h 1393 DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 1394 1395 add offxyd, 16 ; left_offxyd 1396 mov [rsp+6*mmsize+0*gprsize], offxyd 1397 1398 DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 1399 1400 mov seed, r3m 1401%else 1402 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1403 offx, offy, see, left_offxy 1404 1405 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 1406%endif 1407 1408 mov r6d, seed 1409 or seed, 0xEFF4 1410 shr r6d, 1 1411 test seeb, seeh 1412 lea seed, [r6+0x8000] 1413 cmovp seed, r6d ; updated seed 1414 1415%if ARCH_X86_32 1416 mov r3m, seed 1417 1418 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1419 1420 mov offxd, offyd 1421%else 1422 mov offyd, seed 1423 mov offxd, seed 1424%endif 1425 ror offyd, 8 1426 shr offxd, 12 1427 and offyd, 0xf 1428 imul offyd, 164 1429 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx 1430 1431%if ARCH_X86_32 1432 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1433%else 1434 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1435 h, offxy, see, left_offxy 1436%endif 1437 1438 mov hd, r7m 1439 mov grain_lutq, grain_lutmp 1440.loop_y_h_overlap: 1441 ; src 1442 mova m0, [srcq] 1443 pxor m2, m2 1444 punpckhbw m1, m0, m2 1445 punpcklbw m0, m2 ; m0-1: src as word 1446 1447 ; scaling[src] 1448%if ARCH_X86_32 1449 vpgatherdw 
m4, m0, scalingq, r0, r5, m3 1450 vpgatherdw m5, m1, scalingq, r0, r5, m3 1451%else 1452 vpgatherdw m4, m0, scalingq, r12, r13, m3 1453 vpgatherdw m5, m1, scalingq, r12, r13, m3 1454%endif 1455 pcmpeqw m3, m3 1456 psrlw m3, 8 1457 pand m4, m3 1458 pand m5, m3 1459 1460 ; grain = grain_lut[offy+y][offx+x] 1461 movu m3, [grain_lutq+offxyq] 1462%if ARCH_X86_32 1463 mov r5, [rsp+6*mmsize+0*gprsize] 1464 movd m7, [grain_lutq+r5] 1465%else 1466 movd m7, [grain_lutq+left_offxyq] 1467%endif 1468 punpcklbw m7, m3 1469 pmaddubsw m6, m15, m7 1470 pmulhrsw m6, m14 1471 packsswb m6, m6 1472 pand m6, m10 1473 pandn m7, m10, m3 1474 por m6, m7 1475 pcmpgtb m2, m6 1476 punpcklbw m7, m6, m2 1477 punpckhbw m6, m2 1478 1479 ; noise = round2(scaling[src] * grain, scaling_shift) 1480 pmullw m7, m4 1481 pmullw m6, m5 1482 pmulhrsw m7, m11 1483 pmulhrsw m6, m11 1484 1485 ; dst = clip_pixel(src, noise) 1486 paddw m0, m7 1487 paddw m1, m6 1488 pmaxsw m0, m13 1489 pmaxsw m1, m13 1490 pminsw m0, m12 1491 pminsw m1, m12 1492 packuswb m0, m1 1493 movifnidn dstq, dstmp 1494 mova [dstq+srcq], m0 1495 1496 add srcq, r2mp 1497 add grain_lutq, 82 1498 dec hd 1499 jg .loop_y_h_overlap 1500 1501%if ARCH_X86_32 1502 add r4mp, 16 1503%else 1504 add wq, 16 1505%endif 1506 jge .end 1507%if ARCH_X86_32 1508 mov srcq, r1m 1509 add srcq, r4m 1510 xor r8mp, 4 1511%else 1512 lea srcq, [src_bakq+wq] 1513%endif 1514 ; assert(srcq & 16) != 0 1515 add offxyd, 16 1516 1517 ; since this half-block had left-overlap, the next does not 1518 test dword r8m, 2 ; have_top_overlap 1519 jz .loop_x_odd 1520%if ARCH_X86_32 1521 add dword [rsp+6*mmsize+1*gprsize], 16 1522%else 1523 add r11d, 16 ; top_offxyd 1524%endif 1525 jmp .loop_x_odd_v_overlap 1526 1527.end: 1528 RET 1529 1530.vertical_overlap: 1531%if ARCH_X86_32 1532 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 1533%else 1534 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 1535%endif 1536 1537 or overlapd, 2 ; 
top_overlap: overlap & 2 1538 mov r8m, overlapd 1539 movzx sbyd, sbyb 1540%if ARCH_X86_32 1541 imul r4, [fg_dataq+FGData.seed], 0x00010001 1542 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 1543%else 1544 imul seed, [fg_dataq+FGData.seed], 0x00010001 1545%endif 1546 imul tmpd, sbyd, 173 * 0x00010001 1547 imul sbyd, 37 * 0x01000100 1548 add tmpd, (105 << 16) | 188 1549 add sbyd, (178 << 24) | (141 << 8) 1550 and tmpd, 0x00ff00ff 1551 and sbyd, 0xff00ff00 1552 xor seed, tmpd 1553%if ARCH_X86_32 1554 xor sbyd, seed ; (cur_seed << 16) | top_seed 1555 1556 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak 1557 1558 mov r3m, seed 1559 mov wq, r4m 1560%else 1561 xor seed, sbyd ; (cur_seed << 16) | top_seed 1562 1563 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1564 tmp, unused2, see, unused3 1565%endif 1566 1567 lea src_bakq, [srcq+wq] 1568 neg wq 1569 sub dstmp, srcq 1570%if ARCH_X86_32 1571 mov r1m, src_bakq 1572 mov r4m, wq 1573 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 1574%endif 1575 1576.loop_x_v_overlap: 1577%if ARCH_X86_32 1578 mov seed, r3m 1579%endif 1580 ; we assume from the block above that bits 8-15 of tmpd are zero'ed, 1581 ; because of the 'and tmpd, 0x00ff00ff' above 1582 mov r6d, seed 1583 or seed, 0xeff4eff4 1584 test seeb, seeh 1585 setp tmpb ; parity of top_seed 1586 shr seed, 16 1587 shl tmpd, 16 1588 test seeb, seeh 1589 setp tmpb ; parity of cur_seed 1590 or r6d, 0x00010001 1591 xor tmpd, r6d 1592 mov seed, tmpd 1593 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1594 1595%if ARCH_X86_32 1596 mov r3m, seed 1597 1598 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1599 1600 mov offxd, offyd 1601%else 1602 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1603 offx, offy, see, unused, top_offxy 1604 1605 mov offyd, seed 1606 mov offxd, seed 1607%endif 1608 1609 ror offyd, 8 1610 ror offxd, 12 1611 and offyd, 0xf000f 1612 and offxd, 0xf000f 1613 imul offyd, 164 1614 ; 
offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1615 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1616 1617%if ARCH_X86_32 1618 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut 1619%else 1620 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1621 h, offxy, see, unused, top_offxy 1622%endif 1623 1624 movzx top_offxyd, offxyw 1625%if ARCH_X86_32 1626 mov [rsp+6*mmsize+1*gprsize], top_offxyd 1627 1628 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1629%endif 1630 shr offxyd, 16 1631 1632.loop_x_odd_v_overlap: 1633%if ARCH_X86_32 1634 mov r5, r5m 1635 lea r5, [base+pb_27_17] 1636 mov [rsp+5*mmsize+8], r5 1637%else 1638 mova m8, [pb_27_17] 1639%endif 1640 mov hd, r7m 1641 mov grain_lutq, grain_lutmp 1642.loop_y_v_overlap: 1643 ; src 1644 mova m0, [srcq] 1645 pxor m2, m2 1646 punpckhbw m1, m0, m2 1647 punpcklbw m0, m2 ; m0-1: src as word 1648 1649 ; scaling[src] 1650%if ARCH_X86_32 1651 vpgatherdw m4, m0, scalingq, r0, r5, m3 1652 vpgatherdw m5, m1, scalingq, r0, r5, m3 1653%else 1654 vpgatherdw m4, m0, scalingq, r12, r13, m3 1655 vpgatherdw m5, m1, scalingq, r12, r13, m3 1656%endif 1657 pcmpeqw m3, m3 1658 psrlw m3, 8 1659 pand m4, m3 1660 pand m5, m3 1661 1662 ; grain = grain_lut[offy+y][offx+x] 1663 movu m3, [grain_lutq+offxyq] 1664%if ARCH_X86_32 1665 mov r5, [rsp+6*mmsize+1*gprsize] 1666 movu m7, [grain_lutq+r5] 1667%else 1668 movu m7, [grain_lutq+top_offxyq] 1669%endif 1670 punpckhbw m6, m7, m3 1671 punpcklbw m7, m3 1672%if ARCH_X86_32 1673 mov r5, [rsp+5*mmsize+8] 1674 pmaddubsw m3, [r5], m6 1675 pmaddubsw m6, [r5], m7 1676%else 1677 pmaddubsw m3, m8, m6 1678 pmaddubsw m6, m8, m7 1679%endif 1680 pmulhrsw m3, m14 1681 pmulhrsw m6, m14 1682 packsswb m6, m3 1683 pcmpgtb m7, m2, m6 1684 punpcklbw m2, m6, m7 1685 punpckhbw m6, m7 1686 1687 ; noise = round2(scaling[src] * grain, scaling_shift) 1688 pmullw m2, m4 1689 pmullw m6, m5 1690 pmulhrsw m2, m11 1691 pmulhrsw m6, m11 1692 1693 ; dst = clip_pixel(src, noise) 1694 
    paddw            m0, m2
    paddw            m1, m6
    pmaxsw           m0, m13              ; clip to [min, max] (constants scratched earlier)
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0               ; dstmp holds dst-src, so this addresses dst[x]

%if ARCH_X86_32
    add dword [rsp+5*mmsize+8], mmsize    ; advance weight ptr: pb_27_17 -> pb_17_27
%else
    mova             m8, [pb_17_27]       ; second-row vertical overlap weights
%endif
    add            srcq, r2mp
    add      grain_lutq, 82               ; grain_lut rows are 82 bytes apart
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    xor              hd, 0x10000          ; bit 16 of hd is a 1-bit row counter:
    test             hd, 0x10000          ; set after row 0, cleared after row 1
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
    xor            r8mp, 4                ; bit 2 of r8m tracks the odd/even 16px column
    test           r8mp, 4
%else
    lea            srcq, [src_bakq+wq]
    test           srcq, 16               ; this relies on buffer alignment...
%endif
    jz .loop_x_hv_overlap
    add          offxyd, 16               ; odd 16px column: reuse offsets, shifted by 16
%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap

.loop_x_hv_overlap:
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov [rsp+5*mmsize+8], r5              ; vertical overlap weight ptr (row 0)

    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak

    mov              r5, [rsp+6*mmsize+1*gprsize]
    mov              r4, offxyd
    add              r5, 16
    add              r4, 16
    mov [rsp+6*mmsize+2*gprsize], r5      ; topleft_offxy
    mov [rsp+6*mmsize+0*gprsize], r4      ; left_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak

    xor            tmpd, tmpd
    mov            seed, r3m
%else
    mova             m8, [pb_27_17]       ; first-row vertical overlap weights

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; update the current and the top seed in one go (both packed in one register)
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                   ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                   ; parity of cur_seed
    or              r6d, 0x00010001
xor tmpd, r6d 1782 mov seed, tmpd 1783 ror seed, 1 ; updated (cur_seed << 16) | top_seed 1784 1785%if ARCH_X86_32 1786 mov r3m, seed 1787 1788 DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx 1789 1790 mov offxd, offyd 1791%else 1792 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1793 offx, offy, see, left_offxy, top_offxy, topleft_offxy 1794 1795 lea topleft_offxyq, [top_offxyq+16] 1796 lea left_offxyq, [offyq+16] 1797 mov offyd, seed 1798 mov offxd, seed 1799%endif 1800 ror offyd, 8 1801 ror offxd, 12 1802 and offyd, 0xf000f 1803 and offxd, 0xf000f 1804 imul offyd, 164 1805 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 1806 lea offyq, [offyq+offxq*2+0x10001*747+32*82] 1807 1808%if ARCH_X86_32 1809 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 1810 1811 movzx r5, offxyw ; top_offxy 1812 mov [rsp+6*mmsize+1*gprsize], r5 1813%else 1814 DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ 1815 h, offxy, see, left_offxy, top_offxy, topleft_offxy 1816 1817 movzx top_offxyd, offxyw 1818%endif 1819 shr offxyd, 16 1820 1821 mov hd, r7m 1822 mov grain_lutq, grain_lutmp 1823.loop_y_hv_overlap: 1824 ; grain = grain_lut[offy+y][offx+x] 1825 movu m3, [grain_lutq+offxyq] 1826%if ARCH_X86_32 1827 mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy 1828 mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy 1829 movu m6, [grain_lutq+r5] 1830 mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy 1831 movd m4, [grain_lutq+r0] 1832 movd m7, [grain_lutq+r5] 1833%else 1834 movu m6, [grain_lutq+top_offxyq] 1835 movd m4, [grain_lutq+left_offxyq] 1836 movd m7, [grain_lutq+topleft_offxyq] 1837%endif 1838 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) 1839 punpcklbw m4, m3 1840 punpcklbw m7, m6 1841 pmaddubsw m2, m15, m4 1842 pmaddubsw m4, m15, m7 1843 pmulhrsw m2, m14 1844 pmulhrsw m4, m14 1845 packsswb m2, m2 1846 packsswb m4, m4 1847 pand m2, m10 1848 pand m4, m10 1849 pandn m7, m10, m3 1850 pandn m3, m10, m6 1851 por 
m7, m2 1852 por m3, m4 1853 ; followed by v interpolation (top | cur -> cur) 1854 punpckhbw m4, m3, m7 1855 punpcklbw m3, m7 1856%if ARCH_X86_32 1857 mov r5, [rsp+5*mmsize+8] 1858 pmaddubsw m7, [r5], m4 1859 pmaddubsw m4, [r5], m3 1860%else 1861 pmaddubsw m7, m8, m4 1862 pmaddubsw m4, m8, m3 1863%endif 1864 pmulhrsw m7, m14 1865 pmulhrsw m4, m14 1866 packsswb m4, m7 1867 pxor m2, m2 1868 pcmpgtb m7, m2, m4 1869 punpcklbw m3, m4, m7 1870 punpckhbw m4, m7 1871 1872 ; src 1873 mova m0, [srcq] 1874 punpckhbw m1, m0, m2 1875 punpcklbw m0, m2 ; m0-1: src as word 1876 1877 ; scaling[src] 1878%if ARCH_X86_32 1879 vpgatherdw m5, m0, scalingq, r0, r5, m7 1880 vpgatherdw m6, m1, scalingq, r0, r5, m7 1881%else 1882 vpgatherdw m5, m0, scalingq, r13, r14, m7 1883 vpgatherdw m6, m1, scalingq, r13, r14, m7 1884%endif 1885 pcmpeqw m7, m7 1886 psrlw m7, 8 1887 pand m5, m7 1888 pand m6, m7 1889 1890 ; noise = round2(scaling[src] * grain, scaling_shift) 1891 pmullw m3, m5 1892 pmullw m4, m6 1893 pmulhrsw m3, m11 1894 pmulhrsw m4, m11 1895 1896 ; dst = clip_pixel(src, noise) 1897 paddw m0, m3 1898 paddw m1, m4 1899 pmaxsw m0, m13 1900 pmaxsw m1, m13 1901 pminsw m0, m12 1902 pminsw m1, m12 1903 packuswb m0, m1 1904 movifnidn dstq, dstmp 1905 mova [dstq+srcq], m0 1906 1907%if ARCH_X86_32 1908 add dword [rsp+5*mmsize+8], mmsize 1909%else 1910 mova m8, [pb_17_27] 1911%endif 1912 add srcq, r2mp 1913 add grain_lutq, 82 1914 dec hw 1915 jz .end_y_hv_overlap 1916 ; 2 lines get vertical overlap, then fall back to non-overlap code for 1917 ; remaining (up to) 30 lines 1918 xor hd, 0x10000 1919 test hd, 0x10000 1920 jnz .loop_y_hv_overlap 1921 jmp .loop_y_h_overlap 1922 1923.end_y_hv_overlap: 1924%if ARCH_X86_32 1925 add r4mp, 16 1926%else 1927 add wq, 16 1928%endif 1929 jge .end_hv 1930%if ARCH_X86_32 1931 mov srcq, r1m 1932 add srcq, r4m 1933 xor r8mp, 4 1934%else 1935 lea srcq, [src_bakq+wq] 1936%endif 1937 ; assert(srcq & 16) != 0 1938 add offxyd, 16 1939%if ARCH_X86_32 1940 add dword 
[rsp+6*mmsize+1*gprsize], 16 1941%else 1942 add top_offxyd, 16 1943%endif 1944 jmp .loop_x_odd_v_overlap 1945 1946.end_hv: 1947 RET 1948 1949INIT_XMM ssse3 1950%if ARCH_X86_32 1951; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, 1952; sby, luma, lstride, uv_pl, is_id) 1953%if STACK_ALIGNMENT < mmsize 1954DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 1955cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ 1956 tmp, src, scaling, h, fg_data, picptr, unused 1957 mov r0, r0m 1958 mov r1, r2m 1959 mov r2, r4m 1960 mov r3, r6m 1961 mov r4, r7m 1962 mov [rsp+8*mmsize+3*gprsize], r0 1963 mov [rsp+8*mmsize+5*gprsize], r1 1964 mov [rsp+8*mmsize+7*gprsize], r2 1965 mov [rsp+8*mmsize+9*gprsize], r3 1966 mov [rsp+8*mmsize+10*gprsize], r4 1967 1968 mov r0, r8m 1969 mov r1, r9m 1970 mov r2, r10m 1971 mov r4, r11m 1972 mov r3, r12m 1973 mov [rsp+8*mmsize+11*gprsize], r0 1974 mov [rsp+8*mmsize+12*gprsize], r1 1975 mov [rsp+8*mmsize+13*gprsize], r2 1976 mov [rsp+8*mmsize+14*gprsize], r4 1977%else 1978cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ 1979 tmp, src, scaling, h, fg_data, picptr, unused 1980%endif 1981 mov srcq, srcm 1982 mov fg_dataq, r3m 1983 mov scalingq, r5m 1984%if STACK_ALIGNMENT < mmsize 1985%define r0m [rsp+8*mmsize+ 3*gprsize] 1986%define r1m [rsp+8*mmsize+ 4*gprsize] 1987%define r2m [rsp+8*mmsize+ 5*gprsize] 1988%define r3m [rsp+8*mmsize+ 6*gprsize] 1989%define r4m [rsp+8*mmsize+ 7*gprsize] 1990%define r5m [rsp+8*mmsize+ 8*gprsize] 1991%define r6m [rsp+8*mmsize+ 9*gprsize] 1992%define r7m [rsp+8*mmsize+10*gprsize] 1993%define r8m [rsp+8*mmsize+11*gprsize] 1994%define r9m [rsp+8*mmsize+12*gprsize] 1995%define r10m [rsp+8*mmsize+13*gprsize] 1996%define r11m [rsp+8*mmsize+14*gprsize] 1997%define r12m [rsp+8*mmsize+15*gprsize] 1998%endif 1999 LEA r5, pb_mask 2000%define base r5-pb_mask 2001 mov r5m, r5 2002%else 2003cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ 2004 
grain_lut, tmp, sby, luma, lstride, uv_pl, is_id 2005 lea r8, [pb_mask] 2006%define base r8-pb_mask 2007%endif 2008 mov r6d, [fg_dataq+FGData.scaling_shift] 2009 movd m2, [base+byte_blend+3] 2010 movd m3, [base+mul_bits+r6*2-14] 2011 mov r6d, [fg_dataq+FGData.clip_to_restricted_range] 2012 lea tmpd, [r6d*2] 2013%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize 2014 test r3, r3 2015%else 2016 cmp dword r12m, 0 ; is_idm 2017%endif 2018 movd m5, [base+min+r6*2] 2019 cmovne r6d, tmpd 2020 movd m4, [base+max+r6*2] 2021 punpcklwd m3, m3 2022 punpcklwd m5, m5 2023 punpcklwd m4, m4 2024 pshufd m3, m3, q0000 2025 pshufd m5, m5, q0000 2026 pshufd m4, m4, q0000 2027 SCRATCH 2, 10, 0 2028 SCRATCH 3, 11, 1 2029 SCRATCH 4, 12, 2 2030 SCRATCH 5, 13, 3 2031 2032 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 2033 jne .csfl 2034 2035%macro FGUV_32x32xN_LOOP 1 ; not-csfl 2036%if ARCH_X86_32 2037 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2038%else 2039 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 2040%endif 2041 2042%if %1 2043 mov r6d, dword r11m 2044 movd m0, [fg_dataq+FGData.uv_mult+r6*4] 2045 movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] 2046 punpcklbw m6, m1, m0 2047 movd m7, [fg_dataq+FGData.uv_offset+r6*4] 2048 punpcklwd m6, m6 2049 punpcklwd m7, m7 2050 pshufd m6, m6, q0000 2051 pshufd m7, m7, q0000 2052 SCRATCH 6, 14, 4 2053 SCRATCH 7, 15, 5 2054%endif 2055 2056 mov sbyd, r8m 2057 mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 2058 test overlapd, overlapd 2059 jz %%no_vertical_overlap 2060%if ARCH_X86_32 2061 movd m1, [base+pb_23_22] 2062 mova m0, [base+pw_1024] 2063%else 2064 movd m1, [pb_23_22] 2065 mova m0, [pw_1024] 2066%endif 2067 pshufd m1, m1, q0000 2068 SCRATCH 0, 8, 6 2069 SCRATCH 1, 9, 7 2070 test sbyd, sbyd 2071 jnz %%vertical_overlap 2072 ; fall-through 2073 2074%%no_vertical_overlap: 2075 mov r8m, overlapd 2076%if ARCH_X86_32 2077 DEFINE_ARGS dst, src, scaling, see, fg_data, 
picptr, overlap 2078 imul seed, (173 << 24) | 37 2079%else 2080 imul seed, sbyd, (173 << 24) | 37 2081%endif 2082 add seed, (105 << 24) | 178 2083 rol seed, 8 2084 movzx seed, seew 2085 xor seed, [fg_dataq+FGData.seed] 2086 2087%if ARCH_X86_32 2088 mov r3m, seed 2089 2090 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2091%define luma_bakq lumaq 2092 2093 mov wq, r4m 2094 shl r10mp, 1 2095%else 2096 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2097 unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak 2098 2099 mov lstrideq, r10mp 2100%endif 2101 2102 mov lumaq, r9mp 2103 lea src_bakq, [srcq+wq] 2104 lea luma_bakq, [lumaq+wq*2] 2105 neg wq 2106 sub r0mp, srcq 2107%if ARCH_X86_32 2108 mov r1m, src_bakq 2109 mov r11m, luma_bakq 2110 mov r4m, wq 2111 2112 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2113%else 2114 mov r11mp, src_bakq 2115 mov r10mp, strideq 2116%endif 2117 2118%%loop_x: 2119%if ARCH_X86_32 2120 mov seed, r3m 2121%endif 2122 mov r6d, seed 2123 or seed, 0xEFF4 2124 shr r6d, 1 2125 test seeb, seeh 2126 lea seed, [r6+0x8000] 2127 cmovp seed, r6d ; updated seed 2128%if ARCH_X86_32 2129 mov r3m, seed 2130 2131 DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx 2132 2133 mov offxd, offyd 2134%else 2135 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2136 offx, offy, see, overlap, unused1, unused2, lstride 2137 2138 mov offyd, seed 2139 mov offxd, seed 2140%endif 2141 ror offyd, 8 2142 shr offxd, 12 2143 and offyd, 0xf 2144 imul offyd, 82 2145 lea offyq, [offyq+offxq+498] ; offy*stride+offx 2146 2147%if ARCH_X86_32 2148 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2149%else 2150 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2151 h, offxy, see, overlap, unused1, unused2, lstride, luma_bak 2152%endif 2153 2154 mov hd, r7m 2155 mov grain_lutq, grain_lutmp 2156%%loop_y: 2157 ; src 2158%if ARCH_X86_32 2159 mov lumaq, r9mp 2160%endif 2161 mova m4, [lumaq+ 0] 2162 
    mova             m6, [lumaq+16]       ; second 16 luma pixels (m4 = first 16, loaded above)
    mova             m0, [srcq]           ; 16 chroma source pixels
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq            ; luma pointer lives in memory on x86-32
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    ; 2x1 horizontal downsample of luma: pmaddubsw with {1,1} sums byte pairs
    ; into words, pavgw against zero rounds the sum to (a+b+1)>>1
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2

%if %1
    ; not-csfl path: scaling index = clip((luma*luma_mult + chroma*uv_mult) >> 6
    ; + uv_offset), with m14 = {uv_luma_mult, uv_mult} bytes and m15 = uv_offset
    packuswb         m4, m6               ; luma
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0               ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6               ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq, r0, r5
    vpgatherdw       m5, m6, scalingq, r0, r5
%else
    vpgatherdw       m7, m4, scalingq, r12, r2
    vpgatherdw       m5, m6, scalingq, r12, r2
%endif
    pcmpeqw          m1, m1
    psrlw            m1, 8                ; the gather writes words; keep low byte only
    pand             m7, m1
    pand             m5, m1

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2               ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq+ 0]
    pcmpgtb          m6, m2, m3           ; sign-extend the int8 grain to words
    punpcklbw        m2, m3, m6
    punpckhbw        m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

%if ARCH_X86_32
    add            srcq, r2mp
    ; we already incremented lumaq above
%else
    add            srcq, r10mp
    lea           lumaq, [lumaq+lstrideq*2]
%endif
    add      grain_lutq, 82               ; grain_lut rows are 82 bytes apart
    dec              hw
    jg %%loop_y

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2252 mov wq, r4m 2253%endif 2254 add wq, 16 2255 jge %%end 2256%if ARCH_X86_32 2257 mov srcq, r1mp 2258 mov lumaq, r11mp 2259%else 2260 mov srcq, r11mp 2261%endif 2262 lea lumaq, [luma_bakq+wq*2] 2263 add srcq, wq 2264%if ARCH_X86_32 2265 mov r4m, wq 2266 mov r9m, lumaq 2267%endif 2268 test dword r8m, 1 2269 jz %%loop_x 2270 2271 ; r8m = sbym 2272 test dword r8m, 2 2273 jne %%loop_x_hv_overlap 2274 2275 ; horizontal overlap (without vertical overlap) 2276%%loop_x_h_overlap: 2277%if ARCH_X86_32 2278 lea r6, [offxyd+16] 2279 mov [rsp+8*mmsize+0*gprsize], r6 2280 2281 DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut 2282 2283 mov seed, r3m 2284%else 2285 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2286 offx, offy, see, left_offxy, unused1, unused2, lstride 2287 2288 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx 2289%endif 2290 mov r6d, seed 2291 or seed, 0xEFF4 2292 shr r6d, 1 2293 test seeb, seeh 2294 lea seed, [r6+0x8000] 2295 cmovp seed, r6d ; updated seed 2296 2297%if ARCH_X86_32 2298 mov r3m, seed 2299 2300 DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx 2301 2302 mov offxd, offyd 2303%else 2304 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2305 offx, offy, see, left_offxy, unused1, unused2, lstride 2306 2307 mov offyd, seed 2308 mov offxd, seed 2309%endif 2310 ror offyd, 8 2311 shr offxd, 12 2312 and offyd, 0xf 2313 imul offyd, 82 2314 lea offyq, [offyq+offxq+498] ; offy*stride+offx 2315 2316%if ARCH_X86_32 2317 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2318%else 2319 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2320 h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak 2321%endif 2322 2323 mov hd, r7m 2324 mov grain_lutq, grain_lutmp 2325%%loop_y_h_overlap: 2326 ; src 2327%if ARCH_X86_32 2328 mov lumaq, r9mp 2329%endif 2330 mova m4, [lumaq+ 0] 2331 mova m6, [lumaq+16] 2332 mova m0, [srcq] 2333%if ARCH_X86_32 2334 add lumaq, r10mp 2335 mov r9mp, lumaq 
2336 mov r5, r5m 2337 movd m7, [base+pb_1] 2338%else 2339 movd m7, [pb_1] 2340%endif 2341 pshufd m7, m7, q0000 2342 pxor m2, m2 2343 pmaddubsw m4, m7 2344 pmaddubsw m6, m7 2345 pavgw m4, m2 2346 pavgw m6, m2 2347 2348%if %1 2349 packuswb m4, m6 ; luma 2350 punpckhbw m6, m4, m0 2351 punpcklbw m4, m0 ; { luma, chroma } 2352 pmaddubsw m6, m14 2353 pmaddubsw m4, m14 2354 psraw m6, 6 2355 psraw m4, 6 2356 paddw m6, m15 2357 paddw m4, m15 2358 packuswb m4, m6 ; pack+unpack = clip 2359 punpckhbw m6, m4, m2 2360 punpcklbw m4, m2 2361%endif 2362 2363 ; scaling[luma_src] 2364%if ARCH_X86_32 2365 vpgatherdw m7, m4, scalingq, r0, r5 2366 vpgatherdw m5, m6, scalingq, r0, r5 2367%else 2368 vpgatherdw m7, m4, scalingq, r12, r2 2369 vpgatherdw m5, m6, scalingq, r12, r2 2370%endif 2371 pcmpeqw m1, m1 2372 psrlw m1, 8 2373 pand m7, m1 2374 pand m5, m1 2375 2376 ; unpack chroma_source 2377 punpckhbw m1, m0, m2 2378 punpcklbw m0, m2 ; m0-1: src as word 2379 2380 ; grain = grain_lut[offy+y][offx+x] 2381 movu m3, [grain_lutq+offxyq+ 0] 2382%if ARCH_X86_32 2383 mov r0, [rsp+8*mmsize+0*gprsize] 2384 movd m4, [grain_lutq+r0+ 0] 2385%else 2386 movd m4, [grain_lutq+left_offxyq+ 0] 2387%endif 2388 punpcklbw m2, m4, m3 2389 pmaddubsw m4, m9, m2 2390 pmulhrsw m4, m8 2391 packsswb m4, m4 2392 pand m4, m10 2393 pandn m2, m10, m3 2394 por m3, m4, m2 2395 pxor m4, m4 2396 pcmpgtb m4, m3 2397 punpcklbw m2, m3, m4 2398 punpckhbw m3, m4 2399 2400 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2401 pmullw m2, m7 2402 pmullw m3, m5 2403 pmulhrsw m2, m11 2404 pmulhrsw m3, m11 2405 2406%if ARCH_X86_32 2407 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2408%endif 2409 2410 ; dst = clip_pixel(src, noise) 2411 paddw m0, m2 2412 paddw m1, m3 2413 pmaxsw m0, m13 2414 pmaxsw m1, m13 2415 pminsw m0, m12 2416 pminsw m1, m12 2417 packuswb m0, m1 2418 movifnidn dstq, dstmp 2419 mova [dstq+srcq], m0 2420 2421%if ARCH_X86_32 2422 add srcq, r2mp 2423 ; lumaq has already been incremented above 
2424%else 2425 add srcq, r10mp 2426 lea lumaq, [lumaq+lstrideq*2] 2427%endif 2428 add grain_lutq, 82 2429 dec hw 2430 jg %%loop_y_h_overlap 2431 2432%if ARCH_X86_32 2433 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut 2434 2435 mov wq, r4m 2436%endif 2437 add wq, 16 2438 jge %%end 2439%if ARCH_X86_32 2440 mov srcq, r1mp 2441 mov lumaq, r11mp 2442%else 2443 mov srcq, r11mp 2444%endif 2445 lea lumaq, [luma_bakq+wq*2] 2446 add srcq, wq 2447%if ARCH_X86_32 2448 mov r4m, wq 2449 mov r9m, lumaq 2450%endif 2451 2452 ; r8m = sbym 2453 test dword r8m, 2 2454 jne %%loop_x_hv_overlap 2455 jmp %%loop_x_h_overlap 2456 2457%%end: 2458 RET 2459 2460%%vertical_overlap: 2461%if ARCH_X86_32 2462 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap 2463%else 2464 DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap 2465%endif 2466 2467 or overlapd, 2 ; top_overlap: overlap & 2 2468 mov r8m, overlapd 2469 movzx sbyd, sbyb 2470%if ARCH_X86_32 2471 imul r4, [fg_dataq+FGData.seed], 0x00010001 2472 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused 2473%else 2474 imul seed, [fg_dataq+FGData.seed], 0x00010001 2475%endif 2476 imul tmpd, sbyd, 173 * 0x00010001 2477 imul sbyd, 37 * 0x01000100 2478 add tmpd, (105 << 16) | 188 2479 add sbyd, (178 << 24) | (141 << 8) 2480 and tmpd, 0x00ff00ff 2481 and sbyd, 0xff00ff00 2482 xor seed, tmpd 2483%if ARCH_X86_32 2484 xor sbyd, seed ; (cur_seed << 16) | top_seed 2485 2486 DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak 2487 2488 mov r3m, seed 2489 mov wq, r4m 2490 shl r10mp, 1 2491%else 2492 xor seed, sbyd ; (cur_seed << 16) | top_seed 2493 2494 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2495 tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak 2496 2497 mov lstrideq, r10mp 2498%endif 2499 2500 mov lumaq, r9mp 2501 lea src_bakq, [srcq+wq] 2502 lea luma_bakq, [lumaq+wq*2] 2503 neg wq 2504 sub r0mp, srcq 2505%if ARCH_X86_32 2506 mov r1m, src_bakq 2507 mov r11m, 
luma_bakq 2508 mov r4m, wq 2509 2510 DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 2511%else 2512 mov r11mp, src_bakq 2513 mov r10mp, strideq 2514%endif 2515 2516%%loop_x_v_overlap: 2517%if ARCH_X86_32 2518 mov seed, r3m 2519 xor tmpd, tmpd 2520%endif 2521 ; we assume from the block above that bits 8-15 of tmpd are zero'ed 2522 mov r6d, seed 2523 or seed, 0xeff4eff4 2524 test seeb, seeh 2525 setp tmpb ; parity of top_seed 2526 shr seed, 16 2527 shl tmpd, 16 2528 test seeb, seeh 2529 setp tmpb ; parity of cur_seed 2530 or r6d, 0x00010001 2531 xor tmpd, r6d 2532 mov seed, tmpd 2533 ror seed, 1 ; updated (cur_seed << 16) | top_seed 2534 2535%if ARCH_X86_32 2536 mov r3m, seed 2537 2538 DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx 2539 2540 mov offxd, offyd 2541%else 2542 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2543 offx, offy, see, overlap, top_offxy, unused, lstride 2544 2545 mov offxd, seed 2546 mov offyd, seed 2547%endif 2548 ror offyd, 8 2549 ror offxd, 12 2550 and offyd, 0xf000f 2551 and offxd, 0xf000f 2552 imul offyd, 82 2553 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy 2554 lea offyq, [offyq+offxq+0x10001*498+16*82] 2555 2556%if ARCH_X86_32 2557 DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy 2558%else 2559 DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ 2560 h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak 2561%endif 2562 2563 movzx top_offxyd, offxyw 2564 shr offxyd, 16 2565%if ARCH_X86_32 2566 mov [rsp+8*mmsize+1*gprsize], top_offxyd 2567 2568 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut 2569%endif 2570 2571 mov hd, r7m 2572 mov grain_lutq, grain_lutmp 2573%%loop_y_v_overlap: 2574%if ARCH_X86_32 2575 mov lumaq, r9mp 2576%endif 2577 mova m4, [lumaq+ 0] 2578 mova m6, [lumaq+16] 2579 mova m0, [srcq] 2580%if ARCH_X86_32 2581 add lumaq, r10mp 2582 mov r9mp, lumaq 2583 mov r5, r5m 2584 movd m7, [base+pb_1] 2585%else 2586 movd m7, [pb_1] 2587%endif 2588 pshufd 
m7, m7, q0000 2589 pxor m2, m2 2590 pmaddubsw m4, m7 2591 pmaddubsw m6, m7 2592 pavgw m4, m2 2593 pavgw m6, m2 2594 2595%if %1 2596 packuswb m4, m6 ; luma 2597 punpckhbw m6, m4, m0 2598 punpcklbw m4, m0 ; { luma, chroma } 2599 pmaddubsw m6, m14 2600 pmaddubsw m4, m14 2601 psraw m6, 6 2602 psraw m4, 6 2603 paddw m6, m15 2604 paddw m4, m15 2605 packuswb m4, m6 ; pack+unpack = clip 2606 punpckhbw m6, m4, m2 2607 punpcklbw m4, m2 2608%endif 2609 2610 ; scaling[luma_src] 2611%if ARCH_X86_32 2612 vpgatherdw m7, m4, scalingq, r0, r5 2613 vpgatherdw m5, m6, scalingq, r0, r5 2614%else 2615 vpgatherdw m7, m4, scalingq, r12, r2 2616 vpgatherdw m5, m6, scalingq, r12, r2 2617%endif 2618 pcmpeqw m1, m1 2619 psrlw m1, 8 2620 pand m7, m1 2621 pand m5, m1 2622 2623 ; grain = grain_lut[offy+y][offx+x] 2624 movu m3, [grain_lutq+offxyq] 2625%if ARCH_X86_32 2626 mov r0, [rsp+8*mmsize+1*gprsize] 2627 movu m4, [grain_lutq+r0] 2628%else 2629 movu m4, [grain_lutq+top_offxyq] 2630%endif 2631 punpckhbw m1, m4, m3 2632 punpcklbw m4, m3 2633 pmaddubsw m2, m9, m1 2634 pmaddubsw m3, m9, m4 2635 pmulhrsw m2, m8 2636 pmulhrsw m3, m8 2637 packsswb m3, m2 2638 pxor m1, m1 2639 pcmpgtb m1, m3 2640 punpcklbw m2, m3, m1 2641 punpckhbw m3, m1 2642 2643 ; noise = round2(scaling[luma_src] * grain, scaling_shift) 2644 pmullw m2, m7 2645 pmullw m3, m5 2646 pmulhrsw m2, m11 2647 pmulhrsw m3, m11 2648 2649 ; unpack chroma_source 2650 pxor m4, m4 2651 punpckhbw m1, m0, m4 2652 punpcklbw m0, m4 ; m0-1: src as word 2653 2654%if ARCH_X86_32 2655 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut 2656%endif 2657 2658 ; dst = clip_pixel(src, noise) 2659 paddw m0, m2 2660 paddw m1, m3 2661 pmaxsw m0, m13 2662 pmaxsw m1, m13 2663 pminsw m0, m12 2664 pminsw m1, m12 2665 packuswb m0, m1 2666 movifnidn dstq, dstmp 2667 mova [dstq+srcq], m0 2668 2669 dec hw 2670 je %%end_y_v_overlap 2671%if ARCH_X86_32 2672 add srcq, r2mp 2673 ; lumaq has already been incremented above 2674%else 2675 add srcq, r10mp 2676 lea 
lumaq, [lumaq+lstrideq*2]
%endif
    add              grain_lutq, 82       ; grain_lut rows have a stride of 82 bytes
    jmp %%loop_y

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    ; advance to the next 16-pixel column; wq counts up toward 0
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov              srcq, r1mp
    mov              lumaq, r11mp
%else
    mov              srcq, r11mp
%endif
    ; rebase src/luma pointers for the new column (luma advances at 2x
    ; the chroma step, matching the 4:2:0 horizontal subsampling)
    lea              lumaq, [luma_bakq+wq*2]
    add              srcq, wq
%if ARCH_X86_32
    mov              r4m, wq
    mov              r9m, lumaq
%endif

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

%%loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

    ; on x86-32 there are not enough registers, so the overlap offsets
    ; live in stack slots above the 8 saved xmm registers
    mov              r6, [rsp+8*mmsize+1*gprsize]
    lea              r0, [r3d+16]
    add              r6, 16
    mov              [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
    mov              [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

    mov              seed, r3m
    xor              tmpd, tmpd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    ; left/topleft neighbors are simply 16 grain columns to the right
    ; of the current/top offsets
    lea              topleft_offxyq, [top_offxyq+16]
    lea              left_offxyq, [offxyq+16]

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; step the grain LFSR once for each 16-bit half of `seed`
    ; ((cur_seed << 16) | top_seed), using the parity of the masked
    ; seed bits as the feedback bit
    mov              r6d, seed
    or               seed, 0xeff4eff4
    test             seeb, seeh
    setp             tmpb                 ; parity of top_seed
    shr              seed, 16
    shl              tmpd, 16
    test             seeb, seeh
    setp             tmpb                 ; parity of cur_seed
    or               r6d, 0x00010001
    xor              tmpd, r6d
    mov              seed, tmpd
    ror              seed, 1              ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov              r3m, seed

    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx

    mov              offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov              offxd, seed
    mov              offyd, seed
%endif
    ; extract 4-bit grain x/y offsets for both the current and the top
    ; block in parallel (one in each 16-bit half, hence the 0xf000f masks)
    ror              offyd, 8
    ror              offxd, 12
    and              offyd, 0xf000f
    and              offxd, 0xf000f
    imul             offyd, 82
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea              offyq, [offyq+offxq+0x10001*498+16*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif

    ; split the packed pair into separate cur/top offsets
    movzx            top_offxyd, offxyw
    shr              offxyd, 16
%if ARCH_X86_32
    mov              [rsp+8*mmsize+1*gprsize], top_offxyd
%endif

    mov              hd, r7m
    mov              grain_lutq, grain_lutmp
%%loop_y_hv_overlap:
    ; src
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              lumaq, r9mp
%endif
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add              lumaq, r10mp
    mov              r9mp, lumaq
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    ; pmaddubsw against pb_1 sums adjacent byte pairs; pavgw against 0
    ; is (x+1)>>1, so m4/m6 become the rounded average of each
    ; horizontal luma pair (2x1 downsample for 4:2:0)
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2

%if %1
    ; %1 != 0: blend luma into chroma with the multiplier in m14 and the
    ; offset in m15 (presumably uv_luma_mult/uv_mult and uv_offset, set
    ; up outside this view -- TODO confirm)
    packuswb         m4, m6               ; luma
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0               ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6               ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq, r0, r5
    vpgatherdw       m5, m6, scalingq, r0, r5
%else
    movd             m1, [grain_lutq+topleft_offxyq]
    vpgatherdw       m7, m4, scalingq, r2, r12
    vpgatherdw       m5, m6, scalingq, r2, r12
%endif
    ; the emulated gather loads words; mask to keep only the low byte
    ; of each scaling[] entry (m2 = 0x00ff per word)
    pcmpeqw          m2, m2
    psrlw            m2, 8
    pand             m7, m2
    pand             m5, m2

    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov              r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    mov              r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
    movd             m1, [grain_lutq+r0]
    mov              r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    movu             m6, [grain_lutq+r5]
    movd             m4, [grain_lutq+r0]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    ; m9 holds the overlap filter coefficient pairs (pmaddubsw), m8 the
    ; pmulhrsw rounding factor, m10 a byte-select mask -- all loaded
    ; before this section (not visible here)
    punpcklbw        m1, m6
    punpcklbw        m4, m3
    punpcklwd        m4, m1
    pmaddubsw        m1, m9, m4
    pmulhrsw         m1, m8
    packsswb         m1, m1
    ; merge the blended edge bytes (m1) into the untouched remainder of
    ; the cur (m3) and top (m6) rows under mask m10
    pandn            m4, m10, m3
    pandn            m3, m10, m6
    psrldq           m6, m1, 1
    pand             m1, m10
    pand             m6, m10
    por              m4, m1
    por              m3, m6
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m1, m3, m4
    punpcklbw        m3, m4
    pmaddubsw        m4, m9, m1
    pmaddubsw        m1, m9, m3
    pmulhrsw         m4, m8
    pmulhrsw         m1, m8
    packsswb         m1, m4
    ; sign-extend the packed grain bytes to words (m4 = sign mask)
    pxor             m4, m4
    pcmpgtb          m4, m1
    punpcklbw        m2, m1, m4
    punpckhbw        m1, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    ; m11 is the rounding multiplier derived from scaling_shift
    pmullw           m2, m7
    pmullw           m1, m5
    pmulhrsw         m2, m11
    pmulhrsw         m1, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; unpack chroma source
    pxor             m4, m4
    punpckhbw        m3, m0, m4
    punpcklbw        m0, m4               ; m0-1: src as word

    ; dst = clip_pixel(src, noise)
    ; m13/m12 hold the min/max pixel values (depend on
    ; clip_to_restricted_range; set up outside this view)
    paddw            m0, m2
    paddw            m3, m1
    pmaxsw           m0, m13
    pmaxsw           m3, m13
    pminsw           m0, m12
    pminsw           m3, m12
    packuswb         m0, m3
    ; NOTE(review): dstq appears to hold (dst - src), so dstq+srcq
    ; addresses the output row -- set up before this section; confirm
    movifnidn        dstq, dstmp
    mova             [dstq+srcq], m0

%if ARCH_X86_32
    add              srcq, r2mp
    ; lumaq has been adjusted above already
%else
    add              srcq, r10mp
    lea              lumaq, [lumaq+lstrideq*2]
%endif
    add              grain_lutq, 82
    ; only the first row of the block needs the vertical blend; the
    ; remaining rows run through the h-overlap-only row loop above
    dec              hw
    jg %%loop_y_h_overlap

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov              srcq, r1mp
    mov              lumaq, r11mp
%else
    mov              srcq, r11mp
%endif
    lea              lumaq, [luma_bakq+wq*2]
    add              srcq, wq
%if ARCH_X86_32
    mov              r4m, wq
    mov              r9m, lumaq
%endif
    jmp %%loop_x_hv_overlap

%%end_hv:
    RET
%endmacro

    ; instantiate the chroma loop twice: %1=1 applies the uv_mult/
    ; uv_offset luma blend; the .csfl entry (%1=0) takes the chroma
    ; scaling directly from luma (chroma_scaling_from_luma)
    FGUV_32x32xN_LOOP 1
.csfl:
    FGUV_32x32xN_LOOP 0