1; Copyright © 2019-2022, VideoLAN and dav1d authors
2; Copyright © 2019-2022, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30%if ARCH_X86_64
31
SECTION_RODATA 32
; pshufb LUT used by the PRNG to set bit 15 of the next 4 seeds
pb_mask:       db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
; word-pair shuffles used by the AR filters to build sliding tap windows
gen_shufE:     db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
gen_shufA:     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
gen_shufB:     db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
gen_shufC:     db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
gen_shufD:     db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
; note: the order of (some of) the following constants matter
pb_27_17:      times 2 db 27, 17
byte_blend:            db  0,  0,  0, -1
pb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
pb_17_27:      times 2 db 17, 27
pb_1:          times 4 db 1
pb_23_22:              db 23, 22,  0, 32,  0, 32,  0, 32
next_upperbit_mask:    dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor:   times 2 dw 0xb524
               times 2 dw 0x49d8
; fg_min/fg_max entries are 4 bytes apart; indexed by
; clip_to_restricted_range (0 = full range, 1 = restricted)
fg_min:        times 4 db 0
               times 4 db 16
fg_max:        times 4 db 255
               times 4 db 240   ; restricted range, luma
               times 4 db 235   ; restricted range, chroma (index 2) — TODO confirm against fguv caller
pd_m65536:             dd -65536
pw_8:          times 2 dw 8
pw_1024:       times 2 dw 1024
; round deliberately runs into mul_bits (index 3 reads 256) — see note above
hmul_bits:             dw 32768, 16384,  8192,  4096
round:                 dw  2048,  1024,   512
mul_bits:              dw   256,   128,    64,    32,    16
round_vals:            dw    32,    64,   128,   256,   512
pw_1:                  dw 1
62
; Emit a table of 32-bit offsets, relative to the table label itself, to the
; .ar<n> entry points of the named function — one entry per trailing argument.
; %1 = function base name, %2 = isa suffix, %3.. = ar_coeff_lag values
%macro JMP_TABLE 2-*
    %1_8bpc_%2_table:
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base   ; offset of .ar<n> from table start
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y,      avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
77
78SECTION .text
79
INIT_YMM avx2
; generate_grain_y_8bpc(buf, fg_data)
; Fills the 73x82 luma grain buffer (r7 walks -73*82..0, 8 bytes per
; iteration) with gaussian_sequence samples selected by an LFSR seeded from
; fg_data->seed, scaled by [round+grain_scale_shift*2], then tail-jumps to
; the .ar<lag> filter chosen via the jump table.
cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
%define base r4-generate_grain_y_8bpc_avx2_table
    lea              r4, [generate_grain_y_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mov              r7, -73*82
    mova            xm6, [base+pb_mask]
    sub            bufq, r7                ; bufq += 73*82; writes at [bufq+r7]
    vpbroadcastw    xm7, [base+round+r6*2]
    lea              r6, [gaussian_sequence]
    movsxd           r5, [r4+r5*4]         ; relative offset of .ar<lag>
.loop:
    ; two LFSR advances per iteration: 4 seeds each -> 8 output samples
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    por             xm3, xm2            ; 4 next output seeds
    pshuflw         xm0, xm3, q3333
    psrlw           xm3, 5
    ; second LFSR advance interleaved with the table lookups of the first
    pand            xm2, xm0, xm1
    movq             r2, xm3
    psrlw           xm3, xm2, 10
    por             xm2, xm3
    pmullw          xm2, xm4
    pmulhuw         xm0, xm5
    movzx           r3d, r2w
    pshufb          xm3, xm6, xm2
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm0, xm2
    movd            xm2, [r6+r3*2]      ; gaussian_sequence[seed] lookups
    rorx             r3, r2, 32
    por             xm3, xm0
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 1
    pshuflw         xm0, xm3, q3333
    movzx           r2d, r3w
    psrlw           xm3, 5
    pinsrw          xm2, [r6+r2*2], 2
    shr             r3d, 16
    movq             r2, xm3
    pinsrw          xm2, [r6+r3*2], 3
    movzx           r3d, r2w
    pinsrw          xm2, [r6+r3*2], 4
    rorx             r3, r2, 32
    shr             r2d, 16
    pinsrw          xm2, [r6+r2*2], 5
    movzx           r2d, r3w
    pinsrw          xm2, [r6+r2*2], 6
    shr             r3d, 16
    pinsrw          xm2, [r6+r3*2], 7
    pmulhrsw        xm2, xm7            ; apply grain_scale_shift rounding
    packsswb        xm2, xm2
    movq      [bufq+r7], xm2
    add              r7, 8
    jl .loop

    ; auto-regression code
    add              r5, r4
    jmp              r5
152
.ar1:
    ; AR(1) luma filter: 3 taps on the row above (SIMD) plus the left
    ; neighbor (scalar, since it is the value just written), with rounding
    ; and a clamp to [-128, 127].
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd            xm5, [fg_dataq+FGData.ar_coeffs_y]
    mova            xm2, [base+gen_shufC]
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    pinsrb          xm5, [base+pb_1], 3
    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
    pmovsxbw        xm5, xm5
    pshufd          xm4, xm5, q0000
    pshufd          xm5, xm5, q1111
    sub            bufq, 82*73-(82*3+79)   ; start at y=3, x=79 of the 82x73 buffer
    mov              hd, 70
    mov            mind, -128
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -76
    movsx         val3d, byte [bufq+xq-1]
.x_loop_ar1:
    ; top-row contribution for 4 pixels at once
    pmovsxbw        xm1, [bufq+xq-82-3]
    pshufb          xm0, xm1, xm2
    punpckhwd       xm1, xm3
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
.x_loop_ar1_inner:
    ; serial tail: left neighbor feeds back into the next pixel
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    movsx         val0d, byte [bufq+xq]
    sarx          val3d, val3d, shiftd
    add           val3d, val0d
    cmp           val3d, maxd             ; clamp to [min, max]
    cmovns        val3d, maxd
    cmp           val3d, mind
    cmovs         val3d, mind
    mov       [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xb, 3                ; 4 SIMD results consumed -> refill
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar1
.ar0:
    ; lag 0: luma needs no AR filtering
    RET
204
.ar2:
    ; AR(2) luma filter: 12 taps over the two rows above (SIMD) plus the two
    ; left neighbors (serial inner loop).
%if WIN64
    ; xmm6 and xmm7 already saved
    ; manually spill xmm8-15 (callee-saved on Win64), bypassing cglobal's
    ; prologue since we jumped here after the entry prologue already ran
    %assign xmm_regs_used 16
    %assign stack_size_padded 168
    SUB             rsp, stack_size_padded
    movaps   [rsp+16*2], xmm8
    movaps   [rsp+16*3], xmm9
    movaps   [rsp+16*4], xmm10
    movaps   [rsp+16*5], xmm11
    movaps   [rsp+16*6], xmm12
    movaps   [rsp+16*7], xmm13
    movaps   [rsp+16*8], xmm14
    movaps   [rsp+16*9], xmm15
%endif
    DEFINE_ARGS buf, fg_data, h, x
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    vpbroadcastd   xm10, [base+round_vals-14+r6*2]
    movd           xm11, [base+byte_blend+1]
    pmovsxbw        xm9, xm9
    pshufd          xm4, xm7, q0000
    mova           xm12, [base+gen_shufA]
    pshufd          xm5, xm7, q3333
    mova           xm13, [base+gen_shufB]
    pshufd          xm6, xm7, q1111
    mova           xm14, [base+gen_shufC]
    pshufd          xm7, xm7, q2222
    mova           xm15, [base+gen_shufD]
    pshufd          xm8, xm9, q0000
    psrld          xm10, 16
    pshufd          xm9, xm9, q1111
    sub            bufq, 82*73-(82*3+79)
    mov              hd, 70
.y_loop_ar2:
    mov              xq, -76
.x_loop_ar2:
    ; accumulate the two-rows-above taps for 4 output pixels
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, xm12
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, xm13
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, xm14
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, xm15
    pmaddwd         xm1, xm8
    paddd           xm2, xm10               ; + rounding constant
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0
    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; serial tail: add the left-neighbor taps, write one pixel, shift it in
    pmovsxbw        xm1, xm0
    pmaddwd         xm3, xm9, xm1
    psrldq          xm1, 4                  ; y=0,x=0
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw           xm3, xm1
    packsswb        xm3, xm3
    pextrb    [bufq+xq], xm3, 0
    pslldq          xm3, 2
    vpblendvb       xm0, xm3, xm11          ; merge new pixel into the window
    psrldq          xm0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar2
    RET
286
INIT_YMM avx2
.ar3:
    ; AR(3) luma filter: 24 taps (cf0-23) over the three rows above (SIMD)
    ; plus three left neighbors (serial inner loop). Coefficient splats are
    ; kept on the stack because all 16 vector registers are in use.
%if WIN64
    ; xmm6 and xmm7 already saved
    %assign stack_offset 16
    ALLOC_STACK   16*14
    %assign stack_size stack_size - 16*4
    %assign xmm_regs_used 12
    movaps  [rsp+16*12], xmm8
    movaps  [rsp+16*13], xmm9
    movaps  [rsp+16*14], xmm10
    movaps  [rsp+16*15], xmm11
%else
    ALLOC_STACK   16*12
%endif
    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    movd           xm11, [base+byte_blend]
    pmovsxbw         m1, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pshufd           m0, m1, q0000
    mova    [rsp+16* 0], m0
    pshufd           m0, m1, q1111
    mova    [rsp+16* 2], m0
    pshufd           m0, m1, q2222
    mova    [rsp+16* 4], m0
    pshufd           m1, m1, q3333
    mova    [rsp+16* 6], m1
    pshufd          xm0, xm2, q0000
    mova    [rsp+16* 8], xm0
    pshufd          xm0, xm2, q1111
    mova    [rsp+16* 9], xm0
    psrldq          xm7, xm2, 10            ; left-neighbor coefficients
    mova             m8, [base+gen_shufA]
    pinsrw          xm2, [base+pw_1], 5
    mova             m9, [base+gen_shufC]
    pshufd          xm2, xm2, q2222
    movu            m10, [base+gen_shufE]
    vpbroadcastw    xm6, [base+round_vals-12+r6*2]
    pinsrw          xm7, [base+round_vals+r6*2-10], 3
    mova    [rsp+16*10], xm2
    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 82*73-(82*3+79)
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76
.x_loop_ar3:
    ; sign-extend the three rows above to words, then gather the 24 taps
    movu            xm5, [bufq+xq-82*3-3]    ; y=-3,x=[-3,+12]
    vinserti128      m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
    movu            xm4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    punpcklbw        m3, m5, m5
    punpckhwd        m5, m4
    psraw            m3, 8
    punpcklbw        m5, m5
    psraw            m5, 8
    punpcklbw       xm4, xm4
    psraw           xm4, 8
    pshufb           m0, m3, m8
    pmaddwd          m0, [rsp+16*0]
    pshufb           m1, m3, m9
    pmaddwd          m1, [rsp+16*2]
    shufps           m2, m3, m5, q1032
    paddd            m0, m1
    pshufb           m1, m2, m8
    vperm2i128       m3, m4, 0x21
    pmaddwd          m1, [rsp+16*4]
    shufps          xm2, xm3, q1021
    vpblendd         m2, m3, 0xf0
    pshufb           m2, m10
    paddd            m0, m1
    pmaddwd          m2, [rsp+16*6]
    pshufb          xm1, xm4, xm9
    pmaddwd         xm1, [rsp+16*8]
    shufps          xm4, xm5, q1132
    paddd            m0, m2
    pshufb          xm2, xm4, xm8
    pshufd          xm4, xm4, q2121
    pmaddwd         xm2, [rsp+16*9]
    punpcklwd       xm4, xm6                ; pair last taps with rounding
    pmaddwd         xm4, [rsp+16*10]
    vextracti128    xm3, m0, 1
    paddd           xm0, xm1
    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    paddd           xm2, xm4
    paddd           xm0, xm2
    paddd           xm0, xm3
.x_loop_ar3_inner:
    ; serial tail: three left-neighbor taps feed back pixel by pixel
    pmovsxbw        xm2, xm1
    pmaddwd         xm2, xm7
    pshufd          xm3, xm2, q1111
    paddd           xm2, xm0                ; add top
    paddd           xm2, xm3                ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb        xm2, xm2
    pextrb    [bufq+xq], xm2, 0
    pslldq          xm2, 3
    vpblendvb       xm1, xm2, xm11          ; merge new pixel into the window
    psrldq          xm1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar3
    RET
396
; Instantiates generate_grain_uv_<ss_name>_8bpc(buf, bufy, fg_data, uv):
; same LFSR/gaussian_sequence scheme as the luma function (seed xored with a
; per-plane constant), filling the chroma-sized grain buffer, then jumping
; to the .ar<lag> filter via the jump table.
%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
%define base r4-generate_grain_uv_%1_8bpc_avx2_table
    lea              r4, [generate_grain_uv_%1_8bpc_avx2_table]
    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    movq            xm1, [base+next_upperbit_mask]
    movq            xm4, [base+mul_bits]
    movq            xm5, [base+hmul_bits]
    mova            xm6, [base+pb_mask]
    vpbroadcastw    xm7, [base+round+r6*2]
    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
    pxor            xm0, xm2            ; per-plane seed perturbation
    lea              r6, [gaussian_sequence]
%if %2
    ; subsampled: fill 44 bytes per row, 73-35*ss_y rows
    mov             r7d, 73-35*%3
    add            bufq, 44
.loop_y:
    mov              r5, -44
%else
    mov              r5, -73*82
    sub            bufq, r5
%endif
.loop:
    ; one LFSR advance: 4 seeds -> 4 gaussian samples per iteration
    pand            xm2, xm0, xm1
    psrlw           xm3, xm2, 10
    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw          xm2, xm4            ; bits 0x0f00 are set
    pmulhuw         xm0, xm5
    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    psllq           xm2, xm3, 30
    por             xm2, xm3
    psllq           xm3, xm2, 15
    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    por             xm2, xm3            ; 4 next output seeds
    pshuflw         xm0, xm2, q3333
    psrlw           xm2, 5
    movq             r8, xm2
    movzx           r9d, r8w
    movd            xm2, [r6+r9*2]
    rorx             r9, r8, 32
    shr             r8d, 16
    pinsrw          xm2, [r6+r8*2], 1
    movzx           r8d, r9w
    pinsrw          xm2, [r6+r8*2], 2
    shr             r9d, 16
    pinsrw          xm2, [r6+r9*2], 3
    pmulhrsw        xm2, xm7            ; apply grain_scale_shift rounding
    packsswb        xm2, xm2
    movd      [bufq+r5], xm2
    add              r5, 4
    jl .loop
%if %2
    add            bufq, 82
    dec             r7d
    jg .loop_y
%endif

    ; auto-regression code
    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
    add              r6, r4
    jmp              r6
461
INIT_YMM avx2
.ar0:
    ; AR(0) chroma: no spatial feedback; each chroma grain value is the
    ; co-located luma grain (horizontally/vertically averaged for 420/422 via
    ; pmaddubsw with pb_1) scaled by the single luma coefficient, added to
    ; the gaussian noise already in buf.
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    imul            uvd, 28                 ; uv plane stride into ar_coeffs_uv
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd            xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd            xm3, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h
    pmovsxbw        xm2, xm2
%if %2
    vpbroadcastd     m7, [base+pb_1]
    vpbroadcastw     m6, [base+hmul_bits+2+%3*2]
%endif
    vpbroadcastw     m2, xm2
    vpbroadcastw     m3, xm3
    pxor            m12, m12
%if %2
    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub            bufq, 82*70-3
%endif
    add           bufyq, 3+82*3
    mov              hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels
    movu            xm4, [bufyq]
    vinserti128      m4, [bufyq+32], 1
%if %3
    movu            xm0, [bufyq+82]
    vinserti128      m0, [bufyq+82+32], 1
%endif
    movu            xm5, [bufyq+16]
    vinserti128      m5, [bufyq+48], 1
%if %3
    movu            xm1, [bufyq+82+16]
    vinserti128      m1, [bufyq+82+48], 1
%endif
    pmaddubsw        m4, m7, m4             ; horizontal pair sums of luma
%if %3
    pmaddubsw        m0, m7, m0
%endif
    pmaddubsw        m5, m7, m5
%if %3
    pmaddubsw        m1, m7, m1
    paddw            m4, m0                 ; + second luma row (ss_y)
    paddw            m5, m1
%endif
    pmulhrsw         m4, m6                 ; downsample rounding
    pmulhrsw         m5, m6
%else
    xor             r3d, r3d
    ; first 32x2 pixels
.x_loop_ar0:
    movu             m4, [bufyq+r3]
    pcmpgtb          m0, m12, m4            ; sign-extend bytes to words
    punpckhbw        m5, m4, m0
    punpcklbw        m4, m0
%endif
    pmullw           m4, m2                 ; * luma coefficient
    pmullw           m5, m2
    pmulhrsw         m4, m3                 ; >> ar_coeff_shift (rounded)
    pmulhrsw         m5, m3
%if %2
    movu             m1, [bufq]
%else
    movu             m1, [bufq+r3]
%endif
    pcmpgtb          m8, m12, m1
    punpcklbw        m0, m1, m8
    punpckhbw        m1, m8
    paddw            m0, m4                 ; add to existing chroma noise
    paddw            m1, m5
    packsswb         m0, m1
%if %2
    movu         [bufq], m0
%else
    movu      [bufq+r3], m0
    add             r3d, 32
    cmp             r3d, 64
    jl .x_loop_ar0
%endif

    ; last 6/12 pixels
    movu            xm4, [bufyq+32*2]
%if %2
%if %3
    movu            xm5, [bufyq+32*2+82]
%endif
    pmaddubsw       xm4, xm7, xm4
%if %3
    pmaddubsw       xm5, xm7, xm5
    paddw           xm4, xm5
%endif
    movq            xm0, [bufq+32]
    pmulhrsw        xm4, xm6
    pmullw          xm4, xm2
    pmulhrsw        xm4, xm3
    pcmpgtb         xm5, xm12, xm0
    punpcklbw       xm5, xm0, xm5
    paddw           xm4, xm5
    packsswb        xm4, xm4
    pblendw         xm0, xm4, xm0, 1000b    ; keep the 2 bytes past the row end
    movq      [bufq+32], xm0
%else
    movu            xm0, [bufq+64]
    pcmpgtb         xm1, xm12, xm4
    punpckhbw       xm5, xm4, xm1
    punpcklbw       xm4, xm1
    pmullw          xm5, xm2
    pmullw          xm4, xm2
    vpblendd        xm1, xm3, xm12, 0x0c    ; zero the scale past the row end
    pmulhrsw        xm5, xm1
    pmulhrsw        xm4, xm3
    pcmpgtb         xm1, xm12, xm0
    punpckhbw       xm8, xm0, xm1
    punpcklbw       xm0, xm1
    paddw           xm5, xm8
    paddw           xm0, xm4
    packsswb        xm0, xm5
    movu      [bufq+64], xm0
%endif
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar0
    RET
589
INIT_XMM avx2
.ar1:
    ; AR(1) chroma filter: 3 taps on the row above plus a luma tap
    ; (downsampled for 420/422) in SIMD, left neighbor handled serially,
    ; clamped to [-128, 127].
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    imul            uvd, 28
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3  ; luma coeff
    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
    pmovsxbw        xm4, xm4
    pshufd          xm5, xm4, q1111
    pshufd          xm4, xm4, q0000
    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
%if %2
    vpbroadcastd    xm7, [base+pb_1]
    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
%endif
    vpbroadcastd    xm3, xm3
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
    mov            mind, -128
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -(76>>%2)
    movsx         val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
%if %2
    movq            xm8, [bufyq+xq*2]
%if %3
    movq            xm9, [bufyq+xq*2+82]
%endif
%endif
    psrldq          xm2, xm0, 2             ; top
    psrldq          xm1, xm0, 4             ; top/right
%if %2
    pmaddubsw       xm8, xm7, xm8           ; average luma pairs
%if %3
    pmaddubsw       xm9, xm7, xm9
    paddw           xm8, xm9
%endif
    pmulhrsw        xm8, xm6
%else
    pmovsxbw        xm8, [bufyq+xq]
%endif
    punpcklwd       xm0, xm2
    punpcklwd       xm1, xm8                ; pair top/right with luma tap
    pmaddwd         xm0, xm4
    pmaddwd         xm1, xm5
    paddd           xm0, xm1
    paddd           xm0, xm3
.x_loop_ar1_inner:
    ; serial tail: left neighbor feeds back into the next pixel
    movd          val0d, xm0
    psrldq          xm0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sarx          val3d, val3d, shiftd
    movsx         val0d, byte [bufq+xq]
    add           val3d, val0d
    cmp           val3d, maxd               ; clamp to [min, max]
    cmovns        val3d, maxd
    cmp           val3d, mind
    cmovs         val3d, mind
    mov  byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar1
    RET
672
.ar2:
    ; AR(2) chroma filter: 12 taps over the two rows above plus a luma tap
    ; (downsampled for 420/422), with the two left neighbors applied in the
    ; serial inner loop.
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    vpbroadcastw   xm13, [base+round_vals-12+shiftq*2]
    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
    pinsrw          xm0, [base+pw_1], 5
%if %2
    vpbroadcastw   xm12, [base+hmul_bits+2+%3*2]
    vpbroadcastd   xm11, [base+pb_1]
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd          xm4, xm7, q0000
    pshufd          xm5, xm7, q3333
    pshufd          xm6, xm7, q1111
    pshufd          xm7, xm7, q2222
    pshufd          xm8, xm0, q0000
    pshufd          xm9, xm0, q1111
    pshufd         xm10, xm0, q2222
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
.y_loop_ar2:
    mov              xq, -(76>>%2)

.x_loop_ar2:
    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pshufb          xm2, xm0, [base+gen_shufA]
    pmaddwd         xm2, xm4
    pshufb          xm3, xm1, [base+gen_shufB]
    pmaddwd         xm3, xm5
    paddd           xm2, xm3
    pshufb          xm3, xm0, [base+gen_shufC]
    pmaddwd         xm3, xm6
    punpckhqdq      xm0, xm0                 ; y=-2,x=[+2,+5]
    punpcklwd       xm0, xm1
    pmaddwd         xm0, xm7
    pshufb          xm1, [base+gen_shufD]    ; base-relative like the other
                                             ; LUT loads above (was a bare
                                             ; [gen_shufD], which is an
                                             ; absolute address and breaks
                                             ; PIC builds)
    pmaddwd         xm1, xm8
    paddd           xm2, xm3
    paddd           xm0, xm1
    paddd           xm2, xm0

%if %2
    movq            xm0, [bufyq+xq*2]
%if %3
    movq            xm3, [bufyq+xq*2+82]
%endif
    pmaddubsw       xm0, xm11, xm0           ; average luma pairs
%if %3
    pmaddubsw       xm3, xm11, xm3
    paddw           xm0, xm3
%endif
    pmulhrsw        xm0, xm12
%else
    pmovsxbw        xm0, [bufyq+xq]
%endif
    punpcklwd       xm0, xm13                ; pair luma tap with rounding
    pmaddwd         xm0, xm10
    paddd           xm2, xm0

    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; serial tail: two left-neighbor taps feed back pixel by pixel
    pmovsxbw        xm0, xm0
    pmaddwd         xm3, xm0, xm9
    psrldq          xm0, 2
    paddd           xm3, xm2
    psrldq          xm2, 4                  ; shift top to next pixel
    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq          xm3, 2
    paddw           xm3, xm0
    pblendw         xm0, xm3, 00000010b     ; insert the new pixel as "left"
    packsswb        xm0, xm0
    pextrb    [bufq+xq], xm0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar2
    RET
765
INIT_YMM avx2
.ar3:
    ; AR(3) chroma filter: 24 taps (cf0-23) over the three rows above plus a
    ; luma tap (cf24, downsampled for 420/422), three left neighbors serial.
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28
    pmovsxbw         m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
    vpbroadcastb    xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
    movd           xm13, [base+round_vals-10+shiftq*2]
    vpbroadcastd   xm14, [base+round_vals-14+shiftq*2]
    pshufd           m6, m0, q0000
    pshufd           m7, m0, q1111
    pshufd           m8, m0, q2222
    pshufd           m9, m0, q3333
    pshufd         xm10, xm1, q0000
    pshufd         xm11, xm1, q1111
    pshufhw        xm12, xm1, q0000
    psraw           xm2, 8
    palignr        xm13, xm1, 10            ; left-neighbor coefficients
    punpckhwd      xm12, xm2                     ; interleave luma cf
    psrld          xm14, 16
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*70-(82-3)
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3
.y_loop_ar3:
    mov              xq, -(76>>%2)
.x_loop_ar3:
    vbroadcasti128   m3, [bufq+xq-82*2-3]         ; y=-2,x=[-3,+12]
    palignr         xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
    vbroadcasti128   m4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    vpblendd         m3, m1, 0x0f
    pxor             m0, m0
    pcmpgtb          m2, m0, m3              ; sign-extend bytes to words
    pcmpgtb          m0, m4
    punpcklbw        m1, m3, m2
    punpckhbw        m3, m2
    punpcklbw        m2, m4, m0
    punpckhbw       xm4, xm0
    pshufb           m0, m1, [base+gen_shufA]
    pmaddwd          m0, m6
    pshufb           m5, m1, [base+gen_shufC]
    pmaddwd          m5, m7
    shufps           m1, m3, q1032
    paddd            m0, m5
    pshufb           m5, m1, [base+gen_shufA]
    pmaddwd          m5, m8
    shufps          xm1, xm3, q2121
    vpblendd         m1, m2, 0xf0
    pshufb           m1, [base+gen_shufE]
    pmaddwd          m1, m9
    paddd            m0, m5
    pshufb          xm3, xm2, [base+gen_shufC]
    paddd            m0, m1
    pmaddwd         xm3, xm10
    palignr         xm1, xm4, xm2, 2
    punpckhwd       xm1, xm2, xm1
    pmaddwd         xm1, xm11
    palignr         xm4, xm2, 12
    paddd           xm3, xm1
%if %2
    vpbroadcastd    xm5, [base+pb_1]
    movq            xm1, [bufyq+xq*2]
    pmaddubsw       xm1, xm5, xm1           ; average luma pairs
%if %3
    movq            xm2, [bufyq+xq*2+82]
    pmaddubsw       xm5, xm2
    paddw           xm1, xm5
%endif
    pmulhrsw        xm1, xm15
%else
    pmovsxbw        xm1, [bufyq+xq]
%endif
    punpcklwd       xm4, xm1                ; pair last taps with luma tap
    pmaddwd         xm4, xm12
    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    vextracti128    xm2, m0, 1
    paddd           xm0, xm14               ; + rounding constant
    paddd           xm3, xm4
    paddd           xm0, xm3
    paddd           xm0, xm2
.x_loop_ar3_inner:
    ; serial tail: three left-neighbor taps feed back pixel by pixel
    pmovsxbw        xm1, xm1
    pmaddwd         xm2, xm13, xm1
    pshuflw         xm3, xm2, q1032
    paddd           xm2, xm0                ; add top
    paddd           xm2, xm3                ; left+cur
    psrldq          xm0, 4
    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    psrldq          xm1, 2
    ; don't packssdw, we only care about one value
    punpckldq       xm2, xm2
    pblendw         xm1, xm2, 0100b         ; insert the new pixel as "left"
    packsswb        xm1, xm1
    pextrb    [bufq+xq], xm1, 2
    inc              xq
    jz .x_loop_ar3_end
    test             xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar3
    RET
%endmacro
878
;-----------------------------------------------------------------------------
; fgy_32x32xn_8bpc (AVX2): apply film grain to 32-pixel-wide luma columns.
;
; Arguments (see cglobal line): dst, src, stride, fg_data (FGData*), w,
; scaling (scaling LUT), grain_lut, plus stack args h, sby (superblock row)
; and the overlap flag loaded from FGData.
;
; Vector register roles, held constant across all the pixel loops:
;   m7       zero (except briefly reused as scratch in the h+v overlap path)
;   m8       pd_m65536 (0xffff0000 per dword): gather mask / word selector
;   m9       pmulhrsw multiplier implementing round2(x, scaling_shift)
;   m10/m11  pixel clip min / max (full vs. restricted range, from r7d)
;   m12      pw_1024: rounding constant for the overlap blends
;   xm13     pb_27_17_17_27: horizontal overlap blend weights
;   m14      vertical overlap weights (pb_27_17 for row 0, pb_17_27 for row 1)
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r9-pd_m65536
    lea              r9, [pd_m65536]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    vpbroadcastd     m8, [base+pd_m65536]
    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
    vpbroadcastd    m10, [base+fg_min+r7*4]
    ; note r7*8 (not *4): the luma max table skips the chroma (240) entry,
    ; selecting 255 (full range) or 235 (restricted range)
    vpbroadcastd    m11, [base+fg_max+r7*8]
    vpbroadcastd    m12, [base+pw_1024]
    movq           xm13, [base+pb_27_17_17_27]
    ; vertical overlap is only applied when sby != 0 AND the overlap flag is
    ; set; r7b = (sby != 0)
    test           sbyd, sbyd
    setnz           r7b
    pxor             m7, m7
    test            r7b, overlapb
    jnz .vertical_overlap

    ; no vertical overlap: derive the row seed from sby and FGData.seed
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    ; dstq -= srcq: from here on [dstq+srcq] addresses the dst row, so only
    ; srcq needs to be advanced per line
    sub            dstq, srcq

.loop_x:
    ; advance the 16-bit LFSR: the OR mask 0xEFF4 plus the parity test
    ; (test seeb, seeh / cmovp) computes the feedback bit from the tap bits
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    ; grain offsets: offy = 4 bits from seed<<8, offx = 4 bits from seed<<12
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]
    ; vpgatherdd consumes (zeroes) its mask operand, so the constant mask in
    ; m8 is saved in m6 before each gather and restored afterwards.
    ; pandn with 0xffff0000 extracts the even words of each dword as gather
    ; indices; psrld 16 extracts the odd words, gathered at byte offset -2 so
    ; the wanted byte lands in the odd word, then merged via pblendw 0xaa.
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu             m5, [grain_lutq+offxyq]
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hd
    jg .loop_y

    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test       overlapd, overlapd
    jz .loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    ; same LFSR update as .loop_x
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea     left_offxyd, [offyq+32]         ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]
    ; same mask-preserving gather sequence as in .loop_y
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; blend the previous column's grain into pixel 0 with the xm13 weights,
    ; then vpblendd 0xfe re-inserts the untouched pixels 4..31 from m5
    movu             m5, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm5
    pmaddubsw       xm4, xm13, xm4
    pmulhrsw        xm4, xm12
    packsswb        xm4, xm4
    vpblendd         m4, m5, 0xfe
    punpckhbw        m5, m7
    punpcklbw        m4, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hd
    jg .loop_y_h_overlap

    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused, sby, see, overlap

    ; derive two interleaved 16-bit seeds at once:
    ; cur row (sby) in the high half, top row (sby-1) in the low half
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    ; dstq -= srcq, as in the non-overlap path
    sub            dstq, srcq

.loop_x_v_overlap:
    vpbroadcastd    m14, [pb_27_17]

    ; advance both 16-bit LFSRs packed in one 32-bit register
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    ; compute both offsets (cur and top) with packed 16-bit arithmetic
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_v_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]
    ; same mask-preserving gather sequence as in .loop_y
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; weighted blend of the top row's grain into the current row (m14 holds
    ; the per-row 27/17 weights)
    movu             m6, [grain_lutq+offxyq]
    movu             m4, [grain_lutq+top_offxyq]
    punpcklbw        m5, m4, m6
    punpckhbw        m4, m6
    pmaddubsw        m5, m14, m5
    pmaddubsw        m4, m14, m4
    pmulhrsw         m5, m12
    pmulhrsw         m4, m12
    packsswb         m5, m4
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hb
    jz .end_y_v_overlap
    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    ; (bit 31 of hd is used as the "second line done" flag: the first add
    ; sets it without carry, the second one carries and exits the loop)
    add              hd, 0x80000000
    jnc .loop_y_v_overlap
    jmp .loop_y
.end_y_v_overlap:
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
.loop_x_hv_overlap:
    vpbroadcastd    m14, [pb_27_17]

    ; same packed dual-LFSR update as .loop_x_v_overlap
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyd, [top_offxyq+32]
    lea     left_offxyd, [offyq+32]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_hv_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]
    ; same mask-preserving gather sequence as in .loop_y
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; m7 (normally zero) is temporarily reused as scratch for the left grain
    movu             m6, [grain_lutq+offxyq]
    movd            xm7, [grain_lutq+left_offxyq]
    movu             m4, [grain_lutq+top_offxyq]
    movd            xm5, [grain_lutq+topleft_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       xm7, xm6
    punpcklbw       xm5, xm4
    pmaddubsw       xm7, xm13, xm7
    pmaddubsw       xm5, xm13, xm5
    pmulhrsw        xm7, xm12
    pmulhrsw        xm5, xm12
    packsswb        xm7, xm7
    packsswb        xm5, xm5
    vpblendd         m7, m6, 0xfe
    vpblendd         m5, m4, 0xfe
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m4, m6
    punpcklbw        m5, m7
    pmaddubsw        m4, m14, m4
    pmaddubsw        m5, m14, m5
    pmulhrsw         m4, m12
    pmulhrsw         m5, m12
    pxor             m7, m7                 ; restore the zero register
    packsswb         m5, m4
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hb
    jz .end_y_hv_overlap
    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add              hd, 0x80000000
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq]
    jl .loop_x_hv_overlap
.end:
    RET
1307
1308%macro FGUV_FN 3 ; name, ss_hor, ss_ver
1309cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1310                                          grain_lut, h, sby, luma, overlap, uv_pl, is_id
1311%define base r11-pd_m65536
1312    lea             r11, [pd_m65536]
1313    mov             r6d, [fg_dataq+FGData.scaling_shift]
1314    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
1315    mov             r9d, is_idm
1316    mov            sbyd, sbym
1317    mov        overlapd, [fg_dataq+FGData.overlap_flag]
1318    vpbroadcastd     m8, [base+pd_m65536]
1319    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
1320    vpbroadcastd    m10, [base+fg_min+r7*4]
1321    shlx            r7d, r7d, r9d
1322    vpbroadcastd    m11, [base+fg_max+r7*4]
1323    vpbroadcastd    m12, [base+pw_1024]
1324    pxor             m7, m7
1325    test           sbyd, sbyd
1326    setnz           r7b
1327    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1328    jne .csfl
1329
1330%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
1331    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1332                h, sby, see, overlap, uv_pl
1333%if %1
1334    mov             r6d, uv_plm
1335    vpbroadcastd     m0, [base+pw_8]
1336    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
1337    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
1338    pshufb          m14, m0 ; uv_luma_mult, uv_mult
1339%elif %2
1340    vpbroadcastq    m15, [base+pb_23_22]
1341%else
1342    vpbroadcastq   xm15, [base+pb_27_17_17_27]
1343%endif
1344%if %3
1345    vpbroadcastw    m13, [base+pb_23_22]
1346%elif %2
1347    pshufd          m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
1348%endif
1349    test            r7b, overlapb
1350    jnz %%vertical_overlap
1351
1352    imul           seed, sbyd, (173 << 24) | 37
1353    add            seed, (105 << 24) | 178
1354    rorx           seed, seed, 24
1355    movzx          seed, seew
1356    xor            seed, [fg_dataq+FGData.seed]
1357
1358    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1359                unused2, unused3, see, overlap, unused4, unused5, lstride
1360
1361    mov           lumaq, r9mp
1362    lea             r12, [srcq+wq]
1363    lea             r13, [dstq+wq]
1364    lea             r14, [lumaq+wq*(1+%2)]
1365    mov           r11mp, r12
1366    mov           r12mp, r13
1367    mov        lstrideq, r10mp
1368    neg              wq
1369
1370%%loop_x:
1371    rorx             r6, seeq, 1
1372    or             seed, 0xEFF4
1373    test           seeb, seeh
1374    lea            seed, [r6+0x8000]
1375    cmovp          seed, r6d               ; updated seed
1376
1377    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1378                offx, offy, see, overlap, unused1, unused2, lstride
1379
1380    rorx          offyd, seed, 8
1381    rorx          offxq, seeq, 12
1382    and           offyd, 0xf
1383    imul          offyd, 164>>%3
1384    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1385
1386    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1387                h, offxy, see, overlap, unused1, unused2, lstride
1388
1389    mov      grain_lutq, grain_lutmp
1390    mov              hd, hm
1391%%loop_y:
1392    ; src
1393%if %2
1394    mova            xm3, [lumaq+lstrideq*0+ 0]
1395    vinserti128      m3, [lumaq+lstrideq*(1+%3) +0], 1
1396    vpbroadcastd     m2, [pb_1]
1397    mova            xm0, [lumaq+lstrideq*0+16]
1398    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1399    mova            xm1, [srcq]
1400    vinserti128      m1, [srcq+strideq], 1
1401    pmaddubsw        m3, m2
1402    pmaddubsw        m0, m2
1403    pavgw            m3, m7
1404    pavgw            m0, m7
1405%else
1406    mova             m2, [lumaq]
1407    mova             m1, [srcq]
1408%endif
1409%if %1
1410%if %2
1411    packuswb         m2, m3, m0             ; luma
1412%endif
1413    punpckhbw        m3, m2, m1
1414    punpcklbw        m2, m1                 ; { luma, chroma }
1415    pmaddubsw        m3, m14
1416    pmaddubsw        m2, m14
1417    psraw            m3, 6
1418    psraw            m2, 6
1419    paddw            m3, m15
1420    paddw            m2, m15
1421    packuswb         m2, m3                 ; pack+unpack = clip
1422%endif
1423%if %1 || %2 == 0
1424    punpcklbw        m3, m2, m7
1425    punpckhbw        m0, m2, m7
1426%endif
1427
1428    ; scaling[luma_src]
1429    pandn            m4, m8, m3
1430    mova             m6, m8
1431    vpgatherdd       m2, [scalingq+m4-0], m8
1432    psrld            m3, 16
1433    mova             m8, m6
1434    vpgatherdd       m4, [scalingq+m3-2], m6
1435    pandn            m5, m8, m0
1436    mova             m6, m8
1437    vpgatherdd       m3, [scalingq+m5-0], m8
1438    psrld            m0, 16
1439    mova             m8, m6
1440    vpgatherdd       m5, [scalingq+m0-2], m6
1441    pblendw          m2, m4, 0xaa
1442    pblendw          m3, m5, 0xaa
1443
1444    ; grain = grain_lut[offy+y][offx+x]
1445%if %2
1446    movu            xm5, [grain_lutq+offxyq+ 0]
1447    vinserti128      m5, [grain_lutq+offxyq+82], 1
1448%else
1449    movu             m5, [grain_lutq+offxyq]
1450%endif
1451    punpcklbw        m4, m5, m7
1452    punpckhbw        m5, m7
1453
1454    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1455    pmaddubsw        m2, m4
1456    pmaddubsw        m3, m5
1457    pmulhrsw         m2, m9
1458    pmulhrsw         m3, m9
1459
1460    ; unpack chroma_source
1461    punpcklbw        m0, m1, m7
1462    punpckhbw        m1, m7
1463
1464    ; dst = clip_pixel(src, noise)
1465    paddw            m0, m2
1466    paddw            m1, m3
1467    packuswb         m0, m1
1468    pmaxub           m0, m10
1469    pminub           m0, m11
1470%if %2
1471    mova         [dstq], xm0
1472    vextracti128 [dstq+strideq], m0, 1
1473%else
1474    mova         [dstq], m0
1475%endif
1476
1477%if %2
1478    lea            srcq, [srcq+strideq*2]
1479    lea            dstq, [dstq+strideq*2]
1480    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1481%else
1482    add            srcq, strideq
1483    add            dstq, strideq
1484    add           lumaq, lstrideq
1485%endif
1486    add      grain_lutq, 82<<%2
1487    sub              hb, 1+%2
1488    jg %%loop_y
1489
1490    add              wq, 32>>%2
1491    jge .end
1492    mov            srcq, r11mp
1493    mov            dstq, r12mp
1494    lea           lumaq, [r14+wq*(1+%2)]
1495    add            srcq, wq
1496    add            dstq, wq
1497    test       overlapd, overlapd
1498    jz %%loop_x
1499
1500    ; r8m = sbym
1501    cmp       dword r8m, 0
1502    jne %%loop_x_hv_overlap
1503
1504    ; horizontal overlap (without vertical overlap)
1505%%loop_x_h_overlap:
1506    rorx             r6, seeq, 1
1507    or             seed, 0xEFF4
1508    test           seeb, seeh
1509    lea            seed, [r6+0x8000]
1510    cmovp          seed, r6d               ; updated seed
1511
1512    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1513                offx, offy, see, left_offxy, unused1, unused2, lstride
1514
1515    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
1516    rorx          offyd, seed, 8
1517    rorx          offxq, seeq, 12
1518    and           offyd, 0xf
1519    imul          offyd, 164>>%3
1520    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1521
1522    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1523                h, offxy, see, left_offxy, unused1, unused2, lstride
1524
1525    mov      grain_lutq, grain_lutmp
1526    mov              hd, hm
1527%%loop_y_h_overlap:
1528    ; src
1529%if %2
1530    mova            xm3, [lumaq+lstrideq*0+ 0]
1531    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1532    vpbroadcastd     m2, [pb_1]
1533    mova            xm0, [lumaq+lstrideq*0+16]
1534    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1535    mova            xm1, [srcq]
1536    vinserti128      m1, [srcq+strideq], 1
1537    pmaddubsw        m3, m2
1538    pmaddubsw        m0, m2
1539    pavgw            m3, m7
1540    pavgw            m0, m7
1541%else
1542    mova             m2, [lumaq]
1543    mova             m1, [srcq]
1544%endif
1545%if %1
1546%if %2
1547    packuswb         m2, m3, m0             ; luma
1548%endif
1549    punpckhbw        m3, m2, m1
1550    punpcklbw        m2, m1                 ; { luma, chroma }
1551    pmaddubsw        m3, m14
1552    pmaddubsw        m2, m14
1553    psraw            m3, 6
1554    psraw            m2, 6
1555    paddw            m3, m15
1556    paddw            m2, m15
1557    packuswb         m2, m3                 ; pack+unpack = clip
1558%endif
1559%if %1 || %2 == 0
1560    punpcklbw        m3, m2, m7
1561    punpckhbw        m0, m2, m7
1562%endif
1563
1564    ; scaling[luma_src]
1565    pandn            m4, m8, m3
1566    mova             m6, m8
1567    vpgatherdd       m2, [scalingq+m4-0], m8
1568    psrld            m3, 16
1569    mova             m8, m6
1570    vpgatherdd       m4, [scalingq+m3-2], m6
1571    pandn            m5, m8, m0
1572    mova             m6, m8
1573    vpgatherdd       m3, [scalingq+m5-0], m8
1574    psrld            m0, 16
1575    mova             m8, m6
1576    vpgatherdd       m5, [scalingq+m0-2], m6
1577    pblendw          m2, m4, 0xaa
1578    pblendw          m3, m5, 0xaa
1579
1580    ; grain = grain_lut[offy+y][offx+x]
1581%if %2
1582    movu            xm5, [grain_lutq+offxyq+ 0]
1583    vinserti128      m5, [grain_lutq+offxyq+82], 1
1584    movd            xm4, [grain_lutq+left_offxyq+ 0]
1585    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
1586    punpcklbw        m4, m5
1587%if %1
1588    vpbroadcastq     m0, [pb_23_22]
1589    pmaddubsw        m4, m0, m4
1590%else
1591    pmaddubsw        m4, m15, m4
1592%endif
1593    pmulhrsw         m4, m12
1594    packsswb         m4, m4
1595    vpblendd         m4, m5, 0xee
1596%else
1597    movu             m5, [grain_lutq+offxyq]
1598    movd            xm4, [grain_lutq+left_offxyq]
1599    punpcklbw       xm4, xm5
1600%if %1
1601    movq            xm0, [pb_27_17_17_27]
1602    pmaddubsw       xm4, xm0, xm4
1603%else
1604    pmaddubsw       xm4, xm15, xm4
1605%endif
1606    pmulhrsw        xm4, xm12
1607    packsswb        xm4, xm4
1608    vpblendd         m4, m5, 0xfe
1609%endif
1610    punpckhbw        m5, m7
1611    punpcklbw        m4, m7
1612
1613    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1614    pmaddubsw        m2, m4
1615    pmaddubsw        m3, m5
1616    pmulhrsw         m2, m9
1617    pmulhrsw         m3, m9
1618
1619    ; unpack chroma_source
1620    punpcklbw        m0, m1, m7
1621    punpckhbw        m1, m7
1622
1623    ; dst = clip_pixel(src, noise)
1624    paddw            m0, m2
1625    paddw            m1, m3
1626    packuswb         m0, m1
1627    pmaxub           m0, m10
1628    pminub           m0, m11
1629%if %2
1630    mova         [dstq], xm0
1631    vextracti128 [dstq+strideq], m0, 1
1632%else
1633    mova         [dstq], m0
1634%endif
1635
1636%if %2
1637    lea            srcq, [srcq+strideq*2]
1638    lea            dstq, [dstq+strideq*2]
1639    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1640%else
1641    add            srcq, strideq
1642    add            dstq, strideq
1643    add           lumaq, lstrideq
1644%endif
1645    add      grain_lutq, 82*(1+%2)
1646    sub              hb, 1+%2
1647    jg %%loop_y_h_overlap
1648
1649    add              wq, 32>>%2
1650    jge .end
1651    mov            srcq, r11mp
1652    mov            dstq, r12mp
1653    lea           lumaq, [r14+wq*(1+%2)]
1654    add            srcq, wq
1655    add            dstq, wq
1656
1657    ; r8m = sbym
1658    cmp       dword r8m, 0
1659    jne %%loop_x_hv_overlap
1660    jmp %%loop_x_h_overlap
1661
1662%%vertical_overlap:
1663    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
1664                sby, see, overlap, unused1, unused2, lstride
1665
1666    movzx          sbyd, sbyb
1667    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1668    imul            r7d, sbyd, 173 * 0x00010001
1669    imul           sbyd, 37 * 0x01000100
1670    add             r7d, (105 << 16) | 188
1671    add            sbyd, (178 << 24) | (141 << 8)
1672    and             r7d, 0x00ff00ff
1673    and            sbyd, 0xff00ff00
1674    xor            seed, r7d
1675    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1676
1677    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1678                unused1, unused2, see, overlap, unused3, unused4, lstride
1679
1680    mov           lumaq, r9mp
1681    lea             r12, [srcq+wq]
1682    lea             r13, [dstq+wq]
1683    lea             r14, [lumaq+wq*(1+%2)]
1684    mov           r11mp, r12
1685    mov           r12mp, r13
1686    mov        lstrideq, r10mp
1687    neg              wq
1688
1689%%loop_x_v_overlap:
1690    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1691    mov             r6d, seed
1692    or             seed, 0xeff4eff4
1693    test           seeb, seeh
1694    setp            r7b                     ; parity of top_seed
1695    shr            seed, 16
1696    shl             r7d, 16
1697    test           seeb, seeh
1698    setp            r7b                     ; parity of cur_seed
1699    or              r6d, 0x00010001
1700    xor             r7d, r6d
1701    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1702
1703    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1704                offx, offy, see, overlap, top_offxy, unused, lstride
1705
1706    rorx          offyd, seed, 8
1707    rorx          offxd, seed, 12
1708    and           offyd, 0xf000f
1709    and           offxd, 0xf000f
1710    imul          offyd, 164>>%3
1711    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1712    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1713
1714    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1715                h, offxy, see, overlap, top_offxy, unused, lstride
1716
1717    mov      grain_lutq, grain_lutmp
1718    mov              hd, hm
1719    movzx    top_offxyd, offxyw
1720    shr          offxyd, 16
1721%if %2 == 0
1722    vpbroadcastd    m13, [pb_27_17]
1723%endif
1724%%loop_y_v_overlap:
1725    ; src
1726%if %2
1727    mova            xm3, [lumaq+lstrideq*0+ 0]
1728    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
1729    vpbroadcastd     m2, [pb_1]
1730    mova            xm0, [lumaq+lstrideq*0+16]
1731    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
1732    mova            xm1, [srcq]
1733    vinserti128      m1, [srcq+strideq], 1
1734    pmaddubsw        m3, m2
1735    pmaddubsw        m0, m2
1736    pavgw            m3, m7
1737    pavgw            m0, m7
1738%else
1739    mova             m2, [lumaq]
1740    mova             m1, [srcq]
1741%endif
1742%if %1
1743%if %2
1744    packuswb         m2, m3, m0             ; luma
1745%endif
1746    punpckhbw        m3, m2, m1
1747    punpcklbw        m2, m1                 ; { luma, chroma }
1748    pmaddubsw        m3, m14
1749    pmaddubsw        m2, m14
1750    psraw            m3, 6
1751    psraw            m2, 6
1752    paddw            m3, m15
1753    paddw            m2, m15
1754    packuswb         m2, m3                 ; pack+unpack = clip
1755%endif
1756%if %1 || %2 == 0
1757    punpcklbw        m3, m2, m7
1758    punpckhbw        m0, m2, m7
1759%endif
1760
1761    ; scaling[luma_src]
1762    pandn            m4, m8, m3
1763    mova             m6, m8
1764    vpgatherdd       m2, [scalingq+m4-0], m8
1765    psrld            m3, 16
1766    mova             m8, m6
1767    vpgatherdd       m4, [scalingq+m3-2], m6
    ; second half of the scaling[] gather for this row pair.
    ; vpgatherdd clobbers its mask operand, so the mask kept in m8 is
    ; saved to m6 before each gather and restored afterwards.
    pandn            m5, m8, m0
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    psrld            m0, 16                  ; indices for the odd words
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m0-2], m6 ; -2 bias lands the byte in the odd word
    pblendw          m2, m4, 0xaa            ; merge even/odd word lanes
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
%if %2
    movu            xm0, [grain_lutq+offxyq]
    vinserti128      m0, [grain_lutq+offxyq+82], 1 ; 82 = grain_lut row stride
    movu            xm4, [grain_lutq+top_offxyq]
    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
%else
    movu             m0, [grain_lutq+offxyq]
    movu             m4, [grain_lutq+top_offxyq]
%endif
    ; vertical overlap: weighted blend of the top and current grain rows;
    ; weights are packed bytes in m13, scaled/rounded by pmulhrsw with m12
    punpcklbw        m5, m4, m0
    punpckhbw        m4, m0
    pmaddubsw        m5, m13, m5
    pmaddubsw        m4, m13, m4
    pmulhrsw         m5, m12
    pmulhrsw         m4, m12
    packsswb         m5, m4
%else
    ; 4:2:0 (%3 set): vertical overlap only touches the first chroma line
    movq            xm4, [grain_lutq+offxyq]
    vinserti128      m4, [grain_lutq+offxyq+8], 1
    movq            xm5, [grain_lutq+top_offxyq]
    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
    punpcklbw        m5, m4
    pmaddubsw        m5, m13, m5
    pmulhrsw         m5, m12
    vextracti128    xm4, m5, 1
    packsswb        xm5, xm4
    ; only interpolate first line, insert second line unmodified
    vinserti128      m5, [grain_lutq+offxyq+82], 1
%endif
    ; widen grain bytes to words (m7 appears to be zero throughout this
    ; loop, cf. the pavgw m,m7 halving below -- NOTE(review): confirm)
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9                  ; m9 implements the scaling_shift rounding
    pmulhrsw         m3, m9

    ; unpack chroma_source
    punpcklbw        m0, m1, m7
    punpckhbw        m1, m7

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10                 ; clamp to the pixel range in m10/m11
    pminub           m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

    sub              hb, 1+%2                ; 2 rows/iter when subsampled
    jle %%end_y_v_overlap
%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
%if %2 == 0
    ; without vertical subsampling there are two v-overlap rows: the sign
    ; bit of hd serves as a "second row done" flag so the next iteration
    ; runs once more with the pb_17_27 weights before falling through to
    ; the overlap-free y loop
    vpbroadcastd    m13, [pb_17_27]
    add              hd, 0x80000000
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y
1852
%%end_y_v_overlap:
    ; advance to the next 32-pixel (16 when subsampled) column strip
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp              ; reload base pointers stashed on the stack
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]    ; luma advances 2x when subsampled
    add            srcq, wq
    add            dstq, wq

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
1865
%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; advance both LFSR seeds (packed as (cur_seed << 16) | top_seed) in
    ; one go, using the parity of the masked state as the feedback bit
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    ; grain_lut offsets of the previous strip, for left/top-left overlap
    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
    lea     left_offxyd, [offyq+(32>>%2)]
    ; derive the packed (cur|top) grain x/y offsets from the seeds
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3            ; 164 = 82*2 (82 bytes per grain_lut row)
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw             ; unpack (cur_offxy << 16) | top_offxy
    shr          offxyd, 16
%if %2 == 0
    vpbroadcastd    m13, [pb_27_17]         ; first v-overlap row blend weights
%endif
%%loop_y_hv_overlap:
    ; src
%if %2
    ; horizontally subsampled: fold luma pairs down to chroma width
    ; (pmaddubsw with pb_1 sums adjacent bytes, pavgw with m7 then halves
    ; with rounding -- m7 presumed zero, NOTE(review): confirm)
    mova            xm3, [lumaq+lstrideq*0+ 0]
    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd     m2, [pb_1]
    mova            xm0, [lumaq+lstrideq*0+16]
    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova            xm1, [srcq]
    vinserti128      m1, [srcq+strideq], 1
    pmaddubsw        m3, m2
    pmaddubsw        m0, m2
    pavgw            m3, m7
    pavgw            m0, m7
%else
    mova             m2, [lumaq]
    mova             m1, [srcq]
%endif
%if %1
    ; non-csfl variant: mix luma and chroma through the multipliers in
    ; m14 and add the offset in m15 (presumably fg uv_mult/uv_offset --
    ; NOTE(review): confirm against the macro header) to form the
    ; scaling index
%if %2
    packuswb         m2, m3, m0             ; luma
%endif
    punpckhbw        m3, m2, m1
    punpcklbw        m2, m1                 ; { luma, chroma }
    pmaddubsw        m3, m14
    pmaddubsw        m2, m14
    psraw            m3, 6
    psraw            m2, 6
    paddw            m3, m15
    paddw            m2, m15
    packuswb         m2, m3                 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw        m3, m2, m7
    punpckhbw        m0, m2, m7
%endif

    ; scaling[luma_src]
    ; vpgatherdd clobbers its mask, hence the m8<->m6 save/restore around
    ; each gather; the psrld-16/-2 pairing fetches the odd-word bytes,
    ; which pblendw 0xaa merges back in
    pandn            m4, m8, m3
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m0
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    psrld            m0, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m0-2], m6
    pblendw          m2, m4, 0xaa
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu            xm4, [grain_lutq+offxyq]
    vinserti128      m4, [grain_lutq+offxyq+82], 1 ; 82 = grain_lut row stride
    movd            xm0, [grain_lutq+left_offxyq]
    vinserti128      m0, [grain_lutq+left_offxyq+82], 1
    movd            xm6, [grain_lutq+topleft_offxyq]
%if %3
    movq            xm5, [grain_lutq+top_offxyq]
    vinserti128      m5, [grain_lutq+top_offxyq+8], 1
%else
    vinserti128      m6, [grain_lutq+topleft_offxyq+82], 1
    movu            xm5, [grain_lutq+top_offxyq]
    vinserti128      m5, [grain_lutq+top_offxyq+82], 1
%endif

    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m0, m4
%if %3
    punpcklbw       xm6, xm5
%else
    punpcklbw        m6, m5
%endif
    punpcklqdq       m0, m6
%if %1
    vpbroadcastq     m6, [pb_23_22]
    pmaddubsw        m0, m6, m0
%else
    pmaddubsw        m0, m15, m0
%endif
    pmulhrsw         m0, m12
    packsswb         m0, m0
    vpblendd         m4, m0, 0x11           ; splice blended column into cur row
%if %3
    pshuflw         xm0, xm0, q1032
    vpblendd         m5, m0, 0x01           ; ...and into the top row
%else
    pshuflw          m0, m0, q1032
    vpblendd         m5, m0, 0x11
%endif
%else
    ; unsubsampled: single row, blend the leftmost column against the
    ; left/top-left neighbours
    movu             m4, [grain_lutq+offxyq]
    movd            xm0, [grain_lutq+left_offxyq]
    movu             m5, [grain_lutq+top_offxyq]
    movd            xm6, [grain_lutq+topleft_offxyq]
    punpcklbw       xm0, xm4
    punpcklbw       xm6, xm5
    punpcklqdq      xm0, xm6
%if %1
    vpbroadcastq    xm6, [pb_27_17_17_27]
    pmaddubsw       xm0, xm6, xm0
%else
    pmaddubsw       xm0, xm15, xm0
%endif
    pmulhrsw        xm0, xm12
    packsswb        xm0, xm0
    vpblendd         m4, m0, 0x01
    pshuflw         xm0, xm0, q1032
    vpblendd         m5, m0, 0x01
%endif

    ; followed by v interpolation (top | cur -> cur)
%if %3
    vpermq           m0, m4, q3120
    punpcklbw        m5, m0
    pmaddubsw        m5, m13, m5
    pmulhrsw         m5, m12
    vextracti128    xm0, m5, 1
    packsswb        xm5, xm0
    vpblendd         m5, m4, 0xf0           ; second line passes through unblended
%else
    punpckhbw        m0, m5, m4
    punpcklbw        m5, m4
    pmaddubsw        m4, m13, m0
    pmaddubsw        m5, m13, m5
    pmulhrsw         m4, m12
    pmulhrsw         m5, m12
    packsswb         m5, m4
%endif
    punpcklbw        m4, m5, m7             ; widen grain to words (m7 presumed zero)
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; unpack chroma source
    punpcklbw        m0, m1, m7
    punpckhbw        m1, m7

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10                ; clamp to the pixel range in m10/m11
    pminub           m0, m11
%if %2
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova         [dstq], m0
%endif

%if %2
    lea            srcq, [srcq+strideq*2]
    lea            dstq, [dstq+strideq*2]
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add            srcq, strideq
    add            dstq, strideq
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82<<%2
    sub              hb, 1+%2
%if %2
    jg %%loop_y_h_overlap                  ; remaining rows: h overlap only
%else
    je %%end_y_hv_overlap
    ; second v-overlap row runs with pb_17_27 weights (hd sign bit used
    ; as the done-flag), then drop into the h-overlap-only loop
    vpbroadcastd    m13, [pb_17_27]
    add              hd, 0x80000000
    jnc %%loop_y_hv_overlap
    jmp %%loop_y_h_overlap
%endif

%%end_y_hv_overlap:
    ; advance to the next column strip and restart with h+v overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r14+wq*(1+%2)]
    add            srcq, wq
    add            dstq, wq
    jmp %%loop_x_hv_overlap
2091%endmacro
2092
    ; instantiate the main loop twice: %1=1 is the variant that first
    ; maps luma through the uv multipliers (m14/m15); the .csfl entry
    ; (chroma scaling from luma) uses luma directly as the scaling
    ; index (%1=0)
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro
2099
; instantiate the UV grain-generation and application functions per
; chroma layout; arguments are (layout, h-subsampling, v-subsampling)
GEN_GRAIN_UV_FN 420, 1, 1
FGUV_FN         420, 1, 1
GEN_GRAIN_UV_FN 422, 1, 0
FGUV_FN         422, 1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN         444, 0, 0
2106
2107%endif ; ARCH_X86_64
2108