1; Copyright © 2021-2022, VideoLAN and dav1d authors
2; Copyright © 2021-2022, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30%if ARCH_X86_64
31
; Read-only constants shared by the grain generators below. Several tables are
; addressed with biased or negative offsets, so the order and adjacency of the
; rows matters; do not reorder without checking every [base+...] reference.
32SECTION_RODATA 16
; pshufb lookup table (kept in xm6) used by the seed update to produce the
; bit that is shifted into the next seeds
33pb_mask:       db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
; pshufb controls used by the lag-2 filters to build overlapping word pairs:
; A yields pairs starting at words 0,1,2,3; B yields pairs starting at 2,3,4,5
34gen_shufA:     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
35gen_shufB:     db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
; per-lane AND mask applied to the broadcast seed (kept in xm1)
36next_upperbit_mask:    dw 0x100B, 0x2016, 0x402C, 0x8058
; pw_27_17_17_27 / pw_23_22: not referenced in the visible part of this file
37pw_27_17_17_27:        dw 27, 17, 17, 27
38pw_23_22:              dw 23, 22, 0, 32
; xor-ed into the seed by the chroma generators; indexed as pw_seed_xor+uv*4
39pw_seed_xor:   times 2 dw 0xb524
40               times 2 dw 0x49d8
; read as gen_ar0_shift-24+shift*4, i.e. the first dword belongs to
; ar_coeff_shift == 6; multiplied with the coefficient via pmaddubsw
41gen_ar0_shift: times 4 db 128
42               times 4 db 64
43               times 4 db 32
44               times 4 db 16
; pd_16 / pd_m65536: not referenced in the visible part of this file
45pd_16:                 dd 16
46pd_m65536:             dd -65536
; constant 1 inserted next to the AR coefficients so that one pmaddwd lane
; multiplies the rounding value by 1
47pb_1:          times 4 db 1
; grain_max .. uv_offset_mul: not referenced in the visible part of this file
48grain_max:     times 2 dw  511
49               times 2 dw 2047
50grain_min:     times 2 dw -512
51               times 2 dw -2048
52fg_max:        times 2 dw 1023
53               times 2 dw 4095
54               times 2 dw 960
55               times 2 dw 3840
56               times 2 dw 940
57               times 2 dw 3760
58fg_min:        times 2 dw 0
59               times 2 dw 64
60               times 2 dw 256
61uv_offset_mul:         dd 256
62                       dd 1024
; hmul_bits: pmulhuw multipliers for the seed update (kept in xm5); the chroma
; code also broadcasts hmul_bits+2+ss_y*2 as a pmulhrsw factor.
; round: read as round+idx*2-2 with idx = grain_scale_shift - (0 or 2).
; NOTE(review): idx can apparently be <= 0 for 12bpc, in which case the load
; lands in the tail of hmul_bits - keep hmul_bits directly in front of round.
63hmul_bits:             dw 32768, 16384,  8192,  4096
64round:                 dw  2048,  1024,   512
; pmullw multipliers for the seed update (kept in xm4)
65mul_bits:              dw   256,   128,    64,    32,    16,     8
; rounding constants; read as round_vals+shift*2-12 (entry 0 <-> shift == 6)
; and, in the chroma lag-3 path, as round_vals-10+shift*2 (the next entry)
66round_vals:            dw    32,    64,   128,   256,   512,  1024
; pb_8_9_0_1: not referenced in the visible part of this file
67pb_8_9_0_1:            db 8, 9, 0, 1
68
; JMP_TABLE fn, suffix...
; Emits, at the current position, one 32-bit entry per suffix N holding the
; distance from the start of the table to the label <prefix>_fn.arN, and
; defines fn_table as the name of that table. Users add the table address
; back to an entry to obtain the jump target.
69%macro JMP_TABLE 1-*
70    %xdefine %1_table %%table
71    %xdefine %%base %1_table
    ; mangled symbol name of the function the labels belong to
72    %xdefine %%prefix mangle(private_prefix %+ _%1)
73    %%table:
    ; one entry for every parameter after the function name
74    %rep %0 - 1
75        dd %%prefix %+ .ar%2 - %%base
        ; shift the parameter list so %2 names the next suffix
76        %rotate 1
77    %endrep
78%endmacro
79
; One offset table per generator; entry N points at that function's .arN
; label and is selected at run time by FGData.ar_coeff_lag.
80JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
81JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
82JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
83JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
84
85SECTION .text
86
; m(x): mangled symbol name of function x with the current SUFFIX appended
; (SUFFIX is presumably set by the INIT_* macros of x86inc - TODO confirm).
; Not used in the visible part of this file.
87%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
88
;------------------------------------------------------------------------------
; generate_grain_y_16bpc(buf, fg_data, bdmax)
;
; Fills the luma grain buffer with scaled entries of gaussian_sequence that
; are selected by a 16-bit pseudo-random seed, then jumps through
; generate_grain_y_16bpc_avx2_table (indexed by FGData.ar_coeff_lag) to one
; of .ar0/.ar1/.ar2/.ar3, which apply the auto-regressive filter in place.
;
; In:  buf     - buffer of 16-bit grain values, rows of 82 entries
;                (row stride 82*2 bytes)
;      fg_data - FGData structure; seed, grain_scale_shift, ar_coeff_lag,
;                ar_coeff_shift and ar_coeffs_y are read
;      bdmax   - bitdepth maximum (see the "10bpc/12bpc" note below);
;                bdmax >> 1 is the upper clamp of the filtered grain and its
;                bitwise complement the lower clamp
; cglobal declares 3 arguments, 9 general purpose and 14 vector registers.
;
; NOTE(review): the fill loop uses aligned 16-byte stores, so buf is expected
; to be 16-byte aligned, and since 73*82 is not a multiple of 8 its last
; iteration writes 6 entries past the 73x82 area - presumably the caller's
; buffer is padded; confirm against the buffer declaration.
;------------------------------------------------------------------------------
89INIT_YMM avx2
90cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
; all constants are addressed relative to the jump table kept in r4
91%define base r4-generate_grain_y_16bpc_avx2_table
92    lea              r4, [generate_grain_y_16bpc_avx2_table]
93    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
94    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
95    movq            xm1, [base+next_upperbit_mask]
    ; r3 = negative byte count of the 73x82 area; counts up towards zero
96    mov              r3, -73*82*2
97    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
98    lea             r7d, [bdmaxq+1]
99    movq            xm4, [base+mul_bits]
100    shr             r7d, 11             ; 0 for 10bpc, 2 for 12bpc
101    movq            xm5, [base+hmul_bits]
    ; r6 = grain_scale_shift - (0 or 2); selects the pmulhrsw factor below
102    sub              r6, r7
103    mova            xm6, [base+pb_mask]
    ; bufq now points at the end of the area, so [bufq+r3] walks it forwards
104    sub            bufq, r3
105    vpbroadcastw    xm7, [base+round+r6*2-2]
106    lea              r6, [gaussian_sequence]
    ; r5 = table entry for this lag (offset of .ar<lag> from the table)
107    movsxd           r5, [r4+r5*4]
; Each iteration advances the seed eight times (two batches of four), uses
; seed >> 5 as index into gaussian_sequence and stores 8 scaled words.
; The second seed batch is interleaved with the table lookups of the first.
108.loop:
109    pand            xm2, xm0, xm1
110    psrlw           xm3, xm2, 10
111    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
112    pmullw          xm2, xm4            ; bits 0x0f00 are set
113    pmulhuw         xm0, xm5
114    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
115    psllq           xm2, xm3, 30
116    por             xm2, xm3
117    psllq           xm3, xm2, 15
118    por             xm2, xm0            ; aggregate each bit into next seed's high bit
119    por             xm3, xm2            ; 4 next output seeds
    ; broadcast the newest seed as input for the second batch
120    pshuflw         xm0, xm3, q3333
    ; seed >> 5: index into gaussian_sequence
121    psrlw           xm3, 5
122    pand            xm2, xm0, xm1
    ; r7 = four indices of the first batch, 16 bits each
123    movq             r7, xm3
124    psrlw           xm3, xm2, 10
125    por             xm2, xm3
126    pmullw          xm2, xm4
127    pmulhuw         xm0, xm5
128    movzx           r8d, r7w
129    pshufb          xm3, xm6, xm2
130    psllq           xm2, xm3, 30
131    por             xm2, xm3
132    psllq           xm3, xm2, 15
133    por             xm0, xm2
    ; gather word 0 (the 32-bit load also fetches the following word, which
    ; pinsrw ... 1 overwrites)
134    movd            xm2, [r6+r8*2]
    ; r8 = upper two indices of the first batch in its low 32 bits
135    rorx             r8, r7, 32
136    por             xm3, xm0
137    shr             r7d, 16
138    pinsrw          xm2, [r6+r7*2], 1
    ; broadcast the newest seed for the next loop iteration
139    pshuflw         xm0, xm3, q3333
140    movzx           r7d, r8w
141    psrlw           xm3, 5
142    pinsrw          xm2, [r6+r7*2], 2
143    shr             r8d, 16
    ; r7 = four indices of the second batch
144    movq             r7, xm3
145    pinsrw          xm2, [r6+r8*2], 3
146    movzx           r8d, r7w
147    pinsrw          xm2, [r6+r8*2], 4
148    rorx             r8, r7, 32
149    shr             r7d, 16
150    pinsrw          xm2, [r6+r7*2], 5
151    movzx           r7d, r8w
152    pinsrw          xm2, [r6+r7*2], 6
153    shr             r8d, 16
154    pinsrw          xm2, [r6+r8*2], 7
155    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
156    pmulhrsw        xm2, xm7            ; shifts by 0, which pmulhrsw does not support
157    mova      [bufq+r3], xm2
158    add              r3, 8*2
159    jl .loop
160
161    ; auto-regression code
    ; turn the table-relative offset into an absolute address and dispatch
162    add              r5, r4
163    jmp              r5
164
; ---- lag 1 ------------------------------------------------------------------
; Per entry: new = clamp(((left*cf3 + tl*cf0 + top*cf1 + tr*cf2 + rnd)
;                         >> shift) + cur, min, max)
; The three top-row products (+rnd) are computed for four entries at a time
; in xm0; the dependency on the left neighbour is resolved in scalar code.
; bufq is moved to row 3, column 79 and x runs from -76 to -1, so columns
; 3..78 of 70 rows are filtered.
; NOTE: val3 is r4, which also holds the constant base, so every [base+...]
; load has to happen before val3 is first written.
165.ar1:
166    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
167    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
168    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
169    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
    ; fg_data is no longer needed; its register becomes the row counter
170    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
    ; replace cf3 by 1 so that the rounding value passes through pmaddwd
171    pinsrb          xm4, [base+pb_1], 3
172    pmovsxbw        xm4, xm4
    ; xm5 = (cf2, 1) pairs, xm4 = (cf0, cf1) pairs
173    pshufd          xm5, xm4, q1111
174    pshufd          xm4, xm4, q0000
175    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
176    sub            bufq, 2*(82*73-(82*3+79))
177    mov              hd, 70
    ; max = bdmax >> 1, min = ~max
178    sar            maxd, 1
179    mov            mind, maxd
180    xor            mind, -1
181.y_loop_ar1:
182    mov              xq, -76
    ; left neighbour of the first filtered entry of this row
183    movsx         val3d, word [bufq+xq*2-2]
184.x_loop_ar1:
185    movu            xm0, [bufq+xq*2-82*2-2]     ; top/left
186    psrldq          xm2, xm0, 2                 ; top
187    psrldq          xm1, xm0, 4                 ; top/right
    ; interleave so that pmaddwd yields tl*cf0+top*cf1 and tr*cf2+rnd*1
188    punpcklwd       xm0, xm2
189    punpcklwd       xm1, xm3
190    pmaddwd         xm0, xm4
191    pmaddwd         xm1, xm5
192    paddd           xm0, xm1
193.x_loop_ar1_inner:
    ; val0 = top-row sum (+rnd) for the current entry
194    movd          val0d, xm0
195    psrldq          xm0, 4
196    imul          val3d, cf3d
197    add           val3d, val0d
198    sarx          val3d, val3d, shiftd
199    movsx         val0d, word [bufq+xq*2]
200    add           val3d, val0d
    ; clamp to [min, max]
201    cmp           val3d, maxd
202    cmovg         val3d, maxd
203    cmp           val3d, mind
204    cmovl         val3d, mind
205    mov word [bufq+xq*2], val3w
206    ; keep val3d in-place as left for next x iteration
207    inc              xq
208    jz .x_loop_ar1_end
    ; x starts at a multiple of 4: refill the top-row sums every 4 entries
209    test             xb, 3
210    jnz .x_loop_ar1_inner
211    jmp .x_loop_ar1
212.x_loop_ar1_end:
213    add            bufq, 82*2
214    dec              hd
215    jg .y_loop_ar1
; lag 0: no filtering, the generated grain is returned as is
216.ar0:
217    RET
218
; ---- lag 2 ------------------------------------------------------------------
; Same loop structure as .ar1 with two rows above and two entries to the left.
; Register roles inside the loops:
;   m6, m7   coefficient pairs for y=-1 (low lane) and y=-2 (high lane)
;   xm8      cf[4,9], xm9 cf[10,11] (left neighbours)
;   xm10     rounding constant as dwords
;   xm4/xm5  max/min grain, m11/m12 gen_shufA/gen_shufB
219.ar2:
220    DEFINE_ARGS buf, fg_data, bdmax, shift
221    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
222    movq            xm0, [fg_dataq+FGData.ar_coeffs_y+5]    ; cf5-11
223    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
224    vpbroadcastw   xm10, [base+round_vals-12+shiftq*2]
225    pxor             m1, m1
    ; zero-extend the rounding words to dwords
226    punpcklwd      xm10, xm1
    ; m1 = sign mask of the coefficient bytes -> sign extension to words
227    pcmpgtb          m1, m0
228    punpcklbw        m0, m1                                 ; cf5-11,0-4
229    vpermq           m1, m0, q3333                          ; cf4
230    vbroadcasti128  m11, [base+gen_shufA]
231    pshufd           m6, m0, q0000                          ; cf[5,6], cf[0-1]
232    vbroadcasti128  m12, [base+gen_shufB]
233    pshufd           m7, m0, q1111                          ; cf[7,8], cf[2-3]
234    punpckhwd       xm1, xm0
235    pshufhw         xm9, xm0, q2121
236    pshufd          xm8, xm1, q0000                         ; cf[4,9]
237    sar          bdmaxd, 1
238    punpckhqdq      xm9, xm9                                ; cf[10,11]
239    movd            xm4, bdmaxd                             ; max_grain
240    pcmpeqd         xm5, xm5
    ; row 3, column 79; the loops cover columns 3..78 of 70 rows
241    sub            bufq, 2*(82*73-(82*3+79))
242    pxor            xm5, xm4                                ; min_grain
    ; h reuses the bdmax register and x the shift register, which is why the
    ; inner loop takes the shift count from memory
243    DEFINE_ARGS buf, fg_data, h, x
244    mov              hd, 70
245.y_loop_ar2:
246    mov              xq, -76
247.x_loop_ar2:
248    vbroadcasti128   m2, [bufq+xq*2-82*4-4]        ; y=-2,x=[-2,+5]
249    vinserti128      m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
250    pshufb           m0, m1, m11                   ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
251    pmaddwd          m0, m6
252    punpckhwd       xm2, xm1                       ; y=-2/-1 interleaved, x=[+2,+5]
253    pshufb           m1, m12                       ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
254    pmaddwd          m1, m7
255    pmaddwd         xm2, xm8
256    paddd            m0, m1
    ; fold both lanes and the rounding constant into xm2 = top sums
257    vextracti128    xm1, m0, 1
258    paddd           xm0, xm10
259    paddd           xm2, xm0
260    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
261    paddd           xm2, xm1
262    pmovsxwd        xm1, [bufq+xq*2]        ; in dwords, y=0,x=[0,3]
263.x_loop_ar2_inner:
    ; dword 0 = cf10*p[x-2] + cf11*p[x-1]
264    pmaddwd         xm3, xm9, xm0
265    psrldq          xm0, 2
266    paddd           xm3, xm2
267    psrldq          xm2, 4                  ; shift top to next pixel
    ; shift count comes from the low 64 bits of a 16-byte memory operand
    ; NOTE(review): assumes those 64 bits hold only ar_coeff_shift - confirm
    ; against the FGData layout
268    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
269    ; skip packssdw because we only care about one value
    ; add the unfiltered current entry, then clamp
270    paddd           xm3, xm1
271    pminsd          xm3, xm4
272    psrldq          xm1, 4
273    pmaxsd          xm3, xm5
274    pextrw  [bufq+xq*2], xm3, 0
    ; feed the result back as the left neighbour (word 1 after the shift)
275    punpcklwd       xm3, xm3
276    pblendw         xm0, xm3, 0010b
277    inc              xq
278    jz .x_loop_ar2_end
    ; recompute the top sums every 4 entries
279    test             xb, 3
280    jnz .x_loop_ar2_inner
281    jmp .x_loop_ar2
282.x_loop_ar2_end:
283    add            bufq, 82*2
284    dec              hd
285    jg .y_loop_ar2
286    RET
287
; ---- lag 3 ------------------------------------------------------------------
; Same loop structure with three rows above and three entries to the left.
; Register roles inside the loops:
;   m4-m6    coefficient pairs for y=-3 (low lane) / y=-2 (high lane)
;   xm7      cf[6,13]
;   m8, m9   coefficient pairs for y=-1 (m9 low lane: cf16 and constant 1,
;            the 1 multiplies the rounding value)
;   xm10     cf21,cf22,cf23 and 2*rnd in word 3; 2*rnd multiplies the current
;            entry, which presumably equals 1 << shift so that the entry
;            survives the final shift unchanged - TODO confirm
;   xm11     rounding constant, xm12/xm13 max/min grain
288.ar3:
289    DEFINE_ARGS buf, fg_data, bdmax, shift
290    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
291    sar          bdmaxd, 1
292    movq            xm7, [fg_dataq+FGData.ar_coeffs_y+ 0]    ; cf0-6
293    movd            xm0, [fg_dataq+FGData.ar_coeffs_y+14]    ; cf14-16
294    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
295    pinsrb          xm0, [base+pb_1], 3                      ; cf14-16,pb_1
296    movd            xm1, [fg_dataq+FGData.ar_coeffs_y+21]    ; cf21-23
297    vinserti128      m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
298    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
299    vpbroadcastw   xm11, [base+round_vals+shiftq*2-12]
300    movd           xm12, bdmaxd                              ; max_grain
    ; duplicate each byte into a word, then shift right arithmetically by 8
301    punpcklbw        m7, m7                                  ; sign-extension
302    punpcklbw        m0, m0                                  ; sign-extension
303    punpcklbw       xm1, xm1
304    REPX   {psraw x, 8}, m7, m0, xm1
305    pshufd           m4, m7, q0000                           ; cf[0,1] | cf[7,8]
306    pshufd           m5, m7, q1111                           ; cf[2,3] | cf[9,10]
307    pshufd           m6, m7, q2222                           ; cf[4,5] | cf[11,12]
308    pshufd          xm7, xm7, q3333                          ; cf[6,13]
309    pshufd           m8, m0, q0000                           ; cf[14,15] | cf[17,18]
310    pshufd           m9, m0, q1111                           ; cf[16],pw_1 | cf[19,20]
    ; xm0 = 2*rnd, blended into word 3 of the left-neighbour coefficients
311    paddw           xm0, xm11, xm11
312    pcmpeqd        xm13, xm13
313    pblendw        xm10, xm1, xm0, 00001000b
314    pxor           xm13, xm12                                ; min_grain
    ; h reuses the bdmax register and x the shift register
315    DEFINE_ARGS buf, fg_data, h, x
    ; row 3, column 79; the loops cover columns 3..78 of 70 rows
316    sub            bufq, 2*(82*73-(82*3+79))
317    mov              hd, 70
318.y_loop_ar3:
319    mov              xq, -76
320.x_loop_ar3:
321    movu            xm0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
322    vinserti128      m0, [bufq+xq*2-82*4-6+ 0], 1   ; y=-3/-2,x=[-3,+4]
323    movq            xm1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+8]
324    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1   ; y=-3/-2,x=[+5,+12]
325    palignr          m3, m1, m0, 2                  ; y=-3/-2,x=[-2,+5]
326    palignr          m1, m0, 12                     ; y=-3/-2,x=[+3,+6]
327    punpckhwd        m2, m0, m3                     ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
328    punpcklwd        m0, m3                         ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
329    shufps           m3, m0, m2, q1032              ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
330    pmaddwd          m0, m4
331    pmaddwd          m2, m6
332    pmaddwd          m3, m5
333    paddd            m0, m2
334    movu            xm2, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
335    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1   ; y=-1,x=[+1,+8]
336    paddd            m0, m3
337    psrldq           m3, m2, 2
338    punpcklwd        m3, m2, m3                     ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
339    pmaddwd          m3, m8                         ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
340    paddd            m0, m3
341    psrldq           m3, m2, 4
342    psrldq           m2, 6
343    vpblendd         m2, m11, 0x0f                  ; rounding constant
344    punpcklwd        m3, m2                         ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
345    pmaddwd          m3, m9                         ;      x=[+2/+3,+3/+4,+4/+5,+5/+6]
346    vextracti128    xm2, m1, 1
347    punpcklwd       xm1, xm2
348    pmaddwd         xm1, xm7                        ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
349    paddd            m0, m3
    ; fold both lanes into xm0 = top sums (rounding included)
350    vextracti128    xm2, m0, 1
351    paddd           xm0, xm1
352    movu            xm1, [bufq+xq*2-6]        ; y=0,x=[-3,+4]
353    paddd           xm0, xm2
354.x_loop_ar3_inner:
    ; dword 0 = cf21*p[x-3] + cf22*p[x-2], dword 1 = cf23*p[x-1] + 2*rnd*p[x]
355    pmaddwd         xm2, xm1, xm10
    ; swap dwords 0 and 1 so the following adds combine both in dword 0
356    pshuflw         xm3, xm2, q1032
357    paddd           xm2, xm0                ; add top
358    paddd           xm2, xm3                ; left+cur
359    psrldq          xm0, 4
    ; shift count comes from the low 64 bits of a 16-byte memory operand
    ; NOTE(review): assumes those 64 bits hold only ar_coeff_shift - confirm
    ; against the FGData layout
360    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
361    ; skip packssdw because we only care about one value
362    pminsd          xm2, xm12
363    pmaxsd          xm2, xm13
364    pextrw  [bufq+xq*2], xm2, 0
    ; feed the result back as the left neighbour (word 2 after the shift)
365    pslldq          xm2, 4
366    psrldq          xm1, 2
367    pblendw         xm1, xm2, 0100b
368    inc              xq
369    jz .x_loop_ar3_end
    ; recompute the top sums every 4 entries
370    test             xb, 3
371    jnz .x_loop_ar3_inner
372    jmp .x_loop_ar3
373.x_loop_ar3_end:
374    add            bufq, 82*2
375    dec              hd
376    jg .y_loop_ar3
377    RET
378
379%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
380INIT_XMM avx2
381cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
382%define base r8-generate_grain_uv_%1_16bpc_avx2_table
383    lea              r8, [generate_grain_uv_%1_16bpc_avx2_table]
384    movifnidn    bdmaxd, bdmaxm
385    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
386    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
387    movq            xm1, [base+next_upperbit_mask]
388    lea             r6d, [bdmaxq+1]
389    movq            xm4, [base+mul_bits]
390    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
391    movq            xm5, [base+hmul_bits]
392    sub              r5, r6
393    mova            xm6, [base+pb_mask]
394    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
395    vpbroadcastw    xm7, [base+round+r5*2-2]
396    pxor            xm0, xm2
397    lea              r6, [gaussian_sequence]
398%if %2
399    mov             r7d, 73-35*%3
400    add            bufq, 44*2
401.loop_y:
402    mov              r5, -44*2
403%else
404    mov              r5, -82*73*2
405    sub            bufq, r5
406%endif
407.loop_x:
408    pand            xm2, xm0, xm1
409    psrlw           xm3, xm2, 10
410    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
411    pmullw          xm2, xm4            ; bits 0x0f00 are set
412    pmulhuw         xm0, xm5
413    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
414    psllq           xm2, xm3, 30
415    por             xm2, xm3
416    psllq           xm3, xm2, 15
417    por             xm2, xm0            ; aggregate each bit into next seed's high bit
418    por             xm2, xm3            ; 4 next output seeds
419    pshuflw         xm0, xm2, q3333
420    psrlw           xm2, 5
421    movq            r10, xm2
422    movzx           r9d, r10w
423    movd            xm2, [r6+r9*2]
424    rorx             r9, r10, 32
425    shr            r10d, 16
426    pinsrw          xm2, [r6+r10*2], 1
427    movzx          r10d, r9w
428    pinsrw          xm2, [r6+r10*2], 2
429    shr             r9d, 16
430    pinsrw          xm2, [r6+r9*2], 3
431    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
432    pmulhrsw        xm2, xm7            ; shifts by 0, which pmulhrsw does not support
433    movq      [bufq+r5], xm2
434    add              r5, 8
435    jl .loop_x
436%if %2
437    add            bufq, 82*2
438    dec             r7d
439    jg .loop_y
440%endif
441
442    ; auto-regression code
443    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
444    movsxd           r6, [r8+r6*4]
445    add              r6, r8
446    jmp              r6
447
448INIT_YMM avx2
449.ar0:
450    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
451    imul            uvd, 28
452    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
453    vpbroadcastb     m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
454    sar          bdmaxd, 1
455    vpbroadcastd     m4, [base+gen_ar0_shift-24+shiftq*4]
456    movd            xm6, bdmaxd
457    pcmpeqw          m7, m7
458    pmaddubsw        m4, m0  ; ar_coeff << (14 - shift)
459    vpbroadcastw     m6, xm6 ; max_gain
460    pxor             m7, m6  ; min_grain
461    DEFINE_ARGS buf, bufy, h, x
462%if %2
463    vpbroadcastw     m5, [base+hmul_bits+2+%3*2]
464    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
465%else
466    sub            bufq, 2*(82*70-3)
467%endif
468    add           bufyq, 2*(3+82*3)
469    mov              hd, 70-35*%3
470.y_loop_ar0:
471%if %2
472    ; first 32 pixels
473    movu            xm0, [bufyq+16*0]
474    vinserti128      m0, [bufyq+16*2], 1
475    movu            xm1, [bufyq+16*1]
476    vinserti128      m1, [bufyq+16*3], 1
477%if %3
478    movu            xm2, [bufyq+82*2+16*0]
479    vinserti128      m2, [bufyq+82*2+16*2], 1
480    movu            xm3, [bufyq+82*2+16*1]
481    vinserti128      m3, [bufyq+82*2+16*3], 1
482    paddw            m0, m2
483    paddw            m1, m3
484%endif
485    phaddw           m0, m1
486    movu            xm1, [bufyq+16*4]
487    vinserti128      m1, [bufyq+16*6], 1
488    movu            xm2, [bufyq+16*5]
489    vinserti128      m2, [bufyq+16*7], 1
490%if %3
491    movu            xm3, [bufyq+82*2+16*4]
492    vinserti128      m3, [bufyq+82*2+16*6], 1
493    paddw            m1, m3
494    movu            xm3, [bufyq+82*2+16*5]
495    vinserti128      m3, [bufyq+82*2+16*7], 1
496    paddw            m2, m3
497%endif
498    phaddw           m1, m2
499    pmulhrsw         m0, m5
500    pmulhrsw         m1, m5
501%else
502    xor              xd, xd
503.x_loop_ar0:
504    movu             m0, [bufyq+xq*2]
505    movu             m1, [bufyq+xq*2+32]
506%endif
507    paddw            m0, m0
508    paddw            m1, m1
509    pmulhrsw         m0, m4
510    pmulhrsw         m1, m4
511%if %2
512    paddw            m0, [bufq+ 0]
513    paddw            m1, [bufq+32]
514%else
515    paddw            m0, [bufq+xq*2+ 0]
516    paddw            m1, [bufq+xq*2+32]
517%endif
518    pminsw           m0, m6
519    pminsw           m1, m6
520    pmaxsw           m0, m7
521    pmaxsw           m1, m7
522%if %2
523    movu      [bufq+ 0], m0
524    movu      [bufq+32], m1
525
526    ; last 6 pixels
527    movu            xm0, [bufyq+32*4]
528    movu            xm1, [bufyq+32*4+16]
529%if %3
530    paddw           xm0, [bufyq+32*4+82*2]
531    paddw           xm1, [bufyq+32*4+82*2+16]
532%endif
533    phaddw          xm0, xm1
534    movu            xm1, [bufq+32*2]
535    pmulhrsw        xm0, xm5
536    paddw           xm0, xm0
537    pmulhrsw        xm0, xm4
538    paddw           xm0, xm1
539    pminsw          xm0, xm6
540    pmaxsw          xm0, xm7
541    vpblendd        xm0, xm1, 0x08
542    movu    [bufq+32*2], xm0
543%else
544    movu [bufq+xq*2+ 0], m0
545    movu [bufq+xq*2+32], m1
546    add              xd, 32
547    cmp              xd, 64
548    jl .x_loop_ar0
549
550    ; last 12 pixels
551    movu             m0, [bufyq+64*2]
552    movu             m1, [bufq+64*2]
553    paddw            m0, m0
554    pmulhrsw         m0, m4
555    paddw            m0, m1
556    pminsw           m0, m6
557    pmaxsw           m0, m7
558    vpblendd         m0, m1, 0xc0
559    movu    [bufq+64*2], m0
560%endif
561    add            bufq, 82*2
562    add           bufyq, 82*2<<%3
563    dec              hd
564    jg .y_loop_ar0
565    RET
566
567INIT_XMM avx2
568.ar1:
569    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
570    imul            uvd, 28
571    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
572    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
573    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
574    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
575    DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
576    pmovsxbw        xm4, xm4
577    pshufd          xm5, xm4, q1111
578    pshufd          xm4, xm4, q0000
579    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
580    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
581    vpbroadcastd    xm3, xm3
582%if %2
583    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
584%else
585    sub            bufq, 2*(82*69+3)
586%endif
587    add           bufyq, 2*(79+82*3)
588    mov              hd, 70-35*%3
589    sar            maxd, 1
590    mov            mind, maxd
591    xor            mind, -1
592.y_loop_ar1:
593    mov              xq, -(76>>%2)
594    movsx         val3d, word [bufq+xq*2-2]
595.x_loop_ar1:
596    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
597%if %2
598    movu            xm2, [bufyq+xq*4]
599%else
600    movq            xm2, [bufyq+xq*2]
601%endif
602%if %2
603%if %3
604    phaddw          xm2, [bufyq+xq*4+82*2]
605    punpckhqdq      xm1, xm2, xm2
606    paddw           xm2, xm1
607%else
608    phaddw          xm2, xm2
609%endif
610    pmulhrsw        xm2, xm6
611%endif
612    psrldq          xm1, xm0, 4             ; top/right
613    punpcklwd       xm1, xm2
614    psrldq          xm2, xm0, 2             ; top
615    punpcklwd       xm0, xm2
616    pmaddwd         xm1, xm5
617    pmaddwd         xm0, xm4
618    paddd           xm1, xm3
619    paddd           xm0, xm1
620.x_loop_ar1_inner:
621    movd          val0d, xm0
622    psrldq          xm0, 4
623    imul          val3d, cf3d
624    add           val3d, val0d
625    sarx          val3d, val3d, shiftd
626    movsx         val0d, word [bufq+xq*2]
627    add           val3d, val0d
628    cmp           val3d, maxd
629    cmovg         val3d, maxd
630    cmp           val3d, mind
631    cmovl         val3d, mind
632    mov word [bufq+xq*2], val3w
633    ; keep val3d in-place as left for next x iteration
634    inc              xq
635    jz .x_loop_ar1_end
636    test             xb, 3
637    jnz .x_loop_ar1_inner
638    jmp .x_loop_ar1
639.x_loop_ar1_end:
640    add            bufq, 82*2
641    add           bufyq, 82*2<<%3
642    dec              hd
643    jg .y_loop_ar1
644    RET
645
646INIT_YMM avx2
647.ar2:
648%if WIN64
649    ; xmm6 and xmm7 already saved
650    %assign xmm_regs_used 13 + %2
651    %assign stack_size_padded 136
652    SUB             rsp, stack_size_padded
653    movaps   [rsp+16*2], xmm8
654    movaps   [rsp+16*3], xmm9
655    movaps   [rsp+16*4], xmm10
656    movaps   [rsp+16*5], xmm11
657    movaps   [rsp+16*6], xmm12
658%if %2
659    movaps   [rsp+16*7], xmm13
660%endif
661%endif
662    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
663    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
664    imul            uvd, 28
665    vbroadcasti128  m10, [base+gen_shufA]
666    sar          bdmaxd, 1
667    vbroadcasti128  m11, [base+gen_shufB]
668    movd            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
669    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
670    pinsrb          xm7, [base+pb_1], 5
671    pinsrw          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
672    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
673    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
674    pmovsxbw         m7, xm7
675    movd            xm8, bdmaxd             ; max_grain
676    pshufd           m4, m7, q0000
677    vpbroadcastw   xm12, [base+round_vals-12+shiftq*2]
678    pshufd           m5, m7, q1111
679    pcmpeqd         xm9, xm9
680    pshufd           m6, m7, q2222
681    pxor            xm9, xm8                ; min_grain
682    pshufd          xm7, xm7, q3333
683    DEFINE_ARGS buf, bufy, fg_data, h, x
684%if %2
685    vpbroadcastw   xm13, [base+hmul_bits+2+%3*2]
686    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
687%else
688    sub            bufq, 2*(82*69+3)
689%endif
690    add           bufyq, 2*(79+82*3)
691    mov              hd, 70-35*%3
692.y_loop_ar2:
693    mov              xq, -(76>>%2)
694.x_loop_ar2:
695    vbroadcasti128   m3, [bufq+xq*2-82*2-4]        ; y=-1,x=[-2,+5]
696    vinserti128      m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
697    pshufb           m0, m2, m10                   ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
698    pmaddwd          m0, m4
699    pshufb           m1, m2, m11                   ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
700    pmaddwd          m1, m5
701    punpckhwd        m2, m3                        ; y=-2/-1 interleaved, x=[+2,+5]
702%if %2
703    movu            xm3, [bufyq+xq*4]
704%if %3
705    paddw           xm3, [bufyq+xq*4+82*2]
706%endif
707    phaddw          xm3, xm3
708    pmulhrsw        xm3, xm13
709%else
710    movq            xm3, [bufyq+xq*2]
711%endif
712    punpcklwd       xm3, xm12                   ; luma, round interleaved
713    vpblendd         m2, m3, 0x0f
714    pmaddwd          m2, m6
715    paddd            m1, m0
716    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
717    paddd            m2, m1
718    vextracti128    xm1, m2, 1
719    paddd           xm2, xm1
720    pshufd          xm1, xm0, q3321
721    pmovsxwd        xm1, xm1                ; y=0,x=[0,3] in dword
722.x_loop_ar2_inner:
723    pmaddwd         xm3, xm7, xm0
724    paddd           xm3, xm2
725    psrldq          xm2, 4                  ; shift top to next pixel
726    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
727    ; we do not need to packssdw since we only care about one value
728    paddd           xm3, xm1
729    psrldq          xm1, 4
730    pminsd          xm3, xm8
731    pmaxsd          xm3, xm9
732    pextrw  [bufq+xq*2], xm3, 0
733    psrldq          xm0, 2
734    pslldq          xm3, 2
735    pblendw         xm0, xm3, 00000010b
736    inc              xq
737    jz .x_loop_ar2_end
738    test             xb, 3
739    jnz .x_loop_ar2_inner
740    jmp .x_loop_ar2
741.x_loop_ar2_end:
742    add            bufq, 82*2
743    add           bufyq, 82*2<<%3
744    dec              hd
745    jg .y_loop_ar2
746    RET
747
748.ar3:
749%if WIN64
750    ; xmm6 and xmm7 already saved
751    %assign stack_offset 32
752    %assign xmm_regs_used 14 + %2
753    %assign stack_size_padded 152
754    SUB             rsp, stack_size_padded
755    movaps   [rsp+16*2], xmm8
756    movaps   [rsp+16*3], xmm9
757    movaps   [rsp+16*4], xmm10
758    movaps   [rsp+16*5], xmm11
759    movaps   [rsp+16*6], xmm12
760    movaps   [rsp+16*7], xmm13
761%if %2
762    movaps   [rsp+16*8], xmm14
763%endif
764%endif
765    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
766    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
767    imul            uvd, 28
768    vpbroadcastw   xm11, [base+round_vals-12+shiftq*2]
769    sar          bdmaxd, 1
770    movq            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
771    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
772    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
773    pmovsxbw         m7, xm7
774%if %2
775    vpbroadcastw   xm14, [base+hmul_bits+2+%3*2]
776%endif
777    pshufd           m4, m7, q0000
778    pshufd           m5, m7, q1111
779    pshufd           m6, m7, q2222
780    pshufd           m7, m7, q3333
781    movd            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
782    pinsrb          xm0, [base+pb_1], 3
783    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
784    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
785    pmovsxbw         m0, xm0
786    movd           xm12, bdmaxd                 ; max_grain
787    pshufd           m8, m0, q0000
788    pshufd           m9, m0, q1111
789    pcmpeqd        xm13, xm13
790    punpckhqdq     xm10, xm0, xm0
791    pxor           xm13, xm12                   ; min_grain
792    pinsrw         xm10, [base+round_vals-10+shiftq*2], 3
793    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
794%if %2
795    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
796%else
797    sub            bufq, 2*(82*69+3)
798%endif
799    add           bufyq, 2*(79+82*3)
800    mov              hd, 70-35*%3
801.y_loop_ar3:
802    mov              xq, -(76>>%2)
803.x_loop_ar3:
804    movu            xm2, [bufq+xq*2-82*6-6+ 0]    ; y=-3,x=[-3,+4]
805    vinserti128      m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
806    movq            xm1, [bufq+xq*2-82*6-6+16]    ; y=-3,x=[+5,+8]
807    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
808    palignr          m3, m1, m2, 2                ; y=-3/-2,x=[-2,+5]
809    palignr          m1, m2, 12                   ; y=-3/-2,x=[+3,+6]
810    punpcklwd        m0, m2, m3                   ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
811    punpckhwd        m2, m3                       ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
812    shufps           m3, m0, m2, q1032            ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
813    pmaddwd          m0, m4
814    pmaddwd          m2, m6
815    pmaddwd          m3, m5
816    paddd            m0, m2
817    paddd            m0, m3
818    movu            xm2, [bufq+xq*2-82*2-6+ 0]    ; y=-1,x=[-3,+4]
819    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
820%if %2
821    movu            xm3, [bufyq+xq*4]
822%if %3
823    paddw           xm3, [bufyq+xq*4+82*2]
824%endif
825    phaddw          xm3, xm3
826    pmulhrsw        xm3, xm14
827%else
828    movq            xm3, [bufyq+xq*2]
829%endif
830    punpcklwd        m1, m3
831    pmaddwd          m1, m7
832    paddd            m0, m1
833    psrldq           m1, m2, 4
834    psrldq           m3, m2, 6
835    vpblendd         m3, m11, 0x0f                ; rounding constant
836    punpcklwd        m1, m3                       ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
837    pmaddwd          m1, m9                       ;      x=[+2/+3,+3/+4,+4/+5,+5,+6]
838    psrldq           m3, m2, 2
839    punpcklwd        m2, m3                       ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
840    pmaddwd          m2, m8                       ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
841    paddd            m0, m1
842    movu            xm1, [bufq+xq*2-6]            ; y=0,x=[-3,+4]
843    paddd            m0, m2
844    vextracti128    xm2, m0, 1
845    paddd           xm0, xm2
846.x_loop_ar3_inner:
847    pmaddwd         xm2, xm1, xm10
848    pshuflw         xm3, xm2, q1032
849    paddd           xm2, xm0                      ; add top
850    paddd           xm2, xm3                      ; left+cur
851    psrldq          xm0, 4
852    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
853    psrldq          xm1, 2
854    ; no need to packssdw since we only care about one value
855    pminsd          xm2, xm12
856    pmaxsd          xm2, xm13
857    pextrw  [bufq+xq*2], xm2, 0
858    pslldq          xm2, 4
859    pblendw         xm1, xm2, 00000100b
860    inc              xq
861    jz .x_loop_ar3_end
862    test             xb, 3
863    jnz .x_loop_ar3_inner
864    jmp .x_loop_ar3
865.x_loop_ar3_end:
866    add            bufq, 82*2
867    add           bufyq, 82*2<<%3
868    dec              hd
869    jg .y_loop_ar3
870    RET
871%endmacro
872
;-----------------------------------------------------------------------
; fgy_32x32xn_16bpc: AVX2 film grain application for 16-bit samples.
;
; For every column strip of 32 pixels (wq advances by 32 per strip) and
; every row (hm rows), the code computes
;     noise = round2(scaling[src] * grain, scaling_shift)
;     dst   = clamp(src + noise, fg_min, fg_max)
; where grain is read from grain_lut at a pseudo-random offset derived
; from a 16-bit LFSR seed that is stepped once per strip.
;
; x86inc declaration: 6 arguments are loaded into registers, 14 GPRs and
; 16 vector registers are used. Stack arguments read below: r7m (as hm),
; r8m (as sbym) and r9m (bdmax, per the comment on its first load).
;
; Four loop bodies exist and jump into each other:
;   .loop_y            no overlap
;   .loop_y_h_overlap  first 2 columns blended with the previous strip
;   .loop_y_v_overlap  first 2 rows blended with the block above
;   .loop_y_hv_overlap both of the above
;
; Vector register roles, set up once and kept for the whole function:
;   m8       r9m (bdmax) broadcast per dword; ANDed with the pixel data to
;            build gather indices (NOTE(review): relies on bdmax fitting in
;            16 bits so that one pixel per dword survives the AND)
;   m9, m10  grain_min / grain_max entry selected by is_12bpc; used to
;            clamp blended grain. m9 is also used as the vpgatherdd mask.
;   m11      mul_bits entry selected by scaling_shift
;   m12, m13 fg_min / fg_max entries; final output clamp
;   m14      pd_16, rounding term added before the >>5 of overlap blends
;   m15      vertical overlap weights (reloaded per strip / per row)
;   xm7      horizontal overlap weights in .loop_y_h_overlap
;-----------------------------------------------------------------------
cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, unused, sby, see
%define base r11-grain_min
    lea             r11, [grain_min]
    mov             r6d, r9m ; bdmax
    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov            sbyd, sbym
    vpbroadcastd     m8, r9m
    shr             r6d, 11  ; is_12bpc
    vpbroadcastd     m9, [base+grain_min+r6*4]
    ; r10d = clip << is_12bpc: index into fg_min
    shlx           r10d, r9d, r6d
    vpbroadcastd    m10, [base+grain_max+r6*4]
    ; r9d = is_12bpc + clip*4: index into fg_max
    lea             r9d, [r6+r9*4]
    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
    vpbroadcastd    m12, [base+fg_min+r10*4]
    vpbroadcastd    m13, [base+fg_max+r9*4]
    ; r7b = (sby != 0); vertical overlap is used only when this AND
    ; fg_data.overlap_flag is non-zero
    test           sbyd, sbyd
    setnz           r7b
    vpbroadcastd    m14, [base+pd_16]
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz .vertical_overlap

    ; Initial 16-bit seed for this row of blocks:
    ;   high byte = (sby*173 + 105) & 0xff, low byte = (sby*37 + 178) & 0xff,
    ; obtained by doing both multiply-adds in one 32-bit register and
    ; rotating the top byte down next to the bottom byte; then XORed with
    ; fg_data.seed.
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    ; src_bak = one past the last pixel of the row; w becomes a negative
    ; counter running up to 0, and dst is turned into (dst - src) so that a
    ; single pointer (srcq) walks both buffers.
    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

.loop_x:
    ; Step the 16-bit LFSR. After OR-ing with 0xEFF4 only bits 0, 1, 3 and 12
    ; of the seed are left variable, so the parity flag of (low byte AND
    ; high byte) is the parity of those four bits. r6 already holds seed>>1;
    ; lea does not touch flags, so cmovp picks seed>>1 for even parity and
    ; (seed>>1)+0x8000 for odd parity, i.e. the new bit 15 is the XOR of
    ; bits 0, 1, 3 and 12.
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed
    ; offy = seed bits 8-11, offx = seed bits 12-15.
    ; 164 = 2*82 and 747 = 9*82+9, with 82 being the grain_lut row pitch in
    ; entries (grain_lutq advances by 82*2 bytes per row below).
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y:
    ; scaling[src]
    ; Each dword of m0/m1 holds two pixels. The low pixel (pand with m8) and
    ; the high pixel (psrld 16, then pand) are gathered separately. The "-2"
    ; displacement of the second gather places the wanted byte in the low
    ; byte of the upper word, so pblendw 0x55 can merge both gathers into a
    ; vector whose low byte of every word is the scaling value.
    ; vpgatherdd clears its mask register, hence the m9 <-> m6 copies: the
    ; mask value is saved before each gather, and after the fourth gather m9
    ; holds its original value again while m6 is left cleared.
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m4, m2, 0x55
    psrld            m2, m1, 16
    mova             m9, m6
    pand             m2, m8
    vpgatherdd       m5, [scalingq+m2-2], m6
    pblendw          m5, m3, 0x55

    ; noise = round2(scaling[src] * grain, scaling_shift)
    ; pmaddubsw multiplies the unsigned bytes of m4/m5 with the signed bytes
    ; of m11 and adds each byte pair; the result is doubled and then used as
    ; the pmulhrsw multiplier for the grain values.
    ; NOTE(review): mul_bits is defined outside this view; presumably its
    ; high byte is zero so the stray high byte of each gathered word is
    ; discarded - confirm against the table.
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m4, [grain_lutq+offxyq*2]
    pmulhrsw         m5, [grain_lutq+offxyq*2+32]

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    ; dstq holds (dst - src), so dstq+srcq addresses the output row
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hd
    jg .loop_y
    ; next 32-pixel strip; w counts up from -width and the row is done at >= 0
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je .loop_x
    ; overlap enabled: every strip after the first blends its left edge.
    ; xm7 = horizontal weights, needed if .loop_x_h_overlap is entered.
    movq            xm7, [pw_27_17_17_27]
    cmp       dword r8m, 0 ; sby
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    ; same LFSR step as in .loop_x
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy

    ; offy aliases the register that still holds the previous strip's offxy
    lea     left_offxyd, [offyq+32]         ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    ; scaling[src]
    ; (same gather sequence and m9/m6 mask handling as in .loop_y)
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m4, m2, 0x55
    psrld            m2, m1, 16
    mova             m9, m6
    pand             m2, m8
    vpgatherdd       m5, [scalingq+m2-2], m6
    pblendw          m5, m3, 0x55

    ; grain = grain_lut[offy+y][offx+x]
    ; Blend the first two columns with the previous strip's grain:
    ;   g0 = (27*left0 + 17*cur0 + 16) >> 5
    ;   g1 = (17*left1 + 27*cur1 + 16) >> 5
    ; clamp to [grain_min, grain_max] and insert as dword 0 of m3.
    movu             m3, [grain_lutq+offxyq*2]
    movd            xm6, [grain_lutq+left_offxyq*2]
    punpcklwd       xm6, xm3
    pmaddwd         xm6, xm7
    paddd           xm6, xm14
    psrad           xm6, 5
    packssdw        xm6, xm6
    pmaxsw          xm6, xm9
    pminsw          xm6, xm10
    vpblendd         m3, m6, 0x01

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m4, m11
    pmaddubsw        m5, m11
    paddw            m4, m4
    paddw            m5, m5
    pmulhrsw         m4, m3
    pmulhrsw         m5, [grain_lutq+offxyq*2+32]

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hd
    jg .loop_y_h_overlap
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]
    ; this loop is also the tail of .loop_y_hv_overlap, so the next strip
    ; must go back to the h+v path when sby != 0
    cmp       dword r8m, 0 ; sby
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see, src_bak

    ; Build two 16-bit seeds in one register: the one for this block row in
    ; the upper half and the one for the row above (sby-1) in the lower half.
    ; The adjusted constants are the sby-1 forms modulo 256:
    ;   173*(sby-1)+105 = 173*sby+188,  37*(sby-1)+178 = 37*sby+141.
    ; r7d carries the two "*173" bytes (bits 0-7 and 16-23), sbyd the two
    ; "*37" bytes (bits 8-15 and 24-31); both are XORed into the duplicated
    ; fg_data.seed.
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    lea        src_bakq, [srcq+wq*2]
    neg              wq
    sub            dstq, srcq

.loop_x_v_overlap:
    ; weights for the first overlapped row: 27*top + 17*cur
    vpbroadcastd    m15, [pw_27_17_17_27]

    ; Step both packed LFSRs at once. The parity of each half is collected
    ; into bit 0 (cur) and bit 16 (top) of r7d; XOR with the seed whose bits
    ; 0 and 16 were forced to 1 turns those two bits into the feedback bits,
    ; and the single 32-bit rotate then moves bit 0 to bit 31 and bit 16 to
    ; bit 15 while shifting the remaining bits of each half right by one.
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    ; Both offsets are computed in parallel in the two 16-bit halves.
    ; 0x10001*747 adds the base offset to both halves; the extra 32*82 lands
    ; in the low (top) half only, i.e. the top offset points 32 rows further
    ; down the grain LUT.
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    ; split the packed pair into two separate registers
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_v_overlap:
    ; scaling[src]
    ; (same gathers as in .loop_y; here the merged result is built in m2/m3
    ; by taking the odd words, mask 0xaa, from the second gather)
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m9, m6
    pand             m4, m8
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; Interleave top and current grain as {top, cur} word pairs so pmaddwd
    ; with m15 yields w_top*top + w_cur*cur per pixel; then (+16) >> 5, pack
    ; back to words and clamp to [grain_min, grain_max].
    movu             m6, [grain_lutq+offxyq*2]
    movu             m5, [grain_lutq+top_offxyq*2]
    punpcklwd        m4, m5, m6
    punpckhwd        m5, m6
    pmaddwd          m4, m15
    pmaddwd          m5, m15
    movu             m7, [grain_lutq+offxyq*2+32]
    movu             m6, [grain_lutq+top_offxyq*2+32]
    paddd            m4, m14
    paddd            m5, m14
    psrad            m4, 5
    psrad            m5, 5
    packssdw         m4, m5
    punpcklwd        m5, m6, m7
    punpckhwd        m6, m7
    pmaddwd          m5, m15
    pmaddwd          m6, m15
    paddd            m5, m14
    paddd            m6, m14
    psrad            m5, 5
    psrad            m6, 5
    packssdw         m5, m6
    pmaxsw           m4, m9
    pmaxsw           m5, m9
    pminsw           m4, m10
    pminsw           m5, m10

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m11
    pmaddubsw        m3, m11
    paddw            m2, m2
    paddw            m3, m3
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    ; only the low byte of h is decremented/tested here because bit 31 of hd
    ; is used below as a "one overlapped row already done" flag
    dec              hb
    jz .end_y_v_overlap
    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    ; First pass: bit 31 gets set, no carry -> loop once more.
    ; Second pass: bit 31 wraps to 0 with carry -> hd is the plain remaining
    ; row count again and the non-overlap loop takes over.
    add              hd, 0x80000000
    jnc .loop_y_v_overlap
    jmp .loop_y
.end_y_v_overlap:
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq*2]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

.loop_x_hv_overlap:
    vpbroadcastd    m15, [pw_27_17_17_27]

    ; (same packed two-seed LFSR step as in .loop_x_v_overlap)
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    ; the previous strip's offsets, moved 32 columns right, become the
    ; left / top-left neighbours of the new strip
    lea  topleft_offxyd, [top_offxyq+32]
    lea     left_offxyd, [offyq+32]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_hv_overlap:
    ; scaling[src]
    ; (same gathers and m9/m6 mask handling as in .loop_y_v_overlap)
    mova             m0, [srcq+ 0]
    mova             m1, [srcq+32]
    pand             m4, m8, m0
    psrld            m3, m0, 16
    mova             m6, m9
    vpgatherdd       m2, [scalingq+m4-0], m9
    pand             m3, m8
    mova             m9, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pand             m5, m8, m1
    mova             m6, m9
    vpgatherdd       m3, [scalingq+m5-0], m9
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m9, m6
    pand             m4, m8
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu             m7, [grain_lutq+offxyq*2]
    movd            xm6, [grain_lutq+left_offxyq*2]
    movu             m5, [grain_lutq+top_offxyq*2]
    movd            xm4, [grain_lutq+topleft_offxyq*2]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    ; xm6 = {left0,cur0,left1,cur1, topleft0,top0,topleft1,top1}: one pmaddwd
    ; blends the two leading columns of both the current and the top row.
    punpcklwd       xm6, xm7
    punpcklwd       xm4, xm5
    punpcklqdq      xm6, xm4
    movddup         xm4, [pw_27_17_17_27]
    pmaddwd         xm6, xm4
    paddd           xm6, xm14
    psrad           xm6, 5
    packssdw        xm6, xm6
    pmaxsw          xm6, xm9
    pminsw          xm6, xm10
    ; dword 0 of xm6 = blended cur pair; pshuflw brings the blended top pair
    ; to dword 0 of xm4. The remaining dwords are refilled from the
    ; unblended cur (m7) and top (m5) rows.
    pshuflw         xm4, xm6, q1032
    vpblendd         m6, m7, 0xfe
    vpblendd         m4, m5, 0xfe
    ; followed by v interpolation (top | cur -> cur)
    punpckhwd        m5, m7
    pmaddwd          m5, m15
    punpcklwd        m4, m6
    pmaddwd          m4, m15
    movu             m7, [grain_lutq+offxyq*2+32]
    movu             m6, [grain_lutq+top_offxyq*2+32]
    paddd            m5, m14
    paddd            m4, m14
    psrad            m5, 5
    psrad            m4, 5
    packssdw         m4, m5
    punpcklwd        m5, m6, m7
    punpckhwd        m6, m7
    pmaddwd          m5, m15
    pmaddwd          m6, m15
    paddd            m5, m14
    paddd            m6, m14
    psrad            m5, 5
    psrad            m6, 5
    packssdw         m5, m6
    pmaxsw           m4, m9
    pmaxsw           m5, m9
    pminsw           m4, m10
    pminsw           m5, m10

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m11
    pmaddubsw        m3, m11
    paddw            m2, m2
    paddw            m3, m3
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m12
    pmaxsw           m1, m12
    pminsw           m0, m13
    pminsw           m1, m13
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+32], m1

    add            srcq, strideq
    add      grain_lutq, 82*2
    dec              hb
    jz .end_y_hv_overlap
    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    ; (bit 31 of hd is the same two-pass flag as in .loop_y_v_overlap)
    add              hd, 0x80000000
    jnc .loop_y_hv_overlap
    ; m7 was overwritten with grain data above; restore the horizontal
    ; weights that .loop_y_h_overlap expects in xm7
    movq            xm7, [pw_27_17_17_27]
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add              wq, 32
    ; lea leaves the flags untouched, so jl still tests the result of the add
    lea            srcq, [src_bakq+wq*2]
    jl .loop_x_hv_overlap
.end:
    RET
1339
1340%macro FGUV_FN 3 ; name, ss_hor, ss_ver
1341cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1342                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
1343%define base r12-grain_min
1344    lea             r12, [grain_min]
1345    mov             r9d, r13m               ; bdmax
1346    mov             r7d, [fg_dataq+FGData.scaling_shift]
1347    mov            r11d, is_idm
1348    mov            sbyd, sbym
1349    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
1350    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1351    shr             r9d, 11                 ; is_12bpc
1352    vpbroadcastd     m8, [base+grain_min+r9*4]
1353    shlx           r10d, r6d, r9d
1354    vpbroadcastd     m9, [base+grain_max+r9*4]
1355    vpbroadcastw    m10, r13m
1356    shlx            r6d, r6d, r11d
1357    vpbroadcastd    m12, [base+fg_min+r10*4]
1358    lea             r6d, [r9+r6*2]
1359    vpbroadcastd    m13, [base+fg_max+r6*4]
1360    test           sbyd, sbyd
1361    setnz           r7b
1362    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1363    jne .csfl
1364
1365%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
1366    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1367                unused, sby, see, overlap
1368
1369%if %1
1370    mov             r6d, r11m
1371    vpbroadcastd     m0, [base+pb_8_9_0_1]
1372    vpbroadcastd     m1, [base+uv_offset_mul+r9*4]
1373    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
1374    vpbroadcastd    m15, [fg_dataq+FGData.uv_offset+r6*4]
1375    pshufb          m14, m0 ; { uv_luma_mult, uv_mult }
1376    pmaddwd         m15, m1
1377%else
1378%if %2
1379    vpbroadcastq    m15, [base+pw_23_22]
1380%else
1381    vpbroadcastq    m15, [base+pw_27_17_17_27]
1382%endif
1383    vpbroadcastd    m14, [base+pd_16]
1384%endif
1385    test            r7b, [fg_dataq+FGData.overlap_flag]
1386    jnz %%vertical_overlap
1387
1388    imul           seed, sbyd, (173 << 24) | 37
1389    add            seed, (105 << 24) | 178
1390    rorx           seed, seed, 24
1391    movzx          seed, seew
1392    xor            seed, [fg_dataq+FGData.seed]
1393
1394    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1395                unused2, unused3, see, unused4, unused5, unused6, luma, lstride
1396
1397    mov           lumaq, r9mp
1398    mov        lstrideq, r10mp
1399    lea             r10, [srcq+wq*2]
1400    lea             r11, [dstq+wq*2]
1401    lea             r12, [lumaq+wq*(2<<%2)]
1402    mov            r9mp, r10
1403    mov           r11mp, r11
1404    mov           r12mp, r12
1405    neg              wq
1406
1407%%loop_x:
1408    rorx             r6, seeq, 1
1409    or             seed, 0xEFF4
1410    test           seeb, seeh
1411    lea            seed, [r6+0x8000]
1412    cmovp          seed, r6d               ; updated seed
1413
1414    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1415                offx, offy, see, unused1, unused2, unused3, luma, lstride
1416
1417    rorx          offyd, seed, 8
1418    rorx          offxq, seeq, 12
1419    and           offyd, 0xf
1420    imul          offyd, 164>>%3
1421    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
1422
1423    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1424                h, offxy, see, unused1, unused2, unused3, luma, lstride
1425
1426    mov      grain_lutq, grain_lutmp
1427    mov              hd, hm
1428%%loop_y:
1429    ; luma_src
1430%if %2
1431    mova            xm2, [lumaq+lstrideq*0+ 0]
1432    vinserti128      m2, [lumaq+lstrideq*0+32], 1
1433    mova            xm4, [lumaq+lstrideq*0+16]
1434    vinserti128      m4, [lumaq+lstrideq*0+48], 1
1435    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
1436    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
1437    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
1438    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
1439    phaddw           m2, m4
1440    phaddw           m3, m5
1441    pxor             m4, m4
1442    pavgw            m2, m4
1443    pavgw            m3, m4
1444%elif %1
1445    mova             m2, [lumaq+ 0]
1446    mova             m3, [lumaq+32]
1447%endif
1448%if %1
1449    mova             m0, [srcq]
1450%if %2
1451    mova             m1, [srcq+strideq]
1452%else
1453    mova             m1, [srcq+32]
1454%endif
1455    punpckhwd        m4, m2, m0
1456    punpcklwd        m2, m0
1457    punpckhwd        m5, m3, m1
1458    punpcklwd        m3, m1                 ; { luma, chroma }
1459    REPX {pmaddwd x, m14}, m4, m2, m5, m3
1460    REPX {paddd   x, m15}, m4, m2, m5, m3
1461    REPX {psrad   x, 6  }, m4, m2, m5, m3
1462    packusdw         m2, m4
1463    packusdw         m3, m5
1464    pminuw           m2, m10
1465    pminuw           m3, m10                ; clip_pixel()
1466%elif %2
1467    pand             m2, m10
1468    pand             m3, m10
1469%else
1470    pand             m2, m10, [lumaq+ 0]
1471    pand             m3, m10, [lumaq+32]
1472%endif
1473
1474    ; scaling[luma_src]
1475    vpbroadcastd     m7, [pd_m65536]
1476    pandn            m4, m7, m2
1477    mova             m6, m7
1478    vpgatherdd       m5, [scalingq+m4-0], m7
1479    psrld            m2, 16
1480    mova             m7, m6
1481    vpgatherdd       m4, [scalingq+m2-2], m6
1482    pblendw          m4, m5, 0x55
1483    pandn            m5, m7, m3
1484    mova             m6, m7
1485    vpgatherdd       m2, [scalingq+m5-0], m7
1486    psrld            m3, 16
1487    vpgatherdd       m5, [scalingq+m3-2], m6
1488    pblendw          m5, m2, 0x55
1489
1490    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1491    pmaddubsw        m4, m11
1492    pmaddubsw        m5, m11
1493    paddw            m4, m4
1494    paddw            m5, m5
1495    pmulhrsw         m4, [grain_lutq+offxyq*2]
1496%if %2
1497    pmulhrsw         m5, [grain_lutq+offxyq*2+82*2]
1498%else
1499    pmulhrsw         m5, [grain_lutq+offxyq*2+32]
1500%endif
1501
1502    ; dst = clip_pixel(src, noise)
1503%if %1
1504    paddw            m0, m4
1505    paddw            m1, m5
1506%else
1507    paddw            m0, m4, [srcq]
1508%if %2
1509    paddw            m1, m5, [srcq+strideq]
1510%else
1511    paddw            m1, m5, [srcq+32]
1512%endif
1513%endif
1514    pmaxsw           m0, m12
1515    pmaxsw           m1, m12
1516    pminsw           m0, m13
1517    pminsw           m1, m13
1518    mova         [dstq], m0
1519%if %2
1520    mova [dstq+strideq], m1
1521    lea            srcq, [srcq+strideq*2]
1522    lea            dstq, [dstq+strideq*2]
1523    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1524%else
1525    mova      [dstq+32], m1
1526    add            srcq, strideq
1527    add            dstq, strideq
1528    add           lumaq, lstrideq
1529%endif
1530    add      grain_lutq, 82*(2<<%2)
1531%if %2
1532    sub              hb, 2
1533%else
1534    dec              hb
1535%endif
1536    jg %%loop_y
1537    add              wq, 32>>%2
1538    jge .end
1539    mov            srcq, r9mp
1540    mov            dstq, r11mp
1541    mov           lumaq, r12mp
1542    lea            srcq, [srcq+wq*2]
1543    lea            dstq, [dstq+wq*2]
1544    lea           lumaq, [lumaq+wq*(2<<%2)]
1545    cmp byte [fg_dataq+FGData.overlap_flag], 0
1546    je %%loop_x
1547    cmp       dword r8m, 0 ; sby
1548    jne %%loop_x_hv_overlap
1549
1550    ; horizontal overlap (without vertical overlap)
1551%%loop_x_h_overlap:
1552    rorx             r6, seeq, 1
1553    or             seed, 0xEFF4
1554    test           seeb, seeh
1555    lea            seed, [r6+0x8000]
1556    cmovp          seed, r6d               ; updated seed
1557
1558    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1559                offx, offy, see, left_offxy, unused1, unused2, luma, lstride
1560
1561    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
1562    rorx          offyd, seed, 8
1563    rorx          offxq, seeq, 12
1564    and           offyd, 0xf
1565    imul          offyd, 164>>%3
1566    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1567
1568    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1569                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
1570
1571    mov      grain_lutq, grain_lutmp
1572    mov              hd, hm
1573%%loop_y_h_overlap:
1574    ; luma_src
1575%if %2
1576    mova            xm2, [lumaq+lstrideq*0+ 0]
1577    vinserti128      m2, [lumaq+lstrideq*0+32], 1
1578    mova            xm4, [lumaq+lstrideq*0+16]
1579    vinserti128      m4, [lumaq+lstrideq*0+48], 1
1580    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
1581    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
1582    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
1583    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
1584    phaddw           m2, m4
1585    phaddw           m3, m5
1586    pxor             m4, m4
1587    pavgw            m2, m4
1588    pavgw            m3, m4
1589%elif %1
1590    mova             m2, [lumaq]
1591    mova             m3, [lumaq+32]
1592%endif
1593%if %1
1594    mova             m0, [srcq]
1595%if %2
1596    mova             m1, [srcq+strideq]
1597%else
1598    mova             m1, [srcq+32]
1599%endif
1600    punpckhwd        m4, m2, m0
1601    punpcklwd        m2, m0
1602    punpckhwd        m5, m3, m1
1603    punpcklwd        m3, m1                 ; { luma, chroma }
1604    REPX {pmaddwd x, m14}, m4, m2, m5, m3
1605    REPX {paddd   x, m15}, m4, m2, m5, m3
1606    REPX {psrad   x, 6  }, m4, m2, m5, m3
1607    packusdw         m2, m4
1608    packusdw         m3, m5
1609    pminuw           m2, m10                ; clip_pixel()
1610    pminuw           m3, m10
1611%elif %2
1612    pand             m2, m10
1613    pand             m3, m10
1614%else
1615    pand             m2, m10, [lumaq+ 0]
1616    pand             m3, m10, [lumaq+32]
1617%endif
1618
1619    ; scaling[luma_src]
1620    vpbroadcastd     m7, [pd_m65536]
1621    pandn            m4, m7, m2
1622    mova             m6, m7
1623    vpgatherdd       m5, [scalingq+m4-0], m7
1624    psrld            m2, 16
1625    mova             m7, m6
1626    vpgatherdd       m4, [scalingq+m2-2], m6
1627    pblendw          m4, m5, 0x55
1628    pandn            m5, m7, m3
1629    mova             m6, m7
1630    vpgatherdd       m2, [scalingq+m5-0], m7
1631    psrld            m3, 16
1632    vpgatherdd       m5, [scalingq+m3-2], m6
1633    pblendw          m5, m2, 0x55
1634
1635    ; grain = grain_lut[offy+y][offx+x]
1636    movu             m2, [grain_lutq+offxyq*2]
1637%if %2
1638    movu             m3, [grain_lutq+offxyq*2+82*2]
1639%else
1640    movu             m3, [grain_lutq+offxyq*2+32]
1641%endif
1642    movd            xm6, [grain_lutq+left_offxyq*2]
1643%if %2
1644    pinsrw          xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
1645    punpckldq       xm7, xm2, xm3           ; {cur0, cur1}
1646    punpcklwd       xm6, xm7                ; {left0, cur0, left1, cur1}
1647%else
1648    punpcklwd       xm6, xm2
1649%endif
1650%if %1
1651%if %2
1652    vpbroadcastq    xm7, [pw_23_22]
1653%else
1654    movq            xm7, [pw_27_17_17_27]
1655%endif
1656    pmaddwd         xm6, xm7
1657    vpbroadcastd    xm7, [pd_16]
1658    paddd           xm6, xm7
1659%else
1660    pmaddwd         xm6, xm15
1661    paddd           xm6, xm14
1662%endif
1663    psrad           xm6, 5
1664    packssdw        xm6, xm6
1665    pmaxsw          xm6, xm8
1666    pminsw          xm6, xm9
1667    vpblendd         m2, m6, 0x01
1668%if %2
1669    pshuflw         xm6, xm6, q1032
1670    vpblendd         m3, m6, 0x01
1671%endif
1672
1673    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1674    pmaddubsw        m4, m11
1675    pmaddubsw        m5, m11
1676    paddw            m4, m4
1677    paddw            m5, m5
1678    pmulhrsw         m2, m4
1679    pmulhrsw         m3, m5
1680
1681    ; dst = clip_pixel(src, noise)
1682%if %1
1683    paddw            m0, m2
1684    paddw            m1, m3
1685%else
1686    paddw            m0, m2, [srcq]
1687%if %2
1688    paddw            m1, m3, [srcq+strideq]
1689%else
1690    paddw            m1, m3, [srcq+32]
1691%endif
1692%endif
1693    pmaxsw           m0, m12
1694    pmaxsw           m1, m12
1695    pminsw           m0, m13
1696    pminsw           m1, m13
1697    mova         [dstq], m0
1698%if %2
1699    mova [dstq+strideq], m1
1700    lea            srcq, [srcq+strideq*2]
1701    lea            dstq, [dstq+strideq*2]
1702    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1703%else
1704    mova      [dstq+32], m1
1705    add            srcq, strideq
1706    add            dstq, strideq
1707    add           lumaq, r10mp
1708%endif
1709    add      grain_lutq, 82*(2<<%2)
1710%if %2
1711    sub              hb, 2
1712%else
1713    dec              hb
1714%endif
1715    jg %%loop_y_h_overlap
1716    add              wq, 32>>%2
1717    jge .end
1718    mov            srcq, r9mp
1719    mov            dstq, r11mp
1720    mov           lumaq, r12mp
1721    lea            srcq, [srcq+wq*2]
1722    lea            dstq, [dstq+wq*2]
1723    lea           lumaq, [lumaq+wq*(2<<%2)]
1724    cmp       dword r8m, 0 ; sby
1725    jne %%loop_x_hv_overlap
1726    jmp %%loop_x_h_overlap
1727
1728%%vertical_overlap:
    ; Entry for block rows that blend with the block row above.  Packs two
    ; 16-bit seeds into one 32-bit register, then stashes the per-row end
    ; pointers so the column loops can index with a negative, upward-counting w.
1729    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
1730                sby, see, unused1, unused2, unused3, lstride
1731
1732    movzx          sbyd, sbyb                ; only the low 8 bits of sby are used
1733    imul           seed, [fg_dataq+FGData.seed], 0x00010001 ; x * 0x10001 == x + (x << 16)
1734    imul            r7d, sbyd, 173 * 0x00010001 ; sby*173 replicated into both halves
1735    imul           sbyd, 37 * 0x01000100     ; sby*37 placed at bit 8 and at bit 24
    ; The constants added to the low half are the high-half constants evaluated
    ; for sby-1: 188 == (105 - 173) & 0xff and 141 == 178 - 37.
1736    add             r7d, (105 << 16) | 188
1737    add            sbyd, (178 << 24) | (141 << 8)
1738    and             r7d, 0x00ff00ff          ; keep the low byte of each half
1739    and            sbyd, 0xff00ff00          ; keep the high byte of each half
1740    xor            seed, r7d
1741    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1742
1743    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1744                offx, offy, see, unused1, top_offxy, unused2, luma, lstride
1745
    ; r9mp/r10mp are read as luma pointer and luma stride before the r9 slot is
    ; reused below.  r10-r12 (unused1/top_offxy/unused2 under the current
    ; DEFINE_ARGS) only serve as scratch for the three end pointers.
1746    mov           lumaq, r9mp
1747    mov        lstrideq, r10mp
1748    lea             r10, [srcq+wq*2]         ; src + w*2 bytes
1749    lea             r11, [dstq+wq*2]         ; dst + w*2 bytes
1750    lea             r12, [lumaq+wq*(2<<%2)]  ; luma advances (2<<%2) bytes per unit of w
1751    mov            r9mp, r10                 ; end pointers are reloaded after every column
1752    mov           r11mp, r11
1753    mov           r12mp, r12
1754    neg              wq                      ; w now runs from -w up to 0
1755
1756%%loop_x_v_overlap:
    ; Per-column setup (top blending only): step both packed 16-bit seeds once
    ; and derive the grain_lut offsets of the current and of the top block.
1757    ; we assume from the block above that bits 8-15 of r7d are zeroed
1758    mov             r6d, seed
    ; 0xeff4 == ~0x100b: after the OR only bits 0, 1, 3 and 12 of each 16-bit
    ; seed can still be zero, and the bits forced to one contribute an even
    ; count to (low byte & high byte).  PF is therefore set exactly when the
    ; XOR of those four seed bits is 0.
1759    or             seed, 0xeff4eff4
1760    test           seeb, seeh
1761    setp            r7b                     ; parity of top_seed
1762    shr            seed, 16
1763    shl             r7d, 16
1764    test           seeb, seeh
1765    setp            r7b                     ; parity of cur_seed
    ; Forcing bit 0 of each half to one before the XOR inverts the setp result,
    ; so bit 0 / bit 16 end up holding the XOR of the four tested bits of
    ; cur_seed / top_seed.
1766    or              r6d, 0x00010001
1767    xor             r7d, r6d
    ; The rotate shifts both seeds right by one and drops the two new bits into
    ; bit 31 (cur_seed) and bit 15 (top_seed).
1768    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1769
1770    rorx          offyd, seed, 8            ; bits 8-11 of each seed  -> bits 0-3 / 16-19
1771    rorx          offxd, seed, 12           ; bits 12-15 of each seed -> bits 0-3 / 16-19
1772    and           offyd, 0xf000f
1773    and           offxd, 0xf000f
1774    imul          offyd, 164>>%3
1775    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; the trailing (32>>%3)*82 term is not multiplied by 0x10001, so it is
    ; added to the low (top) half only
1776    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1777
1778    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1779                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
1780
1781    mov      grain_lutq, grain_lutmp         ; the row loop advances grain_lutq, so reload it
1782    mov              hd, hm                  ; reload the row count for this column
1783    movzx    top_offxyd, offxyw              ; low half  = top_offxy
1784    shr          offxyd, 16                  ; high half = cur_offxy
1785%if %2 == 0
    ; the row loop reads one weight pair per row through r10 and steps it by 4
1786    lea             r10, [pw_27_17_17_27]
1787%endif
1788%%loop_y_v_overlap:
    ; Row loop for rows that are blended with the block above (no left
    ; blending).  With %2 set each pass produces two 32-byte rows (m0 -> [dstq],
    ; m1 -> [dstq+strideq]); otherwise it produces one 64-byte row split across
    ; m0/m1.
1789    ; luma_src
1790%if %2
    ; load 64 bytes from each of two luma rows (lstrideq*(1<<%3) apart) and
    ; average horizontally adjacent words: phaddw sums each pair, pavgw against
    ; zero turns the sum into (sum + 1) >> 1
1791    mova            xm2, [lumaq+lstrideq*0+ 0]
1792    vinserti128      m2, [lumaq+lstrideq*0+32], 1
1793    mova            xm4, [lumaq+lstrideq*0+16]
1794    vinserti128      m4, [lumaq+lstrideq*0+48], 1
1795    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
1796    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
1797    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
1798    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
1799    phaddw           m2, m4
1800    phaddw           m3, m5
1801    pxor             m4, m4
1802    pavgw            m2, m4
1803    pavgw            m3, m4
1804%elif %1
1805    mova             m2, [lumaq]
1806    mova             m3, [lumaq+32]
1807%endif
1808%if %1
    ; %1 set: the scaling index is a weighted mix of luma and chroma,
    ; (luma*a + chroma*b + m15) >> 6, saturated to unsigned 16 bit and capped at
    ; m10.  m14/m15/m10 are loaded outside this chunk; m14 presumably holds the
    ; {luma, chroma} multiplier pairs -- TODO confirm.
1809    mova             m0, [srcq]
1810%if %2
1811    mova             m1, [srcq+strideq]
1812%else
1813    mova             m1, [srcq+32]
1814%endif
1815    punpckhwd        m4, m2, m0
1816    punpcklwd        m2, m0
1817    punpckhwd        m5, m3, m1
1818    punpcklwd        m3, m1                 ; { luma, chroma }
1819    REPX {pmaddwd x, m14}, m4, m2, m5, m3
1820    REPX {paddd   x, m15}, m4, m2, m5, m3
1821    REPX {psrad   x, 6  }, m4, m2, m5, m3
1822    packusdw         m2, m4
1823    packusdw         m3, m5
1824    pminuw           m2, m10                ; clip_pixel()
1825    pminuw           m3, m10
1826%elif %2
    ; %1 clear: the (averaged) luma is used directly, masked with m10
1827    pand             m2, m10
1828    pand             m3, m10
1829%else
1830    pand             m2, m10, [lumaq+ 0]
1831    pand             m3, m10, [lumaq+32]
1832%endif
1833
1834    ; scaling[luma_src]
    ; pd_m65536 is defined outside this chunk; going by its name each dword is
    ; 0xffff0000, which would make it both the "high word" mask for pandn and an
    ; all-lanes-enabled gather mask -- TODO confirm.  vpgatherdd clears its mask
    ; register, hence the copies shuffled between m6 and m7.
    ; Even words are gathered at displacement 0, odd words at displacement -2;
    ; pblendw 0x55 then takes the even words of the result from the first
    ; gather and the odd words from the second.
1835    vpbroadcastd     m7, [pd_m65536]
1836    pandn            m4, m7, m2
1837    mova             m6, m7
1838    vpgatherdd       m5, [scalingq+m4-0], m7
1839    psrld            m2, 16
1840    mova             m7, m6
1841    vpgatherdd       m4, [scalingq+m2-2], m6
1842    pblendw          m4, m5, 0x55
1843    pandn            m5, m7, m3
1844    mova             m6, m7
1845    vpgatherdd       m2, [scalingq+m5-0], m7
1846    psrld            m3, 16
1847    vpgatherdd       m5, [scalingq+m3-2], m6
1848    pblendw          m5, m2, 0x55
1849
1850    ; grain = grain_lut[offy+y][offx+x]
    ; first output row: interleave top and current grain and blend them with
    ; one pmaddwd per half, then (x + rounding) >> 5
1851    movu             m6, [grain_lutq+offxyq*2]
1852    movu             m3, [grain_lutq+top_offxyq*2]
1853    punpcklwd        m2, m3, m6
1854    punpckhwd        m3, m6                 ; { top, cur }
    ; weight pair broadcast to every dword: the first dword of pw_23_22 when %3
    ; is set, the first dword of pw_27_17_17_27 when only %2 is set, otherwise
    ; the pair r10 currently points at (r10 is stepped by 4 bytes per row)
1855%if %3
1856    vpbroadcastd     m0, [pw_23_22]
1857%elif %2
1858    vpbroadcastd     m0, [pw_27_17_17_27]
1859%else
1860    vpbroadcastd     m0, [r10]
1861%endif
1862    REPX {pmaddwd x, m0}, m2, m3
1863%if %1
    ; m14 holds the luma/chroma multipliers in this variant, so the rounding
    ; constant is loaded from memory; m1 keeps it for the second half below
1864    vpbroadcastd     m1, [pd_16]
1865    REPX  {paddd x, m1}, m2, m3
1866%else
1867    REPX {paddd x, m14}, m2, m3
1868%endif
1869    REPX   {psrad x, 5}, m2, m3
1870    packssdw         m2, m3
    ; second half of the output: the next grain row (+82*2 bytes) when %2 is
    ; set, otherwise the next 32 bytes of the same row
1871%if %2
1872    movu             m3, [grain_lutq+offxyq*2+82*2]
1873%else
1874    movu             m3, [grain_lutq+offxyq*2+32]
1875%endif
1876%if %3
    ; %3: only the first row is blended, m3 is used as loaded
1877    pmaxsw           m2, m8
1878    pminsw           m2, m9
1879%else
1880%if %2
    ; operands are interleaved as { cur, top } here, so the same weight pair in
    ; m0 is applied with the roles of top and current swapped for this row
1881    movu             m7, [grain_lutq+top_offxyq*2+82*2]
1882    punpckhwd        m6, m3, m7             ; { cur, top }
1883    punpcklwd        m3, m7
1884%else
1885    movu             m7, [grain_lutq+top_offxyq*2+32]
1886    punpckhwd        m6, m7, m3
1887    punpcklwd        m3, m7, m3             ; { top, cur }
1888%endif
1889    pmaddwd          m6, m0
1890    pmaddwd          m3, m0
1891%if %1
1892    paddd            m6, m1
1893    paddd            m3, m1
1894%else
1895    paddd            m6, m14
1896    paddd            m3, m14
1897%endif
1898    psrad            m6, 5
1899    psrad            m3, 5
1900    packssdw         m3, m6
1901    pmaxsw           m2, m8                 ; clamp blended grain to [m8, m9]
1902    pmaxsw           m3, m8
1903    pminsw           m2, m9
1904    pminsw           m3, m9
1905%endif
1906
1907    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    ; m11 is set up outside this chunk.  pmulhrsw computes
    ; (a * b + 0x4000) >> 15 on the doubled pmaddubsw result.
1908    pmaddubsw        m4, m11
1909    pmaddubsw        m5, m11
1910    paddw            m4, m4
1911    paddw            m5, m5
1912    pmulhrsw         m2, m4
1913    pmulhrsw         m3, m5
1914
1915    ; dst = clip_pixel(src, noise)
    ; src is read from memory in every variant: m0/m1 were reused for the
    ; blend weights and rounding constant above
1916    paddw            m0, m2, [srcq]
1917%if %2
1918    paddw            m1, m3, [srcq+strideq]
1919%else
1920    paddw            m1, m3, [srcq+32]
1921%endif
1922    pmaxsw           m0, m12
1923    pmaxsw           m1, m12
1924    pminsw           m0, m13
1925    pminsw           m1, m13
1926    mova         [dstq], m0
1927%if %2
1928    mova [dstq+strideq], m1
1929    sub              hb, 2
1930%else
1931    mova      [dstq+32], m1
1932    dec              hb
1933%endif
1934    jle %%end_y_v_overlap                   ; no rows left in this column
1935%if %2
1936    lea            srcq, [srcq+strideq*2]
1937    lea            dstq, [dstq+strideq*2]
1938    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1939%else
1940    add            srcq, strideq
1941    add            dstq, strideq
1942    add           lumaq, lstrideq
1943%endif
1944    add      grain_lutq, 82*(2<<%2)
1945%if %2
    ; one pass covers every blended row; continue in %%loop_y (defined outside
    ; this chunk, presumably the row loop without any blending)
1946    jmp %%loop_y
1947%else
    ; bit 31 of hd is a "second blended row done" flag (dec hb only touches the
    ; low byte): the first add sets it without carry, so a second pass runs
    ; with the next weight pair; the second add carries out and leaves the loop
1948    add              hd, 0x80000000
1949    jc %%loop_y
1950    add             r10, 4
1951    jmp %%loop_y_v_overlap
1952%endif
1953%%end_y_v_overlap:
    ; Column finished: advance w (negative, counting up) and rebuild the row
    ; pointers from the end pointers stored in the r9/r11/r12 argument slots.
1954    add              wq, 32>>%2
1955    jge .end                                ; w reached 0: whole width done
1956    mov            srcq, r9mp
1957    mov            dstq, r11mp
1958    mov           lumaq, r12mp
1959    lea            srcq, [srcq+wq*2]
1960    lea            dstq, [dstq+wq*2]
1961    lea           lumaq, [lumaq+wq*(2<<%2)]
1962
1963    ; since fg_dataq.overlap is guaranteed to be set, we never jump
1964    ; back to .loop_x_v_overlap, and instead always fall-through to
1965    ; h+v overlap
1966%%loop_x_hv_overlap:
    ; Per-column setup when both the left and the top neighbour are blended:
    ; step both packed seeds, remember the previous column's offsets as the
    ; left / top-left sources, then derive the new current / top offsets.
1967    ; we assume from the block above that bits 8-15 of r7d are zeroed
1968    mov             r6d, seed
    ; 0xeff4 == ~0x100b: only bits 0, 1, 3 and 12 of each 16-bit seed survive
    ; the OR as variable bits, so PF of (low byte & high byte) is set exactly
    ; when their XOR is 0
1969    or             seed, 0xeff4eff4
1970    test           seeb, seeh
1971    setp            r7b                     ; parity of top_seed
1972    shr            seed, 16
1973    shl             r7d, 16
1974    test           seeb, seeh
1975    setp            r7b                     ; parity of cur_seed
    ; forcing bit 0 of each half to one inverts the setp result in the XOR;
    ; the rotate then shifts both seeds right by one and places the new bits
    ; in bit 31 (cur_seed) and bit 15 (top_seed)
1976    or              r6d, 0x00010001
1977    xor             r7d, r6d
1978    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1979
1980    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1981                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
1982
1983%if %2 == 0
    ; r14 is lstride under DEFINE_ARGS; in this variant it is repurposed as the
    ; pointer to the per-row vertical weight pair
1984    lea             r14, [pw_27_17_17_27]
1985%endif
    ; offyq and top_offxyq still hold the previous column's offsets here (they
    ; are recomputed below), so the left neighbours sit 32>>%2 entries further
    ; right inside those blocks
1986    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
1987    lea     left_offxyq, [offyq+(32>>%2)]
1988    rorx          offyd, seed, 8            ; bits 8-11 of each seed  -> bits 0-3 / 16-19
1989    rorx          offxd, seed, 12           ; bits 12-15 of each seed -> bits 0-3 / 16-19
1990    and           offyd, 0xf000f
1991    and           offxd, 0xf000f
1992    imul          offyd, 164>>%3
1993    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    ; the trailing (32>>%3)*82 term lands in the low (top) half only
1994    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1995
1996    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1997                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
1998
1999    mov      grain_lutq, grain_lutmp         ; the row loop advances grain_lutq, so reload it
2000    mov              hd, hm                  ; reload the row count for this column
2001    movzx    top_offxyd, offxyw              ; low half  = top_offxy
2002    shr          offxyd, 16                  ; high half = cur_offxy
2003%%loop_y_hv_overlap:
    ; Row loop for rows that need both left and top blending.  The left-edge
    ; pixels of the current row(s) and of the top row(s) are blended
    ; horizontally first and patched into their rows; the rows are then
    ; blended vertically.
2004    ; luma_src
2005%if %2
    ; average horizontally adjacent luma words of two rows: phaddw sums each
    ; pair, pavgw against zero turns the sum into (sum + 1) >> 1
2006    mova            xm2, [lumaq+lstrideq*0+ 0]
2007    vinserti128      m2, [lumaq+lstrideq*0+32], 1
2008    mova            xm4, [lumaq+lstrideq*0+16]
2009    vinserti128      m4, [lumaq+lstrideq*0+48], 1
2010    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
2011    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
2012    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
2013    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
2014    phaddw           m2, m4
2015    phaddw           m3, m5
2016    pxor             m4, m4
2017    pavgw            m2, m4
2018    pavgw            m3, m4
2019%elif %1
2020    mova             m2, [lumaq]
2021    mova             m3, [lumaq+32]
2022%endif
2023%if %1
    ; %1 set: scaling index = (luma*a + chroma*b + m15) >> 6, saturated and
    ; capped at m10 (m14/m15/m10 are loaded outside this chunk)
2024    mova             m0, [srcq]
2025%if %2
2026    mova             m1, [srcq+strideq]
2027%else
2028    mova             m1, [srcq+32]
2029%endif
2030    punpckhwd        m4, m2, m0
2031    punpcklwd        m2, m0
2032    punpckhwd        m5, m3, m1
2033    punpcklwd        m3, m1                 ; { luma, chroma }
2034    REPX {pmaddwd x, m14}, m4, m2, m5, m3
2035    REPX {paddd   x, m15}, m4, m2, m5, m3
2036    REPX {psrad   x, 6  }, m4, m2, m5, m3
2037    packusdw         m2, m4
2038    packusdw         m3, m5
2039    pminuw           m2, m10                ; clip_pixel()
2040    pminuw           m3, m10
2041%elif %2
    ; %1 clear: the (averaged) luma is used directly, masked with m10
2042    pand             m2, m10
2043    pand             m3, m10
2044%else
2045    pand             m2, m10, [lumaq+ 0]
2046    pand             m3, m10, [lumaq+32]
2047%endif
2048
2049    ; scaling[luma_src]
    ; vpgatherdd clears its mask register, hence the copies shuffled between
    ; m6 and m7.  Even words are gathered at displacement 0, odd words at
    ; displacement -2, and pblendw 0x55 merges the two results.
2050    vpbroadcastd     m7, [pd_m65536]
2051    pandn            m4, m7, m2
2052    mova             m6, m7
2053    vpgatherdd       m5, [scalingq+m4-0], m7
2054    psrld            m2, 16
2055    mova             m7, m6
2056    vpgatherdd       m4, [scalingq+m2-2], m6
2057    pblendw          m4, m5, 0x55
2058    pandn            m5, m7, m3
2059    mova             m6, m7
2060    vpgatherdd       m2, [scalingq+m5-0], m7
2061    psrld            m3, 16
2062    vpgatherdd       m5, [scalingq+m3-2], m6
2063    pblendw          m5, m2, 0x55
2064
2065    ; grain = grain_lut[offy+y][offx+x]
    ; m0 = current row, m6 = top row, xm2 = first pixels of the left block
2066    movu             m0, [grain_lutq+offxyq*2]
2067    movd            xm2, [grain_lutq+left_offxyq*2]
2068    movu             m6, [grain_lutq+top_offxyq*2]
2069%if %2
    ; two output rows per pass: collect the left / top-left neighbours in m2
    ; and the matching first pixels of the current / top rows in m1, so that a
    ; single pmaddwd blends every left-edge pixel
2070    pinsrw          xm2, [grain_lutq+left_offxyq*2+82*2], 2
2071    movu             m3, [grain_lutq+offxyq*2+82*2]
2072    punpckldq       xm1, xm0, xm3           ; { cur0, cur1 }
2073%if %3
2074    vinserti128      m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
2075    vinserti128      m1, [grain_lutq+top_offxyq*2], 1     ; { cur0, cur1, top0 }
2076%else
    ; without %3 both top rows are blended too: the upper lane gets the second
    ; top-left / top row in dword 0 and the first one in dword 1
2077    vinserti128      m2, [grain_lutq+topleft_offxyq*2+82*2], 1
2078    vpbroadcastd     m7, [grain_lutq+topleft_offxyq*2]
2079    vpblendd         m2, m7, 0x20
2080    movd            xm7, [grain_lutq+top_offxyq*2+82*2]
2081    punpckldq       xm7, xm6
2082    vinserti128      m1, xm7, 1
2083    movu             m7, [grain_lutq+top_offxyq*2+82*2]
2084%endif
2085    punpcklwd        m2, m1                 ; { cur, left }
2086%if %1
    ; m1 is left holding pd_16 and serves as the rounding constant of the
    ; vertical blend further down
2087    vpbroadcastq     m1, [pw_23_22]
2088    pmaddwd          m2, m1
2089    vpbroadcastd     m1, [pd_16]
2090    paddd            m2, m1
2091    psrad            m2, 5
2092    packssdw         m2, m2
2093    vpermq           m2, m2, q3120          ; pull the upper lane's result into xm2
2094%else
    ; m15/m14 are loaded outside this chunk; they are used here as the
    ; horizontal weights and the rounding constant
2095    pmaddwd          m2, m15
2096    paddd            m2, m14
2097    psrad            m2, 5
2098    vextracti128    xm1, m2, 1
2099    packssdw        xm2, xm1
2100%endif
2101%else
    ; one output row per pass: xm2 = { left, top-left }, xm1 = { cur, top }
2102    pinsrd          xm2, [grain_lutq+topleft_offxyq*2], 1
2103    movu             m3, [grain_lutq+offxyq*2+32]
2104    movu             m7, [grain_lutq+top_offxyq*2+32]
2105    punpckldq       xm1, xm0, xm6
2106    punpcklwd       xm2, xm1                ; { cur, left }
2107%if %1
2108    movddup         xm1, [pw_27_17_17_27]
2109    pmaddwd         xm2, xm1
2110    vpbroadcastd     m1, [pd_16]            ; kept in m1 for the vertical blend below
2111    paddd           xm2, xm1
2112%else
2113    pmaddwd         xm2, xm15
2114    paddd           xm2, xm14
2115%endif
2116    psrad           xm2, 5
2117    packssdw        xm2, xm2
2118%endif
2119    pmaxsw          xm2, xm8                ; clamp the blended left-edge grain to [m8, m9]
2120    pminsw          xm2, xm9
2121    vpblendd         m0, m2, 0x01           ; patch the first dword of the current row
2122%if %2
2123    pshufd          xm2, xm2, q0321         ; rotate the next blended dword into position 0
2124    vpblendd         m3, m2, 0x01           ; patch the second current row
2125%if %3 == 0
2126    pshufd          xm2, xm2, q0321
2127    vpblendd         m7, m2, 0x01           ; patch the second top row
2128%endif
2129%endif
    ; swap the two low dwords so the blended first-top-row pixels sit in
    ; dword 0, then take every other dword from the top row in m6
2130    pshuflw         xm2, xm2, q1032
2131    vpblendd         m2, m6, 0xfe
2132    punpckhwd        m6, m0                 ; { top, cur }
2133    punpcklwd        m2, m0
    ; vertical weight pair, broadcast to every dword (r14 is stepped by 4 bytes
    ; per row when %2 is clear)
2134%if %3
2135    vpbroadcastd     m0, [pw_23_22]
2136%elif %2
2137    vpbroadcastd     m0, [pw_27_17_17_27]
2138%else
2139    vpbroadcastd     m0, [r14]
2140%endif
2141    pmaddwd          m6, m0
2142    pmaddwd          m2, m0
2143%if %1
2144    paddd            m6, m1
2145    paddd            m2, m1
2146%else
2147    paddd            m6, m14
2148    paddd            m2, m14
2149%endif
2150    psrad            m6, 5
2151    psrad            m2, 5
2152    packssdw         m2, m6
2153
2154%if %3
    ; %3: only the first row is blended vertically, m3 stays as patched above
2155    pmaxsw           m2, m8
2156    pminsw           m2, m9
2157%else
2158%if %2
    ; interleaved as { cur, top }: the same weight pair is applied with the
    ; roles of top and current swapped for the second row
2159    punpckhwd        m6, m3, m7
2160    punpcklwd        m3, m7                 ; { cur, top }
2161%else
2162    punpckhwd        m6, m7, m3
2163    punpcklwd        m3, m7, m3             ; { top, cur }
2164%endif
2165    REPX {pmaddwd x, m0}, m6, m3
2166%if %1
2167    REPX  {paddd x, m1}, m6, m3
2168%else
2169    REPX {paddd x, m14}, m6, m3
2170%endif
2171    REPX   {psrad x, 5}, m6, m3
2172    packssdw         m3, m6
2173    pmaxsw           m2, m8
2174    pmaxsw           m3, m8
2175    pminsw           m2, m9
2176    pminsw           m3, m9
2177%endif
2178
2179    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    ; m11 is set up outside this chunk; pmulhrsw computes (a * b + 0x4000) >> 15
2180    pmaddubsw        m4, m11
2181    pmaddubsw        m5, m11
2182    paddw            m4, m4
2183    paddw            m5, m5
2184    pmulhrsw         m2, m4
2185    pmulhrsw         m3, m5
2186
2187    ; dst = clip_pixel(src, noise)
    ; src is read from memory in every variant: m0/m1 were reused above
2188    paddw            m0, m2, [srcq]
2189%if %2
2190    paddw            m1, m3, [srcq+strideq]
2191%else
2192    paddw            m1, m3, [srcq+32]
2193%endif
2194    pmaxsw           m0, m12
2195    pmaxsw           m1, m12
2196    pminsw           m0, m13
2197    pminsw           m1, m13
2198    mova         [dstq], m0
2199%if %2
2200    mova [dstq+strideq], m1
2201    lea            srcq, [srcq+strideq*2]
2202    lea            dstq, [dstq+strideq*2]
2203    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
2204%else
2205    mova      [dstq+32], m1
2206    add            srcq, strideq
2207    add            dstq, strideq
    ; r14 (lstride) holds the weight pointer in this variant, so the luma
    ; stride is taken from the r10 argument slot instead
2208    add           lumaq, r10mp
2209%endif
2210    add      grain_lutq, 82*(2<<%2)
2211%if %2
    ; one pass covers every vertically blended row; remaining rows only need
    ; left blending.  Falls through to %%end_y_hv_overlap when h is used up.
2212    sub              hb, 2
2213    jg %%loop_y_h_overlap
2214%else
2215    dec              hb
2216    jle %%end_y_hv_overlap
    ; bit 31 of hd flags "second blended row done" (dec hb only touches the low
    ; byte): the first add sets it without carry, so a second pass runs with
    ; the next weight pair; the second add carries out and leaves this loop
2217    add              hd, 0x80000000
2218    jc %%loop_y_h_overlap
2219    add             r14, 4
2220    jmp %%loop_y_hv_overlap
2221%endif
2222%%end_y_hv_overlap:
    ; Column finished: advance w (negative, counting up) and rebuild the row
    ; pointers from the end pointers stored in the r9/r11/r12 argument slots.
2223    add              wq, 32>>%2
2224    jge .end                                ; w reached 0: whole width done
2225    mov            srcq, r9mp
2226    mov            dstq, r11mp
2227    mov           lumaq, r12mp
2228    lea            srcq, [srcq+wq*2]
2229    lea            dstq, [dstq+wq*2]
2230    lea           lumaq, [lumaq+wq*(2<<%2)]
2231    jmp %%loop_x_hv_overlap                 ; every further column blends left and top
2232%endmacro
2233
2234    %%FGUV_32x32xN_LOOP 1, %2, %3
    ; The loop body is expanded twice.  The first copy (%1 = 1) mixes luma and
    ; chroma to form the scaling index; the copy at .csfl (%1 = 0) indexes with
    ; luma alone.  The visible tail of the loop macro ends in an unconditional
    ; jmp, so the first copy does not fall through into .csfl; .csfl is
    ; presumably reached by a branch outside this chunk -- TODO confirm.
2235.csfl:
2236    %%FGUV_32x32xN_LOOP 0, %2, %3
2237.end:
2238    RET
2239%endmacro
2240
2241GEN_GRAIN_UV_FN 420, 1, 1                ; one grain generator and one FGUV function per layout
2242FGUV_FN 420,         1, 1                ; 2nd/3rd argument are forwarded to the FGUV loop body as %2/%3:
2243GEN_GRAIN_UV_FN 422, 1, 0                ;   %2 set -> horizontally adjacent luma pairs are averaged
2244FGUV_FN 422,         1, 0                ;   %3 set -> the luma row step is doubled
2245GEN_GRAIN_UV_FN 444, 0, 0                ; (GEN_GRAIN_UV_FN is defined outside this chunk; presumably
2246FGUV_FN 444,         0, 0                ;  it takes the same two flags -- TODO confirm)
2247
2248%endif ; ARCH_X86_64
2249