1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 32
32pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
33rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
34pw_seed_xor: times 2 dw 0xb524
35             times 2 dw 0x49d8
36pd_16: dd 16
37pd_m65536: dd ~0xffff
38pb_1: times 4 db 1
39hmul_bits: dw 32768, 16384, 8192, 4096
40round: dw 2048, 1024, 512
41mul_bits: dw 256, 128, 64, 32, 16
42round_vals: dw 32, 64, 128, 256, 512, 1024
43max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
44min: dw 0, 16*4, 16*16
45pw_27_17_17_27: dw 27, 17, 17, 27
46; these two should be next to each other
47pw_4: times 2 dw 4
48pw_16: times 2 dw 16
49pw_23_22: dw 23, 22, 0, 32
50
51%macro JMP_TABLE 1-*
52    %xdefine %1_table %%table
53    %xdefine %%base %1_table
54    %xdefine %%prefix mangle(private_prefix %+ _%1)
55    %%table:
56    %rep %0 - 1
57        dd %%prefix %+ .ar%2 - %%base
58        %rotate 1
59    %endrep
60%endmacro
61
62JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
63JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
64JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
65JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
66
67struc FGData
68    .seed:                      resd 1
69    .num_y_points:              resd 1
70    .y_points:                  resb 14 * 2
71    .chroma_scaling_from_luma:  resd 1
72    .num_uv_points:             resd 2
73    .uv_points:                 resb 2 * 10 * 2
74    .scaling_shift:             resd 1
75    .ar_coeff_lag:              resd 1
76    .ar_coeffs_y:               resb 24
77    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
78    .ar_coeff_shift:            resq 1
79    .grain_scale_shift:         resd 1
80    .uv_mult:                   resd 2
81    .uv_luma_mult:              resd 2
82    .uv_offset:                 resd 2
83    .overlap_flag:              resd 1
84    .clip_to_restricted_range:  resd 1
85endstruc
86
87cextern gaussian_sequence
88
89SECTION .text
90
91%macro REPX 2-*
92    %xdefine %%f(x) %1
93%rep %0 - 1
94    %rotate 1
95    %%f(%1)
96%endrep
97%endmacro
98
99%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
100
101INIT_YMM avx2
102cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax
103    lea              r4, [pb_mask]
104%define base r4-pb_mask
105    movq            xm1, [base+rnd_next_upperbit_mask]
106    movq            xm4, [base+mul_bits]
107    movq            xm7, [base+hmul_bits]
108    mov             r3d, [fg_dataq+FGData.grain_scale_shift]
109    lea             r6d, [bdmaxq+1]
110    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
111    sub              r3, r6
112    vpbroadcastw    xm8, [base+round+r3*2-2]
113    mova            xm5, [base+pb_mask]
114    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
115    vpbroadcastd    xm9, [base+pd_m65536]
116    mov              r3, -73*82*2
117    sub            bufq, r3
118    lea              r6, [gaussian_sequence]
119.loop:
120    pand            xm2, xm0, xm1
121    psrlw           xm3, xm2, 10
122    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
123    pmullw          xm2, xm4            ; bits 0x0f00 are set
124    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
125    psllq           xm6, xm2, 30
126    por             xm2, xm6
127    psllq           xm6, xm2, 15
128    por             xm2, xm6            ; aggregate each bit into next seed's high bit
129    pmulhuw         xm3, xm0, xm7
130    por             xm2, xm3            ; 4 next output seeds
131    pshuflw         xm0, xm2, q3333
132    psrlw           xm2, 5
133    pmovzxwd        xm3, xm2
134    mova            xm6, xm9
135    vpgatherdd      xm2, [r6+xm3*2], xm6
136    pandn           xm2, xm9, xm2
137    packusdw        xm2, xm2
138    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
139                                        ; shifts by 0, which pmulhrsw does not support
140    pmulhrsw        xm2, xm8
141    movq      [bufq+r3], xm2
142    add              r3, 4*2
143    jl .loop
144
145    ; auto-regression code
146    movsxd           r3, [fg_dataq+FGData.ar_coeff_lag]
147    movsxd           r3, [base+generate_grain_y_16bpc_avx2_table+r3*4]
148    lea              r3, [r3+base+generate_grain_y_16bpc_avx2_table]
149    jmp              r3
150
151.ar1:
152    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
153    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
154    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
155    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
156    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
157    pinsrb          xm4, [pb_1], 3
158    pmovsxbw        xm4, xm4
159    pshufd          xm5, xm4, q1111
160    pshufd          xm4, xm4, q0000
161    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
162    sub            bufq, 2*(82*73-(82*3+79))
163    mov              hd, 70
164    sar            maxd, 1
165    mov            mind, maxd
166    xor            mind, -1
167.y_loop_ar1:
168    mov              xq, -76
169    movsx         val3d, word [bufq+xq*2-2]
170.x_loop_ar1:
171    movu            xm0, [bufq+xq*2-82*2-2]     ; top/left
172    psrldq          xm2, xm0, 2                 ; top
173    psrldq          xm1, xm0, 4                 ; top/right
174    punpcklwd       xm0, xm2
175    punpcklwd       xm1, xm3
176    pmaddwd         xm0, xm4
177    pmaddwd         xm1, xm5
178    paddd           xm0, xm1
179.x_loop_ar1_inner:
180    movd          val0d, xm0
181    psrldq          xm0, 4
182    imul          val3d, cf3d
183    add           val3d, val0d
184    sarx          val3d, val3d, shiftd
185    movsx         val0d, word [bufq+xq*2]
186    add           val3d, val0d
187    cmp           val3d, maxd
188    cmovg         val3d, maxd
189    cmp           val3d, mind
190    cmovl         val3d, mind
191    mov word [bufq+xq*2], val3w
192    ; keep val3d in-place as left for next x iteration
193    inc              xq
194    jz .x_loop_ar1_end
195    test             xq, 3
196    jnz .x_loop_ar1_inner
197    jmp .x_loop_ar1
198
199.x_loop_ar1_end:
200    add            bufq, 82*2
201    dec              hd
202    jg .y_loop_ar1
203.ar0:
204    RET
205
206.ar2:
207    DEFINE_ARGS buf, fg_data, bdmax, shift
208    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
209    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
210    movq            xm8, [fg_dataq+FGData.ar_coeffs_y+5]    ; cf5-11
211    vinserti128      m8, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
212    pxor             m9, m9
213    punpcklwd      xm14, xm9
214    pcmpgtb          m9, m8
215    punpcklbw        m8, m9                                 ; cf5-11,0-4
216    vpermq           m9, m8, q3333                          ; cf4
217    psrldq         xm10, xm8, 6                             ; cf8-11
218    vpblendw        xm9, xm10, 11111110b                    ; cf4,9-11
219    pshufd          m12, m8, q0000                          ; cf[5,6], cf[0-1]
220    pshufd          m11, m8, q1111                          ; cf[7,8], cf[2-3]
221    pshufd         xm13, xm9, q1111                         ; cf[10,11]
222    pshufd         xm10, xm9, q0000                         ; cf[4,9]
223    sar          bdmaxd, 1
224    movd           xm15, bdmaxd
225    pcmpeqd         xm7, xm7
226    vpbroadcastd   xm15, xm15                               ; max_grain
227    pxor            xm7, xm15                               ; min_grain
228    sub            bufq, 2*(82*73-(82*3+79))
229    DEFINE_ARGS buf, fg_data, h, x
230    mov              hd, 70
231.y_loop_ar2:
232    mov              xq, -76
233
234.x_loop_ar2:
235    movu            xm0, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
236    vinserti128      m0, [bufq+xq*2-82*4-4], 1  ; y=-2,x=[-2,+5]
237    psrldq           m1, m0, 2                  ; y=-1/-2,x=[-1,+5]
238    psrldq           m2, m0, 4                  ; y=-1/-2,x=[-0,+5]
239    psrldq           m3, m0, 6                  ; y=-1/-2,x=[+1,+5]
240
241    vextracti128    xm4, m0, 1                  ; y=-2,x=[-2,+5]
242    punpcklwd        m2, m3                     ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
243    punpckhwd       xm4, xm0                    ; y=-2/-1 interleaved, x=[+2,+5]
244    punpcklwd        m0, m1                     ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
245
246    pmaddwd          m2, m11
247    pmaddwd          m0, m12
248    pmaddwd         xm4, xm10
249
250    paddd            m0, m2
251    vextracti128    xm2, m0, 1
252    paddd           xm4, xm0
253    paddd           xm2, xm14
254    paddd           xm2, xm4
255
256    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
257    pshufd          xm4, xm0, q3321
258    pmovsxwd        xm4, xm4                ; in dwords, y=0,x=[0,3]
259.x_loop_ar2_inner:
260    pmaddwd         xm3, xm0, xm13
261    paddd           xm3, xm2
262    psrldq          xm2, 4                  ; shift top to next pixel
263    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
264    ; skip packssdw because we only care about one value
265    paddd           xm3, xm4
266    pminsd          xm3, xm15
267    pmaxsd          xm3, xm7
268    pextrw  [bufq+xq*2], xm3, 0
269    psrldq          xm4, 4
270    pslldq          xm3, 2
271    psrldq          xm0, 2
272    vpblendw        xm0, xm3, 0010b
273    inc              xq
274    jz .x_loop_ar2_end
275    test             xq, 3
276    jnz .x_loop_ar2_inner
277    jmp .x_loop_ar2
278
279.x_loop_ar2_end:
280    add            bufq, 82*2
281    dec              hd
282    jg .y_loop_ar2
283    RET
284
285.ar3:
286    DEFINE_ARGS buf, fg_data, bdmax, shift
287%if WIN64
288    mov              r6, rsp
289    and             rsp, ~31
290    sub             rsp, 64
291    %define         tmp  rsp
292%elif STACK_ALIGNMENT < 32
293    mov              r6, rsp
294    and              r6, ~31
295    %define         tmp  r6-64
296%else
297    %define         tmp  rsp+stack_offset-88
298%endif
299    sar          bdmaxd, 1
300    movd           xm15, bdmaxd
301    pcmpeqd        xm13, xm13
302    vpbroadcastd   xm15, xm15                                   ; max_grain
303    pxor           xm13, xm15                                   ; min_grain
304    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
305    vpbroadcastw    m14, [base+round_vals+shiftq*2-12]
306    movq            xm0, [fg_dataq+FGData.ar_coeffs_y+ 0]       ; cf0-6
307    movd            xm1, [fg_dataq+FGData.ar_coeffs_y+14]       ; cf14-16
308    pinsrb          xm0, [fg_dataq+FGData.ar_coeffs_y+13], 7    ; cf0-6,13
309    pinsrb          xm1, [pb_1], 3                              ; cf14-16,pb_1
310    movd            xm2, [fg_dataq+FGData.ar_coeffs_y+21]       ; cf21-23
311    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+ 7], 1    ; cf7-13
312    vinserti128      m1, [fg_dataq+FGData.ar_coeffs_y+17], 1    ; cf17-20
313    punpcklbw        m0, m0                                     ; sign-extension
314    punpcklbw        m1, m1                                     ; sign-extension
315    punpcklbw       xm2, xm2
316    REPX   {psraw x, 8}, m0, m1, xm2
317
318    pshufd           m8, m0, q0000              ; cf[0,1] | cf[7,8]
319    pshufd           m9, m0, q1111              ; cf[2,3] | cf[9,10]
320    pshufd          m10, m0, q2222              ; cf[4,5] | cf[11,12]
321    pshufd         xm11, xm0, q3333             ; cf[6,13]
322
323    pshufd           m3, m1, q0000              ; cf[14,15] | cf[17,18]
324    pshufd           m4, m1, q1111              ; cf[16],pw_1 | cf[19,20]
325    mova     [tmp+0*32], m3
326    mova     [tmp+1*32], m4
327
328    paddw           xm5, xm14, xm14
329    vpblendw       xm12, xm2, xm5, 00001000b
330
331    DEFINE_ARGS buf, fg_data, h, x
332    sub            bufq, 2*(82*73-(82*3+79))
333    mov              hd, 70
334.y_loop_ar3:
335    mov              xq, -76
336
337.x_loop_ar3:
338    movu            xm0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
339    movq            xm1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+8]
340    movu            xm2, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
341    vinserti128      m0, [bufq+xq*2-82*4-6+ 0], 1   ; y=-3/-2,x=[-3,+4]
342    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1   ; y=-3/-2,x=[+5,+12]
343    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1   ; y=-1,x=[+1,+8]
344
345    palignr         m4, m1, m0, 2                   ; y=-3/-2,x=[-2,+5]
346    palignr         m1, m0, 12                      ; y=-3/-2,x=[+3,+6]
347    punpckhwd       m5, m0, m4                      ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
348    punpcklwd       m0, m4                          ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
349    palignr         m6, m5, m0, 8                   ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
350    vextracti128   xm7, m1, 1
351    punpcklwd      xm1, xm7                         ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
352
353    psrldq          m3, m2, 2
354    psrldq          m4, m2, 4
355    psrldq          m7, m2, 6
356    vpblendd        m7, m14, 00001111b              ; rounding constant
357    punpcklwd       m2, m3                          ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
358                                                    ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
359    punpcklwd       m4, m7                          ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
360                                                    ;      x=[+2/+3,+3/+4,+4/+5,+5,+6]
361
362    pmaddwd          m0, m8
363    pmaddwd          m6, m9
364    pmaddwd          m5, m10
365    pmaddwd         xm1, xm11
366    pmaddwd          m2, [tmp+0*32]
367    pmaddwd          m4, [tmp+1*32]
368
369    paddd            m0, m6
370    paddd            m5, m2
371    paddd            m0, m4
372    paddd            m0, m5
373    vextracti128    xm4, m0, 1
374    paddd           xm0, xm1
375    paddd           xm0, xm4
376
377    movu            xm1, [bufq+xq*2-6]        ; y=0,x=[-3,+4]
378.x_loop_ar3_inner:
379    pmaddwd         xm2, xm1, xm12
380    pshufd          xm3, xm2, q1111
381    paddd           xm2, xm3                ; left+cur
382    paddd           xm2, xm0                ; add top
383    psrldq          xm0, 4
384    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
385    ; skip packssdw because we only care about one value
386    pminsd          xm2, xm15
387    pmaxsd          xm2, xm13
388    pextrw  [bufq+xq*2], xm2, 0
389    pslldq          xm2, 4
390    psrldq          xm1, 2
391    vpblendw        xm1, xm2, 0100b
392    inc              xq
393    jz .x_loop_ar3_end
394    test             xq, 3
395    jnz .x_loop_ar3_inner
396    jmp .x_loop_ar3
397
398.x_loop_ar3_end:
399    add            bufq, 82*2
400    dec              hd
401    jg .y_loop_ar3
402%if WIN64
403    mov             rsp, r6
404%endif
405    RET
406
407%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
408INIT_XMM avx2
409cglobal generate_grain_uv_%1_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax
410%define base r8-pb_mask
411    lea              r8, [pb_mask]
412    movifnidn    bdmaxd, bdmaxm
413    movq            xm1, [base+rnd_next_upperbit_mask]
414    movq            xm4, [base+mul_bits]
415    movq            xm7, [base+hmul_bits]
416    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
417    lea             r6d, [bdmaxq+1]
418    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
419    sub              r5, r6
420    vpbroadcastw    xm8, [base+round+r5*2-2]
421    mova            xm5, [base+pb_mask]
422    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
423    vpbroadcastw    xm9, [base+pw_seed_xor+uvq*4]
424    pxor            xm0, xm9
425    vpbroadcastd    xm9, [base+pd_m65536]
426    lea              r6, [gaussian_sequence]
427%if %2
428    mov             r7d, 73-35*%3
429    add            bufq, 44*2
430.loop_y:
431    mov              r5, -44
432%else
433    mov              r5, -82*73
434    add            bufq, 2*82*73
435%endif
436.loop_x:
437    pand            xm2, xm0, xm1
438    psrlw           xm3, xm2, 10
439    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
440    pmullw          xm2, xm4            ; bits 0x0f00 are set
441    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
442    psllq           xm6, xm2, 30
443    por             xm2, xm6
444    psllq           xm6, xm2, 15
445    por             xm2, xm6            ; aggregate each bit into next seed's high bit
446    pmulhuw         xm3, xm0, xm7
447    por             xm2, xm3            ; 4 next output seeds
448    pshuflw         xm0, xm2, q3333
449    psrlw           xm2, 5
450    pmovzxwd        xm3, xm2
451    mova            xm6, xm9
452    vpgatherdd      xm2, [r6+xm3*2], xm6
453    pandn           xm2, xm9, xm2
454    packusdw        xm2, xm2
455    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
456                                        ; shifts by 0, which pmulhrsw does not support
457    pmulhrsw        xm2, xm8
458    movq    [bufq+r5*2], xm2
459    add              r5, 4
460    jl .loop_x
461%if %2
462    add            bufq, 82*2
463    dec             r7d
464    jg .loop_y
465%endif
466
467    ; auto-regression code
468    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
469    movsxd           r5, [base+generate_grain_uv_%1_16bpc_avx2_table+r5*4]
470    lea              r5, [r5+base+generate_grain_uv_%1_16bpc_avx2_table]
471    jmp              r5
472
473.ar0:
474    INIT_YMM avx2
475    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
476    imul            uvd, 28
477    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
478    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
479    vpbroadcastw     m3, [base+hmul_bits+shiftq*2-10]
480    sar          bdmaxd, 1
481    movd           xm14, bdmaxd
482    pcmpeqw          m7, m7
483    vpbroadcastw    m14, xm14                       ; max_gain
484    pxor             m7, m14                        ; min_grain
485    DEFINE_ARGS buf, bufy, h, x
486    pmovsxbw        xm4, xm4
487%if %2
488    vpbroadcastw     m6, [hmul_bits+2+%3*2]
489%endif
490    vpbroadcastw     m4, xm4
491    pxor             m5, m5
492%if %2
493    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
494%else
495    sub            bufq, 2*(82*70-3)
496%endif
497    add           bufyq, 2*(3+82*3)
498    mov              hd, 70-35*%3
499.y_loop_ar0:
500%if %2
501    ; first 32 pixels
502    movu            xm8, [bufyq]
503    movu           xm10, [bufyq+     16]
504%if %3
505    movu            xm9, [bufyq+82*2]
506    movu           xm11, [bufyq+82*2+16]
507%endif
508    vinserti128      m8, [bufyq+     32], 1
509    vinserti128     m10, [bufyq+     48], 1
510%if %3
511    vinserti128      m9, [bufyq+82*2+32], 1
512    vinserti128     m11, [bufyq+82*2+48], 1
513    paddw            m8, m9
514    paddw           m10, m11
515%endif
516    phaddw           m8, m10
517    movu           xm10, [bufyq+     64]
518    movu           xm12, [bufyq+     80]
519%if %3
520    movu           xm11, [bufyq+82*2+64]
521    movu           xm13, [bufyq+82*2+80]
522%endif
523    vinserti128     m10, [bufyq+     96], 1
524    vinserti128     m12, [bufyq+     112], 1
525%if %3
526    vinserti128     m11, [bufyq+82*2+96], 1
527    vinserti128     m13, [bufyq+82*2+112], 1
528    paddw           m10, m11
529    paddw           m12, m13
530%endif
531    phaddw          m10, m12
532    pmulhrsw         m8, m6
533    pmulhrsw        m10, m6
534%else
535    xor              xd, xd
536.x_loop_ar0:
537    movu             m8, [bufyq+xq*2]
538    movu            m10, [bufyq+xq*2+32]
539%endif
540    punpckhwd        m9, m8, m5
541    punpcklwd        m8, m5
542    punpckhwd       m11, m10, m5
543    punpcklwd       m10, m5
544    REPX {pmaddwd x, m4}, m8, m9, m10, m11
545    REPX {psrad x, 5}, m8, m9, m10, m11
546    packssdw         m8, m9
547    packssdw        m10, m11
548    REPX {pmulhrsw x, m3}, m8, m10
549%if %2
550    paddw            m8, [bufq+ 0]
551    paddw           m10, [bufq+32]
552%else
553    paddw            m8, [bufq+xq*2+ 0]
554    paddw           m10, [bufq+xq*2+32]
555%endif
556    pminsw           m8, m14
557    pminsw          m10, m14
558    pmaxsw           m8, m7
559    pmaxsw          m10, m7
560%if %2
561    movu      [bufq+ 0], m8
562    movu      [bufq+32], m10
563
564    ; last 6 pixels
565    movu            xm8, [bufyq+32*4]
566    movu           xm10, [bufyq+32*4+16]
567%if %3
568    paddw           xm8, [bufyq+32*4+82*2]
569    paddw          xm10, [bufyq+32*4+82*2+16]
570%endif
571    phaddw          xm8, xm10
572    pmulhrsw        xm8, xm6
573    punpckhwd       xm9, xm8, xm5
574    punpcklwd       xm8, xm5
575    REPX {pmaddwd x, xm4}, xm8, xm9
576    REPX {psrad   x, 5}, xm8, xm9
577    packssdw        xm8, xm9
578    pmulhrsw        xm8, xm3
579    movu            xm0, [bufq+32*2]
580    paddw           xm8, xm0
581    pminsw          xm8, xm14
582    pmaxsw          xm8, xm7
583    vpblendw        xm0, xm8, xm0, 11000000b
584    movu    [bufq+32*2], xm0
585%else
586    movu [bufq+xq*2+ 0], m8
587    movu [bufq+xq*2+32], m10
588    add              xd, 32
589    cmp              xd, 64
590    jl .x_loop_ar0
591
592    ; last 12 pixels
593    movu             m8, [bufyq+64*2]
594    punpckhwd        m9, m8, m5
595    punpcklwd        m8, m5
596    REPX {pmaddwd x, m4}, m8, m9
597    REPX {psrad   x, 5}, m8, m9
598    packssdw         m8, m9
599    pmulhrsw         m8, m3
600    movu             m0, [bufq+64*2]
601    paddw            m8, m0
602    pminsw           m8, m14
603    pmaxsw           m8, m7
604    vpblendd         m0, m8, m0, 11000000b
605    movu    [bufq+64*2], m0
606%endif
607
608    add            bufq, 82*2
609    add           bufyq, 82*2<<%3
610    dec              hd
611    jg .y_loop_ar0
612    RET
613
614.ar1:
615    INIT_XMM avx2
616    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
617    imul            uvd, 28
618    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
619    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
620    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
621    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
622    DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
623    pmovsxbw        xm4, xm4
624    pshufd          xm5, xm4, q1111
625    pshufd          xm4, xm4, q0000
626    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
627    vpbroadcastw    xm6, [hmul_bits+2+%3*2]
628    vpbroadcastd    xm3, xm3
629%if %2
630    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
631%else
632    sub            bufq, 2*(82*69+3)
633%endif
634    add           bufyq, 2*(79+82*3)
635    mov              hd, 70-35*%3
636    sar            maxd, 1
637    mov            mind, maxd
638    xor            mind, -1
639.y_loop_ar1:
640    mov              xq, -(76>>%2)
641    movsx         val3d, word [bufq+xq*2-2]
642.x_loop_ar1:
643    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
644%if %2
645    movu            xm8, [bufyq+xq*4]
646%else
647    movq            xm8, [bufyq+xq*2]
648%endif
649    psrldq          xm2, xm0, 2             ; top
650    psrldq          xm1, xm0, 4             ; top/right
651%if %2
652%if %3
653    phaddw          xm8, [bufyq+xq*4+82*2]
654    pshufd          xm9, xm8, q3232
655    paddw           xm8, xm9
656%else
657    phaddw          xm8, xm8
658%endif
659    pmulhrsw        xm8, xm6
660%endif
661    punpcklwd       xm0, xm2
662    punpcklwd       xm1, xm8
663    pmaddwd         xm0, xm4
664    pmaddwd         xm1, xm5
665    paddd           xm0, xm1
666    paddd           xm0, xm3
667.x_loop_ar1_inner:
668    movd          val0d, xm0
669    psrldq          xm0, 4
670    imul          val3d, cf3d
671    add           val3d, val0d
672    sarx          val3d, val3d, shiftd
673    movsx         val0d, word [bufq+xq*2]
674    add           val3d, val0d
675    cmp           val3d, maxd
676    cmovg         val3d, maxd
677    cmp           val3d, mind
678    cmovl         val3d, mind
679    mov word [bufq+xq*2], val3w
680    ; keep val3d in-place as left for next x iteration
681    inc              xq
682    jz .x_loop_ar1_end
683    test             xq, 3
684    jnz .x_loop_ar1_inner
685    jmp .x_loop_ar1
686
687.x_loop_ar1_end:
688    add            bufq, 82*2
689    add           bufyq, 82*2<<%3
690    dec              hd
691    jg .y_loop_ar1
692    RET
693
694    INIT_YMM avx2
695.ar2:
696    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
697    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
698    imul            uvd, 28
699    sar          bdmaxd, 1
700    movd            xm6, bdmaxd
701    pcmpeqd         xm5, xm5
702    vpbroadcastd    xm6, xm6                ; max_grain
703    pxor            xm5, xm6                ; min_grain
704%if %2
705    vpbroadcastw    xm7, [base+hmul_bits+2+%3*2]
706%endif
707    vpbroadcastw   xm15, [base+round_vals-12+shiftq*2]
708
709    movd            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+5]
710    pinsrb          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
711    pinsrb          xm0, [pb_1], 5
712    pinsrw          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
713    movhps          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
714    pinsrb          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+9], 13
715    pmovsxbw         m0, xm0
716
717    pshufd         xm13, xm0, q3333
718    pshufd          m12, m0, q0000
719    pshufd          m11, m0, q1111
720    pshufd          m10, m0, q2222
721
722    DEFINE_ARGS buf, bufy, fg_data, h, x
723%if %2
724    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
725%else
726    sub            bufq, 2*(82*69+3)
727%endif
728    add           bufyq, 2*(79+82*3)
729    mov              hd, 70-35*%3
730.y_loop_ar2:
731    mov              xq, -(76>>%2)
732
733.x_loop_ar2:
734    movu            xm0, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
735    vinserti128      m0, [bufq+xq*2-82*4-4], 1  ; y=-2,x=[-2,+5]
736    psrldq           m1, m0, 2                  ; y=-1/-2,x=[-1,+5]
737    psrldq           m2, m0, 4                  ; y=-1/-2,x=[-0,+5]
738    psrldq           m3, m0, 6                  ; y=-1/-2,x=[+1,+5]
739
740%if %2
741    movu            xm8, [bufyq+xq*4]
742%if %3
743    paddw           xm8, [bufyq+xq*4+82*2]
744%endif
745    phaddw          xm8, xm8
746%else
747    movq            xm8, [bufyq+xq*2]
748%endif
749
750    vinserti128      m4, xm0, 1                 ; y=-1,x=[-2,+5]
751    punpcklwd        m2, m3                     ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
752    punpckhwd        m4, m0, m4                 ; y=-2/-1 interleaved, x=[+2,+5]
753    punpcklwd        m0, m1                     ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
754
755%if %2
756    pmulhrsw        xm1, xm8, xm7
757    punpcklwd       xm1, xm15                   ; luma, round interleaved
758%else
759    punpcklwd       xm1, xm8, xm15
760%endif
761    vpblendd         m1, m1, m4, 11110000b
762
763    pmaddwd          m2, m11
764    pmaddwd          m0, m12
765    pmaddwd          m1, m10
766    paddd            m2, m0
767    paddd            m2, m1
768    vextracti128    xm0, m2, 1
769    paddd           xm2, xm0
770
771    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
772    pshufd          xm4, xm0, q3321
773    pmovsxwd        xm4, xm4                ; y=0,x=[0,3] in dword
774.x_loop_ar2_inner:
775    pmaddwd         xm3, xm0, xm13
776    paddd           xm3, xm2
777    psrldq          xm2, 4                  ; shift top to next pixel
778    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
779    ; we do not need to packssdw since we only care about one value
780    paddd           xm3, xm4
781    pminsd          xm3, xm6
782    pmaxsd          xm3, xm5
783    pextrw  [bufq+xq*2], xm3, 0
784    psrldq          xm0, 2
785    pslldq          xm3, 2
786    psrldq          xm4, 4
787    vpblendw        xm0, xm3, 00000010b
788    inc              xq
789    jz .x_loop_ar2_end
790    test             xq, 3
791    jnz .x_loop_ar2_inner
792    jmp .x_loop_ar2
793
794.x_loop_ar2_end:
795    add            bufq, 82*2
796    add           bufyq, 82*2<<%3
797    dec              hd
798    jg .y_loop_ar2
799    RET
800
801.ar3:
802    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
803%if WIN64
804    mov              r6, rsp
805    and             rsp, ~31
806    sub             rsp, 96
807    %define         tmp  rsp
808%elif STACK_ALIGNMENT < 32
809    mov              r6, rsp
810    and              r6, ~31
811    %define         tmp  r6-96
812%else
813    %define         tmp  rsp+stack_offset-120
814%endif
815    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
816    imul            uvd, 28
817    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
818    sar          bdmaxd, 1
819    movd           xm15, bdmaxd
820    pcmpeqd        xm13, xm13
821    vpbroadcastd   xm15, xm15                   ; max_grain
822    pxor           xm13, xm15                   ; min_grain
823%if %2
824    vpbroadcastw   xm12, [base+hmul_bits+2+%3*2]
825%endif
826
827    movq            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
828    pinsrb          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7   ; luma
829    movhps          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
830    pmovsxbw         m0, xm0
831
832    pshufd          m11, m0, q3333
833    pshufd          m10, m0, q2222
834    pshufd           m9, m0, q1111
835    pshufd           m8, m0, q0000
836
837    movd            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
838    pinsrb          xm0, [pb_1], 3
839    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
840    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
841    pmovsxbw         m0, xm0
842
843    pshufd           m1, m0, q0000
844    pshufd           m2, m0, q1111
845    mova     [tmp+32*2], m11
846    pshufd         xm11, xm0, q3232
847    mova     [tmp+32*0], m1
848    mova     [tmp+32*1], m2
849    pinsrw         xm11, [base+round_vals-10+shiftq*2], 3
850
851    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
852%if %2
853    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
854%else
855    sub            bufq, 2*(82*69+3)
856%endif
857    add           bufyq, 2*(79+82*3)
858    mov              hd, 70-35*%3
859.y_loop_ar3:
860    mov              xq, -(76>>%2)
861
862.x_loop_ar3:
863    movu            xm0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
864    movq            xm1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+8]
865    movu            xm2, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
866    vinserti128      m0, [bufq+xq*2-82*4-6+ 0], 1   ; y=-3/-2,x=[-3,+4]
867    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1   ; y=-3/-2,x=[+5,+12]
868    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1   ; y=-1,x=[+1,+8]
869
870%if %2
871    movu           xm7, [bufyq+xq*4]
872%if %3
873    paddw          xm7, [bufyq+xq*4+82*2]
874%endif
875    phaddw         xm7, xm7
876%else
877    movq           xm7, [bufyq+xq*2]
878%endif
879
880    palignr         m4, m1, m0, 2                   ; y=-3/-2,x=[-2,+5]
881    palignr         m1, m0, 12                      ; y=-3/-2,x=[+3,+6]
882    punpckhwd       m5, m0, m4                      ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
883    punpcklwd       m0, m4                          ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
884    palignr         m6, m5, m0, 8                   ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
885%if %2
886    pmulhrsw       xm7, xm12
887%endif
888    punpcklwd       m1, m7
889
890    psrldq          m3, m2, 2
891    psrldq          m4, m2, 4
892    psrldq          m7, m2, 6
893    vpblendd        m7, m14, 00001111b              ; rounding constant
894    punpcklwd       m2, m3                          ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
895                                                    ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
896    punpcklwd       m4, m7                          ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
897                                                    ;      x=[+2/+3,+3/+4,+4/+5,+5,+6]
898
899    pmaddwd          m0, m8
900    pmaddwd          m6, m9
901    pmaddwd          m5, m10
902    pmaddwd          m1, [tmp+32*2]
903    pmaddwd          m2, [tmp+32*0]
904    pmaddwd          m4, [tmp+32*1]
905
906    paddd            m0, m6
907    paddd            m5, m2
908    paddd            m4, m1
909    paddd            m0, m4
910    paddd            m0, m5
911    vextracti128    xm4, m0, 1
912    paddd           xm0, xm4
913
914    movu            xm1, [bufq+xq*2-6]        ; y=0,x=[-3,+4]
915.x_loop_ar3_inner:
916    pmaddwd         xm2, xm1, xm11
917    pshufd          xm3, xm2, q1111
918    paddd           xm2, xm3                ; left+cur
919    paddd           xm2, xm0                ; add top
920    psrldq          xm0, 4
921    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
922    ; no need to packssdw since we only care about one value
923    pminsd          xm2, xm15
924    pmaxsd          xm2, xm13
925    pextrw  [bufq+xq*2], xm2, 0
926    pslldq          xm2, 4
927    psrldq          xm1, 2
928    vpblendw        xm1, xm2, 00000100b
929    inc              xq
930    jz .x_loop_ar3_end
931    test             xq, 3
932    jnz .x_loop_ar3_inner
933    jmp .x_loop_ar3
934
935.x_loop_ar3_end:
936    add            bufq, 82*2
937    add           bufyq, 82*2<<%3
938    dec              hd
939    jg .y_loop_ar3
940%if WIN64
941    mov             rsp, r6
942%endif
943    RET
944%endmacro
945
946generate_grain_uv_fn 420, 1, 1
947generate_grain_uv_fn 422, 1, 0
948generate_grain_uv_fn 444, 0, 0
949
950INIT_YMM avx2
951cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, grain_lut
952    mov             r7d, [fg_dataq+FGData.scaling_shift]
953    lea              r8, [pb_mask]
954%define base r8-pb_mask
955    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
956    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
957    mov             r9d, r9m        ; bdmax
958    sar             r9d, 11         ; is_12bpc
959    shlx           r10d, r6d, r9d
960    vpbroadcastw    m13, [base+min+r10*2]
961    lea             r9d, [r9d*3]
962    lea             r9d, [r6d*2+r9d]
963    vpbroadcastw    m12, [base+max+r9*2]
964    vpbroadcastw    m10, r9m
965    pxor             m2, m2
966
967    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
968                sby, see
969
970    movifnidn      sbyd, sbym
971    test           sbyd, sbyd
972    setnz           r7b
973    test            r7b, byte [fg_dataq+FGData.overlap_flag]
974    jnz .vertical_overlap
975
976    imul           seed, sbyd, (173 << 24) | 37
977    add            seed, (105 << 24) | 178
978    rol            seed, 8
979    movzx          seed, seew
980    xor            seed, [fg_dataq+FGData.seed]
981
982    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
983                unused1, unused2, see, src_bak
984
985    lea        src_bakq, [srcq+wq*2]
986    neg              wq
987    sub            dstq, srcq
988
989.loop_x:
990    mov             r6d, seed
991    or             seed, 0xEFF4
992    shr             r6d, 1
993    test           seeb, seeh
994    lea            seed, [r6+0x8000]
995    cmovp          seed, r6d                ; updated seed
996
997    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
998                offx, offy, see, src_bak
999
1000    mov           offxd, seed
1001    rorx          offyd, seed, 8
1002    shr           offxd, 12
1003    and           offyd, 0xf
1004    imul          offyd, 164
1005    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1006
1007    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1008                h, offxy, see, src_bak
1009
1010    mov              hd, hm
1011    mov      grain_lutq, grain_lutmp
1012.loop_y:
1013    ; src
1014    pminuw           m0, m10, [srcq+ 0]
1015    pminuw           m1, m10, [srcq+32]          ; m0-1: src as word
1016    punpckhwd        m5, m0, m2
1017    punpcklwd        m4, m0, m2
1018    punpckhwd        m7, m1, m2
1019    punpcklwd        m6, m1, m2             ; m4-7: src as dword
1020
1021    ; scaling[src]
1022    pcmpeqw          m3, m3
1023    mova             m9, m3
1024    vpgatherdd       m8, [scalingq+m4-3], m3
1025    vpgatherdd       m4, [scalingq+m5-3], m9
1026    pcmpeqw          m3, m3
1027    mova             m9, m3
1028    vpgatherdd       m5, [scalingq+m6-3], m3
1029    vpgatherdd       m6, [scalingq+m7-3], m9
1030    REPX  {psrld x, 24}, m8, m4, m5, m6
1031    packssdw         m8, m4
1032    packssdw         m5, m6
1033
1034    ; grain = grain_lut[offy+y][offx+x]
1035    movu             m9, [grain_lutq+offxyq*2]
1036    movu             m3, [grain_lutq+offxyq*2+32]
1037
1038    ; noise = round2(scaling[src] * grain, scaling_shift)
1039    REPX {pmullw x, m11}, m8, m5
1040    pmulhrsw         m9, m8
1041    pmulhrsw         m3, m5
1042
1043    ; dst = clip_pixel(src, noise)
1044    paddw            m0, m9
1045    paddw            m1, m3
1046    pmaxsw           m0, m13
1047    pmaxsw           m1, m13
1048    pminsw           m0, m12
1049    pminsw           m1, m12
1050    mova [dstq+srcq+ 0], m0
1051    mova [dstq+srcq+32], m1
1052
1053    add            srcq, strideq
1054    add      grain_lutq, 82*2
1055    dec              hd
1056    jg .loop_y
1057
1058    add              wq, 32
1059    jge .end
1060    lea            srcq, [src_bakq+wq*2]
1061    cmp byte [fg_dataq+FGData.overlap_flag], 0
1062    je .loop_x
1063
1064    ; r8m = sbym
1065    movq           xm15, [pw_27_17_17_27]
1066    cmp       dword r8m, 0
1067    jne .loop_x_hv_overlap
1068
1069    ; horizontal overlap (without vertical overlap)
1070    vpbroadcastd   xm14, [pd_16]
1071.loop_x_h_overlap:
1072    mov             r6d, seed
1073    or             seed, 0xEFF4
1074    shr             r6d, 1
1075    test           seeb, seeh
1076    lea            seed, [r6+0x8000]
1077    cmovp          seed, r6d                ; updated seed
1078
1079    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1080                offx, offy, see, src_bak, left_offxy
1081
1082    lea     left_offxyd, [offyd+32]         ; previous column's offy*stride+offx
1083    mov           offxd, seed
1084    rorx          offyd, seed, 8
1085    shr           offxd, 12
1086    and           offyd, 0xf
1087    imul          offyd, 164
1088    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1089
1090    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1091                h, offxy, see, src_bak, left_offxy
1092
1093    mov              hd, hm
1094    mov      grain_lutq, grain_lutmp
1095.loop_y_h_overlap:
1096    ; src
1097    pminuw           m0, m10, [srcq+ 0]
1098    pminuw           m1, m10, [srcq+32]          ; m0-1: src as word
1099    punpckhwd        m5, m0, m2
1100    punpcklwd        m4, m0, m2
1101    punpckhwd        m7, m1, m2
1102    punpcklwd        m6, m1, m2             ; m4-7: src as dword
1103
1104    ; scaling[src]
1105    pcmpeqw          m3, m3
1106    mova             m9, m3
1107    vpgatherdd       m8, [scalingq+m4-3], m3
1108    vpgatherdd       m4, [scalingq+m5-3], m9
1109    pcmpeqw          m3, m3
1110    mova             m9, m3
1111    vpgatherdd       m5, [scalingq+m6-3], m3
1112    vpgatherdd       m6, [scalingq+m7-3], m9
1113    REPX  {psrld x, 24}, m8, m4, m5, m6
1114    packssdw         m8, m4
1115    packssdw         m5, m6
1116
1117    ; grain = grain_lut[offy+y][offx+x]
1118    movu             m9, [grain_lutq+offxyq*2]
1119    movd            xm7, [grain_lutq+left_offxyq*2]
1120    punpcklwd       xm7, xm9
1121    pmaddwd         xm7, xm15
1122    paddd           xm7, xm14
1123    psrad           xm7, 5
1124    packssdw        xm7, xm7
1125    vpblendd         m9, m7, 00000001b
1126    pcmpeqw          m3, m3
1127    psraw            m7, m10, 1             ; max_grain
1128    pxor             m3, m7                 ; min_grain
1129    pminsw           m9, m7
1130    pmaxsw           m9, m3
1131    movu             m3, [grain_lutq+offxyq*2+32]
1132
1133    ; noise = round2(scaling[src] * grain, scaling_shift)
1134    REPX {pmullw x, m11}, m8, m5
1135    pmulhrsw         m9, m8
1136    pmulhrsw         m3, m5
1137
1138    ; dst = clip_pixel(src, noise)
1139    paddw            m0, m9
1140    paddw            m1, m3
1141    pmaxsw           m0, m13
1142    pmaxsw           m1, m13
1143    pminsw           m0, m12
1144    pminsw           m1, m12
1145    mova [dstq+srcq+ 0], m0
1146    mova [dstq+srcq+32], m1
1147
1148    add            srcq, strideq
1149    add      grain_lutq, 82*2
1150    dec              hd
1151    jg .loop_y_h_overlap
1152
1153    add              wq, 32
1154    jge .end
1155    lea            srcq, [src_bakq+wq*2]
1156
1157    ; r8m = sbym
1158    cmp       dword r8m, 0
1159    jne .loop_x_hv_overlap
1160    jmp .loop_x_h_overlap
1161
1162.end:
1163    RET
1164
1165.vertical_overlap:
1166    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1167                sby, see
1168
1169    movzx          sbyd, sbyb
1170    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1171    imul            r7d, sbyd, 173 * 0x00010001
1172    imul           sbyd, 37 * 0x01000100
1173    add             r7d, (105 << 16) | 188
1174    add            sbyd, (178 << 24) | (141 << 8)
1175    and             r7d, 0x00ff00ff
1176    and            sbyd, 0xff00ff00
1177    xor            seed, r7d
1178    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1179
1180    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1181                unused1, unused2, see, src_bak
1182
1183    lea        src_bakq, [srcq+wq*2]
1184    neg              wq
1185    sub            dstq, srcq
1186
1187    vpbroadcastd    m14, [pd_16]
1188.loop_x_v_overlap:
1189    vpbroadcastd    m15, [pw_27_17_17_27]
1190
1191    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1192    mov             r6d, seed
1193    or             seed, 0xeff4eff4
1194    test           seeb, seeh
1195    setp            r7b                     ; parity of top_seed
1196    shr            seed, 16
1197    shl             r7d, 16
1198    test           seeb, seeh
1199    setp            r7b                     ; parity of cur_seed
1200    or              r6d, 0x00010001
1201    xor             r7d, r6d
1202    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1203
1204    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1205                offx, offy, see, src_bak, unused, top_offxy
1206
1207    rorx          offyd, seed, 8
1208    rorx          offxd, seed, 12
1209    and           offyd, 0xf000f
1210    and           offxd, 0xf000f
1211    imul          offyd, 164
1212    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1213    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1214
1215    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1216                h, offxy, see, src_bak, unused, top_offxy
1217
1218    movzx    top_offxyd, offxyw
1219    shr          offxyd, 16
1220
1221    mov              hd, hm
1222    mov      grain_lutq, grain_lutmp
1223.loop_y_v_overlap:
1224    ; grain = grain_lut[offy+y][offx+x]
1225    movu             m3, [grain_lutq+offxyq*2]
1226    movu             m7, [grain_lutq+top_offxyq*2]
1227    punpckhwd        m9, m7, m3
1228    punpcklwd        m7, m3
1229    REPX {pmaddwd x, m15}, m9, m7
1230    REPX {paddd   x, m14}, m9, m7
1231    REPX {psrad   x, 5}, m9, m7
1232    packssdw         m7, m9
1233    pcmpeqw          m0, m0
1234    psraw            m1, m10, 1             ; max_grain
1235    pxor             m0, m1                 ; min_grain
1236    pminsw           m7, m1
1237    pmaxsw           m7, m0
1238    movu             m3, [grain_lutq+offxyq*2+32]
1239    movu             m8, [grain_lutq+top_offxyq*2+32]
1240    punpckhwd        m9, m8, m3
1241    punpcklwd        m8, m3
1242    REPX {pmaddwd x, m15}, m9, m8
1243    REPX {paddd   x, m14}, m9, m8
1244    REPX {psrad   x, 5}, m9, m8
1245    packssdw         m8, m9
1246    pminsw           m8, m1
1247    pmaxsw           m8, m0
1248
1249    ; src
1250    pminuw           m0, m10, [srcq+ 0]          ; m0-1: src as word
1251    punpckhwd        m5, m0, m2
1252    punpcklwd        m4, m0, m2
1253
1254    ; scaling[src]
1255    pcmpeqw          m3, m3
1256    mova             m9, m3
1257    vpgatherdd       m6, [scalingq+m4-3], m3
1258    vpgatherdd       m4, [scalingq+m5-3], m9
1259    REPX  {psrld x, 24}, m6, m4
1260    packssdw         m6, m4
1261
1262    ; noise = round2(scaling[src] * grain, scaling_shift)
1263    pmullw           m6, m11
1264    pmulhrsw         m6, m7
1265
1266    ; same for the other half
1267    pminuw           m1, m10, [srcq+32]          ; m0-1: src as word
1268    punpckhwd        m9, m1, m2
1269    punpcklwd        m4, m1, m2             ; m4-7: src as dword
1270    pcmpeqw          m3, m3
1271    mova             m7, m3
1272    vpgatherdd       m5, [scalingq+m4-3], m3
1273    vpgatherdd       m4, [scalingq+m9-3], m7
1274    REPX  {psrld x, 24}, m5, m4
1275    packssdw         m5, m4
1276
1277    pmullw           m5, m11
1278    pmulhrsw         m5, m8
1279
1280    ; dst = clip_pixel(src, noise)
1281    paddw            m0, m6
1282    paddw            m1, m5
1283    pmaxsw           m0, m13
1284    pmaxsw           m1, m13
1285    pminsw           m0, m12
1286    pminsw           m1, m12
1287    mova [dstq+srcq+ 0], m0
1288    mova [dstq+srcq+32], m1
1289
1290    vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
1291    add            srcq, strideq
1292    add      grain_lutq, 82*2
1293    dec              hw
1294    jz .end_y_v_overlap
1295    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1296    ; remaining (up to) 30 lines
1297    xor              hd, 0x10000
1298    test             hd, 0x10000
1299    jnz .loop_y_v_overlap
1300    jmp .loop_y
1301
1302.end_y_v_overlap:
1303    add              wq, 32
1304    jge .end_hv
1305    lea            srcq, [src_bakq+wq*2]
1306
1307    ; since fg_dataq.overlap is guaranteed to be set, we never jump
1308    ; back to .loop_x_v_overlap, and instead always fall-through to
1309    ; h+v overlap
1310
1311    movq           xm15, [pw_27_17_17_27]
1312.loop_x_hv_overlap:
1313    vpbroadcastd     m8, [pw_27_17_17_27]
1314
1315    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1316    mov             r6d, seed
1317    or             seed, 0xeff4eff4
1318    test           seeb, seeh
1319    setp            r7b                     ; parity of top_seed
1320    shr            seed, 16
1321    shl             r7d, 16
1322    test           seeb, seeh
1323    setp            r7b                     ; parity of cur_seed
1324    or              r6d, 0x00010001
1325    xor             r7d, r6d
1326    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1327
1328    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1329                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
1330
1331    lea  topleft_offxyq, [top_offxyq+32]
1332    lea     left_offxyq, [offyq+32]
1333    rorx          offyd, seed, 8
1334    rorx          offxd, seed, 12
1335    and           offyd, 0xf000f
1336    and           offxd, 0xf000f
1337    imul          offyd, 164
1338    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1339    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1340
1341    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1342                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
1343
1344    movzx    top_offxyd, offxyw
1345    shr          offxyd, 16
1346
1347    mov              hd, hm
1348    mov      grain_lutq, grain_lutmp
1349.loop_y_hv_overlap:
1350    ; grain = grain_lut[offy+y][offx+x]
1351    movu             m3, [grain_lutq+offxyq*2]
1352    movu             m0, [grain_lutq+offxyq*2+32]
1353    movu             m6, [grain_lutq+top_offxyq*2]
1354    movu             m1, [grain_lutq+top_offxyq*2+32]
1355    movd            xm4, [grain_lutq+left_offxyq*2]
1356    movd            xm7, [grain_lutq+topleft_offxyq*2]
1357    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1358    punpcklwd       xm4, xm3
1359    punpcklwd       xm7, xm6
1360    REPX {pmaddwd x, xm15}, xm4, xm7
1361    REPX {paddd   x, xm14}, xm4, xm7
1362    REPX {psrad   x, 5}, xm4, xm7
1363    REPX {packssdw x, x}, xm4, xm7
1364    pcmpeqw          m5, m5
1365    psraw            m9, m10, 1             ; max_grain
1366    pxor             m5, m9                 ; min_grain
1367    REPX {pminsw x, xm9}, xm4, xm7
1368    REPX {pmaxsw x, xm5}, xm4, xm7
1369    vpblendd         m3, m4, 00000001b
1370    vpblendd         m6, m7, 00000001b
1371    ; followed by v interpolation (top | cur -> cur)
1372    punpckhwd        m7, m6, m3
1373    punpcklwd        m6, m3
1374    punpckhwd        m3, m1, m0
1375    punpcklwd        m1, m0
1376    REPX {pmaddwd x, m8}, m7, m6, m3, m1
1377    REPX {paddd   x, m14}, m7, m6, m3, m1
1378    REPX {psrad   x, 5}, m7, m6, m3, m1
1379    packssdw         m7, m6, m7
1380    packssdw         m3, m1, m3
1381    REPX {pminsw x, m9}, m7, m3
1382    REPX {pmaxsw x, m5}, m7, m3
1383
1384    ; src
1385    pminuw           m0, m10, [srcq+ 0]
1386    pminuw           m1, m10, [srcq+32]          ; m0-1: src as word
1387    punpckhwd        m5, m0, m2
1388    punpcklwd        m4, m0, m2
1389
1390    ; scaling[src]
1391    pcmpeqw          m9, m9
1392    vpgatherdd       m6, [scalingq+m4-3], m9
1393    pcmpeqw          m9, m9
1394    vpgatherdd       m4, [scalingq+m5-3], m9
1395    REPX  {psrld x, 24}, m6, m4
1396    packssdw         m6, m4
1397
1398    ; noise = round2(scaling[src] * grain, scaling_shift)
1399    pmullw           m6, m11
1400    pmulhrsw         m7, m6
1401
1402    ; other half
1403    punpckhwd        m5, m1, m2
1404    punpcklwd        m4, m1, m2             ; m4-7: src as dword
1405
1406    ; scaling[src]
1407    pcmpeqw          m6, m6
1408    vpgatherdd       m9, [scalingq+m4-3], m6
1409    pcmpeqw          m6, m6
1410    vpgatherdd       m4, [scalingq+m5-3], m6
1411    REPX  {psrld x, 24}, m9, m4
1412    packssdw         m9, m4
1413
1414    ; noise = round2(scaling[src] * grain, scaling_shift)
1415    pmullw           m9, m11
1416    pmulhrsw         m3, m9
1417
1418    ; dst = clip_pixel(src, noise)
1419    paddw            m0, m7
1420    paddw            m1, m3
1421    pmaxsw           m0, m13
1422    pmaxsw           m1, m13
1423    pminsw           m0, m12
1424    pminsw           m1, m12
1425    mova [dstq+srcq+ 0], m0
1426    mova [dstq+srcq+32], m1
1427
1428    vpbroadcastd     m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
1429    add            srcq, strideq
1430    add      grain_lutq, 82*2
1431    dec              hw
1432    jz .end_y_hv_overlap
1433    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1434    ; remaining (up to) 30 lines
1435    xor              hd, 0x10000
1436    test             hd, 0x10000
1437    jnz .loop_y_hv_overlap
1438    jmp .loop_y_h_overlap
1439
1440.end_y_hv_overlap:
1441    add              wq, 32
1442    lea            srcq, [src_bakq+wq*2]
1443    jl .loop_x_hv_overlap
1444
1445.end_hv:
1446    RET
1447
1448%macro FGUV_FN 3 ; name, ss_hor, ss_ver
1449cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1450                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
1451%define base r8-pb_mask
1452    lea              r8, [pb_mask]
1453    mov             r7d, [fg_dataq+FGData.scaling_shift]
1454    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
1455    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1456    mov             r9d, r13m               ; bdmax
1457    sar             r9d, 11                 ; is_12bpc
1458    shlx           r10d, r6d, r9d
1459    vpbroadcastw    m13, [base+min+r10*2]
1460    lea            r10d, [r9d*3]
1461    mov            r11d, is_idm
1462    shlx            r6d, r6d, r11d
1463    add            r10d, r6d
1464    vpbroadcastw    m12, [base+max+r10*2]
1465    vpbroadcastw    m10, r13m
1466    pxor             m2, m2
1467
1468    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1469    jne .csfl
1470
1471%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
1472    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1473
1474%if %1
1475    mov             r7d, r11m
1476    vpbroadcastw     m0, [fg_dataq+FGData.uv_mult+r7*4]
1477    vpbroadcastw     m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
1478    punpcklwd       m14, m1, m0
1479    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
1480    vpbroadcastd     m9, [base+pw_4+r9*4]
1481    pmullw          m15, m9
1482%else
1483    vpbroadcastd    m14, [pd_16]
1484%if %2
1485    vpbroadcastq    m15, [pw_23_22]
1486%else
1487    vpbroadcastq    m15, [pw_27_17_17_27]
1488%endif
1489%endif
1490
1491    movifnidn      sbyd, sbym
1492    test           sbyd, sbyd
1493    setnz           r7b
1494    test            r7b, byte [fg_dataq+FGData.overlap_flag]
1495    jnz %%vertical_overlap
1496
1497    imul           seed, sbyd, (173 << 24) | 37
1498    add            seed, (105 << 24) | 178
1499    rol            seed, 8
1500    movzx          seed, seew
1501    xor            seed, [fg_dataq+FGData.seed]
1502
1503    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1504                unused2, unused3, see, unused4, unused5, unused6, luma, lstride
1505
1506    mov           lumaq, r9mp
1507    mov        lstrideq, r10mp
1508    lea             r10, [srcq+wq*2]
1509    lea             r11, [dstq+wq*2]
1510    lea             r12, [lumaq+wq*(2<<%2)]
1511    mov           r10mp, r10
1512    mov           r11mp, r11
1513    mov           r12mp, r12
1514    neg              wq
1515
1516%%loop_x:
1517    mov             r6d, seed
1518    or             seed, 0xEFF4
1519    shr             r6d, 1
1520    test           seeb, seeh
1521    lea            seed, [r6+0x8000]
1522    cmovp          seed, r6d               ; updated seed
1523
1524    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1525                offx, offy, see, unused1, unused2, unused3, luma, lstride
1526
1527    mov           offxd, seed
1528    rorx          offyd, seed, 8
1529    shr           offxd, 12
1530    and           offyd, 0xf
1531    imul          offyd, 164>>%3
1532    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
1533
1534    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1535                h, offxy, see, unused1, unused2, unused3, luma, lstride
1536
1537    mov              hd, hm
1538    mov      grain_lutq, grain_lutmp
1539%%loop_y:
1540    ; src
1541    mova             m0, [srcq]
1542%if %2
1543    mova             m1, [srcq+strideq]     ; m0-1: src as word
1544%else
1545    mova             m1, [srcq+32]
1546%endif
1547
1548    ; luma_src
1549%if %2
1550    mova            xm4, [lumaq+lstrideq*0+ 0]
1551    mova            xm7, [lumaq+lstrideq*0+16]
1552    vinserti128      m4, [lumaq+lstrideq*0+32], 1
1553    vinserti128      m7, [lumaq+lstrideq*0+48], 1
1554    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
1555    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
1556    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
1557    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
1558    phaddw           m4, m7
1559    phaddw           m6, m8
1560    pavgw            m4, m2
1561    pavgw            m6, m2
1562%else
1563    mova             m4, [lumaq]
1564    mova             m6, [lumaq+32]
1565%endif
1566
1567%if %1
1568    punpckhwd        m3, m4, m0
1569    punpcklwd        m4, m0
1570    punpckhwd        m5, m6, m1
1571    punpcklwd        m6, m1                 ; { luma, chroma }
1572    REPX {pmaddwd x, m14}, m3, m4, m5, m6
1573    REPX {psrad   x, 6}, m3, m4, m5, m6
1574    packssdw         m4, m3
1575    packssdw         m6, m5
1576    REPX {paddw x, m15}, m4, m6
1577    REPX {pmaxsw x, m2}, m4, m6
1578    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
1579%else
1580    REPX {pminuw x, m10}, m4, m6
1581%endif
1582
1583    punpckhwd        m5, m4, m2
1584    punpcklwd        m4, m2
1585    punpckhwd        m7, m6, m2
1586    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1587
1588    ; scaling[luma_src]
1589    pcmpeqw          m3, m3
1590    mova             m9, m3
1591    vpgatherdd       m8, [scalingq+m4-3], m3
1592    vpgatherdd       m4, [scalingq+m5-3], m9
1593    pcmpeqw          m3, m3
1594    mova             m9, m3
1595    vpgatherdd       m5, [scalingq+m6-3], m3
1596    vpgatherdd       m6, [scalingq+m7-3], m9
1597    REPX  {psrld x, 24}, m8, m4, m5, m6
1598    packssdw         m8, m4
1599    packssdw         m5, m6
1600
1601    ; grain = grain_lut[offy+y][offx+x]
1602    movu             m9, [grain_lutq+offxyq*2]
1603%if %2
1604    movu             m3, [grain_lutq+offxyq*2+82*2]
1605%else
1606    movu             m3, [grain_lutq+offxyq*2+32]
1607%endif
1608
1609    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1610    REPX {pmullw x, m11}, m8, m5
1611    pmulhrsw         m9, m8
1612    pmulhrsw         m3, m5
1613
1614    ; dst = clip_pixel(src, noise)
1615    paddw            m0, m9
1616    paddw            m1, m3
1617    pmaxsw           m0, m13
1618    pmaxsw           m1, m13
1619    pminsw           m0, m12
1620    pminsw           m1, m12
1621    mova         [dstq], m0
1622%if %2
1623    mova [dstq+strideq], m1
1624
1625    lea            srcq, [srcq+strideq*2]
1626    lea            dstq, [dstq+strideq*2]
1627    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1628%else
1629    mova      [dstq+32], m1
1630    add            srcq, strideq
1631    add            dstq, strideq
1632    add           lumaq, lstrideq
1633%endif
1634    add      grain_lutq, 82*(2<<%2)
1635%if %2
1636    sub              hb, 2
1637%else
1638    dec              hb
1639%endif
1640    jg %%loop_y
1641
1642    add              wq, 32>>%2
1643    jge %%end
1644    mov            srcq, r10mp
1645    mov            dstq, r11mp
1646    mov           lumaq, r12mp
1647    lea            srcq, [srcq+wq*2]
1648    lea            dstq, [dstq+wq*2]
1649    lea           lumaq, [lumaq+wq*(2<<%2)]
1650    cmp byte [fg_dataq+FGData.overlap_flag], 0
1651    je %%loop_x
1652
1653    ; r8m = sbym
1654    cmp       dword r8m, 0
1655    jne %%loop_x_hv_overlap
1656
1657    ; horizontal overlap (without vertical overlap)
1658%%loop_x_h_overlap:
1659    mov             r6d, seed
1660    or             seed, 0xEFF4
1661    shr             r6d, 1
1662    test           seeb, seeh
1663    lea            seed, [r6+0x8000]
1664    cmovp          seed, r6d               ; updated seed
1665
1666    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1667                offx, offy, see, left_offxy, unused1, unused2, luma, lstride
1668
1669    lea     left_offxyd, [offyd+(32>>%2)]         ; previous column's offy*stride+offx
1670    mov           offxd, seed
1671    rorx          offyd, seed, 8
1672    shr           offxd, 12
1673    and           offyd, 0xf
1674    imul          offyd, 164>>%3
1675    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1676
1677    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1678                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
1679
1680    mov              hd, hm
1681    mov      grain_lutq, grain_lutmp
1682%%loop_y_h_overlap:
1683    mova             m0, [srcq]
1684%if %2
1685    mova             m1, [srcq+strideq]
1686
1687    ; luma_src
1688    mova            xm4, [lumaq+lstrideq*0+ 0]
1689    mova            xm7, [lumaq+lstrideq*0+16]
1690    vinserti128      m4, [lumaq+lstrideq*0+32], 1
1691    vinserti128      m7, [lumaq+lstrideq*0+48], 1
1692    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
1693    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
1694    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
1695    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
1696    phaddw           m4, m7
1697    phaddw           m6, m8
1698    pavgw            m4, m2
1699    pavgw            m6, m2
1700%else
1701    mova             m1, [srcq+32]
1702
1703    ; luma_src
1704    mova             m4, [lumaq]
1705    mova             m6, [lumaq+32]
1706%endif
1707
1708%if %1
1709    punpckhwd        m3, m4, m0
1710    punpcklwd        m4, m0
1711    punpckhwd        m5, m6, m1
1712    punpcklwd        m6, m1                 ; { luma, chroma }
1713    REPX {pmaddwd x, m14}, m3, m4, m5, m6
1714    REPX {psrad   x, 6}, m3, m4, m5, m6
1715    packssdw         m4, m3
1716    packssdw         m6, m5
1717    REPX {paddw x, m15}, m4, m6
1718    REPX {pmaxsw x, m2}, m4, m6
1719    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
1720%else
1721    REPX {pminuw x, m10}, m4, m6
1722%endif
1723
1724    ; grain = grain_lut[offy+y][offx+x]
1725    movu             m9, [grain_lutq+offxyq*2]
1726%if %2
1727    movu             m3, [grain_lutq+offxyq*2+82*2]
1728%else
1729    movu             m3, [grain_lutq+offxyq*2+32]
1730%endif
1731    movd            xm5, [grain_lutq+left_offxyq*2+ 0]
1732%if %2
1733    pinsrw          xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
1734    punpckldq       xm7, xm9, xm3           ; {cur0, cur1}
1735    punpcklwd       xm5, xm7                ; {left0, cur0, left1, cur1}
1736%else
1737    punpcklwd       xm5, xm9
1738%endif
1739%if %1
1740%if %2
1741    vpbroadcastq    xm8, [pw_23_22]
1742%else
1743    movq            xm8, [pw_27_17_17_27]
1744%endif
1745    pmaddwd         xm5, xm8
1746    vpbroadcastd    xm8, [pd_16]
1747    paddd           xm5, xm8
1748%else
1749    pmaddwd         xm5, xm15
1750    paddd           xm5, xm14
1751%endif
1752    psrad           xm5, 5
1753    packssdw        xm5, xm5
1754    pcmpeqw         xm8, xm8
1755    psraw           xm7, xm10, 1
1756    pxor            xm8, xm7
1757    pmaxsw          xm5, xm8
1758    pminsw          xm5, xm7
1759    vpblendd         m9, m9, m5, 00000001b
1760%if %2
1761    psrldq          xm5, 4
1762    vpblendd         m3, m3, m5, 00000001b
1763%endif
1764
1765    ; scaling[luma_src]
1766    punpckhwd        m5, m4, m2
1767    punpcklwd        m4, m2
1768    pcmpeqw          m7, m7
1769    vpgatherdd       m8, [scalingq+m4-3], m7
1770    pcmpeqw          m7, m7
1771    vpgatherdd       m4, [scalingq+m5-3], m7
1772    REPX  {psrld x, 24}, m8, m4
1773    packssdw         m8, m4
1774
1775    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1776    pmullw           m8, m11
1777    pmulhrsw         m9, m8
1778
1779    ; same for the other half
1780    punpckhwd        m7, m6, m2
1781    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1782    pcmpeqw          m8, m8
1783    mova             m4, m8
1784    vpgatherdd       m5, [scalingq+m6-3], m8
1785    vpgatherdd       m6, [scalingq+m7-3], m4
1786    REPX  {psrld x, 24}, m5, m6
1787    packssdw         m5, m6
1788
1789    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1790    pmullw           m5, m11
1791    pmulhrsw         m3, m5
1792
1793    ; dst = clip_pixel(src, noise)
1794    paddw            m0, m9
1795    paddw            m1, m3
1796    pmaxsw           m0, m13
1797    pmaxsw           m1, m13
1798    pminsw           m0, m12
1799    pminsw           m1, m12
1800    mova         [dstq], m0
1801%if %2
1802    mova [dstq+strideq], m1
1803
1804    lea            srcq, [srcq+strideq*2]
1805    lea            dstq, [dstq+strideq*2]
1806    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1807%else
1808    mova      [dstq+32], m1
1809
1810    add            srcq, strideq
1811    add            dstq, strideq
1812    add           lumaq, lstrideq
1813%endif
1814
1815    add      grain_lutq, 82*(2<<%2)
1816%if %2
1817    sub              hb, 2
1818%else
1819    dec              hb
1820%endif
1821    jg %%loop_y_h_overlap
1822
1823    add              wq, 32>>%2
1824    jge %%end
1825    mov            srcq, r10mp
1826    mov            dstq, r11mp
1827    mov           lumaq, r12mp
1828    lea            srcq, [srcq+wq*2]
1829    lea            dstq, [dstq+wq*2]
1830    lea           lumaq, [lumaq+wq*(2<<%2)]
1831
1832    ; r8m = sbym
1833    cmp       dword r8m, 0
1834    jne %%loop_x_hv_overlap
1835    jmp %%loop_x_h_overlap
1836
1837%%end:
1838    RET
1839
1840%%vertical_overlap:
1841    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
1842                sby, see, unused1, unused2, unused3, lstride
1843
1844    movzx          sbyd, sbyb
1845    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1846    imul            r7d, sbyd, 173 * 0x00010001
1847    imul           sbyd, 37 * 0x01000100
1848    add             r7d, (105 << 16) | 188
1849    add            sbyd, (178 << 24) | (141 << 8)
1850    and             r7d, 0x00ff00ff
1851    and            sbyd, 0xff00ff00
1852    xor            seed, r7d
1853    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1854
1855    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1856                unused1, unused2, see, unused3, unused4, unused5, luma, lstride
1857
1858    mov           lumaq, r9mp
1859    mov        lstrideq, r10mp
1860    lea             r10, [srcq+wq*2]
1861    lea             r11, [dstq+wq*2]
1862    lea             r12, [lumaq+wq*(2<<%2)]
1863    mov           r10mp, r10
1864    mov           r11mp, r11
1865    mov           r12mp, r12
1866    neg              wq
1867
1868%%loop_x_v_overlap:
1869    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1870    mov             r6d, seed
1871    or             seed, 0xeff4eff4
1872    test           seeb, seeh
1873    setp            r7b                     ; parity of top_seed
1874    shr            seed, 16
1875    shl             r7d, 16
1876    test           seeb, seeh
1877    setp            r7b                     ; parity of cur_seed
1878    or              r6d, 0x00010001
1879    xor             r7d, r6d
1880    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1881
1882    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1883                offx, offy, see, unused1, top_offxy, unused2, luma, lstride
1884
1885    rorx          offyd, seed, 8
1886    rorx          offxd, seed, 12
1887    and           offyd, 0xf000f
1888    and           offxd, 0xf000f
1889    imul          offyd, 164>>%3
1890    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1891    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1892
1893    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1894                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
1895
1896    movzx    top_offxyd, offxyw
1897    shr          offxyd, 16
1898
1899%if %2 == 0
1900    lea             r10, [pw_27_17_17_27]
1901%endif
1902    mov              hd, hm
1903    mov      grain_lutq, grain_lutmp
1904%%loop_y_v_overlap:
1905    ; src
1906    mova             m0, [srcq]
1907%if %2
1908    mova             m1, [srcq+strideq]
1909
1910    ; luma_src
1911    mova            xm4, [lumaq+lstrideq*0+ 0]
1912    mova            xm7, [lumaq+lstrideq*0+16]
1913    vinserti128      m4, [lumaq+lstrideq*0+32], 1
1914    vinserti128      m7, [lumaq+lstrideq*0+48], 1
1915    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
1916    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
1917    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
1918    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
1919    phaddw           m4, m7
1920    phaddw           m6, m8
1921    pavgw            m4, m2
1922    pavgw            m6, m2
1923%else
1924    mova             m1, [srcq+32]
1925
1926    ; luma_src
1927    mova             m4, [lumaq]
1928    mova             m6, [lumaq+32]
1929%endif
1930
1931%if %1
1932    punpckhwd        m3, m4, m0
1933    punpcklwd        m4, m0
1934    punpckhwd        m5, m6, m1
1935    punpcklwd        m6, m1                 ; { luma, chroma }
1936    REPX {pmaddwd x, m14}, m3, m4, m5, m6
1937    REPX {psrad   x, 6}, m3, m4, m5, m6
1938    packssdw         m4, m3
1939    packssdw         m6, m5
1940    REPX {paddw x, m15}, m4, m6
1941    REPX {pmaxsw x, m2}, m4, m6
1942    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
1943%else
1944    REPX {pminuw x, m10}, m4, m6
1945%endif
1946
1947    ; grain = grain_lut[offy+y][offx+x]
1948    movu             m9, [grain_lutq+offxyq*2]
1949    movu             m5, [grain_lutq+top_offxyq*2]
1950    punpckhwd        m7, m5, m9
1951    punpcklwd        m5, m9                 ; {top/cur interleaved}
1952%if %3
1953    vpbroadcastd     m3, [pw_23_22]
1954%elif %2
1955    vpbroadcastd     m3, [pw_27_17_17_27]
1956%else
1957    vpbroadcastd     m3, [r10]
1958%endif
1959    REPX {pmaddwd x, m3}, m7, m5
1960%if %1
1961    vpbroadcastd     m8, [pd_16]
1962    REPX  {paddd x, m8}, m7, m5
1963%else
1964    REPX {paddd x, m14}, m7, m5
1965%endif
1966    REPX   {psrad x, 5}, m7, m5
1967    packssdw         m9, m5, m7
1968%if %2
1969    movu             m3, [grain_lutq+offxyq*2+82*2]
1970%else
1971    movu             m3, [grain_lutq+offxyq*2+32]
1972%endif
1973%if %3 == 0
1974%if %2
1975    movu             m5, [grain_lutq+top_offxyq*2+82*2]
1976%else
1977    movu             m5, [grain_lutq+top_offxyq*2+32]
1978%endif
1979    punpckhwd        m7, m5, m3
1980    punpcklwd        m5, m3                 ; {top/cur interleaved}
1981%if %2
1982    vpbroadcastd     m3, [pw_27_17_17_27+4]
1983%else
1984    vpbroadcastd     m3, [r10]
1985%endif
1986    REPX {pmaddwd x, m3}, m7, m5
1987%if %1
1988    REPX  {paddd x, m8}, m7, m5
1989%else
1990    REPX {paddd x, m14}, m7, m5
1991%endif
1992    REPX   {psrad x, 5}, m7, m5
1993    packssdw         m3, m5, m7
1994%endif ; %3 == 0
1995    pcmpeqw          m7, m7
1996    psraw            m5, m10, 1
1997    pxor             m7, m5
1998%if %3
1999    pmaxsw           m9, m7
2000    pminsw           m9, m5
2001%else
2002    REPX {pmaxsw x, m7}, m9, m3
2003    REPX {pminsw x, m5}, m9, m3
2004%endif
2005
2006    ; scaling[luma_src]
2007    punpckhwd        m5, m4, m2
2008    punpcklwd        m4, m2
2009    pcmpeqw          m7, m7
2010    vpgatherdd       m8, [scalingq+m4-3], m7
2011    pcmpeqw          m7, m7
2012    vpgatherdd       m4, [scalingq+m5-3], m7
2013    REPX  {psrld x, 24}, m8, m4
2014    packssdw         m8, m4
2015
2016    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2017    pmullw           m8, m11
2018    pmulhrsw         m9, m8
2019
2020    ; scaling for the other half
2021    punpckhwd        m7, m6, m2
2022    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
2023    pcmpeqw          m8, m8
2024    mova             m4, m8
2025    vpgatherdd       m5, [scalingq+m6-3], m8
2026    vpgatherdd       m6, [scalingq+m7-3], m4
2027    REPX  {psrld x, 24}, m5, m6
2028    packssdw         m5, m6
2029
2030    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2031    pmullw           m5, m11
2032    pmulhrsw         m3, m5
2033
2034    ; dst = clip_pixel(src, noise)
2035    paddw            m0, m9
2036    paddw            m1, m3
2037    pmaxsw           m0, m13
2038    pmaxsw           m1, m13
2039    pminsw           m0, m12
2040    pminsw           m1, m12
2041    mova         [dstq], m0
2042%if %2
2043    mova [dstq+strideq], m1
2044
2045    sub              hb, 2
2046%else
2047    mova      [dstq+32], m1
2048    dec              hb
2049%endif
2050    jle %%end_y_v_overlap
2051%if %2
2052    lea            srcq, [srcq+strideq*2]
2053    lea            dstq, [dstq+strideq*2]
2054    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
2055%else
2056    add            srcq, strideq
2057    add            dstq, strideq
2058    add           lumaq, lstrideq
2059%endif
2060    add      grain_lutq, 82*(2<<%2)
2061%if %2
2062    jmp %%loop_y
2063%else
2064    btc              hd, 16
2065    jc %%loop_y
2066    add             r10, 4
2067    jmp %%loop_y_v_overlap
2068%endif
2069
2070%%end_y_v_overlap:
2071    add              wq, 32>>%2
2072    jge %%end_hv
2073    mov            srcq, r10mp
2074    mov            dstq, r11mp
2075    mov           lumaq, r12mp
2076    lea            srcq, [srcq+wq*2]
2077    lea            dstq, [dstq+wq*2]
2078    lea           lumaq, [lumaq+wq*(2<<%2)]
2079
2080    ; since fg_dataq.overlap is guaranteed to be set, we never jump
2081    ; back to .loop_x_v_overlap, and instead always fall-through to
2082    ; h+v overlap
2083
2084%%loop_x_hv_overlap:
2085    ; we assume from the block above that bits 8-15 of r7d are zero'ed
2086    mov             r6d, seed
2087    or             seed, 0xeff4eff4
2088    test           seeb, seeh
2089    setp            r7b                     ; parity of top_seed
2090    shr            seed, 16
2091    shl             r7d, 16
2092    test           seeb, seeh
2093    setp            r7b                     ; parity of cur_seed
2094    or              r6d, 0x00010001
2095    xor             r7d, r6d
2096    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
2097
2098    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2099                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
2100
2101%if %2 == 0
2102    lea             r12, [pw_27_17_17_27]
2103    mov           r13mp, r12
2104%endif
2105    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
2106    lea     left_offxyq, [offyq+(32>>%2)]
2107    rorx          offyd, seed, 8
2108    rorx          offxd, seed, 12
2109    and           offyd, 0xf000f
2110    and           offxd, 0xf000f
2111    imul          offyd, 164>>%3
2112    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2113    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2114
2115    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2116                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
2117
2118    movzx    top_offxyd, offxyw
2119    shr          offxyd, 16
2120
2121    mov              hd, hm
2122    mov      grain_lutq, grain_lutmp
2123%%loop_y_hv_overlap:
2124    ; grain = grain_lut[offy+y][offx+x]
2125    movd            xm5, [grain_lutq+left_offxyq*2]
2126%if %2
2127    pinsrw          xm5, [grain_lutq+left_offxyq*2+82*2], 2
2128%if %3
2129    vinserti128      m5, [grain_lutq+topleft_offxyq*2], 1   ; { left0, left1, top/left }
2130%else
2131    ; insert both top/left lines
2132    movd            xm9, [grain_lutq+topleft_offxyq*2+82*2]
2133    pinsrw          xm9, [grain_lutq+topleft_offxyq*2], 2
2134    vinserti128      m5, xm9, 1
2135%endif
2136%else
2137    pinsrd          xm5, [grain_lutq+topleft_offxyq*2], 1
2138%endif
2139    movu             m9, [grain_lutq+offxyq*2]
2140%if %2
2141    movu             m3, [grain_lutq+offxyq*2+82*2]
2142%else
2143    movu             m3, [grain_lutq+offxyq*2+32]
2144%endif
2145    movu             m8, [grain_lutq+top_offxyq*2]
2146%if %2
2147    punpckldq       xm7, xm9, xm3           ; { cur0, cur1 }
2148%if %3
2149    vinserti128      m7, xm8, 1             ; { cur0, cur1, top0 }
2150%else
2151    ; insert both top lines
2152    movu             m1, [grain_lutq+top_offxyq*2+82*2]
2153    punpckldq       xm0, xm1, xm8
2154    vinserti128      m7, xm0, 1
2155%endif
2156%else
2157    movu             m1, [grain_lutq+top_offxyq*2+32]
2158    punpckldq       xm7, xm9, xm8
2159%endif
2160    punpcklwd        m5, m7                 ; { cur/left } interleaved
2161%if %2
2162%if %1
2163    vpbroadcastq     m0, [pw_23_22]
2164    pmaddwd          m5, m0
2165    vpbroadcastd     m0, [pd_16]
2166    paddd            m5, m0
2167%else
2168    pmaddwd          m5, m15
2169    paddd            m5, m14
2170%endif
2171    psrad            m5, 5
2172    vextracti128    xm0, m5, 1
2173    packssdw        xm5, xm0
2174%else
2175%if %1
2176    movddup         xm0, [pw_27_17_17_27]
2177    pmaddwd         xm5, xm0
2178    vpbroadcastd     m0, [pd_16]
2179    paddd           xm5, xm0
2180%else
2181    pmaddwd         xm5, xm15
2182    paddd           xm5, xm14
2183%endif
2184    psrad           xm5, 5
2185    packssdw        xm5, xm5
2186%endif
2187    pcmpeqw          m0, m0
2188    psraw            m7, m10, 1
2189    pxor             m0, m7
2190    pminsw          xm5, xm7
2191    pmaxsw          xm5, xm0
2192    vpblendd         m9, m9, m5, 00000001b
2193%if %2
2194    psrldq          xm5, 4
2195    vpblendd         m3, m3, m5, 00000001b
2196%if %3 == 0
2197    psrldq          xm5, 4
2198    vpblendd         m1, m1, m5, 00000001b
2199%endif
2200%endif
2201    psrldq          xm5, 4
2202    vpblendd         m5, m8, m5, 00000001b
2203
2204    punpckhwd        m8, m5, m9
2205    punpcklwd        m5, m9                 ; {top/cur interleaved}
2206%if %3
2207    vpbroadcastd     m9, [pw_23_22]
2208%elif %2
2209    vpbroadcastd     m9, [pw_27_17_17_27]
2210%else
2211    xchg            r12, r13mp
2212    vpbroadcastd     m9, [r12]
2213%endif
2214    REPX {pmaddwd x, m9}, m8, m5
2215%if %1
2216    vpbroadcastd     m4, [pd_16]
2217    REPX  {paddd x, m4}, m8, m5
2218%else
2219    REPX {paddd x, m14}, m8, m5
2220%endif
2221    REPX   {psrad x, 5}, m8, m5
2222    packssdw         m9, m5, m8
2223%if %3
2224    pminsw           m9, m7
2225    pmaxsw           m9, m0
2226%else
2227    punpckhwd        m8, m1, m3
2228    punpcklwd        m1, m3                 ; {top/cur interleaved}
2229%if %2
2230    vpbroadcastd     m3, [pw_27_17_17_27+4]
2231%else
2232    vpbroadcastd     m3, [r12]
2233    xchg            r12, r13mp
2234%endif
2235    REPX {pmaddwd x, m3}, m8, m1
2236%if %1
2237    REPX  {paddd x, m4}, m8, m1
2238%else
2239    REPX {paddd x, m14}, m8, m1
2240%endif
2241    REPX   {psrad x, 5}, m8, m1
2242    packssdw         m3, m1, m8
2243    REPX {pminsw x, m7}, m9, m3
2244    REPX {pmaxsw x, m0}, m9, m3
2245%endif
2246
2247    ; src
2248    mova             m0, [srcq]
2249%if %2
2250    mova             m1, [srcq+strideq]
2251%else
2252    mova             m1, [srcq+32]
2253%endif
2254
2255    ; luma_src
2256%if %2
2257    mova            xm4, [lumaq+lstrideq*0+ 0]
2258    mova            xm7, [lumaq+lstrideq*0+16]
2259    vinserti128      m4, [lumaq+lstrideq*0+32], 1
2260    vinserti128      m7, [lumaq+lstrideq*0+48], 1
2261    mova            xm6, [lumaq+lstrideq*(1<<%3)+ 0]
2262    mova            xm8, [lumaq+lstrideq*(1<<%3)+16]
2263    vinserti128      m6, [lumaq+lstrideq*(1<<%3)+32], 1
2264    vinserti128      m8, [lumaq+lstrideq*(1<<%3)+48], 1
2265    phaddw           m4, m7
2266    phaddw           m6, m8
2267    pavgw            m4, m2
2268    pavgw            m6, m2
2269%else
2270    mova             m4, [lumaq]
2271    mova             m6, [lumaq+32]
2272%endif
2273
2274%if %1
2275    punpckhwd        m8, m4, m0
2276    punpcklwd        m4, m0
2277    punpckhwd        m5, m6, m1
2278    punpcklwd        m6, m1                 ; { luma, chroma }
2279    REPX {pmaddwd x, m14}, m8, m4, m5, m6
2280    REPX {psrad   x, 6}, m8, m4, m5, m6
2281    packssdw         m4, m8
2282    packssdw         m6, m5
2283    REPX {paddw x, m15}, m4, m6
2284    REPX {pmaxsw x, m2}, m4, m6
2285    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
2286%else
2287    REPX {pminuw x, m10}, m4, m6
2288%endif
2289
2290    ; scaling[luma_src]
2291    punpckhwd        m5, m4, m2
2292    punpcklwd        m4, m2
2293    pcmpeqw          m7, m7
2294    vpgatherdd       m8, [scalingq+m4-3], m7
2295    pcmpeqw          m7, m7
2296    vpgatherdd       m4, [scalingq+m5-3], m7
2297    REPX  {psrld x, 24}, m8, m4
2298    packssdw         m8, m4
2299
2300    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2301    pmullw           m8, m11
2302    pmulhrsw         m9, m8
2303
2304    ; same for the other half
2305    punpckhwd        m7, m6, m2
2306    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
2307    pcmpeqw          m8, m8
2308    mova             m4, m8
2309    vpgatherdd       m5, [scalingq+m6-3], m8
2310    vpgatherdd       m6, [scalingq+m7-3], m4
2311    REPX  {psrld x, 24}, m5, m6
2312    packssdw         m5, m6
2313
2314    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2315    pmullw           m5, m11
2316    pmulhrsw         m3, m5
2317
2318    ; dst = clip_pixel(src, noise)
2319    paddw            m0, m9
2320    paddw            m1, m3
2321    pmaxsw           m0, m13
2322    pmaxsw           m1, m13
2323    pminsw           m0, m12
2324    pminsw           m1, m12
2325    mova         [dstq], m0
2326%if %2
2327    mova [dstq+strideq], m1
2328
2329    lea            srcq, [srcq+strideq*2]
2330    lea            dstq, [dstq+strideq*2]
2331    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
2332%else
2333    mova      [dstq+32], m1
2334
2335    add            srcq, strideq
2336    add            dstq, strideq
2337    add           lumaq, lstrideq
2338%endif
2339    add      grain_lutq, 82*(2<<%2)
2340%if %2
2341    sub              hb, 2
2342    jg %%loop_y_h_overlap
2343%else
2344    dec              hb
2345    jle %%end_y_hv_overlap
2346    btc              hd, 16
2347    jc %%loop_y_h_overlap
2348    add           r13mp, 4
2349    jmp %%loop_y_hv_overlap
2350%endif
2351
2352%%end_y_hv_overlap:
2353    add              wq, 32>>%2
2354    jge %%end_hv
2355    mov            srcq, r10mp
2356    mov            dstq, r11mp
2357    mov           lumaq, r12mp
2358    lea            srcq, [srcq+wq*2]
2359    lea            dstq, [dstq+wq*2]
2360    lea           lumaq, [lumaq+wq*(2<<%2)]
2361    jmp %%loop_x_hv_overlap
2362
2363%%end_hv:
2364    RET
2365%endmacro
2366
2367    %%FGUV_32x32xN_LOOP 1, %2, %3
2368.csfl:
2369    %%FGUV_32x32xN_LOOP 0, %2, %3
2370%endmacro
2371
2372FGUV_FN 420, 1, 1
2373FGUV_FN 422, 1, 0
2374FGUV_FN 444, 0, 0
2375%endif ; ARCH_X86_64
2376