1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30SECTION_RODATA 16
31pd_16: times 4 dd 16
32pw_1: times 8 dw 1
33pw_16384: times 8 dw 16384
34pw_8192: times 8 dw 8192
35pw_23_22: dw 23, 22
36          times 3 dw 0, 32
37pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
38pw_27_17_17_27: dw 27, 17, 17, 27
39                times 2 dw 0, 32
40rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
41pw_seed_xor: times 2 dw 0xb524
42             times 2 dw 0x49d8
43pb_1: times 4 db 1
44hmul_bits: dw 32768, 16384, 8192, 4096
45round: dw 2048, 1024, 512
46mul_bits: dw 256, 128, 64, 32, 16
47round_vals: dw 32, 64, 128, 256, 512, 1024
48max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
49min: dw 0, 16*4, 16*16
50; these two should be next to each other
51pw_4: times 2 dw 4
52pw_16: times 2 dw 16
53
54%macro JMP_TABLE 1-*
55    %xdefine %1_table %%table
56    %xdefine %%base %1_table
57    %xdefine %%prefix mangle(private_prefix %+ _%1)
58    %%table:
59    %rep %0 - 1
60        dd %%prefix %+ .ar%2 - %%base
61        %rotate 1
62    %endrep
63%endmacro
64
65JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
66JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
67JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
68JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
69
70SECTION .text
71
72%if ARCH_X86_32
73%undef base
74%define PIC_ptr(a) base+a
75%else
76%define PIC_ptr(a) a
77%endif
78
79%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
80
81%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
82%assign %%idx 0
83%define %%tmp %2
84%if %0 == 8
85%define %%tmp %8
86%endif
87%rep (%6/2)
88%if %%idx == 0
89    movd        %5 %+ d, %2
90    pshuflw       %%tmp, %2, q3232
91%else
92    movd        %5 %+ d, %%tmp
93%if %6 == 8
94%if %%idx == 2
95    punpckhqdq    %%tmp, %%tmp
96%elif %%idx == 4
97    psrlq         %%tmp, 32
98%endif
99%endif
100%endif
101    movzx       %4 %+ d, %5 %+ w
102    shr         %5 %+ d, 16
103
104%if %%idx == 0
105    movd             %1, [%3+%4*%7]
106%else
107    pinsrw           %1, [%3+%4*%7], %%idx + 0
108%endif
109    pinsrw           %1, [%3+%5*%7], %%idx + 1
110%assign %%idx %%idx+2
111%endrep
112%endmacro
113
114%macro SPLATD 2 ; dst, src
115%ifnidn %1, %2
116    movd %1, %2
117%endif
118    pshufd %1, %1, q0000
119%endmacro
120
121%macro SPLATW 2 ; dst, src
122%ifnidn %1, %2
123    movd %1, %2
124%endif
125    pshuflw %1, %1, q0000
126    punpcklqdq %1, %1
127%endmacro
128
129
130INIT_XMM ssse3
131%if ARCH_X86_64
132cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
133    lea              r4, [pb_mask]
134%define base r4-pb_mask
135%else
136cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
137    LEA              r4, $$
138%define base r4-$$
139%endif
140    movq             m1, [base+rnd_next_upperbit_mask]
141    movq             m4, [base+mul_bits]
142    movq             m7, [base+hmul_bits]
143    mov             r3d, [fg_dataq+FGData.grain_scale_shift]
144    lea             r5d, [bdmaxq+1]
145    shr             r5d, 11             ; 0 for 10bpc, 2 for 12bpc
146    sub              r3, r5
147    SPLATW           m6, [base+round+r3*2-2]
148    mova             m5, [base+pb_mask]
149    SPLATW           m0, [fg_dataq+FGData.seed]
150    mov              r3, -73*82*2
151    sub            bufq, r3
152%if ARCH_X86_64
153    lea              r6, [gaussian_sequence]
154%endif
155.loop:
156    pand             m2, m0, m1
157    psrlw            m3, m2, 10
158    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
159    pmullw           m2, m4             ; bits 0x0f00 are set
160    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
161    psllq            m2, m3, 30
162    por              m2, m3
163    psllq            m3, m2, 15
164    por              m2, m3             ; aggregate each bit into next seed's high bit
165    pmulhuw          m3, m0, m7
166    por              m2, m3             ; 4 next output seeds
167    pshuflw          m0, m2, q3333
168    psrlw            m2, 5
169%if ARCH_X86_64
170    vpgatherdw       m3, m2, r6, r5, r7, 4, 2
171%else
172    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r2, 4, 2
173%endif
174    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
175                                        ; shifts by 0, which pmulhrsw does not support
176    pmulhrsw         m3, m6
177    movq      [bufq+r3], m3
178    add              r3, 4*2
179    jl .loop
180
181    ; auto-regression code
182    movsxd           r3, [fg_dataq+FGData.ar_coeff_lag]
183    movsxd           r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
184    lea              r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
185    jmp              r3
186
187.ar1:
188%if WIN64
189    DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
190    lea            bufq, [r0-2*(82*73-(82*3+79))]
191    PUSH             r8
192%else
193%if ARCH_X86_64
194    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
195%else ; x86-32
196    DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
197    PUSH             r6
198%define shiftd r1d
199%endif
200    sub            bufq, 2*(82*73-(82*3+79))
201%endif
202    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
203    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
204    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
205%if WIN64
206    DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
207%elif ARCH_X86_64
208    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
209%else ; x86-32
210%undef shiftd
211    DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
212%define hd dword r0m
213%define maxd dword minm
214%endif
215%if cpuflag(sse4)
216    pmovsxbw         m4, m4
217%else
218    pxor             m3, m3
219    pcmpgtb          m3, m4
220    punpcklbw        m4, m3
221%endif
222    pinsrw           m4, [base+pw_1], 3
223    pshufd           m5, m4, q1111
224    pshufd           m4, m4, q0000
225    SPLATW           m3, [base+round_vals+shiftq*2-12]    ; rnd
226    mov              hd, 70
227    sar            maxd, 1
228    mov            mind, maxd
229    xor            mind, -1
230.y_loop_ar1:
231    mov              xq, -76
232    movsx         val3d, word [bufq+xq*2-2]
233.x_loop_ar1:
234    movu             m0, [bufq+xq*2-82*2-2]     ; top/left
235    psrldq           m2, m0, 2                  ; top
236    psrldq           m1, m0, 4                  ; top/right
237    punpcklwd        m0, m2
238    punpcklwd        m1, m3
239    pmaddwd          m0, m4
240    pmaddwd          m1, m5
241    paddd            m0, m1
242.x_loop_ar1_inner:
243    movd          val0d, m0
244    psrldq           m0, 4
245    imul          val3d, cf3d
246    add           val3d, val0d
247    sar           val3d, shiftb
248    movsx         val0d, word [bufq+xq*2]
249    add           val3d, val0d
250    cmp           val3d, maxd
251    cmovg         val3d, maxd
252    cmp           val3d, mind
253    cmovl         val3d, mind
254    mov word [bufq+xq*2], val3w
255    ; keep val3d in-place as left for next x iteration
256    inc              xq
257    jz .x_loop_ar1_end
258    test             xq, 3
259    jnz .x_loop_ar1_inner
260    jmp .x_loop_ar1
261
262.x_loop_ar1_end:
263    add            bufq, 82*2
264    dec              hd
265    jg .y_loop_ar1
266%if WIN64
267    POP              r8
268%elif ARCH_X86_32
269    POP              r6
270%undef maxd
271%undef hd
272%endif
273.ar0:
274    RET
275
276.ar2:
277%if ARCH_X86_32
278%assign stack_offset_old stack_offset
279    ALLOC_STACK -16*8
280%endif
281    DEFINE_ARGS buf, fg_data, bdmax, shift
282    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
283    movd             m0, [base+round_vals-12+shiftq*2]
284    pshuflw          m0, m0, q0000
285    movu             m6, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-11
286    pxor             m2, m2
287    punpcklwd        m0, m2
288    pcmpgtb          m2, m6
289    punpckhbw        m3, m6, m2
290    punpcklbw        m6, m2
291    pshufd           m2, m6, q3333
292    pshufd           m1, m6, q2222
293    pshufd           m7, m6, q1111
294    pshufd           m6, m6, q0000
295    pshufd           m4, m3, q1111
296    pshufd           m3, m3, q0000
297%if ARCH_X86_64
298    SWAP              0, 12
299    SWAP              1, 8
300    SWAP              2, 9
301    SWAP              3, 10
302    SWAP              4, 11
303%else
304%define m12 [rsp+0*16]
305%define m8 [rsp+1*16]
306%define m9 [rsp+2*16]
307%define m10 [rsp+3*16]
308%define m11 [rsp+4*16]
309    mova            m12, m0
310    mova             m8, m1
311    mova             m9, m2
312    mova            m10, m3
313    mova            m11, m4
314    mov          bdmaxd, bdmaxm
315%endif
316    sar          bdmaxd, 1
317    SPLATW           m0, bdmaxd                             ; max_grain
318    pcmpeqw          m1, m1
319%if !cpuflag(sse4)
320    pcmpeqw          m2, m2
321    psrldq           m2, 14
322    pslldq           m2, 2
323    pxor             m2, m1
324%endif
325    pxor             m1, m0                                 ; min_grain
326%if ARCH_X86_64
327    SWAP              0, 13
328    SWAP              1, 14
329    SWAP              2, 15
330%else
331%define m13 [rsp+5*16]
332%define m14 [rsp+6*16]
333    mova            m13, m0
334    mova            m14, m1
335%if !cpuflag(sse4)
336%define m15 [rsp+7*16]
337    mova            m15, m2
338%endif
339%endif
340    sub            bufq, 2*(82*73-(82*3+79))
341    DEFINE_ARGS buf, fg_data, h, x
342    mov              hd, 70
343.y_loop_ar2:
344    mov              xq, -76
345
346.x_loop_ar2:
347    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
348    movu             m1, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
349    psrldq           m2, m0, 2
350    psrldq           m3, m0, 4
351    psrldq           m4, m0, 6
352    psrldq           m5, m0, 8
353    punpcklwd        m0, m2
354    punpcklwd        m3, m4
355    punpcklwd        m5, m1
356    psrldq           m2, m1, 2
357    psrldq           m4, m1, 4
358    punpcklwd        m2, m4
359    psrldq           m4, m1, 6
360    psrldq           m1, 8
361    punpcklwd        m4, m1
362    pmaddwd          m0, m6
363    pmaddwd          m3, m7
364    pmaddwd          m5, m8
365    pmaddwd          m2, m9
366    pmaddwd          m4, m10
367    paddd            m0, m3
368    paddd            m5, m2
369    paddd            m0, m4
370    paddd            m0, m5                     ; accumulated top 2 rows
371    paddd            m0, m12
372
373    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
374    pshufd           m4, m1, q3321
375    pxor             m2, m2
376    pcmpgtw          m2, m4
377    punpcklwd        m4, m2                 ; in dwords, y=0,x=[0,3]
378.x_loop_ar2_inner:
379    pmaddwd          m2, m1, m11
380    paddd            m2, m0
381    psrldq           m0, 4                  ; shift top to next pixel
382    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
383    paddd            m2, m4
384    packssdw         m2, m2
385    pminsw           m2, m13
386    pmaxsw           m2, m14
387    psrldq           m4, 4
388    pslldq           m2, 2
389    psrldq           m1, 2
390%if cpuflag(sse4)
391    pblendw          m1, m2, 00000010b
392%else
393    pand             m1, m15
394    pandn            m3, m15, m2
395    por              m1, m3
396%endif
397    ; overwrite previous pixel, this should be ok
398    movd  [bufq+xq*2-2], m1
399    inc              xq
400    jz .x_loop_ar2_end
401    test             xq, 3
402    jnz .x_loop_ar2_inner
403    jmp .x_loop_ar2
404
405.x_loop_ar2_end:
406    add            bufq, 82*2
407    dec              hd
408    jg .y_loop_ar2
409%if ARCH_X86_32
410%undef m8
411%undef m9
412%undef m10
413%undef m11
414%undef m12
415%undef m13
416%undef m14
417%undef m15
418%endif
419    RET
420
421.ar3:
422    DEFINE_ARGS buf, fg_data, bdmax, shift
423%if WIN64
424    mov              r6, rsp
425    and             rsp, ~15
426    sub             rsp, 64
427    %define         tmp  rsp
428%elif ARCH_X86_64
429    %define         tmp  rsp+stack_offset-72
430%else
431%assign stack_offset stack_offset_old
432    ALLOC_STACK  -16*12
433    %define         tmp  rsp
434    mov          bdmaxd, bdmaxm
435%endif
436    sar          bdmaxd, 1
437    SPLATW           m7, bdmaxd                                 ; max_grain
438    pcmpeqw          m6, m6
439%if !cpuflag(sse4)
440    pcmpeqw          m4, m4
441    psrldq           m4, 14
442    pslldq           m4, 4
443    pxor             m4, m6
444%endif
445    pxor             m6, m7                                    ; min_grain
446    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
447
448%if ARCH_X86_64
449    SWAP              6, 14
450    SWAP              7, 15
451%else
452%define m14 [rsp+10*16]
453%define m15 [esp+11*16]
454    mova            m14, m6
455    mova            m15, m7
456%endif
457
458    ; build cf0-1 until 18-19 in m5-12 and r0/1
459    pxor             m1, m1
460    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]       ; cf0-15
461    pcmpgtb          m1, m0
462    punpckhbw        m2, m0, m1
463    punpcklbw        m0, m1
464
465%if cpuflag(sse4)
466    pshufd           m4, m2, q3333
467%else
468    pshufd           m5, m2, q3333
469    mova       [tmp+48], m5
470%endif
471    pshufd           m3, m2, q2222
472    pshufd           m1, m2, q0000
473    pshufd           m2, m2, q1111
474    pshufd           m7, m0, q2222
475    pshufd           m6, m0, q1111
476    pshufd           m5, m0, q0000
477    pshufd           m0, m0, q3333
478
479%if ARCH_X86_64
480    SWAP              0, 8
481    SWAP              1, 9
482    SWAP              2, 10
483    SWAP              3, 11
484    SWAP              4, 12
485%else
486%define m8 [rsp+4*16]
487%define m9 [esp+5*16]
488%define m10 [rsp+6*16]
489%define m11 [esp+7*16]
490%define m12 [rsp+8*16]
491    mova             m8, m0
492    mova             m9, m1
493    mova            m10, m2
494    mova            m11, m3
495    mova            m12, m4
496%endif
497
498    ; build cf20,round in r2
499    ; build cf21-23,round*2 in m13
500    pxor             m1, m1
501    movq             m0, [fg_dataq+FGData.ar_coeffs_y+16]       ; cf16-23
502    pcmpgtb          m1, m0
503    punpcklbw        m0, m1
504    pshufd           m1, m0, q0000
505    pshufd           m2, m0, q1111
506    mova       [tmp+ 0], m1
507    mova       [tmp+16], m2
508    psrldq           m3, m0, 10
509    pinsrw           m3, [base+round_vals+shiftq*2-10], 3
510
511%if ARCH_X86_64
512    SWAP              3, 13
513%else
514%define m13 [esp+9*16]
515    mova            m13, m3
516%endif
517
518    pinsrw           m0, [base+round_vals+shiftq*2-12], 5
519    pshufd           m3, m0, q2222
520    mova       [tmp+32], m3
521
522    DEFINE_ARGS buf, fg_data, h, x
523    sub            bufq, 2*(82*73-(82*3+79))
524    mov              hd, 70
525.y_loop_ar3:
526    mov              xq, -76
527
528.x_loop_ar3:
529    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
530    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
531    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
532    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
533    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
534    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
535    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
536
537    pmaddwd          m0, m5
538    pmaddwd          m2, m6
539    pmaddwd          m3, m7
540    paddd            m0, m2
541    paddd            m0, m3
542    ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
543
544    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
545    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
546    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
547    palignr          m4, m3, m2, 2                  ; y=-3,x=[-2,+5]
548    palignr          m3, m3, m2, 4                  ; y=-3,x=[-1,+6]
549    punpckhwd        m2, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
550    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
551    shufps           m3, m4, m2, q1032              ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
552
553    pmaddwd          m1, m8
554    pmaddwd          m4, m9
555    pmaddwd          m3, m10
556    pmaddwd          m2, m11
557    paddd            m1, m4
558    paddd            m3, m2
559    paddd            m0, m1
560    paddd            m0, m3
561    ; m0 = top 2 lines multiplied by cf
562
563    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
564    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
565    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
566    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
567    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
568    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
569    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
570    punpcklwd        m2, [base+pw_1]
571
572%if cpuflag(sse4)
573    pmaddwd          m1, m12
574%else
575    pmaddwd          m1, [tmp+48]
576%endif
577    pmaddwd          m3, [tmp+ 0]
578    pmaddwd          m4, [tmp+16]
579    pmaddwd          m2, [tmp+32]
580    paddd            m1, m3
581    paddd            m4, m2
582    paddd            m0, m1
583    paddd            m0, m4
584    ; m0 = top 3 lines multiplied by cf plus rounding for downshift
585
586    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
587.x_loop_ar3_inner:
588    pmaddwd          m2, m1, m13
589    pshufd           m3, m2, q1111
590    paddd            m2, m3                 ; left+cur
591    paddd            m2, m0                 ; add top
592    psrldq           m0, 4
593    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
594    packssdw         m2, m2
595    pminsw           m2, m15
596    pmaxsw           m2, m14
597    pslldq           m2, 4
598    psrldq           m1, 2
599%if cpuflag(sse4)
600    pblendw          m1, m2, 00000100b
601%else
602    pand             m1, m12
603    pandn            m3, m12, m2
604    por              m1, m3
605%endif
606    ; overwrite a couple of pixels, should be ok
607    movq  [bufq+xq*2-4], m1
608    inc              xq
609    jz .x_loop_ar3_end
610    test             xq, 3
611    jnz .x_loop_ar3_inner
612    jmp .x_loop_ar3
613
614.x_loop_ar3_end:
615    add            bufq, 82*2
616    dec              hd
617    jg .y_loop_ar3
618%if WIN64
619    mov             rsp, r6
620%elif ARCH_X86_32
621%undef m8
622%undef m9
623%undef m10
624%undef m11
625%undef m12
626%undef m13
627%undef m14
628%undef m15
629%endif
630    RET
631
632%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
633INIT_XMM ssse3
634%if ARCH_X86_64
635cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
636%define base r8-pb_mask
637    lea              r8, [pb_mask]
638    movifnidn    bdmaxd, bdmaxm
639    lea             r6d, [bdmaxq+1]
640%else
641cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
642%define base r2-$$
643    LEA              r2, $$
644    mov        fg_dataq, r2m
645    mov             r6d, r4m
646    inc             r6d
647%endif
648    movq             m1, [base+rnd_next_upperbit_mask]
649    movq             m4, [base+mul_bits]
650    movq             m7, [base+hmul_bits]
651    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
652    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
653    sub              r5, r6
654    SPLATW           m6, [base+round+r5*2-2]
655    mova             m5, [base+pb_mask]
656    SPLATW           m0, [fg_dataq+FGData.seed]
657%if ARCH_X86_64
658    SPLATW           m2, [base+pw_seed_xor+uvq*4]
659%else
660    mov             r5d, r3m
661    SPLATW           m2, [base+pw_seed_xor+r5*4]
662%endif
663    pxor             m0, m2
664%if ARCH_X86_64
665    lea              r6, [gaussian_sequence]
666%endif
667%if %2
668    mov              hd, 73-35*%3
669    add            bufq, 44*2
670.loop_y:
671    mov              xq, -44
672%else
673    mov              xq, -82*73
674    add            bufq, 82*73*2
675%endif
676.loop_x:
677    pand             m2, m0, m1
678    psrlw            m3, m2, 10
679    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
680    pmullw           m2, m4             ; bits 0x0f00 are set
681    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
682    psllq            m2, m3, 30
683    por              m2, m3
684    psllq            m3, m2, 15
685    por              m2, m3             ; aggregate each bit into next seed's high bit
686    pmulhuw          m3, m0, m7
687    por              m2, m3             ; 4 next output seeds
688    pshuflw          m0, m2, q3333
689    psrlw            m2, 5
690%if ARCH_X86_64
691    vpgatherdw       m3, m2, r6, r9, r10, 4, 2
692%else
693    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r6, 4, 2
694%endif
695    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
696                                        ; shifts by 0, which pmulhrsw does not support
697    pmulhrsw         m3, m6
698    movq    [bufq+xq*2], m3
699    add              xq, 4
700    jl .loop_x
701%if %2
702    add            bufq, 82*2
703    dec              hd
704    jg .loop_y
705%endif
706
707    ; auto-regression code
708    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
709    movsxd           r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
710    lea              r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
711    jmp              r5
712
713.ar0:
714%if ARCH_X86_64
715    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
716%else
717    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
718%assign stack_offset_old stack_offset
719    ALLOC_STACK  -16*2
720    mov           bufyq, r1m
721    mov             uvd, r3m
722%endif
723    imul            uvd, 28
724    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
725    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
726    SPLATW           m3, [base+hmul_bits+shiftq*2-10]
727%if ARCH_X86_64
728    sar          bdmaxd, 1
729    SPLATW           m1, bdmaxd                     ; max_gain
730%else
731    SPLATW           m1, r4m
732    psraw            m1, 1
733%endif
734    pcmpeqw          m7, m7
735    pxor             m7, m1                         ; min_grain
736%if ARCH_X86_64
737    SWAP              1, 14
738    DEFINE_ARGS buf, bufy, h, x
739%else
740%define m14 [rsp+0*16]
741    mova            m14, m1
742    DEFINE_ARGS buf, bufy, pic_reg, h, x
743%endif
744    pxor             m5, m5
745    pcmpgtb          m5, m4
746    punpcklbw        m4, m5
747%if %2
748    SPLATW           m6, [base+hmul_bits+2+%3*2]
749%endif
750    SPLATW           m4, m4
751    pxor             m5, m5
752%if %2
753%if !cpuflag(sse4)
754    pcmpeqw          m2, m2
755    pslldq           m2, 12
756%if ARCH_X86_64
757    SWAP              2, 12
758%else
759%define m12 [rsp+1*16]
760    mova            m12, m2
761%endif
762%endif
763%endif
764%if %2
765    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
766%else
767    sub            bufq, 2*(82*70-3)
768%endif
769    add           bufyq, 2*(3+82*3)
770    mov              hd, 70-35*%3
771.y_loop_ar0:
772    ; first 32 pixels
773    xor              xd, xd
774.x_loop_ar0:
775    movu             m0, [bufyq+xq*(2<<%2)]
776%if %2
777%if %3
778    movu             m2, [bufyq+xq*4+82*2]
779    paddw            m0, m2
780%endif
781    movu             m1, [bufyq+xq*4     +16]
782%if %3
783    movu             m2, [bufyq+xq*4+82*2+16]
784    paddw            m1, m2
785%endif
786    phaddw           m0, m1
787    pmulhrsw         m0, m6
788%endif
789    punpckhwd        m1, m0, m5
790    punpcklwd        m0, m5
791    REPX {pmaddwd x, m4}, m0, m1
792    REPX {psrad x, 5}, m0, m1
793    packssdw         m0, m1
794    pmulhrsw         m0, m3
795    movu             m1, [bufq+xq*2]
796    paddw            m0, m1
797    pminsw           m0, m14
798    pmaxsw           m0, m7
799    cmp              xd, 72-40*%2
800    je .end
801    movu    [bufq+xq*2], m0
802    add              xd, 8
803    jmp .x_loop_ar0
804
805    ; last 6/4 pixels
806.end:
807%if %2
808%if cpuflag(sse4)
809    pblendw          m0, m1, 11000000b
810%else
811    pand             m1, m12
812    pandn            m2, m12, m0
813    por              m0, m1, m2
814%endif
815    movu    [bufq+xq*2], m0
816%else
817    movq    [bufq+xq*2], m0
818%endif
819
820    add            bufq, 82*2
821    add           bufyq, 82*(2<<%3)
822    dec              hd
823    jg .y_loop_ar0
824%if ARCH_X86_32
825%undef m12
826%undef m14
827%endif
828    RET
829
830.ar1:
831%if ARCH_X86_64
832    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
833%else
834%assign stack_offset stack_offset_old
835%xdefine rstk rsp
836%assign stack_size_padded 0
837    DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
838    mov           bufyq, r1m
839    mov             uvd, r3m
840%endif
841    imul            uvd, 28
842    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
843    movq             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
844%if WIN64
845    DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
846%if %2
847    lea            bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
848%else
849    lea            bufq, [r0-2*(82*69+3)]
850%endif
851%else
852%if ARCH_X86_64
853    DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
854%else
855    DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
856%define hd dword r1m
857%define mind dword r3m
858%define maxd dword r4m
859%endif
860%if %2
861    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
862%else
863    sub            bufq, 2*(82*69+3)
864%endif
865%endif
866%if ARCH_X86_64
867    mov          shiftd, [r2+FGData.ar_coeff_shift]
868%else
869    mov          shiftd, [r3+FGData.ar_coeff_shift]
870%endif
871    pxor             m5, m5
872    pcmpgtb          m5, m4
873    punpcklbw        m4, m5                 ; cf0-4 in words
874    pshuflw          m4, m4, q2100
875    psrldq           m4, 2                  ; cf0-3,4 in words
876    pshufd           m5, m4, q1111
877    pshufd           m4, m4, q0000
878    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
879    pxor             m6, m6
880    punpcklwd        m3, m6
881%if %2
882    SPLATW           m6, [base+hmul_bits+2+%3*2]
883%endif
884    SPLATD           m3, m3
885    add           bufyq, 2*(79+82*3)
886    mov              hd, 70-35*%3
887    sar            maxd, 1
888%if ARCH_X86_64
889    mov            mind, maxd
890    xor            mind, -1
891%else
892    DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
893    mov              r2, maxd
894    xor              r2, -1
895    mov            mind, r2
896%endif
897.y_loop_ar1:
898    mov              xq, -(76>>%2)
899    movsx         val3d, word [bufq+xq*2-2]
900.x_loop_ar1:
901    movu             m0, [bufq+xq*2-82*2-2] ; top/left
902%if %2
903    movu             m7, [bufyq+xq*4]
904%if %3
905    movu             m1, [bufyq+xq*4+82*2]
906    phaddw           m7, m1
907%else
908    phaddw           m7, m7
909%endif
910%else
911    movq             m7, [bufyq+xq*2]
912%endif
913    psrldq           m2, m0, 2              ; top
914    psrldq           m1, m0, 4              ; top/right
915    punpcklwd        m0, m2
916%if %2
917%if %3
918    pshufd           m2, m7, q3232
919    paddw            m7, m2
920%endif
921    pmulhrsw         m7, m6
922%endif
923    punpcklwd        m1, m7
924    pmaddwd          m0, m4
925    pmaddwd          m1, m5
926    paddd            m0, m1
927    paddd            m0, m3
928.x_loop_ar1_inner:
929    movd          val0d, m0
930    psrldq           m0, 4
931    imul          val3d, cf3d
932    add           val3d, val0d
933    sar           val3d, shiftb
934    movsx         val0d, word [bufq+xq*2]
935    add           val3d, val0d
936    cmp           val3d, maxd
937    cmovg         val3d, maxd
938    cmp           val3d, mind
939    cmovl         val3d, mind
940    mov word [bufq+xq*2], val3w
941    ; keep val3d in-place as left for next x iteration
942    inc              xq
943    jz .x_loop_ar1_end
944    test             xq, 3
945    jnz .x_loop_ar1_inner
946    jmp .x_loop_ar1
947
948.x_loop_ar1_end:
949    add            bufq, 82*2
950    add           bufyq, 82*2<<%3
951    dec              hd
952    jg .y_loop_ar1
953%if ARCH_X86_32
954%undef maxd
955%undef mind
956%undef hd
957%endif
958    RET
959
960.ar2:
961%if ARCH_X86_64
962    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
963%else
964    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
965    ALLOC_STACK  -16*8
966    mov           bufyq, r1m
967    mov             uvd, r3m
968%endif
969    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
970    imul            uvd, 28
971%if ARCH_X86_64
972    sar          bdmaxd, 1
973    SPLATW           m5, bdmaxd                 ; max_grain
974%else
975    SPLATW           m5, r4m
976    psraw            m5, 1
977%endif
978    pcmpeqw          m6, m6
979%if !cpuflag(sse4)
980    pcmpeqw          m7, m7
981    psrldq           m7, 14
982    pslldq           m7, 2
983    pxor             m7, m6
984%endif
985    pxor             m6, m5                    ; min_grain
986%if %2 && cpuflag(sse4)
987    SPLATW           m7, [base+hmul_bits+2+%3*2]
988%endif
989
990%if ARCH_X86_64
991    SWAP              5, 13
992    SWAP              6, 14
993    SWAP              7, 15
994%else
995%define m13 [rsp+5*16]
996%define m14 [rsp+6*16]
997%define m15 [rsp+7*16]
998    mova            m13, m5
999    mova            m14, m6
1000    mova            m15, m7
1001%endif
1002
1003    ; coef values
1004    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
1005    pxor             m1, m1
1006    pcmpgtb          m1, m0
1007    punpckhbw        m2, m0, m1
1008    punpcklbw        m0, m1
1009    pinsrw           m2, [base+round_vals-12+shiftq*2], 5
1010
1011    pshufd           m6, m0, q0000
1012    pshufd           m7, m0, q1111
1013    pshufd           m1, m0, q3333
1014    pshufd           m0, m0, q2222
1015    pshufd           m3, m2, q1111
1016    pshufd           m4, m2, q2222
1017    pshufd           m2, m2, q0000
1018
1019%if ARCH_X86_64
1020    SWAP              0, 8
1021    SWAP              1, 9
1022    SWAP              2, 10
1023    SWAP              3, 11
1024    SWAP              4, 12
1025%else
1026%define m8 [rsp+0*16]
1027%define m9 [rsp+1*16]
1028%define m10 [rsp+2*16]
1029%define m11 [rsp+3*16]
1030%define m12 [rsp+4*16]
1031    mova             m8, m0
1032    mova             m9, m1
1033    mova            m10, m2
1034    mova            m11, m3
1035    mova            m12, m4
1036%endif
1037
1038%if ARCH_X86_64
1039    DEFINE_ARGS buf, bufy, fg_data, h, x
1040%else
1041    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
1042%endif
1043%if %2
1044    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
1045%else
1046    sub            bufq, 2*(82*69+3)
1047%endif
1048    add           bufyq, 2*(79+82*3)
1049    mov              hd, 70-35*%3
1050.y_loop_ar2:
1051    mov              xq, -(76>>%2)
1052
1053.x_loop_ar2:
1054    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
1055    movu             m5, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
1056    psrldq           m4, m0, 2                  ; y=-2,x=[-1,+5]
1057    psrldq           m1, m0, 4                  ; y=-2,x=[-0,+5]
1058    psrldq           m3, m0, 6                  ; y=-2,x=[+1,+5]
1059    psrldq           m2, m0, 8                  ; y=-2,x=[+2,+5]
1060    punpcklwd        m0, m4                     ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
1061    punpcklwd        m1, m3                     ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
1062    punpcklwd        m2, m5                     ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
1063    pmaddwd          m0, m6
1064    pmaddwd          m1, m7
1065    pmaddwd          m2, m8
1066    paddd            m0, m1
1067    paddd            m0, m2
1068    psrldq           m3, m5, 2                  ; y=-1,x=[-1,+5]
1069    psrldq           m1, m5, 4                  ; y=-1,x=[-0,+5]
1070    psrldq           m4, m5, 6                  ; y=-1,x=[+1,+5]
1071    psrldq           m2, m5, 8                  ; y=-1,x=[+2,+5]
1072    punpcklwd        m3, m1
1073    punpcklwd        m4, m2
1074    pmaddwd          m3, m9
1075    pmaddwd          m4, m10
1076    paddd            m3, m4
1077    paddd            m0, m3
1078
1079    ; luma component & rounding
1080%if %2
1081    movu             m1, [bufyq+xq*4]
1082%if %3
1083    movu             m2, [bufyq+xq*4+82*2]
1084    phaddw           m1, m2
1085    pshufd           m2, m1, q3232
1086    paddw            m1, m2
1087%else
1088    phaddw           m1, m1
1089%endif
1090%if cpuflag(sse4)
1091    pmulhrsw         m1, m15
1092%elif %3
1093    pmulhrsw         m1, [base+pw_8192]
1094%else
1095    pmulhrsw         m1, [base+pw_16384]
1096%endif
1097%else
1098    movq             m1, [bufyq+xq*2]
1099%endif
1100    punpcklwd        m1, [base+pw_1]
1101    pmaddwd          m1, m12
1102    paddd            m0, m1
1103
1104    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
1105    pshufd           m2, m1, q3321
1106    pxor             m3, m3
1107    pcmpgtw          m3, m2
1108    punpcklwd        m2, m3                 ; y=0,x=[0,3] in dword
1109.x_loop_ar2_inner:
1110    pmaddwd          m3, m1, m11
1111    paddd            m3, m0
1112    psrldq           m0, 4                  ; shift top to next pixel
1113    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
1114    ; we do not need to packssdw since we only care about one value
1115    paddd            m3, m2
1116    packssdw         m3, m3
1117    pminsw           m3, m13
1118    pmaxsw           m3, m14
1119    psrldq           m1, 2
1120    pslldq           m3, 2
1121    psrldq           m2, 4
1122%if cpuflag(sse4)
1123    pblendw          m1, m3, 00000010b
1124%else
1125    pand             m1, m15
1126    pandn            m4, m15, m3
1127    por              m1, m4
1128%endif
1129    ; overwrite previous pixel, should be ok
1130    movd  [bufq+xq*2-2], m1
1131    inc              xq
1132    jz .x_loop_ar2_end
1133    test             xq, 3
1134    jnz .x_loop_ar2_inner
1135    jmp .x_loop_ar2
1136
1137.x_loop_ar2_end:
1138    add            bufq, 82*2
1139    add           bufyq, 82*2<<%3
1140    dec              hd
1141    jg .y_loop_ar2
1142%if ARCH_X86_32
1143%undef m13
1144%undef m14
1145%undef m15
1146%endif
1147    RET
1148
1149.ar3:
1150%if ARCH_X86_64
1151    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
1152%if WIN64
1153    mov              r6, rsp
1154    and             rsp, ~15
1155    sub             rsp, 96
1156    %define         tmp  rsp
1157%else
1158    %define         tmp  rsp+stack_offset-120
1159%endif
1160%else
1161    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
1162%assign stack_offset stack_offset_old
1163    ALLOC_STACK  -16*14
1164    mov           bufyq, r1m
1165    mov             uvd, r3m
1166    %define         tmp  rsp
1167%endif
1168    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
1169    imul            uvd, 28
1170    SPLATW           m4, [base+round_vals-12+shiftq*2]
1171    pxor             m5, m5
1172    pcmpgtw          m5, m4
1173    punpcklwd        m4, m5
1174%if ARCH_X86_64
1175    sar          bdmaxd, 1
1176    SPLATW           m6, bdmaxd                 ; max_grain
1177%else
1178    SPLATW           m6, r4m
1179    psraw            m6, 1
1180%endif
1181    pcmpeqw          m7, m7
1182%if !cpuflag(sse4)
1183    pcmpeqw          m3, m3
1184    psrldq           m3, 14
1185    pslldq           m3, 4
1186    pxor             m3, m7
1187%endif
1188    pxor             m7, m6                     ; min_grain
1189%if %2 && cpuflag(sse4)
1190    SPLATW           m3, [base+hmul_bits+2+%3*2]
1191%endif
1192
1193%if ARCH_X86_64
1194    SWAP              3, 11
1195    SWAP              4, 12
1196    SWAP              6, 14
1197    SWAP              7, 15
1198%else
1199%define m11 [rsp+ 9*16]
1200%define m12 [rsp+10*16]
1201%define m14 [rsp+12*16]
1202%define m15 [rsp+13*16]
1203    mova            m11, m3
1204    mova            m12, m4
1205    mova            m14, m6
1206    mova            m15, m7
1207%endif
1208
1209    ; cf from y=-3,x=-3 until y=-3,x=-2
1210    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
1211    pxor             m1, m1
1212    pcmpgtb          m1, m0
1213    punpckhbw        m2, m0, m1
1214    punpcklbw        m0, m1
1215    pshufd           m1, m0, q0000
1216    pshufd           m3, m0, q1111
1217    pshufd           m4, m0, q2222
1218    pshufd           m0, m0, q3333
1219    pshufd           m5, m2, q0000
1220    pshufd           m6, m2, q1111
1221    mova     [tmp+16*0], m1
1222    mova     [tmp+16*1], m3
1223    mova     [tmp+16*2], m4
1224    mova     [tmp+16*3], m0
1225    mova     [tmp+16*4], m5
1226    mova     [tmp+16*5], m6
1227    pshufd           m6, m2, q2222
1228    pshufd           m7, m2, q3333
1229
1230    ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
1231    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
1232    pxor             m1, m1
1233    pcmpgtb          m1, m0
1234    punpckhbw        m2, m0, m1                 ; luma
1235    punpcklbw        m0, m1
1236    pshufd           m3, m0, q3232
1237    psrldq           m5, m0, 10
1238    ; y=0,x=[-3 to -1] + "1.0" for current pixel
1239    pinsrw           m5, [base+round_vals-10+shiftq*2], 3
1240    ; y=-1,x=[-1 to +2]
1241    pshufd           m1, m0, q0000
1242    pshufd           m0, m0, q1111
1243    ; y=-1,x=+3 + luma
1244    punpcklwd        m3, m2
1245    pshufd           m3, m3, q0000
1246
1247%if ARCH_X86_64
1248    SWAP              1, 8
1249    SWAP              0, 9
1250    SWAP              3, 10
1251    SWAP              5, 13
1252    DEFINE_ARGS buf, bufy, fg_data, h, x
1253%else
1254%define m8  [rsp+ 6*16]
1255%define m9  [rsp+ 7*16]
1256%define m10 [rsp+ 8*16]
1257%define m13 [rsp+11*16]
1258    mova             m8, m1
1259    mova             m9, m0
1260    mova            m10, m3
1261    mova            m13, m5
1262    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
1263%endif
1264%if %2
1265    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
1266%else
1267    sub            bufq, 2*(82*69+3)
1268%endif
1269    add           bufyq, 2*(79+82*3)
1270    mov              hd, 70-35*%3
1271.y_loop_ar3:
1272    mov              xq, -(76>>%2)
1273
1274.x_loop_ar3:
1275    ; first line
1276    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
1277    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
1278    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
1279    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
1280    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
1281    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
1282    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
1283
1284    pmaddwd          m0, [tmp+0*16]
1285    pmaddwd          m2, [tmp+1*16]
1286    pmaddwd          m3, [tmp+2*16]
1287    paddd            m0, m2
1288    paddd            m0, m3                         ; first 6 x of top y
1289
1290    ; second line [m0/1 are busy]
1291    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
1292    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
1293    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
1294    palignr          m4, m3, m2, 2                  ; y=-2,x=[-2,+5]
1295    palignr          m3, m3, m2, 4                  ; y=-2,x=[-2,+5]
1296    punpckhwd        m5, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
1297    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
1298    shufps           m3, m4, m5, q1032              ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
1299    pmaddwd          m1, [tmp+3*16]
1300    pmaddwd          m4, [tmp+4*16]
1301    pmaddwd          m3, [tmp+5*16]
1302    pmaddwd          m5, m6
1303    paddd            m1, m4
1304    paddd            m3, m5
1305    paddd            m0, m1
1306    paddd            m0, m3                         ; top 2 lines
1307
1308    ; third line [m0 is busy] & luma + round
1309    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
1310    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
1311%if %2
1312    movu             m5, [bufyq+xq*4]
1313%if %3
1314    movu             m4, [bufyq+xq*4+82*2]
1315    phaddw           m5, m4
1316%else
1317    phaddw           m5, m5
1318%endif
1319%else
1320    movq             m5, [bufyq+xq*2]
1321%endif
1322    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
1323    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
1324%if %3
1325    pshufd           m4, m5, q3232
1326    paddw            m5, m4
1327%endif
1328%if %2
1329%if cpuflag(sse4)
1330    pmulhrsw         m5, m11
1331%elif %3
1332    pmulhrsw         m5, [base+pw_8192]
1333%else
1334    pmulhrsw         m5, [base+pw_16384]
1335%endif
1336%endif
1337    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
1338    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
1339    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
1340    punpcklwd        m2, m5
1341    pmaddwd          m1, m7
1342    pmaddwd          m3, m8
1343    pmaddwd          m4, m9
1344    pmaddwd          m2, m10
1345    paddd            m1, m3
1346    paddd            m4, m2
1347    paddd            m0, m12                        ; += round
1348    paddd            m1, m4
1349    paddd            m0, m1
1350
1351    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
1352.x_loop_ar3_inner:
1353    pmaddwd          m2, m1, m13
1354    pshufd           m3, m2, q1111
1355    paddd            m2, m3                 ; left+cur
1356    paddd            m2, m0                 ; add top
1357    psrldq           m0, 4
1358    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1359    packssdw         m2, m2
1360    pminsw           m2, m14
1361    pmaxsw           m2, m15
1362    pslldq           m2, 4
1363    psrldq           m1, 2
1364%if cpuflag(sse4)
1365    pblendw          m1, m2, 00000100b
1366%else
1367    pand             m1, m11
1368    pandn            m3, m11, m2
1369    por              m1, m3
1370%endif
1371    ; overwrite previous pixels, should be ok
1372    movq  [bufq+xq*2-4], m1
1373    inc              xq
1374    jz .x_loop_ar3_end
1375    test             xq, 3
1376    jnz .x_loop_ar3_inner
1377    jmp .x_loop_ar3
1378
1379.x_loop_ar3_end:
1380    add            bufq, 82*2
1381    add           bufyq, 82*2<<%3
1382    dec              hd
1383    jg .y_loop_ar3
1384%if WIN64
1385    mov             rsp, r6
1386%elif ARCH_X86_32
1387%undef m8
1388%undef m9
1389%undef m10
1390%undef m11
1391%undef m12
1392%undef m13
1393%undef m14
1394%undef m15
1395%endif
1396    RET
1397%endmacro
1398
1399generate_grain_uv_fn 420, 1, 1
1400generate_grain_uv_fn 422, 1, 0
1401generate_grain_uv_fn 444, 0, 0
1402
1403%macro SCRATCH 3
1404%if ARCH_X86_32
1405    mova [rsp+%3*mmsize], m%1
1406%define m%2 [rsp+%3*mmsize]
1407%else
1408    SWAP             %1, %2
1409%endif
1410%endmacro
1411
1412INIT_XMM ssse3
1413%if ARCH_X86_32
1414%if STACK_ALIGNMENT < mmsize
1415cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
1416        dst, src, scaling, unused1, fg_data, picptr, unused2
1417    ; copy stack arguments to new position post-alignment, so that we
1418    ; don't have to keep the old stack location in a separate register
1419    mov              r0, r0m
1420    mov              r1, r2m
1421    mov              r2, r4m
1422    mov              r3, r6m
1423    mov              r4, r7m
1424    mov              r5, r8m
1425
1426%define r0m [rsp+8*mmsize+ 3*gprsize]
1427%define r2m [rsp+8*mmsize+ 5*gprsize]
1428%define r4m [rsp+8*mmsize+ 7*gprsize]
1429%define r6m [rsp+8*mmsize+ 9*gprsize]
1430%define r7m [rsp+8*mmsize+10*gprsize]
1431%define r8m [rsp+8*mmsize+11*gprsize]
1432
1433    mov             r0m, r0
1434    mov             r2m, r1
1435    mov             r4m, r2
1436    mov             r6m, r3
1437    mov             r7m, r4
1438    mov             r8m, r5
1439%else
1440cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
1441        dst, src, scaling, unused1, fg_data, picptr, unused2
1442%endif
1443    mov            srcq, srcm
1444    mov        scalingq, r5m
1445    mov        fg_dataq, r3m
1446%if STACK_ALIGNMENT < mmsize
1447    mov              r6, r9m
1448
1449%define r9m [rsp+8*mmsize+ 4*gprsize]
1450%define r3m [rsp+8*mmsize+ 6*gprsize]
1451%define r5m [rsp+8*mmsize+ 8*gprsize]
1452
1453    mov             r9m, r6
1454%endif
1455    LEA              r5, $$
1456%define base r5-$$
1457    mov             r5m, picptrq
1458%else
1459cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1460    lea              r8, [pb_mask]
1461%define base r8-pb_mask
1462%endif
1463    mov             r6d, [fg_dataq+FGData.scaling_shift]
1464    SPLATW           m3, [base+mul_bits+r6*2-14]
1465    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1466%if ARCH_X86_32
1467    DECLARE_REG_TMP   0, 3
1468%else
1469    DECLARE_REG_TMP   9, 10
1470%endif
1471    mov             t0d, r9m        ; bdmax
1472    sar             t0d, 11         ; is_12bpc
1473    inc             t0d
1474    mov             t1d, r6d
1475    imul            t1d, t0d
1476    dec             t0d
1477    SPLATW           m5, [base+min+t1*2]
1478    lea             t0d, [t0d*3]
1479    lea             t0d, [r6d*2+t0d]
1480    SPLATW           m4, [base+max+t0*2]
1481    SPLATW           m2, r9m
1482
1483    pcmpeqw          m1, m1
1484    psraw            m7, m2, 1              ; max_grain
1485    pxor             m1, m7                 ; min_grain
1486    SPLATD           m6, [base+pd_16]
1487
1488    SCRATCH           1,  9, 0
1489    SCRATCH           2, 10, 1
1490    SCRATCH           3, 11, 2
1491    SCRATCH           4, 12, 3
1492    SCRATCH           5, 13, 4
1493    SCRATCH           6, 14, 5
1494    SCRATCH           7, 15, 6
1495
1496    mova             m6, [base+pw_27_17_17_27]   ; for horizontal filter
1497
1498%if ARCH_X86_32
1499    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
1500    DECLARE_REG_TMP   0
1501%else
1502    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1503                sby, see
1504    DECLARE_REG_TMP   7
1505%endif
1506
1507    mov            sbyd, r8m
1508    movzx           t0d, byte [fg_dataq+FGData.overlap_flag]
1509    test            t0d, t0d
1510    jz .no_vertical_overlap
1511    test           sbyd, sbyd
1512    jnz .vertical_overlap
1513.no_vertical_overlap:
1514    mov       dword r8m, t0d
1515
1516%if ARCH_X86_32
1517    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1518    imul           seed, (173 << 24) | 37
1519%else
1520    imul           seed, sbyd, (173 << 24) | 37
1521%endif
1522    add            seed, (105 << 24) | 178
1523    rol            seed, 8
1524    movzx          seed, seew
1525    xor            seed, [fg_dataq+FGData.seed]
1526
1527%if ARCH_X86_32
1528    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1529
1530    mov             r3m, seed
1531    mov              wq, r4m
1532%else
1533    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1534                unused1, unused2, see, src_bak
1535%endif
1536
1537    lea        src_bakq, [srcq+wq*2]
1538    mov            r9mp, src_bakq
1539    neg              wq
1540    sub           dstmp, srcq
1541%if ARCH_X86_32
1542    mov             r4m, wq
1543%endif
1544
1545.loop_x:
1546%if ARCH_X86_32
1547    mov            seed, r3m
1548%endif
1549    mov             r6d, seed
1550    or             seed, 0xEFF4
1551    shr             r6d, 1
1552    test           seeb, seeh
1553    lea            seed, [r6+0x8000]
1554    cmovp          seed, r6d                ; updated seed
1555
1556%if ARCH_X86_32
1557    mov             r3m, seed
1558
1559    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1560
1561    mov           offxd, offyd
1562%else
1563    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1564                offx, offy, see, src_bak
1565
1566    mov           offyd, seed
1567    mov           offxd, seed
1568%endif
1569    ror           offyd, 8
1570    shr           offxd, 12
1571    and           offyd, 0xf
1572    imul          offyd, 164
1573    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1574
1575%if ARCH_X86_32
1576    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1577%else
1578    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1579                h, offxy, see, src_bak
1580%endif
1581
1582.loop_x_odd:
1583    movzx            hd, word r7m
1584    mov      grain_lutq, grain_lutmp
1585.loop_y:
1586    ; src
1587    pand             m0, m10, [srcq+ 0]
1588    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1589
1590    ; scaling[src]
1591%if ARCH_X86_32
1592    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m4
1593    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m4
1594%else
1595    vpgatherdw       m2, m0, scalingq-1, r11, r13, 8, 1, m4
1596    vpgatherdw       m3, m1, scalingq-1, r11, r13, 8, 1, m4
1597%endif
1598    REPX   {psrlw x, 8}, m2, m3
1599
1600    ; grain = grain_lut[offy+y][offx+x]
1601    movu             m4, [grain_lutq+offxyq*2]
1602    movu             m5, [grain_lutq+offxyq*2+16]
1603
1604    ; noise = round2(scaling[src] * grain, scaling_shift)
1605    REPX {pmullw x, m11}, m2, m3
1606    pmulhrsw         m4, m2
1607    pmulhrsw         m5, m3
1608
1609    ; dst = clip_pixel(src, noise)
1610    paddw            m0, m4
1611    paddw            m1, m5
1612    pmaxsw           m0, m13
1613    pmaxsw           m1, m13
1614    pminsw           m0, m12
1615    pminsw           m1, m12
1616    movifnidn      dstq, dstmp
1617    mova [dstq+srcq+ 0], m0
1618    mova [dstq+srcq+16], m1
1619
1620    add            srcq, r2mp               ; src += stride
1621    add      grain_lutq, 82*2
1622    dec              hd
1623    jg .loop_y
1624
1625%if ARCH_X86_32
1626    add            r4mp, 16
1627%else
1628    add              wq, 16
1629%endif
1630    jge .end
1631%if ARCH_X86_32
1632    mov            srcq, r9mp
1633    add            srcq, r4mp
1634    add            srcq, r4mp
1635%else
1636    mov        src_bakq, r9mp
1637    lea            srcq, [src_bakq+wq*2]
1638%endif
1639    btc       dword r8m, 2
1640    jc .next_blk
1641    add          offxyd, 16
1642    test      dword r8m, 2
1643    jz .loop_x_odd
1644%if ARCH_X86_32
1645    add dword [rsp+8*mmsize+1*gprsize], 16
1646%else
1647    add            r12d, 16                 ; top_offxy += 16
1648%endif
1649    jmp .loop_x_odd_v_overlap
1650
1651.next_blk:
1652    test      dword r8m, 1
1653    jz .loop_x
1654
1655    ; r8m = sbym
1656    test      dword r8m, 2
1657    jnz .loop_x_hv_overlap
1658
1659    ; horizontal overlap (without vertical overlap)
1660.loop_x_h_overlap:
1661%if ARCH_X86_32
1662    add          offxyd, 16
1663    mov [rsp+8*mmsize+0*gprsize], offxyd
1664    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1665    mov            seed, r3m
1666%endif
1667
1668    mov             r6d, seed
1669    or             seed, 0xEFF4
1670    shr             r6d, 1
1671    test           seeb, seeh
1672    lea            seed, [r6+0x8000]
1673    cmovp          seed, r6d                ; updated seed
1674
1675%if ARCH_X86_32
1676    mov             r3m, seed
1677
1678    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
1679
1680    mov           offxd, offyd
1681%else
1682    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1683                offx, offy, see, src_bak, left_offxy
1684
1685    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1686
1687    mov           offyd, seed
1688    mov           offxd, seed
1689%endif
1690    ror           offyd, 8
1691    shr           offxd, 12
1692    and           offyd, 0xf
1693    imul          offyd, 164
1694    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1695
1696%if ARCH_X86_32
1697    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1698%else
1699    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1700                h, offxy, see, src_bak, left_offxy
1701%endif
1702
1703    mov              hd, dword r7m
1704    mov      grain_lutq, grain_lutmp
1705.loop_y_h_overlap:
1706    ; grain = grain_lut[offy+y][offx+x]
1707    movu             m5, [grain_lutq+offxyq*2]
1708%if ARCH_X86_32
1709    mov              r5, [rsp+8*mmsize+0*gprsize]
1710    movd             m4, [grain_lutq+r5*2]
1711%else
1712    movd             m4, [grain_lutq+left_offxyq*2]
1713%endif
1714    punpcklwd        m4, m5
1715    pmaddwd          m4, m6
1716    paddd            m4, m14
1717    psrad            m4, 5
1718    packssdw         m4, m4
1719    pminsw           m4, m15
1720    pmaxsw           m4, m9
1721    shufps           m4, m5, q3210
1722
1723    ; src
1724    pand             m0, m10, [srcq+ 0]
1725    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1726
1727    ; scaling[src]
1728%if ARCH_X86_32
1729    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m5
1730    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m5
1731%else
1732    vpgatherdw       m2, m0, scalingq-1, r13, r14, 8, 1, m5
1733    vpgatherdw       m3, m1, scalingq-1, r13, r14, 8, 1, m5
1734%endif
1735    REPX   {psrlw x, 8}, m2, m3
1736
1737    ; noise = round2(scaling[src] * grain, scaling_shift)
1738    movu             m5, [grain_lutq+offxyq*2+16]
1739    REPX {pmullw x, m11}, m2, m3
1740    pmulhrsw         m4, m2
1741    pmulhrsw         m5, m3
1742
1743    ; dst = clip_pixel(src, noise)
1744    paddw            m0, m4
1745    paddw            m1, m5
1746    pmaxsw           m0, m13
1747    pmaxsw           m1, m13
1748    pminsw           m0, m12
1749    pminsw           m1, m12
1750    movifnidn      dstq, dstmp
1751    mova [dstq+srcq+ 0], m0
1752    mova [dstq+srcq+16], m1
1753
1754    add            srcq, r2mp
1755    add      grain_lutq, 82*2
1756    dec              hd
1757    jg .loop_y_h_overlap
1758
1759%if ARCH_X86_32
1760    add            r4mp, 16
1761%else
1762    add              wq, 16
1763%endif
1764    jge .end
1765%if ARCH_X86_32
1766    mov            srcq, r9mp
1767    add            srcq, r4mp
1768    add            srcq, r4mp
1769%else
1770    mov        src_bakq, r9mp
1771    lea            srcq, [src_bakq+wq*2]
1772%endif
1773    or        dword r8m, 4
1774    add          offxyd, 16
1775
1776    ; r8m = sbym
1777    test      dword r8m, 2
1778    jz .loop_x_odd
1779%if ARCH_X86_32
1780    add dword [rsp+8*mmsize+1*gprsize], 16
1781%else
1782    add            r12d, 16                 ; top_offxy += 16
1783%endif
1784    jmp .loop_x_odd_v_overlap
1785
1786.end:
1787    RET
1788
1789.vertical_overlap:
1790    or              t0d, 2
1791    mov             r8m, t0d
1792
1793%if ARCH_X86_32
1794    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
1795%else
1796    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
1797                sby, see
1798%endif
1799
1800    movzx          sbyd, sbyb
1801%if ARCH_X86_32
1802    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1803    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
1804%else
1805    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1806%endif
1807    imul            t0d, sbyd, 173 * 0x00010001
1808    imul           sbyd, 37 * 0x01000100
1809    add             t0d, (105 << 16) | 188
1810    add            sbyd, (178 << 24) | (141 << 8)
1811    and             t0d, 0x00ff00ff
1812    and            sbyd, 0xff00ff00
1813    xor            seed, t0d
1814%if ARCH_X86_32
1815    xor            sbyd, seed
1816
1817    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1818
1819    mov             r3m, seed
1820    mov              wq, r4m
1821%else
1822    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1823
1824    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1825                unused1, unused2, see, src_bak
1826%endif
1827
1828    lea        src_bakq, [srcq+wq*2]
1829    mov            r9mp, src_bakq
1830    neg              wq
1831    sub           dstmp, srcq
1832%if ARCH_X86_32
1833    mov             r4m, wq
1834%endif
1835
1836.loop_x_v_overlap:
1837%if ARCH_X86_32
1838    mov              r5, r5m
1839    SPLATD           m7, [base+pw_27_17_17_27]
1840    mov            seed, r3m
1841%else
1842    SPLATD           m7, [pw_27_17_17_27]
1843%endif
1844
    ; we assume from the block above that bits 8-15 of r7d are zeroed
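    ; both 16-bit seeds are stepped at once: the two setp's capture each half's
    ; feedback parity, the or/xor plants the feedback bits at bits 0 and 16, and
    ; ror by 1 shifts both halves while rotating each feedback into its seed's MSB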
1846    mov             r6d, seed
1847    or             seed, 0xeff4eff4
1848    test           seeb, seeh
1849    setp            t0b                     ; parity of top_seed
1850    shr            seed, 16
1851    shl             t0d, 16
1852    test           seeb, seeh
1853    setp            t0b                     ; parity of cur_seed
1854    or              r6d, 0x00010001
1855    xor             t0d, r6d
1856    mov            seed, t0d
1857    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1858
1859%if ARCH_X86_32
1860    mov             r3m, seed
1861
1862    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1863
1864    mov           offxd, offyd
1865%else
1866    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1867                offx, offy, see, src_bak, unused, top_offxy
1868
1869    mov           offyd, seed
1870    mov           offxd, seed
1871%endif
1872    ror           offyd, 8
1873    ror           offxd, 12
1874    and           offyd, 0xf000f
1875    and           offxd, 0xf000f
1876    imul          offyd, 164
1877    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1878    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
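    ; 0x10001*747 applies the (9,9) base offset to both packed halves; the extra
    ; 32*82 only hits the low half, moving top_offxy down 32 rows to the grain
    ; rows actually used by the block above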
1879
1880%if ARCH_X86_32
1881    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1882%else
1883    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
1884                h, offxy, see, src_bak, unused, top_offxy
1885%endif
1886
1887    movzx    top_offxyd, offxyw
1888%if ARCH_X86_32
1889    mov [rsp+8*mmsize+1*gprsize], top_offxyd
1890
1891    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1892%endif
1893    shr          offxyd, 16
1894
1895.loop_x_odd_v_overlap:
1896%if ARCH_X86_32
1897    mov              r5, r5m
1898%endif
1899    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
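    ; m7 = broadcast {27, 17}: first overlap row is 27*top + 17*cur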
1900    mov              hd, dword r7m
1901    mov      grain_lutq, grain_lutmp
1902.loop_y_v_overlap:
1903    ; grain = grain_lut[offy+y][offx+x]
1904    movu             m3, [grain_lutq+offxyq*2]
1905%if ARCH_X86_32
1906    mov              r5, [rsp+8*mmsize+1*gprsize]
1907    movu             m2, [grain_lutq+r5*2]
1908%else
1909    movu             m2, [grain_lutq+top_offxyq*2]
1910%endif
1911    punpckhwd        m4, m2, m3
1912    punpcklwd        m2, m3
1913    REPX {pmaddwd x, m7}, m4, m2
1914    REPX {paddd   x, m14}, m4, m2
1915    REPX {psrad   x, 5}, m4, m2
1916    packssdw         m2, m4
1917    pminsw           m2, m15
1918    pmaxsw           m2, m9
1919    movu             m4, [grain_lutq+offxyq*2+16]
1920%if ARCH_X86_32
1921    movu             m3, [grain_lutq+r5*2+16]
1922%else
1923    movu             m3, [grain_lutq+top_offxyq*2+16]
1924%endif
1925    punpckhwd        m5, m3, m4
1926    punpcklwd        m3, m4
1927    REPX {pmaddwd x, m7}, m5, m3
1928    REPX {paddd   x, m14}, m5, m3
1929    REPX {psrad   x, 5}, m5, m3
1930    packssdw         m3, m5
1931    pminsw           m3, m15
1932    pmaxsw           m3, m9
1933
1934    ; src
    pand             m0, m10, [srcq+ 0]
    pand             m1, m10, [srcq+16]          ; m0-1: src as word
1937
1938    ; scaling[src]
1939    ; noise = round2(scaling[src] * grain, scaling_shift)
1940%if ARCH_X86_32
1941    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
1942%else
1943    vpgatherdw       m4, m0, scalingq-1, r11, r13, 8, 1, m5
1944%endif
1945    psrlw            m4, 8
1946    pmullw           m4, m11
1947    pmulhrsw         m4, m2
1948%if ARCH_X86_32
1949    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m2
1950%else
1951    vpgatherdw       m5, m1, scalingq-1, r11, r13, 8, 1, m2
1952%endif
1953    psrlw            m5, 8
1954    pmullw           m5, m11
1955    pmulhrsw         m5, m3
1956
    ; dst = clip_pixel(src + noise)
1958    paddw            m0, m4
1959    paddw            m1, m5
1960    pmaxsw           m0, m13
1961    pmaxsw           m1, m13
1962    pminsw           m0, m12
1963    pminsw           m1, m12
1964    movifnidn      dstq, dstmp
1965    mova [dstq+srcq+ 0], m0
1966    mova [dstq+srcq+16], m1
1967
1968    add            srcq, r2mp
1969    add      grain_lutq, 82*2
1970    dec              hw
1971    jz .end_y_v_overlap
1972    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1973    ; remaining (up to) 30 lines
1974%if ARCH_X86_32
1975    mov              r5, r5m
1976%endif
1977    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
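    ; second (and last) overlap row uses the swapped weights {17, 27}; bit 16 of
    ; hd acts as a 1-bit counter so exactly two rows take this path
    ; (dec hw rather than hd above keeps that bit intact)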
1978    xor              hd, 0x10000
1979    test             hd, 0x10000
1980    jnz .loop_y_v_overlap
1981    jmp .loop_y
1982
1983.end_y_v_overlap:
1984%if ARCH_X86_32
1985    add            r4mp, 16
1986%else
1987    add              wq, 16
1988%endif
1989    jge .end_hv
1990%if ARCH_X86_32
1991    mov            srcq, r9mp
1992    add            srcq, r4mp
1993    add            srcq, r4mp
1994%else
1995    mov        src_bakq, r9mp
1996    lea            srcq, [src_bakq+wq*2]
1997%endif
1998    btc       dword r8m, 2
1999    jc .next_blk_v
2000%if ARCH_X86_32
2001    add dword [rsp+8*mmsize+1*gprsize], 16
2002%else
2003    add      top_offxyd, 16
2004%endif
2005    add          offxyd, 16
2006    jmp .loop_x_odd_v_overlap
2007
2008.next_blk_v:
    ; since fg_dataq.overlap_flag is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall through to
2011    ; h+v overlap
2012
2013.loop_x_hv_overlap:
2014%if ARCH_X86_32
2015    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
2016
2017    mov              r0, [rsp+8*mmsize+1*gprsize]
2018    add              r3, 16
2019    add              r0, 16
2020    mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
2021    mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
2022
2023    mov            seed, r3m
2024    xor              r0, r0
2025%else
    ; we assume from the block above that bits 8-15 of r7d are zeroed
2027%endif
2028    mov             r6d, seed
2029    or             seed, 0xeff4eff4
2030    test           seeb, seeh
2031    setp            t0b                     ; parity of top_seed
2032    shr            seed, 16
2033    shl             t0d, 16
2034    test           seeb, seeh
2035    setp            t0b                     ; parity of cur_seed
2036    or              r6d, 0x00010001
2037    xor             t0d, r6d
2038    mov            seed, t0d
2039    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2040
2041%if ARCH_X86_32
2042    mov             r3m, seed
2043
    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2045
2046    mov           offxd, offyd
2047%else
2048    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2049                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
2050
2051    lea  topleft_offxyq, [top_offxyq+16]
2052    lea     left_offxyq, [offyq+16]
2053    mov           offyd, seed
2054    mov           offxd, seed
2055%endif
2056    ror           offyd, 8
2057    ror           offxd, 12
2058    and           offyd, 0xf000f
2059    and           offxd, 0xf000f
2060    imul          offyd, 164
2061    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2062    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
2063
2064%if ARCH_X86_32
2065    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
2066%else
2067    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2068                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
2069%endif
2070
2071    movzx    top_offxyd, offxyw
2072%if ARCH_X86_32
2073    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2074
2075    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2076%endif
2077    shr          offxyd, 16
2078
2079%if ARCH_X86_32
2080    mov              r5, r5m
2081%endif
2082    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
2083
2084    movzx            hd, word r7m
2085    mov      grain_lutq, grain_lutmp
2086.loop_y_hv_overlap:
2087    ; grain = grain_lut[offy+y][offx+x]
2088    movu             m2, [grain_lutq+offxyq*2]
2089%if ARCH_X86_32
2090    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
2091    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
2092    movu             m4, [grain_lutq+r0*2]
2093    movd             m5, [grain_lutq+r5*2]
2094    mov              r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
2095    movd             m3, [grain_lutq+r5*2]
2096%else
2097    movu             m4, [grain_lutq+top_offxyq*2]
2098    movd             m5, [grain_lutq+left_offxyq*2]
2099    movd             m3, [grain_lutq+topleft_offxyq*2]
2100%endif
2101    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
2102    punpcklwd        m5, m2
2103    punpcklwd        m3, m4
2104    REPX {pmaddwd x, m6}, m5, m3
2105    REPX {paddd   x, m14}, m5, m3
2106    REPX {psrad   x, 5}, m5, m3
2107    packssdw         m5, m3
2108    pminsw           m5, m15
2109    pmaxsw           m5, m9
2110    shufps           m3, m5, m2, q3210
2111    shufps           m5, m4, q3232
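    ; m3 = current row with cols 0-1 left-blended,
    ; m5 = top row with cols 0-1 topleft-blended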
2112    ; followed by v interpolation (top | cur -> cur)
2113    movu             m0, [grain_lutq+offxyq*2+16]
2114%if ARCH_X86_32
2115    movu             m1, [grain_lutq+r0*2+16]
2116%else
2117    movu             m1, [grain_lutq+top_offxyq*2+16]
2118%endif
2119    punpcklwd        m2, m5, m3
2120    punpckhwd        m5, m3
2121    punpcklwd        m3, m1, m0
2122    punpckhwd        m1, m0
2123    REPX {pmaddwd x, m7}, m2, m5, m3, m1
2124    REPX {paddd   x, m14}, m2, m5, m3, m1
2125    REPX {psrad   x, 5}, m2, m5, m3, m1
2126    packssdw         m2, m5
2127    packssdw         m3, m1
2128    REPX {pminsw x, m15}, m2, m3
2129    REPX {pmaxsw x, m9}, m2, m3
2130
2131    ; src
2132    pand             m0, m10, [srcq+ 0]
2133    pand             m1, m10, [srcq+16]          ; m0-1: src as word
2134
2135    ; scaling[src]
2136    ; noise = round2(scaling[src] * grain, scaling_shift)
2137%if ARCH_X86_32
2138    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
2139%else
2140    vpgatherdw       m4, m0, scalingq-1, r14, r10, 8, 1, m5
2141%endif
2142    psrlw            m4, 8
2143    pmullw           m4, m11
2144    pmulhrsw         m2, m4
2145%if ARCH_X86_32
2146    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m4
2147%else
2148    vpgatherdw       m5, m1, scalingq-1, r14, r10, 8, 1, m4
2149%endif
2150    psrlw            m5, 8
2151    pmullw           m5, m11
2152    pmulhrsw         m3, m5
2153
    ; dst = clip_pixel(src + noise)
2155    paddw            m0, m2
2156    paddw            m1, m3
2157    pmaxsw           m0, m13
2158    pmaxsw           m1, m13
2159    pminsw           m0, m12
2160    pminsw           m1, m12
2161    movifnidn      dstq, dstmp
2162    mova [dstq+srcq+ 0], m0
2163    mova [dstq+srcq+16], m1
2164
2165    add            srcq, r2mp
2166    add      grain_lutq, 82*2
2167    dec              hw
2168    jz .end_y_hv_overlap
2169    ; 2 lines get vertical overlap, then fall back to non-overlap code for
2170    ; remaining (up to) 30 lines
2171%if ARCH_X86_32
2172    mov              r5, r5m
2173%endif
2174    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
2175    xor              hd, 0x10000
2176    test             hd, 0x10000
2177    jnz .loop_y_hv_overlap
2178    jmp .loop_y_h_overlap
2179
2180.end_y_hv_overlap:
2181    or        dword r8m, 4
2182%if ARCH_X86_32
2183    add            r4mp, 16
2184%else
2185    add              wq, 16
2186%endif
2187    jge .end_hv
2188%if ARCH_X86_32
2189    mov              r5, r5m
2190    add          offxyd, 16
2191    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
2192    mov            srcq, r9mp
2193    add            srcq, r4mp
2194    add            srcq, r4mp
2195%else
2196    add          offxyd, 16
2197    add      top_offxyd, 16
2198    mov        src_bakq, r9mp
2199    lea            srcq, [src_bakq+wq*2]
2200%endif
2201    jmp .loop_x_odd_v_overlap
2202
2203.end_hv:
2204    RET
2205%if ARCH_X86_32
2206    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
2207%endif
2208
2209%macro FGUV_FN 3 ; name, ss_hor, ss_ver
2210INIT_XMM ssse3
2211%if ARCH_X86_32
2212%if STACK_ALIGNMENT < mmsize
2213cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
2214        tmp, src, scaling, h, fg_data, picptr, unused
2215    mov              r0, r0m
2216    mov              r1, r1m
2217    mov              r2, r2m
2218    mov              r4, r3m
2219    mov              r3, r4m
2220    mov              r5, r5m
2221%define r0m [rsp+8*mmsize+ 3*gprsize]
2222%define r1m [rsp+8*mmsize+ 4*gprsize]
2223%define r2m [rsp+8*mmsize+ 5*gprsize]
2224%define r3m [rsp+8*mmsize+ 6*gprsize]
2225%define r4m [rsp+8*mmsize+ 7*gprsize]
2226%define r5m [rsp+8*mmsize+ 8*gprsize]
2227    mov             r0m, r0
2228    mov             r2m, r2
2229    mov             r4m, r3
2230    mov             r5m, r5
2231
2232    mov              r0, r6m
2233    mov              r2, r7m
2234    mov              r3, r8m
2235    mov              r5, r9m
2236%define r6m [rsp+8*mmsize+ 9*gprsize]
2237%define r7m [rsp+8*mmsize+10*gprsize]
2238%define r8m [rsp+8*mmsize+11*gprsize]
2239%define r9m [rsp+8*mmsize+12*gprsize]
2240    mov             r6m, r0
2241    mov             r7m, r2
2242    mov             r8m, r3
2243    mov             r9m, r5
2244
2245    mov              r2, r10m
2246    mov              r3, r11m
2247    mov              r5, r12m
2248    mov              r0, r13m
2249%define r10m [rsp+8*mmsize+13*gprsize]
2250%define r11m [rsp+8*mmsize+14*gprsize]
2251%define r12m [rsp+8*mmsize+15*gprsize]
2252    mov            r10m, r2
2253    mov            r11m, r3
2254    mov            r12m, r5
2255
2256    SPLATW           m2, r13m
2257%else
2258cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
2259        tmp, src, scaling, h, fg_data, picptr, unused
2260    mov            srcq, srcm
2261    mov        fg_dataq, r3m
2262%endif
2263    LEA              r5, $$
2264%define base r5-$$
2265
2266    DECLARE_REG_TMP   0, 2, 3
2267%else
2268cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2269                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
2270%define base r8-pb_mask
2271    lea              r8, [pb_mask]
2272
2273    DECLARE_REG_TMP   9, 10, 11
2274%endif
2275    mov             r6d, [fg_dataq+FGData.scaling_shift]
2276    SPLATW           m3, [base+mul_bits+r6*2-14]
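    ; m3 (kept in m11) = 1 << (15 - scaling_shift), so that pmulhrsw on the
    ; pre-multiplied scaling value yields round2(scaling * grain, scaling_shift)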
2277    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
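    ; pixel clip range: bdmax >> 11 = is_12bpc; min[] is indexed by
    ; clip_to_restricted_range*(is_12bpc+1), max[] by 3*is_12bpc +
    ; clip_to_restricted_range*(is_id+1) (is_id selects the luma max for chroma)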
2278%if STACK_ALIGNMENT >= mmsize
2279    mov             t0d, r13m               ; bdmax
2280%endif
2281    sar             t0d, 11                 ; is_12bpc
2282    inc             t0d
2283    mov             t1d, r6d
2284    imul            t1d, t0d
2285    dec             t0d
2286    SPLATW           m5, [base+min+t1*2]
2287    lea             t1d, [t0d*3]
2288    mov             t2d, r12m
2289    inc             t2d
2290    imul            r6d, t2d
2291    add             t1d, r6d
2292    SPLATW           m4, [base+max+t1*2]
2293%if STACK_ALIGNMENT >= mmsize
2294    SPLATW           m2, r13m
2295%endif
2296
2297    SCRATCH           2, 10, 2
2298    SCRATCH           3, 11, 3
2299    SCRATCH           4, 12, 4
2300    SCRATCH           5, 13, 5
2301
2302%define mzero m7
2303
2304%if %3
2305    SPLATD           m2, [base+pw_23_22]
2306%endif
2307
2308%if ARCH_X86_32
2309    mov        scalingq, r5m
2310    mov             r5m, r5
2311%else
2312    mov           r13mp, strideq
2313%endif
2314
2315    pcmpeqw          m0, m0
2316    psraw            m1, m10, 1
2317    pxor             m0, m1
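    ; grain is signed and spans half the pixel range:
    ; m1 = bdmax >> 1 = grain_max, m0 = ~grain_max = grain_min (kept in m9/m8)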
2318
2319    SCRATCH           0,  8, 0
2320    SCRATCH           1,  9, 1
2321
2322    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2323    jne .csfl
2324
2325%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
2326%if ARCH_X86_32
2327    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2328
2329    DECLARE_REG_TMP    0
2330%else
2331    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2332
2333    DECLARE_REG_TMP    9
2334%endif
2335
2336%if %1
2337    mov             r6d, r11m
2338    SPLATW           m0, [fg_dataq+FGData.uv_mult+r6*4]
2339    SPLATW           m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2340    punpcklwd        m6, m1, m0
2341    SPLATW           m5, [fg_dataq+FGData.uv_offset+r6*4]
2342    SPLATD           m7, [base+pw_4+t0*4]
2343    pmullw           m5, m7
2344%else
2345    SPLATD           m6, [base+pd_16]
2346%if %2
2347    mova             m5, [base+pw_23_22]
2348%else
2349    mova             m5, [base+pw_27_17_17_27]
2350%endif
2351%endif
2352
2353    SCRATCH           6, 14, 6
2354    SCRATCH           5, 15, 7
2355
2356%if ARCH_X86_32
2357    DECLARE_REG_TMP   0
2358%else
2359    DECLARE_REG_TMP   7
2360%endif
2361
2362    mov            sbyd, r8m
2363    mov             t0d, [fg_dataq+FGData.overlap_flag]
2364    test            t0d, t0d
2365    jz %%no_vertical_overlap
2366    test           sbyd, sbyd
2367    jnz %%vertical_overlap
2368
2369%%no_vertical_overlap:
2370    mov             r8m, t0d
2371%if ARCH_X86_32
2372    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2373    imul           seed, (173 << 24) | 37
2374%else
2375    imul           seed, sbyd, (173 << 24) | 37
2376%endif
2377    add            seed, (105 << 24) | 178
2378    rol            seed, 8
2379    movzx          seed, seew
2380    xor            seed, [fg_dataq+FGData.seed]
2381%if ARCH_X86_32
2382    mov             r3m, seed
2383
2384    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
2385
2386    mov            dstq, r0mp
2387    mov           lumaq, r9mp
2388    mov              wq, r4m
2389    lea              r3, [srcq+wq*2]
2390    mov            r1mp, r3
2391    lea              r3, [dstq+wq*2]
2392    mov           r11mp, r3
2393    lea              r3, [lumaq+wq*(2<<%2)]
2394    mov           r12mp, r3
2395%if %3
2396    shl           r10mp, 1
2397%endif
2398%else
2399    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2400                unused2, unused3, see, unused4, unused5, unused6, luma, lstride
2401
2402    mov        lstrideq, r10mp
2403%if %3
2404    add        lstrideq, lstrideq
2405%endif
2406    mov           lumaq, r9mp
2407    lea             r10, [srcq+wq*2]
2408    lea             r11, [dstq+wq*2]
2409    lea             r12, [lumaq+wq*(2<<%2)]
2410    mov           r10mp, r10
2411    mov           r11mp, r11
2412    mov           r12mp, r12
2413%endif
2414    neg              wq
2415%if ARCH_X86_32
2416    mov           r4mp, wq
2417%endif
2418
2419%%loop_x:
2420%if ARCH_X86_32
2421    mov            seed, r3m
2422%endif
2423
2424    mov             r6d, seed
2425    or             seed, 0xEFF4
2426    shr             r6d, 1
2427    test           seeb, seeh
2428    lea            seed, [r6+0x8000]
2429    cmovp          seed, r6d               ; updated seed
2430
2431%if ARCH_X86_32
2432    mov             r3m, seed
2433
2434    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2435
2436    mov           offxd, offyd
2437%else
2438    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2439                offx, offy, see, unused1, unused2, unused3, luma, lstride
2440
2441    mov           offxd, seed
2442    mov           offyd, seed
2443%endif
2444    ror           offyd, 8
2445    shr           offxd, 12
2446    and           offyd, 0xf
2447    imul          offyd, 164>>%3
2448    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2449
2450%if ARCH_X86_32
2451    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2452%else
2453    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2454                h, offxy, see, unused1, unused2, unused3, luma, lstride
2455%endif
2456
2457%if %2 == 0
2458%%loop_x_odd:
2459%endif
2460    mov              hd, r7m
2461    mov      grain_lutq, grain_lutmp
2462%%loop_y:
2463    ; src
2464    mova             m0, [srcq]
2465    mova             m1, [srcq+16]          ; m0-1: src as word
2466
2467    ; luma_src
2468    pxor          mzero, mzero
2469%if ARCH_X86_32
2470    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2471
2472    mov           lumaq, r9m
2473%endif
2474    mova             m4, [lumaq+ 0]
2475    mova             m6, [lumaq+(16<<%2)]
2476%if %2
2477    phaddw           m4, [lumaq+16]
2478    phaddw           m6, [lumaq+48]
2479%endif
2480%if ARCH_X86_32
2481    add           lumaq, r10mp
2482    mov             r9m, lumaq
2483%endif
2484%if %2
2485    pavgw            m4, mzero
2486    pavgw            m6, mzero
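    ; phaddw + pavgw with 0 = (l0 + l1 + 1) >> 1: downscale luma horizontally to
    ; the chroma grid (vertical subsampling is handled by doubling lstride)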
2487%endif
2488
2489%if %1
2490    punpckhwd        m3, m4, m0
2491    punpcklwd        m4, m0
2492    punpckhwd        m5, m6, m1
2493    punpcklwd        m6, m1                 ; { luma, chroma }
2494    REPX {pmaddwd x, m14}, m3, m4, m5, m6
2495    REPX {psrad   x, 6}, m3, m4, m5, m6
2496    packssdw         m4, m3
2497    packssdw         m6, m5
2498    REPX {paddw x, m15}, m4, m6
2499    REPX {pmaxsw x, mzero}, m4, m6
2500    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
2501%else
2502    REPX  {pand x, m10}, m4, m6
2503%endif
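    ; %1 (not csfl): scaling[] is indexed with
    ; clip(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + scaled uv_offset (m15), 0, bdmax);
    ; with csfl the bdmax-masked luma value indexes it directly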
2504
2505    ; scaling[luma_src]
2506%if ARCH_X86_32
2507    vpgatherdw       m3, m4, scalingq-1, r0, r5, 8, 1
2508    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
2509%else
2510    vpgatherdw       m3, m4, scalingq-1, r10, r12, 8, 1
2511    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
2512%endif
2513    REPX   {psrlw x, 8}, m3, m5
2514
2515    ; grain = grain_lut[offy+y][offx+x]
2516    movu             m4, [grain_lutq+offxyq*2]
2517    movu             m6, [grain_lutq+offxyq*2+16]
2518
2519    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2520    REPX {pmullw x, m11}, m3, m5
2521    pmulhrsw         m4, m3
2522    pmulhrsw         m6, m5
2523
    ; dst = clip_pixel(src, noise)
2525    paddw            m0, m4
2526    paddw            m1, m6
2527    pmaxsw           m0, m13
2528    pmaxsw           m1, m13
2529    pminsw           m0, m12
2530    pminsw           m1, m12
2531    movifnidn      dstq, dstmp
2532    mova      [dstq+ 0], m0
2533    mova      [dstq+16], m1
2534
2535%if ARCH_X86_32
2536    add            srcq, r2mp
2537    add            dstq, r2mp
2538    mov           dstmp, dstq
2539%else
2540    add            srcq, r13mp
2541    add            dstq, r13mp
2542    add           lumaq, lstrideq
2543%endif
2544    add      grain_lutq, 82*2
2545    dec              hd
2546    jg %%loop_y
2547
2548%if ARCH_X86_32
2549    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
2550
2551    mov              wq, r4mp
2552%endif
2553    add              wq, 16
2554    jge %%end
2555%if ARCH_X86_32
2556    mov            srcq, r1mp
2557%else
2558    mov            srcq, r10mp
2559%endif
2560    mov            dstq, r11mp
2561    mov           lumaq, r12mp
2562    lea            srcq, [srcq+wq*2]
2563    lea            dstq, [dstq+wq*2]
2564    lea           lumaq, [lumaq+wq*(2<<%2)]
2565%if ARCH_X86_32
2566    mov             r0m, dstq
2567    mov             r9m, lumaq
2568    mov             r4m, wq
2569%endif
2570%if %2 == 0
2571    btc       dword r8m, 2
2572    jc %%next_blk
2573    add          offxyd, 16
2574    test      dword r8m, 2
2575    jz %%loop_x_odd
2576%if ARCH_X86_32
2577    add dword [rsp+8*mmsize+1*gprsize], 16
2578%else
2579    add            r11d, 16
2580%endif
2581    jmp %%loop_x_odd_v_overlap
2582%%next_blk:
2583%endif
2584    test      dword r8m, 1
2585    je %%loop_x
2586
2587    ; r8m = sbym
2588    test      dword r8m, 2
2589    jnz %%loop_x_hv_overlap
2590
2591    ; horizontal overlap (without vertical overlap)
2592%%loop_x_h_overlap:
2593%if ARCH_X86_32
2594    add          offxyd, 16
2595    mov [rsp+8*mmsize+0*gprsize], offxyd
2596
2597    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
2598
2599    mov            seed, r3m
2600%endif
2601    mov             r6d, seed
2602    or             seed, 0xEFF4
2603    shr             r6d, 1
2604    test           seeb, seeh
2605    lea            seed, [r6+0x8000]
2606    cmovp          seed, r6d               ; updated seed
2607
2608%if ARCH_X86_32
2609    mov             r3m, seed
2610
2611    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2612
2613    mov           offxd, offyd
2614%else
2615    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2616                offx, offy, see, left_offxy, unused1, unused2, luma, lstride
2617
2618    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2619    mov           offxd, seed
2620    mov           offyd, seed
2621%endif
2622    ror           offyd, 8
2623    shr           offxd, 12
2624    and           offyd, 0xf
2625    imul          offyd, 164>>%3
2626    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2627
2628%if ARCH_X86_32
2629    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2630%else
2631    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2632                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
2633%endif
2634
2635    mov              hd, r7m
2636    mov      grain_lutq, grain_lutmp
2637%%loop_y_h_overlap:
2638    mova             m0, [srcq]
2639    mova             m1, [srcq+16]
2640
2641    ; luma_src
2642    pxor          mzero, mzero
2643%if ARCH_X86_32
2644    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2645    mov           lumaq, r9m
2646%endif
2647    mova             m4, [lumaq+ 0]
2648    mova             m6, [lumaq+(16<<%2)]
2649%if %2
2650    phaddw           m4, [lumaq+16]
2651    phaddw           m6, [lumaq+48]
2652%endif
2653%if ARCH_X86_32
2654    add           lumaq, r10mp
2655    mov             r9m, lumaq
2656%endif
2657%if %2
2658    pavgw            m4, mzero
2659    pavgw            m6, mzero
2660%endif
2661
2662%if %1
2663    punpckhwd        m3, m4, m0
2664    punpcklwd        m4, m0
2665    punpckhwd        m5, m6, m1
2666    punpcklwd        m6, m1                 ; { luma, chroma }
2667    REPX {pmaddwd x, m14}, m3, m4, m5, m6
2668    REPX {psrad   x, 6}, m3, m4, m5, m6
2669    packssdw         m4, m3
2670    packssdw         m6, m5
2671    REPX {paddw x, m15}, m4, m6
2672    REPX {pmaxsw x, mzero}, m4, m6
2673    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
2674%else
2675    REPX  {pand x, m10}, m4, m6
2676%endif
2677
2678    ; grain = grain_lut[offy+y][offx+x]
2679    movu             m7, [grain_lutq+offxyq*2]
2680%if ARCH_X86_32
2681    mov              r5, [rsp+8*mmsize+0*gprsize]
2682    movd             m5, [grain_lutq+r5*2]
2683%else
2684    movd             m5, [grain_lutq+left_offxyq*2+ 0]
2685%endif
2686    punpcklwd        m5, m7                ; {left0, cur0}
2687%if %1
2688%if ARCH_X86_32
2689    mov              r5, r5m
2690%endif
2691%if %2
2692    pmaddwd          m5, [PIC_ptr(pw_23_22)]
2693%else
2694    pmaddwd          m5, [PIC_ptr(pw_27_17_17_27)]
2695%endif
2696    paddd            m5, [PIC_ptr(pd_16)]
2697%else
2698    pmaddwd          m5, m15
2699    paddd            m5, m14
2700%endif
2701    psrad            m5, 5
2702    packssdw         m5, m5
2703    pmaxsw           m5, m8
2704    pminsw           m5, m9
2705    shufps           m5, m7, q3210
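    ; ^ chroma h-overlap: with horizontal subsampling only col 0 is blended
    ; (23/22 weights), otherwise cols 0-1 use 27/17 and 17/27 as for luma;
    ; the 0,32 filler lanes leave the remaining columns untouched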
2706    movu             m3, [grain_lutq+offxyq*2+16]
2707
2708    ; scaling[luma_src]
2709%if ARCH_X86_32
2710    vpgatherdw       m7, m4, scalingq-1, r0, r5, 8, 1
2711    vpgatherdw       m4, m6, scalingq-1, r0, r5, 8, 1
2712%else
2713    vpgatherdw       m7, m4, scalingq-1, r2, r12, 8, 1
2714    vpgatherdw       m4, m6, scalingq-1, r2, r12, 8, 1
2715%endif
2716    REPX   {psrlw x, 8}, m7, m4
2717
2718    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2719    REPX {pmullw x, m11}, m7, m4
2720    pmulhrsw         m5, m7
2721    pmulhrsw         m3, m4
2722
    ; dst = clip_pixel(src + noise)
2724    paddw            m0, m5
2725    paddw            m1, m3
2726    pmaxsw           m0, m13
2727    pmaxsw           m1, m13
2728    pminsw           m0, m12
2729    pminsw           m1, m12
2730    movifnidn      dstq, dstmp
2731    mova      [dstq+ 0], m0
2732    mova      [dstq+16], m1
2733
2734%if ARCH_X86_32
2735    add            srcq, r2mp
2736    add            dstq, r2mp
2737    mov           dstmp, dstq
2738%else
2739    add            srcq, r13mp
2740    add            dstq, r13mp
2741    add           lumaq, lstrideq
2742%endif
2743    add      grain_lutq, 82*2
2744    dec              hd
2745    jg %%loop_y_h_overlap
2746
2747%if ARCH_X86_32
2748    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
2749    mov              wq, r4mp
2750%endif
2751    add              wq, 16
2752    jge %%end
2753%if ARCH_X86_32
2754    mov            srcq, r1mp
2755%else
2756    mov            srcq, r10mp
2757%endif
2758    mov            dstq, r11mp
2759    mov           lumaq, r12mp
2760    lea            srcq, [srcq+wq*2]
2761    lea            dstq, [dstq+wq*2]
2762    lea           lumaq, [lumaq+wq*(2<<%2)]
2763%if ARCH_X86_32
2764    mov            r0mp, dstq
2765    mov            r9mp, lumaq
2766    mov             r4m, wq
2767%endif
2768
2769%if %2
2770    ; r8m = sbym
2771    test      dword r8m, 2
2772    jne %%loop_x_hv_overlap
2773    jmp %%loop_x_h_overlap
2774%else
2775    or        dword r8m, 4
2776    add          offxyd, 16
2777
2778    ; r8m = sbym
2779    test      dword r8m, 2
2780    jz %%loop_x_odd
2781%if ARCH_X86_32
2782    add dword [rsp+8*mmsize+1*gprsize], 16
2783%else
2784    add            r11d, 16                 ; top_offxy += 16
2785%endif
2786    jmp %%loop_x_odd_v_overlap
2787%endif
2788
2789%%end:
2790    RET
2791
2792%%vertical_overlap:
2793    or              t0d, 2
2794    mov             r8m, t0d
2795
2796%if ARCH_X86_32
2797    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2798%else
2799    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
2800                sby, see, unused1, unused2, unused3, lstride
2801%endif
2802
2803    movzx          sbyd, sbyb
2804%if ARCH_X86_32
2805    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2806
2807    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2808%else
2809    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2810%endif
2811    imul            t0d, sbyd, 173 * 0x00010001
2812    imul           sbyd, 37 * 0x01000100
2813    add             t0d, (105 << 16) | 188
2814    add            sbyd, (178 << 24) | (141 << 8)
2815    and             t0d, 0x00ff00ff
2816    and            sbyd, 0xff00ff00
2817    xor            seed, t0d
2818%if ARCH_X86_32
2819    xor            sbyd, seed
2820
2821    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
2822
2823    mov             r3m, seed
2824    mov            dstq, r0mp
2825    mov           lumaq, r9mp
2826    mov              wq, r4m
2827    lea              r3, [srcq+wq*2]
2828    mov            r1mp, r3
2829    lea              r3, [dstq+wq*2]
2830    mov           r11mp, r3
2831    lea              r3, [lumaq+wq*(2<<%2)]
2832    mov           r12mp, r3
2833%if %3
2834    shl           r10mp, 1
2835%endif
2836%else
2837    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2838
2839    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2840                unused1, unused2, see, unused3, unused4, unused5, luma, lstride
2841
2842    mov        lstrideq, r10mp
2843%if %3
2844    add        lstrideq, lstrideq
2845%endif
2846    mov           lumaq, r9mp
2847    lea             r10, [srcq+wq*2]
2848    lea             r11, [dstq+wq*2]
2849    lea             r12, [lumaq+wq*(2<<%2)]
2850    mov           r10mp, r10
2851    mov           r11mp, r11
2852    mov           r12mp, r12
2853%endif
2854    neg              wq
2855%if ARCH_X86_32
2856    mov             r4m, wq
2857%endif
2858
2859%%loop_x_v_overlap:
2860%if ARCH_X86_32
2861    mov            seed, r3m
2862    xor             t0d, t0d
2863%else
    ; we assume from the block above that bits 8-15 of r7d are zeroed
2865%endif
2866    mov             r6d, seed
2867    or             seed, 0xeff4eff4
2868    test           seeb, seeh
2869    setp            t0b                     ; parity of top_seed
2870    shr            seed, 16
2871    shl             t0d, 16
2872    test           seeb, seeh
2873    setp            t0b                     ; parity of cur_seed
2874    or              r6d, 0x00010001
2875    xor             t0d, r6d
2876    mov            seed, t0d
2877    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2878%if ARCH_X86_32
2879    mov             r3m, seed
2880
2881    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2882
2883    mov           offxd, offyd
2884%else
2885    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2886                offx, offy, see, unused1, top_offxy, unused2, luma, lstride
2887
2888    mov           offyd, seed
2889    mov           offxd, seed
2890%endif
2891    ror           offyd, 8
2892    ror           offxd, 12
2893    and           offyd, 0xf000f
2894    and           offxd, 0xf000f
2895    imul          offyd, 164>>%3
2896    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2897    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2898
2899%if ARCH_X86_32
2900    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
2901%else
2902    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
2903                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
2904%endif
2905    movzx    top_offxyd, offxyw
2906%if ARCH_X86_32
2907    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2908    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2909%endif
2910    shr          offxyd, 16
2911
2912%if %2 == 0
2913%%loop_x_odd_v_overlap:
2914%endif
2915%if %3 == 0
2916%if ARCH_X86_32
2917    mov              r5, r5m
2918%endif
2919    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
2920%endif
2921
2922    mov              hd, r7m
2923    mov      grain_lutq, grain_lutmp
2924%%loop_y_v_overlap:
2925    ; grain = grain_lut[offy+y][offx+x]
2926    movu             m3, [grain_lutq+offxyq*2]
2927%if ARCH_X86_32
2928    mov              r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
2929    movu             m5, [grain_lutq+r0*2]
2930%else
2931    movu             m5, [grain_lutq+top_offxyq*2]
2932%endif
2933    punpckhwd        m7, m5, m3
2934    punpcklwd        m5, m3                 ; {top/cur interleaved}
2935    REPX {pmaddwd x, m2}, m7, m5
2936%if %1
2937%if ARCH_X86_32
2938    mov              r5, r5m
2939%endif
2940    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
2941%else
2942    REPX  {paddd x, m14}, m7, m5
2943%endif
2944    REPX   {psrad x, 5}, m7, m5
2945    packssdw         m3, m5, m7
2946    pmaxsw           m3, m8
2947    pminsw           m3, m9
2948
2949    ; grain = grain_lut[offy+y][offx+x]
2950    movu             m4, [grain_lutq+offxyq*2+16]
2951%if ARCH_X86_32
2952    movu             m5, [grain_lutq+r0*2+16]
2953%else
2954    movu             m5, [grain_lutq+top_offxyq*2+16]
2955%endif
2956    punpckhwd        m7, m5, m4
2957    punpcklwd        m5, m4                 ; {top/cur interleaved}
2958    REPX {pmaddwd x, m2}, m7, m5
2959%if %1
2960    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
2961%else
2962    REPX  {paddd x, m14}, m7, m5
2963%endif
2964    REPX   {psrad x, 5}, m7, m5
2965    packssdw         m4, m5, m7
2966    pmaxsw           m4, m8
2967    pminsw           m4, m9
2968
2969    ; src
2970    mova             m0, [srcq]
2971    mova             m1, [srcq+16]
2972
2973    ; luma_src
2974    pxor          mzero, mzero
2975%if ARCH_X86_32
2976    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
2977
2978    mov           lumaq, r9mp
2979%endif
2980    mova             m5, [lumaq+ 0]
2981    mova             m6, [lumaq+(16<<%2)]
2982%if %2
2983    phaddw           m5, [lumaq+16]
2984    phaddw           m6, [lumaq+48]
2985%endif
2986%if ARCH_X86_32
2987    add           lumaq, r10mp
2988    mov            r9mp, lumaq
2989%endif
2990%if %2
2991    pavgw            m5, mzero
2992    pavgw            m6, mzero
2993%endif
2994
2995%if %1
2996    punpckhwd        m7, m5, m0
2997    punpcklwd        m5, m0
2998    REPX {pmaddwd x, m14}, m7, m5
2999    REPX {psrad   x, 6}, m7, m5
3000    packssdw         m5, m7
3001    punpckhwd        m7, m6, m1
3002    punpcklwd        m6, m1                 ; { luma, chroma }
3003    REPX {pmaddwd x, m14}, m7, m6
3004    REPX {psrad   x, 6}, m7, m6
3005    packssdw         m6, m7
3006    pxor          mzero, mzero
3007    REPX {paddw x, m15}, m5, m6
3008    REPX {pmaxsw x, mzero}, m5, m6
3009    REPX {pminsw x, m10}, m5, m6            ; clip_pixel()
3010%else
3011    REPX  {pand x, m10}, m5, m6
3012%endif
3013
3014    ; scaling[luma_src]
3015%if ARCH_X86_32
3016    vpgatherdw       m7, m5, scalingq-1, r0, r5, 8, 1
3017    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
3018%else
3019    vpgatherdw       m7, m5, scalingq-1, r10, r12, 8, 1
3020    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
3021%endif
3022    REPX   {psrlw x, 8}, m7, m5
3023
3024    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
3025    REPX {pmullw x, m11}, m7, m5
3026    pmulhrsw         m3, m7
3027    pmulhrsw         m4, m5
3028
    ; dst = clip_pixel(src + noise)
3030    paddw            m0, m3
3031    paddw            m1, m4
3032    pmaxsw           m0, m13
3033    pmaxsw           m1, m13
3034    pminsw           m0, m12
3035    pminsw           m1, m12
3036    movifnidn      dstq, dstmp
3037    mova      [dstq+ 0], m0
3038    mova      [dstq+16], m1
3039
3040    dec              hw
3041    jle %%end_y_v_overlap
3042%if ARCH_X86_32
3043    add            srcq, r2mp
3044    add            dstq, r2mp
3045    mov           dstmp, dstq
3046%else
3047    add            srcq, r13mp
3048    add            dstq, r13mp
3049    add           lumaq, lstrideq
3050%endif
3051    add      grain_lutq, 82*2
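    ; with vertical subsampling (%3) a single overlap row (23/22 weights)
    ; suffices; otherwise two rows run here, 27/17 then 17/27, with bit 16 of hd
    ; (toggled by btc) acting as the row counter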
3052%if %3
3053    jmp %%loop_y
3054%else
3055    btc              hd, 16
3056    jc %%loop_y
3057%if ARCH_X86_32
3058    mov              r5, r5m
3059%endif
3060    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
3061    jmp %%loop_y_v_overlap
3062%endif
3063
3064%%end_y_v_overlap:
3065%if ARCH_X86_32
3066    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
3067
3068    mov              wq, r4m
3069%endif
3070    add              wq, 16
3071    jge %%end_hv
3072%if ARCH_X86_32
3073    mov            srcq, r1mp
3074%else
3075    mov            srcq, r10mp
3076%endif
3077    mov            dstq, r11mp
3078    mov           lumaq, r12mp
3079    lea            srcq, [srcq+wq*2]
3080    lea            dstq, [dstq+wq*2]
3081    lea           lumaq, [lumaq+wq*(2<<%2)]
3082%if ARCH_X86_32
3083    mov            r0mp, dstq
3084    mov            r9mp, lumaq
3085    mov             r4m, wq
3086%endif
3087
3088%if %2
    ; since fg_dataq.overlap_flag is guaranteed to be set, we never jump
    ; back to %%loop_x_v_overlap, and instead always fall through to
3091    ; h+v overlap
3092%else
3093    btc       dword r8m, 2
3094    jc %%loop_x_hv_overlap
3095    add          offxyd, 16
3096%if ARCH_X86_32
3097    add dword [rsp+8*mmsize+1*gprsize], 16
3098%else
3099    add            r11d, 16
3100%endif
3101    jmp %%loop_x_odd_v_overlap
3102%endif
3103
3104%%loop_x_hv_overlap:
3105%if ARCH_X86_32
3106    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
3107
3108    mov             t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
3109    add          offxyd, 16
3110    add             t0d, 16
3111    mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
3112    mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
3113
3114    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
3115
3116    mov            seed, r3m
3117    xor             t0d, t0d
3118%else
    ; we assume from the block above that bits 8-15 of r7d are zeroed
3120%endif
3121    mov             r6d, seed
3122    or             seed, 0xeff4eff4
3123    test           seeb, seeh
3124    setp            t0b                     ; parity of top_seed
3125    shr            seed, 16
3126    shl             t0d, 16
3127    test           seeb, seeh
3128    setp            t0b                     ; parity of cur_seed
3129    or              r6d, 0x00010001
3130    xor             t0d, r6d
3131    mov            seed, t0d
3132    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
3133%if ARCH_X86_32
3134    mov             r3m, seed
3135
3136    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
3137
3138    mov           offxd, offyd
3139%else
3140    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
3141                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
3142
3143    lea  topleft_offxyq, [top_offxyq+16]
3144    lea     left_offxyq, [offyq+16]
3145    mov           offyd, seed
3146    mov           offxd, seed
3147%endif
3148    ror           offyd, 8
3149    ror           offxd, 12
3150    and           offyd, 0xf000f
3151    and           offxd, 0xf000f
3152    imul          offyd, 164>>%3
3153    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
3154    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
3155
3156%if ARCH_X86_32
3157    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
3158%else
3159    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
3160                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
3161%endif
3162    movzx    top_offxyd, offxyw
3163%if ARCH_X86_32
3164    mov [rsp+8*mmsize+1*gprsize], top_offxyd
3165
3166    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
3167%endif
3168    shr          offxyd, 16
3169
3170%if %3 == 0
3171%if ARCH_X86_32
3172    mov              r5, r5m
3173%endif
3174    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
3175%endif
3176
3177    mov              hd, r7m
3178    mov      grain_lutq, grain_lutmp
3179%%loop_y_hv_overlap:
3180    ; grain = grain_lut[offy+y][offx+x]
3181%if ARCH_X86_32
3182    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
3183    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
3184    movd             m5, [grain_lutq+r5*2]
3185%else
3186    movd             m5, [grain_lutq+left_offxyq*2]
3187%endif
3188    movu             m7, [grain_lutq+offxyq*2]
3189%if ARCH_X86_32
3190    mov              r5, [rsp+8*mmsize+2*gprsize]
3191    movu             m4, [grain_lutq+r0*2]
3192%if %2
3193    pinsrw           m5, [grain_lutq+r5*2], 2
3194%else
3195    movd             m3, [grain_lutq+r5*2]
3196%endif
3197%else
3198    movu             m4, [grain_lutq+top_offxyq*2]
3199%if %2
3200    pinsrw           m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
3201%else
3202    movd             m3, [grain_lutq+topleft_offxyq*2]
3203%endif
3204%endif
3205%if %2 == 0
3206    punpckldq        m5, m3
3207%endif
3208    punpckldq        m3, m7, m4             ; { cur0/1,top0/1,cur2/3,top2/3 }
3209    punpcklwd        m5, m3                 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
3210%if %1
3211%if ARCH_X86_32
3212    mov              r5, r5m
3213%endif
3214%if %2
3215    movddup          m0, [PIC_ptr(pw_23_22)]
3216%else
3217    movddup          m0, [PIC_ptr(pw_27_17_17_27)]
3218%endif
3219%else
3220    pshufd           m0, m15, q1010
3221%endif
3222    pmaddwd          m5, m0
3223%if %1
3224    paddd            m5, [PIC_ptr(pd_16)]
3225%else
3226    paddd            m5, m14
3227%endif
3228    psrad            m5, 5
3229    packssdw         m5, m5
3230    pmaxsw           m5, m8
3231    pminsw           m5, m9
3232    shufps           m5, m3, q3210          ; cur0/1,top0/1,cur2/3,top2/3
3233    shufps           m3, m5, m7, q3220      ; cur0-7 post-h_filter
3234    shufps           m5, m4, q3231          ; top0-7 post-h_filter
3235
3236    punpckhwd        m7, m5, m3
3237    punpcklwd        m5, m3                 ; {top/cur interleaved}
3238    REPX {pmaddwd x, m2}, m7, m5
3239%if %1
3240    REPX  {paddd x, [PIC_ptr(pd_16)]}, m5, m7
3241%else
3242    REPX  {paddd x, m14}, m5, m7
3243%endif
3244    REPX   {psrad x, 5}, m5, m7
3245    packssdw         m3, m5, m7
3246    pmaxsw           m3, m8
3247    pminsw           m3, m9
3248
3249    ; right half
3250    movu             m4, [grain_lutq+offxyq*2+16]
3251%if ARCH_X86_32
3252    movu             m0, [grain_lutq+r0*2+16]
3253%else
3254    movu             m0, [grain_lutq+top_offxyq*2+16]
3255%endif
3256    punpckhwd        m1, m0, m4
3257    punpcklwd        m0, m4                 ; {top/cur interleaved}
3258    REPX {pmaddwd x, m2}, m1, m0
3259%if %1
3260    REPX  {paddd x, [PIC_ptr(pd_16)]}, m1, m0
3261%else
3262    REPX  {paddd x, m14}, m1, m0
3263%endif
3264    REPX   {psrad x, 5}, m1, m0
3265    packssdw         m4, m0, m1
3266    pmaxsw           m4, m8
3267    pminsw           m4, m9
3268
3269    ; src
3270    mova             m0, [srcq]
3271    mova             m1, [srcq+16]
3272
3273    ; luma_src
3274    pxor          mzero, mzero
3275%if ARCH_X86_32
3276    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
3277
3278    mov           lumaq, r9mp
3279%endif
3280    mova             m6, [lumaq+ 0]
3281    mova             m5, [lumaq+(16<<%2)]
3282%if %2
3283    phaddw           m6, [lumaq+16]
3284    phaddw           m5, [lumaq+48]
3285%endif
3286%if ARCH_X86_32
3287    add           lumaq, r10mp
3288    mov            r9mp, lumaq
3289%endif
3290%if %2
3291    pavgw            m6, mzero
3292    pavgw            m5, mzero
3293%endif
3294
3295%if %1
3296    punpckhwd        m7, m6, m0
3297    punpcklwd        m6, m0
3298    REPX {pmaddwd x, m14}, m7, m6
3299    REPX {psrad   x, 6}, m7, m6
3300    packssdw         m6, m7
3301    punpckhwd        m7, m5, m1
3302    punpcklwd        m5, m1                 ; { luma, chroma }
3303    REPX {pmaddwd x, m14}, m7, m5
3304    REPX {psrad   x, 6}, m7, m5
3305    packssdw         m5, m7
3306    pxor          mzero, mzero
3307    REPX {paddw x, m15}, m6, m5
3308    REPX {pmaxsw x, mzero}, m6, m5
3309    REPX {pminsw x, m10}, m6, m5            ; clip_pixel()
3310%else
3311    REPX  {pand x, m10}, m6, m5
3312%endif
3313
3314    ; scaling[luma_src]
3315%if ARCH_X86_32
3316    vpgatherdw       m7, m6, scalingq-1, r0, r5, 8, 1
3317    vpgatherdw       m6, m5, scalingq-1, r0, r5, 8, 1
3318%else
3319%if %3 == 0
3320    ; register shortage :)
3321    push            r12
3322%endif
3323    vpgatherdw       m7, m6, scalingq-1, r2, r12, 8, 1
3324    vpgatherdw       m6, m5, scalingq-1, r2, r12, 8, 1
3325%if %3 == 0
3326    pop             r12
3327%endif
3328%endif
3329    REPX   {psrlw x, 8}, m7, m6
3330
3331    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
3332    REPX {pmullw x, m11}, m7, m6
3333    pmulhrsw         m3, m7
3334    pmulhrsw         m4, m6
3335
    ; dst = clip_pixel(src + noise)
3337    paddw            m0, m3
3338    paddw            m1, m4
3339    pmaxsw           m0, m13
3340    pmaxsw           m1, m13
3341    pminsw           m0, m12
3342    pminsw           m1, m12
3343    movifnidn      dstq, dstmp
3344    mova      [dstq+ 0], m0
3345    mova      [dstq+16], m1
3346
3347%if ARCH_X86_32
3348    add            srcq, r2mp
3349    add            dstq, r2mp
3350    mov           dstmp, dstq
3351%else
3352    add            srcq, r13mp
3353    add            dstq, r13mp
3354    add           lumaq, lstrideq
3355%endif
3356    add      grain_lutq, 82*2
3357    dec              hw
3358%if %3
3359    jg %%loop_y_h_overlap
3360%else
3361    jle %%end_y_hv_overlap
3362    btc              hd, 16
3363    jc %%loop_y_h_overlap
3364%if ARCH_X86_32
3365    mov              r5, r5m
3366%endif
3367    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
3368    jmp %%loop_y_hv_overlap
3369%%end_y_hv_overlap:
3370%endif
3371%if ARCH_X86_32
3372    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
3373
3374    mov              wq, r4m
3375%endif
3376    add              wq, 16
3377    jge %%end_hv
3378%if ARCH_X86_32
3379    mov            srcq, r1mp
3380%else
3381    mov            srcq, r10mp
3382%endif
3383    mov            dstq, r11mp
3384    mov           lumaq, r12mp
3385    lea            srcq, [srcq+wq*2]
3386    lea            dstq, [dstq+wq*2]
3387    lea           lumaq, [lumaq+wq*(2<<%2)]
3388%if ARCH_X86_32
3389    mov           dstmp, dstq
3390    mov            r9mp, lumaq
3391    mov             r4m, wq
3392%endif
3393%if %2
3394    jmp %%loop_x_hv_overlap
3395%else
3396    or        dword r8m, 4
3397    add          offxyd, 16
3398%if ARCH_X86_32
3399    add dword [rsp+8*mmsize+1*gprsize], 16
3400%else
3401    add            r11d, 16                 ; top_offxy += 16
3402%endif
3403    jmp %%loop_x_odd_v_overlap
3404%endif
3405
3406%%end_hv:
3407    RET
3408%endmacro
3409
3410    %%FGUV_32x32xN_LOOP 1, %2, %3
3411.csfl:
3412    %%FGUV_32x32xN_LOOP 0, %2, %3
3413
3414%if STACK_ALIGNMENT < mmsize
3415DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3416%endif
3417%endmacro
3418
3419FGUV_FN 420, 1, 1
3420FGUV_FN 422, 1, 0
3421FGUV_FN 444, 0, 0
3422