1; Copyright © 2019, VideoLAN and dav1d authors
2; Copyright © 2019, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
SECTION_RODATA

; Constant pool for the SSSE3 film grain functions.
; NOTE(review): pw_1024, pb_27_17, pb_17_27 (directly), pb_23_22, max and min
; are not referenced in this chunk; presumably the grain application code
; later in the file uses them - confirm against the full file before touching.
pw_1024: times 8 dw 1024
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
; pshufb control used by the PRNG to place 0x00/0x80 into each seed lane
; ("set 15th bit for next 4 seeds" in the generation loops)
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
; per-lane feedback tap masks for the 4 interleaved 16-bit PRNG states
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0  ; byte-select masks (loaded at +0/+1)
; per-plane seed XOR values, indexed by uv plane (pw_seed_xor+uvq*4)
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_23_22: times 2 db 23, 22
pb_1: times 4 db 1                         ; pmaddubsw pair-sum weights (luma averaging)
hmul_bits: dw 32768, 16384, 8192, 4096     ; 32768>>n: multiply-based right shifts
round: dw 2048, 1024, 512                  ; PRNG output rounders, indexed by grain_scale_shift
mul_bits: dw 256, 128, 64, 32, 16          ; 256>>n: PRNG state multipliers
round_vals: dw 32, 64, 128, 256, 512       ; AR rounding terms (32<<n), indexed by ar_coeff_shift
max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1

; Alias trick: the 2 bytes preceding pb_17_27 are the tail of pb_27_17, so
; this label yields the byte pattern 27,17,17,27 with no extra storage.
%define pb_27_17_17_27 pb_17_27 - 2
50
; JMP_TABLE name, lag0, lag1, ...
; Builds <name>_table: one 32-bit offset per vararg, each pointing at the
; function's .ar<lag> local label, stored relative to the table base (the
; dispatch code adds the base back before jumping).  %0-1 entries are
; emitted; %rotate 1 steps %2 through the lag list on each iteration.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base   ; offset of .ar<lag> from table base
        %rotate 1
    %endrep
%endmacro
61
; Per-function dispatch tables, indexed by FGData.ar_coeff_lag (0-3).
JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3
66
; Assembly mirror of the C-side film grain parameter struct; field order,
; sizes and padding must stay byte-identical to the C definition, since the
; code below addresses fields purely by these offsets.
struc FGData
    .seed:                      resd 1       ; PRNG seed for grain generation
    .num_y_points:              resd 1
    .y_points:                  resb 14 * 2
    .chroma_scaling_from_luma:  resd 1
    .num_uv_points:             resd 2
    .uv_points:                 resb 2 * 10 * 2
    .scaling_shift:             resd 1
    .ar_coeff_lag:              resd 1       ; AR filter order 0-3 (jump-table index)
    .ar_coeffs_y:               resb 24      ; signed byte AR coefficients, luma
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    .ar_coeff_shift:            resq 1       ; 64-bit field - presumably matches a
                                             ; uint64_t in C; keep size in sync
    .grain_scale_shift:         resd 1       ; scales PRNG output (indexes "round")
    .uv_mult:                   resd 2
    .uv_luma_mult:              resd 2
    .uv_offset:                 resd 2
    .overlap_flag:              resd 1
    .clip_to_restricted_range:  resd 1
endstruc
86
87cextern gaussian_sequence
88
89SECTION .text
90
; SCRATCH src, dst, slot
; Frees up an xmm register while keeping m8+ register names usable on both
; architectures: on x86-32 (only xmm0-7 exist) spill m<src> to stack slot
; <slot> and alias m<dst> to that memory operand; on x86-64 just SWAP the
; register numbering so m<dst> names the value.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP             %1, %2
%endif
%endmacro
99
INIT_XMM ssse3
;------------------------------------------------------------------------------
; generate_grain_y(buf, fg_data)
; Fills buf (73 rows x 82 bytes, stride 82) with signed 8-bit luma grain:
;   1) a PRNG pass that advances four interleaved 16-bit seed lanes per
;      iteration and looks up gaussian_sequence[seed >> 5] for each lane,
;      scaled down according to fg_data->grain_scale_shift;
;   2) an auto-regressive filter pass selected by fg_data->ar_coeff_lag
;      (0..3) via the jump table, refining everything but the border.
; In: r0 = buf, r1 = fg_data (FGData*).
;------------------------------------------------------------------------------
cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
    LEA              r4, $$
%define base r4-$$
    movq             m1, [base+rnd_next_upperbit_mask]
    movq             m4, [base+mul_bits]
    movq             m7, [base+hmul_bits]
    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
    movd             m2, [base+round+r2*2]  ; pmulhrsw rounder for this shift
    movd             m0, [fg_dataq+FGData.seed]
    mova             m5, [base+pb_mask]
    pshuflw          m2, m2, q0000
    pshuflw          m0, m0, q0000          ; broadcast seed to 4 word lanes
    mov              r2, -73*82             ; negative count over the whole buffer
    sub            bufq, r2                 ; bufq += 73*82; write via [bufq+r2]
    lea              r3, [base+gaussian_sequence]
.loop:
    ; Advance 4 PRNG states at once: derive each next seed's top bit from
    ; the feedback taps, then shift the remaining bits into place.
    pand             m6, m0, m1
    psrlw            m3, m6, 10
    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw           m6, m4            ; bits 0x0f00 are set
    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
    psllq            m6, m3, 30
    por              m3, m6
    psllq            m6, m3, 15
    por              m3, m6            ; aggregate each bit into next seed's high bit
    pmulhuw          m6, m0, m7
    por              m3, m6            ; 4 next output seeds
    pshuflw          m0, m3, q3333     ; carry lane 3 forward as the live seed
    psrlw            m3, 5             ; 11-bit indices into gaussian_sequence
%if ARCH_X86_64
    ; gather gaussian_sequence[] for all 4 lane indices via GPRs
    movq             r6, m3
    mov              r8, r6
    movzx           r5d, r6w
    shr             r6d, 16
    shr              r8, 32
    movzx            r7, r8w
    shr              r8, 16

    movd             m6, [r3+r5*2]
    pinsrw           m6, [r3+r6*2], 1
    pinsrw           m6, [r3+r7*2], 2
    pinsrw           m6, [r3+r8*2], 3
%else
    ; same gather, two indices at a time (fewer GPRs on x86-32)
    movd             r6, m3
    pshuflw          m3, m3, q3232
    movzx            r5, r6w
    shr              r6, 16

    movd             m6, [r3+r5*2]
    pinsrw           m6, [r3+r6*2], 1

    movd             r6, m3
    movzx            r5, r6w
    shr              r6, 16

    pinsrw           m6, [r3+r5*2], 2
    pinsrw           m6, [r3+r6*2], 3
%endif
    pmulhrsw         m6, m2            ; scale by grain_scale_shift with rounding
    packsswb         m6, m6
    movd      [bufq+r2], m6            ; store 4 grain bytes
    add              r2, 4
    jl .loop

    ; auto-regression code: dispatch on ar_coeff_lag (0-3) through the
    ; RODATA offset table emitted by JMP_TABLE
    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r2, [base+generate_grain_y_ssse3_table+r2*4]
    lea              r2, [r2+base+generate_grain_y_ssse3_table]
    jmp              r2

.ar1:
    ; Lag-1 AR: predict each pixel from top-left/top/top-right (cf0-2) and
    ; the previously filtered pixel (cf3), >> ar_coeff_shift, then add the
    ; pixel's own grain value and clamp to [-128,127].
%if ARCH_X86_32
    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    mov            bufq, r0
%else
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
    mov             ecx, [fg_dataq+FGData.ar_coeff_shift] ; shift reg resolves to ecx
                                                          ; in all three ABI layouts
%if ARCH_X86_32
    mov             r1m, cf3d
    DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
    pxor             m6, m6
    pcmpgtb          m7, m6, m4
    punpcklbw        m4, m7             ; sign-extend cf0-3 to words
    pinsrw           m4, [base+pw_1], 3 ; lane 3 = 1 (multiplies the rnd term)
    pshufd           m5, m4, q1111      ; (cf2, 1) pairs
    pshufd           m4, m4, q0000      ; (cf0, cf1) pairs
    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
    pshuflw          m3, m3, q0000
    sub            bufq, 82*73-(82*3+79) ; -> row 3; x runs upward from -76
    mov              hd, 70              ; 73 rows minus the 3-row top border
    mov            mind, -128
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -76
    movsx         val3d, byte [bufq+xq-1] ; seed the "left" value from x-1
.x_loop_ar1:
    ; vector part: 4 partial sums (top-row terms + rnd) at once
    movq             m0, [bufq+xq-82-1]     ; top/left
    pcmpgtb          m7, m6, m0
    punpcklbw        m0, m7
    psrldq           m2, m0, 2              ; top
    psrldq           m1, m0, 4              ; top/right
    punpcklwd        m0, m2
    punpcklwd        m1, m3                 ; (top/right, rnd) pairs
    pmaddwd          m0, m4                 ; cf0*topleft + cf1*top
    pmaddwd          m1, m5                 ; cf2*topright + rnd
    paddd            m0, m1
.x_loop_ar1_inner:
    ; serial part: the cf3*left term depends on the pixel just written
    movd          val0d, m0
    psrldq           m0, 4
    imul          val3d, cf3d
    add           val3d, val0d
    sar           val3d, shiftb
    movsx         val0d, byte [bufq+xq]
    add           val3d, val0d              ; add the pixel's own grain value
    cmp           val3d, maxd
    cmovns        val3d, maxd               ; clamp to 127
    cmp           val3d, mind
    cmovs         val3d, mind               ; clamp to -128
    mov  byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xq, 3
    jnz .x_loop_ar1_inner                   ; consume all 4 precomputed sums
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar1
.ar0:                                       ; lag 0: nothing to filter
    RET

.ar2:
    ; Lag-2 AR: 12 signed byte coefficients, scanning x=[-2..+2] of rows
    ; y=-2,-1 (cf0-9) then x=[-2,-1] of the current row (cf10-11).
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*8
%endif
    DEFINE_ARGS buf, fg_data, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd             m6, [base+round_vals-12+shiftq*2]
    movd             m7, [base+byte_blend+1]
    SCRATCH           7, 15, 7              ; m15 = new-byte merge mask
    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    pxor             m7, m7
    pshuflw          m6, m6, q0000
    punpcklwd        m6, m7
    pcmpgtb          m4, m7, m0
    pcmpgtb          m5, m7, m1
    punpcklbw        m0, m4                 ; sign-extend cf0-7 to words
    punpcklbw        m1, m5                 ; sign-extend cf8-11 to words
    DEFINE_ARGS buf, fg_data, h, x
    pshufd           m4, m1, q0000          ; (cf8, cf9)
    pshufd           m5, m1, q1111          ; (cf10, cf11)
    pshufd           m3, m0, q3333          ; (cf6, cf7)
    pshufd           m2, m0, q2222          ; (cf4, cf5)
    pshufd           m1, m0, q1111          ; (cf2, cf3)
    pshufd           m0, m0, q0000          ; (cf0, cf1)
    SCRATCH           0, 8,  0
    SCRATCH           1, 9,  1
    SCRATCH           2, 10, 2
    SCRATCH           3, 11, 3
    SCRATCH           4, 12, 4
    SCRATCH           5, 13, 5
    SCRATCH           6, 14, 6              ; m14 = rounding term
    sub            bufq, 82*73-(82*3+79)    ; -> row 3; x runs upward from -76
    mov              hd, 70
.y_loop_ar2:
    mov              xq, -76

.x_loop_ar2:
    ; vector part: all y=-2 / y=-1 contributions for 4 pixels at once
    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pcmpgtb          m2, m7, m0
    punpckhbw        m1, m0, m2             ; y=-1 row as words
    punpcklbw        m0, m2                 ; y=-2 row as words
    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
    punpcklwd        m2, m0, m5
    punpcklwd        m3, m4
    pmaddwd          m2, m8                 ; cf0/cf1
    pmaddwd          m3, m11                ; cf6/cf7
    paddd            m2, m3

    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
    punpcklwd        m4, m5
    punpcklwd        m6, m1
    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
    punpcklwd        m5, m1
    pmaddwd          m4, m9                 ; cf2/cf3
    pmaddwd          m6, m10                ; cf4/cf5
    pmaddwd          m5, m12                ; cf8/cf9
    paddd            m4, m6
    paddd            m2, m5
    paddd            m2, m4
    paddd            m2, m14                ; + rounding term

    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    ; serial part: cf10/cf11 need the two pixels just produced
    pcmpgtb          m4, m7, m0
    punpcklbw        m1, m0, m4
    pmaddwd          m3, m1, m13            ; cf10*x[-2] + cf11*x[-1]
    paddd            m3, m2
    psrldq           m1, 4                  ; y=0,x=0
    psrldq           m2, 4                  ; shift top to next pixel
    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw            m3, m1                 ; add the pixel's own grain value
    packsswb         m3, m3
    pslldq           m3, 2
    pand             m3, m15                ; isolate the new byte...
    pandn            m1, m15, m0            ; ...keep the old bytes elsewhere
    por              m0, m1, m3
    psrldq           m0, 1
    ; overwrite 2 pixels, but that's ok
    movd      [bufq+xq-1], m0
    inc              xq
    jz .x_loop_ar2_end
    test             xq, 3
    jnz .x_loop_ar2_inner                   ; 4 pixels per vector batch
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar2
    RET

.ar3:
    ; Lag-3 AR: 24 signed byte coefficients, scanning x=[-3..+3] of rows
    ; y=-3,-2,-1 (cf0-20) then x=[-3..-1] of the current row (cf21-23).
    ; Coefficient pairs live in 6 stack slots plus m8-m13 (via SCRATCH).
    DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
%assign stack_offset stack_offset_old
    ALLOC_STACK  -16*14
%elif WIN64
    SUB             rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
    ALLOC_STACK  -16*6
%endif
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd             m6, [base+round_vals-12+shiftq*2]
    movd             m7, [base+byte_blend]
    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pxor             m3, m3
    pcmpgtb          m4, m3, m0
    pcmpgtb          m3, m2
    pshuflw          m6, m6, q0000
    SCRATCH           6, 14, 12             ; m14 = rounding term
    SCRATCH           7, 15, 13             ; m15 = new-byte merge mask
    punpckhbw        m1, m0, m4             ; cf8-15 as words
    punpcklbw        m0, m4                 ; cf0-7 as words
    punpcklbw        m2, m3                 ; cf16-23 as words
    pshufd           m3, m0, q1111
    pshufd           m4, m0, q2222
    pshufd           m5, m0, q3333
    pshufd           m0, m0, q0000
    mova    [rsp+ 0*16], m0                 ; (cf0, cf1)
    mova    [rsp+ 1*16], m3                 ; (cf2, cf3)
    mova    [rsp+ 2*16], m4                 ; (cf4, cf5)
    mova    [rsp+ 3*16], m5                 ; (cf6, cf7)
    pshufd           m6, m1, q1111
    pshufd           m7, m1, q2222
    pshufd           m5, m1, q3333
    pshufd           m1, m1, q0000
    pshufd           m3, m2, q1111
    psrldq           m0, m2, 10
    pinsrw           m2, [base+pw_1], 5     ; lane 5 = 1 (multiplies the rnd term)
    pshufd           m4, m2, q2222
    pshufd           m2, m2, q0000
    pinsrw           m0, [base+round_vals+shiftq*2-10], 3 ; lane 3 = 1<<shift, so the
                                            ; pmaddwd below re-adds the current pixel
    mova    [rsp+ 4*16], m1                 ; (cf8, cf9)
    mova    [rsp+ 5*16], m6                 ; (cf10, cf11)
    SCRATCH           7, 8,  6              ; (cf12, cf13)
    SCRATCH           5, 9,  7              ; (cf14, cf15)
    SCRATCH           2, 10, 8              ; (cf16, cf17)
    SCRATCH           3, 11, 9              ; (cf18, cf19)
    SCRATCH           4, 12, 10             ; (cf20, 1)
    SCRATCH           0, 13, 11             ; (cf21-23, 1<<shift)
    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 82*73-(82*3+79)    ; -> row 3; x runs upward from -76
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76

.x_loop_ar3:
    ; y=-3 row contributions
    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
    pxor             m3, m3
    pcmpgtb          m3, m0
    punpckhbw        m2, m0, m3
    punpcklbw        m0, m3

    psrldq           m5, m0, 2
    psrldq           m6, m0, 4
    psrldq           m7, m0, 6
    punpcklwd        m4, m0, m5
    punpcklwd        m6, m7
    pmaddwd          m4, [rsp+ 0*16]
    pmaddwd          m6, [rsp+ 1*16]
    paddd            m4, m6

    ; y=-2 row contributions
    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
    pxor             m5, m5
    pcmpgtb          m5, m1
    punpckhbw        m3, m1, m5
    punpcklbw        m1, m5
    palignr          m6, m2, m0, 10
    palignr          m7, m2, m0, 12
    psrldq           m0, 8
    punpcklwd        m0, m6
    punpcklwd        m7, m1
    pmaddwd          m0, [rsp+ 2*16]
    pmaddwd          m7, [rsp+ 3*16]
    paddd            m0, m7
    paddd            m0, m4

    psrldq           m4, m1, 2
    psrldq           m5, m1, 4
    psrldq           m6, m1, 6
    psrldq           m7, m1, 8
    punpcklwd        m4, m5
    punpcklwd        m6, m7
    pmaddwd          m4, [rsp+ 4*16]
    pmaddwd          m6, [rsp+ 5*16]
    paddd            m4, m6
    paddd            m0, m4

    ; y=-1 row contributions
    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    pxor             m7, m7
    pcmpgtb          m7, m2
    punpckhbw        m5, m2, m7
    punpcklbw        m2, m7
    palignr          m7, m3, m1, 10
    palignr          m3, m1, 12
    psrldq           m1, m2, 2
    punpcklwd        m7, m3
    punpcklwd        m3, m2, m1
    pmaddwd          m7, m8
    pmaddwd          m3, m9
    paddd            m7, m3
    paddd            m0, m7

    psrldq           m6, m2, 4
    psrldq           m1, m2, 6
    psrldq           m3, m2, 8
    palignr          m4, m5, m2, 10
    palignr          m5, m5, m2, 12

    punpcklwd        m6, m1
    punpcklwd        m3, m4
    punpcklwd        m5, m14                ; pair x=+3 with the rnd term
    pmaddwd          m6, m10
    pmaddwd          m3, m11
    pmaddwd          m5, m12
    paddd            m0, m6
    paddd            m3, m5
    paddd            m0, m3

    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    ; serial part: cf21-23 need the three pixels just produced; the
    ; 1<<shift lane folds in the current grain value before the shift
    pxor             m5, m5
    pcmpgtb          m5, m1
    punpcklbw        m2, m1, m5
    pmaddwd          m2, m13
    pshufd           m3, m2, q1111
    paddd            m2, m3                 ; left+cur
    paddd            m2, m0                 ; add top
    psrldq           m0, 4
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb         m2, m2
    pslldq           m2, 3
    pand             m2, m15                ; isolate the new byte...
    pandn            m3, m15, m1            ; ...keep the old bytes elsewhere
    por              m1, m2, m3
    movd    [bufq+xq-3], m1
    psrldq           m1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner                   ; 4 pixels per vector batch
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar3
    RET
507
508%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
509INIT_XMM ssse3
510cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
511    movifnidn        r2, r2mp
512    movifnidn        r3, r3mp
513    LEA              r4, $$
514%define base r4-$$
515    movq             m1, [base+rnd_next_upperbit_mask]
516    movq             m4, [base+mul_bits]
517    movq             m7, [base+hmul_bits]
518    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
519    movd             m6, [base+round+r5*2]
520    mova             m5, [base+pb_mask]
521    movd             m0, [fg_dataq+FGData.seed]
522    movd             m2, [base+pw_seed_xor+uvq*4]
523    pxor             m0, m2
524    pshuflw          m6, m6, q0000
525    pshuflw          m0, m0, q0000
526    lea              r6, [base+gaussian_sequence]
527%if %2
528%if ARCH_X86_64
529    mov             r7d, 73-35*%3
530%else
531    mov            r3mp, 73-35*%3
532%endif
533    add            bufq, 44
534.loop_y:
535    mov              r5, -44
536.loop_x:
537%else
538    mov              r5, -82*73
539    sub            bufq, r5
540.loop:
541%endif
542    pand             m2, m0, m1
543    psrlw            m3, m2, 10
544    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
545    pmullw           m2, m4             ; bits 0x0f00 are set
546    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
547    psllq            m2, m3, 30
548    por              m3, m2
549    psllq            m2, m3, 15
550    por              m3, m2             ; aggregate each bit into next seed's high bit
551    pmulhuw          m2, m0, m7
552    por              m2, m3             ; 4 next output seeds
553    pshuflw          m0, m2, q3333
554    psrlw            m2, 5
555%if ARCH_X86_64
556    movd            r9d, m2
557    pshuflw          m2, m2, q3232
558    movzx            r8, r9w
559    shr              r9, 16
560
561    movd             m3, [r6+r8*2]
562    pinsrw           m3, [r6+r9*2], 1
563
564    movd            r9d, m2
565    movzx            r8, r9w
566    shr              r9, 16
567
568    pinsrw           m3, [r6+r8*2], 2
569    pinsrw           m3, [r6+r9*2], 3
570%else
571    movd             r2, m2
572    pshuflw          m2, m2, q3232
573    movzx            r1, r2w
574    shr              r2, 16
575
576    movd             m3, [r6+r1*2]
577    pinsrw           m3, [r6+r2*2], 1
578
579    movd             r2, m2
580    movzx            r1, r2w
581    shr              r2, 16
582
583    pinsrw           m3, [r6+r1*2], 2
584    pinsrw           m3, [r6+r2*2], 3
585%endif
586    pmulhrsw         m3, m6
587    packsswb         m3, m3
588    movd      [bufq+r5], m3
589    add              r5, 4
590%if %2
591    jl .loop_x
592    add            bufq, 82
593%if ARCH_X86_64
594    dec             r7d
595%else
596    dec            r3mp
597%endif
598    jg .loop_y
599%else
600    jl .loop
601%endif
602
603%if ARCH_X86_32
604    mov              r2, r2mp
605%endif
606
607    ; auto-regression code
608    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
609    movsxd           r5, [base+generate_grain_uv_%1_ssse3_table+r5*4]
610    lea              r5, [r5+base+generate_grain_uv_%1_ssse3_table]
611    jmp              r5
612
613.ar0:
614    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
615    movifnidn     bufyq, bufymp
616%if ARCH_X86_32
617%assign stack_offset_old stack_offset
618    ALLOC_STACK   -2*16
619%endif
620    imul            uvd, 28
621    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
622    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
623    movd             m4, [base+hmul_bits+shiftq*2]
624    DEFINE_ARGS buf, bufy, h, x
625    pxor             m0, m0
626    pcmpgtb          m0, m5
627    punpcklbw        m5, m0
628    movd             m7, [base+pb_1]
629%if %2
630    movd             m6, [base+hmul_bits+2+%3*2]
631%endif
632    pshuflw          m5, m5, q0000
633    pshuflw          m4, m4, q0000
634    pshufd           m7, m7, q0000
635%if %2
636    pshuflw          m6, m6, q0000
637%endif
638    punpcklqdq       m5, m5
639    punpcklqdq       m4, m4
640%if %2
641    punpcklqdq       m6, m6
642%endif
643    pcmpeqw          m1, m1
644    pslldq           m1, 12>>%2
645    SCRATCH           1, 8, 0
646    SCRATCH           4, 9, 1
647%if %2
648    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
649%else
650    sub            bufq, 82*70-3
651%endif
652    add           bufyq, 3+82*3
653    mov              hd, 70-35*%3
654.y_loop_ar0:
655    xor              xd, xd
656.x_loop_ar0:
657    ; first 32 pixels
658%if %2
659    movu             m1, [bufyq+xq*2]
660%if %3
661    movu             m2, [bufyq+xq*2+82]
662%endif
663    movu             m3, [bufyq+xq*2+16]
664%if %3
665    movu             m4, [bufyq+xq*2+82+16]
666%endif
667    pmaddubsw        m0, m7, m1
668%if %3
669    pmaddubsw        m1, m7, m2
670%endif
671    pmaddubsw        m2, m7, m3
672%if %3
673    pmaddubsw        m3, m7, m4
674    paddw            m0, m1
675    paddw            m2, m3
676%endif
677    pmulhrsw         m0, m6
678    pmulhrsw         m2, m6
679%else
680    movu             m0, [bufyq+xq]
681    pxor             m6, m6
682    pcmpgtb          m6, m0
683    punpckhbw        m2, m0, m6
684    punpcklbw        m0, m6
685%endif
686    pmullw           m0, m5
687    pmullw           m2, m5
688    pmulhrsw         m0, m9
689    pmulhrsw         m2, m9
690    movu             m1, [bufq+xq]
691    pxor             m4, m4
692    pcmpgtb          m4, m1
693    punpckhbw        m3, m1, m4
694%if %2
695    punpcklbw        m1, m4
696    paddw            m2, m3
697    paddw            m0, m1
698%else
699    punpcklbw        m6, m1, m4
700    paddw            m2, m3
701    paddw            m0, m6
702%endif
703    packsswb         m0, m2
704%if %2
705    movu      [bufq+xq], m0
706    add              xd, 16
707    cmp              xd, 32
708    jl .x_loop_ar0
709
710    ; last 6/12 pixels
711    movu             m1, [bufyq+xq*(1+%2)]
712%if %3
713    movu             m2, [bufyq+xq*2+82]
714%endif
715    pmaddubsw        m0, m7, m1
716%if %3
717    pmaddubsw        m1, m7, m2
718    paddw            m0, m1
719%endif
720    pmulhrsw         m0, m6
721    pmullw           m0, m5
722    pmulhrsw         m0, m9
723    movq             m1, [bufq+xq]
724    pxor             m4, m4
725    pcmpgtb          m4, m1
726    punpcklbw        m2, m1, m4
727    paddw            m0, m2
728    packsswb         m0, m0
729    pandn            m2, m8, m0
730    pand             m1, m8
731    por              m2, m1
732    movq      [bufq+xq], m2
733%else
734    add              xd, 16
735    cmp              xd, 80
736    je .y_loop_final_ar0
737    movu   [bufq+xq-16], m0
738    jmp .x_loop_ar0
739.y_loop_final_ar0:
740    pandn            m2, m8, m0
741    pand             m1, m8
742    por              m2, m1
743    movu   [bufq+xq-16], m2
744%endif
745
746    add            bufq, 82
747    add           bufyq, 82<<%3
748    dec              hd
749    jg .y_loop_ar0
750    RET
751
752.ar1:
753%if ARCH_X86_32
754%assign stack_offset stack_offset_old
755%assign stack_size_padded 0
756%xdefine rstk rsp
757%endif
758    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
759    imul            uvd, 28
760    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
761    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
762    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
763%if ARCH_X86_32
764    mov            r3mp, cf3d
765    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
766%elif WIN64
767    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
768    mov            bufq, r0
769%else
770    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
771%endif
772    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
773    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
774%if %2
775    movd             m7, [base+pb_1]
776    movd             m6, [base+hmul_bits+2+%3*2]
777%endif
778    psrldq           m4, 1
779%if ARCH_X86_32
780    DEFINE_ARGS buf, shift, val0, val3, min, max, x
781%elif WIN64
782    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
783%else
784    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
785%endif
786    pxor             m5, m5
787    punpcklwd        m3, m5
788%if %2
789    punpcklwd        m6, m6
790%endif
791    pcmpgtb          m5, m4
792    punpcklbw        m4, m5
793    pshufd           m5, m4, q1111
794    pshufd           m4, m4, q0000
795    pshufd           m3, m3, q0000
796%if %2
797    pshufd           m7, m7, q0000
798    pshufd           m6, m6, q0000
799    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
800%else
801    sub            bufq, 82*69+3
802%endif
803%if ARCH_X86_32
804    add            r1mp, 79+82*3
805    mov            r0mp, 70-35*%3
806%else
807    add           bufyq, 79+82*3
808    mov              hd, 70-35*%3
809%endif
810    mov            mind, -128
811    mov            maxd, 127
812.y_loop_ar1:
813    mov              xq, -(76>>%2)
814    movsx         val3d, byte [bufq+xq-1]
815.x_loop_ar1:
816%if %2
817%if ARCH_X86_32
818    mov              r2, r1mp
819    movq             m0, [r2+xq*2]
820%if %3
821    movq             m1, [r2+xq*2+82]
822%endif
823%else
824    movq             m0, [bufyq+xq*2]
825%if %3
826    movq             m1, [bufyq+xq*2+82]
827%endif
828%endif
829    pmaddubsw        m2, m7, m0
830%if %3
831    pmaddubsw        m0, m7, m1
832    paddw            m2, m0
833%endif
834    pmulhrsw         m2, m6
835%else
836%if ARCH_X86_32
837    mov              r2, r1mp
838    movd             m2, [r2+xq]
839%else
840    movd             m2, [bufyq+xq]
841%endif
842    pxor             m0, m0
843    pcmpgtb          m0, m2
844    punpcklbw        m2, m0
845%endif
846
847    movq             m0, [bufq+xq-82-1]     ; top/left
848    pxor             m1, m1
849    pcmpgtb          m1, m0
850    punpcklbw        m0, m1
851    psrldq           m1, m0, 4              ; top/right
852    punpcklwd        m1, m2
853    psrldq           m2, m0, 2              ; top
854    punpcklwd        m0, m2
855    pmaddwd          m0, m4
856    pmaddwd          m1, m5
857    paddd            m0, m1
858    paddd            m0, m3
859.x_loop_ar1_inner:
860    movd          val0d, m0
861    psrldq           m0, 4
862%if ARCH_X86_32
863    imul          val3d, r3mp
864%else
865    imul          val3d, cf3d
866%endif
867    add           val3d, val0d
868    sar           val3d, shiftb
869    movsx         val0d, byte [bufq+xq]
870    add           val3d, val0d
871    cmp           val3d, maxd
872    cmovns        val3d, maxd
873    cmp           val3d, mind
874    cmovs         val3d, mind
875    mov  byte [bufq+xq], val3b
876    ; keep val3d in-place as left for next x iteration
877    inc              xq
878    jz .x_loop_ar1_end
879    test             xq, 3
880    jnz .x_loop_ar1_inner
881    jmp .x_loop_ar1
882
883.x_loop_ar1_end:
884    add            bufq, 82
885%if ARCH_X86_32
886    add            r1mp, 82<<%3
887    dec            r0mp
888%else
889    add           bufyq, 82<<%3
890    dec              hd
891%endif
892    jg .y_loop_ar1
893    RET
894
895.ar2:
896%if ARCH_X86_32
897%assign stack_offset stack_offset_old
898%assign stack_size_padded 0
899%xdefine rstk rsp
900    ALLOC_STACK   -8*16
901%endif
902    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
903    movifnidn     bufyq, bufymp
904    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
905    imul            uvd, 28
906    movd             m7, [base+round_vals-12+shiftq*2]
907    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
908    pxor             m2, m2
909    pcmpgtb          m2, m0
910    punpckhbw        m1, m0, m2
911    punpcklbw        m0, m2
912    pinsrw           m1, [base+pw_1], 5
913    punpcklwd        m7, m7
914    pshufd           m7, m7, q0000
915    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
916    pshufd           m4, m1, q0000
917    pshufd           m5, m1, q1111
918    pshufd           m6, m1, q2222
919    pshufd           m3, m0, q3333
920    pshufd           m2, m0, q2222
921    pshufd           m1, m0, q1111
922    pshufd           m0, m0, q0000
923    SCRATCH           0, 8,  0
; --- AR lag-2 path (interior of the generate_grain_uv_fn macro; the head of
; this path begins above the visible region). Move the prepared coefficient
; vectors out of the low xmm registers: SCRATCH src, dst, stack-slot.
    SCRATCH           1, 9,  1
    SCRATCH           2, 10, 2
    SCRATCH           3, 11, 3
    SCRATCH           4, 12, 4
    SCRATCH           5, 13, 5
    SCRATCH           6, 14, 6
    SCRATCH           7, 15, 7
%if %2 ; %2 != 0: horizontally subsampled chroma; load luma-averaging constants
    movd             m7, [base+hmul_bits+2+%3*2]
    movd             m6, [base+pb_1]
    punpcklwd        m7, m7
    pshufd           m6, m6, q0000
    pshufd           m7, m7, q0000
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*69+3
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3           ; row count halved when %3 (vertical subsampling)
.y_loop_ar2:
    mov              xq, -(76>>%2)          ; column count halved when %2 (horizontal subsampling)

.x_loop_ar2:
    ; Gather the two previous rows, sign-extend bytes to words, and apply
    ; the AR2 coefficients (pmaddwd pairs neighboring taps).
    pxor             m2, m2
    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pcmpgtb          m2, m0                 ; m2 = sign mask of m0
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2
    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
    punpcklwd        m2, m0, m5
    punpcklwd        m3, m4
    pmaddwd          m2, m8
    pmaddwd          m3, m11
    paddd            m2, m3

    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
    punpcklwd        m4, m5
    punpcklwd        m0, m1
    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
    punpcklwd        m3, m1
    pmaddwd          m4, m9
    pmaddwd          m0, m10
    pmaddwd          m3, m12
    paddd            m4, m0
    paddd            m2, m3
    paddd            m2, m4                 ; m2 = sum of all top-row AR2 taps

    ; Add the luma contribution. With subsampling (%2/%3) the co-located
    ; luma samples are averaged via pb_1/pmaddubsw and rounded by pmulhrsw.
%if %2
    movq             m1, [bufyq+xq*2]
%if %3
    movq             m3, [bufyq+xq*2+82]
%endif
    pmaddubsw        m0, m6, m1
%if %3
    pmaddubsw        m1, m6, m3
    paddw            m0, m1
%endif
    pmulhrsw         m0, m7
%else
    movd             m0, [bufyq+xq]
    pxor             m1, m1
    pcmpgtb          m1, m0
    punpcklbw        m0, m1
%endif
    punpcklwd        m0, m15
    pmaddwd          m0, m14
    paddd            m2, m0

    ; Inner loop: the AR recursion is serial in x (each output feeds the next
    ; tap), so pixels are produced one at a time, 4 per outer iteration.
    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    pxor             m4, m4
    movd             m5, [base+byte_blend+1]
    punpcklbw        m5, m5
.x_loop_ar2_inner:
    pcmpgtb          m1, m4, m0
    punpcklbw        m0, m1
    pmaddwd          m3, m0, m13
    paddd            m3, m2
    psrldq           m2, 4                  ; shift top to next pixel
    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq           m3, 4
    pand             m3, m5                 ; keep only the newly computed lane
    paddw            m0, m3
    packsswb         m0, m0
    movd    [bufq+xq-2], m0
    psrldq           m0, 1
    inc              xq
    jz .x_loop_ar2_end
    test             xq, 3                  ; reload sources every 4 pixels
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82                 ; grain buffer stride is 82 bytes
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar2
    RET
1027
; --- AR lag-3 path of generate_grain_uv_fn: 24 chroma coefficients plus one
; luma coefficient. Coefficient pairs are kept in 7 stack slots (word-pair
; form for pmaddwd) plus xmm8-12 on x86-64 / stack SCRATCH slots on x86-32.
.ar3:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn     bufyq, bufymp
%if ARCH_X86_32
    ALLOC_STACK  -15*16
%else
    SUB             rsp, 16*7
%assign stack_size_padded (stack_size_padded+16*7)
%assign stack_size (stack_size+16*7)
%endif
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul            uvd, 28                 ; sizeof per-plane coefficient stride (28 bytes)

    ; Sign-extend cf0-15 to words and spread them into 8 word-pair vectors.
    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
    pxor             m3, m3
    pcmpgtb          m3, m0
    punpckhbw        m1, m0, m3
    punpcklbw        m0, m3
    pshufd           m2, m0, q1111
    pshufd           m3, m0, q2222
    pshufd           m4, m0, q3333
    pshufd           m0, m0, q0000
    pshufd           m5, m1, q1111
    pshufd           m6, m1, q2222
    pshufd           m7, m1, q3333
    pshufd           m1, m1, q0000
    mova    [rsp+ 0*16], m0
    mova    [rsp+ 1*16], m2
    mova    [rsp+ 2*16], m3
    mova    [rsp+ 3*16], m4
    mova    [rsp+ 4*16], m1
    mova    [rsp+ 5*16], m5
    mova    [rsp+ 6*16], m6
    SCRATCH           7, 8, 7

    ; cf16-24; the last entry is the luma coefficient, and the rounding
    ; constant for the current shift is folded into the same vector.
    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
    pxor             m4, m4
    pcmpgtb          m4, m2
    punpckhbw        m5, m2, m4
    punpcklbw        m2, m4
    pshufd           m4, m2, q3232
    punpcklwd        m3, m4, m5
    pshuflw          m5, m4, q3321
    pshufd           m4, m3, q0000
    pshufd           m3, m2, q1111
    pshufd           m2, m2, q0000
    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
    SCRATCH           2, 9,  8
    SCRATCH           3, 10, 9
    SCRATCH           4, 11, 10
    SCRATCH           5, 12, 11

    ; Rounding constant and (for subsampled chroma) luma-averaging constants.
    movd             m2, [base+round_vals-12+shiftq*2]
%if %2
    movd             m1, [base+pb_1]
    movd             m3, [base+hmul_bits+2+%3*2]
%endif
    pxor             m0, m0
    punpcklwd        m2, m0
%if %2
    punpcklwd        m3, m3
%endif
    pshufd           m2, m2, q0000
%if %2
    pshufd           m1, m1, q0000
    pshufd           m3, m3, q0000
    SCRATCH           1, 13, 12
%endif
    SCRATCH           2, 14, 13
%if %2
    SCRATCH           3, 15, 14
%endif

    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub            bufq, 82*69+3
%endif
    add           bufyq, 79+82*3
    mov              hd, 70-35*%3           ; rows halved when vertically subsampled
.y_loop_ar3:
    mov              xq, -(76>>%2)          ; cols halved when horizontally subsampled

.x_loop_ar3:
    ; Row y=-3: sign-extend 16 bytes and accumulate taps x=[-3..+3].
    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
    pxor             m4, m4
    pcmpgtb          m4, m0
    punpckhbw        m3, m0, m4
    punpcklbw        m0, m4

    psrldq           m5, m0, 2
    psrldq           m6, m0, 4
    psrldq           m7, m0, 6
    punpcklwd        m4, m0, m5
    punpcklwd        m6, m7
    pmaddwd          m4, [rsp+ 0*16]
    pmaddwd          m6, [rsp+ 1*16]
    paddd            m4, m6

    palignr          m2, m3, m0, 10
    palignr          m3, m0, 12
    psrldq           m0, 8

    ; Row y=-2.
    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
    pxor             m6, m6
    pcmpgtb          m6, m1
    punpckhbw        m5, m1, m6
    punpcklbw        m1, m6

    punpcklwd        m0, m2
    punpcklwd        m3, m1
    pmaddwd          m0, [rsp+ 2*16]
    pmaddwd          m3, [rsp+ 3*16]
    paddd            m0, m3
    paddd            m0, m4

    ; Row y=-1.
    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    pxor             m7, m7
    pcmpgtb          m7, m2
    punpckhbw        m6, m2, m7
    punpcklbw        m2, m7

    palignr          m3, m5, m1, 10
    palignr          m5, m1, 12
    psrldq           m4, m2, 2

    punpcklwd        m3, m5
    punpcklwd        m5, m2, m4
    pmaddwd          m3, [rsp+ 6*16]
    pmaddwd          m5, m8
    paddd            m3, m5
    paddd            m0, m3

    psrldq           m3, m1, 2
    psrldq           m4, m1, 4
    psrldq           m5, m1, 6
    psrldq           m1, 8

    punpcklwd        m3, m4
    punpcklwd        m5, m1
    pmaddwd          m3, [rsp+ 4*16]
    pmaddwd          m5, [rsp+ 5*16]
    paddd            m3, m5
    paddd            m0, m3

    ; Luma contribution (averaged when chroma is subsampled).
%if %2
    movq             m1, [bufyq+xq*2]
%if %3
    movq             m3, [bufyq+xq*2+82]
%endif
    pmaddubsw        m7, m13, m1
%if %3
    pmaddubsw        m5, m13, m3
    paddw            m7, m5
%endif
    pmulhrsw         m7, m15
%else
    movd             m7, [bufyq+xq]
    pxor             m1, m1
    pcmpgtb          m1, m7
    punpcklbw        m7, m1
%endif

    ; Remaining y=-1 taps plus the luma/rounding terms.
    psrldq           m1, m2, 4
    psrldq           m3, m2, 6
    palignr          m4, m6, m2, 10
    palignr          m6, m2, 12
    psrldq           m2, 8

    punpcklwd        m1, m3
    punpcklwd        m2, m4
    punpcklwd        m6, m7
    pmaddwd          m1, m9
    pmaddwd          m2, m10
    pmaddwd          m6, m11
    paddd            m1, m2
    paddd            m0, m6
    paddd            m0, m1
    paddd            m0, m14                ; add rounding constant

    ; Serial inner loop: left taps x=[-3,-1] depend on freshly written output.
    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    pxor             m4, m4
    movd             m5, [base+byte_blend]
.x_loop_ar3_inner:
    pcmpgtb          m2, m4, m1
    punpcklbw        m3, m1, m2
    pmaddwd          m2, m3, m12
    pshufd           m3, m2, q1111
    paddd            m2, m3                 ; left+cur
    paddd            m2, m0                 ; add top
    psrldq           m0, 4
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    packsswb         m2, m2
    pandn            m3, m5, m1
    pslld            m2, 24
    pand             m2, m5
    por              m1, m2, m3             ; insert new pixel into byte 3
    movd    [bufq+xq-3], m1
    psrldq           m1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3                  ; reload sources every 4 pixels
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82                 ; grain buffer stride is 82 bytes
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar3
    RET
1246%endmacro
1247
; Expand the chroma grain generator for each subsampling layout.
; Args: layout name, horizontal subsampling (%2), vertical subsampling (%3)
; (from the macro body: %2 halves the column count and doubles the luma x
; step; %3 halves the row count and doubles the luma row stride).
generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0
1251
; vpgatherdw dst, src, base, tmp_gpr1, tmp_gpr2[, tmp_xmm]
; Emulated gather for pre-AVX2 CPUs: the 8 words of %2 are offsets into the
; byte table at %3; one word is loaded per offset and packed into %1.
; Two offsets are processed per GPR round-trip: movd pulls a dword out of the
; xmm, movzx/shr split it into the two 16-bit offsets.
; NOTE(review): when called with only 5 args, %2 itself doubles as the
; shuffle temporary and is clobbered; pass a 6th xmm arg to preserve %2.
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
    movd        %5 %+ d, %2
    pshuflw       %%tmp, %2, q3232        ; park words 2-3 for the next round
%else
    movd        %5 %+ d, %%tmp
%if %%idx == 2
    punpckhqdq    %%tmp, %%tmp           ; move words 4-7 into the low half
%elif %%idx == 4
    psrlq         %%tmp, 32              ; expose words 6-7
%endif
%endif
    movzx       %4 %+ d, %5 %+ w         ; low offset
    shr         %5 %+ d, 16              ; high offset

%if %%idx == 0
    movd             %1, [%3+%4]
%else
    pinsrw           %1, [%3+%4], %%idx + 0
%endif
    pinsrw           %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro
1282
INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
; Apply luma film grain to a 32-row strip. On x86-32 there are too few
; registers, so stack arguments are copied to fixed rsp-relative slots and
; the r0m..r8m %defines below are rebound to those slots.
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov              r0, r0m
    mov              r1, r2m
    mov              r2, r4m
    mov              r3, r6m
    mov              r4, r7m
    mov              r5, r8m

    mov [rsp+6*mmsize+ 3*gprsize], r0
    mov [rsp+6*mmsize+ 5*gprsize], r1
    mov [rsp+6*mmsize+ 7*gprsize], r2
    mov [rsp+6*mmsize+ 9*gprsize], r3
    mov [rsp+6*mmsize+10*gprsize], r4
    mov [rsp+6*mmsize+11*gprsize], r5
%else
cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    mov            srcq, srcm
    mov        fg_dataq, r3m
    mov        scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+6*mmsize+ 3*gprsize]
%define r1m [rsp+6*mmsize+ 4*gprsize]
%define r2m [rsp+6*mmsize+ 5*gprsize]
%define r3m [rsp+6*mmsize+ 6*gprsize]
%define r4m [rsp+6*mmsize+ 7*gprsize]
%define r5m [rsp+6*mmsize+ 8*gprsize]
%define r6m [rsp+6*mmsize+ 9*gprsize]
%define r7m [rsp+6*mmsize+10*gprsize]
%define r8m [rsp+6*mmsize+11*gprsize]
%endif
    LEA              r5, pb_mask
%define base r5-pb_mask
    mov             r5m, picptrq
%else
cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea              r7, [pb_mask]
%define base r7-pb_mask
%endif
    ; Load per-frame constants: scaling shift multiplier (m11), a one-word
    ; blend mask (m10), and the clip min/max for full/restricted range
    ; (m13/m12), each broadcast to all word lanes.
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    movd             m3, [base+mul_bits+r6*2-14]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    pcmpeqw          m2, m2
    psrldq           m2, 14                 ; m2 = 0x0000...ffff (lane 0 only)
    movd             m4, [base+max+r6*4]
    movd             m5, [base+min+r6*2]
    punpcklwd        m3, m3
    punpcklwd        m4, m4
    punpcklwd        m5, m5
    pshufd           m3, m3, q0000
    pshufd           m4, m4, q0000
    pshufd           m5, m5, q0000
    SCRATCH           2, 10, 0
    SCRATCH           3, 11, 1
    SCRATCH           4, 12, 2
    SCRATCH           5, 13, 3

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif
1353
    ; Dispatch on overlap flags: no overlap at all, or (for sby > 0) the
    ; vertical-overlap entry point. Overlap blend constants go to m14/m15.
    mov            sbyd, r8m
    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
    test       overlapd, overlapd
    jz .no_vertical_overlap
    mova             m6, [base+pw_1024]
    movd             m7, [base+pb_27_17_17_27]
    SCRATCH           6, 14, 4
    SCRATCH           7, 15, 5
    test           sbyd, sbyd
    jnz .vertical_overlap
    ; fall-through

.no_vertical_overlap:
    mov             r8m, overlapd
    ; Derive the row seed from sby and the frame seed (LCG-style mix,
    ; then fold to 16 bits).
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul           seed, (173 << 24) | 37
%else
    imul           seed, sbyd, (173 << 24) | 37
%endif
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, unused3
%endif

    ; Iterate columns via negative wq offset from src_bak = src + w;
    ; dst is kept as an offset relative to src.
    lea        src_bakq, [srcq+wq]
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov             r4m, wq
    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
%endif
1397
.loop_x:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; Advance the pseudo-random seed (parity-feedback shift) and split it
    ; into a grain_lut x/y offset pair.
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, unused

    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, unused
%endif

.loop_x_odd:
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq, r0, r5, m3
    vpgatherdw       m5, m1, scalingq, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq, r12, r13, m3
    vpgatherdw       m5, m1, scalingq, r12, r13, m3
%endif
    pcmpeqw          m3, m3
    psrlw            m3, 8
    pand             m4, m3                 ; keep only the gathered low bytes
    pand             m5, m3

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m4
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hd
    jg .loop_y

    ; Column bookkeeping: 16 pixels at a time; r8m bit 2 toggles between
    ; the even (new seed) and odd (offset+16) half-blocks.
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
%else
    lea            srcq, [src_bakq+wq]
%endif
    btc       dword r8m, 2
    jc .next_blk

    add          offxyd, 16
    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
    jz .loop_x_odd

%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add            r11d, 16             ; top_offxyd
%endif
    jnz .loop_x_odd_v_overlap

.next_blk:
    test      dword r8m, 1
    jz .loop_x

    test      dword r8m, 2
    jnz .loop_x_hv_overlap
1519
    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3

    add          offxyd, 16                 ; left_offxyd
    mov [rsp+6*mmsize+0*gprsize], offxyd

    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3

    mov            seed, r3m
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
%endif

    ; Same seed update / offset split as .loop_x.
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq, r0, r5, m3
    vpgatherdw       m5, m1, scalingq, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq, r12, r13, m3
    vpgatherdw       m5, m1, scalingq, r12, r13, m3
%endif
    pcmpeqw          m3, m3
    psrlw            m3, 8
    pand             m4, m3
    pand             m5, m3

    ; grain = grain_lut[offy+y][offx+x]
    ; Blend the previous column's grain into the leftmost lanes using the
    ; pb_27_17_17_27 weights (m15) and pw_1024 rounding (m14); m10 keeps
    ; only lane 0 of the blended result.
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+6*mmsize+0*gprsize]
    movd             m7, [grain_lutq+r5]
%else
    movd             m7, [grain_lutq+left_offxyq]
%endif
    punpcklbw        m7, m3
    pmaddubsw        m6, m15, m7
    pmulhrsw         m6, m14
    packsswb         m6, m6
    pand             m6, m10
    pandn            m7, m10, m3
    por              m6, m7
    pcmpgtb          m2, m6
    punpcklbw        m7, m6, m2
    punpckhbw        m6, m2

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m7, m4
    pmullw           m6, m5
    pmulhrsw         m7, m11
    pmulhrsw         m6, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m7
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hd
    jg .loop_y_h_overlap

%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r1m
    add            srcq, r4m
%else
    lea            srcq, [src_bakq+wq]
%endif
    xor       dword r8m, 4
    add          offxyd, 16

    ; since this half-block had left-overlap, the next does not
    test      dword r8m, 2              ; have_top_overlap
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add            r11d, 16             ; top_offxyd
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET
1660
.vertical_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif

    or         overlapd, 2                  ; top_overlap: overlap & 2
    mov             r8m, overlapd
    movzx          sbyd, sbyb
    ; Build a packed (cur_seed << 16) | top_seed pair: the same seed mix as
    ; the non-overlap path, evaluated for both this row and the row above
    ; in parallel via 0x00010001-style multipliers.
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul           tmpd, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add            tmpd, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and            tmpd, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, tmpd
%if ARCH_X86_32
    xor            sbyd, seed               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3
%endif

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov             r4m, wq
    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%endif

.loop_x_v_overlap:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
    ; because of the 'and tmpd, 0x00ff00ff' above
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, unused, top_offxy

    mov           offyd, seed
    mov           offxd, seed
%endif

    ; Split the packed seed pair into packed cur/top grain offsets.
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, unused, top_offxy
%endif

    movzx    top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+6*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

.loop_x_odd_v_overlap:
    ; m8 (or the x86-32 stack pointer slot) tracks the per-row vertical
    ; blend weights: pb_27_17 for the first row, pb_17_27 for the second.
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov [rsp+5*mmsize+8], r5
%else
    mova             m8, [pb_27_17]
%endif
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq, r0, r5, m3
    vpgatherdw       m5, m1, scalingq, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq, r12, r13, m3
    vpgatherdw       m5, m1, scalingq, r12, r13, m3
%endif
    pcmpeqw          m3, m3
    psrlw            m3, 8
    pand             m4, m3
    pand             m5, m3

    ; grain = grain_lut[offy+y][offx+x]
    ; Weighted blend of the row above (top_offxy) into all 16 lanes.
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+6*mmsize+1*gprsize]
    movu             m7, [grain_lutq+r5]
%else
    movu             m7, [grain_lutq+top_offxyq]
%endif
    punpckhbw        m6, m7, m3
    punpcklbw        m7, m3
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+8]
    pmaddubsw        m3, [r5], m6
    pmaddubsw        m6, [r5], m7
%else
    pmaddubsw        m3, m8, m6
    pmaddubsw        m6, m8, m7
%endif
    pmulhrsw         m3, m14
    pmulhrsw         m6, m14
    packsswb         m6, m3
    pcmpgtb          m7, m2, m6
    punpcklbw        m2, m6, m7
    punpckhbw        m6, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m4
    pmullw           m6, m5
    pmulhrsw         m2, m11
    pmulhrsw         m6, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    ; Advance to the second row's blend weights (pb_17_27).
%if ARCH_X86_32
    add dword [rsp+5*mmsize+8], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
%else
    lea            srcq, [src_bakq+wq]
%endif
    btc       dword r8m, 2
    jc .loop_x_hv_overlap
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+6*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap
1872
; Combined horizontal + vertical overlap: blends left, top and top-left
; grain. (The loop body continues past the end of this visible region.)
.loop_x_hv_overlap:
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov [rsp+5*mmsize+8], r5

    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak

    mov              r5, [rsp+6*mmsize+1*gprsize]
    mov              r4, offxyd
    add              r5, 16
    add              r4, 16
    mov [rsp+6*mmsize+2*gprsize], r5        ; topleft_offxy
    mov [rsp+6*mmsize+0*gprsize], r4        ; left_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak

    xor            tmpd, tmpd
    mov            seed, r3m
%else
    mova             m8, [pb_27_17]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; Same packed (cur|top) seed update as .loop_x_v_overlap.
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut

    movzx            r5, offxyw             ; top_offxy
    mov [rsp+6*mmsize+1*gprsize], r5
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    movzx    top_offxyd, offxyw
%endif
    shr          offxyd, 16

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+6*mmsize+1*gprsize]   ; top_offxy
    mov              r0, [rsp+6*mmsize+0*gprsize]   ; left_offxy
    movu             m6, [grain_lutq+r5]
    mov              r5, [rsp+6*mmsize+2*gprsize]   ; topleft_offxy
    movd             m4, [grain_lutq+r0]
    movd             m7, [grain_lutq+r5]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
    movd             m7, [grain_lutq+topleft_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m4, m3
    punpcklbw        m7, m6
    pmaddubsw        m2, m15, m4
    pmaddubsw        m4, m15, m7
    pmulhrsw         m2, m14
    pmulhrsw         m4, m14
    packsswb         m2, m2
    packsswb         m4, m4
    pand             m2, m10                ; m10 masks lane 0 (the blended edge)
    pand             m4, m10
    pandn            m7, m10, m3
    pandn            m3, m10, m6
    por              m7, m2
    por              m3, m4
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m4, m3, m7
    punpcklbw        m3, m7
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+8]
    pmaddubsw        m7, [r5], m4
    pmaddubsw        m4, [r5], m3
%else
    pmaddubsw        m7, m8, m4
    pmaddubsw        m4, m8, m3
%endif
    pmulhrsw         m7, m14
    pmulhrsw         m4, m14
    packsswb         m4, m7
    pxor             m2, m2
    pcmpgtb          m7, m2, m4
    punpcklbw        m3, m4, m7
    punpckhbw        m4, m7

    ; src
    mova             m0, [srcq]
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m5, m0, scalingq, r0, r5, m7
    vpgatherdw       m6, m1, scalingq, r0, r5, m7
%else
    vpgatherdw       m5, m0, scalingq, r13, r14, m7
    vpgatherdw       m6, m1, scalingq, r13, r14, m7
%endif
    pcmpeqw          m7, m7
    psrlw            m7, 8
    pand             m5, m7
    pand             m6, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m3, m5
    pmullw           m4, m6
    pmulhrsw         m3, m11
    pmulhrsw         m4, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    ; Switch to the second row's vertical blend weights.
%if ARCH_X86_32
    add dword [rsp+5*mmsize+8], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
2042    dec              hw
2043    jz .end_y_hv_overlap
2044    ; 2 lines get vertical overlap, then fall back to non-overlap code for
2045    ; remaining (up to) 30 lines
2046    btc              hd, 16
2047    jnc .loop_y_hv_overlap
2048    jmp .loop_y_h_overlap
2049
2050.end_y_hv_overlap:
2051%if ARCH_X86_32
2052    add            r4mp, 16
2053%else
2054    add              wq, 16
2055%endif
2056    jge .end_hv
2057%if ARCH_X86_32
2058    mov            srcq, r1m
2059    add            srcq, r4m
2060%else
2061    lea            srcq, [src_bakq+wq]
2062%endif
2063    xor       dword r8m, 4
2064    add          offxyd, 16
2065%if ARCH_X86_32
2066    add dword [rsp+6*mmsize+1*gprsize], 16
2067%else
2068    add      top_offxyd, 16
2069%endif
2070    jmp .loop_x_odd_v_overlap
2071
2072.end_hv:
2073    RET
2074
2075%macro FGUV_FN 3 ; name, ss_hor, ss_ver
2076INIT_XMM ssse3
2077%if ARCH_X86_32
2078; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
2079;                         sby, luma, lstride, uv_pl, is_id)
2080%if STACK_ALIGNMENT < mmsize
2081DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
2082cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
2083        tmp, src, scaling, h, fg_data, picptr, unused
2084    mov              r0, r0m
2085    mov              r1, r2m
2086    mov              r2, r4m
2087    mov              r3, r6m
2088    mov              r4, r7m
2089    mov [rsp+8*mmsize+3*gprsize], r0
2090    mov [rsp+8*mmsize+5*gprsize], r1
2091    mov [rsp+8*mmsize+7*gprsize], r2
2092    mov [rsp+8*mmsize+9*gprsize], r3
2093    mov [rsp+8*mmsize+10*gprsize], r4
2094
2095    mov              r0, r8m
2096    mov              r1, r9m
2097    mov              r2, r10m
2098    mov              r4, r11m
2099    mov              r3, r12m
2100    mov [rsp+8*mmsize+11*gprsize], r0
2101    mov [rsp+8*mmsize+12*gprsize], r1
2102    mov [rsp+8*mmsize+13*gprsize], r2
2103    mov [rsp+8*mmsize+14*gprsize], r4
2104%else
2105cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
2106        tmp, src, scaling, h, fg_data, picptr, unused
2107%endif
2108    mov            srcq, srcm
2109    mov        fg_dataq, r3m
2110    mov        scalingq, r5m
2111%if STACK_ALIGNMENT < mmsize
2112%define r0m [rsp+8*mmsize+ 3*gprsize]
2113%define r1m [rsp+8*mmsize+ 4*gprsize]
2114%define r2m [rsp+8*mmsize+ 5*gprsize]
2115%define r3m [rsp+8*mmsize+ 6*gprsize]
2116%define r4m [rsp+8*mmsize+ 7*gprsize]
2117%define r5m [rsp+8*mmsize+ 8*gprsize]
2118%define r6m [rsp+8*mmsize+ 9*gprsize]
2119%define r7m [rsp+8*mmsize+10*gprsize]
2120%define r8m [rsp+8*mmsize+11*gprsize]
2121%define r9m [rsp+8*mmsize+12*gprsize]
2122%define r10m [rsp+8*mmsize+13*gprsize]
2123%define r11m [rsp+8*mmsize+14*gprsize]
2124%define r12m [rsp+8*mmsize+15*gprsize]
2125%endif
2126    LEA              r5, pb_mask
2127%define base r5-pb_mask
2128    mov             r5m, r5
2129%else
2130cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2131                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
2132    lea              r8, [pb_mask]
2133%define base r8-pb_mask
2134%endif
2135    mov             r6d, [fg_dataq+FGData.scaling_shift]
2136    pcmpeqw          m2, m2
2137    movd             m3, [base+mul_bits+r6*2-14]
2138    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
2139    lea            tmpd, [r6d*2]
2140%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
2141    test             r3, r3
2142%else
2143    cmp      dword r12m, 0                      ; is_idm
2144%endif
2145    movd             m5, [base+min+r6*2]
2146    cmovne          r6d, tmpd
2147    movd             m4, [base+max+r6*2]
2148    psrldq           m2, 14+%2
2149    punpcklwd        m3, m3
2150    punpcklwd        m5, m5
2151    punpcklwd        m4, m4
2152    pshufd           m3, m3, q0000
2153    pshufd           m5, m5, q0000
2154    pshufd           m4, m4, q0000
2155    SCRATCH           2, 10, 0
2156    SCRATCH           3, 11, 1
2157    SCRATCH           4, 12, 2
2158    SCRATCH           5, 13, 3
2159
2160    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2161    jne .csfl
2162
2163%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
2164%if ARCH_X86_32
2165    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2166%else
2167    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2168%endif
2169
2170%if %1
2171    mov             r6d, dword r11m
2172    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
2173    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2174    punpcklbw        m6, m1, m0
2175    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
2176    punpcklwd        m6, m6
2177    punpcklwd        m7, m7
2178    pshufd           m6, m6, q0000
2179    pshufd           m7, m7, q0000
2180    SCRATCH           6, 14, 4
2181    SCRATCH           7, 15, 5
2182%endif
2183
2184    mov            sbyd, r8m
2185    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
2186    test       overlapd, overlapd
2187    jz %%no_vertical_overlap
2188%if ARCH_X86_32
2189%if %2
2190    movd             m1, [base+pb_23_22]
2191%else
2192    movd             m1, [base+pb_27_17_17_27]
2193%endif
2194    mova             m0, [base+pw_1024]
2195%else
2196%if %2
2197    movd             m1, [pb_23_22]
2198%else
2199    movd             m1, [pb_27_17_17_27]
2200%endif
2201    mova             m0, [pw_1024]
2202%endif
2203    pshufd           m1, m1, q0000
2204    SCRATCH           0, 8, 6
2205    SCRATCH           1, 9, 7
2206    test           sbyd, sbyd
2207    jnz %%vertical_overlap
2208    ; fall-through
2209
2210%%no_vertical_overlap:
2211    mov             r8m, overlapd
2212%if ARCH_X86_32
2213    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2214    imul           seed, (173 << 24) | 37
2215%else
2216    imul           seed, sbyd, (173 << 24) | 37
2217%endif
2218    add            seed, (105 << 24) | 178
2219    rol            seed, 8
2220    movzx          seed, seew
2221    xor            seed, [fg_dataq+FGData.seed]
2222
2223%if ARCH_X86_32
2224    mov             r3m, seed
2225
2226    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2227%define luma_bakq lumaq
2228
2229    mov              wq, r4m
2230%if %3
2231    shl           r10mp, 1
2232%endif
2233%else
2234    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2235                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
2236
2237    mov        lstrideq, r10mp
2238%endif
2239
2240    mov           lumaq, r9mp
2241    lea        src_bakq, [srcq+wq]
2242    lea       luma_bakq, [lumaq+wq*(1+%2)]
2243    neg              wq
2244    sub            r0mp, srcq
2245%if ARCH_X86_32
2246    mov             r1m, src_bakq
2247    mov            r11m, luma_bakq
2248    mov             r4m, wq
2249
2250    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2251%else
2252    mov           r11mp, src_bakq
2253    mov           r12mp, strideq
2254%endif
2255
2256%%loop_x:
2257%if ARCH_X86_32
2258    mov            seed, r3m
2259%endif
2260    mov             r6d, seed
2261    or             seed, 0xEFF4
2262    shr             r6d, 1
2263    test           seeb, seeh
2264    lea            seed, [r6+0x8000]
2265    cmovp          seed, r6d               ; updated seed
2266%if ARCH_X86_32
2267    mov             r3m, seed
2268
2269    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2270
2271    mov           offxd, offyd
2272%else
2273    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2274                offx, offy, see, overlap, unused1, unused2, lstride
2275
2276    mov           offyd, seed
2277    mov           offxd, seed
2278%endif
2279    ror           offyd, 8
2280    shr           offxd, 12
2281    and           offyd, 0xf
2282    imul          offyd, 164>>%3
2283    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
2284
2285%if ARCH_X86_32
2286    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2287%else
2288    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2289                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
2290%endif
2291
2292%%loop_x_odd:
2293    mov              hd, r7m
2294    mov      grain_lutq, grain_lutmp
2295%%loop_y:
2296    ; src
2297%if ARCH_X86_32
2298    mov           lumaq, r9mp
2299%endif
2300%if %2
2301    mova             m4, [lumaq+ 0]
2302    mova             m6, [lumaq+16]
2303    mova             m0, [srcq]
2304%if ARCH_X86_32
2305    add           lumaq, r10mp
2306    mov            r9mp, lumaq
2307    mov              r5, r5m
2308    movd             m7, [base+pb_1]
2309%else
2310    movd             m7, [pb_1]
2311%endif
2312    pshufd           m7, m7, q0000
2313    pxor             m2, m2
2314    pmaddubsw        m4, m7
2315    pmaddubsw        m6, m7
2316    pavgw            m4, m2
2317    pavgw            m6, m2
2318%else
2319    mova             m4, [lumaq]
2320    mova             m0, [srcq]
2321%if ARCH_X86_32
2322    add           lumaq, r10mp
2323    mov            r9mp, lumaq
2324%endif
2325    pxor             m2, m2
2326%endif
2327
2328%if %1
2329%if %2
2330    packuswb         m4, m6                 ; luma
2331%endif
2332    punpckhbw        m6, m4, m0
2333    punpcklbw        m4, m0                 ; { luma, chroma }
2334    pmaddubsw        m6, m14
2335    pmaddubsw        m4, m14
2336    psraw            m6, 6
2337    psraw            m4, 6
2338    paddw            m6, m15
2339    paddw            m4, m15
2340    packuswb         m4, m6                 ; pack+unpack = clip
2341    punpckhbw        m6, m4, m2
2342    punpcklbw        m4, m2
2343%elif %2 == 0
2344    punpckhbw        m6, m4, m2
2345    punpcklbw        m4, m2
2346%endif
2347
2348    ; scaling[luma_src]
2349%if ARCH_X86_32
2350    vpgatherdw       m7, m4, scalingq, r0, r5
2351    vpgatherdw       m5, m6, scalingq, r0, r5
2352%else
2353    vpgatherdw       m7, m4, scalingq, r12, r2
2354    vpgatherdw       m5, m6, scalingq, r12, r2
2355%endif
2356    pcmpeqw          m1, m1
2357    psrlw            m1, 8
2358    pand             m7, m1
2359    pand             m5, m1
2360
2361    ; unpack chroma_source
2362    punpckhbw        m1, m0, m2
2363    punpcklbw        m0, m2                 ; m0-1: src as word
2364
2365    ; grain = grain_lut[offy+y][offx+x]
2366    movu             m3, [grain_lutq+offxyq+ 0]
2367    pcmpgtb          m6, m2, m3
2368    punpcklbw        m2, m3, m6
2369    punpckhbw        m3, m6
2370
2371    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2372    pmullw           m2, m7
2373    pmullw           m3, m5
2374    pmulhrsw         m2, m11
2375    pmulhrsw         m3, m11
2376
2377%if ARCH_X86_32
2378    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2379%endif
2380
2381    ; dst = clip_pixel(src, noise)
2382    paddw            m0, m2
2383    paddw            m1, m3
2384    pmaxsw           m0, m13
2385    pmaxsw           m1, m13
2386    pminsw           m0, m12
2387    pminsw           m1, m12
2388    packuswb         m0, m1
2389    movifnidn      dstq, dstmp
2390    mova    [dstq+srcq], m0
2391
2392%if ARCH_X86_32
2393    add            srcq, r2mp
2394    ; we already incremented lumaq above
2395%else
2396    add            srcq, r12mp
2397%if %3
2398    lea           lumaq, [lumaq+lstrideq*2]
2399%else
2400    add           lumaq, lstrideq
2401%endif
2402%endif
2403    add      grain_lutq, 82
2404    dec              hw
2405    jg %%loop_y
2406
2407%if ARCH_X86_32
2408    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2409
2410    mov              wq, r4m
2411%endif
2412    add              wq, 16
2413    jge %%end
2414%if ARCH_X86_32
2415    mov            srcq, r1mp
2416    mov           lumaq, r11mp
2417%else
2418    mov            srcq, r11mp
2419%endif
2420    lea           lumaq, [luma_bakq+wq*(1+%2)]
2421    add            srcq, wq
2422%if ARCH_X86_32
2423    mov             r4m, wq
2424    mov             r9m, lumaq
2425%endif
2426%if %2 == 0
2427    ; adjust top_offxy
2428%if ARCH_X86_32
2429    add dword [rsp+8*mmsize+1*gprsize], 16
2430%else
2431    add            r11d, 16
2432%endif
2433    add          offxyd, 16
2434    btc       dword r8m, 2
2435    jc %%loop_x_even
2436    test      dword r8m, 2
2437    jz %%loop_x_odd
2438    jmp %%loop_x_odd_v_overlap
2439%%loop_x_even:
2440%endif
2441    test      dword r8m, 1
2442    jz %%loop_x
2443
    ; note: r8m was overwritten with the overlap flags earlier
    ; (bit 0 = left overlap, bit 1 = top overlap), so this tests top overlap
2445    test      dword r8m, 2
2446    jne %%loop_x_hv_overlap
2447
2448    ; horizontal overlap (without vertical overlap)
2449%%loop_x_h_overlap:
2450%if ARCH_X86_32
2451%if %2
2452    lea              r6, [offxyd+16]
2453    mov [rsp+8*mmsize+0*gprsize], r6
2454%else
2455    mov [rsp+8*mmsize+0*gprsize], offxyd
2456%endif
2457
2458    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
2459
2460    mov            seed, r3m
2461%else
2462    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2463                offx, offy, see, left_offxy, unused1, unused2, lstride
2464
2465%if %2
2466    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2467%else
2468    mov     left_offxyd, offyd
2469%endif
2470%endif
2471    mov             r6d, seed
2472    or             seed, 0xEFF4
2473    shr             r6d, 1
2474    test           seeb, seeh
2475    lea            seed, [r6+0x8000]
2476    cmovp          seed, r6d                ; updated seed
2477
2478%if ARCH_X86_32
2479    mov             r3m, seed
2480
2481    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
2482
2483    mov          offxd, offyd
2484%else
2485    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2486                offx, offy, see, left_offxy, unused1, unused2, lstride
2487
2488    mov           offyd, seed
2489    mov           offxd, seed
2490%endif
2491    ror           offyd, 8
2492    shr           offxd, 12
2493    and           offyd, 0xf
2494    imul          offyd, 164>>%3
2495    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2496
2497%if ARCH_X86_32
2498    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2499%else
2500    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2501                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
2502%endif
2503
2504    mov              hd, r7m
2505    mov      grain_lutq, grain_lutmp
2506%%loop_y_h_overlap:
2507    ; src
2508%if ARCH_X86_32
2509    mov           lumaq, r9mp
2510%endif
2511%if %2
2512    mova             m4, [lumaq+ 0]
2513    mova             m6, [lumaq+16]
2514    mova             m0, [srcq]
2515%if ARCH_X86_32
2516    add           lumaq, r10mp
2517    mov            r9mp, lumaq
2518    mov              r5, r5m
2519    movd             m7, [base+pb_1]
2520%else
2521    movd             m7, [pb_1]
2522%endif
2523    pshufd           m7, m7, q0000
2524    pxor             m2, m2
2525    pmaddubsw        m4, m7
2526    pmaddubsw        m6, m7
2527    pavgw            m4, m2
2528    pavgw            m6, m2
2529%else
2530    mova             m4, [lumaq]
2531    mova             m0, [srcq]
2532%if ARCH_X86_32
2533    add           lumaq, r10mp
2534    mov            r9mp, lumaq
2535%endif
2536    pxor             m2, m2
2537%endif
2538
2539%if %1
2540%if %2
2541    packuswb         m4, m6                 ; luma
2542%endif
2543    punpckhbw        m6, m4, m0
2544    punpcklbw        m4, m0                 ; { luma, chroma }
2545    pmaddubsw        m6, m14
2546    pmaddubsw        m4, m14
2547    psraw            m6, 6
2548    psraw            m4, 6
2549    paddw            m6, m15
2550    paddw            m4, m15
2551    packuswb         m4, m6                 ; pack+unpack = clip
2552    punpckhbw        m6, m4, m2
2553    punpcklbw        m4, m2
2554%elif %2 == 0
2555    punpckhbw        m6, m4, m2
2556    punpcklbw        m4, m2
2557%endif
2558
2559    ; scaling[luma_src]
2560%if ARCH_X86_32
2561    vpgatherdw       m7, m4, scalingq, r0, r5
2562    vpgatherdw       m5, m6, scalingq, r0, r5
2563%else
2564    vpgatherdw       m7, m4, scalingq, r12, r2
2565    vpgatherdw       m5, m6, scalingq, r12, r2
2566%endif
2567    pcmpeqw          m1, m1
2568    psrlw            m1, 8
2569    pand             m7, m1
2570    pand             m5, m1
2571
2572    ; unpack chroma_source
2573    punpckhbw        m1, m0, m2
2574    punpcklbw        m0, m2                 ; m0-1: src as word
2575
2576    ; grain = grain_lut[offy+y][offx+x]
2577    movu             m3, [grain_lutq+offxyq+ 0]
2578%if ARCH_X86_32
2579    mov              r0, [rsp+8*mmsize+0*gprsize]
2580    movd             m4, [grain_lutq+r0+ 0]
2581%else
2582    movd             m4, [grain_lutq+left_offxyq+ 0]
2583%endif
2584    punpcklbw        m2, m4, m3
2585    pmaddubsw        m4, m9, m2
2586    pmulhrsw         m4, m8
2587    packsswb         m4, m4
2588    pand             m4, m10
2589    pandn            m2, m10, m3
2590    por              m3, m4, m2
2591    pxor             m4, m4
2592    pcmpgtb          m4, m3
2593    punpcklbw        m2, m3, m4
2594    punpckhbw        m3, m4
2595
2596    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2597    pmullw           m2, m7
2598    pmullw           m3, m5
2599    pmulhrsw         m2, m11
2600    pmulhrsw         m3, m11
2601
2602%if ARCH_X86_32
2603    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2604%endif
2605
2606    ; dst = clip_pixel(src, noise)
2607    paddw            m0, m2
2608    paddw            m1, m3
2609    pmaxsw           m0, m13
2610    pmaxsw           m1, m13
2611    pminsw           m0, m12
2612    pminsw           m1, m12
2613    packuswb         m0, m1
2614    movifnidn      dstq, dstmp
2615    mova    [dstq+srcq], m0
2616
2617%if ARCH_X86_32
2618    add            srcq, r2mp
2619    ; lumaq has already been incremented above
2620%else
2621    add            srcq, r12mp
2622%if %3
2623    lea           lumaq, [lumaq+lstrideq*2]
2624%else
2625    add           lumaq, lstrideq
2626%endif
2627%endif
2628    add      grain_lutq, 82
2629    dec              hw
2630    jg %%loop_y_h_overlap
2631
2632%if ARCH_X86_32
2633    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2634
2635    mov              wq, r4m
2636%endif
2637    add              wq, 16
2638    jge %%end
2639%if ARCH_X86_32
2640    mov            srcq, r1mp
2641    mov           lumaq, r11mp
2642%else
2643    mov            srcq, r11mp
2644%endif
2645    lea           lumaq, [luma_bakq+wq*(1+%2)]
2646    add            srcq, wq
2647%if ARCH_X86_32
2648    mov             r4m, wq
2649    mov             r9m, lumaq
2650%endif
2651%if %2 == 0
2652    xor       dword r8m, 4
2653    ; adjust top_offxyd
2654%if ARCH_X86_32
2655    add dword [rsp+8*mmsize+1*gprsize], 16
2656%else
2657    add            r11d, 16
2658%endif
2659    add          offxyd, 16
2660%endif
2661
    ; note: r8m was overwritten with the overlap flags earlier
    ; (bit 0 = left overlap, bit 1 = top overlap), so this tests top overlap
2663    test      dword r8m, 2
2664%if %2
2665    jne %%loop_x_hv_overlap
2666    jmp %%loop_x_h_overlap
2667%else
2668    jne %%loop_x_odd_v_overlap
2669    jmp %%loop_x_odd
2670%endif
2671
2672%%end:
2673    RET
2674
2675%%vertical_overlap:
2676%if ARCH_X86_32
2677    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2678%else
2679    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
2680%endif
2681
2682    or         overlapd, 2                  ; top_overlap: overlap & 2
2683    mov             r8m, overlapd
2684    movzx          sbyd, sbyb
2685%if ARCH_X86_32
2686    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2687    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2688%else
2689    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2690%endif
2691    imul           tmpd, sbyd, 173 * 0x00010001
2692    imul           sbyd, 37 * 0x01000100
2693    add            tmpd, (105 << 16) | 188
2694    add            sbyd, (178 << 24) | (141 << 8)
2695    and            tmpd, 0x00ff00ff
2696    and            sbyd, 0xff00ff00
2697    xor            seed, tmpd
2698%if ARCH_X86_32
2699    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
2700
2701    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2702
2703    mov             r3m, seed
2704    mov              wq, r4m
2705%if %3
2706    shl           r10mp, 1
2707%endif
2708%else
2709    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2710
2711    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2712                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
2713
2714    mov        lstrideq, r10mp
2715%endif
2716
2717    mov           lumaq, r9mp
2718    lea        src_bakq, [srcq+wq]
2719    lea       luma_bakq, [lumaq+wq*(1+%2)]
2720    neg              wq
2721    sub            r0mp, srcq
2722%if ARCH_X86_32
2723    mov             r1m, src_bakq
2724    mov            r11m, luma_bakq
2725    mov             r4m, wq
2726
2727    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2728%else
2729    mov           r11mp, src_bakq
2730    mov           r12mp, strideq
2731%endif
2732
2733%%loop_x_v_overlap:
2734%if ARCH_X86_32
2735    mov            seed, r3m
2736    xor            tmpd, tmpd
2737%endif
2738    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
2739    mov             r6d, seed
2740    or             seed, 0xeff4eff4
2741    test           seeb, seeh
2742    setp           tmpb                     ; parity of top_seed
2743    shr            seed, 16
2744    shl            tmpd, 16
2745    test           seeb, seeh
2746    setp           tmpb                     ; parity of cur_seed
2747    or              r6d, 0x00010001
2748    xor            tmpd, r6d
2749    mov            seed, tmpd
2750    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2751
2752%if ARCH_X86_32
2753    mov             r3m, seed
2754
2755    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
2756
2757    mov           offxd, offyd
2758%else
2759    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2760                offx, offy, see, overlap, top_offxy, unused, lstride
2761
2762    mov           offxd, seed
2763    mov           offyd, seed
2764%endif
2765    ror           offyd, 8
2766    ror           offxd, 12
2767    and           offyd, 0xf000f
2768    and           offxd, 0xf000f
2769    imul          offyd, 164>>%3
2770    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2771    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2772
2773%if ARCH_X86_32
2774    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
2775%else
2776    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2777                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
2778%endif
2779
2780    movzx    top_offxyd, offxyw
2781    shr          offxyd, 16
2782%if ARCH_X86_32
2783    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2784
2785    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2786%endif
2787
2788%%loop_x_odd_v_overlap:
2789    mov              hd, r7m
2790    mov      grain_lutq, grain_lutmp
2791%if ARCH_X86_32
2792    mov              r5, r5m
2793    mova             m1, [base+pb_27_17]
2794%else
2795    mova             m1, [pb_27_17]
2796%endif
2797%%loop_y_v_overlap:
2798%if ARCH_X86_32
2799    mov           lumaq, r9mp
2800%endif
2801%if %2
2802    mova             m4, [lumaq+ 0]
2803    mova             m6, [lumaq+16]
2804    mova             m0, [srcq]
2805%if ARCH_X86_32
2806    add           lumaq, r10mp
2807    mov            r9mp, lumaq
2808    mov              r5, r5m
2809    movd             m7, [base+pb_1]
2810%else
2811    movd             m7, [pb_1]
2812%endif
2813    pshufd           m7, m7, q0000
2814    pxor             m2, m2
2815    pmaddubsw        m4, m7
2816    pmaddubsw        m6, m7
2817    pavgw            m4, m2
2818    pavgw            m6, m2
2819%else
2820    mova             m4, [lumaq]
2821    mova             m0, [srcq]
2822%if ARCH_X86_32
2823    add           lumaq, r10mp
2824    mov            r9mp, lumaq
2825%endif
2826    pxor             m2, m2
2827%endif
2828
2829%if %1
2830%if %2
2831    packuswb         m4, m6                 ; luma
2832%endif
2833    punpckhbw        m6, m4, m0
2834    punpcklbw        m4, m0                 ; { luma, chroma }
2835    pmaddubsw        m6, m14
2836    pmaddubsw        m4, m14
2837    psraw            m6, 6
2838    psraw            m4, 6
2839    paddw            m6, m15
2840    paddw            m4, m15
2841    packuswb         m4, m6                 ; pack+unpack = clip
2842    punpckhbw        m6, m4, m2
2843    punpcklbw        m4, m2
2844%elif %2 == 0
2845    punpckhbw        m6, m4, m2
2846    punpcklbw        m4, m2
2847%endif
2848
2849    ; scaling[luma_src]
2850%if ARCH_X86_32
2851    vpgatherdw       m7, m4, scalingq, r0, r5
2852    vpgatherdw       m5, m6, scalingq, r0, r5
2853%else
2854    vpgatherdw       m7, m4, scalingq, r12, r2
2855    vpgatherdw       m5, m6, scalingq, r12, r2
2856%endif
2857    pcmpeqw          m4, m4
2858    psrlw            m4, 8
2859    pand             m7, m4
2860    pand             m5, m4
2861
2862    ; grain = grain_lut[offy+y][offx+x]
2863    movu             m3, [grain_lutq+offxyq]
2864%if ARCH_X86_32
2865    mov              r0, [rsp+8*mmsize+1*gprsize]
2866    movu             m4, [grain_lutq+r0]
2867%else
2868    movu             m4, [grain_lutq+top_offxyq]
2869%endif
2870    punpckhbw        m6, m4, m3
2871    punpcklbw        m4, m3
2872%if %3
2873    pmaddubsw        m2, m9, m6
2874    pmaddubsw        m3, m9, m4
2875%else
2876    pmaddubsw        m2, m1, m6
2877    pmaddubsw        m3, m1, m4
2878%endif
2879    pmulhrsw         m2, m8
2880    pmulhrsw         m3, m8
2881    packsswb         m3, m2
2882    pxor             m6, m6
2883    pcmpgtb          m6, m3
2884    punpcklbw        m2, m3, m6
2885    punpckhbw        m3, m6
2886
2887    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2888    pmullw           m2, m7
2889    pmullw           m3, m5
2890    pmulhrsw         m2, m11
2891    pmulhrsw         m3, m11
2892
2893    ; unpack chroma_source
2894    pxor             m4, m4
2895    punpckhbw        m6, m0, m4
2896    punpcklbw        m0, m4                 ; m0-1: src as word
2897
2898%if ARCH_X86_32
2899    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2900%endif
2901
2902    ; dst = clip_pixel(src, noise)
2903    paddw            m0, m2
2904    paddw            m6, m3
2905    pmaxsw           m0, m13
2906    pmaxsw           m6, m13
2907    pminsw           m0, m12
2908    pminsw           m6, m12
2909    packuswb         m0, m6
2910    movifnidn      dstq, dstmp
2911    mova    [dstq+srcq], m0
2912
2913    dec              hw
2914    je %%end_y_v_overlap
2915%if ARCH_X86_32
2916    add            srcq, r2mp
2917    ; lumaq has already been incremented above
2918%else
2919    add            srcq, r12mp
2920%if %3
2921    lea           lumaq, [lumaq+lstrideq*2]
2922%else
2923    add           lumaq, lstrideq
2924%endif
2925%endif
2926    add      grain_lutq, 82
2927%if %3 == 0
2928    btc              hd, 16
2929%if ARCH_X86_32
2930    mov              r5, r5m
2931    mova             m1, [base+pb_17_27]
2932%else
2933    mova             m1, [pb_17_27]
2934%endif
2935    jnc %%loop_y_v_overlap
2936%endif
2937    jmp %%loop_y
2938
2939%%end_y_v_overlap:
2940%if ARCH_X86_32
2941    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2942
2943    mov              wq, r4m
2944%endif
2945    add              wq, 16
2946    jge %%end_hv
2947%if ARCH_X86_32
2948    mov            srcq, r1mp
2949    mov           lumaq, r11mp
2950%else
2951    mov            srcq, r11mp
2952%endif
2953    lea           lumaq, [luma_bakq+wq*(1+%2)]
2954    add            srcq, wq
2955%if ARCH_X86_32
2956    mov             r4m, wq
2957    mov             r9m, lumaq
2958%endif
2959
2960%if %2
2961    ; since fg_dataq.overlap is guaranteed to be set, we never jump
2962    ; back to .loop_x_v_overlap, and instead always fall-through to
2963    ; h+v overlap
2964%else
2965%if ARCH_X86_32
2966    add dword [rsp+8*mmsize+1*gprsize], 16
2967%else
2968    add      top_offxyd, 16
2969%endif
2970    add          offxyd, 16
2971    btc       dword r8m, 2
2972    jnc %%loop_x_odd_v_overlap
2973%endif
2974
; x-loop entry for blocks with both horizontal and vertical overlap:
; compute left/top/topleft grain offsets, update the PRNG seed pair,
; and derive the new (cur, top) grain LUT offsets from it.
%%loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

    ; on x86-32 the left/top/topleft offsets live in stack slots at
    ; [rsp+8*mmsize+N*gprsize] instead of registers
    mov              r6, [rsp+8*mmsize+1*gprsize]
%if %2
    lea              r0, [r3d+16]
    add              r6, 16
    mov [rsp+8*mmsize+0*gprsize], r0        ; left_offxy
%else
    mov [rsp+8*mmsize+0*gprsize], r3        ; left_offxy
%endif
    mov [rsp+8*mmsize+2*gprsize], r6        ; topleft_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

    mov            seed, r3m
    xor            tmpd, tmpd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    ; previous block's offsets become this block's left/topleft neighbors
%if %2
    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offxyq+16]
%else
    mov  topleft_offxyq, top_offxyq
    mov     left_offxyq, offxyq
%endif

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; advance two packed 16-bit LFSR seeds (cur in high half, top in low):
    ; the parity of the masked seed supplies each feedback bit
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov           offxd, seed
    mov           offyd, seed
%endif
    ; extract two 4-bit x and two 4-bit y offsets (cur and top) from the
    ; seed pair, then scale into a byte offset into the 82-wide grain LUT
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3            ; 164 = 2*82 (grain LUT row stride)
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif

    ; split packed value: low word -> top_offxy, high word -> cur offxy
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov              r5, r5m
    mova             m3, [base+pb_27_17]    ; v-overlap weights for the first row
%else
    mova             m3, [pb_27_17]         ; v-overlap weights for the first row
%endif
; per-row loop for h+v overlap blocks: load luma (downsampling if
; subsampled), look up scaling[src], blend grain with the left/top/topleft
; neighbors, and compute the noise to add to the chroma row.
%%loop_y_hv_overlap:
    ; src
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov           lumaq, r9mp
%endif
%if %2
    ; horizontally subsampled: average luma pixel pairs down to chroma
    ; resolution (pmaddubsw with pb_1 sums each pair, pavgw vs 0 halves
    ; with rounding)
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; %1 path: mix luma into chroma with the multipliers in m14 and the
    ; offset in m15 before the scaling lookup
    ; NOTE(review): m14/m15 are loaded outside this chunk - presumably
    ; uv_mult/uv_luma_mult and offset; confirm against function prologue
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq, r0, r5
    vpgatherdw       m5, m6, scalingq, r0, r5
%else
    ; load the topleft grain pixel early to overlap with the gathers
    movd             m1, [grain_lutq+topleft_offxyq]
%if %3
    vpgatherdw       m7, m4, scalingq, r2, r12
    vpgatherdw       m5, m6, scalingq, r2, r12
%else
    vpgatherdw       m7, m4, scalingq, r2, r13
    vpgatherdw       m5, m6, scalingq, r2, r13
%endif
%endif
    ; keep only the low byte of each gathered word
    pcmpeqw          m2, m2
    psrlw            m2, 8
    pand             m7, m2
    pand             m5, m2

    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov              r0, [rsp+8*mmsize+2*gprsize]       ; topleft_offxy
    mov              r5, [rsp+8*mmsize+1*gprsize]       ; top_offxy
    movd             m1, [grain_lutq+r0]
    mov              r0, [rsp+8*mmsize+0*gprsize]       ; left_offxy
%endif
    movu             m2, [grain_lutq+offxyq]
%if ARCH_X86_32
    movu             m6, [grain_lutq+r5]
    movd             m4, [grain_lutq+r0]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m1, m6
    punpcklbw        m4, m2
%if %2
    punpcklwd        m4, m1
%else
    punpckldq        m4, m1
%endif
    pmaddubsw        m1, m9, m4             ; weighted h-blend (weights in m9)
    pmulhrsw         m1, m8                 ; round (m8 presumably pw_1024 - confirm)
    packsswb         m1, m1
    ; splice the blended edge pixels back into the unblended rows via the
    ; byte mask in m10
    pandn            m4, m10, m2
    pandn            m2, m10, m6
    psrldq           m6, m1, 2-%2
    pand             m1, m10
    pand             m6, m10
    por              m4, m1
    por              m2, m6
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m1, m2, m4
    punpcklbw        m2, m4
%if %3
    pmaddubsw        m4, m9, m1
    pmaddubsw        m1, m9, m2
%else
    pmaddubsw        m4, m3, m1             ; m3 = pb_27_17 / pb_17_27 row weights
    pmaddubsw        m1, m3, m2
%endif
    pmulhrsw         m4, m8
    pmulhrsw         m1, m8
    packsswb         m1, m4
    ; sign-extend the blended grain bytes to words
    pxor             m4, m4
    pcmpgtb          m4, m1
    punpcklbw        m2, m1, m4
    punpckhbw        m1, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m1, m5
    pmulhrsw         m2, m11                ; m11 = rounding factor for scaling_shift
    pmulhrsw         m1, m11
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; unpack chroma source
    pxor             m4, m4
    punpckhbw        m5, m0, m4
    punpcklbw        m0, m4                 ; m0-1: src as word

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m5, m1
    pmaxsw           m0, m13                ; clamp to [min, max] pixel range
    pmaxsw           m5, m13                ; (m13/m12 set up outside this chunk)
    pminsw           m0, m12
    pminsw           m5, m12
    packuswb         m0, m5
    movifnidn      dstq, dstmp
    ; NOTE(review): dst is addressed as [dstq+srcq], so dstq appears to
    ; hold dst-src and srcq serves as the common offset - confirm against
    ; the function prologue
    mova    [dstq+srcq], m0

    ; advance to the next row
%if ARCH_X86_32
    add            srcq, r2mp
    ; lumaq has been adjusted above already
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*(1+%2)]
%else
    add           lumaq, r10mp
%endif
%endif
    add      grain_lutq, 82                 ; grain LUT row stride is 82 bytes
    dec              hw
%if %3
    ; vertically subsampled: only one row of v overlap, then fall back to
    ; the h-overlap-only row loop
    jg %%loop_y_h_overlap
%else
    jle %%end_y_hv_overlap
    ; second v-overlap row uses the swapped 17/27 weights
%if ARCH_X86_32
    mov              r5, r5m
    mova             m3, [base+pb_17_27]
%else
    mova             m3, [pb_17_27]
%endif
    btc              hd, 16                 ; bit 16 tracks which overlap row we are in
    jnc %%loop_y_hv_overlap
%if ARCH_X86_64
    mov        lstrideq, r10mp
%endif
    jmp %%loop_y_h_overlap                  ; remaining rows: h overlap only
%%end_y_hv_overlap:
%if ARCH_X86_64
    mov        lstrideq, r10mp
%endif
%endif

    ; advance to the next 16-pixel column, or finish
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif
%if %2
    jmp %%loop_x_hv_overlap
%else
    ; without horizontal subsampling, alternate between the two 16px
    ; halves of a 32px block (see the odd-column flag in r8m bit 2)
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    add          offxyd, 16
    xor       dword r8m, 4
    jmp %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro
3283
    ; instantiate the shared row/column loop twice: once with %1=1 (the
    ; path that mixes luma into chroma via m14/m15) and once, entered via
    ; the .csfl label, with %1=0 (scaling taken from luma directly).
    ; NOTE(review): the FGUV_FN macro header is outside this chunk -
    ; confirm which entry corresponds to chroma-scaled-from-luma
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro
3288
; instantiate the chroma film-grain functions per layout; the trailing
; arguments match the (1+%2)/(6>>%3) usage above, i.e. presumably
; (layout, ss_hor, ss_ver)
FGUV_FN 420, 1, 1

%if STACK_ALIGNMENT < mmsize
; when the ABI cannot guarantee mmsize stack alignment, arguments are
; spilled and must be re-declared as stack accessors between functions
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 422, 1, 0

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 444, 0, 0
3302