1; Copyright © 2019, VideoLAN and dav1d authors
2; Copyright © 2019, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "ext/x86/x86inc.asm"
27
28SECTION_RODATA
29
30pw_1024: times 8 dw 1024
31pb_27_17: times 8 db 27, 17
32pb_17_27: times 8 db 17, 27
33pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
34rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
35byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
36pw_seed_xor: times 2 dw 0xb524
37             times 2 dw 0x49d8
38pb_23_22: times 2 db 23, 22
39pb_1: times 4 db 1
40hmul_bits: dw 32768, 16384, 8192, 4096
41round: dw 2048, 1024, 512
42mul_bits: dw 256, 128, 64, 32, 16
43round_vals: dw 32, 64, 128, 256, 512
44max: dw 255, 240, 235
45min: dw 0, 16
46pw_1: dw 1
47
48%define pb_27_17_17_27 pb_17_27 - 2
49
50%macro JMP_TABLE 1-*
51    %xdefine %1_table %%table
52    %xdefine %%base %1_table
53    %xdefine %%prefix mangle(private_prefix %+ _%1)
54    %%table:
55    %rep %0 - 1
56        dd %%prefix %+ .ar%2 - %%base
57        %rotate 1
58    %endrep
59%endmacro
60
61JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
62JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
63
64struc FGData
65    .seed:                      resd 1
66    .num_y_points:              resd 1
67    .y_points:                  resb 14 * 2
68    .chroma_scaling_from_luma:  resd 1
69    .num_uv_points:             resd 2
70    .uv_points:                 resb 2 * 10 * 2
71    .scaling_shift:             resd 1
72    .ar_coeff_lag:              resd 1
73    .ar_coeffs_y:               resb 24
74    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
75    .ar_coeff_shift:            resq 1
76    .grain_scale_shift:         resd 1
77    .uv_mult:                   resd 2
78    .uv_luma_mult:              resd 2
79    .uv_offset:                 resd 2
80    .overlap_flag:              resd 1
81    .clip_to_restricted_range:  resd 1
82endstruc
83
84cextern gaussian_sequence
85
86SECTION .text
87
88%macro SCRATCH 3
89%if ARCH_X86_32
90    mova [rsp+%3*mmsize], m%1
91%define m%2 [rsp+%3*mmsize]
92%else
93    SWAP             %1, %2
94%endif
95%endmacro
96
97INIT_XMM ssse3
98cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
99    LEA              r4, $$
100%define base r4-$$
101    movq             m1, [base+rnd_next_upperbit_mask]
102    movq             m4, [base+mul_bits]
103    movq             m7, [base+hmul_bits]
104    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
105    movd             m2, [base+round+r2*2]
106    movd             m0, [fg_dataq+FGData.seed]
107    mova             m5, [base+pb_mask]
108    pshuflw          m2, m2, q0000
109    pshuflw          m0, m0, q0000
110    mov              r2, -73*82
111    sub            bufq, r2
112    lea              r3, [base+gaussian_sequence]
113.loop:
114    pand             m6, m0, m1
115    psrlw            m3, m6, 10
116    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
117    pmullw           m6, m4            ; bits 0x0f00 are set
118    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
119    psllq            m6, m3, 30
120    por              m3, m6
121    psllq            m6, m3, 15
122    por              m3, m6            ; aggregate each bit into next seed's high bit
123    pmulhuw          m6, m0, m7
124    por              m3, m6            ; 4 next output seeds
125    pshuflw          m0, m3, q3333
126    psrlw            m3, 5
127%if ARCH_X86_64
128    movq             r6, m3
129    mov              r8, r6
130    movzx           r5d, r6w
131    shr             r6d, 16
132    shr              r8, 32
133    movzx            r7, r8w
134    shr              r8, 16
135
136    movd             m6, [r3+r5*2]
137    pinsrw           m6, [r3+r6*2], 1
138    pinsrw           m6, [r3+r7*2], 2
139    pinsrw           m6, [r3+r8*2], 3
140%else
141    movd             r6, m3
142    pshuflw          m3, m3, q3232
143    movzx            r5, r6w
144    shr              r6, 16
145
146    movd             m6, [r3+r5*2]
147    pinsrw           m6, [r3+r6*2], 1
148
149    movd             r6, m3
150    movzx            r5, r6w
151    shr              r6, 16
152
153    pinsrw           m6, [r3+r5*2], 2
154    pinsrw           m6, [r3+r6*2], 3
155%endif
156    pmulhrsw         m6, m2
157    packsswb         m6, m6
158    movd      [bufq+r2], m6
159    add              r2, 4
160    jl .loop
161
162    ; auto-regression code
163    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
164    movsxd           r2, [base+generate_grain_y_ssse3_table+r2*4]
165    lea              r2, [r2+base+generate_grain_y_ssse3_table]
166    jmp              r2
167
168.ar1:
169%if ARCH_X86_32
170    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
171%elif WIN64
172    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
173    mov            bufq, r0
174%else
175    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
176%endif
177    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
178    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
179    mov             ecx, [fg_dataq+FGData.ar_coeff_shift]
180%if ARCH_X86_32
181    mov             r1m, cf3d
182    DEFINE_ARGS buf, shift, val3, min, max, x, val0
183%define hd r0mp
184%define cf3d r1mp
185%elif WIN64
186    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
187%else
188    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
189%endif
190    pxor             m6, m6
191    pcmpgtb          m7, m6, m4
192    punpcklbw        m4, m7
193    pinsrw           m4, [base+pw_1], 3
194    pshufd           m5, m4, q1111
195    pshufd           m4, m4, q0000
196    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
197    pshuflw          m3, m3, q0000
198    sub            bufq, 82*73-(82*3+79)
199    mov              hd, 70
200    mov            mind, -128
201    mov            maxd, 127
202.y_loop_ar1:
203    mov              xq, -76
204    movsx         val3d, byte [bufq+xq-1]
205.x_loop_ar1:
206    movq             m0, [bufq+xq-82-1]     ; top/left
207    pcmpgtb          m7, m6, m0
208    punpcklbw        m0, m7
209    psrldq           m2, m0, 2              ; top
210    psrldq           m1, m0, 4              ; top/right
211    punpcklwd        m0, m2
212    punpcklwd        m1, m3
213    pmaddwd          m0, m4
214    pmaddwd          m1, m5
215    paddd            m0, m1
216.x_loop_ar1_inner:
217    movd          val0d, m0
218    psrldq           m0, 4
219    imul          val3d, cf3d
220    add           val3d, val0d
221    sar           val3d, shiftb
222    movsx         val0d, byte [bufq+xq]
223    add           val3d, val0d
224    cmp           val3d, maxd
225    cmovns        val3d, maxd
226    cmp           val3d, mind
227    cmovs         val3d, mind
228    mov  byte [bufq+xq], val3b
229    ; keep val3d in-place as left for next x iteration
230    inc              xq
231    jz .x_loop_ar1_end
232    test             xq, 3
233    jnz .x_loop_ar1_inner
234    jmp .x_loop_ar1
235
236.x_loop_ar1_end:
237    add            bufq, 82
238    dec              hd
239    jg .y_loop_ar1
240.ar0:
241    RET
242
243.ar2:
244%if ARCH_X86_32
245%assign stack_offset_old stack_offset
246    ALLOC_STACK -16*8
247%endif
248    DEFINE_ARGS buf, fg_data, shift
249    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
250    movd             m6, [base+round_vals-12+shiftq*2]
251    movd             m7, [base+byte_blend+1]
252    SCRATCH           7, 15, 7
253    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
254    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
255    pxor             m7, m7
256    pshuflw          m6, m6, q0000
257    punpcklwd        m6, m7
258    pcmpgtb          m4, m7, m0
259    pcmpgtb          m5, m7, m1
260    punpcklbw        m0, m4
261    punpcklbw        m1, m5
262    DEFINE_ARGS buf, fg_data, h, x
263    pshufd           m4, m1, q0000
264    pshufd           m5, m1, q1111
265    pshufd           m3, m0, q3333
266    pshufd           m2, m0, q2222
267    pshufd           m1, m0, q1111
268    pshufd           m0, m0, q0000
269    SCRATCH           0, 8,  0
270    SCRATCH           1, 9,  1
271    SCRATCH           2, 10, 2
272    SCRATCH           3, 11, 3
273    SCRATCH           4, 12, 4
274    SCRATCH           5, 13, 5
275    SCRATCH           6, 14, 6
276    sub            bufq, 82*73-(82*3+79)
277    mov              hd, 70
278.y_loop_ar2:
279    mov              xq, -76
280
281.x_loop_ar2:
282    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
283    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
284    pcmpgtb          m2, m7, m0
285    punpckhbw        m1, m0, m2
286    punpcklbw        m0, m2
287    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
288    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
289    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
290    punpcklwd        m2, m0, m5
291    punpcklwd        m3, m4
292    pmaddwd          m2, m8
293    pmaddwd          m3, m11
294    paddd            m2, m3
295
296    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
297    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
298    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
299    punpcklwd        m4, m5
300    punpcklwd        m6, m1
301    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
302    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
303    punpcklwd        m5, m1
304    pmaddwd          m4, m9
305    pmaddwd          m6, m10
306    pmaddwd          m5, m12
307    paddd            m4, m6
308    paddd            m2, m5
309    paddd            m2, m4
310    paddd            m2, m14
311
312    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
313.x_loop_ar2_inner:
314    pcmpgtb          m4, m7, m0
315    punpcklbw        m1, m0, m4
316    pmaddwd          m3, m1, m13
317    paddd            m3, m2
318    psrldq           m1, 4                  ; y=0,x=0
319    psrldq           m2, 4                  ; shift top to next pixel
320    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
321    ; don't packssdw since we only care about one value
322    paddw            m3, m1
323    packsswb         m3, m3
324    pslldq           m3, 2
325    pand             m3, m15
326    pandn            m1, m15, m0
327    por              m0, m1, m3
328    psrldq           m0, 1
329    ; overwrite 2 pixels, but that's ok
330    movd      [bufq+xq-1], m0
331    inc              xq
332    jz .x_loop_ar2_end
333    test             xq, 3
334    jnz .x_loop_ar2_inner
335    jmp .x_loop_ar2
336
337.x_loop_ar2_end:
338    add            bufq, 82
339    dec              hd
340    jg .y_loop_ar2
341    RET
342
343.ar3:
344    DEFINE_ARGS buf, fg_data, shift
345%if ARCH_X86_32
346%assign stack_offset stack_offset_old
347    ALLOC_STACK  -16*14
348%elif WIN64
349    SUB             rsp, 16*6
350%assign stack_size_padded (stack_size_padded+16*6)
351%assign stack_size (stack_size+16*6)
352%else
353    ALLOC_STACK  -16*6
354%endif
355    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
356    movd             m6, [base+round_vals-12+shiftq*2]
357    movd             m7, [base+byte_blend]
358    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
359    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
360    pxor             m3, m3
361    pcmpgtb          m4, m3, m0
362    pcmpgtb          m3, m2
363    pshuflw          m6, m6, q0000
364    SCRATCH           6, 14, 12
365    SCRATCH           7, 15, 13
366    punpckhbw        m1, m0, m4
367    punpcklbw        m0, m4
368    punpcklbw        m2, m3
369    pshufd           m3, m0, q1111
370    pshufd           m4, m0, q2222
371    pshufd           m5, m0, q3333
372    pshufd           m0, m0, q0000
373    mova    [rsp+ 0*16], m0
374    mova    [rsp+ 1*16], m3
375    mova    [rsp+ 2*16], m4
376    mova    [rsp+ 3*16], m5
377    pshufd           m6, m1, q1111
378    pshufd           m7, m1, q2222
379    pshufd           m5, m1, q3333
380    pshufd           m1, m1, q0000
381    pshufd           m3, m2, q1111
382    psrldq           m0, m2, 10
383    pinsrw           m2, [base+pw_1], 5
384    pshufd           m4, m2, q2222
385    pshufd           m2, m2, q0000
386    pinsrw           m0, [base+round_vals+shiftq*2-10], 3
387    mova    [rsp+ 4*16], m1
388    mova    [rsp+ 5*16], m6
389    SCRATCH           7, 8,  6
390    SCRATCH           5, 9,  7
391    SCRATCH           2, 10, 8
392    SCRATCH           3, 11, 9
393    SCRATCH           4, 12, 10
394    SCRATCH           0, 13, 11
395    DEFINE_ARGS buf, fg_data, h, x
396    sub            bufq, 82*73-(82*3+79)
397    mov              hd, 70
398.y_loop_ar3:
399    mov              xq, -76
400
401.x_loop_ar3:
402    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
403    pxor             m3, m3
404    pcmpgtb          m3, m0
405    punpckhbw        m2, m0, m3
406    punpcklbw        m0, m3
407
408    psrldq           m5, m0, 2
409    psrldq           m6, m0, 4
410    psrldq           m7, m0, 6
411    punpcklwd        m4, m0, m5
412    punpcklwd        m6, m7
413    pmaddwd          m4, [rsp+ 0*16]
414    pmaddwd          m6, [rsp+ 1*16]
415    paddd            m4, m6
416
417    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
418    pxor             m5, m5
419    pcmpgtb          m5, m1
420    punpckhbw        m3, m1, m5
421    punpcklbw        m1, m5
422    palignr          m6, m2, m0, 10
423    palignr          m7, m2, m0, 12
424    psrldq           m0, 8
425    punpcklwd        m0, m6
426    punpcklwd        m7, m1
427    pmaddwd          m0, [rsp+ 2*16]
428    pmaddwd          m7, [rsp+ 3*16]
429    paddd            m0, m7
430    paddd            m0, m4
431
432    psrldq           m4, m1, 2
433    psrldq           m5, m1, 4
434    psrldq           m6, m1, 6
435    psrldq           m7, m1, 8
436    punpcklwd        m4, m5
437    punpcklwd        m6, m7
438    pmaddwd          m4, [rsp+ 4*16]
439    pmaddwd          m6, [rsp+ 5*16]
440    paddd            m4, m6
441    paddd            m0, m4
442
443    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
444    pxor             m7, m7
445    pcmpgtb          m7, m2
446    punpckhbw        m5, m2, m7
447    punpcklbw        m2, m7
448    palignr          m7, m3, m1, 10
449    palignr          m3, m1, 12
450    psrldq           m1, m2, 2
451    punpcklwd        m7, m3
452    punpcklwd        m3, m2, m1
453    pmaddwd          m7, m8
454    pmaddwd          m3, m9
455    paddd            m7, m3
456    paddd            m0, m7
457
458    psrldq           m6, m2, 4
459    psrldq           m1, m2, 6
460    psrldq           m3, m2, 8
461    palignr          m4, m5, m2, 10
462    palignr          m5, m5, m2, 12
463
464    punpcklwd        m6, m1
465    punpcklwd        m3, m4
466    punpcklwd        m5, m14
467    pmaddwd          m6, m10
468    pmaddwd          m3, m11
469    pmaddwd          m5, m12
470    paddd            m0, m6
471    paddd            m3, m5
472    paddd            m0, m3
473
474    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
475.x_loop_ar3_inner:
476    pxor             m5, m5
477    pcmpgtb          m5, m1
478    punpcklbw        m2, m1, m5
479    pmaddwd          m2, m13
480    pshufd           m3, m2, q1111
481    paddd            m2, m3                 ; left+cur
482    paddd            m2, m0                 ; add top
483    psrldq           m0, 4
484    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
485    ; don't packssdw since we only care about one value
486    packsswb         m2, m2
487    pslldq           m2, 3
488    pand             m2, m15
489    pandn            m3, m15, m1
490    por              m1, m2, m3
491    movd    [bufq+xq-3], m1
492    psrldq           m1, 1
493    inc              xq
494    jz .x_loop_ar3_end
495    test             xq, 3
496    jnz .x_loop_ar3_inner
497    jmp .x_loop_ar3
498
499.x_loop_ar3_end:
500    add            bufq, 82
501    dec              hd
502    jg .y_loop_ar3
503    RET
504
505INIT_XMM ssse3
506cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
507    movifnidn        r2, r2mp
508    movifnidn        r3, r3mp
509    LEA              r4, $$
510%define base r4-$$
511    movq             m1, [base+rnd_next_upperbit_mask]
512    movq             m4, [base+mul_bits]
513    movq             m7, [base+hmul_bits]
514    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
515    movd             m6, [base+round+r5*2]
516    mova             m5, [base+pb_mask]
517    movd             m0, [fg_dataq+FGData.seed]
518    movd             m2, [base+pw_seed_xor+uvq*4]
519    pxor             m0, m2
520    pshuflw          m6, m6, q0000
521    pshuflw          m0, m0, q0000
522    lea              r6, [base+gaussian_sequence]
523%if ARCH_X86_64
524    mov             r7d, 38
525%else
526    mov            r3mp, 38
527%endif
528    add            bufq, 44
529.loop_y:
530    mov              r5, -44
531.loop_x:
532    pand             m2, m0, m1
533    psrlw            m3, m2, 10
534    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
535    pmullw           m2, m4             ; bits 0x0f00 are set
536    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
537    psllq            m2, m3, 30
538    por              m3, m2
539    psllq            m2, m3, 15
540    por              m3, m2             ; aggregate each bit into next seed's high bit
541    pmulhuw          m2, m0, m7
542    por              m2, m3             ; 4 next output seeds
543    pshuflw          m0, m2, q3333
544    psrlw            m2, 5
545%if ARCH_X86_64
546    movd            r9d, m2
547    pshuflw          m2, m2, q3232
548    movzx            r8, r9w
549    shr              r9, 16
550
551    movd             m3, [r6+r8*2]
552    pinsrw           m3, [r6+r9*2], 1
553
554    movd            r9d, m2
555    movzx            r8, r9w
556    shr              r9, 16
557
558    pinsrw           m3, [r6+r8*2], 2
559    pinsrw           m3, [r6+r9*2], 3
560%else
561    movd             r2, m2
562    pshuflw          m2, m2, q3232
563    movzx            r1, r2w
564    shr              r2, 16
565
566    movd             m3, [r6+r1*2]
567    pinsrw           m3, [r6+r2*2], 1
568
569    movd             r2, m2
570    movzx            r1, r2w
571    shr              r2, 16
572
573    pinsrw           m3, [r6+r1*2], 2
574    pinsrw           m3, [r6+r2*2], 3
575%endif
576    pmulhrsw         m3, m6
577    packsswb         m3, m3
578    movd      [bufq+r5], m3
579    add              r5, 4
580    jl .loop_x
581    add            bufq, 82
582%if ARCH_X86_64
583    dec             r7d
584%else
585    dec            r3mp
586%endif
587    jg .loop_y
588
589%if ARCH_X86_32
590    mov              r2, r2mp
591%endif
592
593    ; auto-regression code
594    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
595    movsxd           r5, [base+generate_grain_uv_420_ssse3_table+r5*4]
596    lea              r5, [r5+base+generate_grain_uv_420_ssse3_table]
597    jmp              r5
598
599.ar0:
600    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
601    movifnidn     bufyq, bufymp
602%if ARCH_X86_32
603%assign stack_offset_old stack_offset
604    ALLOC_STACK   -2*16
605%endif
606    imul            uvd, 28
607    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
608    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
609    movd             m4, [base+hmul_bits+shiftq*2]
610    movd             m1, [base+byte_blend]
611    DEFINE_ARGS buf, bufy, h
612    pxor             m0, m0
613    pcmpgtb          m0, m5
614    punpcklbw        m5, m0
615    movd             m7, [base+pb_1]
616    movd             m6, [base+hmul_bits+4]
617    pshuflw          m5, m5, q0000
618    pshuflw          m4, m4, q0000
619    pshufd           m7, m7, q0000
620    pshuflw          m6, m6, q0000
621    punpcklqdq       m5, m5
622    punpcklqdq       m4, m4
623    punpcklqdq       m6, m6
624    punpcklbw        m1, m1
625    SCRATCH           1, 8, 0
626    SCRATCH           4, 9, 1
627    sub            bufq, 82*38+82-(82*3+41)
628    add           bufyq, 3+82*3
629    mov              hd, 35
630.y_loop_ar0:
631    ; first 32 pixels
632    movu             m1, [bufyq]
633    movu             m2, [bufyq+82]
634    movu             m3, [bufyq+16]
635    movu             m4, [bufyq+82+16]
636    pmaddubsw        m0, m7, m1
637    pmaddubsw        m1, m7, m2
638    pmaddubsw        m2, m7, m3
639    pmaddubsw        m3, m7, m4
640    paddw            m0, m1
641    paddw            m2, m3
642    pmulhrsw         m0, m6
643    pmulhrsw         m2, m6
644    pmullw           m0, m5
645    pmullw           m2, m5
646    pmulhrsw         m0, m9
647    pmulhrsw         m2, m9
648    packsswb         m0, m2
649    movu             m1, [bufq]
650    punpckhbw        m2, m0, m1
651    punpcklbw        m0, m1
652    pmaddubsw        m1, m7, m2
653    pmaddubsw        m2, m7, m0
654    packsswb         m2, m1
655    movu         [bufq], m2
656    add           bufyq, 32
657    add            bufq, 16
658    xor              hd, 0x10000
659    test             hd, 0x10000
660    jnz .y_loop_ar0
661
662    ; last 6 pixels
663    movu             m1, [bufyq]
664    movu             m2, [bufyq+82]
665    pmaddubsw        m0, m7, m1
666    pmaddubsw        m1, m7, m2
667    paddw            m0, m1
668    pmulhrsw         m0, m6
669    pmullw           m0, m5
670    pmulhrsw         m0, m9
671    packsswb         m0, m0
672    movq             m1, [bufq]
673    punpcklbw        m0, m1
674    pmaddubsw        m2, m7, m0
675    packsswb         m2, m2
676    pandn            m0, m8, m2
677    pand             m1, m8
678    por              m0, m1
679    movq         [bufq], m0
680
681    add            bufq, 82-32
682    add           bufyq, 82*2-64
683    dec              hd
684    jg .y_loop_ar0
685    RET
686
687.ar1:
688%if ARCH_X86_32
689%assign stack_offset stack_offset_old
690%assign stack_size_padded 0
691%xdefine rstk rsp
692%endif
693    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
694    imul            uvd, 28
695    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
696    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
697    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
698%if ARCH_X86_32
699    mov            r3mp, cf3d
700    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
701%elif WIN64
702    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
703    mov            bufq, r0
704%else
705    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
706%endif
707    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
708    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
709    movd             m7, [base+pb_1]
710    movd             m6, [base+hmul_bits+4]
711    psrldq           m4, 1
712%if ARCH_X86_32
713    DEFINE_ARGS buf, shift, val0, val3, min, max, x
714%elif WIN64
715    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
716%else
717    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
718%endif
719    pxor             m5, m5
720    punpcklwd        m3, m5
721    punpcklwd        m6, m6
722    pcmpgtb          m5, m4
723    punpcklbw        m4, m5
724    pshufd           m5, m4, q1111
725    pshufd           m4, m4, q0000
726    pshufd           m3, m3, q0000
727    pshufd           m7, m7, q0000
728    pshufd           m6, m6, q0000
729    sub            bufq, 82*38+44-(82*3+41)
730%if ARCH_X86_32
731    add            r1mp, 79+82*3
732    mov            r0mp, 35
733%else
734    add           bufyq, 79+82*3
735    mov              hd, 35
736%endif
737    mov            mind, -128
738    mov            maxd, 127
739.y_loop_ar1:
740    mov              xq, -38
741    movsx         val3d, byte [bufq+xq-1]
742.x_loop_ar1:
743%if ARCH_X86_32
744    mov              r2, r1mp
745    movq             m0, [r2+xq*2]
746    movq             m1, [r2+xq*2+82]
747%else
748    movq             m0, [bufyq+xq*2]
749    movq             m1, [bufyq+xq*2+82]
750%endif
751    pmaddubsw        m2, m7, m0
752    pmaddubsw        m0, m7, m1
753    paddw            m2, m0
754    pmulhrsw         m2, m6
755
756    movq             m0, [bufq+xq-82-1]     ; top/left
757    pxor             m1, m1
758    pcmpgtb          m1, m0
759    punpcklbw        m0, m1
760    psrldq           m1, m0, 4              ; top/right
761    punpcklwd        m1, m2
762    psrldq           m2, m0, 2              ; top
763    punpcklwd        m0, m2
764    pmaddwd          m0, m4
765    pmaddwd          m1, m5
766    paddd            m0, m1
767    paddd            m0, m3
768.x_loop_ar1_inner:
769    movd          val0d, m0
770    psrldq           m0, 4
771%if ARCH_X86_32
772    imul          val3d, r3mp
773%else
774    imul          val3d, cf3d
775%endif
776    add           val3d, val0d
777    sar           val3d, shiftb
778    movsx         val0d, byte [bufq+xq]
779    add           val3d, val0d
780    cmp           val3d, maxd
781    cmovns        val3d, maxd
782    cmp           val3d, mind
783    cmovs         val3d, mind
784    mov  byte [bufq+xq], val3b
785    ; keep val3d in-place as left for next x iteration
786    inc              xq
787    jz .x_loop_ar1_end
788    test             xq, 3
789    jnz .x_loop_ar1_inner
790    jmp .x_loop_ar1
791
792.x_loop_ar1_end:
793    add            bufq, 82
794%if ARCH_X86_32
795    add            r1mp, 82*2
796    dec            r0mp
797%else
798    add           bufyq, 82*2
799    dec              hd
800%endif
801    jg .y_loop_ar1
802    RET
803
804.ar2:
805%if ARCH_X86_32
806%assign stack_offset stack_offset_old
807%assign stack_size_padded 0
808%xdefine rstk rsp
809    ALLOC_STACK   -8*16
810%endif
811    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
812    movifnidn     bufyq, bufymp
813    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
814    imul            uvd, 28
815    movd             m7, [base+round_vals-12+shiftq*2]
816    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
817    pxor             m2, m2
818    pcmpgtb          m2, m0
819    punpckhbw        m1, m0, m2
820    punpcklbw        m0, m2
821    pinsrw           m1, [base+pw_1], 5
822    punpcklwd        m7, m7
823    pshufd           m7, m7, q0000
824    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
825    pshufd           m4, m1, q0000
826    pshufd           m5, m1, q1111
827    pshufd           m6, m1, q2222
828    pshufd           m3, m0, q3333
829    pshufd           m2, m0, q2222
830    pshufd           m1, m0, q1111
831    pshufd           m0, m0, q0000
832    SCRATCH           0, 8,  0
833    SCRATCH           1, 9,  1
834    SCRATCH           2, 10, 2
835    SCRATCH           3, 11, 3
836    SCRATCH           4, 12, 4
837    SCRATCH           5, 13, 5
838    SCRATCH           6, 14, 6
839    SCRATCH           7, 15, 7
840    movd             m7, [base+hmul_bits+4]
841    movd             m6, [base+pb_1]
842    punpcklwd        m7, m7
843    pshufd           m6, m6, q0000
844    pshufd           m7, m7, q0000
845    sub            bufq, 82*38+44-(82*3+41)
846    add           bufyq, 79+82*3
847    mov              hd, 35
848.y_loop_ar2:
849    mov              xq, -38
850
851.x_loop_ar2:
852    pxor             m2, m2
853    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
854    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
855    pcmpgtb          m2, m0
856    punpckhbw        m1, m0, m2
857    punpcklbw        m0, m2
858    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
859    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
860    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
861    punpcklwd        m2, m0, m5
862    punpcklwd        m3, m4
863    pmaddwd          m2, m8
864    pmaddwd          m3, m11
865    paddd            m2, m3
866
867    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
868    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
869    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
870    punpcklwd        m4, m5
871    punpcklwd        m0, m1
872    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
873    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
874    punpcklwd        m3, m1
875    pmaddwd          m4, m9
876    pmaddwd          m0, m10
877    pmaddwd          m3, m12
878    paddd            m4, m0
879    paddd            m2, m3
880    paddd            m2, m4
881
882    movq             m0, [bufyq+xq*2]
883    movq             m3, [bufyq+xq*2+82]
884    pmaddubsw        m1, m6, m0
885    pmaddubsw        m0, m6, m3
886    paddw            m0, m1
887    pmulhrsw         m0, m7
888    punpcklwd        m0, m15
889    pmaddwd          m0, m14
890    paddd            m2, m0
891
892    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
893    pxor             m4, m4
894    movd             m5, [base+byte_blend+1]
895    punpcklbw        m5, m5
896.x_loop_ar2_inner:
897    pcmpgtb          m1, m4, m0
898    punpcklbw        m0, m1
899    pmaddwd          m3, m0, m13
900    paddd            m3, m2
901    psrldq           m2, 4                  ; shift top to next pixel
902    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
903    pslldq           m3, 4
904    pand             m3, m5
905    paddw            m0, m3
906    packsswb         m0, m0
907    movd    [bufq+xq-2], m0
908    psrldq           m0, 1
909    inc              xq
910    jz .x_loop_ar2_end
911    test             xq, 3
912    jnz .x_loop_ar2_inner
913    jmp .x_loop_ar2
914
915.x_loop_ar2_end:
916    add            bufq, 82
917    add           bufyq, 82*2
918    dec              hd
919    jg .y_loop_ar2
920    RET
921
922.ar3:
923%if ARCH_X86_32
924%assign stack_offset stack_offset_old
925%assign stack_size_padded 0
926%xdefine rstk rsp
927%endif
928    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
929    movifnidn     bufyq, bufymp
930%if ARCH_X86_32
931    ALLOC_STACK  -15*16
932%else
933    SUB             rsp, 16*7
934%assign stack_size_padded (stack_size_padded+16*7)
935%assign stack_size (stack_size+16*7)
936%endif
937    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
938    imul            uvd, 28
939
940    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
941    pxor             m3, m3
942    pcmpgtb          m3, m0
943    punpckhbw        m1, m0, m3
944    punpcklbw        m0, m3
945    pshufd           m2, m0, q1111
946    pshufd           m3, m0, q2222
947    pshufd           m4, m0, q3333
948    pshufd           m0, m0, q0000
949    pshufd           m5, m1, q1111
950    pshufd           m6, m1, q2222
951    pshufd           m7, m1, q3333
952    pshufd           m1, m1, q0000
953    mova    [rsp+ 0*16], m0
954    mova    [rsp+ 1*16], m2
955    mova    [rsp+ 2*16], m3
956    mova    [rsp+ 3*16], m4
957    mova    [rsp+ 4*16], m1
958    mova    [rsp+ 5*16], m5
959    mova    [rsp+ 6*16], m6
960    SCRATCH           7, 8, 7
961
962    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
963    pxor             m4, m4
964    pcmpgtb          m4, m2
965    punpckhbw        m5, m2, m4
966    punpcklbw        m2, m4
967    pshufd           m4, m2, q3232
968    punpcklwd        m3, m4, m5
969    pshuflw          m5, m4, q3321
970    pshufd           m4, m3, q0000
971    pshufd           m3, m2, q1111
972    pshufd           m2, m2, q0000
973    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
974    SCRATCH           2, 9,  8
975    SCRATCH           3, 10, 9
976    SCRATCH           4, 11, 10
977    SCRATCH           5, 12, 11
978
979    movd             m2, [base+round_vals-12+shiftq*2]
980    movd             m1, [base+pb_1]
981    movd             m3, [base+hmul_bits+4]
982    pxor             m0, m0
983    punpcklwd        m2, m0
984    punpcklwd        m3, m3
985    pshufd           m2, m2, q0000
986    pshufd           m1, m1, q0000
987    pshufd           m3, m3, q0000
988    SCRATCH           1, 13, 12
989    SCRATCH           2, 14, 13
990    SCRATCH           3, 15, 14
991
992    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
993    sub            bufq, 82*38+44-(82*3+41)
994    add           bufyq, 79+82*3
995    mov              hd, 35
996.y_loop_ar3:
997    mov              xq, -38
998
999.x_loop_ar3:
1000    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
1001    pxor             m4, m4
1002    pcmpgtb          m4, m0
1003    punpckhbw        m3, m0, m4
1004    punpcklbw        m0, m4
1005
1006    psrldq           m5, m0, 2
1007    psrldq           m6, m0, 4
1008    psrldq           m7, m0, 6
1009    punpcklwd        m4, m0, m5
1010    punpcklwd        m6, m7
1011    pmaddwd          m4, [rsp+ 0*16]
1012    pmaddwd          m6, [rsp+ 1*16]
1013    paddd            m4, m6
1014
1015    palignr          m2, m3, m0, 10
1016    palignr          m3, m0, 12
1017    psrldq           m0, 8
1018
1019    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
1020    pxor             m6, m6
1021    pcmpgtb          m6, m1
1022    punpckhbw        m5, m1, m6
1023    punpcklbw        m1, m6
1024
1025    punpcklwd        m0, m2
1026    punpcklwd        m3, m1
1027    pmaddwd          m0, [rsp+ 2*16]
1028    pmaddwd          m3, [rsp+ 3*16]
1029    paddd            m0, m3
1030    paddd            m0, m4
1031
1032    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
1033    pxor             m7, m7
1034    pcmpgtb          m7, m2
1035    punpckhbw        m6, m2, m7
1036    punpcklbw        m2, m7
1037
1038    palignr          m3, m5, m1, 10
1039    palignr          m5, m1, 12
1040    psrldq           m4, m2, 2
1041
1042    punpcklwd        m3, m5
1043    punpcklwd        m5, m2, m4
1044    pmaddwd          m3, [rsp+ 6*16]
1045    pmaddwd          m5, m8
1046    paddd            m3, m5
1047    paddd            m0, m3
1048
1049    psrldq           m3, m1, 2
1050    psrldq           m4, m1, 4
1051    psrldq           m5, m1, 6
1052    psrldq           m1, 8
1053
1054    punpcklwd        m3, m4
1055    punpcklwd        m5, m1
1056    pmaddwd          m3, [rsp+ 4*16]
1057    pmaddwd          m5, [rsp+ 5*16]
1058    paddd            m3, m5
1059    paddd            m0, m3
1060
1061    movq             m1, [bufyq+xq*2]
1062    movq             m3, [bufyq+xq*2+82]
1063    pmaddubsw        m5, m13, m1
1064    pmaddubsw        m7, m13, m3
1065    paddw            m7, m5
1066    pmulhrsw         m7, m15
1067
1068    psrldq           m1, m2, 4
1069    psrldq           m3, m2, 6
1070    palignr          m4, m6, m2, 10
1071    palignr          m6, m2, 12
1072    psrldq           m2, 8
1073
1074    punpcklwd        m1, m3
1075    punpcklwd        m2, m4
1076    punpcklwd        m6, m7
1077    pmaddwd          m1, m9
1078    pmaddwd          m2, m10
1079    pmaddwd          m6, m11
1080    paddd            m1, m2
1081    paddd            m0, m6
1082    paddd            m0, m1
1083    paddd            m0, m14
1084
1085    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
1086    pxor             m4, m4
1087    movd             m5, [base+byte_blend]
1088.x_loop_ar3_inner:
1089    pcmpgtb          m2, m4, m1
1090    punpcklbw        m3, m1, m2
1091    pmaddwd          m2, m3, m12
1092    pshufd           m3, m2, q1111
1093    paddd            m2, m3                 ; left+cur
1094    paddd            m2, m0                 ; add top
1095    psrldq           m0, 4
1096    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1097    ; don't packssdw, we only care about one value
1098    packsswb         m2, m2
1099    pandn            m3, m5, m1
1100    pslld            m2, 24
1101    pand             m2, m5
1102    por              m1, m2, m3
1103    movd    [bufq+xq-3], m1
1104    psrldq           m1, 1
1105    inc              xq
1106    jz .x_loop_ar3_end
1107    test             xq, 3
1108    jnz .x_loop_ar3_inner
1109    jmp .x_loop_ar3
1110
1111.x_loop_ar3_end:
1112    add            bufq, 82
1113    add           bufyq, 82*2
1114    dec              hd
1115    jg .y_loop_ar3
1116    RET
1117
1118%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
1119%assign %%idx 0
1120%define %%tmp %2
1121%if %0 == 6
1122%define %%tmp %6
1123%endif
1124%rep 4
1125%if %%idx == 0
1126    movd        %5 %+ d, %2
1127    pshuflw       %%tmp, %2, q3232
1128%else
1129    movd        %5 %+ d, %%tmp
1130%if %%idx == 2
1131    punpckhqdq    %%tmp, %%tmp
1132%elif %%idx == 4
1133    psrlq         %%tmp, 32
1134%endif
1135%endif
1136    movzx       %4 %+ d, %5 %+ w
1137    shr         %5 %+ d, 16
1138
1139%if %%idx == 0
1140    movd             %1, [%3+%4]
1141%else
1142    pinsrw           %1, [%3+%4], %%idx + 0
1143%endif
1144    pinsrw           %1, [%3+%5], %%idx + 1
1145%assign %%idx %%idx+2
1146%endrep
1147%endmacro
1148
1149INIT_XMM ssse3
1150; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
1151%if ARCH_X86_32
1152%if STACK_ALIGNMENT < mmsize
1153cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
1154        dst, src, scaling, unused1, fg_data, picptr, unused2
1155    ; copy stack arguments to new position post-alignment, so that we
1156    ; don't have to keep the old stack location in a separate register
1157    mov              r0, r0m
1158    mov              r1, r2m
1159    mov              r2, r4m
1160    mov              r3, r6m
1161    mov              r4, r7m
1162    mov              r5, r8m
1163
1164    mov [rsp+6*mmsize+ 3*gprsize], r0
1165    mov [rsp+6*mmsize+ 5*gprsize], r1
1166    mov [rsp+6*mmsize+ 7*gprsize], r2
1167    mov [rsp+6*mmsize+ 9*gprsize], r3
1168    mov [rsp+6*mmsize+10*gprsize], r4
1169    mov [rsp+6*mmsize+11*gprsize], r5
1170%else
1171cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
1172        dst, src, scaling, unused1, fg_data, picptr, unused2
1173%endif
1174    mov            srcq, srcm
1175    mov        fg_dataq, r3m
1176    mov        scalingq, r5m
1177%if STACK_ALIGNMENT < mmsize
1178%define r0m [rsp+6*mmsize+ 3*gprsize]
1179%define r1m [rsp+6*mmsize+ 4*gprsize]
1180%define r2m [rsp+6*mmsize+ 5*gprsize]
1181%define r3m [rsp+6*mmsize+ 6*gprsize]
1182%define r4m [rsp+6*mmsize+ 7*gprsize]
1183%define r5m [rsp+6*mmsize+ 8*gprsize]
1184%define r6m [rsp+6*mmsize+ 9*gprsize]
1185%define r7m [rsp+6*mmsize+10*gprsize]
1186%define r8m [rsp+6*mmsize+11*gprsize]
1187%endif
1188    LEA              r5, pb_mask
1189%define base r5-pb_mask
1190    mov             r5m, picptrq
1191%else
1192cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1193    lea              r7, [pb_mask]
1194%define base r7-pb_mask
1195%endif
1196    mov             r6d, [fg_dataq+FGData.scaling_shift]
1197    movd             m3, [base+mul_bits+r6*2-14]
1198    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1199    pcmpeqw          m2, m2
1200    psrldq           m2, 14
1201    movd             m4, [base+max+r6*4]
1202    movd             m5, [base+min+r6*2]
1203    punpcklwd        m3, m3
1204    punpcklwd        m4, m4
1205    punpcklwd        m5, m5
1206    pshufd           m3, m3, q0000
1207    pshufd           m4, m4, q0000
1208    pshufd           m5, m5, q0000
1209    SCRATCH           2, 10, 0
1210    SCRATCH           3, 11, 1
1211    SCRATCH           4, 12, 2
1212    SCRATCH           5, 13, 3
1213
1214%if ARCH_X86_32
1215    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1216%else
1217    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1218%endif
1219
1220    mov            sbyd, r8m
1221    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
1222    test       overlapd, overlapd
1223    jz .no_vertical_overlap
1224    mova             m6, [base+pw_1024]
1225    movd             m7, [base+pb_27_17_17_27]
1226    SCRATCH           6, 14, 4
1227    SCRATCH           7, 15, 5
1228    test           sbyd, sbyd
1229    jnz .vertical_overlap
1230    ; fall-through
1231
1232.no_vertical_overlap:
1233    mov             r8m, overlapd
1234%if ARCH_X86_32
1235    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1236    imul           seed, (173 << 24) | 37
1237%else
1238    imul           seed, sbyd, (173 << 24) | 37
1239%endif
1240    add            seed, (105 << 24) | 178
1241    rol            seed, 8
1242    movzx          seed, seew
1243    xor            seed, [fg_dataq+FGData.seed]
1244
1245%if ARCH_X86_32
1246    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1247
1248    mov             r3m, seed
1249    mov              wq, r4m
1250%else
1251    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1252                unused1, unused2, see, unused3
1253%endif
1254
1255    lea        src_bakq, [srcq+wq]
1256    neg              wq
1257    sub           dstmp, srcq
1258%if ARCH_X86_32
1259    mov             r1m, src_bakq
1260    mov             r4m, wq
1261    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1262%endif
1263
1264.loop_x:
1265%if ARCH_X86_32
1266    mov            seed, r3m
1267%endif
1268    mov             r6d, seed
1269    or             seed, 0xEFF4
1270    shr             r6d, 1
1271    test           seeb, seeh
1272    lea            seed, [r6+0x8000]
1273    cmovp          seed, r6d                ; updated seed
1274%if ARCH_X86_32
1275    mov             r3m, seed
1276
1277    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1278
1279    mov           offxd, offyd
1280%else
1281    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1282                offx, offy, see, unused
1283
1284    mov           offyd, seed
1285    mov           offxd, seed
1286%endif
1287    ror           offyd, 8
1288    shr           offxd, 12
1289    and           offyd, 0xf
1290    imul          offyd, 164
1291    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1292
1293%if ARCH_X86_32
1294    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1295    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1296    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1297%else
1298    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1299                h, offxy, see, unused
1300%endif
1301
1302.loop_x_odd:
1303    mov              hd, r7m
1304    mov      grain_lutq, grain_lutmp
1305.loop_y:
1306    ; src
1307    mova             m0, [srcq]
1308    pxor             m2, m2
1309    punpckhbw        m1, m0, m2
1310    punpcklbw        m0, m2                 ; m0-1: src as word
1311
1312    ; scaling[src]
1313%if ARCH_X86_32
1314    vpgatherdw       m4, m0, scalingq, r0, r5, m3
1315    vpgatherdw       m5, m1, scalingq, r0, r5, m3
1316%else
1317    vpgatherdw       m4, m0, scalingq, r12, r13, m3
1318    vpgatherdw       m5, m1, scalingq, r12, r13, m3
1319%endif
1320    pcmpeqw          m3, m3
1321    psrlw            m3, 8
1322    pand             m4, m3
1323    pand             m5, m3
1324
1325    ; grain = grain_lut[offy+y][offx+x]
1326    movu             m3, [grain_lutq+offxyq]
1327    pcmpgtb          m7, m2, m3
1328    punpcklbw        m2, m3, m7
1329    punpckhbw        m3, m7
1330
1331    ; noise = round2(scaling[src] * grain, scaling_shift)
1332    pmullw           m2, m4
1333    pmullw           m3, m5
1334    pmulhrsw         m2, m11
1335    pmulhrsw         m3, m11
1336
1337    ; dst = clip_pixel(src, noise)
1338    paddw            m0, m2
1339    paddw            m1, m3
1340    pmaxsw           m0, m13
1341    pmaxsw           m1, m13
1342    pminsw           m0, m12
1343    pminsw           m1, m12
1344    packuswb         m0, m1
1345    movifnidn      dstq, dstmp
1346    mova    [dstq+srcq], m0
1347
1348    add            srcq, r2mp
1349    add      grain_lutq, 82
1350    dec              hd
1351    jg .loop_y
1352
1353%if ARCH_X86_32
1354    add            r4mp, 16
1355%else
1356    add              wq, 16
1357%endif
1358    jge .end
1359%if ARCH_X86_32
1360    mov            srcq, r1mp
1361    add            srcq, r4mp
1362    xor            r8mp, 4
1363    test           r8mp, 4
1364%else
1365    lea            srcq, [src_bakq+wq]
1366    test           srcq, 16             ; this relies on buffer alignment...
1367%endif
1368    jz .next_blk
1369
1370    add          offxyd, 16
1371    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
1372    jz .loop_x_odd
1373
1374%if ARCH_X86_32
1375    add dword [rsp+6*mmsize+1*gprsize], 16
1376%else
1377    add            r11d, 16             ; top_offxyd
1378%endif
1379    jnz .loop_x_odd_v_overlap
1380
1381.next_blk:
1382    test      dword r8m, 1
1383    jz .loop_x
1384
1385    test      dword r8m, 2
1386    jnz .loop_x_hv_overlap
1387
1388    ; horizontal overlap (without vertical overlap)
1389.loop_x_h_overlap:
1390%if ARCH_X86_32
1391    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1392    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1393    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
1394
1395    add          offxyd, 16                 ; left_offxyd
1396    mov [rsp+6*mmsize+0*gprsize], offxyd
1397
1398    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1399
1400    mov            seed, r3m
1401%else
1402    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1403                offx, offy, see, left_offxy
1404
1405    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1406%endif
1407
1408    mov             r6d, seed
1409    or             seed, 0xEFF4
1410    shr             r6d, 1
1411    test           seeb, seeh
1412    lea            seed, [r6+0x8000]
1413    cmovp          seed, r6d                ; updated seed
1414
1415%if ARCH_X86_32
1416    mov             r3m, seed
1417
1418    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1419
1420    mov           offxd, offyd
1421%else
1422    mov           offyd, seed
1423    mov           offxd, seed
1424%endif
1425    ror           offyd, 8
1426    shr           offxd, 12
1427    and           offyd, 0xf
1428    imul          offyd, 164
1429    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1430
1431%if ARCH_X86_32
1432    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1433%else
1434    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1435                h, offxy, see, left_offxy
1436%endif
1437
1438    mov              hd, r7m
1439    mov      grain_lutq, grain_lutmp
1440.loop_y_h_overlap:
1441    ; src
1442    mova             m0, [srcq]
1443    pxor             m2, m2
1444    punpckhbw        m1, m0, m2
1445    punpcklbw        m0, m2                 ; m0-1: src as word
1446
1447    ; scaling[src]
1448%if ARCH_X86_32
1449    vpgatherdw       m4, m0, scalingq, r0, r5, m3
1450    vpgatherdw       m5, m1, scalingq, r0, r5, m3
1451%else
1452    vpgatherdw       m4, m0, scalingq, r12, r13, m3
1453    vpgatherdw       m5, m1, scalingq, r12, r13, m3
1454%endif
1455    pcmpeqw          m3, m3
1456    psrlw            m3, 8
1457    pand             m4, m3
1458    pand             m5, m3
1459
1460    ; grain = grain_lut[offy+y][offx+x]
1461    movu             m3, [grain_lutq+offxyq]
1462%if ARCH_X86_32
1463    mov              r5, [rsp+6*mmsize+0*gprsize]
1464    movd             m7, [grain_lutq+r5]
1465%else
1466    movd             m7, [grain_lutq+left_offxyq]
1467%endif
1468    punpcklbw        m7, m3
1469    pmaddubsw        m6, m15, m7
1470    pmulhrsw         m6, m14
1471    packsswb         m6, m6
1472    pand             m6, m10
1473    pandn            m7, m10, m3
1474    por              m6, m7
1475    pcmpgtb          m2, m6
1476    punpcklbw        m7, m6, m2
1477    punpckhbw        m6, m2
1478
1479    ; noise = round2(scaling[src] * grain, scaling_shift)
1480    pmullw           m7, m4
1481    pmullw           m6, m5
1482    pmulhrsw         m7, m11
1483    pmulhrsw         m6, m11
1484
1485    ; dst = clip_pixel(src, noise)
1486    paddw            m0, m7
1487    paddw            m1, m6
1488    pmaxsw           m0, m13
1489    pmaxsw           m1, m13
1490    pminsw           m0, m12
1491    pminsw           m1, m12
1492    packuswb         m0, m1
1493    movifnidn      dstq, dstmp
1494    mova    [dstq+srcq], m0
1495
1496    add            srcq, r2mp
1497    add      grain_lutq, 82
1498    dec              hd
1499    jg .loop_y_h_overlap
1500
1501%if ARCH_X86_32
1502    add            r4mp, 16
1503%else
1504    add              wq, 16
1505%endif
1506    jge .end
1507%if ARCH_X86_32
1508    mov            srcq, r1m
1509    add            srcq, r4m
1510    xor            r8mp, 4
1511%else
1512    lea            srcq, [src_bakq+wq]
1513%endif
1514    ; assert(srcq & 16) != 0
1515    add          offxyd, 16
1516
1517    ; since this half-block had left-overlap, the next does not
1518    test      dword r8m, 2              ; have_top_overlap
1519    jz .loop_x_odd
1520%if ARCH_X86_32
1521    add dword [rsp+6*mmsize+1*gprsize], 16
1522%else
1523    add            r11d, 16             ; top_offxyd
1524%endif
1525    jmp .loop_x_odd_v_overlap
1526
1527.end:
1528    RET
1529
1530.vertical_overlap:
1531%if ARCH_X86_32
1532    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1533%else
1534    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
1535%endif
1536
1537    or         overlapd, 2                  ; top_overlap: overlap & 2
1538    mov             r8m, overlapd
1539    movzx          sbyd, sbyb
1540%if ARCH_X86_32
1541    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1542    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
1543%else
1544    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1545%endif
1546    imul           tmpd, sbyd, 173 * 0x00010001
1547    imul           sbyd, 37 * 0x01000100
1548    add            tmpd, (105 << 16) | 188
1549    add            sbyd, (178 << 24) | (141 << 8)
1550    and            tmpd, 0x00ff00ff
1551    and            sbyd, 0xff00ff00
1552    xor            seed, tmpd
1553%if ARCH_X86_32
1554    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
1555
1556    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1557
1558    mov             r3m, seed
1559    mov              wq, r4m
1560%else
1561    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1562
1563    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1564                tmp, unused2, see, unused3
1565%endif
1566
1567    lea        src_bakq, [srcq+wq]
1568    neg              wq
1569    sub           dstmp, srcq
1570%if ARCH_X86_32
1571    mov             r1m, src_bakq
1572    mov             r4m, wq
1573    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
1574%endif
1575
1576.loop_x_v_overlap:
1577%if ARCH_X86_32
1578    mov            seed, r3m
1579%endif
1580    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
1581    ; because of the 'and tmpd, 0x00ff00ff' above
1582    mov             r6d, seed
1583    or             seed, 0xeff4eff4
1584    test           seeb, seeh
1585    setp           tmpb                     ; parity of top_seed
1586    shr            seed, 16
1587    shl            tmpd, 16
1588    test           seeb, seeh
1589    setp           tmpb                     ; parity of cur_seed
1590    or              r6d, 0x00010001
1591    xor            tmpd, r6d
1592    mov            seed, tmpd
1593    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1594
1595%if ARCH_X86_32
1596    mov             r3m, seed
1597
1598    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1599
1600    mov           offxd, offyd
1601%else
1602    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1603                offx, offy, see, unused, top_offxy
1604
1605    mov           offyd, seed
1606    mov           offxd, seed
1607%endif
1608
1609    ror           offyd, 8
1610    ror           offxd, 12
1611    and           offyd, 0xf000f
1612    and           offxd, 0xf000f
1613    imul          offyd, 164
1614    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1615    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1616
1617%if ARCH_X86_32
1618    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1619%else
1620    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1621                h, offxy, see, unused, top_offxy
1622%endif
1623
1624    movzx    top_offxyd, offxyw
1625%if ARCH_X86_32
1626    mov [rsp+6*mmsize+1*gprsize], top_offxyd
1627
1628    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1629%endif
1630    shr          offxyd, 16
1631
1632.loop_x_odd_v_overlap:
1633%if ARCH_X86_32
1634    mov              r5, r5m
1635    lea              r5, [base+pb_27_17]
1636    mov [rsp+5*mmsize+8], r5
1637%else
1638    mova             m8, [pb_27_17]
1639%endif
1640    mov              hd, r7m
1641    mov      grain_lutq, grain_lutmp
1642.loop_y_v_overlap:
1643    ; src
1644    mova             m0, [srcq]
1645    pxor             m2, m2
1646    punpckhbw        m1, m0, m2
1647    punpcklbw        m0, m2                 ; m0-1: src as word
1648
1649    ; scaling[src]
1650%if ARCH_X86_32
1651    vpgatherdw       m4, m0, scalingq, r0, r5, m3
1652    vpgatherdw       m5, m1, scalingq, r0, r5, m3
1653%else
1654    vpgatherdw       m4, m0, scalingq, r12, r13, m3
1655    vpgatherdw       m5, m1, scalingq, r12, r13, m3
1656%endif
1657    pcmpeqw          m3, m3
1658    psrlw            m3, 8
1659    pand             m4, m3
1660    pand             m5, m3
1661
1662    ; grain = grain_lut[offy+y][offx+x]
1663    movu             m3, [grain_lutq+offxyq]
1664%if ARCH_X86_32
1665    mov              r5, [rsp+6*mmsize+1*gprsize]
1666    movu             m7, [grain_lutq+r5]
1667%else
1668    movu             m7, [grain_lutq+top_offxyq]
1669%endif
1670    punpckhbw        m6, m7, m3
1671    punpcklbw        m7, m3
1672%if ARCH_X86_32
1673    mov              r5, [rsp+5*mmsize+8]
1674    pmaddubsw        m3, [r5], m6
1675    pmaddubsw        m6, [r5], m7
1676%else
1677    pmaddubsw        m3, m8, m6
1678    pmaddubsw        m6, m8, m7
1679%endif
1680    pmulhrsw         m3, m14
1681    pmulhrsw         m6, m14
1682    packsswb         m6, m3
1683    pcmpgtb          m7, m2, m6
1684    punpcklbw        m2, m6, m7
1685    punpckhbw        m6, m7
1686
1687    ; noise = round2(scaling[src] * grain, scaling_shift)
1688    pmullw           m2, m4
1689    pmullw           m6, m5
1690    pmulhrsw         m2, m11
1691    pmulhrsw         m6, m11
1692
1693    ; dst = clip_pixel(src, noise)
1694    paddw            m0, m2
1695    paddw            m1, m6
1696    pmaxsw           m0, m13
1697    pmaxsw           m1, m13
1698    pminsw           m0, m12
1699    pminsw           m1, m12
1700    packuswb         m0, m1
1701    movifnidn      dstq, dstmp
1702    mova    [dstq+srcq], m0
1703
1704%if ARCH_X86_32
1705    add dword [rsp+5*mmsize+8], mmsize
1706%else
1707    mova             m8, [pb_17_27]
1708%endif
1709    add            srcq, r2mp
1710    add      grain_lutq, 82
1711    dec              hw
1712    jz .end_y_v_overlap
1713    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1714    ; remaining (up to) 30 lines
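    ; (bit 16 of hd is toggled as a one-bit counter so that exactly two
    ;  rows take this path before re-entering the regular row loop)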
1715    xor              hd, 0x10000
1716    test             hd, 0x10000
1717    jnz .loop_y_v_overlap
1718    jmp .loop_y
1719
1720.end_y_v_overlap:
1721%if ARCH_X86_32
1722    add            r4mp, 16
1723%else
1724    add              wq, 16
1725%endif
1726    jge .end_hv
1727%if ARCH_X86_32
1728    mov            srcq, r1mp
1729    add            srcq, r4mp
1730    xor            r8mp, 4
1731    test           r8mp, 4
1732%else
1733    lea            srcq, [src_bakq+wq]
1734    test           srcq, 16
1735%endif
1736    jz .loop_x_hv_overlap
1737    add          offxyd, 16
1738%if ARCH_X86_32
1739    add dword [rsp+6*mmsize+1*gprsize], 16
1740%else
1741    add      top_offxyd, 16
1742%endif
1743    jmp .loop_x_odd_v_overlap
1744
1745.loop_x_hv_overlap:
1746%if ARCH_X86_32
1747    mov              r5, r5m
1748    lea              r5, [base+pb_27_17]
1749    mov [rsp+5*mmsize+8], r5
1750
1751    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
1752
1753    mov              r5, [rsp+6*mmsize+1*gprsize]
1754    mov              r4, offxyd
1755    add              r5, 16
1756    add              r4, 16
1757    mov [rsp+6*mmsize+2*gprsize], r5        ; topleft_offxy
1758    mov [rsp+6*mmsize+0*gprsize], r4        ; left_offxy
1759
1760    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
1761
1762    xor            tmpd, tmpd
1763    mov            seed, r3m
1764%else
1765    mova             m8, [pb_27_17]
1766
1767    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1768                tmp, unused2, see, unused3
1769
1770    ; we assume from the block above that bits 8-15 of tmpd are zeroed
1771%endif
1772    mov             r6d, seed
1773    or             seed, 0xeff4eff4
1774    test           seeb, seeh
1775    setp           tmpb                     ; parity of top_seed
1776    shr            seed, 16
1777    shl            tmpd, 16
1778    test           seeb, seeh
1779    setp           tmpb                     ; parity of cur_seed
1780    or              r6d, 0x00010001
1781    xor            tmpd, r6d
1782    mov            seed, tmpd
1783    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
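    ; (the above advances both 16-bit LFSRs at once: ~0xeff4 = 0x100b keeps
    ;  only the tap bits 0,1,3,12, so PF of (low byte & high byte) is the
    ;  inverted feedback bit of each half; the xor against the forced-one
    ;  bit and the final ror shift each half right and insert its feedback
    ;  bit at the top)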
1784
1785%if ARCH_X86_32
1786    mov             r3m, seed
1787
1788    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1789
1790    mov           offxd, offyd
1791%else
1792    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1793                offx, offy, see, left_offxy, top_offxy, topleft_offxy
1794
1795    lea  topleft_offxyq, [top_offxyq+16]
1796    lea     left_offxyq, [offyq+16]
1797    mov           offyd, seed
1798    mov           offxd, seed
1799%endif
1800    ror           offyd, 8
1801    ror           offxd, 12
1802    and           offyd, 0xf000f
1803    and           offxd, 0xf000f
1804    imul          offyd, 164
1805    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1806    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1807
1808%if ARCH_X86_32
1809    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1810
1811    movzx            r5, offxyw             ; top_offxy
1812    mov [rsp+6*mmsize+1*gprsize], r5
1813%else
1814    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1815                h, offxy, see, left_offxy, top_offxy, topleft_offxy
1816
1817    movzx    top_offxyd, offxyw
1818%endif
1819    shr          offxyd, 16
1820
1821    mov              hd, r7m
1822    mov      grain_lutq, grain_lutmp
1823.loop_y_hv_overlap:
1824    ; grain = grain_lut[offy+y][offx+x]
1825    movu             m3, [grain_lutq+offxyq]
1826%if ARCH_X86_32
1827    mov              r5, [rsp+6*mmsize+1*gprsize]   ; top_offxy
1828    mov              r0, [rsp+6*mmsize+0*gprsize]   ; left_offxy
1829    movu             m6, [grain_lutq+r5]
1830    mov              r5, [rsp+6*mmsize+2*gprsize]   ; topleft_offxy
1831    movd             m4, [grain_lutq+r0]
1832    movd             m7, [grain_lutq+r5]
1833%else
1834    movu             m6, [grain_lutq+top_offxyq]
1835    movd             m4, [grain_lutq+left_offxyq]
1836    movd             m7, [grain_lutq+topleft_offxyq]
1837%endif
1838    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1839    punpcklbw        m4, m3
1840    punpcklbw        m7, m6
1841    pmaddubsw        m2, m15, m4
1842    pmaddubsw        m4, m15, m7
1843    pmulhrsw         m2, m14
1844    pmulhrsw         m4, m14
1845    packsswb         m2, m2
1846    packsswb         m4, m4
1847    pand             m2, m10
1848    pand             m4, m10
1849    pandn            m7, m10, m3
1850    pandn            m3, m10, m6
1851    por              m7, m2
1852    por              m3, m4
1853    ; followed by v interpolation (top | cur -> cur)
1854    punpckhbw        m4, m3, m7
1855    punpcklbw        m3, m7
1856%if ARCH_X86_32
1857    mov              r5, [rsp+5*mmsize+8]
1858    pmaddubsw        m7, [r5], m4
1859    pmaddubsw        m4, [r5], m3
1860%else
1861    pmaddubsw        m7, m8, m4
1862    pmaddubsw        m4, m8, m3
1863%endif
1864    pmulhrsw         m7, m14
1865    pmulhrsw         m4, m14
1866    packsswb         m4, m7
1867    pxor             m2, m2
1868    pcmpgtb          m7, m2, m4
1869    punpcklbw        m3, m4, m7
1870    punpckhbw        m4, m7
1871
1872    ; src
1873    mova             m0, [srcq]
1874    punpckhbw        m1, m0, m2
1875    punpcklbw        m0, m2                 ; m0-1: src as word
1876
1877    ; scaling[src]
1878%if ARCH_X86_32
1879    vpgatherdw       m5, m0, scalingq, r0, r5, m7
1880    vpgatherdw       m6, m1, scalingq, r0, r5, m7
1881%else
1882    vpgatherdw       m5, m0, scalingq, r13, r14, m7
1883    vpgatherdw       m6, m1, scalingq, r13, r14, m7
1884%endif
1885    pcmpeqw          m7, m7
1886    psrlw            m7, 8
1887    pand             m5, m7
1888    pand             m6, m7
1889
1890    ; noise = round2(scaling[src] * grain, scaling_shift)
1891    pmullw           m3, m5
1892    pmullw           m4, m6
1893    pmulhrsw         m3, m11
1894    pmulhrsw         m4, m11
1895
1896    ; dst = clip_pixel(src, noise)
1897    paddw            m0, m3
1898    paddw            m1, m4
1899    pmaxsw           m0, m13
1900    pmaxsw           m1, m13
1901    pminsw           m0, m12
1902    pminsw           m1, m12
1903    packuswb         m0, m1
1904    movifnidn      dstq, dstmp
1905    mova    [dstq+srcq], m0
1906
1907%if ARCH_X86_32
1908    add dword [rsp+5*mmsize+8], mmsize
1909%else
1910    mova             m8, [pb_17_27]
1911%endif
1912    add            srcq, r2mp
1913    add      grain_lutq, 82
1914    dec              hw
1915    jz .end_y_hv_overlap
1916    ; 2 lines get vertical+horizontal overlap, then fall back to the
1917    ; h-overlap-only code for the remaining (up to) 30 lines
1918    xor              hd, 0x10000
1919    test             hd, 0x10000
1920    jnz .loop_y_hv_overlap
1921    jmp .loop_y_h_overlap
1922
1923.end_y_hv_overlap:
1924%if ARCH_X86_32
1925    add            r4mp, 16
1926%else
1927    add              wq, 16
1928%endif
1929    jge .end_hv
1930%if ARCH_X86_32
1931    mov            srcq, r1m
1932    add            srcq, r4m
1933    xor            r8mp, 4
1934%else
1935    lea            srcq, [src_bakq+wq]
1936%endif
1937    ; assert((srcq & 16) != 0)
1938    add          offxyd, 16
1939%if ARCH_X86_32
1940    add dword [rsp+6*mmsize+1*gprsize], 16
1941%else
1942    add      top_offxyd, 16
1943%endif
1944    jmp .loop_x_odd_v_overlap
1945
1946.end_hv:
1947    RET
1948
1949INIT_XMM ssse3
1950%if ARCH_X86_32
1951; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
1952;                         sby, luma, lstride, uv_pl, is_id)
1953%if STACK_ALIGNMENT < mmsize
1954DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
1955cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
1956        tmp, src, scaling, h, fg_data, picptr, unused
1957    mov              r0, r0m
1958    mov              r1, r2m
1959    mov              r2, r4m
1960    mov              r3, r6m
1961    mov              r4, r7m
1962    mov [rsp+8*mmsize+3*gprsize], r0
1963    mov [rsp+8*mmsize+5*gprsize], r1
1964    mov [rsp+8*mmsize+7*gprsize], r2
1965    mov [rsp+8*mmsize+9*gprsize], r3
1966    mov [rsp+8*mmsize+10*gprsize], r4
1967
1968    mov              r0, r8m
1969    mov              r1, r9m
1970    mov              r2, r10m
1971    mov              r4, r11m
1972    mov              r3, r12m
1973    mov [rsp+8*mmsize+11*gprsize], r0
1974    mov [rsp+8*mmsize+12*gprsize], r1
1975    mov [rsp+8*mmsize+13*gprsize], r2
1976    mov [rsp+8*mmsize+14*gprsize], r4
1977%else
1978cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
1979        tmp, src, scaling, h, fg_data, picptr, unused
1980%endif
1981    mov            srcq, srcm
1982    mov        fg_dataq, r3m
1983    mov        scalingq, r5m
1984%if STACK_ALIGNMENT < mmsize
1985%define r0m [rsp+8*mmsize+ 3*gprsize]
1986%define r1m [rsp+8*mmsize+ 4*gprsize]
1987%define r2m [rsp+8*mmsize+ 5*gprsize]
1988%define r3m [rsp+8*mmsize+ 6*gprsize]
1989%define r4m [rsp+8*mmsize+ 7*gprsize]
1990%define r5m [rsp+8*mmsize+ 8*gprsize]
1991%define r6m [rsp+8*mmsize+ 9*gprsize]
1992%define r7m [rsp+8*mmsize+10*gprsize]
1993%define r8m [rsp+8*mmsize+11*gprsize]
1994%define r9m [rsp+8*mmsize+12*gprsize]
1995%define r10m [rsp+8*mmsize+13*gprsize]
1996%define r11m [rsp+8*mmsize+14*gprsize]
1997%define r12m [rsp+8*mmsize+15*gprsize]
1998%endif
1999    LEA              r5, pb_mask
2000%define base r5-pb_mask
2001    mov             r5m, r5
2002%else
2003cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2004                                      grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
2005    lea              r8, [pb_mask]
2006%define base r8-pb_mask
2007%endif
2008    mov             r6d, [fg_dataq+FGData.scaling_shift]
2009    movd             m2, [base+byte_blend+3]
2010    movd             m3, [base+mul_bits+r6*2-14]
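    ; mul_bits[scaling_shift-7] = 2^(15-scaling_shift), so a pmulhrsw
    ; against this constant implements round2(x, scaling_shift)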
2011    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
2012    lea            tmpd, [r6d*2]
2013%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
2014    test             r3, r3
2015%else
2016    cmp      dword r12m, 0                      ; is_idm
2017%endif
2018    movd             m5, [base+min+r6*2]
2019    cmovne          r6d, tmpd
2020    movd             m4, [base+max+r6*2]
2021    punpcklwd        m3, m3
2022    punpcklwd        m5, m5
2023    punpcklwd        m4, m4
2024    pshufd           m3, m3, q0000
2025    pshufd           m5, m5, q0000
2026    pshufd           m4, m4, q0000
2027    SCRATCH           2, 10, 0
2028    SCRATCH           3, 11, 1
2029    SCRATCH           4, 12, 2
2030    SCRATCH           5, 13, 3
2031
2032    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2033    jne .csfl
2034
2035%macro FGUV_32x32xN_LOOP 1 ; %1 == 1: not-csfl (chroma has its own scaling), %1 == 0: csfl
2036%if ARCH_X86_32
2037    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2038%else
2039    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2040%endif
2041
2042%if %1
2043    mov             r6d, dword r11m
2044    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
2045    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2046    punpcklbw        m6, m1, m0
2047    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
2048    punpcklwd        m6, m6
2049    punpcklwd        m7, m7
2050    pshufd           m6, m6, q0000
2051    pshufd           m7, m7, q0000
2052    SCRATCH           6, 14, 4
2053    SCRATCH           7, 15, 5
2054%endif
2055
2056    mov            sbyd, r8m
2057    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
2058    test       overlapd, overlapd
2059    jz %%no_vertical_overlap
2060%if ARCH_X86_32
2061    movd             m1, [base+pb_23_22]
2062    mova             m0, [base+pw_1024]
2063%else
2064    movd             m1, [pb_23_22]
2065    mova             m0, [pw_1024]
2066%endif
2067    pshufd           m1, m1, q0000
2068    SCRATCH           0, 8, 6
2069    SCRATCH           1, 9, 7
2070    test           sbyd, sbyd
2071    jnz %%vertical_overlap
2072    ; fall-through
2073
2074%%no_vertical_overlap:
2075    mov             r8m, overlapd
2076%if ARCH_X86_32
2077    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2078    imul           seed, (173 << 24) | 37
2079%else
2080    imul           seed, sbyd, (173 << 24) | 37
2081%endif
2082    add            seed, (105 << 24) | 178
2083    rol            seed, 8
2084    movzx          seed, seew
2085    xor            seed, [fg_dataq+FGData.seed]
2086
2087%if ARCH_X86_32
2088    mov             r3m, seed
2089
2090    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2091%define luma_bakq lumaq
2092
2093    mov              wq, r4m
2094    shl           r10mp, 1
2095%else
2096    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2097                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
2098
2099    mov        lstrideq, r10mp
2100%endif
2101
2102    mov           lumaq, r9mp
2103    lea        src_bakq, [srcq+wq]
2104    lea       luma_bakq, [lumaq+wq*2]
2105    neg              wq
2106    sub            r0mp, srcq
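    ; the stored dst pointer now holds dst-src, so [dstq+srcq] in the row
    ; loops addresses the matching dst pixel (dst and src share one stride)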
2107%if ARCH_X86_32
2108    mov             r1m, src_bakq
2109    mov            r11m, luma_bakq
2110    mov             r4m, wq
2111
2112    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2113%else
2114    mov           r11mp, src_bakq
2115    mov           r10mp, strideq
2116%endif
2117
2118%%loop_x:
2119%if ARCH_X86_32
2120    mov            seed, r3m
2121%endif
2122    mov             r6d, seed
2123    or             seed, 0xEFF4
2124    shr             r6d, 1
2125    test           seeb, seeh
2126    lea            seed, [r6+0x8000]
2127    cmovp          seed, r6d               ; updated seed
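    ; (the above is one step of the grain LFSR: r6d = seed >> 1, and PF of
    ;  (low byte & high byte) of seed|0xeff4 is the inverted xor of tap
    ;  bits 0,1,3,12, so cmovp keeps bit 15 clear and the lea sets it)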
2128%if ARCH_X86_32
2129    mov             r3m, seed
2130
2131    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2132
2133    mov           offxd, offyd
2134%else
2135    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2136                offx, offy, see, overlap, unused1, unused2, lstride
2137
2138    mov           offyd, seed
2139    mov           offxd, seed
2140%endif
2141    ror           offyd, 8
2142    shr           offxd, 12
2143    and           offyd, 0xf
2144    imul          offyd, 82
2145    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
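    ; (4:2:0 grain offsets are 6 + val, so the full offset is
    ;  (6 + offy)*82 + 6 + offx = 82*offy + offx + 498)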
2146
2147%if ARCH_X86_32
2148    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2149%else
2150    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2151                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
2152%endif
2153
2154    mov              hd, r7m
2155    mov      grain_lutq, grain_lutmp
2156%%loop_y:
2157    ; src
2158%if ARCH_X86_32
2159    mov           lumaq, r9mp
2160%endif
2161    mova             m4, [lumaq+ 0]
2162    mova             m6, [lumaq+16]
2163    mova             m0, [srcq]
2164%if ARCH_X86_32
2165    add           lumaq, r10mp
2166    mov            r9mp, lumaq
2167    mov              r5, r5m
2168    movd             m7, [base+pb_1]
2169%else
2170    movd             m7, [pb_1]
2171%endif
2172    pshufd           m7, m7, q0000
2173    pxor             m2, m2
2174    pmaddubsw        m4, m7
2175    pmaddubsw        m6, m7
2176    pavgw            m4, m2
2177    pavgw            m6, m2
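    ; pmaddubsw against pb_1 sums each horizontal luma pair and pavgw
    ; against zero halves it with rounding, i.e. the luma sample used for
    ; 4:2:0 is (luma[2*x] + luma[2*x+1] + 1) >> 1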
2178
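    ; in the not-csfl case, merge luma and chroma before the scaling
    ; lookup: val = clip_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6)
    ; + uv_offset); the packuswb performs the clip, hence "pack+unpack"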
2179%if %1
2180    packuswb         m4, m6                 ; luma
2181    punpckhbw        m6, m4, m0
2182    punpcklbw        m4, m0                 ; { luma, chroma }
2183    pmaddubsw        m6, m14
2184    pmaddubsw        m4, m14
2185    psraw            m6, 6
2186    psraw            m4, 6
2187    paddw            m6, m15
2188    paddw            m4, m15
2189    packuswb         m4, m6                 ; pack+unpack = clip
2190    punpckhbw        m6, m4, m2
2191    punpcklbw        m4, m2
2192%endif
2193
2194    ; scaling[luma_src]
2195%if ARCH_X86_32
2196    vpgatherdw       m7, m4, scalingq, r0, r5
2197    vpgatherdw       m5, m6, scalingq, r0, r5
2198%else
2199    vpgatherdw       m7, m4, scalingq, r12, r2
2200    vpgatherdw       m5, m6, scalingq, r12, r2
2201%endif
2202    pcmpeqw          m1, m1
2203    psrlw            m1, 8
2204    pand             m7, m1
2205    pand             m5, m1
2206
2207    ; unpack chroma_source
2208    punpckhbw        m1, m0, m2
2209    punpcklbw        m0, m2                 ; m0-1: src as word
2210
2211    ; grain = grain_lut[offy+y][offx+x]
2212    movu             m3, [grain_lutq+offxyq+ 0]
2213    pcmpgtb          m6, m2, m3
2214    punpcklbw        m2, m3, m6
2215    punpckhbw        m3, m6
2216
2217    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2218    pmullw           m2, m7
2219    pmullw           m3, m5
2220    pmulhrsw         m2, m11
2221    pmulhrsw         m3, m11
2222
2223%if ARCH_X86_32
2224    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2225%endif
2226
2227    ; dst = clip_pixel(src, noise)
2228    paddw            m0, m2
2229    paddw            m1, m3
2230    pmaxsw           m0, m13
2231    pmaxsw           m1, m13
2232    pminsw           m0, m12
2233    pminsw           m1, m12
2234    packuswb         m0, m1
2235    movifnidn      dstq, dstmp
2236    mova    [dstq+srcq], m0
2237
2238%if ARCH_X86_32
2239    add            srcq, r2mp
2240    ; we already incremented lumaq above
2241%else
2242    add            srcq, r10mp
2243    lea           lumaq, [lumaq+lstrideq*2]
2244%endif
2245    add      grain_lutq, 82
2246    dec              hw
2247    jg %%loop_y
2248
2249%if ARCH_X86_32
2250    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2251
2252    mov              wq, r4m
2253%endif
2254    add              wq, 16
2255    jge %%end
2256%if ARCH_X86_32
2257    mov            srcq, r1mp
2258    mov           lumaq, r11mp
2259%else
2260    mov            srcq, r11mp
2261%endif
2262    lea           lumaq, [luma_bakq+wq*2]
2263    add            srcq, wq
2264%if ARCH_X86_32
2265    mov             r4m, wq
2266    mov             r9m, lumaq
2267%endif
2268    test      dword r8m, 1
2269    jz %%loop_x
2270
2271    ; r8m (sbym's stack slot) holds the overlap flags; overlap & 2 = top overlap
2272    test      dword r8m, 2
2273    jne %%loop_x_hv_overlap
2274
2275    ; horizontal overlap (without vertical overlap)
2276%%loop_x_h_overlap:
2277%if ARCH_X86_32
2278    lea              r6, [offxyd+16]
2279    mov [rsp+8*mmsize+0*gprsize], r6
2280
2281    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
2282
2283    mov            seed, r3m
2284%else
2285    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2286                offx, offy, see, left_offxy, unused1, unused2, lstride
2287
2288    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2289%endif
2290    mov             r6d, seed
2291    or             seed, 0xEFF4
2292    shr             r6d, 1
2293    test           seeb, seeh
2294    lea            seed, [r6+0x8000]
2295    cmovp          seed, r6d                ; updated seed
2296
2297%if ARCH_X86_32
2298    mov             r3m, seed
2299
2300    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
2301
2302    mov          offxd, offyd
2303%else
2304    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2305                offx, offy, see, left_offxy, unused1, unused2, lstride
2306
2307    mov           offyd, seed
2308    mov           offxd, seed
2309%endif
2310    ror           offyd, 8
2311    shr           offxd, 12
2312    and           offyd, 0xf
2313    imul          offyd, 82
2314    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
2315
2316%if ARCH_X86_32
2317    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2318%else
2319    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2320                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
2321%endif
2322
2323    mov              hd, r7m
2324    mov      grain_lutq, grain_lutmp
2325%%loop_y_h_overlap:
2326    ; src
2327%if ARCH_X86_32
2328    mov           lumaq, r9mp
2329%endif
2330    mova             m4, [lumaq+ 0]
2331    mova             m6, [lumaq+16]
2332    mova             m0, [srcq]
2333%if ARCH_X86_32
2334    add           lumaq, r10mp
2335    mov            r9mp, lumaq
2336    mov              r5, r5m
2337    movd             m7, [base+pb_1]
2338%else
2339    movd             m7, [pb_1]
2340%endif
2341    pshufd           m7, m7, q0000
2342    pxor             m2, m2
2343    pmaddubsw        m4, m7
2344    pmaddubsw        m6, m7
2345    pavgw            m4, m2
2346    pavgw            m6, m2
2347
2348%if %1
2349    packuswb         m4, m6                 ; luma
2350    punpckhbw        m6, m4, m0
2351    punpcklbw        m4, m0                 ; { luma, chroma }
2352    pmaddubsw        m6, m14
2353    pmaddubsw        m4, m14
2354    psraw            m6, 6
2355    psraw            m4, 6
2356    paddw            m6, m15
2357    paddw            m4, m15
2358    packuswb         m4, m6                 ; pack+unpack = clip
2359    punpckhbw        m6, m4, m2
2360    punpcklbw        m4, m2
2361%endif
2362
2363    ; scaling[luma_src]
2364%if ARCH_X86_32
2365    vpgatherdw       m7, m4, scalingq, r0, r5
2366    vpgatherdw       m5, m6, scalingq, r0, r5
2367%else
2368    vpgatherdw       m7, m4, scalingq, r12, r2
2369    vpgatherdw       m5, m6, scalingq, r12, r2
2370%endif
2371    pcmpeqw          m1, m1
2372    psrlw            m1, 8
2373    pand             m7, m1
2374    pand             m5, m1
2375
2376    ; unpack chroma_source
2377    punpckhbw        m1, m0, m2
2378    punpcklbw        m0, m2                 ; m0-1: src as word
2379
2380    ; grain = grain_lut[offy+y][offx+x]
2381    movu             m3, [grain_lutq+offxyq+ 0]
2382%if ARCH_X86_32
2383    mov              r0, [rsp+8*mmsize+0*gprsize]
2384    movd             m4, [grain_lutq+r0+ 0]
2385%else
2386    movd             m4, [grain_lutq+left_offxyq+ 0]
2387%endif
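    ; blend column 0 with the left block's grain:
    ; grain = round2(23*left + 22*cur, 5); m10 (byte 0 mask) then merges
    ; the blended byte back into the otherwise unchanged row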
2388    punpcklbw        m2, m4, m3
2389    pmaddubsw        m4, m9, m2
2390    pmulhrsw         m4, m8
2391    packsswb         m4, m4
2392    pand             m4, m10
2393    pandn            m2, m10, m3
2394    por              m3, m4, m2
2395    pxor             m4, m4
2396    pcmpgtb          m4, m3
2397    punpcklbw        m2, m3, m4
2398    punpckhbw        m3, m4
2399
2400    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2401    pmullw           m2, m7
2402    pmullw           m3, m5
2403    pmulhrsw         m2, m11
2404    pmulhrsw         m3, m11
2405
2406%if ARCH_X86_32
2407    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2408%endif
2409
2410    ; dst = clip_pixel(src, noise)
2411    paddw            m0, m2
2412    paddw            m1, m3
2413    pmaxsw           m0, m13
2414    pmaxsw           m1, m13
2415    pminsw           m0, m12
2416    pminsw           m1, m12
2417    packuswb         m0, m1
2418    movifnidn      dstq, dstmp
2419    mova    [dstq+srcq], m0
2420
2421%if ARCH_X86_32
2422    add            srcq, r2mp
2423    ; lumaq has already been incremented above
2424%else
2425    add            srcq, r10mp
2426    lea           lumaq, [lumaq+lstrideq*2]
2427%endif
2428    add      grain_lutq, 82
2429    dec              hw
2430    jg %%loop_y_h_overlap
2431
2432%if ARCH_X86_32
2433    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2434
2435    mov              wq, r4m
2436%endif
2437    add              wq, 16
2438    jge %%end
2439%if ARCH_X86_32
2440    mov            srcq, r1mp
2441    mov           lumaq, r11mp
2442%else
2443    mov            srcq, r11mp
2444%endif
2445    lea           lumaq, [luma_bakq+wq*2]
2446    add            srcq, wq
2447%if ARCH_X86_32
2448    mov             r4m, wq
2449    mov             r9m, lumaq
2450%endif
2451
2452    ; r8m (sbym's stack slot) holds the overlap flags; overlap & 2 = top overlap
2453    test      dword r8m, 2
2454    jne %%loop_x_hv_overlap
2455    jmp %%loop_x_h_overlap
2456
2457%%end:
2458    RET
2459
2460%%vertical_overlap:
2461%if ARCH_X86_32
2462    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2463%else
2464    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
2465%endif
2466
2467    or         overlapd, 2                  ; top_overlap: overlap & 2
2468    mov             r8m, overlapd
2469    movzx          sbyd, sbyb
2470%if ARCH_X86_32
2471    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2472    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2473%else
2474    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2475%endif
2476    imul           tmpd, sbyd, 173 * 0x00010001
2477    imul           sbyd, 37 * 0x01000100
2478    add            tmpd, (105 << 16) | 188
2479    add            sbyd, (178 << 24) | (141 << 8)
2480    and            tmpd, 0x00ff00ff
2481    and            sbyd, 0xff00ff00
2482    xor            seed, tmpd
2483%if ARCH_X86_32
2484    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
2485
2486    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2487
2488    mov             r3m, seed
2489    mov              wq, r4m
2490    shl           r10mp, 1
2491%else
2492    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2493
2494    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2495                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
2496
2497    mov        lstrideq, r10mp
2498%endif
2499
2500    mov           lumaq, r9mp
2501    lea        src_bakq, [srcq+wq]
2502    lea       luma_bakq, [lumaq+wq*2]
2503    neg              wq
2504    sub            r0mp, srcq
2505%if ARCH_X86_32
2506    mov             r1m, src_bakq
2507    mov            r11m, luma_bakq
2508    mov             r4m, wq
2509
2510    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2511%else
2512    mov           r11mp, src_bakq
2513    mov           r10mp, strideq
2514%endif
2515
2516%%loop_x_v_overlap:
2517%if ARCH_X86_32
2518    mov            seed, r3m
2519    xor            tmpd, tmpd
2520%endif
2521    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2522    mov             r6d, seed
2523    or             seed, 0xeff4eff4
2524    test           seeb, seeh
2525    setp           tmpb                     ; parity of top_seed
2526    shr            seed, 16
2527    shl            tmpd, 16
2528    test           seeb, seeh
2529    setp           tmpb                     ; parity of cur_seed
2530    or              r6d, 0x00010001
2531    xor            tmpd, r6d
2532    mov            seed, tmpd
2533    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2534
2535%if ARCH_X86_32
2536    mov             r3m, seed
2537
2538    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
2539
2540    mov           offxd, offyd
2541%else
2542    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2543                offx, offy, see, overlap, top_offxy, unused, lstride
2544
2545    mov           offxd, seed
2546    mov           offyd, seed
2547%endif
2548    ror           offyd, 8
2549    ror           offxd, 12
2550    and           offyd, 0xf000f
2551    and           offxd, 0xf000f
2552    imul          offyd, 82
2553    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2554    lea           offyq, [offyq+offxq+0x10001*498+16*82]
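    ; (as above: 82*offy + offx + 498 for each packed half; the extra 16*82
    ;  moves the top block's offset (low word) past its 16 rows to the
    ;  overlapping rows)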
2555
2556%if ARCH_X86_32
2557    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
2558%else
2559    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2560                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
2561%endif
2562
2563    movzx    top_offxyd, offxyw
2564    shr          offxyd, 16
2565%if ARCH_X86_32
2566    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2567
2568    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2569%endif
2570
2571    mov              hd, r7m
2572    mov      grain_lutq, grain_lutmp
2573%%loop_y_v_overlap:
2574%if ARCH_X86_32
2575    mov           lumaq, r9mp
2576%endif
2577    mova             m4, [lumaq+ 0]
2578    mova             m6, [lumaq+16]
2579    mova             m0, [srcq]
2580%if ARCH_X86_32
2581    add           lumaq, r10mp
2582    mov            r9mp, lumaq
2583    mov              r5, r5m
2584    movd             m7, [base+pb_1]
2585%else
2586    movd             m7, [pb_1]
2587%endif
2588    pshufd           m7, m7, q0000
2589    pxor             m2, m2
2590    pmaddubsw        m4, m7
2591    pmaddubsw        m6, m7
2592    pavgw            m4, m2
2593    pavgw            m6, m2
2594
2595%if %1
2596    packuswb         m4, m6                 ; luma
2597    punpckhbw        m6, m4, m0
2598    punpcklbw        m4, m0                 ; { luma, chroma }
2599    pmaddubsw        m6, m14
2600    pmaddubsw        m4, m14
2601    psraw            m6, 6
2602    psraw            m4, 6
2603    paddw            m6, m15
2604    paddw            m4, m15
2605    packuswb         m4, m6                 ; pack+unpack = clip
2606    punpckhbw        m6, m4, m2
2607    punpcklbw        m4, m2
2608%endif
2609
2610    ; scaling[luma_src]
2611%if ARCH_X86_32
2612    vpgatherdw       m7, m4, scalingq, r0, r5
2613    vpgatherdw       m5, m6, scalingq, r0, r5
2614%else
2615    vpgatherdw       m7, m4, scalingq, r12, r2
2616    vpgatherdw       m5, m6, scalingq, r12, r2
2617%endif
2618    pcmpeqw          m1, m1
2619    psrlw            m1, 8
2620    pand             m7, m1
2621    pand             m5, m1
2622
2623    ; grain = grain_lut[offy+y][offx+x]
2624    movu             m3, [grain_lutq+offxyq]
2625%if ARCH_X86_32
2626    mov              r0, [rsp+8*mmsize+1*gprsize]
2627    movu             m4, [grain_lutq+r0]
2628%else
2629    movu             m4, [grain_lutq+top_offxyq]
2630%endif
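    ; 4:2:0 has a single row of vertical overlap:
    ; grain = round2(23*top + 22*cur, 5), with m9 = {23, 22} and m8 = pw_1024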
2631    punpckhbw        m1, m4, m3
2632    punpcklbw        m4, m3
2633    pmaddubsw        m2, m9, m1
2634    pmaddubsw        m3, m9, m4
2635    pmulhrsw         m2, m8
2636    pmulhrsw         m3, m8
2637    packsswb         m3, m2
2638    pxor             m1, m1
2639    pcmpgtb          m1, m3
2640    punpcklbw        m2, m3, m1
2641    punpckhbw        m3, m1
2642
2643    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2644    pmullw           m2, m7
2645    pmullw           m3, m5
2646    pmulhrsw         m2, m11
2647    pmulhrsw         m3, m11
2648
2649    ; unpack chroma_source
2650    pxor             m4, m4
2651    punpckhbw        m1, m0, m4
2652    punpcklbw        m0, m4                 ; m0-1: src as word
2653
2654%if ARCH_X86_32
2655    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2656%endif
2657
2658    ; dst = clip_pixel(src, noise)
2659    paddw            m0, m2
2660    paddw            m1, m3
2661    pmaxsw           m0, m13
2662    pmaxsw           m1, m13
2663    pminsw           m0, m12
2664    pminsw           m1, m12
2665    packuswb         m0, m1
2666    movifnidn      dstq, dstmp
2667    mova    [dstq+srcq], m0
2668
2669    dec              hw
2670    je %%end_y_v_overlap
2671%if ARCH_X86_32
2672    add            srcq, r2mp
2673    ; lumaq has already been incremented above
2674%else
2675    add            srcq, r10mp
2676    lea           lumaq, [lumaq+lstrideq*2]
2677%endif
2678    add      grain_lutq, 82
2679    jmp %%loop_y
2680
2681%%end_y_v_overlap:
2682%if ARCH_X86_32
2683    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2684
2685    mov              wq, r4m
2686%endif
2687    add              wq, 16
2688    jge %%end_hv
2689%if ARCH_X86_32
2690    mov            srcq, r1mp
2691    mov           lumaq, r11mp
2692%else
2693    mov            srcq, r11mp
2694%endif
2695    lea           lumaq, [luma_bakq+wq*2]
2696    add            srcq, wq
2697%if ARCH_X86_32
2698    mov             r4m, wq
2699    mov             r9m, lumaq
2700%endif
2701
2702    ; since FGData.overlap_flag is guaranteed to be set here, we never
2703    ; jump back to %%loop_x_v_overlap, and instead always fall through
2704    ; to the h+v overlap case
2705
2706%%loop_x_hv_overlap:
2707%if ARCH_X86_32
2708    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
2709
2710    mov              r6, [rsp+8*mmsize+1*gprsize]
2711    lea              r0, [r3d+16]
2712    add              r6, 16
2713    mov [rsp+8*mmsize+0*gprsize], r0        ; left_offxy
2714    mov [rsp+8*mmsize+2*gprsize], r6        ; topleft_offxy
2715
2716    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
2717
2718    mov            seed, r3m
2719    xor            tmpd, tmpd
2720%else
2721    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2722                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
2723
2724    lea  topleft_offxyq, [top_offxyq+16]
2725    lea     left_offxyq, [offxyq+16]
2726
2727    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2728%endif
2729    mov             r6d, seed
2730    or             seed, 0xeff4eff4
2731    test           seeb, seeh
2732    setp           tmpb                     ; parity of top_seed
2733    shr            seed, 16
2734    shl            tmpd, 16
2735    test           seeb, seeh
2736    setp           tmpb                     ; parity of cur_seed
2737    or              r6d, 0x00010001
2738    xor            tmpd, r6d
2739    mov            seed, tmpd
2740    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2741
2742%if ARCH_X86_32
2743    mov             r3m, seed
2744
2745    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
2746
2747    mov           offxd, offyd
2748%else
2749    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2750                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
2751
2752    mov           offxd, seed
2753    mov           offyd, seed
2754%endif
2755    ror           offyd, 8
2756    ror           offxd, 12
2757    and           offyd, 0xf000f
2758    and           offxd, 0xf000f
2759    imul          offyd, 82
2760    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2761    lea           offyq, [offyq+offxq+0x10001*498+16*82]
2762
2763%if ARCH_X86_32
2764    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
2765%else
2766    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2767                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
2768%endif
2769
2770    movzx    top_offxyd, offxyw
2771    shr          offxyd, 16
2772%if ARCH_X86_32
2773    mov [rsp+8*mmsize+1*gprsize], top_offxyd
2774%endif
2775
2776    mov              hd, r7m
2777    mov      grain_lutq, grain_lutmp
2778%%loop_y_hv_overlap:
2779    ; src
2780%if ARCH_X86_32
2781    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2782
2783    mov           lumaq, r9mp
2784%endif
2785    mova             m4, [lumaq+ 0]
2786    mova             m6, [lumaq+16]
2787    mova             m0, [srcq]
2788%if ARCH_X86_32
2789    add           lumaq, r10mp
2790    mov            r9mp, lumaq
2791    mov              r5, r5m
2792    movd             m7, [base+pb_1]
2793%else
2794    movd             m7, [pb_1]
2795%endif
2796    pshufd           m7, m7, q0000
2797    pxor             m2, m2
2798    pmaddubsw        m4, m7
2799    pmaddubsw        m6, m7
2800    pavgw            m4, m2
2801    pavgw            m6, m2
2802
2803%if %1
2804    packuswb         m4, m6                 ; luma
2805    punpckhbw        m6, m4, m0
2806    punpcklbw        m4, m0                 ; { luma, chroma }
2807    pmaddubsw        m6, m14
2808    pmaddubsw        m4, m14
2809    psraw            m6, 6
2810    psraw            m4, 6
2811    paddw            m6, m15
2812    paddw            m4, m15
2813    packuswb         m4, m6                 ; pack+unpack = clip
2814    punpckhbw        m6, m4, m2
2815    punpcklbw        m4, m2
2816%endif
2817
2818    ; scaling[luma_src]
2819%if ARCH_X86_32
2820    vpgatherdw       m7, m4, scalingq, r0, r5
2821    vpgatherdw       m5, m6, scalingq, r0, r5
2822%else
2823    movd             m1, [grain_lutq+topleft_offxyq]
2824    vpgatherdw       m7, m4, scalingq, r2, r12
2825    vpgatherdw       m5, m6, scalingq, r2, r12
2826%endif
2827    pcmpeqw          m2, m2
2828    psrlw            m2, 8
2829    pand             m7, m2
2830    pand             m5, m2
2831
2832    ; grain = grain_lut[offy+y][offx+x]
2833%if ARCH_X86_32
2834    mov              r0, [rsp+8*mmsize+2*gprsize]       ; topleft_offxy
2835    mov              r5, [rsp+8*mmsize+1*gprsize]       ; top_offxy
2836    movd             m1, [grain_lutq+r0]
2837    mov              r0, [rsp+8*mmsize+0*gprsize]       ; left_offxy
2838%endif
2839    movu             m3, [grain_lutq+offxyq]
2840%if ARCH_X86_32
2841    movu             m6, [grain_lutq+r5]
2842    movd             m4, [grain_lutq+r0]
2843%else
2844    movu             m6, [grain_lutq+top_offxyq]
2845    movd             m4, [grain_lutq+left_offxyq]
2846%endif
2847    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
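    ; (left,cur) and (topleft,top) are interleaved so one pmaddubsw/pmulhrsw
    ; produces both 23/22 blends at once: byte 0 of the packed result is the
    ; new cur pixel, byte 1 (extracted with psrldq) the new top pixel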
2848    punpcklbw        m1, m6
2849    punpcklbw        m4, m3
2850    punpcklwd        m4, m1
2851    pmaddubsw        m1, m9, m4
2852    pmulhrsw         m1, m8
2853    packsswb         m1, m1
2854    pandn            m4, m10, m3
2855    pandn            m3, m10, m6
2856    psrldq           m6, m1, 1
2857    pand             m1, m10
2858    pand             m6, m10
2859    por              m4, m1
2860    por              m3, m6
2861    ; followed by v interpolation (top | cur -> cur)
2862    punpckhbw        m1, m3, m4
2863    punpcklbw        m3, m4
2864    pmaddubsw        m4, m9, m1
2865    pmaddubsw        m1, m9, m3
2866    pmulhrsw         m4, m8
2867    pmulhrsw         m1, m8
2868    packsswb         m1, m4
2869    pxor             m4, m4
2870    pcmpgtb          m4, m1
2871    punpcklbw        m2, m1, m4
2872    punpckhbw        m1, m4
2873
2874    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2875    pmullw           m2, m7
2876    pmullw           m1, m5
2877    pmulhrsw         m2, m11
2878    pmulhrsw         m1, m11
2879
2880%if ARCH_X86_32
2881    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2882%endif
2883
2884    ; unpack chroma_source
2885    pxor             m4, m4
2886    punpckhbw        m3, m0, m4
2887    punpcklbw        m0, m4                 ; m0-1: src as word
2888
2889    ; dst = clip_pixel(src, noise)
2890    paddw            m0, m2
2891    paddw            m3, m1
2892    pmaxsw           m0, m13
2893    pmaxsw           m3, m13
2894    pminsw           m0, m12
2895    pminsw           m3, m12
2896    packuswb         m0, m3
2897    movifnidn      dstq, dstmp
2898    mova    [dstq+srcq], m0
2899
2900%if ARCH_X86_32
2901    add            srcq, r2mp
2902    ; lumaq has been adjusted above already
2903%else
2904    add            srcq, r10mp
2905    lea           lumaq, [lumaq+lstrideq*2]
2906%endif
2907    add      grain_lutq, 82
2908    dec              hw
2909    jg %%loop_y_h_overlap
2910
2911%if ARCH_X86_32
2912    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2913
2914    mov              wq, r4m
2915%endif
2916    add              wq, 16
2917    jge %%end_hv
2918%if ARCH_X86_32
2919    mov            srcq, r1mp
2920    mov           lumaq, r11mp
2921%else
2922    mov            srcq, r11mp
2923%endif
2924    lea           lumaq, [luma_bakq+wq*2]
2925    add            srcq, wq
2926%if ARCH_X86_32
2927    mov             r4m, wq
2928    mov             r9m, lumaq
2929%endif
2930    jmp %%loop_x_hv_overlap
2931
2932%%end_hv:
2933    RET
2934%endmacro
2935
2936    FGUV_32x32xN_LOOP 1
2937.csfl:
2938    FGUV_32x32xN_LOOP 0
2939