1; Copyright © 2019-2021, VideoLAN and dav1d authors
2; Copyright © 2019, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28%include "x86/filmgrain_common.asm"
29
30SECTION_RODATA
31
32pw_1024: times 8 dw 1024
33pb_27_17_17_27: db 27, 17, 17, 27
34                times 6 db 0, 32
35pb_23_22_h: db 23, 22
36            times 7 db 0, 32
37pb_27_17: times 8 db 27, 17
38pb_17_27: times 8 db 17, 27
39pb_23_22: times 8 db 23, 22
40pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
41rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
42byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
43pw_seed_xor: times 2 dw 0xb524
44             times 2 dw 0x49d8
45pb_1: times 4 db 1
46hmul_bits: dw 32768, 16384, 8192, 4096
47round: dw 2048, 1024, 512
48mul_bits: dw 256, 128, 64, 32, 16
49round_vals: dw 32, 64, 128, 256, 512
50max: dw 255, 240, 235
51min: dw 0, 16
52pw_1: dw 1
53
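; Build a per-ISA jump table of 32-bit offsets (relative to the table itself)
; to the .ar0-.ar3 entry points of a grain generation function. The caller
; indexes it with FGData.ar_coeff_lag and adds the table address back, which
; keeps the dispatch position-independent.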
54%macro JMP_TABLE 2-*
55    %xdefine %1_8bpc_%2_table %%table
56    %xdefine %%base %1_8bpc_%2_table
57    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
58    %%table:
59    %rep %0 - 2
60        dd %%prefix %+ .ar%3 - %%base
61        %rotate 1
62    %endrep
63%endmacro
64
65JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
66JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
67JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
68JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
69
70SECTION .text
71
72%if ARCH_X86_32
73%define PIC_ptr(a) base+a
74%else
75%define PIC_ptr(a) a
76%endif
77
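; SCRATCH: on x86-32 only xmm0-xmm7 exist, so spill m%1 to stack slot %3 and
; redefine m%2 as that memory operand; on x86-64 simply SWAP the value into
; the higher register m%2.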
78%macro SCRATCH 3
79%if ARCH_X86_32
80    mova [rsp+%3*mmsize], m%1
81%define m%2 [rsp+%3*mmsize]
82%else
83    SWAP             %1, %2
84%endif
85%endmacro
86
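; Fill the 82x73 luma grain template (stride 82, 73 rows, 4 bytes per loop
; iteration) with pseudo-random values from gaussian_sequence, then tail-jump
; into the AR filter selected by FGData.ar_coeff_lag (.ar0-.ar3 below).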
87INIT_XMM ssse3
88cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
89    LEA              r4, $$
90%define base r4-$$
91    movq             m1, [base+rnd_next_upperbit_mask]
92    movq             m4, [base+mul_bits]
93    movq             m7, [base+hmul_bits]
94    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
95    movd             m2, [base+round+r2*2]
96    movd             m0, [fg_dataq+FGData.seed]
97    mova             m5, [base+pb_mask]
98    pshuflw          m2, m2, q0000
99    pshuflw          m0, m0, q0000
100    mov              r2, -73*82
101    sub            bufq, r2
102    lea              r3, [base+gaussian_sequence]
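    ; Each iteration advances the 16-bit grain LFSR four times at once:
    ; rnd_next_upperbit_mask selects the feedback taps (bits 0/1/3/12 and
    ; their shifted copies), pb_mask reduces them to one parity bit per future
    ; seed, and pmulhuw with hmul_bits supplies the right-shifted seed values.
    ; The top 11 bits of each new seed index gaussian_sequence, and pmulhrsw
    ; with the round constant applies grain_scale_shift with rounding before
    ; packing to signed bytes.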
103.loop:
104    pand             m6, m0, m1
105    psrlw            m3, m6, 10
106    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
107    pmullw           m6, m4            ; bits 0x0f00 are set
108    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
109    psllq            m6, m3, 30
110    por              m3, m6
111    psllq            m6, m3, 15
112    por              m3, m6            ; aggregate each bit into next seed's high bit
113    pmulhuw          m6, m0, m7
114    por              m3, m6            ; 4 next output seeds
115    pshuflw          m0, m3, q3333
116    psrlw            m3, 5
117%if ARCH_X86_64
118    movq             r6, m3
119    mov              r8, r6
120    movzx           r5d, r6w
121    shr             r6d, 16
122    shr              r8, 32
123    movzx            r7, r8w
124    shr              r8, 16
125
126    movd             m6, [r3+r5*2]
127    pinsrw           m6, [r3+r6*2], 1
128    pinsrw           m6, [r3+r7*2], 2
129    pinsrw           m6, [r3+r8*2], 3
130%else
131    movd             r6, m3
132    pshuflw          m3, m3, q3232
133    movzx            r5, r6w
134    shr              r6, 16
135
136    movd             m6, [r3+r5*2]
137    pinsrw           m6, [r3+r6*2], 1
138
139    movd             r6, m3
140    movzx            r5, r6w
141    shr              r6, 16
142
143    pinsrw           m6, [r3+r5*2], 2
144    pinsrw           m6, [r3+r6*2], 3
145%endif
146    pmulhrsw         m6, m2
147    packsswb         m6, m6
148    movd      [bufq+r2], m6
149    add              r2, 4
150    jl .loop
151
152    ; auto-regression code
153    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
154    movsxd           r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
155    lea              r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
156    jmp              r2
157
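    ; AR(1): buf[x] += (cf0*topleft + cf1*top + cf2*topright + cf3*left
    ; + rnd) >> shift, clamped to [-128, 127]. The top-row terms are computed
    ; 4 pixels at a time; the inner loop stays scalar because each output is
    ; the next pixel's left neighbour.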
158.ar1:
159%if ARCH_X86_32
160    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
161%elif WIN64
162    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
163    mov            bufq, r0
164%else
165    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
166%endif
167    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
168    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
169    mov             ecx, [fg_dataq+FGData.ar_coeff_shift]
170%if ARCH_X86_32
171    mov             r1m, cf3d
172    DEFINE_ARGS buf, shift, val3, min, max, x, val0
173%define hd r0mp
174%define cf3d r1mp
175%elif WIN64
176    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
177%else
178    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
179%endif
180    pxor             m6, m6
181    pcmpgtb          m7, m6, m4
182    punpcklbw        m4, m7
183    pinsrw           m4, [base+pw_1], 3
184    pshufd           m5, m4, q1111
185    pshufd           m4, m4, q0000
186    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
187    pshuflw          m3, m3, q0000
188    sub            bufq, 82*73-(82*3+79)
189    mov              hd, 70
190    mov            mind, -128
191    mov            maxd, 127
192.y_loop_ar1:
193    mov              xq, -76
194    movsx         val3d, byte [bufq+xq-1]
195.x_loop_ar1:
196    movq             m0, [bufq+xq-82-1]     ; top/left
197    pcmpgtb          m7, m6, m0
198    punpcklbw        m0, m7
199    psrldq           m2, m0, 2              ; top
200    psrldq           m1, m0, 4              ; top/right
201    punpcklwd        m0, m2
202    punpcklwd        m1, m3
203    pmaddwd          m0, m4
204    pmaddwd          m1, m5
205    paddd            m0, m1
206.x_loop_ar1_inner:
207    movd          val0d, m0
208    psrldq           m0, 4
209    imul          val3d, cf3d
210    add           val3d, val0d
211    sar           val3d, shiftb
212    movsx         val0d, byte [bufq+xq]
213    add           val3d, val0d
214    cmp           val3d, maxd
215    cmovns        val3d, maxd
216    cmp           val3d, mind
217    cmovs         val3d, mind
218    mov  byte [bufq+xq], val3b
219    ; keep val3d in-place as left for next x iteration
220    inc              xq
221    jz .x_loop_ar1_end
222    test             xq, 3
223    jnz .x_loop_ar1_inner
224    jmp .x_loop_ar1
225
226.x_loop_ar1_end:
227    add            bufq, 82
228    dec              hd
229    jg .y_loop_ar1
230.ar0:
231    RET
232
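    ; AR(2): 12 coefficients covering x=[-2,+2] in the two previous rows plus
    ; the two left neighbours in the current row. The previous-row sums are
    ; vectorized per 4 pixels; the current-row recursion runs in the inner
    ; loop one pixel at a time.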
233.ar2:
234%if ARCH_X86_32
235%assign stack_offset_old stack_offset
236    ALLOC_STACK -16*8
237%endif
238    DEFINE_ARGS buf, fg_data, shift
239    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
240    movd             m6, [base+round_vals-12+shiftq*2]
241    movd             m7, [base+byte_blend+1]
242    SCRATCH           7, 15, 7
243    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
244    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
245    pxor             m7, m7
246    pshuflw          m6, m6, q0000
247    punpcklwd        m6, m7
248    pcmpgtb          m4, m7, m0
249    pcmpgtb          m5, m7, m1
250    punpcklbw        m0, m4
251    punpcklbw        m1, m5
252    DEFINE_ARGS buf, fg_data, h, x
253    pshufd           m4, m1, q0000
254    pshufd           m5, m1, q1111
255    pshufd           m3, m0, q3333
256    pshufd           m2, m0, q2222
257    pshufd           m1, m0, q1111
258    pshufd           m0, m0, q0000
259    SCRATCH           0, 8,  0
260    SCRATCH           1, 9,  1
261    SCRATCH           2, 10, 2
262    SCRATCH           3, 11, 3
263    SCRATCH           4, 12, 4
264    SCRATCH           5, 13, 5
265    SCRATCH           6, 14, 6
266    sub            bufq, 82*73-(82*3+79)
267    mov              hd, 70
268.y_loop_ar2:
269    mov              xq, -76
270
271.x_loop_ar2:
272    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
273    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
274    pcmpgtb          m2, m7, m0
275    punpckhbw        m1, m0, m2
276    punpcklbw        m0, m2
277    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
278    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
279    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
280    punpcklwd        m2, m0, m5
281    punpcklwd        m3, m4
282    pmaddwd          m2, m8
283    pmaddwd          m3, m11
284    paddd            m2, m3
285
286    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
287    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
288    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
289    punpcklwd        m4, m5
290    punpcklwd        m6, m1
291    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
292    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
293    punpcklwd        m5, m1
294    pmaddwd          m4, m9
295    pmaddwd          m6, m10
296    pmaddwd          m5, m12
297    paddd            m4, m6
298    paddd            m2, m5
299    paddd            m2, m4
300    paddd            m2, m14
301
302    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
303.x_loop_ar2_inner:
304    pcmpgtb          m4, m7, m0
305    punpcklbw        m1, m0, m4
306    pmaddwd          m3, m1, m13
307    paddd            m3, m2
308    psrldq           m1, 4                  ; y=0,x=0
309    psrldq           m2, 4                  ; shift top to next pixel
310    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
311    ; don't packssdw since we only care about one value
312    paddw            m3, m1
313    packsswb         m3, m3
314    pslldq           m3, 2
315    pand             m3, m15
316    pandn            m1, m15, m0
317    por              m0, m1, m3
318    psrldq           m0, 1
319    ; overwrite 2 pixels, but that's ok
320    movd      [bufq+xq-1], m0
321    inc              xq
322    jz .x_loop_ar2_end
323    test             xq, 3
324    jnz .x_loop_ar2_inner
325    jmp .x_loop_ar2
326
327.x_loop_ar2_end:
328    add            bufq, 82
329    dec              hd
330    jg .y_loop_ar2
331    RET
332
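    ; AR(3): 24 coefficients covering x=[-3,+3] in the three previous rows
    ; plus the three left neighbours in the current row, handled like AR(2)
    ; but with the extra coefficient vectors kept on the stack.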
333.ar3:
334    DEFINE_ARGS buf, fg_data, shift
335%if ARCH_X86_32
336%assign stack_offset stack_offset_old
337    ALLOC_STACK  -16*14
338%elif WIN64
339    SUB             rsp, 16*6
340%assign stack_size_padded (stack_size_padded+16*6)
341%assign stack_size (stack_size+16*6)
342%else
343    ALLOC_STACK  -16*6
344%endif
345    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
346    movd             m6, [base+round_vals-12+shiftq*2]
347    movd             m7, [base+byte_blend]
348    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
349    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
350    pxor             m3, m3
351    pcmpgtb          m4, m3, m0
352    pcmpgtb          m3, m2
353    pshuflw          m6, m6, q0000
354    SCRATCH           6, 14, 12
355    SCRATCH           7, 15, 13
356    punpckhbw        m1, m0, m4
357    punpcklbw        m0, m4
358    punpcklbw        m2, m3
359    pshufd           m3, m0, q1111
360    pshufd           m4, m0, q2222
361    pshufd           m5, m0, q3333
362    pshufd           m0, m0, q0000
363    mova    [rsp+ 0*16], m0
364    mova    [rsp+ 1*16], m3
365    mova    [rsp+ 2*16], m4
366    mova    [rsp+ 3*16], m5
367    pshufd           m6, m1, q1111
368    pshufd           m7, m1, q2222
369    pshufd           m5, m1, q3333
370    pshufd           m1, m1, q0000
371    pshufd           m3, m2, q1111
372    psrldq           m0, m2, 10
373    pinsrw           m2, [base+pw_1], 5
374    pshufd           m4, m2, q2222
375    pshufd           m2, m2, q0000
376    pinsrw           m0, [base+round_vals+shiftq*2-10], 3
377    mova    [rsp+ 4*16], m1
378    mova    [rsp+ 5*16], m6
379    SCRATCH           7, 8,  6
380    SCRATCH           5, 9,  7
381    SCRATCH           2, 10, 8
382    SCRATCH           3, 11, 9
383    SCRATCH           4, 12, 10
384    SCRATCH           0, 13, 11
385    DEFINE_ARGS buf, fg_data, h, x
386    sub            bufq, 82*73-(82*3+79)
387    mov              hd, 70
388.y_loop_ar3:
389    mov              xq, -76
390
391.x_loop_ar3:
392    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
393    pxor             m3, m3
394    pcmpgtb          m3, m0
395    punpckhbw        m2, m0, m3
396    punpcklbw        m0, m3
397
398    psrldq           m5, m0, 2
399    psrldq           m6, m0, 4
400    psrldq           m7, m0, 6
401    punpcklwd        m4, m0, m5
402    punpcklwd        m6, m7
403    pmaddwd          m4, [rsp+ 0*16]
404    pmaddwd          m6, [rsp+ 1*16]
405    paddd            m4, m6
406
407    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
408    pxor             m5, m5
409    pcmpgtb          m5, m1
410    punpckhbw        m3, m1, m5
411    punpcklbw        m1, m5
412    palignr          m6, m2, m0, 10
413    palignr          m7, m2, m0, 12
414    psrldq           m0, 8
415    punpcklwd        m0, m6
416    punpcklwd        m7, m1
417    pmaddwd          m0, [rsp+ 2*16]
418    pmaddwd          m7, [rsp+ 3*16]
419    paddd            m0, m7
420    paddd            m0, m4
421
422    psrldq           m4, m1, 2
423    psrldq           m5, m1, 4
424    psrldq           m6, m1, 6
425    psrldq           m7, m1, 8
426    punpcklwd        m4, m5
427    punpcklwd        m6, m7
428    pmaddwd          m4, [rsp+ 4*16]
429    pmaddwd          m6, [rsp+ 5*16]
430    paddd            m4, m6
431    paddd            m0, m4
432
433    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
434    pxor             m7, m7
435    pcmpgtb          m7, m2
436    punpckhbw        m5, m2, m7
437    punpcklbw        m2, m7
438    palignr          m7, m3, m1, 10
439    palignr          m3, m1, 12
440    psrldq           m1, m2, 2
441    punpcklwd        m7, m3
442    punpcklwd        m3, m2, m1
443    pmaddwd          m7, m8
444    pmaddwd          m3, m9
445    paddd            m7, m3
446    paddd            m0, m7
447
448    psrldq           m6, m2, 4
449    psrldq           m1, m2, 6
450    psrldq           m3, m2, 8
451    palignr          m4, m5, m2, 10
452    palignr          m5, m5, m2, 12
453
454    punpcklwd        m6, m1
455    punpcklwd        m3, m4
456    punpcklwd        m5, m14
457    pmaddwd          m6, m10
458    pmaddwd          m3, m11
459    pmaddwd          m5, m12
460    paddd            m0, m6
461    paddd            m3, m5
462    paddd            m0, m3
463
464    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
465.x_loop_ar3_inner:
466    pxor             m5, m5
467    pcmpgtb          m5, m1
468    punpcklbw        m2, m1, m5
469    pmaddwd          m2, m13
470    pshufd           m3, m2, q1111
471    paddd            m2, m3                 ; left+cur
472    paddd            m2, m0                 ; add top
473    psrldq           m0, 4
474    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
475    ; don't packssdw since we only care about one value
476    packsswb         m2, m2
477    pslldq           m2, 3
478    pand             m2, m15
479    pandn            m3, m15, m1
480    por              m1, m2, m3
481    movd    [bufq+xq-3], m1
482    psrldq           m1, 1
483    inc              xq
484    jz .x_loop_ar3_end
485    test             xq, 3
486    jnz .x_loop_ar3_inner
487    jmp .x_loop_ar3
488
489.x_loop_ar3_end:
490    add            bufq, 82
491    dec              hd
492    jg .y_loop_ar3
493    RET
494
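; Chroma grain template generation. %1 is the layout name, %2/%3 are the
; horizontal/vertical subsampling flags. The chroma seed is the frame seed
; xor'ed with a per-plane constant (pw_seed_xor), and the chroma AR filters
; additionally mix in the co-located (averaged when subsampled) luma grain
; through the extra luma coefficient stored after the spatial ones.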
495%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
496INIT_XMM ssse3
497cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
498    movifnidn        r2, r2mp
499    movifnidn        r3, r3mp
500    LEA              r4, $$
501%define base r4-$$
502    movq             m1, [base+rnd_next_upperbit_mask]
503    movq             m4, [base+mul_bits]
504    movq             m7, [base+hmul_bits]
505    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
506    movd             m6, [base+round+r5*2]
507    mova             m5, [base+pb_mask]
508    movd             m0, [fg_dataq+FGData.seed]
509    movd             m2, [base+pw_seed_xor+uvq*4]
510    pxor             m0, m2
511    pshuflw          m6, m6, q0000
512    pshuflw          m0, m0, q0000
513    lea              r6, [base+gaussian_sequence]
514%if %2
515%if ARCH_X86_64
516    mov             r7d, 73-35*%3
517%else
518    mov            r3mp, 73-35*%3
519%endif
520    add            bufq, 44
521.loop_y:
522    mov              r5, -44
523.loop_x:
524%else
525    mov              r5, -82*73
526    sub            bufq, r5
527.loop:
528%endif
529    pand             m2, m0, m1
530    psrlw            m3, m2, 10
531    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
532    pmullw           m2, m4             ; bits 0x0f00 are set
533    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
534    psllq            m2, m3, 30
535    por              m3, m2
536    psllq            m2, m3, 15
537    por              m3, m2             ; aggregate each bit into next seed's high bit
538    pmulhuw          m2, m0, m7
539    por              m2, m3             ; 4 next output seeds
540    pshuflw          m0, m2, q3333
541    psrlw            m2, 5
542%if ARCH_X86_64
543    movd            r9d, m2
544    pshuflw          m2, m2, q3232
545    movzx            r8, r9w
546    shr              r9, 16
547
548    movd             m3, [r6+r8*2]
549    pinsrw           m3, [r6+r9*2], 1
550
551    movd            r9d, m2
552    movzx            r8, r9w
553    shr              r9, 16
554
555    pinsrw           m3, [r6+r8*2], 2
556    pinsrw           m3, [r6+r9*2], 3
557%else
558    movd             r2, m2
559    pshuflw          m2, m2, q3232
560    movzx            r1, r2w
561    shr              r2, 16
562
563    movd             m3, [r6+r1*2]
564    pinsrw           m3, [r6+r2*2], 1
565
566    movd             r2, m2
567    movzx            r1, r2w
568    shr              r2, 16
569
570    pinsrw           m3, [r6+r1*2], 2
571    pinsrw           m3, [r6+r2*2], 3
572%endif
573    pmulhrsw         m3, m6
574    packsswb         m3, m3
575    movd      [bufq+r5], m3
576    add              r5, 4
577%if %2
578    jl .loop_x
579    add            bufq, 82
580%if ARCH_X86_64
581    dec             r7d
582%else
583    dec            r3mp
584%endif
585    jg .loop_y
586%else
587    jl .loop
588%endif
589
590%if ARCH_X86_32
591    mov              r2, r2mp
592%endif
593
594    ; auto-regression code
595    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
596    movsxd           r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
597    lea              r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
598    jmp              r5
599
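    ; AR(0): no spatial taps, so the only contribution is the luma term.
    ; When subsampled, the co-located luma grain is averaged via pmaddubsw
    ; against pb_1 plus a rounding pmulhrsw, then scaled by the single (luma)
    ; coefficient, rounded by ar_coeff_shift and added to the chroma grain.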
600.ar0:
601    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
602    movifnidn     bufyq, bufymp
603%if ARCH_X86_32
604%assign stack_offset_old stack_offset
605    ALLOC_STACK   -2*16
606%endif
607    imul            uvd, 28
608    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
609    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
610    movd             m4, [base+hmul_bits+shiftq*2]
611    DEFINE_ARGS buf, bufy, h, x
612    pxor             m0, m0
613    pcmpgtb          m0, m5
614    punpcklbw        m5, m0
615    movd             m7, [base+pb_1]
616%if %2
617    movd             m6, [base+hmul_bits+2+%3*2]
618%endif
619    pshuflw          m5, m5, q0000
620    pshuflw          m4, m4, q0000
621    pshufd           m7, m7, q0000
622%if %2
623    pshuflw          m6, m6, q0000
624%endif
625    punpcklqdq       m5, m5
626    punpcklqdq       m4, m4
627%if %2
628    punpcklqdq       m6, m6
629%endif
630    pcmpeqw          m1, m1
631    pslldq           m1, 12>>%2
632    SCRATCH           1, 8, 0
633    SCRATCH           4, 9, 1
634%if %2
635    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
636%else
637    sub            bufq, 82*70-3
638%endif
639    add           bufyq, 3+82*3
640    mov              hd, 70-35*%3
641.y_loop_ar0:
642    xor              xd, xd
643.x_loop_ar0:
644    ; first 32 pixels
645%if %2
646    movu             m1, [bufyq+xq*2]
647%if %3
648    movu             m2, [bufyq+xq*2+82]
649%endif
650    movu             m3, [bufyq+xq*2+16]
651%if %3
652    movu             m4, [bufyq+xq*2+82+16]
653%endif
654    pmaddubsw        m0, m7, m1
655%if %3
656    pmaddubsw        m1, m7, m2
657%endif
658    pmaddubsw        m2, m7, m3
659%if %3
660    pmaddubsw        m3, m7, m4
661    paddw            m0, m1
662    paddw            m2, m3
663%endif
664    pmulhrsw         m0, m6
665    pmulhrsw         m2, m6
666%else
667    movu             m0, [bufyq+xq]
668    pxor             m6, m6
669    pcmpgtb          m6, m0
670    punpckhbw        m2, m0, m6
671    punpcklbw        m0, m6
672%endif
673    pmullw           m0, m5
674    pmullw           m2, m5
675    pmulhrsw         m0, m9
676    pmulhrsw         m2, m9
677    movu             m1, [bufq+xq]
678    pxor             m4, m4
679    pcmpgtb          m4, m1
680    punpckhbw        m3, m1, m4
681%if %2
682    punpcklbw        m1, m4
683    paddw            m2, m3
684    paddw            m0, m1
685%else
686    punpcklbw        m6, m1, m4
687    paddw            m2, m3
688    paddw            m0, m6
689%endif
690    packsswb         m0, m2
691%if %2
692    movu      [bufq+xq], m0
693    add              xd, 16
694    cmp              xd, 32
695    jl .x_loop_ar0
696
697    ; last 6/12 pixels
698    movu             m1, [bufyq+xq*(1+%2)]
699%if %3
700    movu             m2, [bufyq+xq*2+82]
701%endif
702    pmaddubsw        m0, m7, m1
703%if %3
704    pmaddubsw        m1, m7, m2
705    paddw            m0, m1
706%endif
707    pmulhrsw         m0, m6
708    pmullw           m0, m5
709    pmulhrsw         m0, m9
710    movq             m1, [bufq+xq]
711    pxor             m4, m4
712    pcmpgtb          m4, m1
713    punpcklbw        m2, m1, m4
714    paddw            m0, m2
715    packsswb         m0, m0
716    pandn            m2, m8, m0
717    pand             m1, m8
718    por              m2, m1
719    movq      [bufq+xq], m2
720%else
721    add              xd, 16
722    cmp              xd, 80
723    je .y_loop_final_ar0
724    movu   [bufq+xq-16], m0
725    jmp .x_loop_ar0
726.y_loop_final_ar0:
727    pandn            m2, m8, m0
728    pand             m1, m8
729    por              m2, m1
730    movu   [bufq+xq-16], m2
731%endif
732
733    add            bufq, 82
734    add           bufyq, 82<<%3
735    dec              hd
736    jg .y_loop_ar0
737    RET
738
739.ar1:
740%if ARCH_X86_32
741%assign stack_offset stack_offset_old
742%assign stack_size_padded 0
743%xdefine rstk rsp
744%endif
745    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
746    imul            uvd, 28
747    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
748    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
749    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
750%if ARCH_X86_32
751    mov            r3mp, cf3d
752    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
753%elif WIN64
754    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
755    mov            bufq, r0
756%else
757    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
758%endif
759    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
760    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
761%if %2
762    movd             m7, [base+pb_1]
763    movd             m6, [base+hmul_bits+2+%3*2]
764%endif
765    psrldq           m4, 1
766%if ARCH_X86_32
767    DEFINE_ARGS buf, shift, val0, val3, min, max, x
768%elif WIN64
769    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
770%else
771    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
772%endif
773    pxor             m5, m5
774    punpcklwd        m3, m5
775%if %2
776    punpcklwd        m6, m6
777%endif
778    pcmpgtb          m5, m4
779    punpcklbw        m4, m5
780    pshufd           m5, m4, q1111
781    pshufd           m4, m4, q0000
782    pshufd           m3, m3, q0000
783%if %2
784    pshufd           m7, m7, q0000
785    pshufd           m6, m6, q0000
786    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
787%else
788    sub            bufq, 82*69+3
789%endif
790%if ARCH_X86_32
791    add            r1mp, 79+82*3
792    mov            r0mp, 70-35*%3
793%else
794    add           bufyq, 79+82*3
795    mov              hd, 70-35*%3
796%endif
797    mov            mind, -128
798    mov            maxd, 127
799.y_loop_ar1:
800    mov              xq, -(76>>%2)
801    movsx         val3d, byte [bufq+xq-1]
802.x_loop_ar1:
803%if %2
804%if ARCH_X86_32
805    mov              r2, r1mp
806    movq             m0, [r2+xq*2]
807%if %3
808    movq             m1, [r2+xq*2+82]
809%endif
810%else
811    movq             m0, [bufyq+xq*2]
812%if %3
813    movq             m1, [bufyq+xq*2+82]
814%endif
815%endif
816    pmaddubsw        m2, m7, m0
817%if %3
818    pmaddubsw        m0, m7, m1
819    paddw            m2, m0
820%endif
821    pmulhrsw         m2, m6
822%else
823%if ARCH_X86_32
824    mov              r2, r1mp
825    movd             m2, [r2+xq]
826%else
827    movd             m2, [bufyq+xq]
828%endif
829    pxor             m0, m0
830    pcmpgtb          m0, m2
831    punpcklbw        m2, m0
832%endif
833
834    movq             m0, [bufq+xq-82-1]     ; top/left
835    pxor             m1, m1
836    pcmpgtb          m1, m0
837    punpcklbw        m0, m1
838    psrldq           m1, m0, 4              ; top/right
839    punpcklwd        m1, m2
840    psrldq           m2, m0, 2              ; top
841    punpcklwd        m0, m2
842    pmaddwd          m0, m4
843    pmaddwd          m1, m5
844    paddd            m0, m1
845    paddd            m0, m3
846.x_loop_ar1_inner:
847    movd          val0d, m0
848    psrldq           m0, 4
849%if ARCH_X86_32
850    imul          val3d, r3mp
851%else
852    imul          val3d, cf3d
853%endif
854    add           val3d, val0d
855    sar           val3d, shiftb
856    movsx         val0d, byte [bufq+xq]
857    add           val3d, val0d
858    cmp           val3d, maxd
859    cmovns        val3d, maxd
860    cmp           val3d, mind
861    cmovs         val3d, mind
862    mov  byte [bufq+xq], val3b
863    ; keep val3d in-place as left for next x iteration
864    inc              xq
865    jz .x_loop_ar1_end
866    test             xq, 3
867    jnz .x_loop_ar1_inner
868    jmp .x_loop_ar1
869
870.x_loop_ar1_end:
871    add            bufq, 82
872%if ARCH_X86_32
873    add            r1mp, 82<<%3
874    dec            r0mp
875%else
876    add           bufyq, 82<<%3
877    dec              hd
878%endif
879    jg .y_loop_ar1
880    RET
881
882.ar2:
883%if ARCH_X86_32
884%assign stack_offset stack_offset_old
885%assign stack_size_padded 0
886%xdefine rstk rsp
887    ALLOC_STACK   -8*16
888%endif
889    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
890    movifnidn     bufyq, bufymp
891    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
892    imul            uvd, 28
893    movd             m7, [base+round_vals-12+shiftq*2]
894    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
895    pxor             m2, m2
896    pcmpgtb          m2, m0
897    punpckhbw        m1, m0, m2
898    punpcklbw        m0, m2
899    pinsrw           m1, [base+pw_1], 5
900    punpcklwd        m7, m7
901    pshufd           m7, m7, q0000
902    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
903    pshufd           m4, m1, q0000
904    pshufd           m5, m1, q1111
905    pshufd           m6, m1, q2222
906    pshufd           m3, m0, q3333
907    pshufd           m2, m0, q2222
908    pshufd           m1, m0, q1111
909    pshufd           m0, m0, q0000
910    SCRATCH           0, 8,  0
911    SCRATCH           1, 9,  1
912    SCRATCH           2, 10, 2
913    SCRATCH           3, 11, 3
914    SCRATCH           4, 12, 4
915    SCRATCH           5, 13, 5
916    SCRATCH           6, 14, 6
917    SCRATCH           7, 15, 7
918%if %2
919    movd             m7, [base+hmul_bits+2+%3*2]
920    movd             m6, [base+pb_1]
921    punpcklwd        m7, m7
922    pshufd           m6, m6, q0000
923    pshufd           m7, m7, q0000
924    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
925%else
926    sub            bufq, 82*69+3
927%endif
928    add           bufyq, 79+82*3
929    mov              hd, 70-35*%3
930.y_loop_ar2:
931    mov              xq, -(76>>%2)
932
933.x_loop_ar2:
934    pxor             m2, m2
935    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
936    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
937    pcmpgtb          m2, m0
938    punpckhbw        m1, m0, m2
939    punpcklbw        m0, m2
940    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
941    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
942    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
943    punpcklwd        m2, m0, m5
944    punpcklwd        m3, m4
945    pmaddwd          m2, m8
946    pmaddwd          m3, m11
947    paddd            m2, m3
948
949    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
950    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
951    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
952    punpcklwd        m4, m5
953    punpcklwd        m0, m1
954    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
955    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
956    punpcklwd        m3, m1
957    pmaddwd          m4, m9
958    pmaddwd          m0, m10
959    pmaddwd          m3, m12
960    paddd            m4, m0
961    paddd            m2, m3
962    paddd            m2, m4
963
964%if %2
965    movq             m1, [bufyq+xq*2]
966%if %3
967    movq             m3, [bufyq+xq*2+82]
968%endif
969    pmaddubsw        m0, m6, m1
970%if %3
971    pmaddubsw        m1, m6, m3
972    paddw            m0, m1
973%endif
974    pmulhrsw         m0, m7
975%else
976    movd             m0, [bufyq+xq]
977    pxor             m1, m1
978    pcmpgtb          m1, m0
979    punpcklbw        m0, m1
980%endif
981    punpcklwd        m0, m15
982    pmaddwd          m0, m14
983    paddd            m2, m0
984
985    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
986    pxor             m4, m4
987    movd             m5, [base+byte_blend+1]
988    punpcklbw        m5, m5
989.x_loop_ar2_inner:
990    pcmpgtb          m1, m4, m0
991    punpcklbw        m0, m1
992    pmaddwd          m3, m0, m13
993    paddd            m3, m2
994    psrldq           m2, 4                  ; shift top to next pixel
995    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
996    pslldq           m3, 4
997    pand             m3, m5
998    paddw            m0, m3
999    packsswb         m0, m0
1000    movd    [bufq+xq-2], m0
1001    psrldq           m0, 1
1002    inc              xq
1003    jz .x_loop_ar2_end
1004    test             xq, 3
1005    jnz .x_loop_ar2_inner
1006    jmp .x_loop_ar2
1007
1008.x_loop_ar2_end:
1009    add            bufq, 82
1010    add           bufyq, 82<<%3
1011    dec              hd
1012    jg .y_loop_ar2
1013    RET
1014
1015.ar3:
1016%if ARCH_X86_32
1017%assign stack_offset stack_offset_old
1018%assign stack_size_padded 0
1019%xdefine rstk rsp
1020%endif
1021    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
1022    movifnidn     bufyq, bufymp
1023%if ARCH_X86_32
1024    ALLOC_STACK  -15*16
1025%else
1026    SUB             rsp, 16*7
1027%assign stack_size_padded (stack_size_padded+16*7)
1028%assign stack_size (stack_size+16*7)
1029%endif
1030    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
1031    imul            uvd, 28
1032
1033    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
1034    pxor             m3, m3
1035    pcmpgtb          m3, m0
1036    punpckhbw        m1, m0, m3
1037    punpcklbw        m0, m3
1038    pshufd           m2, m0, q1111
1039    pshufd           m3, m0, q2222
1040    pshufd           m4, m0, q3333
1041    pshufd           m0, m0, q0000
1042    pshufd           m5, m1, q1111
1043    pshufd           m6, m1, q2222
1044    pshufd           m7, m1, q3333
1045    pshufd           m1, m1, q0000
1046    mova    [rsp+ 0*16], m0
1047    mova    [rsp+ 1*16], m2
1048    mova    [rsp+ 2*16], m3
1049    mova    [rsp+ 3*16], m4
1050    mova    [rsp+ 4*16], m1
1051    mova    [rsp+ 5*16], m5
1052    mova    [rsp+ 6*16], m6
1053    SCRATCH           7, 8, 7
1054
1055    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
1056    pxor             m4, m4
1057    pcmpgtb          m4, m2
1058    punpckhbw        m5, m2, m4
1059    punpcklbw        m2, m4
1060    pshufd           m4, m2, q3232
1061    punpcklwd        m3, m4, m5
1062    pshuflw          m5, m4, q3321
1063    pshufd           m4, m3, q0000
1064    pshufd           m3, m2, q1111
1065    pshufd           m2, m2, q0000
1066    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
1067    SCRATCH           2, 9,  8
1068    SCRATCH           3, 10, 9
1069    SCRATCH           4, 11, 10
1070    SCRATCH           5, 12, 11
1071
1072    movd             m2, [base+round_vals-12+shiftq*2]
1073%if %2
1074    movd             m1, [base+pb_1]
1075    movd             m3, [base+hmul_bits+2+%3*2]
1076%endif
1077    pxor             m0, m0
1078    punpcklwd        m2, m0
1079%if %2
1080    punpcklwd        m3, m3
1081%endif
1082    pshufd           m2, m2, q0000
1083%if %2
1084    pshufd           m1, m1, q0000
1085    pshufd           m3, m3, q0000
1086    SCRATCH           1, 13, 12
1087%endif
1088    SCRATCH           2, 14, 13
1089%if %2
1090    SCRATCH           3, 15, 14
1091%endif
1092
1093    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
1094%if %2
1095    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
1096%else
1097    sub            bufq, 82*69+3
1098%endif
1099    add           bufyq, 79+82*3
1100    mov              hd, 70-35*%3
1101.y_loop_ar3:
1102    mov              xq, -(76>>%2)
1103
1104.x_loop_ar3:
1105    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
1106    pxor             m4, m4
1107    pcmpgtb          m4, m0
1108    punpckhbw        m3, m0, m4
1109    punpcklbw        m0, m4
1110
1111    psrldq           m5, m0, 2
1112    psrldq           m6, m0, 4
1113    psrldq           m7, m0, 6
1114    punpcklwd        m4, m0, m5
1115    punpcklwd        m6, m7
1116    pmaddwd          m4, [rsp+ 0*16]
1117    pmaddwd          m6, [rsp+ 1*16]
1118    paddd            m4, m6
1119
1120    palignr          m2, m3, m0, 10
1121    palignr          m3, m0, 12
1122    psrldq           m0, 8
1123
1124    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
1125    pxor             m6, m6
1126    pcmpgtb          m6, m1
1127    punpckhbw        m5, m1, m6
1128    punpcklbw        m1, m6
1129
1130    punpcklwd        m0, m2
1131    punpcklwd        m3, m1
1132    pmaddwd          m0, [rsp+ 2*16]
1133    pmaddwd          m3, [rsp+ 3*16]
1134    paddd            m0, m3
1135    paddd            m0, m4
1136
1137    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
1138    pxor             m7, m7
1139    pcmpgtb          m7, m2
1140    punpckhbw        m6, m2, m7
1141    punpcklbw        m2, m7
1142
1143    palignr          m3, m5, m1, 10
1144    palignr          m5, m1, 12
1145    psrldq           m4, m2, 2
1146
1147    punpcklwd        m3, m5
1148    punpcklwd        m5, m2, m4
1149    pmaddwd          m3, [rsp+ 6*16]
1150    pmaddwd          m5, m8
1151    paddd            m3, m5
1152    paddd            m0, m3
1153
1154    psrldq           m3, m1, 2
1155    psrldq           m4, m1, 4
1156    psrldq           m5, m1, 6
1157    psrldq           m1, 8
1158
1159    punpcklwd        m3, m4
1160    punpcklwd        m5, m1
1161    pmaddwd          m3, [rsp+ 4*16]
1162    pmaddwd          m5, [rsp+ 5*16]
1163    paddd            m3, m5
1164    paddd            m0, m3
1165
1166%if %2
1167    movq             m1, [bufyq+xq*2]
1168%if %3
1169    movq             m3, [bufyq+xq*2+82]
1170%endif
1171    pmaddubsw        m7, m13, m1
1172%if %3
1173    pmaddubsw        m5, m13, m3
1174    paddw            m7, m5
1175%endif
1176    pmulhrsw         m7, m15
1177%else
1178    movd             m7, [bufyq+xq]
1179    pxor             m1, m1
1180    pcmpgtb          m1, m7
1181    punpcklbw        m7, m1
1182%endif
1183
1184    psrldq           m1, m2, 4
1185    psrldq           m3, m2, 6
1186    palignr          m4, m6, m2, 10
1187    palignr          m6, m2, 12
1188    psrldq           m2, 8
1189
1190    punpcklwd        m1, m3
1191    punpcklwd        m2, m4
1192    punpcklwd        m6, m7
1193    pmaddwd          m1, m9
1194    pmaddwd          m2, m10
1195    pmaddwd          m6, m11
1196    paddd            m1, m2
1197    paddd            m0, m6
1198    paddd            m0, m1
1199    paddd            m0, m14
1200
1201    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
1202    pxor             m4, m4
1203    movd             m5, [base+byte_blend]
1204.x_loop_ar3_inner:
1205    pcmpgtb          m2, m4, m1
1206    punpcklbw        m3, m1, m2
1207    pmaddwd          m2, m3, m12
1208    pshufd           m3, m2, q1111
1209    paddd            m2, m3                 ; left+cur
1210    paddd            m2, m0                 ; add top
1211    psrldq           m0, 4
1212    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1213    ; don't packssdw, we only care about one value
1214    packsswb         m2, m2
1215    pandn            m3, m5, m1
1216    pslld            m2, 24
1217    pand             m2, m5
1218    por              m1, m2, m3
1219    movd    [bufq+xq-3], m1
1220    psrldq           m1, 1
1221    inc              xq
1222    jz .x_loop_ar3_end
1223    test             xq, 3
1224    jnz .x_loop_ar3_inner
1225    jmp .x_loop_ar3
1226
1227.x_loop_ar3_end:
1228    add            bufq, 82
1229    add           bufyq, 82<<%3
1230    dec              hd
1231    jg .y_loop_ar3
1232    RET
1233%endmacro
1234
1235generate_grain_uv_fn 420, 1, 1
1236generate_grain_uv_fn 422, 1, 0
1237generate_grain_uv_fn 444, 0, 0
1238
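; Emulate a word gather (SSSE3 has no gather instruction): the word lanes of
; %2 are extracted through two GPRs and used as byte offsets into table %3,
; with pinsrw collecting the loaded words in %1. Callers pass scaling-1 and
; shift right by 8 afterwards to isolate the byte they want.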
1239%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
1240%assign %%idx 0
1241%define %%tmp %2
1242%if %0 == 6
1243%define %%tmp %6
1244%endif
1245%rep 4
1246%if %%idx == 0
1247    movd        %5 %+ d, %2
1248    pshuflw       %%tmp, %2, q3232
1249%else
1250    movd        %5 %+ d, %%tmp
1251%if %%idx == 2
1252    punpckhqdq    %%tmp, %%tmp
1253%elif %%idx == 4
1254    psrlq         %%tmp, 32
1255%endif
1256%endif
1257    movzx       %4 %+ d, %5 %+ w
1258    shr         %5 %+ d, 16
1259
1260%if %%idx == 0
1261    movd             %1, [%3+%4]
1262%else
1263    pinsrw           %1, [%3+%4], %%idx + 0
1264%endif
1265    pinsrw           %1, [%3+%5], %%idx + 1
1266%assign %%idx %%idx+2
1267%endrep
1268%endmacro
1269
1270INIT_XMM ssse3
1271; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
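; For each block, a pseudo-random (offx, offy) into the 82x73 grain template
; is derived from the running seed; then, row by row,
; noise = round2(scaling[src] * grain, scaling_shift) is added to the source
; pixels and clamped to the (optionally restricted) pixel range. r8m packs the
; overlap state: bit 0 = left overlap, bit 1 = top overlap, bit 2 tracks which
; 16-pixel half of the 32x32 block is being processed.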
1272%if ARCH_X86_32
1273%if STACK_ALIGNMENT < mmsize
1274cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
1275        dst, src, scaling, unused1, fg_data, picptr, unused2
1276    ; copy stack arguments to new position post-alignment, so that we
1277    ; don't have to keep the old stack location in a separate register
1278    mov              r0, r0m
1279    mov              r1, r2m
1280    mov              r2, r4m
1281    mov              r3, r6m
1282    mov              r4, r7m
1283    mov              r5, r8m
1284
1285    mov [rsp+5*mmsize+ 4*gprsize], r0
1286    mov [rsp+5*mmsize+ 6*gprsize], r1
1287    mov [rsp+5*mmsize+ 8*gprsize], r2
1288    mov [rsp+5*mmsize+10*gprsize], r3
1289    mov [rsp+5*mmsize+11*gprsize], r4
1290    mov [rsp+5*mmsize+12*gprsize], r5
1291%else
1292cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
1293        dst, src, scaling, unused1, fg_data, picptr, unused2
1294%endif
1295    mov            srcq, srcm
1296    mov        fg_dataq, r3m
1297    mov        scalingq, r5m
1298%if STACK_ALIGNMENT < mmsize
1299%define r0m [rsp+5*mmsize+ 4*gprsize]
1300%define r1m [rsp+5*mmsize+ 5*gprsize]
1301%define r2m [rsp+5*mmsize+ 6*gprsize]
1302%define r3m [rsp+5*mmsize+ 7*gprsize]
1303%define r4m [rsp+5*mmsize+ 8*gprsize]
1304%define r5m [rsp+5*mmsize+ 9*gprsize]
1305%define r6m [rsp+5*mmsize+10*gprsize]
1306%define r7m [rsp+5*mmsize+11*gprsize]
1307%define r8m [rsp+5*mmsize+12*gprsize]
1308%endif
1309    LEA              r5, pb_mask
1310%define base r5-pb_mask
1311    mov             r5m, picptrq
1312%else
1313cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1314    lea              r7, [pb_mask]
1315%define base r7-pb_mask
1316%endif
1317    mov             r6d, [fg_dataq+FGData.scaling_shift]
1318    movd             m3, [base+mul_bits+r6*2-14]
1319    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1320    movd             m4, [base+max+r6*4]
1321    movd             m5, [base+min+r6*2]
1322    punpcklwd        m3, m3
1323    punpcklwd        m4, m4
1324    punpcklwd        m5, m5
1325    pshufd           m3, m3, q0000
1326    pshufd           m4, m4, q0000
1327    pshufd           m5, m5, q0000
1328    SCRATCH           3, 11, 0
1329    SCRATCH           4, 12, 1
1330    SCRATCH           5, 13, 2
1331
1332%if ARCH_X86_32
1333    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1334%else
1335    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1336%endif
1337
1338    mov            sbyd, r8m
1339    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
1340    test       overlapd, overlapd
1341    jz .no_vertical_overlap
1342    mova             m6, [base+pw_1024]
1343    mova             m7, [base+pb_27_17_17_27]
1344    SCRATCH           6, 14, 3
1345    SCRATCH           7, 15, 4
1346    test           sbyd, sbyd
1347    jnz .vertical_overlap
1348    ; fall-through
1349
1350.no_vertical_overlap:
1351    mov             r8m, overlapd
1352%if ARCH_X86_32
1353    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1354    imul           seed, (173 << 24) | 37
1355%else
1356    imul           seed, sbyd, (173 << 24) | 37
1357%endif
1358    add            seed, (105 << 24) | 178
1359    rol            seed, 8
1360    movzx          seed, seew
1361    xor            seed, [fg_dataq+FGData.seed]
1362
1363%if ARCH_X86_32
1364    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1365
1366    mov             r3m, seed
1367    mov              wq, r4m
1368%else
1369    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1370                unused1, unused2, see, unused3
1371%endif
1372
1373    lea        src_bakq, [srcq+wq]
1374    neg              wq
1375    sub           dstmp, srcq
1376%if ARCH_X86_32
1377    mov             r1m, src_bakq
1378    mov             r4m, wq
1379    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1380%endif
1381
1382.loop_x:
1383%if ARCH_X86_32
1384    mov            seed, r3m
1385%endif
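    ; Advance the per-block seed. The grain LFSR feedback bit is the xor of
    ; seed bits 0, 1, 3 and 12: or'ing with 0xEFF4 leaves exactly those bits
    ; variable, so 'test seeb, seeh' yields their combined parity in PF, and
    ; cmovp selects between seed>>1 and (seed>>1)|0x8000. The 4-bit grain
    ; offsets offx/offy are then extracted from the updated seed.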
1386    mov             r6d, seed
1387    or             seed, 0xEFF4
1388    shr             r6d, 1
1389    test           seeb, seeh
1390    lea            seed, [r6+0x8000]
1391    cmovp          seed, r6d                ; updated seed
1392%if ARCH_X86_32
1393    mov             r3m, seed
1394
1395    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1396
1397    mov           offxd, offyd
1398%else
1399    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1400                offx, offy, see, unused
1401
1402    mov           offyd, seed
1403    mov           offxd, seed
1404%endif
1405    ror           offyd, 8
1406    shr           offxd, 12
1407    and           offyd, 0xf
1408    imul          offyd, 164
1409    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1410
1411%if ARCH_X86_32
1412    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1413    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1414    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1415%else
1416    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1417                h, offxy, see, unused
1418%endif
1419
1420.loop_x_odd:
1421    mov              hd, r7m
1422    mov      grain_lutq, grain_lutmp
1423.loop_y:
1424    ; src
1425    mova             m0, [srcq]
1426    pxor             m2, m2
1427    punpckhbw        m1, m0, m2
1428    punpcklbw        m0, m2                 ; m0-1: src as word
1429
1430    ; scaling[src]
1431%if ARCH_X86_32
1432    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1433    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1434%else
1435    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1436    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1437%endif
1438    REPX {psrlw x, 8}, m4, m5
1439
1440    ; grain = grain_lut[offy+y][offx+x]
1441    movu             m3, [grain_lutq+offxyq]
1442    pcmpgtb          m7, m2, m3
1443    punpcklbw        m2, m3, m7
1444    punpckhbw        m3, m7
1445
1446    ; noise = round2(scaling[src] * grain, scaling_shift)
1447    pmullw           m2, m4
1448    pmullw           m3, m5
1449    pmulhrsw         m2, m11
1450    pmulhrsw         m3, m11
1451
1452    ; dst = clip_pixel(src, noise)
1453    paddw            m0, m2
1454    paddw            m1, m3
1455    pmaxsw           m0, m13
1456    pmaxsw           m1, m13
1457    pminsw           m0, m12
1458    pminsw           m1, m12
1459    packuswb         m0, m1
1460    movifnidn      dstq, dstmp
1461    mova    [dstq+srcq], m0
1462
1463    add            srcq, r2mp
1464    add      grain_lutq, 82
1465    dec              hd
1466    jg .loop_y
1467
1468%if ARCH_X86_32
1469    add            r4mp, 16
1470%else
1471    add              wq, 16
1472%endif
1473    jge .end
1474%if ARCH_X86_32
1475    mov            srcq, r1mp
1476    add            srcq, r4mp
1477%else
1478    lea            srcq, [src_bakq+wq]
1479%endif
1480    btc       dword r8m, 2
1481    jc .next_blk
1482
1483    add          offxyd, 16
1484    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
1485    jz .loop_x_odd
1486
1487%if ARCH_X86_32
1488    add dword [rsp+5*mmsize+1*gprsize], 16
1489%else
1490    add            r11d, 16             ; top_offxyd
1491%endif
1492    jnz .loop_x_odd_v_overlap
1493
1494.next_blk:
1495    test      dword r8m, 1
1496    jz .loop_x
1497
1498    test      dword r8m, 2
1499    jnz .loop_x_hv_overlap
1500
1501    ; horizontal overlap (without vertical overlap)
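    ; Blend the first two grain columns of this block with the continuation of
    ; the previous block's grain (weights 27:17 and 17:27 from pb_27_17_17_27,
    ; then >>5 with rounding via pw_1024) before the scaling step.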
1502.loop_x_h_overlap:
1503%if ARCH_X86_32
1504    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1505    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1506    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
1507
1508    add          offxyd, 16                 ; left_offxyd
1509    mov [rsp+5*mmsize+0*gprsize], offxyd
1510
1511    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1512
1513    mov            seed, r3m
1514%else
1515    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1516                offx, offy, see, left_offxy
1517
1518    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1519%endif
1520
1521    mov             r6d, seed
1522    or             seed, 0xEFF4
1523    shr             r6d, 1
1524    test           seeb, seeh
1525    lea            seed, [r6+0x8000]
1526    cmovp          seed, r6d                ; updated seed
1527
1528%if ARCH_X86_32
1529    mov             r3m, seed
1530
1531    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1532
1533    mov           offxd, offyd
1534%else
1535    mov           offyd, seed
1536    mov           offxd, seed
1537%endif
1538    ror           offyd, 8
1539    shr           offxd, 12
1540    and           offyd, 0xf
1541    imul          offyd, 164
1542    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1543
1544%if ARCH_X86_32
1545    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1546%else
1547    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1548                h, offxy, see, left_offxy
1549%endif
1550
1551    mov              hd, r7m
1552    mov      grain_lutq, grain_lutmp
1553.loop_y_h_overlap:
1554    ; src
1555    mova             m0, [srcq]
1556    pxor             m2, m2
1557    punpckhbw        m1, m0, m2
1558    punpcklbw        m0, m2                 ; m0-1: src as word
1559
1560    ; scaling[src]
1561%if ARCH_X86_32
1562    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1563    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1564%else
1565    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1566    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1567%endif
1568    REPX {psrlw x, 8}, m4, m5
1569
1570    ; grain = grain_lut[offy+y][offx+x]
1571    movu             m3, [grain_lutq+offxyq]
1572%if ARCH_X86_32
1573    mov              r5, [rsp+5*mmsize+0*gprsize]
1574    movd             m7, [grain_lutq+r5]
1575%else
1576    movd             m7, [grain_lutq+left_offxyq]
1577%endif
1578    punpcklbw        m7, m3
1579    pmaddubsw        m6, m15, m7
1580    pmulhrsw         m6, m14
1581    packsswb         m6, m6
1582    shufps           m6, m3, q3210
1583    pcmpgtb          m2, m6
1584    punpcklbw        m7, m6, m2
1585    punpckhbw        m6, m2
1586
1587    ; noise = round2(scaling[src] * grain, scaling_shift)
1588    pmullw           m7, m4
1589    pmullw           m6, m5
1590    pmulhrsw         m7, m11
1591    pmulhrsw         m6, m11
1592
1593    ; dst = clip_pixel(src, noise)
1594    paddw            m0, m7
1595    paddw            m1, m6
1596    pmaxsw           m0, m13
1597    pmaxsw           m1, m13
1598    pminsw           m0, m12
1599    pminsw           m1, m12
1600    packuswb         m0, m1
1601    movifnidn      dstq, dstmp
1602    mova    [dstq+srcq], m0
1603
1604    add            srcq, r2mp
1605    add      grain_lutq, 82
1606    dec              hd
1607    jg .loop_y_h_overlap
1608
1609%if ARCH_X86_32
1610    add            r4mp, 16
1611%else
1612    add              wq, 16
1613%endif
1614    jge .end
1615%if ARCH_X86_32
1616    mov            srcq, r1m
1617    add            srcq, r4m
1618%else
1619    lea            srcq, [src_bakq+wq]
1620%endif
1621    xor       dword r8m, 4
1622    add          offxyd, 16
1623
1624    ; since this half-block had left-overlap, the next does not
1625    test      dword r8m, 2              ; have_top_overlap
1626    jz .loop_x_odd
1627%if ARCH_X86_32
1628    add dword [rsp+5*mmsize+1*gprsize], 16
1629%else
1630    add            r11d, 16             ; top_offxyd
1631%endif
1632    jmp .loop_x_odd_v_overlap
1633
1634.end:
1635    RET
1636
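    ; When this is not the first superblock row, two seeds are kept packed as
    ; (cur_seed << 16) | top_seed, and the first two grain rows of each block
    ; are blended with the block above using the pb_27_17 weights for row 0
    ; and pb_17_27 for row 1 before falling back to the normal path.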
1637.vertical_overlap:
1638%if ARCH_X86_32
1639    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1640%else
1641    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
1642%endif
1643
1644    or         overlapd, 2                  ; top_overlap: overlap & 2
1645    mov             r8m, overlapd
1646    movzx          sbyd, sbyb
1647%if ARCH_X86_32
1648    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1649    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
1650%else
1651    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1652%endif
1653    imul           tmpd, sbyd, 173 * 0x00010001
1654    imul           sbyd, 37 * 0x01000100
1655    add            tmpd, (105 << 16) | 188
1656    add            sbyd, (178 << 24) | (141 << 8)
1657    and            tmpd, 0x00ff00ff
1658    and            sbyd, 0xff00ff00
1659    xor            seed, tmpd
1660%if ARCH_X86_32
1661    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
1662
1663    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1664
1665    mov             r3m, seed
1666    mov              wq, r4m
1667%else
1668    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1669
1670    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1671                tmp, unused2, see, unused3
1672%endif
1673
1674    lea        src_bakq, [srcq+wq]
1675    neg              wq
1676    sub           dstmp, srcq
1677%if ARCH_X86_32
1678    mov             r1m, src_bakq
1679    mov             r4m, wq
1680    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
1681%endif
1682
1683.loop_x_v_overlap:
1684%if ARCH_X86_32
1685    mov            seed, r3m
1686%endif
    ; we assume from the block above that bits 8-15 of tmpd are zeroed,
    ; because of the 'and tmpd, 0x00ff00ff' above
1689    mov             r6d, seed
1690    or             seed, 0xeff4eff4
1691    test           seeb, seeh
1692    setp           tmpb                     ; parity of top_seed
1693    shr            seed, 16
1694    shl            tmpd, 16
1695    test           seeb, seeh
1696    setp           tmpb                     ; parity of cur_seed
1697    or              r6d, 0x00010001
1698    xor            tmpd, r6d
1699    mov            seed, tmpd
1700    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1701
1702%if ARCH_X86_32
1703    mov             r3m, seed
1704
1705    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1706
1707    mov           offxd, offyd
1708%else
1709    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1710                offx, offy, see, unused, top_offxy
1711
1712    mov           offyd, seed
1713    mov           offxd, seed
1714%endif
1715
1716    ror           offyd, 8
1717    ror           offxd, 12
1718    and           offyd, 0xf000f
1719    and           offxd, 0xf000f
1720    imul          offyd, 164
1721    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1722    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1723
1724%if ARCH_X86_32
1725    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1726%else
1727    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1728                h, offxy, see, unused, top_offxy
1729%endif
1730
1731    movzx    top_offxyd, offxyw
1732%if ARCH_X86_32
1733    mov [rsp+5*mmsize+1*gprsize], top_offxyd
1734
1735    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1736%endif
1737    shr          offxyd, 16
1738
1739.loop_x_odd_v_overlap:
1740%if ARCH_X86_32
1741    mov              r5, r5m
1742    lea              r5, [base+pb_27_17]
1743    mov [rsp+5*mmsize+12], r5
1744%else
1745    mova             m8, [pb_27_17]
1746%endif
1747    mov              hd, r7m
1748    mov      grain_lutq, grain_lutmp
1749.loop_y_v_overlap:
1750    ; src
1751    mova             m0, [srcq]
1752    pxor             m2, m2
1753    punpckhbw        m1, m0, m2
1754    punpcklbw        m0, m2                 ; m0-1: src as word
1755
1756    ; scaling[src]
1757%if ARCH_X86_32
1758    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1759    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1760%else
1761    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1762    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1763%endif
1764    REPX {psrlw x, 8}, m4, m5
1765
1766    ; grain = grain_lut[offy+y][offx+x]
1767    movu             m3, [grain_lutq+offxyq]
1768%if ARCH_X86_32
1769    mov              r5, [rsp+5*mmsize+1*gprsize]
1770    movu             m7, [grain_lutq+r5]
1771%else
1772    movu             m7, [grain_lutq+top_offxyq]
1773%endif
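    ; vertical overlap blend: top and current grain bytes are interleaved, and
    ; pmaddubsw with the {27,17} byte pairs of pb_27_17 gives 27*top + 17*cur;
    ; pmulhrsw with m14 (expected to hold pw_1024 from the setup code) turns
    ; that into round2(27*top + 17*cur, 5); the second overlap row reuses this
    ; path with pb_17_27, i.e. {17,27}, selected further down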
1774    punpckhbw        m6, m7, m3
1775    punpcklbw        m7, m3
1776%if ARCH_X86_32
1777    mov              r5, [rsp+5*mmsize+12]
1778    pmaddubsw        m3, [r5], m6
1779    pmaddubsw        m6, [r5], m7
1780%else
1781    pmaddubsw        m3, m8, m6
1782    pmaddubsw        m6, m8, m7
1783%endif
1784    pmulhrsw         m3, m14
1785    pmulhrsw         m6, m14
1786    packsswb         m6, m3
1787    pcmpgtb          m7, m2, m6
1788    punpcklbw        m2, m6, m7
1789    punpckhbw        m6, m7
1790
1791    ; noise = round2(scaling[src] * grain, scaling_shift)
1792    pmullw           m2, m4
1793    pmullw           m6, m5
1794    pmulhrsw         m2, m11
1795    pmulhrsw         m6, m11
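    ; m11 is assumed to hold 2^(15 - scaling_shift) (from the mul_bits table,
    ; loaded during setup), so pmullw + pmulhrsw amount to
    ; round2(scaling * grain, scaling_shift); the products stay within int16
    ; since |grain| <= 128 and scaling <= 255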
1796
1797    ; dst = clip_pixel(src, noise)
1798    paddw            m0, m2
1799    paddw            m1, m6
1800    pmaxsw           m0, m13
1801    pmaxsw           m1, m13
1802    pminsw           m0, m12
1803    pminsw           m1, m12
1804    packuswb         m0, m1
1805    movifnidn      dstq, dstmp
1806    mova    [dstq+srcq], m0
1807
1808%if ARCH_X86_32
1809    add dword [rsp+5*mmsize+12], mmsize
1810%else
1811    mova             m8, [pb_17_27]
1812%endif
1813    add            srcq, r2mp
1814    add      grain_lutq, 82
1815    dec              hw
1816    jz .end_y_v_overlap
1817    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1818    ; remaining (up to) 30 lines
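    ; (btc tests and flips bit 16 of hd, just above the 16-bit row count in hw:
    ; on the first pass the bit is clear, so CF=0 and one more overlap row runs
    ; with the pb_17_27 weights selected just above; on the second pass CF=1
    ; and we drop into the regular per-row loop)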
1819    btc              hd, 16
1820    jnc .loop_y_v_overlap
1821    jmp .loop_y
1822
1823.end_y_v_overlap:
1824%if ARCH_X86_32
1825    add            r4mp, 16
1826%else
1827    add              wq, 16
1828%endif
1829    jge .end_hv
1830%if ARCH_X86_32
1831    mov            srcq, r1mp
1832    add            srcq, r4mp
1833%else
1834    lea            srcq, [src_bakq+wq]
1835%endif
1836    btc       dword r8m, 2
1837    jc .loop_x_hv_overlap
1838    add          offxyd, 16
1839%if ARCH_X86_32
1840    add dword [rsp+5*mmsize+1*gprsize], 16
1841%else
1842    add      top_offxyd, 16
1843%endif
1844    jmp .loop_x_odd_v_overlap
1845
1846.loop_x_hv_overlap:
1847%if ARCH_X86_32
1848    mov              r5, r5m
1849    lea              r5, [base+pb_27_17]
1850    mov [rsp+5*mmsize+12], r5
1851
1852    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
1853
1854    mov              r5, [rsp+5*mmsize+1*gprsize]
1855    mov              r4, offxyd
1856    add              r5, 16
1857    add              r4, 16
1858    mov [rsp+5*mmsize+2*gprsize], r5        ; topleft_offxy
1859    mov [rsp+5*mmsize+0*gprsize], r4        ; left_offxy
1860
1861    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
1862
1863    xor            tmpd, tmpd
1864    mov            seed, r3m
1865%else
1866    mova             m8, [pb_27_17]
1867
1868    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1869                tmp, unused2, see, unused3
1870
    ; we assume from the block above that bits 8-15 of tmpd are zeroed
1872%endif
1873    mov             r6d, seed
1874    or             seed, 0xeff4eff4
1875    test           seeb, seeh
1876    setp           tmpb                     ; parity of top_seed
1877    shr            seed, 16
1878    shl            tmpd, 16
1879    test           seeb, seeh
1880    setp           tmpb                     ; parity of cur_seed
1881    or              r6d, 0x00010001
1882    xor            tmpd, r6d
1883    mov            seed, tmpd
1884    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1885
1886%if ARCH_X86_32
1887    mov             r3m, seed
1888
1889    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1890
1891    mov           offxd, offyd
1892%else
1893    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1894                offx, offy, see, left_offxy, top_offxy, topleft_offxy
1895
1896    lea  topleft_offxyq, [top_offxyq+16]
1897    lea     left_offxyq, [offyq+16]
1898    mov           offyd, seed
1899    mov           offxd, seed
1900%endif
1901    ror           offyd, 8
1902    ror           offxd, 12
1903    and           offyd, 0xf000f
1904    and           offxd, 0xf000f
1905    imul          offyd, 164
1906    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1907    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1908
1909%if ARCH_X86_32
1910    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1911
1912    movzx            r5, offxyw             ; top_offxy
1913    mov [rsp+5*mmsize+1*gprsize], r5
1914%else
1915    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1916                h, offxy, see, left_offxy, top_offxy, topleft_offxy
1917
1918    movzx    top_offxyd, offxyw
1919%endif
1920    shr          offxyd, 16
1921
1922    mov              hd, r7m
1923    mov      grain_lutq, grain_lutmp
1924.loop_y_hv_overlap:
1925    ; grain = grain_lut[offy+y][offx+x]
1926    movu             m3, [grain_lutq+offxyq]
1927%if ARCH_X86_32
1928    mov              r5, [rsp+5*mmsize+1*gprsize]   ; top_offxy
1929    mov              r0, [rsp+5*mmsize+0*gprsize]   ; left_offxy
1930    movu             m6, [grain_lutq+r5]
1931    mov              r5, [rsp+5*mmsize+2*gprsize]   ; topleft_offxy
1932    movd             m4, [grain_lutq+r0]
1933    movd             m7, [grain_lutq+r5]
1934%else
1935    movu             m6, [grain_lutq+top_offxyq]
1936    movd             m4, [grain_lutq+left_offxyq]
1937    movd             m7, [grain_lutq+topleft_offxyq]
1938%endif
1939    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
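    ; (i.e. each corner pixel becomes a vertical blend of two horizontal
    ; blends rather than a single three-way mix)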
1940    punpcklbw        m4, m3
1941    punpcklbw        m7, m6
1942    pmaddubsw        m2, m15, m4
1943    pmaddubsw        m4, m15, m7
1944    pmulhrsw         m2, m14
1945    pmulhrsw         m4, m14
1946    packsswb         m2, m2
1947    packsswb         m4, m4
1948    shufps           m2, m3, q3210
1949    shufps           m4, m6, q3210
1950    ; followed by v interpolation (top | cur -> cur)
1951    punpcklbw        m3, m4, m2
1952    punpckhbw        m4, m2
1953%if ARCH_X86_32
1954    mov              r5, [rsp+5*mmsize+12]
1955    pmaddubsw        m7, [r5], m4
1956    pmaddubsw        m4, [r5], m3
1957%else
1958    pmaddubsw        m7, m8, m4
1959    pmaddubsw        m4, m8, m3
1960%endif
1961    pmulhrsw         m7, m14
1962    pmulhrsw         m4, m14
1963    packsswb         m4, m7
1964    pxor             m2, m2
1965    pcmpgtb          m7, m2, m4
1966    punpcklbw        m3, m4, m7
1967    punpckhbw        m4, m7
1968
1969    ; src
1970    mova             m0, [srcq]
1971    punpckhbw        m1, m0, m2
1972    punpcklbw        m0, m2                 ; m0-1: src as word
1973
1974    ; scaling[src]
1975%if ARCH_X86_32
1976    vpgatherdw       m5, m0, scalingq-1, r0, r5, m7
1977    vpgatherdw       m6, m1, scalingq-1, r0, r5, m7
1978%else
1979    vpgatherdw       m5, m0, scalingq-1, r13, r14, m7
1980    vpgatherdw       m6, m1, scalingq-1, r13, r14, m7
1981%endif
1982    REPX {psrlw x, 8}, m5, m6
1983
1984    ; noise = round2(scaling[src] * grain, scaling_shift)
1985    pmullw           m3, m5
1986    pmullw           m4, m6
1987    pmulhrsw         m3, m11
1988    pmulhrsw         m4, m11
1989
1990    ; dst = clip_pixel(src, noise)
1991    paddw            m0, m3
1992    paddw            m1, m4
1993    pmaxsw           m0, m13
1994    pmaxsw           m1, m13
1995    pminsw           m0, m12
1996    pminsw           m1, m12
1997    packuswb         m0, m1
1998    movifnidn      dstq, dstmp
1999    mova    [dstq+srcq], m0
2000
2001%if ARCH_X86_32
2002    add dword [rsp+5*mmsize+12], mmsize
2003%else
2004    mova             m8, [pb_17_27]
2005%endif
2006    add            srcq, r2mp
2007    add      grain_lutq, 82
2008    dec              hw
2009    jz .end_y_hv_overlap
2010    ; 2 lines get vertical overlap, then fall back to non-overlap code for
2011    ; remaining (up to) 30 lines
2012    btc              hd, 16
2013    jnc .loop_y_hv_overlap
2014    jmp .loop_y_h_overlap
2015
2016.end_y_hv_overlap:
2017%if ARCH_X86_32
2018    add            r4mp, 16
2019%else
2020    add              wq, 16
2021%endif
2022    jge .end_hv
2023%if ARCH_X86_32
2024    mov            srcq, r1m
2025    add            srcq, r4m
2026%else
2027    lea            srcq, [src_bakq+wq]
2028%endif
2029    xor       dword r8m, 4
2030    add          offxyd, 16
2031%if ARCH_X86_32
2032    add dword [rsp+5*mmsize+1*gprsize], 16
2033%else
2034    add      top_offxyd, 16
2035%endif
2036    jmp .loop_x_odd_v_overlap
2037
2038.end_hv:
2039    RET
2040
2041%macro FGUV_FN 3 ; name, ss_hor, ss_ver
2042INIT_XMM ssse3
2043%if ARCH_X86_32
; fguv_32x32xn_i%1_8bpc_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
;                             sby, luma, lstride, uv_pl, is_id)
2046%if STACK_ALIGNMENT < mmsize
2047DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
2048cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
2049        tmp, src, scaling, h, fg_data, picptr, unused
2050    mov              r0, r0m
2051    mov              r1, r2m
2052    mov              r2, r4m
2053    mov              r3, r6m
2054    mov              r4, r7m
2055    mov [rsp+7*mmsize+3*gprsize], r0
2056    mov [rsp+7*mmsize+5*gprsize], r1
2057    mov [rsp+7*mmsize+7*gprsize], r2
2058    mov [rsp+7*mmsize+9*gprsize], r3
2059    mov [rsp+7*mmsize+10*gprsize], r4
2060
2061    mov              r0, r8m
2062    mov              r1, r9m
2063    mov              r2, r10m
2064    mov              r4, r11m
2065    mov              r3, r12m
2066    mov [rsp+7*mmsize+11*gprsize], r0
2067    mov [rsp+7*mmsize+12*gprsize], r1
2068    mov [rsp+7*mmsize+13*gprsize], r2
2069    mov [rsp+7*mmsize+14*gprsize], r4
2070%else
2071cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
2072        tmp, src, scaling, h, fg_data, picptr, unused
2073%endif
2074    mov            srcq, srcm
2075    mov        fg_dataq, r3m
2076    mov        scalingq, r5m
2077%if STACK_ALIGNMENT < mmsize
2078%define r0m [rsp+7*mmsize+ 3*gprsize]
2079%define r1m [rsp+7*mmsize+ 4*gprsize]
2080%define r2m [rsp+7*mmsize+ 5*gprsize]
2081%define r3m [rsp+7*mmsize+ 6*gprsize]
2082%define r4m [rsp+7*mmsize+ 7*gprsize]
2083%define r5m [rsp+7*mmsize+ 8*gprsize]
2084%define r6m [rsp+7*mmsize+ 9*gprsize]
2085%define r7m [rsp+7*mmsize+10*gprsize]
2086%define r8m [rsp+7*mmsize+11*gprsize]
2087%define r9m [rsp+7*mmsize+12*gprsize]
2088%define r10m [rsp+7*mmsize+13*gprsize]
2089%define r11m [rsp+7*mmsize+14*gprsize]
2090%define r12m [rsp+7*mmsize+15*gprsize]
2091%endif
2092    LEA              r5, pb_mask
2093%define base r5-pb_mask
2094    mov             r5m, r5
2095%else
2096cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
2097                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
2098    lea              r8, [pb_mask]
2099%define base r8-pb_mask
2100%endif
2101    mov             r6d, [fg_dataq+FGData.scaling_shift]
2102    movd             m3, [base+mul_bits+r6*2-14]
2103    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
2104    lea            tmpd, [r6d*2]
2105%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
2106    test             r3, r3
2107%else
2108    cmp      dword r12m, 0                      ; is_idm
2109%endif
2110    movd             m5, [base+min+r6*2]
2111    cmovne          r6d, tmpd
2112    movd             m4, [base+max+r6*2]
2113    punpcklwd        m3, m3
2114    punpcklwd        m5, m5
2115    punpcklwd        m4, m4
2116    pshufd           m3, m3, q0000
2117    pshufd           m5, m5, q0000
2118    pshufd           m4, m4, q0000
2119    SCRATCH           3, 11, 0
2120    SCRATCH           4, 12, 1
2121    SCRATCH           5, 13, 2
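    ; m11 = 2^(15 - scaling_shift) for the later round2, m12/m13 = max/min for
    ; clip_pixel: full range clips to 0..255, clip_to_restricted_range selects
    ; 16..240, and is_id combined with the restricted range lowers the max to 235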
2122
2123    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
2124    jne .csfl
2125
2126%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
2127%if ARCH_X86_32
2128    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2129%else
2130    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
2131%endif
2132
2133%if %1
2134    mov             r6d, dword r11m
2135    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
2136    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
2137    punpcklbw        m6, m1, m0
2138    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
2139    punpcklwd        m6, m6
2140    punpcklwd        m7, m7
2141    pshufd           m6, m6, q0000
2142    pshufd           m7, m7, q0000
2143    SCRATCH           6, 14, 3
2144    SCRATCH           7, 15, 4
2145%endif
2146
2147    mov            sbyd, r8m
2148    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
2149    test       overlapd, overlapd
2150    jz %%no_vertical_overlap
2151%if ARCH_X86_32
2152%if %2
2153    mova             m1, [base+pb_23_22_h]
2154%else
2155    mova             m1, [base+pb_27_17_17_27]
2156%endif
2157    mova             m0, [base+pw_1024]
2158%else
2159%if %2
2160    mova             m1, [pb_23_22_h]
2161%else
2162    mova             m1, [pb_27_17_17_27]
2163%endif
2164    mova             m0, [pw_1024]
2165%endif
2166    SCRATCH           0, 8, 5
2167    SCRATCH           1, 9, 6
2168    test           sbyd, sbyd
2169    jnz %%vertical_overlap
2170    ; fall-through
2171
2172%%no_vertical_overlap:
2173    mov             r8m, overlapd
2174%if ARCH_X86_32
2175    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
2176    imul           seed, (173 << 24) | 37
2177%else
2178    imul           seed, sbyd, (173 << 24) | 37
2179%endif
2180    add            seed, (105 << 24) | 178
2181    rol            seed, 8
2182    movzx          seed, seew
2183    xor            seed, [fg_dataq+FGData.seed]
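    ; per row this computes, roughly,
    ;   seed = fg_data.seed ^ (((sby*37 + 178) & 0xff) << 8)
    ;                       ^  ((sby*173 + 105) & 0xff)
    ; the packed imul/add keep the two byte products in separate lanes of one
    ; 32-bit register, and the rol+movzx pick them back out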
2184
2185%if ARCH_X86_32
2186    mov             r3m, seed
2187
2188    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2189%define luma_bakq lumaq
2190
2191    mov              wq, r4m
2192%if %3
2193    shl           r10mp, 1
2194%endif
2195%else
2196    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2197                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
2198
2199    mov        lstrideq, r10mp
2200%endif
2201
2202    mov           lumaq, r9mp
2203    lea        src_bakq, [srcq+wq]
2204    lea       luma_bakq, [lumaq+wq*(1+%2)]
2205    neg              wq
2206    sub            r0mp, srcq
2207%if ARCH_X86_32
2208    mov             r1m, src_bakq
2209    mov            r11m, luma_bakq
2210    mov             r4m, wq
2211
2212    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2213%else
2214    mov           r11mp, src_bakq
2215    mov           r12mp, strideq
2216%endif
2217
2218%%loop_x:
2219%if ARCH_X86_32
2220    mov            seed, r3m
2221%endif
2222    mov             r6d, seed
2223    or             seed, 0xEFF4
2224    shr             r6d, 1
2225    test           seeb, seeh
2226    lea            seed, [r6+0x8000]
2227    cmovp          seed, r6d               ; updated seed
2228%if ARCH_X86_32
2229    mov             r3m, seed
2230
2231    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
2232
2233    mov           offxd, offyd
2234%else
2235    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2236                offx, offy, see, overlap, unused1, unused2, lstride
2237
2238    mov           offyd, seed
2239    mov           offxd, seed
2240%endif
2241    ror           offyd, 8
2242    shr           offxd, 12
2243    and           offyd, 0xf
2244    imul          offyd, 164>>%3
2245    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
2246
2247%if ARCH_X86_32
2248    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2249%else
2250    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2251                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
2252%endif
2253
2254%%loop_x_odd:
2255    mov              hd, r7m
2256    mov      grain_lutq, grain_lutmp
2257%%loop_y:
2258    ; src
2259%if ARCH_X86_32
2260    mov           lumaq, r9mp
2261%endif
2262%if %2
2263    mova             m4, [lumaq+ 0]
2264    mova             m6, [lumaq+16]
2265    mova             m0, [srcq]
2266%if ARCH_X86_32
2267    add           lumaq, r10mp
2268    mov            r9mp, lumaq
2269    mov              r5, r5m
2270    movd             m7, [base+pb_1]
2271%else
2272    movd             m7, [pb_1]
2273%endif
2274    pshufd           m7, m7, q0000
2275    pxor             m2, m2
2276    pmaddubsw        m4, m7
2277    pmaddubsw        m6, m7
2278    pavgw            m4, m2
2279    pavgw            m6, m2
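    ; horizontally subsampled chroma: pmaddubsw with the {1,1} bytes in m7 sums
    ; each pair of luma samples into a word, and pavgw against zero halves it
    ; with rounding, i.e. luma = (a + b + 1) >> 1, matching the chroma grid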
2280%else
2281    mova             m4, [lumaq]
2282    mova             m0, [srcq]
2283%if ARCH_X86_32
2284    add           lumaq, r10mp
2285    mov            r9mp, lumaq
2286%endif
2287    pxor             m2, m2
2288%endif
2289
2290%if %1
2291%if %2
2292    packuswb         m4, m6                 ; luma
2293%endif
2294    punpckhbw        m6, m4, m0
2295    punpcklbw        m4, m0                 ; { luma, chroma }
2296    pmaddubsw        m6, m14
2297    pmaddubsw        m4, m14
2298    psraw            m6, 6
2299    psraw            m4, 6
2300    paddw            m6, m15
2301    paddw            m4, m15
2302    packuswb         m4, m6                 ; pack+unpack = clip
2303    punpckhbw        m6, m4, m2
2304    punpcklbw        m4, m2
2305%elif %2 == 0
2306    punpckhbw        m6, m4, m2
2307    punpcklbw        m4, m2
2308%endif
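    ; m4/m6 now hold, as words, the value used to index the scaling LUT: the
    ; (sub)sampled luma itself when chroma_scaling_from_luma is in use, or, on
    ; the %1 path above, clip_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6)
    ; + uv_offset) built from the interleaved multipliers in m14 and the
    ; offset in m15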
2309
2310    ; scaling[luma_src]
2311%if ARCH_X86_32
2312    vpgatherdw       m7, m4, scalingq-1, r0, r5
2313    vpgatherdw       m5, m6, scalingq-1, r0, r5
2314%else
2315    vpgatherdw       m7, m4, scalingq-1, r12, r2
2316    vpgatherdw       m5, m6, scalingq-1, r12, r2
2317%endif
2318    REPX {psrlw x, 8}, m7, m5
2319
2320    ; unpack chroma_source
2321    punpckhbw        m1, m0, m2
2322    punpcklbw        m0, m2                 ; m0-1: src as word
2323
2324    ; grain = grain_lut[offy+y][offx+x]
2325    movu             m3, [grain_lutq+offxyq+ 0]
2326    pcmpgtb          m6, m2, m3
2327    punpcklbw        m2, m3, m6
2328    punpckhbw        m3, m6
2329
2330    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2331    pmullw           m2, m7
2332    pmullw           m3, m5
2333    pmulhrsw         m2, m11
2334    pmulhrsw         m3, m11
2335
2336%if ARCH_X86_32
2337    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2338%endif
2339
2340    ; dst = clip_pixel(src, noise)
2341    paddw            m0, m2
2342    paddw            m1, m3
2343    pmaxsw           m0, m13
2344    pmaxsw           m1, m13
2345    pminsw           m0, m12
2346    pminsw           m1, m12
2347    packuswb         m0, m1
2348    movifnidn      dstq, dstmp
2349    mova    [dstq+srcq], m0
2350
2351%if ARCH_X86_32
2352    add            srcq, r2mp
2353    ; we already incremented lumaq above
2354%else
2355    add            srcq, r12mp
2356%if %3
2357    lea           lumaq, [lumaq+lstrideq*2]
2358%else
2359    add           lumaq, lstrideq
2360%endif
2361%endif
2362    add      grain_lutq, 82
2363    dec              hw
2364    jg %%loop_y
2365
2366%if ARCH_X86_32
2367    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2368
2369    mov              wq, r4m
2370%endif
2371    add              wq, 16
2372    jge %%end
2373%if ARCH_X86_32
2374    mov            srcq, r1mp
2375    mov           lumaq, r11mp
2376%else
2377    mov            srcq, r11mp
2378%endif
2379    lea           lumaq, [luma_bakq+wq*(1+%2)]
2380    add            srcq, wq
2381%if ARCH_X86_32
2382    mov             r4m, wq
2383    mov             r9m, lumaq
2384%endif
2385%if %2 == 0
2386    ; adjust top_offxy
2387%if ARCH_X86_32
2388    add dword [rsp+7*mmsize+1*gprsize], 16
2389%else
2390    add            r11d, 16
2391%endif
2392    add          offxyd, 16
2393    btc       dword r8m, 2
2394    jc %%loop_x_even
2395    test      dword r8m, 2
2396    jz %%loop_x_odd
2397    jmp %%loop_x_odd_v_overlap
2398%%loop_x_even:
2399%endif
2400    test      dword r8m, 1
2401    jz %%loop_x
2402
    ; r8m = overlap flags (bit 1 = top/vertical overlap, i.e. sby != 0)
2404    test      dword r8m, 2
2405    jne %%loop_x_hv_overlap
2406
2407    ; horizontal overlap (without vertical overlap)
2408%%loop_x_h_overlap:
2409%if ARCH_X86_32
2410%if %2
2411    lea              r6, [offxyd+16]
2412    mov [rsp+7*mmsize+0*gprsize], r6
2413%else
2414    mov [rsp+7*mmsize+0*gprsize], offxyd
2415%endif
2416
2417    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
2418
2419    mov            seed, r3m
2420%else
2421    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2422                offx, offy, see, left_offxy, unused1, unused2, lstride
2423
2424%if %2
2425    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
2426%else
2427    mov     left_offxyd, offyd
2428%endif
2429%endif
2430    mov             r6d, seed
2431    or             seed, 0xEFF4
2432    shr             r6d, 1
2433    test           seeb, seeh
2434    lea            seed, [r6+0x8000]
2435    cmovp          seed, r6d                ; updated seed
2436
2437%if ARCH_X86_32
2438    mov             r3m, seed
2439
2440    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
2441
2442    mov          offxd, offyd
2443%else
2444    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2445                offx, offy, see, left_offxy, unused1, unused2, lstride
2446
2447    mov           offyd, seed
2448    mov           offxd, seed
2449%endif
2450    ror           offyd, 8
2451    shr           offxd, 12
2452    and           offyd, 0xf
2453    imul          offyd, 164>>%3
2454    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
2455
2456%if ARCH_X86_32
2457    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2458%else
2459    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2460                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
2461%endif
2462
2463    mov              hd, r7m
2464    mov      grain_lutq, grain_lutmp
2465%%loop_y_h_overlap:
2466    ; src
2467%if ARCH_X86_32
2468    mov           lumaq, r9mp
2469%endif
2470%if %2
2471    mova             m4, [lumaq+ 0]
2472    mova             m6, [lumaq+16]
2473    mova             m0, [srcq]
2474%if ARCH_X86_32
2475    add           lumaq, r10mp
2476    mov            r9mp, lumaq
2477    mov              r5, r5m
2478    movd             m7, [base+pb_1]
2479%else
2480    movd             m7, [pb_1]
2481%endif
2482    pshufd           m7, m7, q0000
2483    pxor             m2, m2
2484    pmaddubsw        m4, m7
2485    pmaddubsw        m6, m7
2486    pavgw            m4, m2
2487    pavgw            m6, m2
2488%else
2489    mova             m4, [lumaq]
2490    mova             m0, [srcq]
2491%if ARCH_X86_32
2492    add           lumaq, r10mp
2493    mov            r9mp, lumaq
2494%endif
2495    pxor             m2, m2
2496%endif
2497
2498%if %1
2499%if %2
2500    packuswb         m4, m6                 ; luma
2501%endif
2502    punpckhbw        m6, m4, m0
2503    punpcklbw        m4, m0                 ; { luma, chroma }
2504    pmaddubsw        m6, m14
2505    pmaddubsw        m4, m14
2506    psraw            m6, 6
2507    psraw            m4, 6
2508    paddw            m6, m15
2509    paddw            m4, m15
2510    packuswb         m4, m6                 ; pack+unpack = clip
2511    punpckhbw        m6, m4, m2
2512    punpcklbw        m4, m2
2513%elif %2 == 0
2514    punpckhbw        m6, m4, m2
2515    punpcklbw        m4, m2
2516%endif
2517
2518    ; scaling[luma_src]
2519%if ARCH_X86_32
2520    vpgatherdw       m7, m4, scalingq-1, r0, r5
2521    vpgatherdw       m5, m6, scalingq-1, r0, r5
2522%else
2523    vpgatherdw       m7, m4, scalingq-1, r12, r2
2524    vpgatherdw       m5, m6, scalingq-1, r12, r2
2525%endif
2526    REPX {psrlw x, 8}, m7, m5
2527
2528    ; unpack chroma_source
2529    punpckhbw        m1, m0, m2
2530    punpcklbw        m0, m2                 ; m0-1: src as word
2531
2532    ; grain = grain_lut[offy+y][offx+x]
2533    movu             m4, [grain_lutq+offxyq+ 0]
2534%if ARCH_X86_32
2535    mov              r0, [rsp+7*mmsize+0*gprsize]
2536    movd             m2, [grain_lutq+r0+ 0]
2537%else
2538    movd             m2, [grain_lutq+left_offxyq+ 0]
2539%endif
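    ; horizontal overlap blend: only the leading byte pair(s) of m9 carry real
    ; weights ({23,22} when horizontally subsampled; {27,17} and {17,27} for
    ; 444); the remaining pairs are {0,32}, and round2(32*g, 5) = g, so pixels
    ; past the overlap column(s) pass through unchanged; shufps then recombines
    ; the blended low 8 bytes with the untouched high 8 bytes of the grain row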
2540    punpcklbw        m2, m4
2541    pmaddubsw        m3, m9, m2
2542    pmulhrsw         m3, m8
2543    packsswb         m3, m3
2544    shufps           m3, m4, q3210
2545    pxor             m4, m4
2546    pcmpgtb          m4, m3
2547    punpcklbw        m2, m3, m4
2548    punpckhbw        m3, m4
2549
2550    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2551    pmullw           m2, m7
2552    pmullw           m3, m5
2553    pmulhrsw         m2, m11
2554    pmulhrsw         m3, m11
2555
2556%if ARCH_X86_32
2557    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2558%endif
2559
2560    ; dst = clip_pixel(src, noise)
2561    paddw            m0, m2
2562    paddw            m1, m3
2563    pmaxsw           m0, m13
2564    pmaxsw           m1, m13
2565    pminsw           m0, m12
2566    pminsw           m1, m12
2567    packuswb         m0, m1
2568    movifnidn      dstq, dstmp
2569    mova    [dstq+srcq], m0
2570
2571%if ARCH_X86_32
2572    add            srcq, r2mp
2573    ; lumaq has already been incremented above
2574%else
2575    add            srcq, r12mp
2576%if %3
2577    lea           lumaq, [lumaq+lstrideq*2]
2578%else
2579    add           lumaq, lstrideq
2580%endif
2581%endif
2582    add      grain_lutq, 82
2583    dec              hw
2584    jg %%loop_y_h_overlap
2585
2586%if ARCH_X86_32
2587    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2588
2589    mov              wq, r4m
2590%endif
2591    add              wq, 16
2592    jge %%end
2593%if ARCH_X86_32
2594    mov            srcq, r1mp
2595    mov           lumaq, r11mp
2596%else
2597    mov            srcq, r11mp
2598%endif
2599    lea           lumaq, [luma_bakq+wq*(1+%2)]
2600    add            srcq, wq
2601%if ARCH_X86_32
2602    mov             r4m, wq
2603    mov             r9m, lumaq
2604%endif
2605%if %2 == 0
2606    xor       dword r8m, 4
2607    ; adjust top_offxyd
2608%if ARCH_X86_32
2609    add dword [rsp+7*mmsize+1*gprsize], 16
2610%else
2611    add            r11d, 16
2612%endif
2613    add          offxyd, 16
2614%endif
2615
    ; r8m = overlap flags (bit 1 = top/vertical overlap, i.e. sby != 0)
2617    test      dword r8m, 2
2618%if %2
2619    jne %%loop_x_hv_overlap
2620    jmp %%loop_x_h_overlap
2621%else
2622    jne %%loop_x_odd_v_overlap
2623    jmp %%loop_x_odd
2624%endif
2625
2626%%end:
2627    RET
2628
2629%%vertical_overlap:
2630%if ARCH_X86_32
2631    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
2632%else
2633    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
2634%endif
2635
2636    or         overlapd, 2                  ; top_overlap: overlap & 2
2637    mov             r8m, overlapd
2638    movzx          sbyd, sbyb
2639%if ARCH_X86_32
2640    imul             r4, [fg_dataq+FGData.seed], 0x00010001
2641    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
2642%else
2643    imul           seed, [fg_dataq+FGData.seed], 0x00010001
2644%endif
2645    imul           tmpd, sbyd, 173 * 0x00010001
2646    imul           sbyd, 37 * 0x01000100
2647    add            tmpd, (105 << 16) | 188
2648    add            sbyd, (178 << 24) | (141 << 8)
2649    and            tmpd, 0x00ff00ff
2650    and            sbyd, 0xff00ff00
2651    xor            seed, tmpd
2652%if ARCH_X86_32
2653    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
2654
2655    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
2656
2657    mov             r3m, seed
2658    mov              wq, r4m
2659%if %3
2660    shl           r10mp, 1
2661%endif
2662%else
2663    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
2664
2665    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2666                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
2667
2668    mov        lstrideq, r10mp
2669%endif
2670
2671    mov           lumaq, r9mp
2672    lea        src_bakq, [srcq+wq]
2673    lea       luma_bakq, [lumaq+wq*(1+%2)]
2674    neg              wq
2675    sub            r0mp, srcq
2676%if ARCH_X86_32
2677    mov             r1m, src_bakq
2678    mov            r11m, luma_bakq
2679    mov             r4m, wq
2680
2681    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
2682%else
2683    mov           r11mp, src_bakq
2684    mov           r12mp, strideq
2685%endif
2686
2687%%loop_x_v_overlap:
2688%if ARCH_X86_32
2689    mov            seed, r3m
2690    xor            tmpd, tmpd
2691%endif
    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2693    mov             r6d, seed
2694    or             seed, 0xeff4eff4
2695    test           seeb, seeh
2696    setp           tmpb                     ; parity of top_seed
2697    shr            seed, 16
2698    shl            tmpd, 16
2699    test           seeb, seeh
2700    setp           tmpb                     ; parity of cur_seed
2701    or              r6d, 0x00010001
2702    xor            tmpd, r6d
2703    mov            seed, tmpd
2704    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2705
2706%if ARCH_X86_32
2707    mov             r3m, seed
2708
2709    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
2710
2711    mov           offxd, offyd
2712%else
2713    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2714                offx, offy, see, overlap, top_offxy, unused, lstride
2715
2716    mov           offxd, seed
2717    mov           offyd, seed
2718%endif
2719    ror           offyd, 8
2720    ror           offxd, 12
2721    and           offyd, 0xf000f
2722    and           offxd, 0xf000f
2723    imul          offyd, 164>>%3
2724    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2725    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
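    ; same construction as the luma offsets, scaled for subsampling: columns
    ; step by (2-%2) bytes, rows by 164>>%3 (i.e. 2>>%3 grain rows of 82
    ; bytes), the 9-sample border shrinks to 3+(6>>ss) per axis, and the low
    ; (top) half gets an extra (32>>%3) rows, the height of one chroma block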
2726
2727%if ARCH_X86_32
2728    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
2729%else
2730    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2731                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
2732%endif
2733
2734    movzx    top_offxyd, offxyw
2735    shr          offxyd, 16
2736%if ARCH_X86_32
2737    mov [rsp+7*mmsize+1*gprsize], top_offxyd
2738
2739    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
2740%endif
2741
2742%%loop_x_odd_v_overlap:
2743    mov              hd, r7m
2744    mov      grain_lutq, grain_lutmp
2745%if ARCH_X86_32
2746    mov              r5, r5m
2747%endif
2748%if %3
2749    mova             m1, [PIC_ptr(pb_23_22)]
2750%else
2751    mova             m1, [PIC_ptr(pb_27_17)]
2752%endif
2753%%loop_y_v_overlap:
2754%if ARCH_X86_32
2755    mov           lumaq, r9mp
2756%endif
2757%if %2
2758    mova             m4, [lumaq+ 0]
2759    mova             m6, [lumaq+16]
2760    mova             m0, [srcq]
2761%if ARCH_X86_32
2762    add           lumaq, r10mp
2763    mov            r9mp, lumaq
2764    mov              r5, r5m
2765    movd             m7, [base+pb_1]
2766%else
2767    movd             m7, [pb_1]
2768%endif
2769    pshufd           m7, m7, q0000
2770    pxor             m2, m2
2771    pmaddubsw        m4, m7
2772    pmaddubsw        m6, m7
2773    pavgw            m4, m2
2774    pavgw            m6, m2
2775%else
2776    mova             m4, [lumaq]
2777    mova             m0, [srcq]
2778%if ARCH_X86_32
2779    add           lumaq, r10mp
2780    mov            r9mp, lumaq
2781%endif
2782    pxor             m2, m2
2783%endif
2784
2785%if %1
2786%if %2
2787    packuswb         m4, m6                 ; luma
2788%endif
2789    punpckhbw        m6, m4, m0
2790    punpcklbw        m4, m0                 ; { luma, chroma }
2791    pmaddubsw        m6, m14
2792    pmaddubsw        m4, m14
2793    psraw            m6, 6
2794    psraw            m4, 6
2795    paddw            m6, m15
2796    paddw            m4, m15
2797    packuswb         m4, m6                 ; pack+unpack = clip
2798    punpckhbw        m6, m4, m2
2799    punpcklbw        m4, m2
2800%elif %2 == 0
2801    punpckhbw        m6, m4, m2
2802    punpcklbw        m4, m2
2803%endif
2804
2805    ; scaling[luma_src]
2806%if ARCH_X86_32
2807    vpgatherdw       m7, m4, scalingq-1, r0, r5
2808    vpgatherdw       m5, m6, scalingq-1, r0, r5
2809%else
2810    vpgatherdw       m7, m4, scalingq-1, r12, r2
2811    vpgatherdw       m5, m6, scalingq-1, r12, r2
2812%endif
2813    REPX {psrlw x, 8}, m7, m5
2814
2815    ; grain = grain_lut[offy+y][offx+x]
2816    movu             m3, [grain_lutq+offxyq]
2817%if ARCH_X86_32
2818    mov              r0, [rsp+7*mmsize+1*gprsize]
2819    movu             m4, [grain_lutq+r0]
2820%else
2821    movu             m4, [grain_lutq+top_offxyq]
2822%endif
2823    punpckhbw        m6, m4, m3
2824    punpcklbw        m4, m3
2825    pmaddubsw        m2, m1, m6
2826    pmaddubsw        m3, m1, m4
2827    pmulhrsw         m2, m8
2828    pmulhrsw         m3, m8
2829    packsswb         m3, m2
2830    pxor             m6, m6
2831    pcmpgtb          m6, m3
2832    punpcklbw        m2, m3, m6
2833    punpckhbw        m3, m6
2834
2835    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2836    pmullw           m2, m7
2837    pmullw           m3, m5
2838    pmulhrsw         m2, m11
2839    pmulhrsw         m3, m11
2840
2841    ; unpack chroma_source
2842    pxor             m4, m4
2843    punpckhbw        m6, m0, m4
2844    punpcklbw        m0, m4                 ; m0-1: src as word
2845
2846%if ARCH_X86_32
2847    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
2848%endif
2849
2850    ; dst = clip_pixel(src, noise)
2851    paddw            m0, m2
2852    paddw            m6, m3
2853    pmaxsw           m0, m13
2854    pmaxsw           m6, m13
2855    pminsw           m0, m12
2856    pminsw           m6, m12
2857    packuswb         m0, m6
2858    movifnidn      dstq, dstmp
2859    mova    [dstq+srcq], m0
2860
2861    dec              hw
2862    je %%end_y_v_overlap
2863%if ARCH_X86_32
2864    add            srcq, r2mp
2865    ; lumaq has already been incremented above
2866%else
2867    add            srcq, r12mp
2868%if %3
2869    lea           lumaq, [lumaq+lstrideq*2]
2870%else
2871    add           lumaq, lstrideq
2872%endif
2873%endif
2874    add      grain_lutq, 82
2875%if %3 == 0
2876    btc              hd, 16
2877%if ARCH_X86_32
2878    mov              r5, r5m
2879%endif
2880    mova             m1, [PIC_ptr(pb_17_27)]
2881    jnc %%loop_y_v_overlap
2882%endif
2883    jmp %%loop_y
2884
2885%%end_y_v_overlap:
2886%if ARCH_X86_32
2887    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
2888
2889    mov              wq, r4m
2890%endif
2891    add              wq, 16
2892    jge %%end_hv
2893%if ARCH_X86_32
2894    mov            srcq, r1mp
2895    mov           lumaq, r11mp
2896%else
2897    mov            srcq, r11mp
2898%endif
2899    lea           lumaq, [luma_bakq+wq*(1+%2)]
2900    add            srcq, wq
2901%if ARCH_X86_32
2902    mov             r4m, wq
2903    mov             r9m, lumaq
2904%endif
2905
2906%if %2
    ; since FGData.overlap_flag is guaranteed to be set here, we never jump
    ; back to %%loop_x_v_overlap, and instead always fall through to
    ; h+v overlap
2910%else
2911%if ARCH_X86_32
2912    add dword [rsp+7*mmsize+1*gprsize], 16
2913%else
2914    add      top_offxyd, 16
2915%endif
2916    add          offxyd, 16
2917    btc       dword r8m, 2
2918    jnc %%loop_x_odd_v_overlap
2919%endif
2920
2921%%loop_x_hv_overlap:
2922%if ARCH_X86_32
2923    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
2924
2925    mov              r6, [rsp+7*mmsize+1*gprsize]
2926%if %2
2927    lea              r0, [r3d+16]
2928    add              r6, 16
2929    mov [rsp+7*mmsize+0*gprsize], r0        ; left_offxy
2930%else
2931    mov [rsp+7*mmsize+0*gprsize], r3        ; left_offxy
2932%endif
2933    mov [rsp+7*mmsize+2*gprsize], r6        ; topleft_offxy
2934
2935    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
2936
2937    mov            seed, r3m
2938    xor            tmpd, tmpd
2939%else
2940    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2941                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
2942
2943%if %2
2944    lea  topleft_offxyq, [top_offxyq+16]
2945    lea     left_offxyq, [offxyq+16]
2946%else
2947    mov  topleft_offxyq, top_offxyq
2948    mov     left_offxyq, offxyq
2949%endif
2950
    ; we assume from the block above that bits 8-15 of tmpd are zeroed
2952%endif
2953    mov             r6d, seed
2954    or             seed, 0xeff4eff4
2955    test           seeb, seeh
2956    setp           tmpb                     ; parity of top_seed
2957    shr            seed, 16
2958    shl            tmpd, 16
2959    test           seeb, seeh
2960    setp           tmpb                     ; parity of cur_seed
2961    or              r6d, 0x00010001
2962    xor            tmpd, r6d
2963    mov            seed, tmpd
2964    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
2965
2966%if ARCH_X86_32
2967    mov             r3m, seed
2968
2969    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
2970
2971    mov           offxd, offyd
2972%else
2973    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2974                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
2975
2976    mov           offxd, seed
2977    mov           offyd, seed
2978%endif
2979    ror           offyd, 8
2980    ror           offxd, 12
2981    and           offyd, 0xf000f
2982    and           offxd, 0xf000f
2983    imul          offyd, 164>>%3
2984    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2985    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2986
2987%if ARCH_X86_32
2988    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
2989%else
2990    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2991                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
2992%endif
2993
2994    movzx    top_offxyd, offxyw
2995    shr          offxyd, 16
2996%if ARCH_X86_32
2997    mov [rsp+7*mmsize+1*gprsize], top_offxyd
2998%endif
2999
3000    mov              hd, r7m
3001    mov      grain_lutq, grain_lutmp
3002%if ARCH_X86_32
3003    mov              r5, r5m
3004%endif
3005%if %3
3006    mova             m3, [PIC_ptr(pb_23_22)]
3007%else
3008    mova             m3, [PIC_ptr(pb_27_17)]
3009%endif
3010%%loop_y_hv_overlap:
3011    ; grain = grain_lut[offy+y][offx+x]
3012%if ARCH_X86_32
3013    mov              r0, [rsp+7*mmsize+2*gprsize]       ; topleft_offxy
3014    mov              r5, [rsp+7*mmsize+1*gprsize]       ; top_offxy
3015    movd             m1, [grain_lutq+r0]
3016    mov              r0, [rsp+7*mmsize+0*gprsize]       ; left_offxy
3017%else
3018    movd             m1, [grain_lutq+topleft_offxyq]
3019%endif
3020    movu             m2, [grain_lutq+offxyq]
3021%if ARCH_X86_32
3022    movu             m6, [grain_lutq+r5]
3023    movd             m4, [grain_lutq+r0]
3024%else
3025    movu             m6, [grain_lutq+top_offxyq]
3026    movd             m4, [grain_lutq+left_offxyq]
3027%endif
3028    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
3029    punpcklbw        m1, m6
3030    punpcklbw        m4, m2
3031    pmaddubsw        m0, m9, m1
3032    pmaddubsw        m1, m9, m4
3033    REPX {pmulhrsw x, m8}, m0, m1
3034    packsswb         m0, m1
3035    shufps           m4, m0, m2, q3232
3036    shufps           m0, m6, q3210
3037    ; followed by v interpolation (top | cur -> cur)
3038    punpcklbw        m2, m0, m4
3039    punpckhbw        m0, m4
3040    pmaddubsw        m4, m3, m0
3041    pmaddubsw        m1, m3, m2
3042    pmulhrsw         m4, m8
3043    pmulhrsw         m1, m8
3044    packsswb         m1, m4
3045
3046    ; src
3047%if ARCH_X86_32
3048    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3049
3050    mov           lumaq, r9mp
3051%endif
3052%if %2
3053    mova             m4, [lumaq+ 0]
3054    mova             m6, [lumaq+16]
3055    mova             m0, [srcq]
3056%if ARCH_X86_32
3057    add           lumaq, r10mp
3058    mov            r9mp, lumaq
3059    mov              r5, r5m
3060    movd             m7, [base+pb_1]
3061%else
3062    movd             m7, [pb_1]
3063%endif
3064    pshufd           m7, m7, q0000
3065    pxor             m2, m2
3066    pmaddubsw        m4, m7
3067    pmaddubsw        m6, m7
3068    pavgw            m4, m2
3069    pavgw            m6, m2
3070%else
3071    mova             m4, [lumaq]
3072    mova             m0, [srcq]
3073%if ARCH_X86_32
3074    add           lumaq, r10mp
3075    mov            r9mp, lumaq
3076%endif
3077    pxor             m2, m2
3078%endif
3079
3080%if %1
3081%if %2
3082    packuswb         m4, m6                 ; luma
3083%endif
3084    punpckhbw        m6, m4, m0
3085    punpcklbw        m4, m0                 ; { luma, chroma }
3086    pmaddubsw        m6, m14
3087    pmaddubsw        m4, m14
3088    psraw            m6, 6
3089    psraw            m4, 6
3090    paddw            m6, m15
3091    paddw            m4, m15
3092    packuswb         m4, m6                 ; pack+unpack = clip
3093    punpckhbw        m6, m4, m2
3094    punpcklbw        m4, m2
3095%elif %2 == 0
3096    punpckhbw        m6, m4, m2
3097    punpcklbw        m4, m2
3098%endif
3099
    ; scaling[luma_src]
3101%if ARCH_X86_32
3102    vpgatherdw       m7, m4, scalingq-1, r0, r5
3103    vpgatherdw       m5, m6, scalingq-1, r0, r5
3104%else
3105%if %3
3106    vpgatherdw       m7, m4, scalingq-1, r2, r12
3107    vpgatherdw       m5, m6, scalingq-1, r2, r12
3108%else
3109    vpgatherdw       m7, m4, scalingq-1, r2, r13
3110    vpgatherdw       m5, m6, scalingq-1, r2, r13
3111%endif
3112%endif
3113    REPX {psrlw x, 8}, m7, m5
3114
3115    ; unpack grain
3116    pxor             m4, m4
3117    pcmpgtb          m4, m1
3118    punpcklbw        m2, m1, m4
3119    punpckhbw        m1, m4
3120
    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
3122    pmullw           m2, m7
3123    pmullw           m1, m5
3124    pmulhrsw         m2, m11
3125    pmulhrsw         m1, m11
3126
3127%if ARCH_X86_32
3128    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
3129%endif
3130
3131    ; unpack chroma source
3132    pxor             m4, m4
3133    punpckhbw        m5, m0, m4
3134    punpcklbw        m0, m4                 ; m0-1: src as word
3135
3136    ; dst = clip_pixel(src, noise)
3137    paddw            m0, m2
3138    paddw            m5, m1
3139    pmaxsw           m0, m13
3140    pmaxsw           m5, m13
3141    pminsw           m0, m12
3142    pminsw           m5, m12
3143    packuswb         m0, m5
3144    movifnidn      dstq, dstmp
3145    mova    [dstq+srcq], m0
3146
3147%if ARCH_X86_32
3148    add            srcq, r2mp
3149    ; lumaq has been adjusted above already
3150%else
3151    add            srcq, r12mp
3152%if %3
3153    lea           lumaq, [lumaq+lstrideq*(1+%2)]
3154%else
3155    add           lumaq, r10mp
3156%endif
3157%endif
3158    add      grain_lutq, 82
3159    dec              hw
3160%if %3
3161    jg %%loop_y_h_overlap
3162%else
3163    jle %%end_y_hv_overlap
3164%if ARCH_X86_32
3165    mov              r5, r5m
3166%endif
3167    mova             m3, [PIC_ptr(pb_17_27)]
3168    btc              hd, 16
3169    jnc %%loop_y_hv_overlap
3170%if ARCH_X86_64
3171    mov        lstrideq, r10mp
3172%endif
3173    jmp %%loop_y_h_overlap
3174%%end_y_hv_overlap:
3175%if ARCH_X86_64
3176    mov        lstrideq, r10mp
3177%endif
3178%endif
3179
3180%if ARCH_X86_32
3181    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3182
3183    mov              wq, r4m
3184%endif
3185    add              wq, 16
3186    jge %%end_hv
3187%if ARCH_X86_32
3188    mov            srcq, r1mp
3189    mov           lumaq, r11mp
3190%else
3191    mov            srcq, r11mp
3192%endif
3193    lea           lumaq, [luma_bakq+wq*(1+%2)]
3194    add            srcq, wq
3195%if ARCH_X86_32
3196    mov             r4m, wq
3197    mov             r9m, lumaq
3198%endif
3199%if %2
3200    jmp %%loop_x_hv_overlap
3201%else
3202%if ARCH_X86_32
3203    add dword [rsp+7*mmsize+1*gprsize], 16
3204%else
3205    add      top_offxyd, 16
3206%endif
3207    add          offxyd, 16
3208    xor       dword r8m, 4
3209    jmp %%loop_x_odd_v_overlap
3210%endif
3211
3212%%end_hv:
3213    RET
3214%endmacro
3215
3216    %%FGUV_32x32xN_LOOP 1, %2, %3
3217.csfl:
3218    %%FGUV_32x32xN_LOOP 0, %2, %3
3219%endmacro
3220
3221FGUV_FN 420, 1, 1
3222
3223%if STACK_ALIGNMENT < mmsize
3224DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3225%endif
3226
3227FGUV_FN 422, 1, 0
3228
3229%if STACK_ALIGNMENT < mmsize
3230DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
3231%endif
3232
3233FGUV_FN 444, 0, 0
3234