1; Copyright © 2019-2021, VideoLAN and dav1d authors
2; Copyright © 2019, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
SECTION_RODATA

; Constant tables for the 8bpc film grain SSSE3 code.
pw_1024: times 8 dw 1024
; Coefficient rows padded with (0, 32) pairs to a full 16 bytes; not
; referenced in the code visible here — presumably overlap blending weights.
pb_27_17_17_27: db 27, 17, 17, 27
                times 6 db 0, 32
pb_23_22_h: db 23, 22
            times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
; pshufb control used in the seed-update loop to "set 15th bit for next
; 4 seeds": selects a 0x00/0x80 byte per feedback-bit pattern.
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
; Per-step feedback bit masks for advancing the 16-bit PRNG 4 steps at once.
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
; Per-plane seed XOR, indexed by uv plane (uvq*4 selects one dword pair).
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512               ; indexed by FGData.grain_scale_shift
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512    ; AR rounding, indexed via ar_coeff_shift
max: dw 255, 240, 235                   ; not referenced in this chunk
min: dw 0, 16                           ; not referenced in this chunk
pw_1: dw 1
52
; JMP_TABLE name, isa_suffix, lag0, lag1, ...
; Emits a table of 32-bit self-relative offsets to the .ar<N> local labels
; of mangle(name_8bpc_suffix), one entry per lag argument, and defines the
; symbol <name>_8bpc_<suffix>_table for the dispatch code to index.
%macro JMP_TABLE 2-*
    %xdefine %1_8bpc_%2_table %%table
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base   ; %3 = current lag value after %rotate
        %rotate 1
    %endrep
%endmacro
63
; One dispatch table per generator, with entries for AR lags 0-3.
JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
68
; Field offsets into the film grain parameter struct passed from C.
; NOTE(review): must mirror the C-side struct layout field-for-field —
; confirm against the corresponding header when changing either side.
struc FGData
    .seed:                      resd 1
    .num_y_points:              resd 1
    .y_points:                  resb 14 * 2
    .chroma_scaling_from_luma:  resd 1
    .num_uv_points:             resd 2
    .uv_points:                 resb 2 * 10 * 2
    .scaling_shift:             resd 1
    .ar_coeff_lag:              resd 1
    .ar_coeffs_y:               resb 24
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    .ar_coeff_shift:            resq 1
    .grain_scale_shift:         resd 1
    .uv_mult:                   resd 2
    .uv_luma_mult:              resd 2
    .uv_offset:                 resd 2
    .overlap_flag:              resd 1
    .clip_to_restricted_range:  resd 1
endstruc
88
89cextern gaussian_sequence
90
91SECTION .text
92
; REPX pattern(x), a, b, ... — expand the single-parameter pattern %1 once
; for each remaining argument, substituting it for x.
%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro
100
; PIC helper: on x86_32, RODATA addresses are offset by the runtime-computed
; base register; on x86_64 the address is used as-is.
; (Not referenced in the code visible in this chunk.)
%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif
106
; SCRATCH src, dst, slot — keep a value live in "register" m<dst>:
; on x86_64 it is simply swapped into the high register; on x86_32 (only
; 8 xmm regs) it is spilled to stack slot %3 and m<dst> is redefined as
; that memory operand, so subsequent code can reference m<dst> uniformly.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP             %1, %2
%endif
%endmacro
115
;---------------------------------------------------------------------------
; void generate_grain_y_8bpc(buf, fg_data)
; Fills a 73x82 (rows x stride) int8 buffer with luma grain: a 16-bit PRNG
; is advanced 4 steps per iteration and each step's value indexes
; gaussian_sequence; afterwards control tail-jumps to the AR filter for
; fg_data->ar_coeff_lag (.ar0/.ar1/.ar2/.ar3 below).
;---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
    LEA              r4, $$
%define base r4-$$                      ; PIC base for all [base+sym] accesses
    movq             m1, [base+rnd_next_upperbit_mask]
    movq             m4, [base+mul_bits]
    movq             m7, [base+hmul_bits]
    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
    movd             m2, [base+round+r2*2] ; rounding constant selected by grain_scale_shift
    movd             m0, [fg_dataq+FGData.seed]
    mova             m5, [base+pb_mask]
    pshuflw          m2, m2, q0000
    pshuflw          m0, m0, q0000       ; broadcast seed into 4 words
    mov              r2, -73*82          ; negative byte count over the whole buffer
    sub            bufq, r2              ; bufq -> end, so [bufq+r2] walks forward as r2 -> 0
    lea              r3, [base+gaussian_sequence]
.loop:
    ; Advance the PRNG four steps at once: derive each step's feedback bit,
    ; then fold it into the corresponding next seed's high bit.
    pand             m6, m0, m1
    psrlw            m3, m6, 10
    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw           m6, m4            ; bits 0x0f00 are set
    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
    psllq            m6, m3, 30
    por              m3, m6
    psllq            m6, m3, 15
    por              m3, m6            ; aggregate each bit into next seed's high bit
    pmulhuw          m6, m0, m7
    por              m3, m6            ; 4 next output seeds
    pshuflw          m0, m3, q3333     ; last seed becomes next iteration's state
    psrlw            m3, 5             ; 11-bit indices into gaussian_sequence
%if ARCH_X86_64
    ; Four table lookups via GPRs; r5-r8 are available on 64-bit.
    movq             r6, m3
    mov              r8, r6
    movzx           r5d, r6w
    shr             r6d, 16
    shr              r8, 32
    movzx            r7, r8w
    shr              r8, 16

    movd             m6, [r3+r5*2]
    pinsrw           m6, [r3+r6*2], 1
    pinsrw           m6, [r3+r7*2], 2
    pinsrw           m6, [r3+r8*2], 3
%else
    ; 32-bit: only two scratch GPRs, so extract two indices at a time.
    movd             r6, m3
    pshuflw          m3, m3, q3232
    movzx            r5, r6w
    shr              r6, 16

    movd             m6, [r3+r5*2]
    pinsrw           m6, [r3+r6*2], 1

    movd             r6, m3
    movzx            r5, r6w
    shr              r6, 16

    pinsrw           m6, [r3+r5*2], 2
    pinsrw           m6, [r3+r6*2], 3
%endif
    pmulhrsw         m6, m2            ; apply grain_scale_shift rounding
    packsswb         m6, m6
    movd      [bufq+r2], m6            ; store 4 grain bytes
    add              r2, 4
    jl .loop

    ; auto-regression code
    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd           r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
    lea              r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
    jmp              r2                ; tail-dispatch to .ar0/.ar1/.ar2/.ar3
186
.ar1:
    ; Lag-1 AR filter. The vector part precomputes cf0*topleft + cf1*top +
    ; cf2*topright + rnd for 4 pixels at a time; the serial dependency on
    ; the left neighbour (cf3*left) is resolved in the scalar inner loop.
%if ARCH_X86_32
    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    mov            bufq, r0             ; move buf out of r0 (rcx), which becomes shift
%else
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] ; left-neighbour coeff (scalar path)
    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
    mov             ecx, [fg_dataq+FGData.ar_coeff_shift] ; ecx == shiftd/cl under all three ABI mappings here
%if ARCH_X86_32
    mov             r1m, cf3d           ; spill cf3; reloaded through the cf3d define below
    DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
    pxor             m6, m6
    pcmpgtb          m7, m6, m4
    punpcklbw        m4, m7              ; sign-extend coefficient bytes to words
    pinsrw           m4, [base+pw_1], 3  ; words: cf0 cf1 cf2 1
    pshufd           m5, m4, q1111       ; broadcast (cf2, 1) pairs
    pshufd           m4, m4, q0000       ; broadcast (cf0, cf1) pairs
    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
    pshuflw          m3, m3, q0000
    sub            bufq, 82*73-(82*3+79) ; bufq was left past the end; rewind to row 3
    mov              hd, 70              ; 73 rows minus the 3-row top border
    mov            mind, -128            ; output clamp range for 8bpc grain
    mov            maxd, 127
.y_loop_ar1:
    mov              xq, -76
    movsx         val3d, byte [bufq+xq-1] ; seed "left" from the pixel preceding the row
.x_loop_ar1:
    movq             m0, [bufq+xq-82-1]     ; top/left
    pcmpgtb          m7, m6, m0
    punpcklbw        m0, m7                 ; sign-extend to words
    psrldq           m2, m0, 2              ; top
    psrldq           m1, m0, 4              ; top/right
    punpcklwd        m0, m2
    punpcklwd        m1, m3                 ; pair topright with rnd
    pmaddwd          m0, m4                 ; cf0*topleft + cf1*top
    pmaddwd          m1, m5                 ; cf2*topright + 1*rnd
    paddd            m0, m1                 ; 4 partial sums, one per output pixel
.x_loop_ar1_inner:
    movd          val0d, m0
    psrldq           m0, 4                  ; advance to the next partial sum
    imul          val3d, cf3d               ; cf3 * previous output (left)
    add           val3d, val0d
    sar           val3d, shiftb
    movsx         val0d, byte [bufq+xq]
    add           val3d, val0d              ; add the unfiltered grain sample
    cmp           val3d, maxd
    cmovns        val3d, maxd               ; clamp high
    cmp           val3d, mind
    cmovs         val3d, mind               ; clamp low
    mov  byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc              xq
    jz .x_loop_ar1_end
    test             xq, 3
    jnz .x_loop_ar1_inner                   ; 4 scalar steps per vector batch
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar1
.ar0:
    ; Lag 0: no AR filtering; the grain buffer is already complete.
    RET
261
.ar2:
    ; Lag-2 AR filter: 12 coefficients covering the two rows above plus the
    ; two left neighbours. Coefficient pairs are pre-broadcast into m8-m14
    ; (stack slots on x86_32 via SCRATCH).
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*8                   ; spill space for the SCRATCH slots below
%endif
    DEFINE_ARGS buf, fg_data, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd             m6, [base+round_vals-12+shiftq*2]
    movd             m7, [base+byte_blend+1]
    SCRATCH           7, 15, 7          ; m15 = byte-blend store mask
    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    pxor             m7, m7
    pshuflw          m6, m6, q0000
    punpcklwd        m6, m7              ; rnd as dwords
    pcmpgtb          m4, m7, m0
    pcmpgtb          m5, m7, m1
    punpcklbw        m0, m4              ; sign-extend cf0-7 to words
    punpcklbw        m1, m5              ; sign-extend cf8-11 to words
    DEFINE_ARGS buf, fg_data, h, x
    pshufd           m4, m1, q0000
    pshufd           m5, m1, q1111
    pshufd           m3, m0, q3333
    pshufd           m2, m0, q2222
    pshufd           m1, m0, q1111
    pshufd           m0, m0, q0000
    ; Keep coefficients + rnd live across both loops.
    SCRATCH           0, 8,  0
    SCRATCH           1, 9,  1
    SCRATCH           2, 10, 2
    SCRATCH           3, 11, 3
    SCRATCH           4, 12, 4
    SCRATCH           5, 13, 5
    SCRATCH           6, 14, 6
    sub            bufq, 82*73-(82*3+79) ; rewind to row 3 (see .ar1)
    mov              hd, 70
.y_loop_ar2:
    mov              xq, -76

.x_loop_ar2:
    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    pcmpgtb          m2, m7, m0
    punpckhbw        m1, m0, m2             ; y=-1 row as words
    punpcklbw        m0, m2                 ; y=-2 row as words
    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
    punpcklwd        m2, m0, m5
    punpcklwd        m3, m4
    pmaddwd          m2, m8
    pmaddwd          m3, m11
    paddd            m2, m3

    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
    punpcklwd        m4, m5
    punpcklwd        m6, m1
    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
    punpcklwd        m5, m1
    pmaddwd          m4, m9
    pmaddwd          m6, m10
    pmaddwd          m5, m12
    paddd            m4, m6
    paddd            m2, m5
    paddd            m2, m4
    paddd            m2, m14                ; + rounding constant

    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pcmpgtb          m4, m7, m0
    punpcklbw        m1, m0, m4
    pmaddwd          m3, m1, m13            ; cf10*(x-2) + cf11*(x-1), the left taps
    paddd            m3, m2                 ; + precomputed top-row contribution
    psrldq           m1, 4                  ; y=0,x=0
    psrldq           m2, 4                  ; shift top to next pixel
    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw            m3, m1
    packsswb         m3, m3
    pslldq           m3, 2
    pand             m3, m15
    pandn            m1, m15, m0
    por              m0, m1, m3             ; blend the new pixel into the row vector
    psrldq           m0, 1
    ; overwrite 2 pixels, but that's ok
    movd      [bufq+xq-1], m0
    inc              xq
    jz .x_loop_ar2_end
    test             xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar2
    RET
361
.ar3:
    ; Lag-3 AR filter: 24 coefficients over the three rows above plus three
    ; left neighbours. Broadcast coefficient pairs live in [rsp+0..5*16]
    ; and m8-m13; rnd in m14, store mask in m15.
    DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
%assign stack_offset stack_offset_old   ; rewind x86inc bookkeeping from .ar2's ALLOC_STACK
    ALLOC_STACK  -16*14
%elif WIN64
    SUB             rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
    ALLOC_STACK  -16*6
%endif
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd             m6, [base+round_vals-12+shiftq*2]
    movd             m7, [base+byte_blend]
    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pxor             m3, m3
    pcmpgtb          m4, m3, m0
    pcmpgtb          m3, m2
    pshuflw          m6, m6, q0000
    SCRATCH           6, 14, 12         ; m14 = rnd words
    SCRATCH           7, 15, 13         ; m15 = byte-blend store mask
    punpckhbw        m1, m0, m4         ; cf8-15 as words
    punpcklbw        m0, m4             ; cf0-7 as words
    punpcklbw        m2, m3             ; cf16-23 as words
    pshufd           m3, m0, q1111
    pshufd           m4, m0, q2222
    pshufd           m5, m0, q3333
    pshufd           m0, m0, q0000
    mova    [rsp+ 0*16], m0             ; (cf0,cf1) pairs
    mova    [rsp+ 1*16], m3             ; (cf2,cf3) pairs
    mova    [rsp+ 2*16], m4             ; (cf4,cf5) pairs
    mova    [rsp+ 3*16], m5             ; (cf6,cf7) pairs
    pshufd           m6, m1, q1111
    pshufd           m7, m1, q2222
    pshufd           m5, m1, q3333
    pshufd           m1, m1, q0000
    pshufd           m3, m2, q1111      ; (cf18,cf19) pairs (from original m2)
    psrldq           m0, m2, 10         ; words: cf21 cf22 cf23 0...
    pinsrw           m2, [base+pw_1], 5 ; replace cf21's slot with 1 for the (cf20,1) pair
    pshufd           m4, m2, q2222      ; (cf20,1) pairs
    pshufd           m2, m2, q0000      ; (cf16,cf17) pairs
    ; NOTE(review): -10 here (vs -12 elsewhere) fetches twice the rnd value;
    ; with power-of-two round_vals this looks like 1<<shift so that the
    ; current sample survives the final >>shift without a separate add — confirm.
    pinsrw           m0, [base+round_vals+shiftq*2-10], 3
    mova    [rsp+ 4*16], m1             ; (cf8,cf9) pairs
    mova    [rsp+ 5*16], m6             ; (cf10,cf11) pairs
    SCRATCH           7, 8,  6          ; m8  = (cf12,cf13) pairs
    SCRATCH           5, 9,  7          ; m9  = (cf14,cf15) pairs
    SCRATCH           2, 10, 8          ; m10 = (cf16,cf17) pairs
    SCRATCH           3, 11, 9          ; m11 = (cf18,cf19) pairs
    SCRATCH           4, 12, 10         ; m12 = (cf20,1) pairs
    SCRATCH           0, 13, 11         ; m13 = cf21 cf22 cf23 (2*rnd) for the inner loop
    DEFINE_ARGS buf, fg_data, h, x
    sub            bufq, 82*73-(82*3+79) ; rewind to row 3 (see .ar1)
    mov              hd, 70
.y_loop_ar3:
    mov              xq, -76

.x_loop_ar3:
    ; Accumulate the 7 taps of row y=-3 (cf0-6), spilling into row y=-2.
    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
    pxor             m3, m3
    pcmpgtb          m3, m0
    punpckhbw        m2, m0, m3             ; y=-3 high half as words
    punpcklbw        m0, m3                 ; y=-3 low half as words

    psrldq           m5, m0, 2
    psrldq           m6, m0, 4
    psrldq           m7, m0, 6
    punpcklwd        m4, m0, m5
    punpcklwd        m6, m7
    pmaddwd          m4, [rsp+ 0*16]
    pmaddwd          m6, [rsp+ 1*16]
    paddd            m4, m6

    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
    pxor             m5, m5
    pcmpgtb          m5, m1
    punpckhbw        m3, m1, m5
    punpcklbw        m1, m5
    palignr          m6, m2, m0, 10         ; y=-3 taps continued past word 4
    palignr          m7, m2, m0, 12
    psrldq           m0, 8
    punpcklwd        m0, m6
    punpcklwd        m7, m1                 ; last y=-3 tap paired with first y=-2 tap
    pmaddwd          m0, [rsp+ 2*16]
    pmaddwd          m7, [rsp+ 3*16]
    paddd            m0, m7
    paddd            m0, m4

    ; Remaining y=-2 taps (cf8-11).
    psrldq           m4, m1, 2
    psrldq           m5, m1, 4
    psrldq           m6, m1, 6
    psrldq           m7, m1, 8
    punpcklwd        m4, m5
    punpcklwd        m6, m7
    pmaddwd          m4, [rsp+ 4*16]
    pmaddwd          m6, [rsp+ 5*16]
    paddd            m4, m6
    paddd            m0, m4

    ; Row y=-1 taps (cf12-20) plus the rounding term via (cf20,1)x(pix,rnd).
    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    pxor             m7, m7
    pcmpgtb          m7, m2
    punpckhbw        m5, m2, m7
    punpcklbw        m2, m7
    palignr          m7, m3, m1, 10         ; y=-2 taps continued
    palignr          m3, m1, 12
    psrldq           m1, m2, 2
    punpcklwd        m7, m3
    punpcklwd        m3, m2, m1
    pmaddwd          m7, m8
    pmaddwd          m3, m9
    paddd            m7, m3
    paddd            m0, m7

    psrldq           m6, m2, 4
    psrldq           m1, m2, 6
    psrldq           m3, m2, 8
    palignr          m4, m5, m2, 10
    palignr          m5, m5, m2, 12

    punpcklwd        m6, m1
    punpcklwd        m3, m4
    punpcklwd        m5, m14                ; pair last tap with rnd -> cf20*pix + 1*rnd
    pmaddwd          m6, m10
    pmaddwd          m3, m11
    pmaddwd          m5, m12
    paddd            m0, m6
    paddd            m3, m5
    paddd            m0, m3                 ; m0 = per-pixel top-row contributions

    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pxor             m5, m5
    pcmpgtb          m5, m1
    punpcklbw        m2, m1, m5
    pmaddwd          m2, m13                ; cf21-23 * left pixels (+ scaled current sample)
    pshufd           m3, m2, q1111
    paddd            m2, m3                 ; left+cur
    paddd            m2, m0                 ; add top
    psrldq           m0, 4
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb         m2, m2
    pslldq           m2, 3
    pand             m2, m15
    pandn            m3, m15, m1
    por              m1, m2, m3             ; blend the new pixel into the row vector
    movd    [bufq+xq-3], m1
    psrldq           m1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82
    dec              hd
    jg .y_loop_ar3
    RET
523
524%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
525INIT_XMM ssse3
526cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
527    movifnidn        r2, r2mp
528    movifnidn        r3, r3mp
529    LEA              r4, $$
530%define base r4-$$
531    movq             m1, [base+rnd_next_upperbit_mask]
532    movq             m4, [base+mul_bits]
533    movq             m7, [base+hmul_bits]
534    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
535    movd             m6, [base+round+r5*2]
536    mova             m5, [base+pb_mask]
537    movd             m0, [fg_dataq+FGData.seed]
538    movd             m2, [base+pw_seed_xor+uvq*4]
539    pxor             m0, m2
540    pshuflw          m6, m6, q0000
541    pshuflw          m0, m0, q0000
542    lea              r6, [base+gaussian_sequence]
543%if %2
544%if ARCH_X86_64
545    mov             r7d, 73-35*%3
546%else
547    mov            r3mp, 73-35*%3
548%endif
549    add            bufq, 44
550.loop_y:
551    mov              r5, -44
552.loop_x:
553%else
554    mov              r5, -82*73
555    sub            bufq, r5
556.loop:
557%endif
558    pand             m2, m0, m1
559    psrlw            m3, m2, 10
560    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
561    pmullw           m2, m4             ; bits 0x0f00 are set
562    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
563    psllq            m2, m3, 30
564    por              m3, m2
565    psllq            m2, m3, 15
566    por              m3, m2             ; aggregate each bit into next seed's high bit
567    pmulhuw          m2, m0, m7
568    por              m2, m3             ; 4 next output seeds
569    pshuflw          m0, m2, q3333
570    psrlw            m2, 5
571%if ARCH_X86_64
572    movd            r9d, m2
573    pshuflw          m2, m2, q3232
574    movzx            r8, r9w
575    shr              r9, 16
576
577    movd             m3, [r6+r8*2]
578    pinsrw           m3, [r6+r9*2], 1
579
580    movd            r9d, m2
581    movzx            r8, r9w
582    shr              r9, 16
583
584    pinsrw           m3, [r6+r8*2], 2
585    pinsrw           m3, [r6+r9*2], 3
586%else
587    movd             r2, m2
588    pshuflw          m2, m2, q3232
589    movzx            r1, r2w
590    shr              r2, 16
591
592    movd             m3, [r6+r1*2]
593    pinsrw           m3, [r6+r2*2], 1
594
595    movd             r2, m2
596    movzx            r1, r2w
597    shr              r2, 16
598
599    pinsrw           m3, [r6+r1*2], 2
600    pinsrw           m3, [r6+r2*2], 3
601%endif
602    pmulhrsw         m3, m6
603    packsswb         m3, m3
604    movd      [bufq+r5], m3
605    add              r5, 4
606%if %2
607    jl .loop_x
608    add            bufq, 82
609%if ARCH_X86_64
610    dec             r7d
611%else
612    dec            r3mp
613%endif
614    jg .loop_y
615%else
616    jl .loop
617%endif
618
619%if ARCH_X86_32
620    mov              r2, r2mp
621%endif
622
623    ; auto-regression code
624    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
625    movsxd           r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
626    lea              r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
627    jmp              r5
628
629.ar0:
630    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
631    movifnidn     bufyq, bufymp
632%if ARCH_X86_32
633%assign stack_offset_old stack_offset
634    ALLOC_STACK   -2*16
635%endif
636    imul            uvd, 28
637    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
638    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
639    movd             m4, [base+hmul_bits+shiftq*2]
640    DEFINE_ARGS buf, bufy, h, x
641    pxor             m0, m0
642    pcmpgtb          m0, m5
643    punpcklbw        m5, m0
644    movd             m7, [base+pb_1]
645%if %2
646    movd             m6, [base+hmul_bits+2+%3*2]
647%endif
648    pshuflw          m5, m5, q0000
649    pshuflw          m4, m4, q0000
650    pshufd           m7, m7, q0000
651%if %2
652    pshuflw          m6, m6, q0000
653%endif
654    punpcklqdq       m5, m5
655    punpcklqdq       m4, m4
656%if %2
657    punpcklqdq       m6, m6
658%endif
659    pcmpeqw          m1, m1
660    pslldq           m1, 12>>%2
661    SCRATCH           1, 8, 0
662    SCRATCH           4, 9, 1
663%if %2
664    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
665%else
666    sub            bufq, 82*70-3
667%endif
668    add           bufyq, 3+82*3
669    mov              hd, 70-35*%3
670.y_loop_ar0:
671    xor              xd, xd
672.x_loop_ar0:
673    ; first 32 pixels
674%if %2
675    movu             m1, [bufyq+xq*2]
676%if %3
677    movu             m2, [bufyq+xq*2+82]
678%endif
679    movu             m3, [bufyq+xq*2+16]
680%if %3
681    movu             m4, [bufyq+xq*2+82+16]
682%endif
683    pmaddubsw        m0, m7, m1
684%if %3
685    pmaddubsw        m1, m7, m2
686%endif
687    pmaddubsw        m2, m7, m3
688%if %3
689    pmaddubsw        m3, m7, m4
690    paddw            m0, m1
691    paddw            m2, m3
692%endif
693    pmulhrsw         m0, m6
694    pmulhrsw         m2, m6
695%else
696    movu             m0, [bufyq+xq]
697    pxor             m6, m6
698    pcmpgtb          m6, m0
699    punpckhbw        m2, m0, m6
700    punpcklbw        m0, m6
701%endif
702    pmullw           m0, m5
703    pmullw           m2, m5
704    pmulhrsw         m0, m9
705    pmulhrsw         m2, m9
706    movu             m1, [bufq+xq]
707    pxor             m4, m4
708    pcmpgtb          m4, m1
709    punpckhbw        m3, m1, m4
710%if %2
711    punpcklbw        m1, m4
712    paddw            m2, m3
713    paddw            m0, m1
714%else
715    punpcklbw        m6, m1, m4
716    paddw            m2, m3
717    paddw            m0, m6
718%endif
719    packsswb         m0, m2
720%if %2
721    movu      [bufq+xq], m0
722    add              xd, 16
723    cmp              xd, 32
724    jl .x_loop_ar0
725
726    ; last 6/12 pixels
727    movu             m1, [bufyq+xq*(1+%2)]
728%if %3
729    movu             m2, [bufyq+xq*2+82]
730%endif
731    pmaddubsw        m0, m7, m1
732%if %3
733    pmaddubsw        m1, m7, m2
734    paddw            m0, m1
735%endif
736    pmulhrsw         m0, m6
737    pmullw           m0, m5
738    pmulhrsw         m0, m9
739    movq             m1, [bufq+xq]
740    pxor             m4, m4
741    pcmpgtb          m4, m1
742    punpcklbw        m2, m1, m4
743    paddw            m0, m2
744    packsswb         m0, m0
745    pandn            m2, m8, m0
746    pand             m1, m8
747    por              m2, m1
748    movq      [bufq+xq], m2
749%else
750    add              xd, 16
751    cmp              xd, 80
752    je .y_loop_final_ar0
753    movu   [bufq+xq-16], m0
754    jmp .x_loop_ar0
755.y_loop_final_ar0:
756    pandn            m2, m8, m0
757    pand             m1, m8
758    por              m2, m1
759    movu   [bufq+xq-16], m2
760%endif
761
762    add            bufq, 82
763    add           bufyq, 82<<%3
764    dec              hd
765    jg .y_loop_ar0
766    RET
767
768.ar1:
769%if ARCH_X86_32
770%assign stack_offset stack_offset_old
771%assign stack_size_padded 0
772%xdefine rstk rsp
773%endif
774    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
775    imul            uvd, 28
776    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
777    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
778    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
779%if ARCH_X86_32
780    mov            r3mp, cf3d
781    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
782%elif WIN64
783    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
784    mov            bufq, r0
785%else
786    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
787%endif
788    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
789    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
790%if %2
791    movd             m7, [base+pb_1]
792    movd             m6, [base+hmul_bits+2+%3*2]
793%endif
794    psrldq           m4, 1
795%if ARCH_X86_32
796    DEFINE_ARGS buf, shift, val0, val3, min, max, x
797%elif WIN64
798    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
799%else
800    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
801%endif
802    pxor             m5, m5
803    punpcklwd        m3, m5
804%if %2
805    punpcklwd        m6, m6
806%endif
807    pcmpgtb          m5, m4
808    punpcklbw        m4, m5
809    pshufd           m5, m4, q1111
810    pshufd           m4, m4, q0000
811    pshufd           m3, m3, q0000
812%if %2
813    pshufd           m7, m7, q0000
814    pshufd           m6, m6, q0000
815    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
816%else
817    sub            bufq, 82*69+3
818%endif
819%if ARCH_X86_32
820    add            r1mp, 79+82*3
821    mov            r0mp, 70-35*%3
822%else
823    add           bufyq, 79+82*3
824    mov              hd, 70-35*%3
825%endif
826    mov            mind, -128
827    mov            maxd, 127
828.y_loop_ar1:
829    mov              xq, -(76>>%2)
830    movsx         val3d, byte [bufq+xq-1]
831.x_loop_ar1:
832%if %2
833%if ARCH_X86_32
834    mov              r2, r1mp
835    movq             m0, [r2+xq*2]
836%if %3
837    movq             m1, [r2+xq*2+82]
838%endif
839%else
840    movq             m0, [bufyq+xq*2]
841%if %3
842    movq             m1, [bufyq+xq*2+82]
843%endif
844%endif
845    pmaddubsw        m2, m7, m0
846%if %3
847    pmaddubsw        m0, m7, m1
848    paddw            m2, m0
849%endif
850    pmulhrsw         m2, m6
851%else
852%if ARCH_X86_32
853    mov              r2, r1mp
854    movd             m2, [r2+xq]
855%else
856    movd             m2, [bufyq+xq]
857%endif
858    pxor             m0, m0
859    pcmpgtb          m0, m2
860    punpcklbw        m2, m0
861%endif
862
863    movq             m0, [bufq+xq-82-1]     ; top/left
864    pxor             m1, m1
865    pcmpgtb          m1, m0
866    punpcklbw        m0, m1
867    psrldq           m1, m0, 4              ; top/right
868    punpcklwd        m1, m2
869    psrldq           m2, m0, 2              ; top
870    punpcklwd        m0, m2
871    pmaddwd          m0, m4
872    pmaddwd          m1, m5
873    paddd            m0, m1
874    paddd            m0, m3
875.x_loop_ar1_inner:
876    movd          val0d, m0
877    psrldq           m0, 4
878%if ARCH_X86_32
879    imul          val3d, r3mp
880%else
881    imul          val3d, cf3d
882%endif
883    add           val3d, val0d
884    sar           val3d, shiftb
885    movsx         val0d, byte [bufq+xq]
886    add           val3d, val0d
887    cmp           val3d, maxd
888    cmovns        val3d, maxd
889    cmp           val3d, mind
890    cmovs         val3d, mind
891    mov  byte [bufq+xq], val3b
892    ; keep val3d in-place as left for next x iteration
893    inc              xq
894    jz .x_loop_ar1_end
895    test             xq, 3
896    jnz .x_loop_ar1_inner
897    jmp .x_loop_ar1
898
899.x_loop_ar1_end:
900    add            bufq, 82
901%if ARCH_X86_32
902    add            r1mp, 82<<%3
903    dec            r0mp
904%else
905    add           bufyq, 82<<%3
906    dec              hd
907%endif
908    jg .y_loop_ar1
909    RET
910
911.ar2:
912%if ARCH_X86_32
913%assign stack_offset stack_offset_old
914%assign stack_size_padded 0
915%xdefine rstk rsp
916    ALLOC_STACK   -8*16
917%endif
918    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
919    movifnidn     bufyq, bufymp
920    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
921    imul            uvd, 28
922    movd             m7, [base+round_vals-12+shiftq*2]
923    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
924    pxor             m2, m2
925    pcmpgtb          m2, m0
926    punpckhbw        m1, m0, m2
927    punpcklbw        m0, m2
928    pinsrw           m1, [base+pw_1], 5
929    punpcklwd        m7, m7
930    pshufd           m7, m7, q0000
931    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
932    pshufd           m4, m1, q0000
933    pshufd           m5, m1, q1111
934    pshufd           m6, m1, q2222
935    pshufd           m3, m0, q3333
936    pshufd           m2, m0, q2222
937    pshufd           m1, m0, q1111
938    pshufd           m0, m0, q0000
939    SCRATCH           0, 8,  0
940    SCRATCH           1, 9,  1
941    SCRATCH           2, 10, 2
942    SCRATCH           3, 11, 3
943    SCRATCH           4, 12, 4
944    SCRATCH           5, 13, 5
945    SCRATCH           6, 14, 6
946    SCRATCH           7, 15, 7
947%if %2
948    movd             m7, [base+hmul_bits+2+%3*2]
949    movd             m6, [base+pb_1]
950    punpcklwd        m7, m7
951    pshufd           m6, m6, q0000
952    pshufd           m7, m7, q0000
953    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
954%else
955    sub            bufq, 82*69+3
956%endif
957    add           bufyq, 79+82*3
958    mov              hd, 70-35*%3
959.y_loop_ar2:
960    mov              xq, -(76>>%2)
961
962.x_loop_ar2:
963    pxor             m2, m2
964    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
965    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
966    pcmpgtb          m2, m0
967    punpckhbw        m1, m0, m2
968    punpcklbw        m0, m2
969    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
970    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
971    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
972    punpcklwd        m2, m0, m5
973    punpcklwd        m3, m4
974    pmaddwd          m2, m8
975    pmaddwd          m3, m11
976    paddd            m2, m3
977
978    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
979    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
980    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
981    punpcklwd        m4, m5
982    punpcklwd        m0, m1
983    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
984    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
985    punpcklwd        m3, m1
986    pmaddwd          m4, m9
987    pmaddwd          m0, m10
988    pmaddwd          m3, m12
989    paddd            m4, m0
990    paddd            m2, m3
991    paddd            m2, m4
992
993%if %2
994    movq             m1, [bufyq+xq*2]
995%if %3
996    movq             m3, [bufyq+xq*2+82]
997%endif
998    pmaddubsw        m0, m6, m1
999%if %3
1000    pmaddubsw        m1, m6, m3
1001    paddw            m0, m1
1002%endif
1003    pmulhrsw         m0, m7
1004%else
1005    movd             m0, [bufyq+xq]
1006    pxor             m1, m1
1007    pcmpgtb          m1, m0
1008    punpcklbw        m0, m1
1009%endif
1010    punpcklwd        m0, m15
1011    pmaddwd          m0, m14
1012    paddd            m2, m0
1013
1014    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
1015    pxor             m4, m4
1016    movd             m5, [base+byte_blend+1]
1017    punpcklbw        m5, m5
1018.x_loop_ar2_inner:
1019    pcmpgtb          m1, m4, m0
1020    punpcklbw        m0, m1
1021    pmaddwd          m3, m0, m13
1022    paddd            m3, m2
1023    psrldq           m2, 4                  ; shift top to next pixel
1024    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
1025    pslldq           m3, 4
1026    pand             m3, m5
1027    paddw            m0, m3
1028    packsswb         m0, m0
1029    movd    [bufq+xq-2], m0
1030    psrldq           m0, 1
1031    inc              xq
1032    jz .x_loop_ar2_end
1033    test             xq, 3
1034    jnz .x_loop_ar2_inner
1035    jmp .x_loop_ar2
1036
1037.x_loop_ar2_end:
1038    add            bufq, 82
1039    add           bufyq, 82<<%3
1040    dec              hd
1041    jg .y_loop_ar2
1042    RET
1043
1044.ar3:
1045%if ARCH_X86_32
1046%assign stack_offset stack_offset_old
1047%assign stack_size_padded 0
1048%xdefine rstk rsp
1049%endif
1050    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
1051    movifnidn     bufyq, bufymp
1052%if ARCH_X86_32
1053    ALLOC_STACK  -15*16
1054%else
1055    SUB             rsp, 16*7
1056%assign stack_size_padded (stack_size_padded+16*7)
1057%assign stack_size (stack_size+16*7)
1058%endif
1059    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
1060    imul            uvd, 28
1061
1062    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
1063    pxor             m3, m3
1064    pcmpgtb          m3, m0
1065    punpckhbw        m1, m0, m3
1066    punpcklbw        m0, m3
1067    pshufd           m2, m0, q1111
1068    pshufd           m3, m0, q2222
1069    pshufd           m4, m0, q3333
1070    pshufd           m0, m0, q0000
1071    pshufd           m5, m1, q1111
1072    pshufd           m6, m1, q2222
1073    pshufd           m7, m1, q3333
1074    pshufd           m1, m1, q0000
1075    mova    [rsp+ 0*16], m0
1076    mova    [rsp+ 1*16], m2
1077    mova    [rsp+ 2*16], m3
1078    mova    [rsp+ 3*16], m4
1079    mova    [rsp+ 4*16], m1
1080    mova    [rsp+ 5*16], m5
1081    mova    [rsp+ 6*16], m6
1082    SCRATCH           7, 8, 7
1083
1084    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
1085    pxor             m4, m4
1086    pcmpgtb          m4, m2
1087    punpckhbw        m5, m2, m4
1088    punpcklbw        m2, m4
1089    pshufd           m4, m2, q3232
1090    punpcklwd        m3, m4, m5
1091    pshuflw          m5, m4, q3321
1092    pshufd           m4, m3, q0000
1093    pshufd           m3, m2, q1111
1094    pshufd           m2, m2, q0000
1095    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
1096    SCRATCH           2, 9,  8
1097    SCRATCH           3, 10, 9
1098    SCRATCH           4, 11, 10
1099    SCRATCH           5, 12, 11
1100
1101    movd             m2, [base+round_vals-12+shiftq*2]
1102%if %2
1103    movd             m1, [base+pb_1]
1104    movd             m3, [base+hmul_bits+2+%3*2]
1105%endif
1106    pxor             m0, m0
1107    punpcklwd        m2, m0
1108%if %2
1109    punpcklwd        m3, m3
1110%endif
1111    pshufd           m2, m2, q0000
1112%if %2
1113    pshufd           m1, m1, q0000
1114    pshufd           m3, m3, q0000
1115    SCRATCH           1, 13, 12
1116%endif
1117    SCRATCH           2, 14, 13
1118%if %2
1119    SCRATCH           3, 15, 14
1120%endif
1121
1122    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
1123%if %2
1124    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
1125%else
1126    sub            bufq, 82*69+3
1127%endif
1128    add           bufyq, 79+82*3
1129    mov              hd, 70-35*%3
1130.y_loop_ar3:
1131    mov              xq, -(76>>%2)
1132
1133.x_loop_ar3:
1134    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
1135    pxor             m4, m4
1136    pcmpgtb          m4, m0
1137    punpckhbw        m3, m0, m4
1138    punpcklbw        m0, m4
1139
1140    psrldq           m5, m0, 2
1141    psrldq           m6, m0, 4
1142    psrldq           m7, m0, 6
1143    punpcklwd        m4, m0, m5
1144    punpcklwd        m6, m7
1145    pmaddwd          m4, [rsp+ 0*16]
1146    pmaddwd          m6, [rsp+ 1*16]
1147    paddd            m4, m6
1148
1149    palignr          m2, m3, m0, 10
1150    palignr          m3, m0, 12
1151    psrldq           m0, 8
1152
1153    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
1154    pxor             m6, m6
1155    pcmpgtb          m6, m1
1156    punpckhbw        m5, m1, m6
1157    punpcklbw        m1, m6
1158
1159    punpcklwd        m0, m2
1160    punpcklwd        m3, m1
1161    pmaddwd          m0, [rsp+ 2*16]
1162    pmaddwd          m3, [rsp+ 3*16]
1163    paddd            m0, m3
1164    paddd            m0, m4
1165
1166    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
1167    pxor             m7, m7
1168    pcmpgtb          m7, m2
1169    punpckhbw        m6, m2, m7
1170    punpcklbw        m2, m7
1171
1172    palignr          m3, m5, m1, 10
1173    palignr          m5, m1, 12
1174    psrldq           m4, m2, 2
1175
1176    punpcklwd        m3, m5
1177    punpcklwd        m5, m2, m4
1178    pmaddwd          m3, [rsp+ 6*16]
1179    pmaddwd          m5, m8
1180    paddd            m3, m5
1181    paddd            m0, m3
1182
1183    psrldq           m3, m1, 2
1184    psrldq           m4, m1, 4
1185    psrldq           m5, m1, 6
1186    psrldq           m1, 8
1187
1188    punpcklwd        m3, m4
1189    punpcklwd        m5, m1
1190    pmaddwd          m3, [rsp+ 4*16]
1191    pmaddwd          m5, [rsp+ 5*16]
1192    paddd            m3, m5
1193    paddd            m0, m3
1194
1195%if %2
1196    movq             m1, [bufyq+xq*2]
1197%if %3
1198    movq             m3, [bufyq+xq*2+82]
1199%endif
1200    pmaddubsw        m7, m13, m1
1201%if %3
1202    pmaddubsw        m5, m13, m3
1203    paddw            m7, m5
1204%endif
1205    pmulhrsw         m7, m15
1206%else
1207    movd             m7, [bufyq+xq]
1208    pxor             m1, m1
1209    pcmpgtb          m1, m7
1210    punpcklbw        m7, m1
1211%endif
1212
1213    psrldq           m1, m2, 4
1214    psrldq           m3, m2, 6
1215    palignr          m4, m6, m2, 10
1216    palignr          m6, m2, 12
1217    psrldq           m2, 8
1218
1219    punpcklwd        m1, m3
1220    punpcklwd        m2, m4
1221    punpcklwd        m6, m7
1222    pmaddwd          m1, m9
1223    pmaddwd          m2, m10
1224    pmaddwd          m6, m11
1225    paddd            m1, m2
1226    paddd            m0, m6
1227    paddd            m0, m1
1228    paddd            m0, m14
1229
1230    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
1231    pxor             m4, m4
1232    movd             m5, [base+byte_blend]
1233.x_loop_ar3_inner:
1234    pcmpgtb          m2, m4, m1
1235    punpcklbw        m3, m1, m2
1236    pmaddwd          m2, m3, m12
1237    pshufd           m3, m2, q1111
1238    paddd            m2, m3                 ; left+cur
1239    paddd            m2, m0                 ; add top
1240    psrldq           m0, 4
1241    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
1242    ; don't packssdw, we only care about one value
1243    packsswb         m2, m2
1244    pandn            m3, m5, m1
1245    pslld            m2, 24
1246    pand             m2, m5
1247    por              m1, m2, m3
1248    movd    [bufq+xq-3], m1
1249    psrldq           m1, 1
1250    inc              xq
1251    jz .x_loop_ar3_end
1252    test             xq, 3
1253    jnz .x_loop_ar3_inner
1254    jmp .x_loop_ar3
1255
1256.x_loop_ar3_end:
1257    add            bufq, 82
1258    add           bufyq, 82<<%3
1259    dec              hd
1260    jg .y_loop_ar3
1261    RET
1262%endmacro
1263
; Instantiate the chroma grain generator for each chroma layout.
; Args: <layout suffix>, <%2>, <%3>. Within the macro body, %2 scales the
; horizontal extent (e.g. "mov xq, -(76>>%2)", "bufyq+xq*2") and %3 the
; vertical luma row stride ("add bufyq, 82<<%3"), so %2/%3 presumably are
; the horizontal/vertical chroma subsampling flags — consistent with
; 4:2:0 = (1,1), 4:2:2 = (1,0), 4:4:4 = (0,0).
generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0
1267
; Emulated word gather for pre-AVX2 targets: loads 8 words from a table at
; %3, indexed by the 8 word lanes of xmm reg %2, into xmm reg %1.
;   %1 = destination xmm reg (8 gathered words)
;   %2 = xmm reg holding 8 word indices (clobbered unless %6 is supplied)
;   %3 = table base address expression; in this file callers pass
;        scalingq-1 and follow with "psrlw x, 8" so the wanted scaling
;        byte ends up in the high byte of each gathered word
;   %4, %5 = scratch GPRs; each iteration extracts two 16-bit indices
;        from one 32-bit movd via movzx (low word) and shr 16 (high word)
;   %6 = optional xmm scratch; when given, %2 is left untouched
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
    movd        %5 %+ d, %2              ; grab word indices 0-1
    pshuflw       %%tmp, %2, q3232       ; move words 2-3 into tmp's low dword
%else
    movd        %5 %+ d, %%tmp           ; grab the next pair of indices
%if %%idx == 2
    punpckhqdq    %%tmp, %%tmp           ; bring words 4-7 down into low qword
%elif %%idx == 4
    psrlq         %%tmp, 32              ; expose words 6-7
%endif
%endif
    movzx       %4 %+ d, %5 %+ w         ; even-lane index (low 16 bits)
    shr         %5 %+ d, 16              ; odd-lane index (high 16 bits)

%if %%idx == 0
    movd             %1, [%3+%4]         ; first load also zero-initializes dst
%else
    pinsrw           %1, [%3+%4], %%idx + 0
%endif
    pinsrw           %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro
1298
1299INIT_XMM ssse3
1300; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
1301%if ARCH_X86_32
1302%if STACK_ALIGNMENT < mmsize
1303cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
1304        dst, src, scaling, unused1, fg_data, picptr, unused2
1305    ; copy stack arguments to new position post-alignment, so that we
1306    ; don't have to keep the old stack location in a separate register
1307    mov              r0, r0m
1308    mov              r1, r2m
1309    mov              r2, r4m
1310    mov              r3, r6m
1311    mov              r4, r7m
1312    mov              r5, r8m
1313
1314    mov [rsp+5*mmsize+ 4*gprsize], r0
1315    mov [rsp+5*mmsize+ 6*gprsize], r1
1316    mov [rsp+5*mmsize+ 8*gprsize], r2
1317    mov [rsp+5*mmsize+10*gprsize], r3
1318    mov [rsp+5*mmsize+11*gprsize], r4
1319    mov [rsp+5*mmsize+12*gprsize], r5
1320%else
1321cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
1322        dst, src, scaling, unused1, fg_data, picptr, unused2
1323%endif
1324    mov            srcq, srcm
1325    mov        fg_dataq, r3m
1326    mov        scalingq, r5m
1327%if STACK_ALIGNMENT < mmsize
1328%define r0m [rsp+5*mmsize+ 4*gprsize]
1329%define r1m [rsp+5*mmsize+ 5*gprsize]
1330%define r2m [rsp+5*mmsize+ 6*gprsize]
1331%define r3m [rsp+5*mmsize+ 7*gprsize]
1332%define r4m [rsp+5*mmsize+ 8*gprsize]
1333%define r5m [rsp+5*mmsize+ 9*gprsize]
1334%define r6m [rsp+5*mmsize+10*gprsize]
1335%define r7m [rsp+5*mmsize+11*gprsize]
1336%define r8m [rsp+5*mmsize+12*gprsize]
1337%endif
1338    LEA              r5, pb_mask
1339%define base r5-pb_mask
1340    mov             r5m, picptrq
1341%else
1342cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
1343    lea              r7, [pb_mask]
1344%define base r7-pb_mask
1345%endif
1346    mov             r6d, [fg_dataq+FGData.scaling_shift]
1347    movd             m3, [base+mul_bits+r6*2-14]
1348    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
1349    movd             m4, [base+max+r6*4]
1350    movd             m5, [base+min+r6*2]
1351    punpcklwd        m3, m3
1352    punpcklwd        m4, m4
1353    punpcklwd        m5, m5
1354    pshufd           m3, m3, q0000
1355    pshufd           m4, m4, q0000
1356    pshufd           m5, m5, q0000
1357    SCRATCH           3, 11, 0
1358    SCRATCH           4, 12, 1
1359    SCRATCH           5, 13, 2
1360
1361%if ARCH_X86_32
1362    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1363%else
1364    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1365%endif
1366
1367    mov            sbyd, r8m
1368    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
1369    test       overlapd, overlapd
1370    jz .no_vertical_overlap
1371    mova             m6, [base+pw_1024]
1372    mova             m7, [base+pb_27_17_17_27]
1373    SCRATCH           6, 14, 3
1374    SCRATCH           7, 15, 4
1375    test           sbyd, sbyd
1376    jnz .vertical_overlap
1377    ; fall-through
1378
1379.no_vertical_overlap:
1380    mov             r8m, overlapd
1381%if ARCH_X86_32
1382    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
1383    imul           seed, (173 << 24) | 37
1384%else
1385    imul           seed, sbyd, (173 << 24) | 37
1386%endif
1387    add            seed, (105 << 24) | 178
1388    rol            seed, 8
1389    movzx          seed, seew
1390    xor            seed, [fg_dataq+FGData.seed]
1391
1392%if ARCH_X86_32
1393    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1394
1395    mov             r3m, seed
1396    mov              wq, r4m
1397%else
1398    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1399                unused1, unused2, see, unused3
1400%endif
1401
1402    lea        src_bakq, [srcq+wq]
1403    neg              wq
1404    sub           dstmp, srcq
1405%if ARCH_X86_32
1406    mov             r1m, src_bakq
1407    mov             r4m, wq
1408    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1409%endif
1410
1411.loop_x:
1412%if ARCH_X86_32
1413    mov            seed, r3m
1414%endif
1415    mov             r6d, seed
1416    or             seed, 0xEFF4
1417    shr             r6d, 1
1418    test           seeb, seeh
1419    lea            seed, [r6+0x8000]
1420    cmovp          seed, r6d                ; updated seed
1421%if ARCH_X86_32
1422    mov             r3m, seed
1423
1424    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1425
1426    mov           offxd, offyd
1427%else
1428    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1429                offx, offy, see, unused
1430
1431    mov           offyd, seed
1432    mov           offxd, seed
1433%endif
1434    ror           offyd, 8
1435    shr           offxd, 12
1436    and           offyd, 0xf
1437    imul          offyd, 164
1438    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1439
1440%if ARCH_X86_32
1441    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1442    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1443    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1444%else
1445    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1446                h, offxy, see, unused
1447%endif
1448
1449.loop_x_odd:
1450    mov              hd, r7m
1451    mov      grain_lutq, grain_lutmp
1452.loop_y:
1453    ; src
1454    mova             m0, [srcq]
1455    pxor             m2, m2
1456    punpckhbw        m1, m0, m2
1457    punpcklbw        m0, m2                 ; m0-1: src as word
1458
1459    ; scaling[src]
1460%if ARCH_X86_32
1461    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1462    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1463%else
1464    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1465    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1466%endif
1467    REPX {psrlw x, 8}, m4, m5
1468
1469    ; grain = grain_lut[offy+y][offx+x]
1470    movu             m3, [grain_lutq+offxyq]
1471    pcmpgtb          m7, m2, m3
1472    punpcklbw        m2, m3, m7
1473    punpckhbw        m3, m7
1474
1475    ; noise = round2(scaling[src] * grain, scaling_shift)
1476    pmullw           m2, m4
1477    pmullw           m3, m5
1478    pmulhrsw         m2, m11
1479    pmulhrsw         m3, m11
1480
1481    ; dst = clip_pixel(src, noise)
1482    paddw            m0, m2
1483    paddw            m1, m3
1484    pmaxsw           m0, m13
1485    pmaxsw           m1, m13
1486    pminsw           m0, m12
1487    pminsw           m1, m12
1488    packuswb         m0, m1
1489    movifnidn      dstq, dstmp
1490    mova    [dstq+srcq], m0
1491
1492    add            srcq, r2mp
1493    add      grain_lutq, 82
1494    dec              hd
1495    jg .loop_y
1496
1497%if ARCH_X86_32
1498    add            r4mp, 16
1499%else
1500    add              wq, 16
1501%endif
1502    jge .end
1503%if ARCH_X86_32
1504    mov            srcq, r1mp
1505    add            srcq, r4mp
1506%else
1507    lea            srcq, [src_bakq+wq]
1508%endif
1509    btc       dword r8m, 2
1510    jc .next_blk
1511
1512    add          offxyd, 16
1513    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
1514    jz .loop_x_odd
1515
1516%if ARCH_X86_32
1517    add dword [rsp+5*mmsize+1*gprsize], 16
1518%else
1519    add            r11d, 16             ; top_offxyd
1520%endif
1521    jnz .loop_x_odd_v_overlap
1522
1523.next_blk:
1524    test      dword r8m, 1
1525    jz .loop_x
1526
1527    test      dword r8m, 2
1528    jnz .loop_x_hv_overlap
1529
1530    ; horizontal overlap (without vertical overlap)
1531.loop_x_h_overlap:
1532%if ARCH_X86_32
1533    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
1534    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
1535    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
1536
1537    add          offxyd, 16                 ; left_offxyd
1538    mov [rsp+5*mmsize+0*gprsize], offxyd
1539
1540    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
1541
1542    mov            seed, r3m
1543%else
1544    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1545                offx, offy, see, left_offxy
1546
1547    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1548%endif
1549
1550    mov             r6d, seed
1551    or             seed, 0xEFF4
1552    shr             r6d, 1
1553    test           seeb, seeh
1554    lea            seed, [r6+0x8000]
1555    cmovp          seed, r6d                ; updated seed
1556
1557%if ARCH_X86_32
1558    mov             r3m, seed
1559
1560    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1561
1562    mov           offxd, offyd
1563%else
1564    mov           offyd, seed
1565    mov           offxd, seed
1566%endif
1567    ror           offyd, 8
1568    shr           offxd, 12
1569    and           offyd, 0xf
1570    imul          offyd, 164
1571    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1572
1573%if ARCH_X86_32
1574    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1575%else
1576    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1577                h, offxy, see, left_offxy
1578%endif
1579
1580    mov              hd, r7m
1581    mov      grain_lutq, grain_lutmp
1582.loop_y_h_overlap:
1583    ; src
1584    mova             m0, [srcq]
1585    pxor             m2, m2
1586    punpckhbw        m1, m0, m2
1587    punpcklbw        m0, m2                 ; m0-1: src as word
1588
1589    ; scaling[src]
1590%if ARCH_X86_32
1591    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1592    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1593%else
1594    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1595    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1596%endif
1597    REPX {psrlw x, 8}, m4, m5
1598
1599    ; grain = grain_lut[offy+y][offx+x]
1600    movu             m3, [grain_lutq+offxyq]
1601%if ARCH_X86_32
1602    mov              r5, [rsp+5*mmsize+0*gprsize]
1603    movd             m7, [grain_lutq+r5]
1604%else
1605    movd             m7, [grain_lutq+left_offxyq]
1606%endif
1607    punpcklbw        m7, m3
1608    pmaddubsw        m6, m15, m7
1609    pmulhrsw         m6, m14
1610    packsswb         m6, m6
1611    shufps           m6, m3, q3210
1612    pcmpgtb          m2, m6
1613    punpcklbw        m7, m6, m2
1614    punpckhbw        m6, m2
1615
1616    ; noise = round2(scaling[src] * grain, scaling_shift)
1617    pmullw           m7, m4
1618    pmullw           m6, m5
1619    pmulhrsw         m7, m11
1620    pmulhrsw         m6, m11
1621
1622    ; dst = clip_pixel(src, noise)
1623    paddw            m0, m7
1624    paddw            m1, m6
1625    pmaxsw           m0, m13
1626    pmaxsw           m1, m13
1627    pminsw           m0, m12
1628    pminsw           m1, m12
1629    packuswb         m0, m1
1630    movifnidn      dstq, dstmp
1631    mova    [dstq+srcq], m0
1632
1633    add            srcq, r2mp
1634    add      grain_lutq, 82
1635    dec              hd
1636    jg .loop_y_h_overlap
1637
1638%if ARCH_X86_32
1639    add            r4mp, 16
1640%else
1641    add              wq, 16
1642%endif
1643    jge .end
1644%if ARCH_X86_32
1645    mov            srcq, r1m
1646    add            srcq, r4m
1647%else
1648    lea            srcq, [src_bakq+wq]
1649%endif
1650    xor       dword r8m, 4
1651    add          offxyd, 16
1652
1653    ; since this half-block had left-overlap, the next does not
1654    test      dword r8m, 2              ; have_top_overlap
1655    jz .loop_x_odd
1656%if ARCH_X86_32
1657    add dword [rsp+5*mmsize+1*gprsize], 16
1658%else
1659    add            r11d, 16             ; top_offxyd
1660%endif
1661    jmp .loop_x_odd_v_overlap
1662
1663.end:
1664    RET
1665
1666.vertical_overlap:
1667%if ARCH_X86_32
1668    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
1669%else
1670    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
1671%endif
1672
1673    or         overlapd, 2                  ; top_overlap: overlap & 2
1674    mov             r8m, overlapd
1675    movzx          sbyd, sbyb
1676%if ARCH_X86_32
1677    imul             r4, [fg_dataq+FGData.seed], 0x00010001
1678    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
1679%else
1680    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1681%endif
1682    imul           tmpd, sbyd, 173 * 0x00010001
1683    imul           sbyd, 37 * 0x01000100
1684    add            tmpd, (105 << 16) | 188
1685    add            sbyd, (178 << 24) | (141 << 8)
1686    and            tmpd, 0x00ff00ff
1687    and            sbyd, 0xff00ff00
1688    xor            seed, tmpd
1689%if ARCH_X86_32
1690    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
1691
1692    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
1693
1694    mov             r3m, seed
1695    mov              wq, r4m
1696%else
1697    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1698
1699    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1700                tmp, unused2, see, unused3
1701%endif
1702
1703    lea        src_bakq, [srcq+wq]
1704    neg              wq
1705    sub           dstmp, srcq
1706%if ARCH_X86_32
1707    mov             r1m, src_bakq
1708    mov             r4m, wq
1709    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
1710%endif
1711
1712.loop_x_v_overlap:
1713%if ARCH_X86_32
1714    mov            seed, r3m
1715%endif
1716    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
1717    ; because of the 'and tmpd, 0x00ff00ff' above
1718    mov             r6d, seed
1719    or             seed, 0xeff4eff4
1720    test           seeb, seeh
1721    setp           tmpb                     ; parity of top_seed
1722    shr            seed, 16
1723    shl            tmpd, 16
1724    test           seeb, seeh
1725    setp           tmpb                     ; parity of cur_seed
1726    or              r6d, 0x00010001
1727    xor            tmpd, r6d
1728    mov            seed, tmpd
1729    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1730
1731%if ARCH_X86_32
1732    mov             r3m, seed
1733
1734    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1735
1736    mov           offxd, offyd
1737%else
1738    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1739                offx, offy, see, unused, top_offxy
1740
1741    mov           offyd, seed
1742    mov           offxd, seed
1743%endif
1744
1745    ror           offyd, 8
1746    ror           offxd, 12
1747    and           offyd, 0xf000f
1748    and           offxd, 0xf000f
1749    imul          offyd, 164
1750    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1751    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1752
1753%if ARCH_X86_32
1754    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
1755%else
1756    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1757                h, offxy, see, unused, top_offxy
1758%endif
1759
1760    movzx    top_offxyd, offxyw
1761%if ARCH_X86_32
1762    mov [rsp+5*mmsize+1*gprsize], top_offxyd
1763
1764    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
1765%endif
1766    shr          offxyd, 16
1767
1768.loop_x_odd_v_overlap:
1769%if ARCH_X86_32
1770    mov              r5, r5m
1771    lea              r5, [base+pb_27_17]
1772    mov [rsp+5*mmsize+12], r5
1773%else
1774    mova             m8, [pb_27_17]
1775%endif
1776    mov              hd, r7m
1777    mov      grain_lutq, grain_lutmp
1778.loop_y_v_overlap:
1779    ; src
1780    mova             m0, [srcq]
1781    pxor             m2, m2
1782    punpckhbw        m1, m0, m2
1783    punpcklbw        m0, m2                 ; m0-1: src as word
1784
1785    ; scaling[src]
1786%if ARCH_X86_32
1787    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
1788    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
1789%else
1790    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
1791    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
1792%endif
1793    REPX {psrlw x, 8}, m4, m5
1794
1795    ; grain = grain_lut[offy+y][offx+x]
1796    movu             m3, [grain_lutq+offxyq]
1797%if ARCH_X86_32
1798    mov              r5, [rsp+5*mmsize+1*gprsize]
1799    movu             m7, [grain_lutq+r5]
1800%else
1801    movu             m7, [grain_lutq+top_offxyq]
1802%endif
1803    punpckhbw        m6, m7, m3
1804    punpcklbw        m7, m3
1805%if ARCH_X86_32
1806    mov              r5, [rsp+5*mmsize+12]
1807    pmaddubsw        m3, [r5], m6
1808    pmaddubsw        m6, [r5], m7
1809%else
1810    pmaddubsw        m3, m8, m6
1811    pmaddubsw        m6, m8, m7
1812%endif
1813    pmulhrsw         m3, m14
1814    pmulhrsw         m6, m14
1815    packsswb         m6, m3
1816    pcmpgtb          m7, m2, m6
1817    punpcklbw        m2, m6, m7
1818    punpckhbw        m6, m7
1819
1820    ; noise = round2(scaling[src] * grain, scaling_shift)
1821    pmullw           m2, m4
1822    pmullw           m6, m5
1823    pmulhrsw         m2, m11
1824    pmulhrsw         m6, m11
1825
1826    ; dst = clip_pixel(src, noise)
1827    paddw            m0, m2
1828    paddw            m1, m6
1829    pmaxsw           m0, m13
1830    pmaxsw           m1, m13
1831    pminsw           m0, m12
1832    pminsw           m1, m12
1833    packuswb         m0, m1
1834    movifnidn      dstq, dstmp
1835    mova    [dstq+srcq], m0
1836
1837%if ARCH_X86_32
1838    add dword [rsp+5*mmsize+12], mmsize
1839%else
1840    mova             m8, [pb_17_27]
1841%endif
1842    add            srcq, r2mp
1843    add      grain_lutq, 82
1844    dec              hw
1845    jz .end_y_v_overlap
1846    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1847    ; remaining (up to) 30 lines
1848    btc              hd, 16
1849    jnc .loop_y_v_overlap
1850    jmp .loop_y
1851
1852.end_y_v_overlap:
1853%if ARCH_X86_32
1854    add            r4mp, 16
1855%else
1856    add              wq, 16
1857%endif
1858    jge .end_hv
1859%if ARCH_X86_32
1860    mov            srcq, r1mp
1861    add            srcq, r4mp
1862%else
1863    lea            srcq, [src_bakq+wq]
1864%endif
1865    btc       dword r8m, 2
1866    jc .loop_x_hv_overlap
1867    add          offxyd, 16
1868%if ARCH_X86_32
1869    add dword [rsp+5*mmsize+1*gprsize], 16
1870%else
1871    add      top_offxyd, 16
1872%endif
1873    jmp .loop_x_odd_v_overlap
1874
1875.loop_x_hv_overlap:
1876%if ARCH_X86_32
1877    mov              r5, r5m
1878    lea              r5, [base+pb_27_17]
1879    mov [rsp+5*mmsize+12], r5
1880
1881    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
1882
1883    mov              r5, [rsp+5*mmsize+1*gprsize]
1884    mov              r4, offxyd
1885    add              r5, 16
1886    add              r4, 16
1887    mov [rsp+5*mmsize+2*gprsize], r5        ; topleft_offxy
1888    mov [rsp+5*mmsize+0*gprsize], r4        ; left_offxy
1889
1890    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
1891
1892    xor            tmpd, tmpd
1893    mov            seed, r3m
1894%else
1895    mova             m8, [pb_27_17]
1896
1897    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1898                tmp, unused2, see, unused3
1899
1900    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
1901%endif
1902    mov             r6d, seed
1903    or             seed, 0xeff4eff4
1904    test           seeb, seeh
1905    setp           tmpb                     ; parity of top_seed
1906    shr            seed, 16
1907    shl            tmpd, 16
1908    test           seeb, seeh
1909    setp           tmpb                     ; parity of cur_seed
1910    or              r6d, 0x00010001
1911    xor            tmpd, r6d
1912    mov            seed, tmpd
1913    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
1914
1915%if ARCH_X86_32
1916    mov             r3m, seed
1917
1918    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
1919
1920    mov           offxd, offyd
1921%else
1922    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1923                offx, offy, see, left_offxy, top_offxy, topleft_offxy
1924
1925    lea  topleft_offxyq, [top_offxyq+16]
1926    lea     left_offxyq, [offyq+16]
1927    mov           offyd, seed
    ; --- continuation of the luma h+v-overlap column loop; the function
    ;     entry and the preceding seed update are above this chunk ---
    mov           offxd, seed
%endif
    ; each 32-bit offset register packs (cur << 16) | top, hence the
    ; paired masks/constants (0xf000f, 0x10001) below
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164                   ; 164 = 2*82 (grain_lut row stride is 82)
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut

    movzx            r5, offxyw             ; top_offxy
    mov [rsp+5*mmsize+1*gprsize], r5
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    movzx    top_offxyd, offxyw
%endif
    shr          offxyd, 16                 ; keep cur_offxy in offxyd

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; m3 = current grain row, m6 = row from the block above,
    ; m4/m7 = left / top-left neighbor columns
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+1*gprsize]   ; top_offxy
    mov              r0, [rsp+5*mmsize+0*gprsize]   ; left_offxy
    movu             m6, [grain_lutq+r5]
    mov              r5, [rsp+5*mmsize+2*gprsize]   ; topleft_offxy
    movd             m4, [grain_lutq+r0]
    movd             m7, [grain_lutq+r5]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
    movd             m7, [grain_lutq+topleft_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m4, m3
    punpcklbw        m7, m6
    pmaddubsw        m2, m15, m4
    pmaddubsw        m4, m15, m7
    pmulhrsw         m2, m14
    pmulhrsw         m4, m14
    packsswb         m2, m2
    packsswb         m4, m4
    ; splice the blended left-edge pixels back in front of the untouched
    ; remainder of each row
    shufps           m2, m3, q3210
    shufps           m4, m6, q3210
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw        m3, m4, m2
    punpckhbw        m4, m2
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+12]  ; pointer to the v-blend weights
    pmaddubsw        m7, [r5], m4
    pmaddubsw        m4, [r5], m3
%else
    pmaddubsw        m7, m8, m4             ; m8 = v-blend weights (reloaded with pb_17_27 below)
    pmaddubsw        m4, m8, m3
%endif
    pmulhrsw         m7, m14
    pmulhrsw         m4, m14
    packsswb         m4, m7
    ; sign-extend the blended grain bytes to words
    pxor             m2, m2
    pcmpgtb          m7, m2, m4
    punpcklbw        m3, m4, m7
    punpckhbw        m4, m7

    ; src
    mova             m0, [srcq]
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]: gather one scaling byte per pixel; gathering from
    ; base-1 puts the wanted byte into each word's high half, hence the
    ; psrlw by 8 afterwards
%if ARCH_X86_32
    vpgatherdw       m5, m0, scalingq-1, r0, r5, m7
    vpgatherdw       m6, m1, scalingq-1, r0, r5, m7
%else
    vpgatherdw       m5, m0, scalingq-1, r13, r14, m7
    vpgatherdw       m6, m1, scalingq-1, r13, r14, m7
%endif
    REPX {psrlw x, 8}, m5, m6

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m3, m5
    pmullw           m4, m6
    pmulhrsw         m3, m11                ; m11 implements the rounded >> scaling_shift
    pmulhrsw         m4, m11

    ; dst = clip_pixel(src, noise); m13/m12 hold the lower/upper bounds
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0                 ; dstq holds (dst - src) here

%if ARCH_X86_32
    ; advance the stored weight pointer by 16 bytes: pb_27_17 -> pb_17_27
    add dword [rsp+5*mmsize+12], mmsize
%else
    mova             m8, [pb_17_27]         ; row-1 vertical blend weights
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    ; advance to the next 16-pixel column; w counts up from -width to 0
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1m
    add            srcq, r4m
%else
    lea            srcq, [src_bakq+wq]
%endif
    xor       dword r8m, 4                  ; toggle the odd/even column phase bit
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+5*mmsize+1*gprsize], 16  ; top_offxy += 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET
2069
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
;                         sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
; with an unaligned 32-bit stack we cannot rely on the caller's argument
; slots, so copy the arguments into "home" slots inside our own aligned
; frame and redefine the rNm accessors (below) to point at them
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
        tmp, src, scaling, h, fg_data, picptr, unused
    mov              r0, r0m
    mov              r1, r2m
    mov              r2, r4m
    mov              r3, r6m
    mov              r4, r7m
    mov [rsp+7*mmsize+3*gprsize], r0        ; new r0m (dst)
    mov [rsp+7*mmsize+5*gprsize], r1        ; new r2m (stride)
    mov [rsp+7*mmsize+7*gprsize], r2        ; new r4m (w)
    mov [rsp+7*mmsize+9*gprsize], r3        ; new r6m (grain_lut)
    mov [rsp+7*mmsize+10*gprsize], r4       ; new r7m (h)

    mov              r0, r8m
    mov              r1, r9m
    mov              r2, r10m
    mov              r4, r11m
    mov              r3, r12m
    mov [rsp+7*mmsize+11*gprsize], r0       ; new r8m (sby)
    mov [rsp+7*mmsize+12*gprsize], r1       ; new r9m (luma)
    mov [rsp+7*mmsize+13*gprsize], r2       ; new r10m (lstride)
    mov [rsp+7*mmsize+14*gprsize], r4       ; new r11m (uv_pl)
    ; note: is_id (r12m) is NOT stored; it stays live in r3 and is
    ; tested directly below
%else
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
        tmp, src, scaling, h, fg_data, picptr, unused
%endif
    mov            srcq, srcm
    mov        fg_dataq, r3m
    mov        scalingq, r5m
%if STACK_ALIGNMENT < mmsize
; remap the rNm argument accessors onto the home slots filled above
%define r0m [rsp+7*mmsize+ 3*gprsize]
%define r1m [rsp+7*mmsize+ 4*gprsize]
%define r2m [rsp+7*mmsize+ 5*gprsize]
%define r3m [rsp+7*mmsize+ 6*gprsize]
%define r4m [rsp+7*mmsize+ 7*gprsize]
%define r5m [rsp+7*mmsize+ 8*gprsize]
%define r6m [rsp+7*mmsize+ 9*gprsize]
%define r7m [rsp+7*mmsize+10*gprsize]
%define r8m [rsp+7*mmsize+11*gprsize]
%define r9m [rsp+7*mmsize+12*gprsize]
%define r10m [rsp+7*mmsize+13*gprsize]
%define r11m [rsp+7*mmsize+14*gprsize]
%define r12m [rsp+7*mmsize+15*gprsize]
%endif
    ; PIC base pointer for RODATA accesses; also saved into the r5m slot
    ; so loop code can reload it after r5 is repurposed
    LEA              r5, pb_mask
%define base r5-pb_mask
    mov             r5m, r5
%else
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
    lea              r8, [pb_mask]
%define base r8-pb_mask
%endif
    ; m11 = pmulhrsw factor implementing the rounded >> scaling_shift
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    movd             m3, [base+mul_bits+r6*2-14]
    ; clip range selection: index 2*clip_flag is used for max when is_id
    ; is set; note min is loaded with the unadjusted index (before the
    ; cmovne), max with the possibly adjusted one
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    lea            tmpd, [r6d*2]
%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
    test             r3, r3                 ; r3 still holds is_id (see above)
%else
    cmp      dword r12m, 0                      ; is_idm
%endif
    movd             m5, [base+min+r6*2]
    cmovne          r6d, tmpd
    movd             m4, [base+max+r6*2]
    ; broadcast the three words across all lanes
    punpcklwd        m3, m3
    punpcklwd        m5, m5
    punpcklwd        m4, m4
    pshufd           m3, m3, q0000
    pshufd           m5, m5, q0000
    pshufd           m4, m4, q0000
    SCRATCH           3, 11, 0              ; m11 = scaling multiplier
    SCRATCH           4, 12, 1              ; m12 = max pixel value
    SCRATCH           5, 13, 2              ; m13 = min pixel value

    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl
2154
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif

%if %1
    ; not-csfl: build the luma/chroma mixing constants for the selected
    ; uv plane: m14 = interleaved {uv_luma_mult, uv_mult} byte pairs
    ; (for pmaddubsw), m15 = broadcast uv_offset
    mov             r6d, dword r11m
    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
    punpcklbw        m6, m1, m0
    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
    punpcklwd        m6, m6
    punpcklwd        m7, m7
    pshufd           m6, m6, q0000
    pshufd           m7, m7, q0000
    SCRATCH           6, 14, 3
    SCRATCH           7, 15, 4
%endif

    mov            sbyd, r8m
    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
    test       overlapd, overlapd
    jz %%no_vertical_overlap
    ; m8 = pw_1024 (pmulhrsw rounder), m9 = horizontal overlap blend
    ; weights (pb_23_22_h when subsampled horizontally, else pb_27_17_17_27)
%if ARCH_X86_32
%if %2
    mova             m1, [base+pb_23_22_h]
%else
    mova             m1, [base+pb_27_17_17_27]
%endif
    mova             m0, [base+pw_1024]
%else
%if %2
    mova             m1, [pb_23_22_h]
%else
    mova             m1, [pb_27_17_17_27]
%endif
    mova             m0, [pw_1024]
%endif
    SCRATCH           0, 8, 5
    SCRATCH           1, 9, 6
    test           sbyd, sbyd
    jnz %%vertical_overlap
    ; fall-through

%%no_vertical_overlap:
    mov             r8m, overlapd
    ; derive the per-superblock-row grain PRNG seed from sby and the
    ; frame seed
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
    imul           seed, (173 << 24) | 37
%else
    imul           seed, sbyd, (173 << 24) | 37
%endif
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
%define luma_bakq lumaq

    mov              wq, r4m
%if %3
    shl           r10mp, 1                  ; ss_ver: luma stride covers 2 luma rows per chroma row
%endif
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak

    mov        lstrideq, r10mp
%endif

    ; point src_bak/luma_bak at the row ends and let w count upward from
    ; -width to 0, so [bak+w] addresses the current 16-pixel column
    mov           lumaq, r9mp
    lea        src_bakq, [srcq+wq]
    lea       luma_bakq, [lumaq+wq*(1+%2)]  ; luma advances 2x per column when ss_hor
    neg              wq
    sub            r0mp, srcq               ; store dst as (dst - src) so stores
                                            ; below can address [dstq+srcq]
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov            r11m, luma_bakq
    mov             r4m, wq

    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
    mov           r11mp, src_bakq
    mov           r12mp, strideq
%endif
2246
%%loop_x:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; advance the grain LFSR by one step; the parity of the tapped state
    ; bits (seeb/seeh) decides whether the shifted-in top bit is set
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; extract 4-bit x/y grain offsets from the seed, scaled for the
    ; chroma subsampling (%2 = ss_hor, %3 = ss_ver)
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
%endif

%%loop_x_odd:
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y:
    ; src
%if ARCH_X86_32
    mov           lumaq, r9mp
%endif
%if %2
    ; ss_hor: fold 32 luma pixels down to 16 chroma positions:
    ; pmaddubsw with pb_1 sums each horizontal byte pair, pavgw vs 0
    ; performs the rounded halving
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m                ; reload PIC base
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; not-csfl: value used for the scaling lookup is
    ; clip((luma*uv_luma_mult + chroma*uv_mult) >> 6 + uv_offset)
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[luma_src]: gathered from base-1, wanted byte lands in each
    ; word's high half, shifted down below
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5
    vpgatherdw       m5, m6, scalingq-1, r0, r5
%else
    vpgatherdw       m7, m4, scalingq-1, r12, r2
    vpgatherdw       m5, m6, scalingq-1, r12, r2
%endif
    REPX {psrlw x, 8}, m7, m5

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]; sign-extend bytes to words
    movu             m3, [grain_lutq+offxyq+ 0]
    pcmpgtb          m6, m2, m3
    punpcklbw        m2, m3, m6
    punpckhbw        m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise); m13/m12 = lower/upper bound
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0                 ; dstq holds (dst - src)

%if ARCH_X86_32
    add            srcq, r2mp
    ; we already incremented lumaq above
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*2]
%else
    add           lumaq, lstrideq
%endif
%endif
    add      grain_lutq, 82
    dec              hw
    jg %%loop_y

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    ; advance to the next 16-pixel column (w counts up from -width to 0)
    add              wq, 16
    jge %%end
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif
%if %2 == 0
    ; no ss_hor: 32-pixel grain cells are processed as two 16-pixel
    ; halves; bit 2 of r8m tracks which half we are in
    ; adjust top_offxy
%if ARCH_X86_32
    add dword [rsp+7*mmsize+1*gprsize], 16
%else
    add            r11d, 16
%endif
    add          offxyd, 16
    btc       dword r8m, 2
    jc %%loop_x_even
    test      dword r8m, 2
    jz %%loop_x_odd
    jmp %%loop_x_odd_v_overlap
%%loop_x_even:
%endif
    test      dword r8m, 1                  ; left overlap enabled?
    jz %%loop_x

    ; r8m = sbym
    test      dword r8m, 2
    jne %%loop_x_hv_overlap
2435
    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
%if ARCH_X86_32
    ; remember the previous column's offxy as left_offxy (spilled on the
    ; 32-bit stack); +16 steps past the 16 pixels just processed when
    ; horizontally subsampled
%if %2
    lea              r6, [offxyd+16]
    mov [rsp+7*mmsize+0*gprsize], r6
%else
    mov [rsp+7*mmsize+0*gprsize], offxyd
%endif

    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut

    mov            seed, r3m
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

%if %2
    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
%else
    mov     left_offxyd, offyd
%endif
%endif
    ; advance the grain LFSR (same step as in %%loop_x)
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx

    mov          offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    ; src
%if ARCH_X86_32
    mov           lumaq, r9mp
%endif
%if %2
    ; ss_hor: rounded average of horizontal luma pairs (pmaddubsw pb_1
    ; sums pairs, pavgw vs 0 halves with rounding)
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m                ; reload PIC base
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; not-csfl: mix luma and chroma with uv_luma_mult/uv_mult (m14),
    ; shift, add uv_offset (m15), and clip via pack+unpack
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5
    vpgatherdw       m5, m6, scalingq-1, r0, r5
%else
    vpgatherdw       m7, m4, scalingq-1, r12, r2
    vpgatherdw       m5, m6, scalingq-1, r12, r2
%endif
    REPX {psrlw x, 8}, m7, m5

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    movu             m4, [grain_lutq+offxyq+ 0]
%if ARCH_X86_32
    mov              r0, [rsp+7*mmsize+0*gprsize]
    movd             m2, [grain_lutq+r0+ 0]  ; left neighbor's edge column
%else
    movd             m2, [grain_lutq+left_offxyq+ 0]
%endif
    ; blend the left neighbor's grain into our first pixels: weights in
    ; m9, rounding via pmulhrsw with m8 (pw_1024), then splice the
    ; blended bytes in front of the untouched remainder (shufps)
    punpcklbw        m2, m4
    pmaddubsw        m3, m9, m2
    pmulhrsw         m3, m8
    packsswb         m3, m3
    shufps           m3, m4, q3210
    ; sign-extend grain bytes to words
    pxor             m4, m4
    pcmpgtb          m4, m3
    punpcklbw        m2, m3, m4
    punpckhbw        m3, m4

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0                 ; dstq holds (dst - src)

%if ARCH_X86_32
    add            srcq, r2mp
    ; lumaq has already been incremented above
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*2]
%else
    add           lumaq, lstrideq
%endif
%endif
    add      grain_lutq, 82
    dec              hw
    jg %%loop_y_h_overlap

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    ; next 16-pixel column
    add              wq, 16
    jge %%end
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif
%if %2 == 0
    xor       dword r8m, 4                  ; toggle odd/even column phase
    ; adjust top_offxyd
%if ARCH_X86_32
    add dword [rsp+7*mmsize+1*gprsize], 16
%else
    add            r11d, 16
%endif
    add          offxyd, 16
%endif

    ; r8m = sbym
    test      dword r8m, 2
%if %2
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap
%else
    jne %%loop_x_odd_v_overlap
    jmp %%loop_x_odd
%endif

%%end:
    RET
2657
%%vertical_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif

    or         overlapd, 2                  ; top_overlap: overlap & 2
    mov             r8m, overlapd
    movzx          sbyd, sbyb
    ; derive two 16-bit seeds at once -- one for the current superblock
    ; row and one for the row above -- packed into a single register;
    ; the duplicated constants (0x00010001 etc.) run both hashes in
    ; parallel on the two halves
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul           tmpd, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add            tmpd, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and            tmpd, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, tmpd
%if ARCH_X86_32
    xor            sbyd, seed               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%if %3
    shl           r10mp, 1                  ; ss_ver: luma stride covers 2 luma rows per chroma row
%endif
%else
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak

    mov        lstrideq, r10mp
%endif

    ; pointer setup mirrors %%no_vertical_overlap: row-end pointers plus
    ; a w counting up from -width to 0, and dst stored as (dst - src)
    mov           lumaq, r9mp
    lea        src_bakq, [srcq+wq]
    lea       luma_bakq, [lumaq+wq*(1+%2)]
    neg              wq
    sub            r0mp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov            r11m, luma_bakq
    mov             r4m, wq

    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
    mov           r11mp, src_bakq
    mov           r12mp, strideq
%endif
2715
%%loop_x_v_overlap:
%if ARCH_X86_32
    mov            seed, r3m
    xor            tmpd, tmpd
%endif
    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
    ; step both 16-bit LFSRs at once (top seed in the low half, current
    ; seed in the high half); setp captures each parity bit
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    mov           offxd, seed
    mov           offyd, seed
%endif
    ; extract offsets for both the current and the top block (packed as
    ; (cur << 16) | top, hence the 0xf000f masks)
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
%endif

    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if ARCH_X86_32
    mov [rsp+7*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif

%%loop_x_odd_v_overlap:
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov              r5, r5m                ; reload PIC base
%endif
    ; m1 = vertical blend weights: single-row pb_23_22 when vertically
    ; subsampled, otherwise pb_27_17 for row 0 (pb_17_27 for row 1, below)
%if %3
    mova             m1, [PIC_ptr(pb_23_22)]
%else
    mova             m1, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
    mov           lumaq, r9mp
%endif
%if %2
    ; ss_hor: rounded average of horizontal luma pairs
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; not-csfl: mix luma/chroma with m14, add uv_offset (m15), clip
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5
    vpgatherdw       m5, m6, scalingq-1, r0, r5
%else
    vpgatherdw       m7, m4, scalingq-1, r12, r2
    vpgatherdw       m5, m6, scalingq-1, r12, r2
%endif
    REPX {psrlw x, 8}, m7, m5

    ; grain = grain_lut[offy+y][offx+x]
    ; vertically blend the top block's grain row (m4) into ours (m3)
    ; with weights m1, rounding via pmulhrsw m8 (pw_1024)
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r0, [rsp+7*mmsize+1*gprsize]
    movu             m4, [grain_lutq+r0]
%else
    movu             m4, [grain_lutq+top_offxyq]
%endif
    punpckhbw        m6, m4, m3
    punpcklbw        m4, m3
    pmaddubsw        m2, m1, m6
    pmaddubsw        m3, m1, m4
    pmulhrsw         m2, m8
    pmulhrsw         m3, m8
    packsswb         m3, m2
    ; sign-extend the blended grain bytes to words
    pxor             m6, m6
    pcmpgtb          m6, m3
    punpcklbw        m2, m3, m6
    punpckhbw        m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; unpack chroma_source
    pxor             m4, m4
    punpckhbw        m6, m0, m4
    punpcklbw        m0, m4                 ; m0-1: src as word

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m6, m3
    pmaxsw           m0, m13
    pmaxsw           m6, m13
    pminsw           m0, m12
    pminsw           m6, m12
    packuswb         m0, m6
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0                 ; dstq holds (dst - src)

    dec              hw
    je %%end_y_v_overlap
%if ARCH_X86_32
    add            srcq, r2mp
    ; lumaq has already been incremented above
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*2]
%else
    add           lumaq, lstrideq
%endif
%endif
    add      grain_lutq, 82
%if %3 == 0
    ; no ss_ver: two rows get the vertical blend (27/17 then 17/27);
    ; bit 16 of hd counts which of the two rows we are on
    btc              hd, 16
%if ARCH_X86_32
    mov              r5, r5m
%endif
    mova             m1, [PIC_ptr(pb_17_27)]
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    ; next 16-pixel column
    add              wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif

%if %2
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
%if ARCH_X86_32
    add dword [rsp+7*mmsize+1*gprsize], 16  ; top_offxy += 16
%else
    add      top_offxyd, 16
%endif
    add          offxyd, 16
    btc       dword r8m, 2                  ; toggle odd/even column phase
    jnc %%loop_x_odd_v_overlap
%endif
2949
%%loop_x_hv_overlap:
    ; combined horizontal + vertical overlap: needs the left, top and
    ; top-left neighbors' offsets in addition to our own
%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

    mov              r6, [rsp+7*mmsize+1*gprsize]   ; top_offxy
%if %2
    lea              r0, [r3d+16]
    add              r6, 16
    mov [rsp+7*mmsize+0*gprsize], r0        ; left_offxy
%else
    mov [rsp+7*mmsize+0*gprsize], r3        ; left_offxy
%endif
    mov [rsp+7*mmsize+2*gprsize], r6        ; topleft_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

    mov            seed, r3m
    xor            tmpd, tmpd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

%if %2
    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offxyq+16]
%else
    mov  topleft_offxyq, top_offxyq
    mov     left_offxyq, offxyq
%endif

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; step both 16-bit LFSRs (top in the low half, cur in the high half)
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov           offxd, seed
    mov           offyd, seed
%endif
    ; extract packed (cur << 16) | top offsets
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif

    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if ARCH_X86_32
    mov [rsp+7*mmsize+1*gprsize], top_offxyd
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%if ARCH_X86_32
    mov              r5, r5m                ; reload PIC base
%endif
    ; m3 = vertical blend weights (see %%loop_x_odd_v_overlap)
%if %3
    mova             m3, [PIC_ptr(pb_23_22)]
%else
    mova             m3, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    ; m2 = current row, m6 = top row, m4 = left col, m1 = top-left col
%if ARCH_X86_32
    mov              r0, [rsp+7*mmsize+2*gprsize]       ; topleft_offxy
    mov              r5, [rsp+7*mmsize+1*gprsize]       ; top_offxy
    movd             m1, [grain_lutq+r0]
    mov              r0, [rsp+7*mmsize+0*gprsize]       ; left_offxy
%else
    movd             m1, [grain_lutq+topleft_offxyq]
%endif
    movu             m2, [grain_lutq+offxyq]
%if ARCH_X86_32
    movu             m6, [grain_lutq+r5]
    movd             m4, [grain_lutq+r0]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    ; weights m9, rounding m8 (pw_1024); blended bytes are spliced in
    ; front of the untouched remainder via the shufps pair
    punpcklbw        m1, m6
    punpcklbw        m4, m2
    pmaddubsw        m0, m9, m1
    pmaddubsw        m1, m9, m4
    REPX {pmulhrsw x, m8}, m0, m1
    packsswb         m0, m1
    shufps           m4, m0, m2, q3232
    shufps           m0, m6, q3210
    ; followed by v interpolation (top | cur -> cur)
    ; (continues past the end of this chunk)
3067    punpcklbw        m2, m0, m4
3068    punpckhbw        m0, m4
3069    pmaddubsw        m4, m3, m0
3070    pmaddubsw        m1, m3, m2
3071    pmulhrsw         m4, m8
3072    pmulhrsw         m1, m8
3073    packsswb         m1, m4
3074
3075    ; src
3076%if ARCH_X86_32
3077    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3078
3079    mov           lumaq, r9mp
3080%endif
3081%if %2
3082    mova             m4, [lumaq+ 0]
3083    mova             m6, [lumaq+16]
3084    mova             m0, [srcq]
3085%if ARCH_X86_32
3086    add           lumaq, r10mp
3087    mov            r9mp, lumaq
3088    mov              r5, r5m
3089    movd             m7, [base+pb_1]
3090%else
3091    movd             m7, [pb_1]
3092%endif
3093    pshufd           m7, m7, q0000
3094    pxor             m2, m2
3095    pmaddubsw        m4, m7
3096    pmaddubsw        m6, m7
3097    pavgw            m4, m2
3098    pavgw            m6, m2
3099%else
3100    mova             m4, [lumaq]
3101    mova             m0, [srcq]
3102%if ARCH_X86_32
3103    add           lumaq, r10mp
3104    mov            r9mp, lumaq
3105%endif
3106    pxor             m2, m2
3107%endif
3108
3109%if %1
3110%if %2
3111    packuswb         m4, m6                 ; luma
3112%endif
3113    punpckhbw        m6, m4, m0
3114    punpcklbw        m4, m0                 ; { luma, chroma }
3115    pmaddubsw        m6, m14
3116    pmaddubsw        m4, m14
3117    psraw            m6, 6
3118    psraw            m4, 6
3119    paddw            m6, m15
3120    paddw            m4, m15
3121    packuswb         m4, m6                 ; pack+unpack = clip
3122    punpckhbw        m6, m4, m2
3123    punpcklbw        m4, m2
3124%elif %2 == 0
3125    punpckhbw        m6, m4, m2
3126    punpcklbw        m4, m2
3127%endif
3128
3129    ; scaling[src]
3130%if ARCH_X86_32
3131    vpgatherdw       m7, m4, scalingq-1, r0, r5
3132    vpgatherdw       m5, m6, scalingq-1, r0, r5
3133%else
3134%if %3
3135    vpgatherdw       m7, m4, scalingq-1, r2, r12
3136    vpgatherdw       m5, m6, scalingq-1, r2, r12
3137%else
3138    vpgatherdw       m7, m4, scalingq-1, r2, r13
3139    vpgatherdw       m5, m6, scalingq-1, r2, r13
3140%endif
3141%endif
3142    REPX {psrlw x, 8}, m7, m5
3143
3144    ; unpack grain
3145    pxor             m4, m4
3146    pcmpgtb          m4, m1
3147    punpcklbw        m2, m1, m4
3148    punpckhbw        m1, m4
3149
3150    ; noise = round2(scaling[src] * grain, scaling_shift)
3151    pmullw           m2, m7
3152    pmullw           m1, m5
3153    pmulhrsw         m2, m11
3154    pmulhrsw         m1, m11
3155
3156%if ARCH_X86_32
3157    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
3158%endif
3159
3160    ; unpack chroma source
3161    pxor             m4, m4
3162    punpckhbw        m5, m0, m4
3163    punpcklbw        m0, m4                 ; m0-1: src as word
3164
3165    ; dst = clip_pixel(src, noise)
3166    paddw            m0, m2
3167    paddw            m5, m1
3168    pmaxsw           m0, m13
3169    pmaxsw           m5, m13
3170    pminsw           m0, m12
3171    pminsw           m5, m12
3172    packuswb         m0, m5
3173    movifnidn      dstq, dstmp
3174    mova    [dstq+srcq], m0
3175
3176%if ARCH_X86_32
3177    add            srcq, r2mp
3178    ; lumaq has been adjusted above already
3179%else
3180    add            srcq, r12mp
3181%if %3
3182    lea           lumaq, [lumaq+lstrideq*(1+%2)]
3183%else
3184    add           lumaq, r10mp
3185%endif
3186%endif
3187    add      grain_lutq, 82
3188    dec              hw
3189%if %3
3190    jg %%loop_y_h_overlap
3191%else
3192    jle %%end_y_hv_overlap
3193%if ARCH_X86_32
3194    mov              r5, r5m
3195%endif
3196    mova             m3, [PIC_ptr(pb_17_27)]
3197    btc              hd, 16
3198    jnc %%loop_y_hv_overlap
3199%if ARCH_X86_64
3200    mov        lstrideq, r10mp
3201%endif
3202    jmp %%loop_y_h_overlap
3203%%end_y_hv_overlap:
3204%if ARCH_X86_64
3205    mov        lstrideq, r10mp
3206%endif
3207%endif
3208
3209%if ARCH_X86_32
3210    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
3211
3212    mov              wq, r4m
3213%endif
3214    add              wq, 16
3215    jge %%end_hv
3216%if ARCH_X86_32
3217    mov            srcq, r1mp
3218    mov           lumaq, r11mp
3219%else
3220    mov            srcq, r11mp
3221%endif
3222    lea           lumaq, [luma_bakq+wq*(1+%2)]
3223    add            srcq, wq
3224%if ARCH_X86_32
3225    mov             r4m, wq
3226    mov             r9m, lumaq
3227%endif
3228%if %2
3229    jmp %%loop_x_hv_overlap
3230%else
3231%if ARCH_X86_32
3232    add dword [rsp+7*mmsize+1*gprsize], 16
3233%else
3234    add      top_offxyd, 16
3235%endif
3236    add          offxyd, 16
3237    xor       dword r8m, 4
3238    jmp %%loop_x_odd_v_overlap
3239%endif
3240
3241%%end_hv:
3242    RET
3243%endmacro
3244
3245    %%FGUV_32x32xN_LOOP 1, %2, %3
3246.csfl:
3247    %%FGUV_32x32xN_LOOP 0, %2, %3
3248%endmacro
3249
; Instantiate the chroma film-grain functions for each supported chroma
; subsampling layout. FGUV_FN's three arguments are the function-name
; suffix and two flags that the loop macro above reads as %2/%3
; (horizontal/vertical subsampling steps, judging by their use in
; address math like `wq*(1+%2)` and `lstrideq*(1+%2)` — the macro
; header is outside this file chunk, so confirm against its definition).
FGUV_FN 420, 1, 1

; When the ABI's guaranteed stack alignment is smaller than the vector
; register size, arguments live in (re-declared) stack slots rather than
; registers; DECLARE_ARG must be repeated after each instantiation so the
; next one starts from a clean argument mapping.
; NOTE(review): DECLARE_ARG is defined elsewhere in this file — verify
; the exact reason for re-declaring between instantiations there.
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

; 4:2:2 — horizontal subsampling only.
FGUV_FN 422, 1, 0

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

; 4:4:4 — no chroma subsampling.
FGUV_FN 444, 0, 0
3263