1; Copyright © 2019, VideoLAN and dav1d authors
2; Copyright © 2019, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "ext/x86/x86inc.asm"
27
28%if ARCH_X86_64
29
; Read-only constants. The code below addresses them relative to pb_mask
; through a per-function "base" define (base = <reg>-pb_mask).
;
; Layout note: hmul_bits, round and mul_bits are emitted back-to-back and
; together form one descending power-of-two word table (32768 ... 16).
; generate_grain_uv_420.ar0 indexes hmul_bits by ar_coeff_shift, which for
; indices >= 4 reads into the following tables, so the order of these three
; lines must be kept. Likewise the 8-byte load from byte_blend+1 in
; generate_grain_y.ar2 takes its last byte from pw_seed_xor.
30SECTION_RODATA 32
31pw_1024: times 16 dw 1024
; parity lookup for pshufb: entry i is 0x80 when i has an odd number of set bits
32pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
; seed bit masks for four consecutive LFSR steps (each entry is the previous one << 1)
33rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
; byte select mask (only byte 3 set) used to merge a filtered pixel back into a row
34byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
; per-plane seed xor constants, indexed as pw_seed_xor+uv*4
35pw_seed_xor: times 2 dw 0xb524
36             times 2 dw 0x49d8
; 0xffff0000: gather enable mask (sign bit set) and, inverted, a low-16-bit mask
37pd_m65536: dd ~0xffff
38pb_23_22: times 2 db 23, 22
; all-ones bytes: pmaddubsw multiplier that sums adjacent byte pairs
39pb_1: times 4 db 1
; pmulhuw multipliers giving seed >> 1, 2, 3, 4; hmul_bits+4 (8192) is also used
; with pmulhrsw as (x + 2) >> 2
40hmul_bits: dw 32768, 16384, 8192, 4096
; pmulhrsw multipliers indexed by grain_scale_shift
41round: dw 2048, 1024, 512
; per-lane multipliers in the LFSR step; also indexed by scaling_shift (mul_bits-14)
42mul_bits: dw 256, 128, 64, 32, 16
; powers of two indexed by ar_coeff_shift: the -12 byte offset selects
; 1 << (shift - 1) (rounding term), the -10 byte offset selects 1 << shift
43round_vals: dw 32, 64, 128, 256, 512
; pixel clamp limits, indexed by clip_to_restricted_range (max+idx*4, min+idx*2)
44max: dw 255, 240, 235
45min: dw 0, 16
46pb_27_17_17_27: db 27, 17, 17, 27
47pw_1: dw 1
48
; JMP_TABLE function, n0, n1, ...
; Emits a table named <function>_table with one 32-bit entry per listed number
; N. Each entry is the offset of the function's local label .arN relative to
; the start of the table, so a dispatcher loads an entry, sign-extends it and
; adds the table address back to obtain the jump target.
49%macro JMP_TABLE 1-*
    ; expose the table label under a name derived from the function name
50    %xdefine %1_table %%table
51    %xdefine %%base %1_table
    ; mangled symbol name of the function whose .ar labels are referenced
52    %xdefine %%prefix mangle(private_prefix %+ _%1)
53    %%table:
    ; one entry for every argument after the function name
54    %rep %0 - 1
55        dd %%prefix %+ .ar%2 - %%base
        ; shift the argument list so the next number moves into second place
56        %rotate 1
57    %endrep
58%endmacro
59
; Dispatch tables for the auto-regression stage of both grain generators;
; entry N is the offset of label .arN (N = value of FGData.ar_coeff_lag).
; The tables hold dwords, hence the 4-byte alignment.
60ALIGN 4
61JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
62JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
63
; Field offsets of the film grain parameter block passed in as fg_data.
; NOTE(review): presumably this has to mirror the C-side structure byte for
; byte; confirm against the C definition before changing any size here.
64struc FGData
65    .seed:                      resd 1
66    .num_y_points:              resd 1
67    .y_points:                  resb 14 * 2
68    .chroma_scaling_from_luma:  resd 1
69    .num_uv_points:             resd 2
70    .uv_points:                 resb 2 * 10 * 2
71    .scaling_shift:             resd 1
    ; selects the .ar0-.ar3 path through the jump tables
72    .ar_coeff_lag:              resd 1
73    .ar_coeffs_y:               resb 24
    ; two sets of 28 bytes; the code indexes them with uv*28
74    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    ; stored as a qword: it is used directly as the memory operand of psrad,
    ; which takes its shift count from the low 64 bits of that operand
75    .ar_coeff_shift:            resq 1
76    .grain_scale_shift:         resd 1
77    .uv_mult:                   resd 2
78    .uv_luma_mult:              resd 2
79    .uv_offset:                 resd 2
80    .overlap_flag:              resd 1
81    .clip_to_restricted_range:  resd 1
82endstruc
83
; External lookup table of grain values. The generators below gather from it
; with an 11-bit index scaled by 2 and keep the low 16 bits of each load.
; NOTE(review): that implies 16-bit entries; confirm against the C definition.
84cextern gaussian_sequence
85
86SECTION .text
87
; generate_grain_y(buf, fg_data)
; Fills the 73x82 byte area at buf with pseudo-random grain: a 16-bit LFSR
; seeded from fg_data.seed is advanced four steps per iteration, the top
; 11 bits of every new seed index gaussian_sequence, and the fetched value is
; scaled down by a rounded shift of 4 + fg_data.grain_scale_shift and stored
; as a signed byte. Control then jumps to the auto-regression path
; (.ar0-.ar3) selected by fg_data.ar_coeff_lag.
; cglobal arguments (x86inc): 2 function arguments, 9 GPRs, 16 vector regs.
88INIT_XMM avx2
89cglobal generate_grain_y, 2, 9, 16, buf, fg_data
    ; r4 anchors every rodata access: [base+sym] is [r4 + (sym - pb_mask)]
90    lea              r4, [pb_mask]
91%define base r4-pb_mask
92    movq            xm1, [base+rnd_next_upperbit_mask]
93    movq            xm4, [base+mul_bits]
94    movq            xm7, [base+hmul_bits]
95    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
    ; xm8 = 2048 >> grain_scale_shift, the pmulhrsw multiplier that performs
    ; the rounded right shift by 4 + grain_scale_shift
96    vpbroadcastw    xm8, [base+round+r2*2]
97    mova            xm5, [base+pb_mask]
98    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    ; xm9 = 0xffff0000 in every dword: serves as the gather enable mask (sign
    ; bit set in each lane) and, inverted by pandn, as a low-16-bit mask
99    vpbroadcastd    xm9, [base+pd_m65536]
    ; point bufq at the end of the 73x82 area and let r2 count up from
    ; -73*82 towards zero
100    mov              r2, -73*82
101    sub            bufq, r2
102    lea              r3, [gaussian_sequence]
    ; Each iteration derives the next four seeds from the current one (held in
    ; the low four words of xm0) and stores four grain bytes.
    ; NOTE(review): 73*82 is not a multiple of 4, so the last iteration starts
    ; at r2 = -2 and its 4-byte store ends two bytes past the 73x82 area;
    ; confirm that the caller's buffer provides that slack.
103.loop:
104    pand            xm2, xm0, xm1
105    psrlw           xm3, xm2, 10
106    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
107    pmullw          xm2, xm4            ; bits 0x0f00 are set
108    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
109    psllq           xm6, xm2, 30
110    por             xm2, xm6
111    psllq           xm6, xm2, 15
112    por             xm2, xm6            ; aggregate each bit into next seed's high bit
113    pmulhuw         xm3, xm0, xm7
114    por             xm2, xm3            ; 4 next output seeds
    ; the newest (4th) seed becomes the state for the next iteration
115    pshuflw         xm0, xm2, q3333
    ; table index = top 11 bits of each seed
116    psrlw           xm2, 5
117    pmovzxwd        xm3, xm2
    ; vpgatherdd clears its mask register, so hand it a copy of xm9
118    mova            xm6, xm9
119    vpgatherdd      xm2, [r3+xm3*2], xm6
    ; each gathered dword also contains the following entry; drop the high half
120    pandn           xm2, xm9, xm2
121    packusdw        xm2, xm2
122    pmulhrsw        xm2, xm8
123    packsswb        xm2, xm2
124    movd      [bufq+r2], xm2
125    add              r2, 4
126    jl .loop
127
128    ; auto-regression code
    ; table entries are label offsets relative to the table itself, so the
    ; table address is added back before jumping
129    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
130    movsxd           r2, [base+generate_grain_y_avx2_table+r2*4]
131    lea              r2, [r2+base+generate_grain_y_avx2_table]
132    jmp              r2
133
; Lag-1 auto-regression over rows 3..72, columns 3..78 of the grain buffer
; (bufq enters pointing at buf + 73*82):
;   buf[y][x] = clip8(buf[y][x] +
;                     ((cf0*TL + cf1*T + cf2*TR + cf3*L + rnd) >> shift))
; TL/T/TR are the three neighbours in the row above, L is the already filtered
; left neighbour and rnd = 1 << (shift - 1). The top-row terms are computed
; for four pixels at once; the left dependency is resolved pixel by pixel in
; scalar code.
134.ar1:
135    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
136    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
137    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
138    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
    ; fg_data is no longer needed from here on; its register is reused as h
139    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    ; replace cf3 (handled in scalar code) by 1 so that the pair (cf2, 1)
    ; multiplies (top/right, rnd) and folds the rounding term into pmaddwd
140    pinsrb          xm4, [pb_1], 3
141    pmovsxbw        xm4, xm4
142    pshufd          xm5, xm4, q1111
143    pshufd          xm4, xm4, q0000
    ; val3 shares r4 with the rodata base; this is the last base-relative
    ; access before the loop starts overwriting val3
144    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
    ; bufq = buf + 82*3 + 79; x runs from -76 up to -1
145    sub            bufq, 82*73-(82*3+79)
146    mov              hd, 70
147    mov            mind, -128
148    mov            maxd, 127
149.y_loop_ar1:
150    mov              xq, -76
    ; left neighbour of the first pixel in the row
151    movsx         val3d, byte [bufq+xq-1]
152.x_loop_ar1:
153    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
154    pmovsxbw        xm2, [bufq+xq-82+0]     ; top
155    pmovsxbw        xm1, [bufq+xq-82+1]     ; top/right
156    punpcklwd       xm0, xm2
157    punpcklwd       xm1, xm3
158    pmaddwd         xm0, xm4
159    pmaddwd         xm1, xm5
    ; xm0 = cf0*TL + cf1*T + cf2*TR + rnd for four consecutive pixels
160    paddd           xm0, xm1
161.x_loop_ar1_inner:
162    movd          val0d, xm0
163    psrldq          xm0, 4
164    imul          val3d, cf3d
165    add           val3d, val0d
    ; the sar form needs its count in cl, so it is only used where the shift
    ; argument register provides it; WIN64 uses the three-operand sarx instead
166%if WIN64
167    sarx          val3d, val3d, shiftd
168%else
169    sar           val3d, shiftb
170%endif
171    movsx         val0d, byte [bufq+xq]
172    add           val3d, val0d
    ; clamp to [-128, 127] using the sign of the comparison result
173    cmp           val3d, maxd
174    cmovns        val3d, maxd
175    cmp           val3d, mind
176    cmovs         val3d, mind
177    mov  byte [bufq+xq], val3b
178    ; keep val3d in-place as left for next x iteration
179    inc              xq
180    jz .x_loop_ar1_end
    ; recompute the vector part every four pixels
181    test             xq, 3
182    jnz .x_loop_ar1_inner
183    jmp .x_loop_ar1
184
185.x_loop_ar1_end:
186    add            bufq, 82
187    dec              hd
188    jg .y_loop_ar1
    ; lag 0: the grain is left unfiltered
189.ar0:
190    RET
191
; Lag-2 auto-regression over the same 70x76 region as .ar1 (rows 3..72,
; columns 3..78). Twelve coefficients are used: cf0-4 weight row y-2
; (x-2..x+2), cf5-9 weight row y-1 (x-2..x+2) and cf10-11 weight the two
; already filtered pixels to the left. Coefficients are kept as word pairs
; broadcast across a register so that one pmaddwd handles two taps for four
; pixels: xm8=(cf0,cf1) xm9=(cf2,cf3) xm10=(cf4,cf5) xm11=(cf6,cf7)
; xm12=(cf8,cf9) xm13=(cf10,cf11).
192.ar2:
193    DEFINE_ARGS buf, fg_data, shift
194    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
195    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
    ; mask with byte 2 set: the position of the current pixel inside the
    ; 8-byte row window that starts at x-2. The load covers 8 bytes from
    ; byte_blend+1, so its last byte comes from the constant that follows
    ; byte_blend. NOTE(review): that byte appears harmless because the window
    ; is reloaded every four pixels and only its low three bytes are
    ; consumed per step; confirm.
196    movq           xm15, [base+byte_blend+1]
197    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
198    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
199    pmovsxbw        xm9, xm9
200    DEFINE_ARGS buf, fg_data, h, x
201    pshufd         xm12, xm9, q0000
202    pshufd         xm13, xm9, q1111
203    pshufd         xm11, xm8, q3333
204    pshufd         xm10, xm8, q2222
205    pshufd          xm9, xm8, q1111
206    pshufd          xm8, xm8, q0000
    ; widen the rounding term to dwords so it can be added to pmaddwd results
207    pmovzxwd       xm14, xm14
    ; bufq = buf + 82*3 + 79; x runs from -76 up to -1
208    sub            bufq, 82*73-(82*3+79)
209    mov              hd, 70
210.y_loop_ar2:
211    mov              xq, -76
212
    ; vector part: contribution of the two rows above for four pixels
213.x_loop_ar2:
214    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
215    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
216    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
217    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
218    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
219    punpcklwd       xm2, xm0, xm2
220    punpcklwd       xm3, xm4
221    pmaddwd         xm2, xm8
222    pmaddwd         xm3, xm11
223    paddd           xm2, xm3
224
225    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
226    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
227    psrldq          xm6, xm0, 8             ; y=-2,x=[+2,+5]
228    punpcklwd       xm4, xm5
229    punpcklwd       xm6, xm1
230    psrldq          xm7, xm1, 6             ; y=-1,x=[+1,+5]
231    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
232    punpcklwd       xm7, xm1
233    pmaddwd         xm4, xm9
234    pmaddwd         xm6, xm10
235    pmaddwd         xm7, xm12
236    paddd           xm4, xm6
237    paddd           xm2, xm7
238    paddd           xm2, xm4
    ; add the rounding term; xm2 now holds the top sums of four pixels
239    paddd           xm2, xm14
240
241    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    ; scalar-style part: one pixel per pass, since each result feeds the next
242.x_loop_ar2_inner:
243    pmovsxbw        xm1, xm0
    ; dword 0 = cf10*buf[x-2] + cf11*buf[x-1]
244    pmaddwd         xm3, xm1, xm13
245    paddd           xm3, xm2
246    psrldq          xm1, 4                  ; y=0,x=0
247    psrldq          xm2, 4                  ; shift top to next pixel
248    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
249    ; don't packssdw since we only care about one value
250    paddw           xm3, xm1
251    packsswb        xm3, xm3
252    pextrb    [bufq+xq], xm3, 0
    ; write the new pixel into the row window (byte 2), then slide the window
    ; one pixel to the right
253    pslldq          xm3, 2
254    pand            xm3, xm15
255    pandn           xm0, xm15, xm0
256    por             xm0, xm3
257    psrldq          xm0, 1
258    inc              xq
259    jz .x_loop_ar2_end
    ; recompute the vector part every four pixels
260    test             xq, 3
261    jnz .x_loop_ar2_inner
262    jmp .x_loop_ar2
263
264.x_loop_ar2_end:
265    add            bufq, 82
266    dec              hd
267    jg .y_loop_ar2
268    RET
269
; Lag-3 auto-regression over the same 70x76 region as .ar1 (rows 3..72,
; columns 3..78). 24 coefficients are used: cf0-6 weight row y-3 (x-3..x+3),
; cf7-13 row y-2, cf14-20 row y-1 and cf21-23 the three already filtered
; pixels to the left. The broadcast coefficient pairs do not fit in
; registers, so eleven of them live on the stack:
;   slot 0..9 = (cf0,cf1) (cf2,cf3) ... (cf18,cf19), slot 10 = (cf20, 1)
; The "1" in slot 10 is paired with the rounding term in xm14.
; xm13 = words (cf21, cf22, cf23, 1 << shift): multiplying the current pixel
; by 1 << shift adds it to the sum before the final right shift.
270.ar3:
271    DEFINE_ARGS buf, fg_data, shift
    ; reserve 12 16-byte slots; on WIN64 the existing stack frame is extended
    ; by hand and the x86inc bookkeeping is updated so RET releases it again
272%if WIN64
273    SUB             rsp, 16*12
274%assign stack_size_padded (stack_size_padded+16*12)
275%assign stack_size (stack_size+16*12)
276%else
277    ALLOC_STACK   16*12
278%endif
279    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
280    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
    ; mask with byte 3 set: the position of the current pixel inside the
    ; 8-byte row window that starts at x-3
281    movq           xm15, [base+byte_blend]
282    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-7
283    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_y+ 8]   ; cf8-15
284    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
285    pshufd          xm9, xm0, q1111
286    pshufd         xm10, xm0, q2222
287    pshufd         xm11, xm0, q3333
288    pshufd          xm0, xm0, q0000
289    pshufd          xm6, xm1, q1111
290    pshufd          xm7, xm1, q2222
291    pshufd          xm8, xm1, q3333
292    pshufd          xm1, xm1, q0000
293    pshufd          xm3, xm2, q1111
    ; xm13 = cf21, cf22, cf23 in words 0-2 (taken before cf21 is overwritten)
294    psrldq         xm13, xm2, 10
    ; replace cf21 by 1 so that dword 2 becomes the pair (cf20, 1)
295    pinsrw          xm2, [pw_1], 5
296    pshufd          xm4, xm2, q2222
297    pshufd          xm2, xm2, q0000
    ; word 3 of xm13 = 1 << shift, the weight of the current pixel
298    pinsrw         xm13, [base+round_vals+shiftq*2-10], 3
299    mova    [rsp+ 0*16], xm0
300    mova    [rsp+ 1*16], xm9
301    mova    [rsp+ 2*16], xm10
302    mova    [rsp+ 3*16], xm11
303    mova    [rsp+ 4*16], xm1
304    mova    [rsp+ 5*16], xm6
305    mova    [rsp+ 6*16], xm7
306    mova    [rsp+ 7*16], xm8
307    mova    [rsp+ 8*16], xm2
308    mova    [rsp+ 9*16], xm3
309    mova    [rsp+10*16], xm4
310    DEFINE_ARGS buf, fg_data, h, x
    ; bufq = buf + 82*3 + 79; x runs from -76 up to -1
311    sub            bufq, 82*73-(82*3+79)
312    mov              hd, 70
313.y_loop_ar3:
314    mov              xq, -76
315
    ; vector part: contribution of the three rows above for four pixels
316.x_loop_ar3:
317    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
318    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
319    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    ; sign-extend bytes to words: 0 > byte yields 0xff for negative bytes,
    ; which is interleaved as the high byte of every word
320    pxor            xm3, xm3
321    pcmpgtb         xm6, xm3, xm2
322    pcmpgtb         xm5, xm3, xm1
323    pcmpgtb         xm4, xm3, xm0
324    punpckhbw       xm3, xm0, xm4
325    punpcklbw       xm0, xm4
326    punpckhbw       xm4, xm1, xm5
327    punpcklbw       xm1, xm5
328    punpckhbw       xm5, xm2, xm6
329    punpcklbw       xm2, xm6
330
    ; row y-3 (cf0-6), paired up with the first pixel of row y-2 (cf7)
331    psrldq          xm6, xm0, 2
332    psrldq          xm7, xm0, 4
333    psrldq          xm8, xm0, 6
334    psrldq          xm9, xm0, 8
335    palignr        xm10, xm3, xm0, 10
336    palignr        xm11, xm3, xm0, 12
337
338    punpcklwd       xm0, xm6
339    punpcklwd       xm7, xm8
340    punpcklwd       xm9, xm10
341    punpcklwd      xm11, xm1
342    pmaddwd         xm0, [rsp+ 0*16]
343    pmaddwd         xm7, [rsp+ 1*16]
344    pmaddwd         xm9, [rsp+ 2*16]
345    pmaddwd        xm11, [rsp+ 3*16]
346    paddd           xm0, xm7
347    paddd           xm9, xm11
348    paddd           xm0, xm9
349
    ; rest of row y-2 (cf8-13) and the first two pixels of row y-1 (cf14-15)
350    psrldq          xm6, xm1, 2
351    psrldq          xm7, xm1, 4
352    psrldq          xm8, xm1, 6
353    psrldq          xm9, xm1, 8
354    palignr        xm10, xm4, xm1, 10
355    palignr        xm11, xm4, xm1, 12
356    psrldq         xm12, xm2, 2
357
358    punpcklwd       xm6, xm7
359    punpcklwd       xm8, xm9
360    punpcklwd      xm10, xm11
361    punpcklwd      xm12, xm2, xm12
362    pmaddwd         xm6, [rsp+ 4*16]
363    pmaddwd         xm8, [rsp+ 5*16]
364    pmaddwd        xm10, [rsp+ 6*16]
365    pmaddwd        xm12, [rsp+ 7*16]
366    paddd           xm6, xm8
367    paddd          xm10, xm12
368    paddd           xm6, xm10
369    paddd           xm0, xm6
370
    ; rest of row y-1 (cf16-20); the last tap is paired with the rounding term
371    psrldq          xm6, xm2, 4
372    psrldq          xm7, xm2, 6
373    psrldq          xm8, xm2, 8
374    palignr         xm9, xm5, xm2, 10
375    palignr         xm5, xm5, xm2, 12
376
377    punpcklwd       xm6, xm7
378    punpcklwd       xm8, xm9
379    punpcklwd       xm5, xm14
380    pmaddwd         xm6, [rsp+ 8*16]
381    pmaddwd         xm8, [rsp+ 9*16]
382    pmaddwd         xm5, [rsp+10*16]
383    paddd           xm0, xm6
384    paddd           xm8, xm5
385    paddd           xm0, xm8
386
387    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    ; one pixel per pass, since each result feeds the next one
388.x_loop_ar3_inner:
389    pmovsxbw        xm2, xm1
    ; dword 0 = cf21*buf[x-3] + cf22*buf[x-2],
    ; dword 1 = cf23*buf[x-1] + (1 << shift)*buf[x]
390    pmaddwd         xm2, xm13
391    pshufd          xm3, xm2, q1111
392    paddd           xm2, xm3                ; left+cur
393    paddd           xm2, xm0                ; add top
394    psrldq          xm0, 4
395    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
396    ; don't packssdw since we only care about one value
397    packsswb        xm2, xm2
398    pextrb    [bufq+xq], xm2, 0
    ; write the new pixel into the row window (byte 3), then slide the window
    ; one pixel to the right
399    pslldq          xm2, 3
400    pand            xm2, xm15
401    pandn           xm1, xm15, xm1
402    por             xm1, xm2
403    psrldq          xm1, 1
404    inc              xq
405    jz .x_loop_ar3_end
    ; recompute the vector part every four pixels
406    test             xq, 3
407    jnz .x_loop_ar3_inner
408    jmp .x_loop_ar3
409
410.x_loop_ar3_end:
411    add            bufq, 82
412    dec              hd
413    jg .y_loop_ar3
414    RET
415
; generate_grain_uv_420(buf, bufy, fg_data, uv)
; Chroma counterpart of generate_grain_y: fills 38 rows of 44 bytes (row
; stride 82) at buf with pseudo-random grain and then jumps to the
; auto-regression path (.ar0-.ar3) selected by fg_data.ar_coeff_lag. Those
; paths additionally read the luma grain at bufy. The uv argument selects the
; seed xor constant here and the coefficient set (uv*28) in the .ar paths.
; cglobal arguments (x86inc): 4 function arguments, 10 GPRs, 16 vector regs.
416INIT_XMM avx2
417cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
    ; r4 anchors every rodata access: [base+sym] is [r4 + (sym - pb_mask)]
418    lea              r4, [pb_mask]
419%define base r4-pb_mask
420    movq            xm1, [base+rnd_next_upperbit_mask]
421    movq            xm4, [base+mul_bits]
422    movq            xm7, [base+hmul_bits]
423    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    ; xm8 = 2048 >> grain_scale_shift, the pmulhrsw multiplier that performs
    ; a rounded right shift by 4 + grain_scale_shift
424    vpbroadcastw    xm8, [base+round+r5*2]
425    mova            xm5, [base+pb_mask]
426    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    ; give each chroma plane its own sequence: seed ^= pw_seed_xor[uv]
427    vpbroadcastw    xm9, [base+pw_seed_xor+uvq*4]
428    pxor            xm0, xm9
    ; xm9 = 0xffff0000 in every dword: gather enable mask and, inverted by
    ; pandn, a low-16-bit mask
429    vpbroadcastd    xm9, [base+pd_m65536]
430    lea              r6, [gaussian_sequence]
    ; 38 rows; bufq points at the end of the 44-byte row and r5 counts up
    ; from -44 to 0 in steps of 4
431    mov             r7d, 38
432    add            bufq, 44
433.loop_y:
434    mov              r5, -44
    ; same four-seeds-per-iteration LFSR step as in generate_grain_y
435.loop_x:
436    pand            xm2, xm0, xm1
437    psrlw           xm3, xm2, 10
438    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
439    pmullw          xm2, xm4            ; bits 0x0f00 are set
440    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
441    psllq           xm6, xm2, 30
442    por             xm2, xm6
443    psllq           xm6, xm2, 15
444    por             xm2, xm6            ; aggregate each bit into next seed's high bit
445    pmulhuw         xm3, xm0, xm7
446    por             xm2, xm3            ; 4 next output seeds
    ; the newest (4th) seed becomes the state for the next iteration
447    pshuflw         xm0, xm2, q3333
    ; table index = top 11 bits of each seed
448    psrlw           xm2, 5
449    pmovzxwd        xm3, xm2
    ; vpgatherdd clears its mask register, so hand it a copy of xm9
450    mova            xm6, xm9
451    vpgatherdd      xm2, [r6+xm3*2], xm6
    ; each gathered dword also contains the following entry; drop the high half
452    pandn           xm2, xm9, xm2
453    packusdw        xm2, xm2
454    pmulhrsw        xm2, xm8
455    packsswb        xm2, xm2
456    movd      [bufq+r5], xm2
457    add              r5, 4
458    jl .loop_x
459    add            bufq, 82
460    dec             r7d
461    jg .loop_y
462
463    ; auto-regression code
    ; bufq is now buf + 44 + 38*82; every .ar path rebases it from there.
    ; Table entries are label offsets relative to the table itself.
464    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
465    movsxd           r5, [base+generate_grain_uv_420_avx2_table+r5*4]
466    lea              r5, [r5+base+generate_grain_uv_420_avx2_table]
467    jmp              r5
468
; Lag 0: no chroma neighbours are used, only the co-located luma grain.
; For rows 3..37 and columns 3..40 of the chroma buffer:
;   luma = (sum of the 2x2 luma block + 2) >> 2
;   buf[y][x] = clip8(buf[y][x] + round2(luma * cf0, shift))
; 38 pixels per row are handled as one 32-pixel ymm step plus a 6-pixel tail.
469.ar0:
470    INIT_YMM avx2
471    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    ; byte offset of this plane's coefficient set
472    imul            uvd, 28
473    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
474    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    ; 32768 >> shift: pmulhrsw by this value is a rounded right shift by
    ; shift. The index runs past the four hmul_bits entries into the tables
    ; that follow it in rodata.
475    movd            xm3, [base+hmul_bits+shiftq*2]
476    DEFINE_ARGS buf, bufy, h
477    pmovsxbw        xm4, xm4
    ; m7 = all-ones bytes, m6 = 8192 ((x + 2) >> 2 via pmulhrsw),
    ; m4 = cf0 in every word, m3 = rounded-shift multiplier in every word
478    vpbroadcastd     m7, [pb_1]
479    vpbroadcastw     m6, [hmul_bits+4]
480    vpbroadcastw     m4, xm4
481    vpbroadcastw     m3, xm3
    ; bufq = buf + 82*3 + 3, bufyq = bufy + 82*3 + 3
482    sub            bufq, 82*38+82-(82*3+41)
483    add           bufyq, 3+82*3
484    mov              hd, 35
485.y_loop_ar0:
486    ; first 32 pixels
    ; the 128-bit lanes are loaded so that the per-lane packsswb below leaves
    ; the 32 results in pixel order
487    movu            xm8, [bufyq]
488    movu            xm9, [bufyq+82]
489    movu           xm10, [bufyq+16]
490    movu           xm11, [bufyq+82+16]
491    vinserti128      m8, [bufyq+32], 1
492    vinserti128      m9, [bufyq+82+32], 1
493    vinserti128     m10, [bufyq+48], 1
494    vinserti128     m11, [bufyq+82+48], 1
    ; horizontal pair sums of the signed luma bytes (multiplier is all ones)
495    pmaddubsw        m8, m7, m8
496    pmaddubsw        m9, m7, m9
497    pmaddubsw       m10, m7, m10
498    pmaddubsw       m11, m7, m11
    ; add the two rows, giving the 2x2 block sums
499    paddw            m8, m9
500    paddw           m10, m11
501    pmulhrsw         m8, m6
502    pmulhrsw        m10, m6
503    pmullw           m8, m4
504    pmullw          m10, m4
505    pmulhrsw         m8, m3
506    pmulhrsw        m10, m3
507    packsswb         m8, m10
    ; saturating byte add: interleave grain with the luma term, sum each pair
    ; as words and pack back with signed saturation
508    movu             m0, [bufq]
509    punpckhbw        m1, m0, m8
510    punpcklbw        m0, m8
511    pmaddubsw        m1, m7, m1
512    pmaddubsw        m0, m7, m0
513    packsswb         m0, m1
514    movu         [bufq], m0
515
516    ; last 6 pixels
517    movu            xm8, [bufyq+32*2]
518    movu            xm9, [bufyq+32*2+82]
519    pmaddubsw       xm8, xm7, xm8
520    pmaddubsw       xm9, xm7, xm9
521    paddw           xm8, xm9
522    pmulhrsw        xm8, xm6
523    pmullw          xm8, xm4
524    pmulhrsw        xm8, xm3
525    packsswb        xm8, xm8
526    movq            xm0, [bufq+32]
527    punpcklbw       xm8, xm0
528    pmaddubsw       xm8, xm7, xm8
529    packsswb        xm8, xm8
    ; eight results were computed; keep the original bytes 6-7 (word 3) so
    ; that only six pixels change
530    vpblendw        xm0, xm8, xm0, 1000b
531    movq      [bufq+32], xm0
532
    ; one chroma row corresponds to two luma rows
533    add            bufq, 82
534    add           bufyq, 82*2
535    dec              hd
536    jg .y_loop_ar0
537    RET
538
; Lag-1 auto-regression for chroma over rows 3..37, columns 3..40:
;   luma = (sum of the co-located 2x2 luma block + 2) >> 2
;   buf[y][x] = clip8(buf[y][x] +
;               ((cf0*TL + cf1*T + cf2*TR + cf3*L + cf4*luma + rnd) >> shift))
; TL/T/TR are the chroma neighbours in the row above and L is the already
; filtered left neighbour. Everything except the cf3*L term is computed for
; four pixels at once.
539.ar1:
540    INIT_XMM avx2
541    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    ; byte offset of this plane's coefficient set
542    imul            uvd, 28
543    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
544    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
545    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    ; overwrite cf3 (handled in scalar code) with the luma coefficient cf4 so
    ; that the word pairs become (cf0, cf1) and (cf2, cf4)
546    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    ; fg_data and uv are no longer needed; their registers become h and val0
547    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
548    pmovsxbw        xm4, xm4
549    pshufd          xm5, xm4, q1111
550    pshufd          xm4, xm4, q0000
    ; val3 shares r4 with the rodata base; this is the last base-relative
    ; access before the loop starts overwriting val3
551    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
552    vpbroadcastd    xm7, [pb_1]
553    vpbroadcastw    xm6, [hmul_bits+4]
554    vpbroadcastd    xm3, xm3
    ; bufq = buf + 82*3 + 41 with x running from -38 up to -1;
    ; bufyq = bufy + 82*3 + 79 and is indexed with 2*x
555    sub            bufq, 82*38+44-(82*3+41)
556    add           bufyq, 79+82*3
557    mov              hd, 35
558    mov            mind, -128
559    mov            maxd, 127
560.y_loop_ar1:
561    mov              xq, -38
    ; left neighbour of the first pixel in the row
562    movsx         val3d, byte [bufq+xq-1]
563.x_loop_ar1:
564    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
565    movq            xm8, [bufyq+xq*2]
566    movq            xm9, [bufyq+xq*2+82]
567    psrldq          xm2, xm0, 2             ; top
568    psrldq          xm1, xm0, 4             ; top/right
    ; rounded average of each 2x2 luma block
569    pmaddubsw       xm8, xm7, xm8
570    pmaddubsw       xm9, xm7, xm9
571    paddw           xm8, xm9
572    pmulhrsw        xm8, xm6
573    punpcklwd       xm0, xm2
574    punpcklwd       xm1, xm8
575    pmaddwd         xm0, xm4
576    pmaddwd         xm1, xm5
577    paddd           xm0, xm1
    ; xm0 = cf0*TL + cf1*T + cf2*TR + cf4*luma + rnd for four pixels
578    paddd           xm0, xm3
579.x_loop_ar1_inner:
580    movd          val0d, xm0
581    psrldq          xm0, 4
582    imul          val3d, cf3d
583    add           val3d, val0d
584    sarx          val3d, val3d, shiftd
585    movsx         val0d, byte [bufq+xq]
586    add           val3d, val0d
    ; clamp to [-128, 127] using the sign of the comparison result
587    cmp           val3d, maxd
588    cmovns        val3d, maxd
589    cmp           val3d, mind
590    cmovs         val3d, mind
591    mov  byte [bufq+xq], val3b
592    ; keep val3d in-place as left for next x iteration
593    inc              xq
594    jz .x_loop_ar1_end
    ; recompute the vector part every four pixels
595    test             xq, 3
596    jnz .x_loop_ar1_inner
597    jmp .x_loop_ar1
598
599.x_loop_ar1_end:
    ; one chroma row corresponds to two luma rows
600    add            bufq, 82
601    add           bufyq, 82*2
602    dec              hd
603    jg .y_loop_ar1
604    RET
605
; Lag-2 auto-regression for chroma over rows 3..37, columns 3..40.
; cf0-4 weight row y-2 (x-2..x+2), cf5-9 row y-1 (x-2..x+2), cf10-11 the two
; already filtered pixels to the left and cf12 the rounded average of the
; co-located 2x2 luma block. Broadcast coefficient pairs:
;   xm8=(cf0,cf1) xm9=(cf2,cf3) xm10=(cf4,cf5) xm11=(cf6,cf7)
;   xm12=(cf8,cf9) xm13=(cf10,cf11) xm14=(cf12, 1)
; The "1" in xm14 is paired with the rounding term held in xm15.
606.ar2:
607    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
608    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    ; byte offset of this plane's coefficient set
609    imul            uvd, 28
610    vpbroadcastw   xm15, [base+round_vals-12+shiftq*2]
611    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
612    pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
    ; put 1 next to cf12 so pmaddwd adds the rounding term together with luma
613    pinsrw          xm9, [base+pw_1], 5
    ; xm7 = 8192 ((x + 2) >> 2 via pmulhrsw), xm6 = all-ones bytes
614    vpbroadcastw    xm7, [base+hmul_bits+4]
615    vpbroadcastd    xm6, [base+pb_1]
616    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
617    pshufd         xm12, xm9, q0000
618    pshufd         xm13, xm9, q1111
619    pshufd         xm14, xm9, q2222
620    pshufd         xm11, xm8, q3333
621    pshufd         xm10, xm8, q2222
622    pshufd          xm9, xm8, q1111
623    pshufd          xm8, xm8, q0000
    ; bufq = buf + 82*3 + 41 with x running from -38 up to -1;
    ; bufyq = bufy + 82*3 + 79 and is indexed with 2*x
624    sub            bufq, 82*38+44-(82*3+41)
625    add           bufyq, 79+82*3
626    mov              hd, 35
627.y_loop_ar2:
628    mov              xq, -38
629
    ; vector part: rows above plus luma term for four pixels
630.x_loop_ar2:
631    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
632    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
633    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
634    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
635    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
636    punpcklwd       xm2, xm0, xm2
637    punpcklwd       xm3, xm4
638    pmaddwd         xm2, xm8
639    pmaddwd         xm3, xm11
640    paddd           xm2, xm3
641
642    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
643    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
644    psrldq          xm0, 8                  ; y=-2,x=[+2,+5]
645    punpcklwd       xm4, xm5
646    punpcklwd       xm0, xm1
647    psrldq          xm3, xm1, 6             ; y=-1,x=[+1,+5]
648    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
649    punpcklwd       xm3, xm1
650    pmaddwd         xm4, xm9
651    pmaddwd         xm0, xm10
652    pmaddwd         xm3, xm12
653    paddd           xm4, xm0
654    paddd           xm2, xm3
655    paddd           xm2, xm4
656
    ; rounded average of each 2x2 luma block, weighted by cf12, plus rnd
657    movq            xm0, [bufyq+xq*2]
658    movq            xm3, [bufyq+xq*2+82]
659    pmaddubsw       xm0, xm6, xm0
660    pmaddubsw       xm3, xm6, xm3
661    paddw           xm0, xm3
662    pmulhrsw        xm0, xm7
663    punpcklwd       xm0, xm15
664    pmaddwd         xm0, xm14
665    paddd           xm2, xm0
666
667    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    ; one pixel per pass; xm0 holds the current row window as bytes on entry
    ; and is widened to words in place
668.x_loop_ar2_inner:
669    pmovsxbw        xm0, xm0
    ; dword 0 = cf10*buf[x-2] + cf11*buf[x-1]
670    pmaddwd         xm3, xm0, xm13
671    paddd           xm3, xm2
672    psrldq          xm2, 4                  ; shift top to next pixel
673    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; move the result to word 1, slide the window by one pixel so the current
    ; pixel is also in word 1, add both and merge the sum into the window
674    pslldq          xm3, 2
675    psrldq          xm0, 2
676    paddw           xm3, xm0
677    vpblendw        xm0, xm3, 00000010b
    ; saturate to bytes; byte 1 is the new pixel and the window is ready for
    ; the next pass
678    packsswb        xm0, xm0
679    pextrb    [bufq+xq], xm0, 1
680    inc              xq
681    jz .x_loop_ar2_end
    ; recompute the vector part every four pixels
682    test             xq, 3
683    jnz .x_loop_ar2_inner
684    jmp .x_loop_ar2
685
686.x_loop_ar2_end:
    ; one chroma row corresponds to two luma rows
687    add            bufq, 82
688    add           bufyq, 82*2
689    dec              hd
690    jg .y_loop_ar2
691    RET
692
; Lag-3 auto-regression for chroma over rows 3..37, columns 3..40.
; cf0-6 weight row y-3 (x-3..x+3), cf7-13 row y-2, cf14-20 row y-1, cf21-23
; the three already filtered pixels to the left and cf24 the rounded average
; of the co-located 2x2 luma block. All twelve stack slots are used:
;   slot 0..9 = (cf0,cf1) (cf2,cf3) ... (cf18,cf19)
;   slot 10   = (cf20, cf24)   row y-1 x+3 paired with luma
;   slot 11   = words (cf21, cf22, cf23, 1 << shift): the last word weights
;               the current pixel so that it is added before the final shift
693.ar3:
694    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    ; extend the stack frame by hand and update the x86inc bookkeeping so that
    ; RET releases it again.
    ; NOTE(review): the aligned stores below rely on rsp being 16-byte
    ; aligned at this point; confirm for every supported ABI.
695    SUB             rsp, 16*12
696%assign stack_size_padded (stack_size_padded+16*12)
697%assign stack_size (stack_size+16*12)
698    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    ; byte offset of this plane's coefficient set
699    imul            uvd, 28
700    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
701    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-7
702    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8]   ; cf8-15
703    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-23
704    pmovsxbw        xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24]   ; cf24 [luma]
705    pshufd          xm9, xm0, q1111
706    pshufd         xm10, xm0, q2222
707    pshufd         xm11, xm0, q3333
708    pshufd          xm0, xm0, q0000
709    pshufd          xm6, xm1, q1111
710    pshufd          xm7, xm1, q2222
711    pshufd          xm8, xm1, q3333
712    pshufd          xm1, xm1, q0000
713    pshufd          xm3, xm2, q1111
714    pshufd          xm4, xm2, q2222
715    vpbroadcastw    xm5, xm5
716    vpblendw        xm4, xm5, 10101010b                     ; interleave luma cf
    ; xm5 = cf21, cf22, cf23 in words 0-2, then 1 << shift in word 3
717    psrldq          xm5, xm2, 10
718    pshufd          xm2, xm2, q0000
719    pinsrw          xm5, [base+round_vals+shiftq*2-10], 3
    ; widen the rounding term to dwords so it can be added to pmaddwd results
720    pmovzxwd       xm14, xm14
721    mova    [rsp+ 0*16], xm0
722    mova    [rsp+ 1*16], xm9
723    mova    [rsp+ 2*16], xm10
724    mova    [rsp+ 3*16], xm11
725    mova    [rsp+ 4*16], xm1
726    mova    [rsp+ 5*16], xm6
727    mova    [rsp+ 6*16], xm7
728    mova    [rsp+ 7*16], xm8
729    mova    [rsp+ 8*16], xm2
730    mova    [rsp+ 9*16], xm3
731    mova    [rsp+10*16], xm4
732    mova    [rsp+11*16], xm5
    ; xm13 = all-ones bytes, xm15 = 8192 ((x + 2) >> 2 via pmulhrsw)
733    vpbroadcastd   xm13, [base+pb_1]
734    vpbroadcastw   xm15, [base+hmul_bits+4]
735    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    ; bufq = buf + 82*3 + 41 with x running from -38 up to -1;
    ; bufyq = bufy + 82*3 + 79 and is indexed with 2*x
736    sub            bufq, 82*38+44-(82*3+41)
737    add           bufyq, 79+82*3
738    mov              hd, 35
739.y_loop_ar3:
740    mov              xq, -38
741
    ; vector part: three rows above plus luma term for four pixels
742.x_loop_ar3:
743    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
744    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
745    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    ; sign-extend bytes to words: 0 > byte yields 0xff for negative bytes,
    ; which is interleaved as the high byte of every word
746    pxor            xm3, xm3
747    pcmpgtb         xm6, xm3, xm2
748    pcmpgtb         xm5, xm3, xm1
749    pcmpgtb         xm4, xm3, xm0
750    punpckhbw       xm3, xm0, xm4
751    punpcklbw       xm0, xm4
752    punpckhbw       xm4, xm1, xm5
753    punpcklbw       xm1, xm5
754    punpckhbw       xm5, xm2, xm6
755    punpcklbw       xm2, xm6
756
    ; row y-3 (cf0-6), paired up with the first pixel of row y-2 (cf7)
757    psrldq          xm6, xm0, 2
758    psrldq          xm7, xm0, 4
759    psrldq          xm8, xm0, 6
760    psrldq          xm9, xm0, 8
761    palignr        xm10, xm3, xm0, 10
762    palignr        xm11, xm3, xm0, 12
763
764    punpcklwd       xm0, xm6
765    punpcklwd       xm7, xm8
766    punpcklwd       xm9, xm10
767    punpcklwd      xm11, xm1
768    pmaddwd         xm0, [rsp+ 0*16]
769    pmaddwd         xm7, [rsp+ 1*16]
770    pmaddwd         xm9, [rsp+ 2*16]
771    pmaddwd        xm11, [rsp+ 3*16]
772    paddd           xm0, xm7
773    paddd           xm9, xm11
774    paddd           xm0, xm9
775
    ; rest of row y-2 (cf8-13) and the first two pixels of row y-1 (cf14-15)
776    psrldq          xm6, xm1, 2
777    psrldq          xm7, xm1, 4
778    psrldq          xm8, xm1, 6
779    psrldq          xm9, xm1, 8
780    palignr        xm10, xm4, xm1, 10
781    palignr        xm11, xm4, xm1, 12
782    psrldq         xm12, xm2, 2
783
784    punpcklwd       xm6, xm7
785    punpcklwd       xm8, xm9
786    punpcklwd      xm10, xm11
787    punpcklwd      xm12, xm2, xm12
788    pmaddwd         xm6, [rsp+ 4*16]
789    pmaddwd         xm8, [rsp+ 5*16]
790    pmaddwd        xm10, [rsp+ 6*16]
791    pmaddwd        xm12, [rsp+ 7*16]
792    paddd           xm6, xm8
793    paddd          xm10, xm12
794    paddd           xm6, xm10
795    paddd           xm0, xm6
796
    ; rest of row y-1 (cf16-20)
797    psrldq          xm6, xm2, 4
798    psrldq          xm7, xm2, 6
799    psrldq          xm8, xm2, 8
800    palignr         xm9, xm5, xm2, 10
801    palignr         xm5, xm5, xm2, 12
802
    ; rounded average of each co-located 2x2 luma block
803    movq            xm1, [bufyq+xq*2]
804    movq            xm2, [bufyq+xq*2+82]
805    pmaddubsw       xm1, xm13, xm1
806    pmaddubsw       xm2, xm13, xm2
807    paddw           xm1, xm2
808    pmulhrsw        xm1, xm15
809
810    punpcklwd       xm6, xm7
811    punpcklwd       xm8, xm9
    ; pair row y-1 x+3 with luma to match the (cf20, cf24) slot
812    punpcklwd       xm5, xm1
813    pmaddwd         xm6, [rsp+ 8*16]
814    pmaddwd         xm8, [rsp+ 9*16]
815    pmaddwd         xm5, [rsp+10*16]
816    paddd           xm0, xm6
817    paddd           xm8, xm5
818    paddd           xm0, xm8
    ; add the rounding term; xm0 now holds the top + luma sums of four pixels
819    paddd           xm0, xm14
820
821    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    ; one pixel per pass; xm1 holds the current row window as bytes on entry
    ; and is widened to words in place
822.x_loop_ar3_inner:
823    pmovsxbw        xm1, xm1
    ; dword 0 = cf21*buf[x-3] + cf22*buf[x-2],
    ; dword 1 = cf23*buf[x-1] + (1 << shift)*buf[x]
824    pmaddwd         xm2, xm1, [rsp+16*11]
825    pshufd          xm3, xm2, q1111
826    paddd           xm2, xm3                ; left+cur
827    paddd           xm2, xm0                ; add top
828    psrldq          xm0, 4
829    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
830    ; don't packssdw, we only care about one value
    ; move the result to word 3 (the current pixel's slot) and merge it into
    ; the window
831    pslldq          xm2, 6
832    vpblendw        xm1, xm2, 1000b
    ; saturate to bytes; byte 3 is the new pixel, then slide the window by one
833    packsswb        xm1, xm1
834    pextrb    [bufq+xq], xm1, 3
835    psrldq          xm1, 1
836    inc              xq
837    jz .x_loop_ar3_end
    ; recompute the vector part every four pixels
838    test             xq, 3
839    jnz .x_loop_ar3_inner
840    jmp .x_loop_ar3
841
842.x_loop_ar3_end:
    ; one chroma row corresponds to two luma rows
843    add            bufq, 82
844    add           bufyq, 82*2
845    dec              hd
846    jg .y_loop_ar3
847    RET
848
;-----------------------------------------------------------------------
; Private helper macros for fgy_32x32xn.
;
; Each macro expands to exactly the instruction sequence that the function
; previously spelled out inline at every use, with the same registers and
; the same order, so the assembled code is unchanged.  They use whatever
; register names the most recent DEFINE_ARGS has bound at the point of
; expansion (see, offx, offy, src, dst, scaling, ...).
;-----------------------------------------------------------------------

; Step the single 16-bit LFSR kept in seed.
; 0xEFF4 forces every bit except 0, 1, 3 and 12 to one, so the parity flag
; produced by "test seeb, seeh" depends only on those four tap bits.
; Clobbers: r6, flags.
%macro FGY_STEP_SEED 0
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]        ; shifted seed with bit 15 set
    cmovp          seed, r6d                ; even parity: keep bit 15 clear
%endmacro

; Step two 16-bit LFSRs packed as (cur_seed << 16) | top_seed.
; Bits 8-15 of r7d must be zero on entry (the callers guarantee this).
; Clobbers: r6, r7, flags.
%macro FGY_STEP_SEED_PAIR 0
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16                 ; move it up to bit 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d                ; bits 0/16 = inverted parities
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
%endmacro

; offy = ((seed >> 8) & 15) * 164 + (seed >> 12) * 2 + 747
; (single seed; requires offx/offy/see to be bound).
%macro FGY_CALC_OFFXY 0
    mov           offxd, seed
    rorx          offyd, seed, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
%endmacro

; Same computation on both packed seeds at once; the low (top) half gets an
; extra 32*82 added.  Result in offy: (cur_offxy << 16) | top_offxy.
%macro FGY_CALC_OFFXY_PAIR 0
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
%endmacro

; Load 32 source bytes (aligned) and widen them.
; Out: m0/m1 = src as words (low/high bytes of each lane)
;      m4-m7 = src as dwords (gather indices)
;      m2    = 0
%macro FGY_UNPACK_SRC 0
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word
    punpckhwd        m5, m0, m2
    punpcklwd        m4, m0, m2
    punpckhwd        m7, m1, m2
    punpcklwd        m6, m1, m2             ; m4-7: src as dword
%endmacro

; scaling[src] for all 32 pixels, using m3 and m9 as alternating gather
; masks (vpgatherdd clears its mask, hence the repeated pcmpeqw).
; In:  m4-m7 = indices, m10 = 0xff dword mask
; Out: m8 = scaling words matching m0, m5 = scaling words matching m1
; Clobbers: m3, m4, m6, m9.
%macro FGY_GATHER_SCALING 0
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m8, [scalingq+m4], m3
    vpgatherdd       m4, [scalingq+m5], m9
    pcmpeqw          m3, m3
    pcmpeqw          m9, m9
    vpgatherdd       m5, [scalingq+m6], m3
    vpgatherdd       m6, [scalingq+m7], m9
    pand             m8, m10                ; keep only the addressed byte
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m8, m4
    packusdw         m5, m6
%endmacro

; Blend "top" grain bytes with the current grain bytes in m3.
; %1 = register number holding the top grain (clobbered)
; %2 = scratch register number
; %3 = register number holding the broadcast {top, cur} byte weights
; m14 must hold words of 1024, so pmulhrsw yields round2(x, 5).
; Out: m3 = blended grain, saturated to signed bytes.
%macro FGY_V_BLEND 3
    punpckhbw       m%2, m%1, m3
    punpcklbw       m%1, m3
    pmaddubsw       m%2, m%3, m%2
    pmaddubsw       m%1, m%3, m%1
    pmulhrsw        m%2, m14
    pmulhrsw        m%1, m14
    packsswb         m3, m%1, m%2
%endmacro

; Sign-extend the grain in m3, scale it, add it to the source and store.
; %1 = register number holding the scaling words for the m0 half
;      (the m1 half always uses m5).
; In:  m0/m1 = src words, m2 = 0, m3 = grain bytes,
;      m11 = rounding multiplier, m12 = max, m13 = min.
%macro FGY_ADD_NOISE_STORE 1
    pcmpgtb          m7, m2, m3             ; m7 = sign mask of grain
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7                 ; m2-3: grain as signed word

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m%1
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    mova    [dstq+srcq], m0
%endmacro

;-----------------------------------------------------------------------
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
;
; Applies grain to 8-bit pixels, one 32-byte-wide column at a time, h lines
; per column:
;     noise = round2(scaling[src] * grain, fg_data.scaling_shift)
;     dst   = clamp(src + noise, min, max)
; grain is a signed byte read from grain_lut (82 bytes per line) at a
; per-column offset derived from a 16-bit LFSR seeded from fg_data.seed
; and sby.
;
; Arguments: cglobal loads the first six into registers; grain_lut, h and
; sby are read from their argument slots (grain_lutmp, hm, sbym / r8m).
; src and dst are accessed with aligned 32-byte loads/stores (mova).
;
; Column paths:
;   .loop_x             no blending
;   .loop_x_h_overlap   first two pixels of every line blended with the
;                       previous column's grain (its offset + 32)
;   .loop_x_v_overlap   first column when sby != 0 and overlap_flag is set:
;                       first two lines blended with the "top" grain, then
;                       the remaining lines run through .loop_y
;   .loop_x_hv_overlap  both of the above; remaining lines run through
;                       .loop_y_h_overlap
;
; Registers held for the whole call:
;   m10 = 0x000000ff per dword      m11 = mul_bits[scaling_shift - 7]
;   m12 = max entry, m13 = min entry (selected by clip_to_restricted_range;
;         both tables are defined elsewhere in the file)
;   m14 = words of 1024, m15 / m8 = byte weights from pb_27_17_17_27
;         (overlap paths only)
;-----------------------------------------------------------------------
INIT_YMM avx2
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    pcmpeqw         m10, m10
    psrld           m10, 24                 ; byte mask for the gathered dwords
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    lea              r8, [pb_mask]
%define base r8-pb_mask
    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    vpbroadcastw    m12, [base+max+r7*4]
    vpbroadcastw    m13, [base+min+r7*2]

    ; from here on r8 is reused (sby, offy, ...), so "base" is no longer valid
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    movifnidn      sbyd, sbym
    test           sbyd, sbyd
    setnz           r7b
    test            r7b, overlapb
    jnz .vertical_overlap                   ; sby != 0 && overlap_flag

    ; seed = fg_data.seed ^ (((sby*37+178) & 0xff) << 8 | ((sby*173+105) & 0xff))
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, overlap

    lea        src_bakq, [srcq+wq]          ; end of the source row
    neg              wq                     ; w counts up from -w to 0
    sub            dstq, srcq               ; dst is addressed as [dstq+srcq]

.loop_x:
    FGY_STEP_SEED

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    FGY_CALC_OFFXY

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y:
    FGY_UNPACK_SRC
    FGY_GATHER_SCALING

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]

    FGY_ADD_NOISE_STORE 8

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hd
    jg .loop_y

    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]      ; top of the next column
    test       overlapd, overlapd
    jz .loop_x

    ; r8m = sbym
    movd           xm15, [pb_27_17_17_27]   ; h-blend weights, words 0-1 only
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
    movd           xm14, [pw_1024]
.loop_x_h_overlap:
    FGY_STEP_SEED

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea     left_offxyd, [offyd+32]         ; previous column's offy*stride+offx
    FGY_CALC_OFFXY

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y_h_overlap:
    FGY_UNPACK_SRC
    FGY_GATHER_SCALING

    ; grain = grain_lut[offy+y][offx+x], first two bytes blended with left
    movu             m3, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm3                ; { left, cur } byte pairs
    pmaddubsw       xm4, xm15, xm4
    pmulhrsw        xm4, xm14
    packsswb        xm4, xm4
    vpblendw        xm4, xm3, 11111110b     ; keep only word 0 of the blend
    vpblendd         m3, m4, 00001111b      ; merge into the low 128 bits

    FGY_ADD_NOISE_STORE 8

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hd
    jg .loop_y_h_overlap

    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.end:
    RET

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

    ; Build the seeds for this row (sby) and the one above (sby-1) in one
    ; register; 188 and 141 are 105-173 and 178-37 modulo 256.
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq

    vpbroadcastd    m14, [pw_1024]
.loop_x_v_overlap:
    vpbroadcastw    m15, [pb_27_17_17_27]   ; weights for the first line

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    FGY_STEP_SEED_PAIR

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy

    FGY_CALC_OFFXY_PAIR

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy

    movzx    top_offxyd, offxyw             ; split the packed offsets
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    FGY_UNPACK_SRC
    FGY_GATHER_SCALING

    ; grain = grain_lut[offy+y][offx+x], blended with the top grain
    movu             m3, [grain_lutq+offxyq]
    movu             m4, [grain_lutq+top_offxyq]
    FGY_V_BLEND       4, 6, 15

    FGY_ADD_NOISE_STORE 8

    vpbroadcastw    m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
    add            srcq, strideq
    add      grain_lutq, 82
    dec              hw                     ; only the low word is the counter
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines; bit 16 of hd is the one-bit line toggle
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
    add              wq, 32
    jge .end_hv
    lea            srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

    movd           xm15, [pb_27_17_17_27]   ; h-blend weights, words 0-1 only
.loop_x_hv_overlap:
    vpbroadcastw     m8, [pb_27_17_17_27]   ; v-blend weights for the first line

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    FGY_STEP_SEED_PAIR

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    ; previous column's offsets, moved 32 bytes to the right
    lea  topleft_offxyq, [top_offxyq+32]
    lea     left_offxyq, [offyq+32]
    FGY_CALC_OFFXY_PAIR

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    movzx    top_offxyd, offxyw
    shr          offxyd, 16

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    FGY_UNPACK_SRC

    ; scaling[src]; m8 holds the v-blend weights here, so the low-half
    ; result goes to m9 and m3 is the only gather mask
    pcmpeqw          m3, m3
    ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel
    vpgatherdd       m9, [scalingq+m4], m3
    pcmpeqw          m3, m3
    vpgatherdd       m4, [scalingq+m5], m3
    pcmpeqw          m3, m3
    vpgatherdd       m5, [scalingq+m6], m3
    pcmpeqw          m3, m3
    vpgatherdd       m6, [scalingq+m7], m3
    pand             m9, m10
    pand             m4, m10
    pand             m5, m10
    pand             m6, m10
    packusdw         m9, m4
    packusdw         m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
    movu             m6, [grain_lutq+top_offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    movd            xm7, [grain_lutq+topleft_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       xm4, xm3
    punpcklbw       xm7, xm6
    pmaddubsw       xm4, xm15, xm4
    pmaddubsw       xm7, xm15, xm7
    pmulhrsw        xm4, xm14
    pmulhrsw        xm7, xm14
    packsswb        xm4, xm4
    packsswb        xm7, xm7
    vpblendw        xm4, xm3, 11111110b
    vpblendw        xm7, xm6, 11111110b
    vpblendd         m3, m4, 00001111b
    vpblendd         m6, m7, 00001111b
    ; followed by v interpolation (top | cur -> cur)
    FGY_V_BLEND       6, 7, 8

    FGY_ADD_NOISE_STORE 9

    vpbroadcastw     m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
    add            srcq, strideq
    add      grain_lutq, 82
    dec              hw                     ; only the low word is the counter
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines; bit 16 of hd is the one-bit line toggle
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq]      ; lea leaves the flags of add intact
    jl .loop_x_hv_overlap

.end_hv:
    RET
1336
1337cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1338                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
1339    pcmpeqw         m10, m10
1340    psrld           m10, 24
1341    mov             r7d, [fg_dataq+FGData.scaling_shift]
1342    lea              r8, [pb_mask]
1343%define base r8-pb_mask
1344    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
1345    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
1346    mov             r9d, dword is_idm
1347    vpbroadcastw    m13, [base+min+r7*2]
1348    shlx            r7d, r7d, r9d
1349    vpbroadcastw    m12, [base+max+r7*2]
1350
1351    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1352    jne .csfl
1353
1354%macro FGUV_32x32xN_LOOP 1 ; not-csfl
1355    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1356
1357%if %1
1358    mov             r7d, dword r11m
1359    vpbroadcastb     m0, [fg_dataq+FGData.uv_mult+r7*4]
1360    vpbroadcastb     m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
1361    punpcklbw       m14, m1, m0
1362    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
1363%else
1364    vpbroadcastd    m14, [pw_1024]
1365    vpbroadcastd    m15, [pb_23_22]
1366%endif
1367
1368    mov        overlapd, [fg_dataq+FGData.overlap_flag]
1369    movifnidn      sbyd, sbym
1370    test           sbyd, sbyd
1371    setnz           r7b
1372    test            r7b, overlapb
1373    jnz %%vertical_overlap
1374
1375    imul           seed, sbyd, (173 << 24) | 37
1376    add            seed, (105 << 24) | 178
1377    rol            seed, 8
1378    movzx          seed, seew
1379    xor            seed, [fg_dataq+FGData.seed]
1380
1381    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1382                unused2, unused3, see, overlap, unused4, unused5, lstride
1383
1384    mov           lumaq, r9mp
1385    lea             r12, [srcq+wq]
1386    lea             r13, [dstq+wq]
1387    lea             r14, [lumaq+wq*2]
1388    mov           r11mp, r12
1389    mov           r12mp, r13
1390    mov        lstrideq, r10mp
1391    neg              wq
1392
1393%%loop_x:
1394    mov             r6d, seed
1395    or             seed, 0xEFF4
1396    shr             r6d, 1
1397    test           seeb, seeh
1398    lea            seed, [r6+0x8000]
1399    cmovp          seed, r6d               ; updated seed
1400
1401    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1402                offx, offy, see, overlap, unused1, unused2, lstride
1403
1404    mov           offxd, seed
1405    rorx          offyd, seed, 8
1406    shr           offxd, 12
1407    and           offyd, 0xf
1408    imul          offyd, 82
1409    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
1410
1411    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1412                h, offxy, see, overlap, unused1, unused2, lstride
1413
1414    mov              hd, hm
1415    mov      grain_lutq, grain_lutmp
1416%%loop_y:
1417    ; src
1418    mova            xm4, [lumaq+lstrideq*0+ 0]
1419    mova            xm6, [lumaq+lstrideq*0+16]
1420    mova            xm0, [srcq]
1421    vpbroadcastd     m7, [pb_1]
1422    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
1423    vinserti128      m6, [lumaq+lstrideq*2+16], 1
1424    vinserti128      m0, [srcq+strideq], 1
1425    pxor             m2, m2
1426    pmaddubsw        m4, m7
1427    pmaddubsw        m6, m7
1428    pavgw            m4, m2
1429    pavgw            m6, m2
1430
1431%if %1
1432    packuswb         m4, m6                 ; luma
1433    punpckhbw        m6, m4, m0
1434    punpcklbw        m4, m0                 ; { luma, chroma }
1435    pmaddubsw        m6, m14
1436    pmaddubsw        m4, m14
1437    psraw            m6, 6
1438    psraw            m4, 6
1439    paddw            m6, m15
1440    paddw            m4, m15
1441    packuswb         m4, m6                 ; pack+unpack = clip
1442    punpckhbw        m6, m4, m2
1443    punpcklbw        m4, m2
1444%endif
1445
1446    punpckhwd        m5, m4, m2
1447    punpcklwd        m4, m2
1448    punpckhwd        m7, m6, m2
1449    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1450
1451    ; scaling[luma_src]
1452    pcmpeqw          m3, m3
1453    pcmpeqw          m9, m9
1454    vpgatherdd       m8, [scalingq+m4], m3
1455    vpgatherdd       m4, [scalingq+m5], m9
1456    pcmpeqw          m3, m3
1457    pcmpeqw          m9, m9
1458    vpgatherdd       m5, [scalingq+m6], m3
1459    vpgatherdd       m6, [scalingq+m7], m9
1460    pand             m8, m10
1461    pand             m4, m10
1462    pand             m5, m10
1463    pand             m6, m10
1464    packusdw         m8, m4
1465    packusdw         m5, m6
1466
1467    ; unpack chroma_source
1468    punpckhbw        m1, m0, m2
1469    punpcklbw        m0, m2                 ; m0-1: src as word
1470
1471    ; grain = grain_lut[offy+y][offx+x]
1472    movu            xm3, [grain_lutq+offxyq+ 0]
1473    vinserti128      m3, [grain_lutq+offxyq+82], 1
1474    pcmpgtb          m7, m2, m3
1475    punpcklbw        m2, m3, m7
1476    punpckhbw        m3, m7
1477
1478    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1479    pmullw           m2, m8
1480    pmullw           m3, m5
1481    pmulhrsw         m2, m11
1482    pmulhrsw         m3, m11
1483
1484    ; dst = clip_pixel(src, noise)
1485    paddw            m0, m2
1486    paddw            m1, m3
1487    pmaxsw           m0, m13
1488    pmaxsw           m1, m13
1489    pminsw           m0, m12
1490    pminsw           m1, m12
1491    packuswb         m0, m1
1492    mova         [dstq], xm0
1493    vextracti128 [dstq+strideq], m0, 1
1494
1495    lea            srcq, [srcq+strideq*2]
1496    lea            dstq, [dstq+strideq*2]
1497    lea           lumaq, [lumaq+lstrideq*4]
1498    add      grain_lutq, 82*2
1499    sub              hb, 2
1500    jg %%loop_y
1501
1502    add              wq, 16
1503    jge %%end
1504    mov            srcq, r11mp
1505    mov            dstq, r12mp
1506    lea           lumaq, [r14+wq*2]
1507    add            srcq, wq
1508    add            dstq, wq
1509    test       overlapd, overlapd
1510    jz %%loop_x
1511
1512    ; r8m = sbym
1513    cmp       dword r8m, 0
1514    jne %%loop_x_hv_overlap
1515
1516    ; horizontal overlap (without vertical overlap)
1517%%loop_x_h_overlap:
1518    mov             r6d, seed
1519    or             seed, 0xEFF4
1520    shr             r6d, 1
1521    test           seeb, seeh
1522    lea            seed, [r6+0x8000]
1523    cmovp          seed, r6d               ; updated seed
1524
1525    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1526                offx, offy, see, left_offxy, unused1, unused2, lstride
1527
1528    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
1529    mov           offxd, seed
1530    rorx          offyd, seed, 8
1531    shr           offxd, 12
1532    and           offyd, 0xf
1533    imul          offyd, 82
1534    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
1535
1536    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1537                h, offxy, see, left_offxy, unused1, unused2, lstride
1538
1539    mov              hd, hm
1540    mov      grain_lutq, grain_lutmp
1541%%loop_y_h_overlap:
1542    ; src
1543    mova            xm4, [lumaq+lstrideq*0+ 0]
1544    mova            xm6, [lumaq+lstrideq*0+16]
1545    mova            xm0, [srcq]
1546    vpbroadcastd     m7, [pb_1]
1547    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
1548    vinserti128      m6, [lumaq+lstrideq*2+16], 1
1549    vinserti128      m0, [srcq+strideq], 1
1550    pxor             m2, m2
1551    pmaddubsw        m4, m7
1552    pmaddubsw        m6, m7
1553    pavgw            m4, m2
1554    pavgw            m6, m2
1555
1556%if %1
1557    packuswb         m4, m6                 ; luma
1558    punpckhbw        m6, m4, m0
1559    punpcklbw        m4, m0                 ; { luma, chroma }
1560    pmaddubsw        m6, m14
1561    pmaddubsw        m4, m14
1562    psraw            m6, 6
1563    psraw            m4, 6
1564    paddw            m6, m15
1565    paddw            m4, m15
1566    packuswb         m4, m6                 ; pack+unpack = clip
1567    punpckhbw        m6, m4, m2
1568    punpcklbw        m4, m2
1569%endif
1570
1571    punpckhwd        m5, m4, m2
1572    punpcklwd        m4, m2
1573    punpckhwd        m7, m6, m2
1574    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1575
1576    ; scaling[luma_src]
1577    pcmpeqw          m3, m3
1578    pcmpeqw          m9, m9
1579    vpgatherdd       m8, [scalingq+m4], m3
1580    vpgatherdd       m4, [scalingq+m5], m9
1581    pcmpeqw          m3, m3
1582    pcmpeqw          m9, m9
1583    vpgatherdd       m5, [scalingq+m6], m3
1584    vpgatherdd       m6, [scalingq+m7], m9
1585    pand             m8, m10
1586    pand             m4, m10
1587    pand             m5, m10
1588    pand             m6, m10
1589    packusdw         m8, m4
1590    packusdw         m5, m6
1591
1592    ; unpack chroma_source
1593    punpckhbw        m1, m0, m2
1594    punpcklbw        m0, m2                 ; m0-1: src as word
1595
1596    ; grain = grain_lut[offy+y][offx+x]
1597%if %1
1598    vpbroadcastd     m6, [pb_23_22] ; FIXME
1599%endif
1600    movu            xm3, [grain_lutq+offxyq+ 0]
1601    movd            xm4, [grain_lutq+left_offxyq+ 0]
1602    vinserti128      m3, [grain_lutq+offxyq+82], 1
1603    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
1604    punpcklbw        m4, m3
1605%if %1
1606    pmaddubsw        m4, m6, m4
1607    pmulhrsw         m4, [pw_1024]
1608%else
1609    pmaddubsw        m4, m15, m4
1610    pmulhrsw         m4, m14
1611%endif
1612    packsswb         m4, m4
1613    pcmpeqw          m6, m6 ; FIXME
1614    psrldq           m6, 15 ; FIXME
1615    vpblendvb        m3, m3, m4, m6
1616    pcmpgtb          m7, m2, m3
1617    punpcklbw        m2, m3, m7
1618    punpckhbw        m3, m7
1619
1620    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1621    pmullw           m2, m8
1622    pmullw           m3, m5
1623    pmulhrsw         m2, m11
1624    pmulhrsw         m3, m11
1625
1626    ; dst = clip_pixel(src, noise)
1627    paddw            m0, m2
1628    paddw            m1, m3
1629    pmaxsw           m0, m13
1630    pmaxsw           m1, m13
1631    pminsw           m0, m12
1632    pminsw           m1, m12
1633    packuswb         m0, m1
1634    mova         [dstq], xm0
1635    vextracti128 [dstq+strideq], m0, 1
1636
1637    lea            srcq, [srcq+strideq*2]
1638    lea            dstq, [dstq+strideq*2]
1639    lea           lumaq, [lumaq+lstrideq*4]
1640    add      grain_lutq, 82*2
1641    sub              hb, 2
1642    jg %%loop_y_h_overlap
1643
1644    add              wq, 16
1645    jge %%end
1646    mov            srcq, r11mp
1647    mov            dstq, r12mp
1648    lea           lumaq, [r14+wq*2]
1649    add            srcq, wq
1650    add            dstq, wq
1651
1652    ; r8m = sbym
1653    cmp       dword r8m, 0
1654    jne %%loop_x_hv_overlap
1655    jmp %%loop_x_h_overlap
1656
1657%%end:
1658    RET
1659
1660%%vertical_overlap:
1661    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
1662                sby, see, overlap, unused1, unused2, lstride
1663
1664    movzx          sbyd, sbyb
1665    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1666    imul            r7d, sbyd, 173 * 0x00010001
1667    imul           sbyd, 37 * 0x01000100
1668    add             r7d, (105 << 16) | 188
1669    add            sbyd, (178 << 24) | (141 << 8)
1670    and             r7d, 0x00ff00ff
1671    and            sbyd, 0xff00ff00
1672    xor            seed, r7d
1673    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1674
1675    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1676                unused1, unused2, see, overlap, unused3, unused4, lstride
1677
1678    mov           lumaq, r9mp
1679    lea             r12, [srcq+wq]
1680    lea             r13, [dstq+wq]
1681    lea             r14, [lumaq+wq*2]
1682    mov           r11mp, r12
1683    mov           r12mp, r13
1684    mov        lstrideq, r10mp
1685    neg              wq
1686
1687%%loop_x_v_overlap:
1688    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1689    mov             r6d, seed
1690    or             seed, 0xeff4eff4
1691    test           seeb, seeh
1692    setp            r7b                     ; parity of top_seed
1693    shr            seed, 16
1694    shl             r7d, 16
1695    test           seeb, seeh
1696    setp            r7b                     ; parity of cur_seed
1697    or              r6d, 0x00010001
1698    xor             r7d, r6d
1699    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1700
1701    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1702                offx, offy, see, overlap, top_offxy, unused, lstride
1703
1704    rorx          offyd, seed, 8
1705    rorx          offxd, seed, 12
1706    and           offyd, 0xf000f
1707    and           offxd, 0xf000f
1708    imul          offyd, 82
1709    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1710    lea           offyq, [offyq+offxq+0x10001*498+16*82]
1711
1712    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1713                h, offxy, see, overlap, top_offxy, unused, lstride
1714
1715    movzx    top_offxyd, offxyw
1716    shr          offxyd, 16
1717
1718    mov              hd, hm
1719    mov      grain_lutq, grain_lutmp
1720%%loop_y_v_overlap:
1721    ; src
1722    mova            xm4, [lumaq+lstrideq*0+ 0]
1723    mova            xm6, [lumaq+lstrideq*0+16]
1724    mova            xm0, [srcq]
1725    vpbroadcastd     m7, [pb_1]
1726    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
1727    vinserti128      m6, [lumaq+lstrideq*2+16], 1
1728    vinserti128      m0, [srcq+strideq], 1
1729    pxor             m2, m2
1730    pmaddubsw        m4, m7
1731    pmaddubsw        m6, m7
1732    pavgw            m4, m2
1733    pavgw            m6, m2
1734
1735%if %1
1736    packuswb         m4, m6                 ; luma
1737    punpckhbw        m6, m4, m0
1738    punpcklbw        m4, m0                 ; { luma, chroma }
1739    pmaddubsw        m6, m14
1740    pmaddubsw        m4, m14
1741    psraw            m6, 6
1742    psraw            m4, 6
1743    paddw            m6, m15
1744    paddw            m4, m15
1745    packuswb         m4, m6                 ; pack+unpack = clip
1746    punpckhbw        m6, m4, m2
1747    punpcklbw        m4, m2
1748%endif
1749
1750    punpckhwd        m5, m4, m2
1751    punpcklwd        m4, m2
1752    punpckhwd        m7, m6, m2
1753    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1754
1755    ; scaling[luma_src]
1756    pcmpeqw          m3, m3
1757    pcmpeqw          m9, m9
1758    vpgatherdd       m8, [scalingq+m4], m3
1759    vpgatherdd       m4, [scalingq+m5], m9
1760    pcmpeqw          m3, m3
1761    pcmpeqw          m9, m9
1762    vpgatherdd       m5, [scalingq+m6], m3
1763    vpgatherdd       m6, [scalingq+m7], m9
1764    pand             m8, m10
1765    pand             m4, m10
1766    pand             m5, m10
1767    pand             m6, m10
1768    packusdw         m8, m4
1769    packusdw         m5, m6
1770
1771    ; unpack chroma_source
1772    punpckhbw        m1, m0, m2
1773    punpcklbw        m0, m2                 ; m0-1: src as word
1774
1775    ; grain = grain_lut[offy+y][offx+x]
1776%if %1
1777    vpbroadcastd     m6, [pb_23_22]
1778%endif
1779    movq            xm3, [grain_lutq+offxyq]
1780    movq            xm4, [grain_lutq+top_offxyq]
1781    vinserti128      m3, [grain_lutq+offxyq+8], 1
1782    vinserti128      m4, [grain_lutq+top_offxyq+8], 1
1783    punpcklbw        m4, m3
1784%if %1
1785    pmaddubsw        m4, m6, m4
1786    pmulhrsw         m4, [pw_1024]
1787%else
1788    pmaddubsw        m4, m15, m4
1789    pmulhrsw         m4, m14
1790%endif
1791    packsswb         m4, m4
1792    vpermq           m4, m4, q3120
1793    ; only interpolate first line, insert second line unmodified
1794    vinserti128      m3, m4, [grain_lutq+offxyq+82], 1
1795    pcmpgtb          m7, m2, m3
1796    punpcklbw        m2, m3, m7
1797    punpckhbw        m3, m7
1798
1799    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1800    pmullw           m2, m8
1801    pmullw           m3, m5
1802    pmulhrsw         m2, m11
1803    pmulhrsw         m3, m11
1804
1805    ; dst = clip_pixel(src, noise)
1806    paddw            m0, m2
1807    paddw            m1, m3
1808    pmaxsw           m0, m13
1809    pmaxsw           m1, m13
1810    pminsw           m0, m12
1811    pminsw           m1, m12
1812    packuswb         m0, m1
1813    mova         [dstq], xm0
1814    vextracti128 [dstq+strideq], m0, 1
1815
1816    sub              hb, 2
1817    jl %%end_y_v_overlap
1818    lea            srcq, [srcq+strideq*2]
1819    lea            dstq, [dstq+strideq*2]
1820    lea           lumaq, [lumaq+lstrideq*4]
1821    add      grain_lutq, 82*2
1822    jmp %%loop_y
1823
1824%%end_y_v_overlap:
1825    add              wq, 16
1826    jge %%end_hv
1827    mov            srcq, r11mp
1828    mov            dstq, r12mp
1829    lea           lumaq, [r14+wq*2]
1830    add            srcq, wq
1831    add            dstq, wq
1832
1833    ; since fg_dataq.overlap is guaranteed to be set, we never jump
1834    ; back to .loop_x_v_overlap, and instead always fall-through to
1835    ; h+v overlap
1836
1837%%loop_x_hv_overlap:
1838    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1839    mov             r6d, seed
1840    or             seed, 0xeff4eff4
1841    test           seeb, seeh
1842    setp            r7b                     ; parity of top_seed
1843    shr            seed, 16
1844    shl             r7d, 16
1845    test           seeb, seeh
1846    setp            r7b                     ; parity of cur_seed
1847    or              r6d, 0x00010001
1848    xor             r7d, r6d
1849    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1850
1851    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1852                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
1853
1854    lea  topleft_offxyq, [top_offxyq+16]
1855    lea     left_offxyq, [offyq+16]
1856    rorx          offyd, seed, 8
1857    rorx          offxd, seed, 12
1858    and           offyd, 0xf000f
1859    and           offxd, 0xf000f
1860    imul          offyd, 82
1861    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1862    lea           offyq, [offyq+offxq+0x10001*498+16*82]
1863
1864    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1865                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
1866
1867    movzx    top_offxyd, offxyw
1868    shr          offxyd, 16
1869
1870    mov              hd, hm
1871    mov      grain_lutq, grain_lutmp
1872%%loop_y_hv_overlap:
1873    ; src
1874    mova            xm4, [lumaq+lstrideq*0+ 0]
1875    mova            xm6, [lumaq+lstrideq*0+16]
1876    mova            xm0, [srcq]
1877    vpbroadcastd     m7, [pb_1]
1878    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
1879    vinserti128      m6, [lumaq+lstrideq*2+16], 1
1880    vinserti128      m0, [srcq+strideq], 1
1881    pxor             m2, m2
1882    pmaddubsw        m4, m7
1883    pmaddubsw        m6, m7
1884    pavgw            m4, m2
1885    pavgw            m6, m2
1886
1887%if %1
1888    packuswb         m4, m6                 ; luma
1889    punpckhbw        m6, m4, m0
1890    punpcklbw        m4, m0                 ; { luma, chroma }
1891    pmaddubsw        m6, m14
1892    pmaddubsw        m4, m14
1893    psraw            m6, 6
1894    psraw            m4, 6
1895    paddw            m6, m15
1896    paddw            m4, m15
1897    packuswb         m4, m6                 ; pack+unpack = clip
1898    punpckhbw        m6, m4, m2
1899    punpcklbw        m4, m2
1900%endif
1901
1902    punpckhwd        m5, m4, m2
1903    punpcklwd        m4, m2
1904    punpckhwd        m7, m6, m2
1905    punpcklwd        m6, m2                 ; m4-7: src as dword
1906
1907    ; scaling[src]
1908    pcmpeqw          m9, m9
1909    pcmpeqw          m3, m3
1910    vpgatherdd       m8, [scalingq+m4], m9
1911    vpgatherdd       m4, [scalingq+m5], m3
1912    pcmpeqw          m9, m9
1913    pcmpeqw          m3, m3
1914    vpgatherdd       m5, [scalingq+m6], m9
1915    vpgatherdd       m6, [scalingq+m7], m3
1916    pand             m8, m10
1917    pand             m4, m10
1918    pand             m5, m10
1919    pand             m6, m10
1920    packusdw         m8, m4
1921    packusdw         m5, m6
1922
1923    ; unpack chroma source
1924    punpckhbw        m1, m0, m2
1925    punpcklbw        m0, m2                 ; m0-1: src as word
1926
1927    ; grain = grain_lut[offy+y][offx+x]
1928%if %1
1929    vpbroadcastd     m9, [pb_23_22]
1930%endif
1931    movu            xm3, [grain_lutq+offxyq]
1932    movq            xm6, [grain_lutq+top_offxyq]
1933    vinserti128      m3, [grain_lutq+offxyq+82], 1
1934    vinserti128      m6, [grain_lutq+top_offxyq+8], 1
1935    movd            xm4, [grain_lutq+left_offxyq]
1936    movd            xm7, [grain_lutq+topleft_offxyq]
1937    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
1938    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1939    punpcklbw        m4, m3
1940    punpcklbw       xm7, xm6
1941%if %1
1942    pmaddubsw        m4, m9, m4
1943    pmaddubsw       xm7, xm9, xm7
1944    pmulhrsw         m4, [pw_1024]
1945    pmulhrsw        xm7, [pw_1024]
1946%else
1947    pmaddubsw        m4, m15, m4
1948    pmaddubsw       xm7, xm15, xm7
1949    pmulhrsw         m4, m14
1950    pmulhrsw        xm7, xm14
1951%endif
1952    packsswb         m4, m4
1953    packsswb        xm7, xm7
1954    pcmpeqw          m9, m9                 ; this is kind of ugly
1955    psrldq           m9, 15
1956    vpblendvb        m3, m3, m4, m9
1957    shufpd           m9, m9, m9, 1110b
1958    vpblendvb        m6, m6, m7, m9
1959    vpermq           m9, m3, q3120
1960    ; followed by v interpolation (top | cur -> cur)
1961    punpcklbw        m6, m9
1962%if %1
1963    vpbroadcastd     m9, [pb_23_22]
1964    pmaddubsw        m6, m9, m6
1965    pmulhrsw         m6, [pw_1024]
1966%else
1967    pmaddubsw        m6, m15, m6
1968    pmulhrsw         m6, m14
1969%endif
1970    packsswb         m6, m6
1971    vpermq           m6, m6, q3120
1972    vpblendd         m3, m3, m6, 00001111b
1973    pcmpgtb          m7, m2, m3
1974    punpcklbw        m2, m3, m7
1975    punpckhbw        m3, m7
1976
1977    ; noise = round2(scaling[src] * grain, scaling_shift)
1978    pmullw           m2, m8
1979    pmullw           m3, m5
1980    pmulhrsw         m2, m11
1981    pmulhrsw         m3, m11
1982
1983    ; dst = clip_pixel(src, noise)
1984    paddw            m0, m2
1985    paddw            m1, m3
1986    pmaxsw           m0, m13
1987    pmaxsw           m1, m13
1988    pminsw           m0, m12
1989    pminsw           m1, m12
1990    packuswb         m0, m1
1991    mova         [dstq], xm0
1992    vextracti128 [dstq+strideq], m0, 1
1993
1994    lea            srcq, [srcq+strideq*2]
1995    lea            dstq, [dstq+strideq*2]
1996    lea           lumaq, [lumaq+lstrideq*4]
1997    add      grain_lutq, 82*2
1998    sub              hb, 2
1999    jg %%loop_y_h_overlap
2000
2001%%end_y_hv_overlap:
2002    add              wq, 16
2003    jge %%end_hv
2004    mov            srcq, r11mp
2005    mov            dstq, r12mp
2006    lea           lumaq, [r14+wq*2]
2007    add            srcq, wq
2008    add            dstq, wq
2009    jmp %%loop_x_hv_overlap
2010
2011%%end_hv:
2012    RET
2013%endmacro
2014
2015    FGUV_32x32xN_LOOP 1                     ; %1=1: scaling index = pmaddubsw of {luma, chroma} byte pairs with m14, >>6, +m15, clipped to 8 bits; overlap blend uses pb_23_22 / pw_1024
2016.csfl:                                      ; NOTE(review): presumably the chroma-scaling-from-luma entry point; the branch to it is outside this excerpt - confirm
2017    FGUV_32x32xN_LOOP 0                     ; %1=0: scaling index = averaged luma only; overlap blend uses m15 (pmaddubsw weights) and m14 (pmulhrsw multiplier)
2018
2019%endif ; ARCH_X86_64
2020