1; Copyright © 2019, VideoLAN and dav1d authors
2; Copyright © 2019, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 32
32pb_8x_27_17_8x_17_27: times 8 db 27, 17
33                      times 8 db 17, 27
34pw_1024: times 16 dw 1024
35pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
36rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
37byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
38pw_seed_xor: times 2 dw 0xb524
39             times 2 dw 0x49d8
40pd_m65536: dd ~0xffff
41pb_23_22: times 2 db 23, 22
42pb_1: times 4 db 1
43hmul_bits: dw 32768, 16384, 8192, 4096
44round: dw 2048, 1024, 512
45mul_bits: dw 256, 128, 64, 32, 16
46round_vals: dw 32, 64, 128, 256, 512
47max: dw 255, 240, 235
48min: dw 0, 16
49pb_27_17_17_27: db 27, 17, 17, 27
50pw_1: dw 1
51
52%macro JMP_TABLE 1-*
53    %xdefine %1_table %%table
54    %xdefine %%base %1_table
55    %xdefine %%prefix mangle(private_prefix %+ _%1)
56    %%table:
57    %rep %0 - 1
58        dd %%prefix %+ .ar%2 - %%base
59        %rotate 1
60    %endrep
61%endmacro
62
63ALIGN 4
64JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
65JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
66JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3
67JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3
68
69struc FGData
70    .seed:                      resd 1
71    .num_y_points:              resd 1
72    .y_points:                  resb 14 * 2
73    .chroma_scaling_from_luma:  resd 1
74    .num_uv_points:             resd 2
75    .uv_points:                 resb 2 * 10 * 2
76    .scaling_shift:             resd 1
77    .ar_coeff_lag:              resd 1
78    .ar_coeffs_y:               resb 24
79    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
80    .ar_coeff_shift:            resq 1
81    .grain_scale_shift:         resd 1
82    .uv_mult:                   resd 2
83    .uv_luma_mult:              resd 2
84    .uv_offset:                 resd 2
85    .overlap_flag:              resd 1
86    .clip_to_restricted_range:  resd 1
87endstruc
88
89cextern gaussian_sequence
90
91SECTION .text
92
93INIT_XMM avx2
94cglobal generate_grain_y, 2, 9, 16, buf, fg_data
95    lea              r4, [pb_mask]
96%define base r4-pb_mask
97    movq            xm1, [base+rnd_next_upperbit_mask]
98    movq            xm4, [base+mul_bits]
99    movq            xm7, [base+hmul_bits]
100    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
101    vpbroadcastw    xm8, [base+round+r2*2]
102    mova            xm5, [base+pb_mask]
103    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
104    vpbroadcastd    xm9, [base+pd_m65536]
105    mov              r2, -73*82
106    sub            bufq, r2
107    lea              r3, [gaussian_sequence]
108.loop:
109    pand            xm2, xm0, xm1
110    psrlw           xm3, xm2, 10
111    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
112    pmullw          xm2, xm4            ; bits 0x0f00 are set
113    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
114    psllq           xm6, xm2, 30
115    por             xm2, xm6
116    psllq           xm6, xm2, 15
117    por             xm2, xm6            ; aggregate each bit into next seed's high bit
118    pmulhuw         xm3, xm0, xm7
119    por             xm2, xm3            ; 4 next output seeds
120    pshuflw         xm0, xm2, q3333
121    psrlw           xm2, 5
122    pmovzxwd        xm3, xm2
123    mova            xm6, xm9
124    vpgatherdd      xm2, [r3+xm3*2], xm6
125    pandn           xm2, xm9, xm2
126    packusdw        xm2, xm2
127    pmulhrsw        xm2, xm8
128    packsswb        xm2, xm2
129    movd      [bufq+r2], xm2
130    add              r2, 4
131    jl .loop
132
133    ; auto-regression code
134    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
135    movsxd           r2, [base+generate_grain_y_avx2_table+r2*4]
136    lea              r2, [r2+base+generate_grain_y_avx2_table]
137    jmp              r2
138
139.ar1:
140    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
141    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
142    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
143    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
144    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
145    pinsrb          xm4, [pb_1], 3
146    pmovsxbw        xm4, xm4
147    pshufd          xm5, xm4, q1111
148    pshufd          xm4, xm4, q0000
149    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
150    sub            bufq, 82*73-(82*3+79)
151    mov              hd, 70
152    mov            mind, -128
153    mov            maxd, 127
154.y_loop_ar1:
155    mov              xq, -76
156    movsx         val3d, byte [bufq+xq-1]
157.x_loop_ar1:
158    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
159    pmovsxbw        xm2, [bufq+xq-82+0]     ; top
160    pmovsxbw        xm1, [bufq+xq-82+1]     ; top/right
161    punpcklwd       xm0, xm2
162    punpcklwd       xm1, xm3
163    pmaddwd         xm0, xm4
164    pmaddwd         xm1, xm5
165    paddd           xm0, xm1
166.x_loop_ar1_inner:
167    movd          val0d, xm0
168    psrldq          xm0, 4
169    imul          val3d, cf3d
170    add           val3d, val0d
171%if WIN64
172    sarx          val3d, val3d, shiftd
173%else
174    sar           val3d, shiftb
175%endif
176    movsx         val0d, byte [bufq+xq]
177    add           val3d, val0d
178    cmp           val3d, maxd
179    cmovns        val3d, maxd
180    cmp           val3d, mind
181    cmovs         val3d, mind
182    mov  byte [bufq+xq], val3b
183    ; keep val3d in-place as left for next x iteration
184    inc              xq
185    jz .x_loop_ar1_end
186    test             xq, 3
187    jnz .x_loop_ar1_inner
188    jmp .x_loop_ar1
189
190.x_loop_ar1_end:
191    add            bufq, 82
192    dec              hd
193    jg .y_loop_ar1
194.ar0:
195    RET
196
197.ar2:
198    DEFINE_ARGS buf, fg_data, shift
199    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
200    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
201    movq           xm15, [base+byte_blend+1]
202    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
203    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
204    pmovsxbw        xm9, xm9
205    DEFINE_ARGS buf, fg_data, h, x
206    pshufd         xm12, xm9, q0000
207    pshufd         xm13, xm9, q1111
208    pshufd         xm11, xm8, q3333
209    pshufd         xm10, xm8, q2222
210    pshufd          xm9, xm8, q1111
211    pshufd          xm8, xm8, q0000
212    pmovzxwd       xm14, xm14
213    sub            bufq, 82*73-(82*3+79)
214    mov              hd, 70
215.y_loop_ar2:
216    mov              xq, -76
217
218.x_loop_ar2:
219    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
220    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
221    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
222    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
223    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
224    punpcklwd       xm2, xm0, xm2
225    punpcklwd       xm3, xm4
226    pmaddwd         xm2, xm8
227    pmaddwd         xm3, xm11
228    paddd           xm2, xm3
229
230    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
231    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
232    psrldq          xm6, xm0, 8             ; y=-2,x=[+2,+5]
233    punpcklwd       xm4, xm5
234    punpcklwd       xm6, xm1
235    psrldq          xm7, xm1, 6             ; y=-1,x=[+1,+5]
236    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
237    punpcklwd       xm7, xm1
238    pmaddwd         xm4, xm9
239    pmaddwd         xm6, xm10
240    pmaddwd         xm7, xm12
241    paddd           xm4, xm6
242    paddd           xm2, xm7
243    paddd           xm2, xm4
244    paddd           xm2, xm14
245
246    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
247.x_loop_ar2_inner:
248    pmovsxbw        xm1, xm0
249    pmaddwd         xm3, xm1, xm13
250    paddd           xm3, xm2
251    psrldq          xm1, 4                  ; y=0,x=0
252    psrldq          xm2, 4                  ; shift top to next pixel
253    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
254    ; don't packssdw since we only care about one value
255    paddw           xm3, xm1
256    packsswb        xm3, xm3
257    pextrb    [bufq+xq], xm3, 0
258    pslldq          xm3, 2
259    pand            xm3, xm15
260    pandn           xm0, xm15, xm0
261    por             xm0, xm3
262    psrldq          xm0, 1
263    inc              xq
264    jz .x_loop_ar2_end
265    test             xq, 3
266    jnz .x_loop_ar2_inner
267    jmp .x_loop_ar2
268
269.x_loop_ar2_end:
270    add            bufq, 82
271    dec              hd
272    jg .y_loop_ar2
273    RET
274
275.ar3:
276    DEFINE_ARGS buf, fg_data, shift
277%if WIN64
278    SUB             rsp, 16*12
279%assign stack_size_padded (stack_size_padded+16*12)
280%assign stack_size (stack_size+16*12)
281%else
282    ALLOC_STACK   16*12
283%endif
284    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
285    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
286    movq           xm15, [base+byte_blend]
287    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-7
288    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_y+ 8]   ; cf8-15
289    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
290    pshufd          xm9, xm0, q1111
291    pshufd         xm10, xm0, q2222
292    pshufd         xm11, xm0, q3333
293    pshufd          xm0, xm0, q0000
294    pshufd          xm6, xm1, q1111
295    pshufd          xm7, xm1, q2222
296    pshufd          xm8, xm1, q3333
297    pshufd          xm1, xm1, q0000
298    pshufd          xm3, xm2, q1111
299    psrldq         xm13, xm2, 10
300    pinsrw          xm2, [pw_1], 5
301    pshufd          xm4, xm2, q2222
302    pshufd          xm2, xm2, q0000
303    pinsrw         xm13, [base+round_vals+shiftq*2-10], 3
304    mova    [rsp+ 0*16], xm0
305    mova    [rsp+ 1*16], xm9
306    mova    [rsp+ 2*16], xm10
307    mova    [rsp+ 3*16], xm11
308    mova    [rsp+ 4*16], xm1
309    mova    [rsp+ 5*16], xm6
310    mova    [rsp+ 6*16], xm7
311    mova    [rsp+ 7*16], xm8
312    mova    [rsp+ 8*16], xm2
313    mova    [rsp+ 9*16], xm3
314    mova    [rsp+10*16], xm4
315    DEFINE_ARGS buf, fg_data, h, x
316    sub            bufq, 82*73-(82*3+79)
317    mov              hd, 70
318.y_loop_ar3:
319    mov              xq, -76
320
321.x_loop_ar3:
322    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
323    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
324    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
325    pxor            xm3, xm3
326    pcmpgtb         xm6, xm3, xm2
327    pcmpgtb         xm5, xm3, xm1
328    pcmpgtb         xm4, xm3, xm0
329    punpckhbw       xm3, xm0, xm4
330    punpcklbw       xm0, xm4
331    punpckhbw       xm4, xm1, xm5
332    punpcklbw       xm1, xm5
333    punpckhbw       xm5, xm2, xm6
334    punpcklbw       xm2, xm6
335
336    psrldq          xm6, xm0, 2
337    psrldq          xm7, xm0, 4
338    psrldq          xm8, xm0, 6
339    psrldq          xm9, xm0, 8
340    palignr        xm10, xm3, xm0, 10
341    palignr        xm11, xm3, xm0, 12
342
343    punpcklwd       xm0, xm6
344    punpcklwd       xm7, xm8
345    punpcklwd       xm9, xm10
346    punpcklwd      xm11, xm1
347    pmaddwd         xm0, [rsp+ 0*16]
348    pmaddwd         xm7, [rsp+ 1*16]
349    pmaddwd         xm9, [rsp+ 2*16]
350    pmaddwd        xm11, [rsp+ 3*16]
351    paddd           xm0, xm7
352    paddd           xm9, xm11
353    paddd           xm0, xm9
354
355    psrldq          xm6, xm1, 2
356    psrldq          xm7, xm1, 4
357    psrldq          xm8, xm1, 6
358    psrldq          xm9, xm1, 8
359    palignr        xm10, xm4, xm1, 10
360    palignr        xm11, xm4, xm1, 12
361    psrldq         xm12, xm2, 2
362
363    punpcklwd       xm6, xm7
364    punpcklwd       xm8, xm9
365    punpcklwd      xm10, xm11
366    punpcklwd      xm12, xm2, xm12
367    pmaddwd         xm6, [rsp+ 4*16]
368    pmaddwd         xm8, [rsp+ 5*16]
369    pmaddwd        xm10, [rsp+ 6*16]
370    pmaddwd        xm12, [rsp+ 7*16]
371    paddd           xm6, xm8
372    paddd          xm10, xm12
373    paddd           xm6, xm10
374    paddd           xm0, xm6
375
376    psrldq          xm6, xm2, 4
377    psrldq          xm7, xm2, 6
378    psrldq          xm8, xm2, 8
379    palignr         xm9, xm5, xm2, 10
380    palignr         xm5, xm5, xm2, 12
381
382    punpcklwd       xm6, xm7
383    punpcklwd       xm8, xm9
384    punpcklwd       xm5, xm14
385    pmaddwd         xm6, [rsp+ 8*16]
386    pmaddwd         xm8, [rsp+ 9*16]
387    pmaddwd         xm5, [rsp+10*16]
388    paddd           xm0, xm6
389    paddd           xm8, xm5
390    paddd           xm0, xm8
391
392    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
393.x_loop_ar3_inner:
394    pmovsxbw        xm2, xm1
395    pmaddwd         xm2, xm13
396    pshufd          xm3, xm2, q1111
397    paddd           xm2, xm3                ; left+cur
398    paddd           xm2, xm0                ; add top
399    psrldq          xm0, 4
400    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
401    ; don't packssdw since we only care about one value
402    packsswb        xm2, xm2
403    pextrb    [bufq+xq], xm2, 0
404    pslldq          xm2, 3
405    pand            xm2, xm15
406    pandn           xm1, xm15, xm1
407    por             xm1, xm2
408    psrldq          xm1, 1
409    inc              xq
410    jz .x_loop_ar3_end
411    test             xq, 3
412    jnz .x_loop_ar3_inner
413    jmp .x_loop_ar3
414
415.x_loop_ar3_end:
416    add            bufq, 82
417    dec              hd
418    jg .y_loop_ar3
419    RET
420
421%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
422INIT_XMM avx2
423cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
424    lea              r4, [pb_mask]
425%define base r4-pb_mask
426    movq            xm1, [base+rnd_next_upperbit_mask]
427    movq            xm4, [base+mul_bits]
428    movq            xm7, [base+hmul_bits]
429    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
430    vpbroadcastw    xm8, [base+round+r5*2]
431    mova            xm5, [base+pb_mask]
432    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
433    vpbroadcastw    xm9, [base+pw_seed_xor+uvq*4]
434    pxor            xm0, xm9
435    vpbroadcastd    xm9, [base+pd_m65536]
436    lea              r6, [gaussian_sequence]
437%if %2
438    mov             r7d, 73-35*%3
439    add            bufq, 44
440.loop_y:
441    mov              r5, -44
442.loop_x:
443%else
444    mov              r5, -73*82
445    sub            bufq, r5
446.loop:
447%endif
448    pand            xm2, xm0, xm1
449    psrlw           xm3, xm2, 10
450    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
451    pmullw          xm2, xm4            ; bits 0x0f00 are set
452    pshufb          xm2, xm5, xm2       ; set 15th bit for next 4 seeds
453    psllq           xm6, xm2, 30
454    por             xm2, xm6
455    psllq           xm6, xm2, 15
456    por             xm2, xm6            ; aggregate each bit into next seed's high bit
457    pmulhuw         xm3, xm0, xm7
458    por             xm2, xm3            ; 4 next output seeds
459    pshuflw         xm0, xm2, q3333
460    psrlw           xm2, 5
461    pmovzxwd        xm3, xm2
462    mova            xm6, xm9
463    vpgatherdd      xm2, [r6+xm3*2], xm6
464    pandn           xm2, xm9, xm2
465    packusdw        xm2, xm2
466    pmulhrsw        xm2, xm8
467    packsswb        xm2, xm2
468    movd      [bufq+r5], xm2
469    add              r5, 4
470%if %2
471    jl .loop_x
472    add            bufq, 82
473    dec             r7d
474    jg .loop_y
475%else
476    jl .loop
477%endif
478
479    ; auto-regression code
480    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
481    movsxd           r5, [base+generate_grain_uv_%1_avx2_table+r5*4]
482    lea              r5, [r5+base+generate_grain_uv_%1_avx2_table]
483    jmp              r5
484
485.ar0:
486    INIT_YMM avx2
487    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
488    imul            uvd, 28
489    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
490    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
491    movd            xm3, [base+hmul_bits+shiftq*2]
492    DEFINE_ARGS buf, bufy, h
493    pmovsxbw        xm4, xm4
494%if %2
495    vpbroadcastd     m7, [pb_1]
496    vpbroadcastw     m6, [hmul_bits+2+%3*2]
497%endif
498    vpbroadcastw     m4, xm4
499    vpbroadcastw     m3, xm3
500    pxor            m12, m12
501%if %2
502    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
503%else
504    sub            bufq, 82*70-3
505%endif
506    add           bufyq, 3+82*3
507    mov              hd, 70-35*%3
508.y_loop_ar0:
509%if %2
510    ; first 32 pixels
511    movu            xm8, [bufyq]
512%if %3
513    movu            xm9, [bufyq+82]
514%endif
515    movu           xm10, [bufyq+16]
516%if %3
517    movu           xm11, [bufyq+82+16]
518%endif
519    vinserti128      m8, [bufyq+32], 1
520%if %3
521    vinserti128      m9, [bufyq+82+32], 1
522%endif
523    vinserti128     m10, [bufyq+48], 1
524%if %3
525    vinserti128     m11, [bufyq+82+48], 1
526%endif
527    pmaddubsw        m8, m7, m8
528%if %3
529    pmaddubsw        m9, m7, m9
530%endif
531    pmaddubsw       m10, m7, m10
532%if %3
533    pmaddubsw       m11, m7, m11
534    paddw            m8, m9
535    paddw           m10, m11
536%endif
537    pmulhrsw         m8, m6
538    pmulhrsw        m10, m6
539%else
540    xor             r3d, r3d
541    ; first 32x2 pixels
542.x_loop_ar0:
543    movu             m8, [bufyq+r3]
544    pcmpgtb          m9, m12, m8
545    punpckhbw       m10, m8, m9
546    punpcklbw        m8, m9
547%endif
548    pmullw           m8, m4
549    pmullw          m10, m4
550    pmulhrsw         m8, m3
551    pmulhrsw        m10, m3
552%if %2
553    movu             m0, [bufq]
554%else
555    movu             m0, [bufq+r3]
556%endif
557    pcmpgtb          m1, m12, m0
558    punpckhbw        m9, m0, m1
559    punpcklbw        m0, m1
560    paddw            m0, m8
561    paddw            m9, m10
562    packsswb         m0, m9
563%if %2
564    movu         [bufq], m0
565%else
566    movu      [bufq+r3], m0
567    add             r3d, 32
568    cmp             r3d, 64
569    jl .x_loop_ar0
570%endif
571
572    ; last 6/12 pixels
573    movu            xm8, [bufyq+32*2]
574%if %2
575%if %3
576    movu            xm9, [bufyq+32*2+82]
577%endif
578    pmaddubsw       xm8, xm7, xm8
579%if %3
580    pmaddubsw       xm9, xm7, xm9
581    paddw           xm8, xm9
582%endif
583    pmulhrsw        xm8, xm6
584    pmullw          xm8, xm4
585    pmulhrsw        xm8, xm3
586    movq            xm0, [bufq+32]
587    pcmpgtb         xm9, xm12, xm0
588    punpcklbw       xm9, xm0, xm9
589    paddw           xm8, xm9
590    packsswb        xm8, xm8
591    vpblendw        xm0, xm8, xm0, 1000b
592    movq      [bufq+32], xm0
593%else
594    pcmpgtb         xm9, xm12, xm8
595    punpckhbw      xm10, xm8, xm9
596    punpcklbw       xm8, xm9
597    pmullw         xm10, xm4
598    pmullw          xm8, xm4
599    pmulhrsw       xm10, xm3
600    pmulhrsw        xm8, xm3
601    movu            xm0, [bufq+64]
602    pcmpgtb         xm9, xm12, xm0
603    punpcklbw       xm1, xm0, xm9
604    punpckhbw       xm9, xm0, xm9
605    paddw           xm1, xm8
606    paddw           xm9, xm10
607    packsswb        xm1, xm9
608    vpblendw        xm0, xm1, xm0, 11000000b
609    movu      [bufq+64], xm0
610%endif
611
612    add            bufq, 82
613    add           bufyq, 82<<%3
614    dec              hd
615    jg .y_loop_ar0
616    RET
617
618.ar1:
619    INIT_XMM avx2
620    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
621    imul            uvd, 28
622    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
623    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
624    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
625    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
626    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
627    pmovsxbw        xm4, xm4
628    pshufd          xm5, xm4, q1111
629    pshufd          xm4, xm4, q0000
630    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
631%if %2
632    vpbroadcastd    xm7, [pb_1]
633    vpbroadcastw    xm6, [hmul_bits+2+%3*2]
634%endif
635    vpbroadcastd    xm3, xm3
636%if %2
637    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
638%else
639    sub            bufq, 82*70-(82-3)
640%endif
641    add           bufyq, 79+82*3
642    mov              hd, 70-35*%3
643    mov            mind, -128
644    mov            maxd, 127
645.y_loop_ar1:
646    mov              xq, -(76>>%2)
647    movsx         val3d, byte [bufq+xq-1]
648.x_loop_ar1:
649    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
650%if %2
651    movq            xm8, [bufyq+xq*2]
652%if %3
653    movq            xm9, [bufyq+xq*2+82]
654%endif
655%endif
656    psrldq          xm2, xm0, 2             ; top
657    psrldq          xm1, xm0, 4             ; top/right
658%if %2
659    pmaddubsw       xm8, xm7, xm8
660%if %3
661    pmaddubsw       xm9, xm7, xm9
662    paddw           xm8, xm9
663%endif
664    pmulhrsw        xm8, xm6
665%else
666    pmovsxbw        xm8, [bufyq+xq]
667%endif
668    punpcklwd       xm0, xm2
669    punpcklwd       xm1, xm8
670    pmaddwd         xm0, xm4
671    pmaddwd         xm1, xm5
672    paddd           xm0, xm1
673    paddd           xm0, xm3
674.x_loop_ar1_inner:
675    movd          val0d, xm0
676    psrldq          xm0, 4
677    imul          val3d, cf3d
678    add           val3d, val0d
679    sarx          val3d, val3d, shiftd
680    movsx         val0d, byte [bufq+xq]
681    add           val3d, val0d
682    cmp           val3d, maxd
683    cmovns        val3d, maxd
684    cmp           val3d, mind
685    cmovs         val3d, mind
686    mov  byte [bufq+xq], val3b
687    ; keep val3d in-place as left for next x iteration
688    inc              xq
689    jz .x_loop_ar1_end
690    test             xq, 3
691    jnz .x_loop_ar1_inner
692    jmp .x_loop_ar1
693
694.x_loop_ar1_end:
695    add            bufq, 82
696    add           bufyq, 82<<%3
697    dec              hd
698    jg .y_loop_ar1
699    RET
700
701.ar2:
702    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
703    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
704    imul            uvd, 28
705    vpbroadcastw   xm15, [base+round_vals-12+shiftq*2]
706    pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
707    pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
708    pinsrw          xm9, [base+pw_1], 5
709%if %2
710    vpbroadcastw    xm7, [base+hmul_bits+2+%3*2]
711    vpbroadcastd    xm6, [base+pb_1]
712%endif
713    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
714    pshufd         xm12, xm9, q0000
715    pshufd         xm13, xm9, q1111
716    pshufd         xm14, xm9, q2222
717    pshufd         xm11, xm8, q3333
718    pshufd         xm10, xm8, q2222
719    pshufd          xm9, xm8, q1111
720    pshufd          xm8, xm8, q0000
721%if %2
722    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
723%else
724    sub            bufq, 82*70-(82-3)
725%endif
726    add           bufyq, 79+82*3
727    mov              hd, 70-35*%3
728.y_loop_ar2:
729    mov              xq, -(76>>%2)
730
731.x_loop_ar2:
732    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
733    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
734    psrldq          xm2, xm0, 2             ; y=-2,x=[-1,+5]
735    psrldq          xm3, xm1, 2             ; y=-1,x=[-1,+5]
736    psrldq          xm4, xm1, 4             ; y=-1,x=[+0,+5]
737    punpcklwd       xm2, xm0, xm2
738    punpcklwd       xm3, xm4
739    pmaddwd         xm2, xm8
740    pmaddwd         xm3, xm11
741    paddd           xm2, xm3
742
743    psrldq          xm4, xm0, 4             ; y=-2,x=[+0,+5]
744    psrldq          xm5, xm0, 6             ; y=-2,x=[+1,+5]
745    psrldq          xm0, 8                  ; y=-2,x=[+2,+5]
746    punpcklwd       xm4, xm5
747    punpcklwd       xm0, xm1
748    psrldq          xm3, xm1, 6             ; y=-1,x=[+1,+5]
749    psrldq          xm1, xm1, 8             ; y=-1,x=[+2,+5]
750    punpcklwd       xm3, xm1
751    pmaddwd         xm4, xm9
752    pmaddwd         xm0, xm10
753    pmaddwd         xm3, xm12
754    paddd           xm4, xm0
755    paddd           xm2, xm3
756    paddd           xm2, xm4
757
758%if %2
759    movq            xm0, [bufyq+xq*2]
760%if %3
761    movq            xm3, [bufyq+xq*2+82]
762%endif
763    pmaddubsw       xm0, xm6, xm0
764%if %3
765    pmaddubsw       xm3, xm6, xm3
766    paddw           xm0, xm3
767%endif
768    pmulhrsw        xm0, xm7
769%else
770    pmovsxbw        xm0, [bufyq+xq]
771%endif
772    punpcklwd       xm0, xm15
773    pmaddwd         xm0, xm14
774    paddd           xm2, xm0
775
776    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
777.x_loop_ar2_inner:
778    pmovsxbw        xm0, xm0
779    pmaddwd         xm3, xm0, xm13
780    paddd           xm3, xm2
781    psrldq          xm2, 4                  ; shift top to next pixel
782    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
783    pslldq          xm3, 2
784    psrldq          xm0, 2
785    paddw           xm3, xm0
786    vpblendw        xm0, xm3, 00000010b
787    packsswb        xm0, xm0
788    pextrb    [bufq+xq], xm0, 1
789    inc              xq
790    jz .x_loop_ar2_end
791    test             xq, 3
792    jnz .x_loop_ar2_inner
793    jmp .x_loop_ar2
794
795.x_loop_ar2_end:
796    add            bufq, 82
797    add           bufyq, 82<<%3
798    dec              hd
799    jg .y_loop_ar2
800    RET
801
802.ar3:
803    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
804    SUB             rsp, 16*12
805%assign stack_size_padded (stack_size_padded+16*12)
806%assign stack_size (stack_size+16*12)
807    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
808    imul            uvd, 28
809    vpbroadcastw   xm14, [base+round_vals-12+shiftq*2]
810    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-7
811    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8]   ; cf8-15
812    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-23
813    pmovsxbw        xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24]   ; cf24 [luma]
814    pshufd          xm9, xm0, q1111
815    pshufd         xm10, xm0, q2222
816    pshufd         xm11, xm0, q3333
817    pshufd          xm0, xm0, q0000
818    pshufd          xm6, xm1, q1111
819    pshufd          xm7, xm1, q2222
820    pshufd          xm8, xm1, q3333
821    pshufd          xm1, xm1, q0000
822    pshufd          xm3, xm2, q1111
823    pshufd          xm4, xm2, q2222
824    vpbroadcastw    xm5, xm5
825    vpblendw        xm4, xm5, 10101010b                     ; interleave luma cf
826    psrldq          xm5, xm2, 10
827    pshufd          xm2, xm2, q0000
828    pinsrw          xm5, [base+round_vals+shiftq*2-10], 3
829    pmovzxwd       xm14, xm14
830    mova    [rsp+ 0*16], xm0
831    mova    [rsp+ 1*16], xm9
832    mova    [rsp+ 2*16], xm10
833    mova    [rsp+ 3*16], xm11
834    mova    [rsp+ 4*16], xm1
835    mova    [rsp+ 5*16], xm6
836    mova    [rsp+ 6*16], xm7
837    mova    [rsp+ 7*16], xm8
838    mova    [rsp+ 8*16], xm2
839    mova    [rsp+ 9*16], xm3
840    mova    [rsp+10*16], xm4
841    mova    [rsp+11*16], xm5
842%if %2
843    vpbroadcastd   xm13, [base+pb_1]
844    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
845%endif
846    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
847%if %2
848    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
849%else
850    sub            bufq, 82*70-(82-3)
851%endif
852    add           bufyq, 79+82*3
853    mov              hd, 70-35*%3
854.y_loop_ar3:
855    mov              xq, -(76>>%2)
856
857.x_loop_ar3:
858    movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
859    movu            xm1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
860    movu            xm2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
861    pxor            xm3, xm3
862    pcmpgtb         xm6, xm3, xm2
863    pcmpgtb         xm5, xm3, xm1
864    pcmpgtb         xm4, xm3, xm0
865    punpckhbw       xm3, xm0, xm4
866    punpcklbw       xm0, xm4
867    punpckhbw       xm4, xm1, xm5
868    punpcklbw       xm1, xm5
869    punpckhbw       xm5, xm2, xm6
870    punpcklbw       xm2, xm6
871
872    psrldq          xm6, xm0, 2
873    psrldq          xm7, xm0, 4
874    psrldq          xm8, xm0, 6
875    psrldq          xm9, xm0, 8
876    palignr        xm10, xm3, xm0, 10
877    palignr        xm11, xm3, xm0, 12
878
879    punpcklwd       xm0, xm6
880    punpcklwd       xm7, xm8
881    punpcklwd       xm9, xm10
882    punpcklwd      xm11, xm1
883    pmaddwd         xm0, [rsp+ 0*16]
884    pmaddwd         xm7, [rsp+ 1*16]
885    pmaddwd         xm9, [rsp+ 2*16]
886    pmaddwd        xm11, [rsp+ 3*16]
887    paddd           xm0, xm7
888    paddd           xm9, xm11
889    paddd           xm0, xm9
890
891    psrldq          xm6, xm1, 2
892    psrldq          xm7, xm1, 4
893    psrldq          xm8, xm1, 6
894    psrldq          xm9, xm1, 8
895    palignr        xm10, xm4, xm1, 10
896    palignr        xm11, xm4, xm1, 12
897    psrldq         xm12, xm2, 2
898
899    punpcklwd       xm6, xm7
900    punpcklwd       xm8, xm9
901    punpcklwd      xm10, xm11
902    punpcklwd      xm12, xm2, xm12
903    pmaddwd         xm6, [rsp+ 4*16]
904    pmaddwd         xm8, [rsp+ 5*16]
905    pmaddwd        xm10, [rsp+ 6*16]
906    pmaddwd        xm12, [rsp+ 7*16]
907    paddd           xm6, xm8
908    paddd          xm10, xm12
909    paddd           xm6, xm10
910    paddd           xm0, xm6
911
912    psrldq          xm6, xm2, 4
913    psrldq          xm7, xm2, 6
914    psrldq          xm8, xm2, 8
915    palignr         xm9, xm5, xm2, 10
916    palignr         xm5, xm5, xm2, 12
917
918%if %2
919    movq            xm1, [bufyq+xq*2]
920%if %3
921    movq            xm2, [bufyq+xq*2+82]
922%endif
923    pmaddubsw       xm1, xm13, xm1
924%if %3
925    pmaddubsw       xm2, xm13, xm2
926    paddw           xm1, xm2
927%endif
928    pmulhrsw        xm1, xm15
929%else
930    pmovsxbw        xm1, [bufyq+xq]
931%endif
932
933    punpcklwd       xm6, xm7
934    punpcklwd       xm8, xm9
935    punpcklwd       xm5, xm1
936    pmaddwd         xm6, [rsp+ 8*16]
937    pmaddwd         xm8, [rsp+ 9*16]
938    pmaddwd         xm5, [rsp+10*16]
939    paddd           xm0, xm6
940    paddd           xm8, xm5
941    paddd           xm0, xm8
942    paddd           xm0, xm14
943
944    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
945.x_loop_ar3_inner:
946    pmovsxbw        xm1, xm1
947    pmaddwd         xm2, xm1, [rsp+16*11]
948    pshufd          xm3, xm2, q1111
949    paddd           xm2, xm3                ; left+cur
950    paddd           xm2, xm0                ; add top
951    psrldq          xm0, 4
952    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
953    ; don't packssdw, we only care about one value
954    pslldq          xm2, 6
955    vpblendw        xm1, xm2, 1000b
956    packsswb        xm1, xm1
957    pextrb    [bufq+xq], xm1, 3
958    psrldq          xm1, 1
959    inc              xq
960    jz .x_loop_ar3_end
961    test             xq, 3
962    jnz .x_loop_ar3_inner
963    jmp .x_loop_ar3
964
965.x_loop_ar3_end:
966    add            bufq, 82
967    add           bufyq, 82<<%3
968    dec              hd
969    jg .y_loop_ar3
970    RET
971%endmacro
972
973generate_grain_uv_fn 420, 1, 1
974generate_grain_uv_fn 422, 1, 0
975generate_grain_uv_fn 444, 0, 0
976
977INIT_YMM avx2
978cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
979    pcmpeqw         m10, m10
980    psrld           m10, 24
981    mov             r7d, [fg_dataq+FGData.scaling_shift]
982    lea              r8, [pb_mask]
983%define base r8-pb_mask
984    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
985    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
986    vpbroadcastw    m12, [base+max+r7*4]
987    vpbroadcastw    m13, [base+min+r7*2]
988
989    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
990
991    mov        overlapd, [fg_dataq+FGData.overlap_flag]
992    movifnidn      sbyd, sbym
993    test           sbyd, sbyd
994    setnz           r7b
995    test            r7b, overlapb
996    jnz .vertical_overlap
997
998    imul           seed, sbyd, (173 << 24) | 37
999    add            seed, (105 << 24) | 178
1000    rol            seed, 8
1001    movzx          seed, seew
1002    xor            seed, [fg_dataq+FGData.seed]
1003
1004    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1005                unused1, unused2, see, overlap
1006
1007    lea        src_bakq, [srcq+wq]
1008    neg              wq
1009    sub            dstq, srcq
1010
1011.loop_x:
1012    mov             r6d, seed
1013    or             seed, 0xEFF4
1014    shr             r6d, 1
1015    test           seeb, seeh
1016    lea            seed, [r6+0x8000]
1017    cmovp          seed, r6d                ; updated seed
1018
1019    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1020                offx, offy, see, overlap
1021
1022    mov           offxd, seed
1023    rorx          offyd, seed, 8
1024    shr           offxd, 12
1025    and           offyd, 0xf
1026    imul          offyd, 164
1027    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1028
1029    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1030                h, offxy, see, overlap
1031
1032    mov              hd, hm
1033    mov      grain_lutq, grain_lutmp
1034.loop_y:
1035    ; src
1036    mova             m0, [srcq]
1037    pxor             m2, m2
1038    punpckhbw        m1, m0, m2
1039    punpcklbw        m0, m2                 ; m0-1: src as word
1040    punpckhwd        m5, m0, m2
1041    punpcklwd        m4, m0, m2
1042    punpckhwd        m7, m1, m2
1043    punpcklwd        m6, m1, m2             ; m4-7: src as dword
1044
1045    ; scaling[src]
1046    pcmpeqw          m3, m3
1047    pcmpeqw          m9, m9
1048    vpgatherdd       m8, [scalingq+m4], m3
1049    vpgatherdd       m4, [scalingq+m5], m9
1050    pcmpeqw          m3, m3
1051    pcmpeqw          m9, m9
1052    vpgatherdd       m5, [scalingq+m6], m3
1053    vpgatherdd       m6, [scalingq+m7], m9
1054    pand             m8, m10
1055    pand             m4, m10
1056    pand             m5, m10
1057    pand             m6, m10
1058    packusdw         m8, m4
1059    packusdw         m5, m6
1060
1061    ; grain = grain_lut[offy+y][offx+x]
1062    movu             m3, [grain_lutq+offxyq]
1063    pcmpgtb          m7, m2, m3
1064    punpcklbw        m2, m3, m7
1065    punpckhbw        m3, m7
1066
1067    ; noise = round2(scaling[src] * grain, scaling_shift)
1068    pmullw           m2, m8
1069    pmullw           m3, m5
1070    pmulhrsw         m2, m11
1071    pmulhrsw         m3, m11
1072
1073    ; dst = clip_pixel(src, noise)
1074    paddw            m0, m2
1075    paddw            m1, m3
1076    pmaxsw           m0, m13
1077    pmaxsw           m1, m13
1078    pminsw           m0, m12
1079    pminsw           m1, m12
1080    packuswb         m0, m1
1081    mova    [dstq+srcq], m0
1082
1083    add            srcq, strideq
1084    add      grain_lutq, 82
1085    dec              hd
1086    jg .loop_y
1087
1088    add              wq, 32
1089    jge .end
1090    lea            srcq, [src_bakq+wq]
1091    test       overlapd, overlapd
1092    jz .loop_x
1093
1094    ; r8m = sbym
1095    movd           xm15, [pb_27_17_17_27]
1096    cmp       dword r8m, 0
1097    jne .loop_x_hv_overlap
1098
1099    ; horizontal overlap (without vertical overlap)
1100    movd           xm14, [pw_1024]
1101.loop_x_h_overlap:
1102    mov             r6d, seed
1103    or             seed, 0xEFF4
1104    shr             r6d, 1
1105    test           seeb, seeh
1106    lea            seed, [r6+0x8000]
1107    cmovp          seed, r6d                ; updated seed
1108
1109    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1110                offx, offy, see, left_offxy
1111
1112    lea     left_offxyd, [offyd+32]         ; previous column's offy*stride+offx
1113    mov           offxd, seed
1114    rorx          offyd, seed, 8
1115    shr           offxd, 12
1116    and           offyd, 0xf
1117    imul          offyd, 164
1118    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
1119
1120    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1121                h, offxy, see, left_offxy
1122
1123    mov              hd, hm
1124    mov      grain_lutq, grain_lutmp
1125.loop_y_h_overlap:
1126    ; src
1127    mova             m0, [srcq]
1128    pxor             m2, m2
1129    punpckhbw        m1, m0, m2
1130    punpcklbw        m0, m2                 ; m0-1: src as word
1131    punpckhwd        m5, m0, m2
1132    punpcklwd        m4, m0, m2
1133    punpckhwd        m7, m1, m2
1134    punpcklwd        m6, m1, m2             ; m4-7: src as dword
1135
1136    ; scaling[src]
1137    pcmpeqw          m3, m3
1138    pcmpeqw          m9, m9
1139    vpgatherdd       m8, [scalingq+m4], m3
1140    vpgatherdd       m4, [scalingq+m5], m9
1141    pcmpeqw          m3, m3
1142    pcmpeqw          m9, m9
1143    vpgatherdd       m5, [scalingq+m6], m3
1144    vpgatherdd       m6, [scalingq+m7], m9
1145    pand             m8, m10
1146    pand             m4, m10
1147    pand             m5, m10
1148    pand             m6, m10
1149    packusdw         m8, m4
1150    packusdw         m5, m6
1151
1152    ; grain = grain_lut[offy+y][offx+x]
1153    movu             m3, [grain_lutq+offxyq]
1154    movd            xm4, [grain_lutq+left_offxyq]
1155    punpcklbw       xm4, xm3
1156    pmaddubsw       xm4, xm15, xm4
1157    pmulhrsw        xm4, xm14
1158    packsswb        xm4, xm4
1159    vpblendw        xm4, xm3, 11111110b
1160    vpblendd         m3, m4, 00001111b
1161    pcmpgtb          m7, m2, m3
1162    punpcklbw        m2, m3, m7
1163    punpckhbw        m3, m7
1164
1165    ; noise = round2(scaling[src] * grain, scaling_shift)
1166    pmullw           m2, m8
1167    pmullw           m3, m5
1168    pmulhrsw         m2, m11
1169    pmulhrsw         m3, m11
1170
1171    ; dst = clip_pixel(src, noise)
1172    paddw            m0, m2
1173    paddw            m1, m3
1174    pmaxsw           m0, m13
1175    pmaxsw           m1, m13
1176    pminsw           m0, m12
1177    pminsw           m1, m12
1178    packuswb         m0, m1
1179    mova    [dstq+srcq], m0
1180
1181    add            srcq, strideq
1182    add      grain_lutq, 82
1183    dec              hd
1184    jg .loop_y_h_overlap
1185
1186    add              wq, 32
1187    jge .end
1188    lea            srcq, [src_bakq+wq]
1189
1190    ; r8m = sbym
1191    cmp       dword r8m, 0
1192    jne .loop_x_hv_overlap
1193    jmp .loop_x_h_overlap
1194
1195.end:
1196    RET
1197
1198.vertical_overlap:
1199    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1200
1201    movzx          sbyd, sbyb
1202    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1203    imul            r7d, sbyd, 173 * 0x00010001
1204    imul           sbyd, 37 * 0x01000100
1205    add             r7d, (105 << 16) | 188
1206    add            sbyd, (178 << 24) | (141 << 8)
1207    and             r7d, 0x00ff00ff
1208    and            sbyd, 0xff00ff00
1209    xor            seed, r7d
1210    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1211
1212    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1213                unused1, unused2, see, overlap
1214
1215    lea        src_bakq, [srcq+wq]
1216    neg              wq
1217    sub            dstq, srcq
1218
1219    vpbroadcastd    m14, [pw_1024]
1220.loop_x_v_overlap:
1221    vpbroadcastw    m15, [pb_27_17_17_27]
1222
1223    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1224    mov             r6d, seed
1225    or             seed, 0xeff4eff4
1226    test           seeb, seeh
1227    setp            r7b                     ; parity of top_seed
1228    shr            seed, 16
1229    shl             r7d, 16
1230    test           seeb, seeh
1231    setp            r7b                     ; parity of cur_seed
1232    or              r6d, 0x00010001
1233    xor             r7d, r6d
1234    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1235
1236    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1237                offx, offy, see, overlap, top_offxy
1238
1239    rorx          offyd, seed, 8
1240    rorx          offxd, seed, 12
1241    and           offyd, 0xf000f
1242    and           offxd, 0xf000f
1243    imul          offyd, 164
1244    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1245    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1246
1247    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1248                h, offxy, see, overlap, top_offxy
1249
1250    movzx    top_offxyd, offxyw
1251    shr          offxyd, 16
1252
1253    mov              hd, hm
1254    mov      grain_lutq, grain_lutmp
1255.loop_y_v_overlap:
1256    ; src
1257    mova             m0, [srcq]
1258    pxor             m2, m2
1259    punpckhbw        m1, m0, m2
1260    punpcklbw        m0, m2                 ; m0-1: src as word
1261    punpckhwd        m5, m0, m2
1262    punpcklwd        m4, m0, m2
1263    punpckhwd        m7, m1, m2
1264    punpcklwd        m6, m1, m2             ; m4-7: src as dword
1265
1266    ; scaling[src]
1267    pcmpeqw          m3, m3
1268    pcmpeqw          m9, m9
1269    vpgatherdd       m8, [scalingq+m4], m3
1270    vpgatherdd       m4, [scalingq+m5], m9
1271    pcmpeqw          m3, m3
1272    pcmpeqw          m9, m9
1273    vpgatherdd       m5, [scalingq+m6], m3
1274    vpgatherdd       m6, [scalingq+m7], m9
1275    pand             m8, m10
1276    pand             m4, m10
1277    pand             m5, m10
1278    pand             m6, m10
1279    packusdw         m8, m4
1280    packusdw         m5, m6
1281
1282    ; grain = grain_lut[offy+y][offx+x]
1283    movu             m3, [grain_lutq+offxyq]
1284    movu             m4, [grain_lutq+top_offxyq]
1285    punpckhbw        m6, m4, m3
1286    punpcklbw        m4, m3
1287    pmaddubsw        m6, m15, m6
1288    pmaddubsw        m4, m15, m4
1289    pmulhrsw         m6, m14
1290    pmulhrsw         m4, m14
1291    packsswb         m3, m4, m6
1292    pcmpgtb          m7, m2, m3
1293    punpcklbw        m2, m3, m7
1294    punpckhbw        m3, m7
1295
1296    ; noise = round2(scaling[src] * grain, scaling_shift)
1297    pmullw           m2, m8
1298    pmullw           m3, m5
1299    pmulhrsw         m2, m11
1300    pmulhrsw         m3, m11
1301
1302    ; dst = clip_pixel(src, noise)
1303    paddw            m0, m2
1304    paddw            m1, m3
1305    pmaxsw           m0, m13
1306    pmaxsw           m1, m13
1307    pminsw           m0, m12
1308    pminsw           m1, m12
1309    packuswb         m0, m1
1310    mova    [dstq+srcq], m0
1311
1312    vpbroadcastw    m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
1313    add            srcq, strideq
1314    add      grain_lutq, 82
1315    dec              hw
1316    jz .end_y_v_overlap
1317    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1318    ; remaining (up to) 30 lines
1319    btc              hd, 16
1320    jnc .loop_y_v_overlap
1321    jmp .loop_y
1322
1323.end_y_v_overlap:
1324    add              wq, 32
1325    jge .end_hv
1326    lea            srcq, [src_bakq+wq]
1327
1328    ; since fg_dataq.overlap is guaranteed to be set, we never jump
1329    ; back to .loop_x_v_overlap, and instead always fall-through to
1330    ; h+v overlap
1331
1332    movd           xm15, [pb_27_17_17_27]
1333.loop_x_hv_overlap:
1334    vpbroadcastw     m8, [pb_27_17_17_27]
1335
1336    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1337    mov             r6d, seed
1338    or             seed, 0xeff4eff4
1339    test           seeb, seeh
1340    setp            r7b                     ; parity of top_seed
1341    shr            seed, 16
1342    shl             r7d, 16
1343    test           seeb, seeh
1344    setp            r7b                     ; parity of cur_seed
1345    or              r6d, 0x00010001
1346    xor             r7d, r6d
1347    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1348
1349    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1350                offx, offy, see, left_offxy, top_offxy, topleft_offxy
1351
1352    lea  topleft_offxyq, [top_offxyq+32]
1353    lea     left_offxyq, [offyq+32]
1354    rorx          offyd, seed, 8
1355    rorx          offxd, seed, 12
1356    and           offyd, 0xf000f
1357    and           offxd, 0xf000f
1358    imul          offyd, 164
1359    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1360    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
1361
1362    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
1363                h, offxy, see, left_offxy, top_offxy, topleft_offxy
1364
1365    movzx    top_offxyd, offxyw
1366    shr          offxyd, 16
1367
1368    mov              hd, hm
1369    mov      grain_lutq, grain_lutmp
1370.loop_y_hv_overlap:
1371    ; src
1372    mova             m0, [srcq]
1373    pxor             m2, m2
1374    punpckhbw        m1, m0, m2
1375    punpcklbw        m0, m2                 ; m0-1: src as word
1376    punpckhwd        m5, m0, m2
1377    punpcklwd        m4, m0, m2
1378    punpckhwd        m7, m1, m2
1379    punpcklwd        m6, m1, m2             ; m4-7: src as dword
1380
1381    ; scaling[src]
1382    pcmpeqw          m3, m3
1383    ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel
1384    vpgatherdd       m9, [scalingq+m4], m3
1385    pcmpeqw          m3, m3
1386    vpgatherdd       m4, [scalingq+m5], m3
1387    pcmpeqw          m3, m3
1388    vpgatherdd       m5, [scalingq+m6], m3
1389    pcmpeqw          m3, m3
1390    vpgatherdd       m6, [scalingq+m7], m3
1391    pand             m9, m10
1392    pand             m4, m10
1393    pand             m5, m10
1394    pand             m6, m10
1395    packusdw         m9, m4
1396    packusdw         m5, m6
1397
1398    ; grain = grain_lut[offy+y][offx+x]
1399    movu             m3, [grain_lutq+offxyq]
1400    movu             m6, [grain_lutq+top_offxyq]
1401    movd            xm4, [grain_lutq+left_offxyq]
1402    movd            xm7, [grain_lutq+topleft_offxyq]
1403    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
1404    punpcklbw       xm4, xm3
1405    punpcklbw       xm7, xm6
1406    pmaddubsw       xm4, xm15, xm4
1407    pmaddubsw       xm7, xm15, xm7
1408    pmulhrsw        xm4, xm14
1409    pmulhrsw        xm7, xm14
1410    packsswb        xm4, xm4
1411    packsswb        xm7, xm7
1412    vpblendw        xm4, xm3, 11111110b
1413    vpblendw        xm7, xm6, 11111110b
1414    vpblendd         m3, m4, 00001111b
1415    vpblendd         m6, m7, 00001111b
1416    ; followed by v interpolation (top | cur -> cur)
1417    punpckhbw        m7, m6, m3
1418    punpcklbw        m6, m3
1419    pmaddubsw        m7, m8, m7
1420    pmaddubsw        m6, m8, m6
1421    pmulhrsw         m7, m14
1422    pmulhrsw         m6, m14
1423    packsswb         m3, m6, m7
1424    pcmpgtb          m7, m2, m3
1425    punpcklbw        m2, m3, m7
1426    punpckhbw        m3, m7
1427
1428    ; noise = round2(scaling[src] * grain, scaling_shift)
1429    pmullw           m2, m9
1430    pmullw           m3, m5
1431    pmulhrsw         m2, m11
1432    pmulhrsw         m3, m11
1433
1434    ; dst = clip_pixel(src, noise)
1435    paddw            m0, m2
1436    paddw            m1, m3
1437    pmaxsw           m0, m13
1438    pmaxsw           m1, m13
1439    pminsw           m0, m12
1440    pminsw           m1, m12
1441    packuswb         m0, m1
1442    mova    [dstq+srcq], m0
1443
1444    vpbroadcastw     m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
1445    add            srcq, strideq
1446    add      grain_lutq, 82
1447    dec              hw
1448    jz .end_y_hv_overlap
1449    ; 2 lines get vertical overlap, then fall back to non-overlap code for
1450    ; remaining (up to) 30 lines
1451    btc              hd, 16
1452    jnc .loop_y_hv_overlap
1453    jmp .loop_y_h_overlap
1454
1455.end_y_hv_overlap:
1456    add              wq, 32
1457    lea            srcq, [src_bakq+wq]
1458    jl .loop_x_hv_overlap
1459
1460.end_hv:
1461    RET
1462
1463%macro FGUV_FN 3 ; name, ss_hor, ss_ver
1464cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
1465                                     grain_lut, h, sby, luma, lstride, uv_pl, is_id
1466    pcmpeqw         m10, m10
1467    psrld           m10, 24
1468    mov             r7d, [fg_dataq+FGData.scaling_shift]
1469    lea              r8, [pb_mask]
1470%define base r8-pb_mask
1471    vpbroadcastw    m11, [base+mul_bits+r7*2-14]
1472    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
1473    mov             r9d, dword is_idm
1474    vpbroadcastw    m13, [base+min+r7*2]
1475    shlx            r7d, r7d, r9d
1476    vpbroadcastw    m12, [base+max+r7*2]
1477
1478    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
1479    jne .csfl
1480
1481%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
1482    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
1483
1484%if %1
1485    mov             r7d, dword r11m
1486    vpbroadcastb     m0, [fg_dataq+FGData.uv_mult+r7*4]
1487    vpbroadcastb     m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
1488    punpcklbw       m14, m1, m0
1489    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
1490%else
1491    vpbroadcastd    m14, [pw_1024]
1492%if %2
1493    vpbroadcastd    m15, [pb_23_22]
1494%else
1495    vpbroadcastd   xm15, [pb_27_17_17_27]
1496%endif
1497%endif
1498
1499    mov        overlapd, [fg_dataq+FGData.overlap_flag]
1500    movifnidn      sbyd, sbym
1501    test           sbyd, sbyd
1502    setnz           r7b
1503    test            r7b, overlapb
1504    jnz %%vertical_overlap
1505
1506    imul           seed, sbyd, (173 << 24) | 37
1507    add            seed, (105 << 24) | 178
1508    rol            seed, 8
1509    movzx          seed, seew
1510    xor            seed, [fg_dataq+FGData.seed]
1511
1512    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1513                unused2, unused3, see, overlap, unused4, unused5, lstride
1514
1515    mov           lumaq, r9mp
1516    lea             r12, [srcq+wq]
1517    lea             r13, [dstq+wq]
1518    lea             r14, [lumaq+wq*(1+%2)]
1519    mov           r11mp, r12
1520    mov           r12mp, r13
1521    mov        lstrideq, r10mp
1522    neg              wq
1523
1524%%loop_x:
1525    mov             r6d, seed
1526    or             seed, 0xEFF4
1527    shr             r6d, 1
1528    test           seeb, seeh
1529    lea            seed, [r6+0x8000]
1530    cmovp          seed, r6d               ; updated seed
1531
1532    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1533                offx, offy, see, overlap, unused1, unused2, lstride
1534
1535    mov           offxd, seed
1536    rorx          offyd, seed, 8
1537    shr           offxd, 12
1538    and           offyd, 0xf
1539    imul          offyd, 164>>%3
1540    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1541
1542    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1543                h, offxy, see, overlap, unused1, unused2, lstride
1544
1545    mov              hd, hm
1546    mov      grain_lutq, grain_lutmp
1547%%loop_y:
1548    ; src
1549%if %2
1550    mova            xm4, [lumaq+lstrideq*0+ 0]
1551    mova            xm6, [lumaq+lstrideq*0+16]
1552    mova            xm0, [srcq]
1553    vpbroadcastd     m7, [pb_1]
1554    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
1555    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
1556    vinserti128      m0, [srcq+strideq], 1
1557    pxor             m2, m2
1558    pmaddubsw        m4, m7
1559    pmaddubsw        m6, m7
1560    pavgw            m4, m2
1561    pavgw            m6, m2
1562%else
1563    pxor             m2, m2
1564    mova             m4, [lumaq]
1565    mova             m0, [srcq]
1566%endif
1567
1568%if %1
1569%if %2
1570    packuswb         m4, m6                 ; luma
1571%endif
1572    punpckhbw        m6, m4, m0
1573    punpcklbw        m4, m0                 ; { luma, chroma }
1574    pmaddubsw        m6, m14
1575    pmaddubsw        m4, m14
1576    psraw            m6, 6
1577    psraw            m4, 6
1578    paddw            m6, m15
1579    paddw            m4, m15
1580    packuswb         m4, m6                 ; pack+unpack = clip
1581    punpckhbw        m6, m4, m2
1582    punpcklbw        m4, m2
1583%elif %2 == 0
1584    punpckhbw        m6, m4, m2
1585    punpcklbw        m4, m2
1586%endif
1587
1588    punpckhwd        m5, m4, m2
1589    punpcklwd        m4, m2
1590    punpckhwd        m7, m6, m2
1591    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1592
1593    ; scaling[luma_src]
1594    pcmpeqw          m3, m3
1595    pcmpeqw          m9, m9
1596    vpgatherdd       m8, [scalingq+m4], m3
1597    vpgatherdd       m4, [scalingq+m5], m9
1598    pcmpeqw          m3, m3
1599    pcmpeqw          m9, m9
1600    vpgatherdd       m5, [scalingq+m6], m3
1601    vpgatherdd       m6, [scalingq+m7], m9
1602    pand             m8, m10
1603    pand             m4, m10
1604    pand             m5, m10
1605    pand             m6, m10
1606    packusdw         m8, m4
1607    packusdw         m5, m6
1608
1609    ; unpack chroma_source
1610    punpckhbw        m1, m0, m2
1611    punpcklbw        m0, m2                 ; m0-1: src as word
1612
1613    ; grain = grain_lut[offy+y][offx+x]
1614%if %2
1615    movu            xm3, [grain_lutq+offxyq+ 0]
1616    vinserti128      m3, [grain_lutq+offxyq+82], 1
1617%else
1618    movu             m3, [grain_lutq+offxyq]
1619%endif
1620    pcmpgtb          m7, m2, m3
1621    punpcklbw        m2, m3, m7
1622    punpckhbw        m3, m7
1623
1624    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1625    pmullw           m2, m8
1626    pmullw           m3, m5
1627    pmulhrsw         m2, m11
1628    pmulhrsw         m3, m11
1629
1630    ; dst = clip_pixel(src, noise)
1631    paddw            m0, m2
1632    paddw            m1, m3
1633    pmaxsw           m0, m13
1634    pmaxsw           m1, m13
1635    pminsw           m0, m12
1636    pminsw           m1, m12
1637    packuswb         m0, m1
1638%if %2
1639    mova         [dstq], xm0
1640    vextracti128 [dstq+strideq], m0, 1
1641%else
1642    mova         [dstq], m0
1643%endif
1644
1645%if %2
1646    lea            srcq, [srcq+strideq*2]
1647    lea            dstq, [dstq+strideq*2]
1648    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1649%else
1650    add            srcq, strideq
1651    add            dstq, strideq
1652    add           lumaq, lstrideq
1653%endif
1654    add      grain_lutq, 82<<%2
1655    sub              hb, 1+%2
1656    jg %%loop_y
1657
1658    add              wq, 32>>%2
1659    jge %%end
1660    mov            srcq, r11mp
1661    mov            dstq, r12mp
1662    lea           lumaq, [r14+wq*(1+%2)]
1663    add            srcq, wq
1664    add            dstq, wq
1665    test       overlapd, overlapd
1666    jz %%loop_x
1667
1668    ; r8m = sbym
1669    cmp       dword r8m, 0
1670    jne %%loop_x_hv_overlap
1671
1672    ; horizontal overlap (without vertical overlap)
1673%%loop_x_h_overlap:
1674    mov             r6d, seed
1675    or             seed, 0xEFF4
1676    shr             r6d, 1
1677    test           seeb, seeh
1678    lea            seed, [r6+0x8000]
1679    cmovp          seed, r6d               ; updated seed
1680
1681    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1682                offx, offy, see, left_offxy, unused1, unused2, lstride
1683
1684    lea     left_offxyd, [offyd+(32>>%2)]         ; previous column's offy*stride+offx
1685    mov           offxd, seed
1686    rorx          offyd, seed, 8
1687    shr           offxd, 12
1688    and           offyd, 0xf
1689    imul          offyd, 164>>%3
1690    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
1691
1692    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1693                h, offxy, see, left_offxy, unused1, unused2, lstride
1694
1695    mov              hd, hm
1696    mov      grain_lutq, grain_lutmp
1697%%loop_y_h_overlap:
1698    ; src
1699%if %2
1700    mova            xm4, [lumaq+lstrideq*0+ 0]
1701    mova            xm6, [lumaq+lstrideq*0+16]
1702    mova            xm0, [srcq]
1703    vpbroadcastd     m7, [pb_1]
1704    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
1705    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
1706    vinserti128      m0, [srcq+strideq], 1
1707    pxor             m2, m2
1708    pmaddubsw        m4, m7
1709    pmaddubsw        m6, m7
1710    pavgw            m4, m2
1711    pavgw            m6, m2
1712%else
1713    mova             m4, [lumaq]
1714    mova             m0, [srcq]
1715    pxor             m2, m2
1716%endif
1717
1718%if %1
1719%if %2
1720    packuswb         m4, m6                 ; luma
1721%endif
1722    punpckhbw        m6, m4, m0
1723    punpcklbw        m4, m0                 ; { luma, chroma }
1724    pmaddubsw        m6, m14
1725    pmaddubsw        m4, m14
1726    psraw            m6, 6
1727    psraw            m4, 6
1728    paddw            m6, m15
1729    paddw            m4, m15
1730    packuswb         m4, m6                 ; pack+unpack = clip
1731    punpckhbw        m6, m4, m2
1732    punpcklbw        m4, m2
1733%elif %2 == 0
1734    punpckhbw        m6, m4, m2
1735    punpcklbw        m4, m2
1736%endif
1737
1738    punpckhwd        m5, m4, m2
1739    punpcklwd        m4, m2
1740    punpckhwd        m7, m6, m2
1741    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1742
1743    ; scaling[luma_src]
1744    pcmpeqw          m3, m3
1745    pcmpeqw          m9, m9
1746    vpgatherdd       m8, [scalingq+m4], m3
1747    vpgatherdd       m4, [scalingq+m5], m9
1748    pcmpeqw          m3, m3
1749    pcmpeqw          m9, m9
1750    vpgatherdd       m5, [scalingq+m6], m3
1751    vpgatherdd       m6, [scalingq+m7], m9
1752    pand             m8, m10
1753    pand             m4, m10
1754    pand             m5, m10
1755    pand             m6, m10
1756    packusdw         m8, m4
1757    packusdw         m5, m6
1758
1759    ; unpack chroma_source
1760    punpckhbw        m1, m0, m2
1761    punpcklbw        m0, m2                 ; m0-1: src as word
1762
1763    ; grain = grain_lut[offy+y][offx+x]
1764%if %2
1765%if %1
1766    vpbroadcastd     m6, [pb_23_22] ; FIXME
1767%endif
1768    movu            xm3, [grain_lutq+offxyq+ 0]
1769    movd            xm4, [grain_lutq+left_offxyq+ 0]
1770    vinserti128      m3, [grain_lutq+offxyq+82], 1
1771    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
1772    punpcklbw        m4, m3
1773%if %1
1774    pmaddubsw        m4, m6, m4
1775    pmulhrsw         m4, [pw_1024]
1776%else
1777    pmaddubsw        m4, m15, m4
1778    pmulhrsw         m4, m14
1779%endif
1780    packsswb         m4, m4
1781    pcmpeqw          m6, m6 ; FIXME
1782    psrldq           m6, 15 ; FIXME
1783    vpblendvb        m3, m3, m4, m6
1784%else
1785%if %1
1786    vpbroadcastd    xm6, [pb_27_17_17_27]
1787%endif
1788    movu             m3, [grain_lutq+offxyq]
1789    movd            xm4, [grain_lutq+left_offxyq]
1790    punpcklbw       xm4, xm3
1791%if %1
1792    pmaddubsw       xm4, xm6, xm4
1793    pmulhrsw        xm4, [pw_1024]
1794%else
1795    pmaddubsw       xm4, xm15, xm4
1796    pmulhrsw        xm4, xm14
1797%endif
1798    packsswb        xm4, xm4
1799    pcmpeqw         xm6, xm6
1800    psrldq          xm6, 14
1801    vpblendvb        m3, m3, m4, m6
1802%endif
1803    pcmpgtb          m7, m2, m3
1804    punpcklbw        m2, m3, m7
1805    punpckhbw        m3, m7
1806
1807    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
1808    pmullw           m2, m8
1809    pmullw           m3, m5
1810    pmulhrsw         m2, m11
1811    pmulhrsw         m3, m11
1812
1813    ; dst = clip_pixel(src, noise)
1814    paddw            m0, m2
1815    paddw            m1, m3
1816    pmaxsw           m0, m13
1817    pmaxsw           m1, m13
1818    pminsw           m0, m12
1819    pminsw           m1, m12
1820    packuswb         m0, m1
1821%if %2
1822    mova         [dstq], xm0
1823    vextracti128 [dstq+strideq], m0, 1
1824%else
1825    mova         [dstq], m0
1826%endif
1827
1828%if %2
1829    lea            srcq, [srcq+strideq*2]
1830    lea            dstq, [dstq+strideq*2]
1831    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
1832%else
1833    add            srcq, strideq
1834    add            dstq, strideq
1835    add           lumaq, lstrideq
1836%endif
1837    add      grain_lutq, 82*(1+%2)
1838    sub              hb, 1+%2
1839    jg %%loop_y_h_overlap
1840
1841    add              wq, 32>>%2
1842    jge %%end
1843    mov            srcq, r11mp
1844    mov            dstq, r12mp
1845    lea           lumaq, [r14+wq*(1+%2)]
1846    add            srcq, wq
1847    add            dstq, wq
1848
1849    ; r8m = sbym
1850    cmp       dword r8m, 0
1851    jne %%loop_x_hv_overlap
1852    jmp %%loop_x_h_overlap
1853
1854%%end:
1855    RET
1856
1857%%vertical_overlap:
1858    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
1859                sby, see, overlap, unused1, unused2, lstride
1860
1861    movzx          sbyd, sbyb
1862    imul           seed, [fg_dataq+FGData.seed], 0x00010001
1863    imul            r7d, sbyd, 173 * 0x00010001
1864    imul           sbyd, 37 * 0x01000100
1865    add             r7d, (105 << 16) | 188
1866    add            sbyd, (178 << 24) | (141 << 8)
1867    and             r7d, 0x00ff00ff
1868    and            sbyd, 0xff00ff00
1869    xor            seed, r7d
1870    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
1871
1872    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1873                unused1, unused2, see, overlap, unused3, unused4, lstride
1874
1875    mov           lumaq, r9mp
1876    lea             r12, [srcq+wq]
1877    lea             r13, [dstq+wq]
1878    lea             r14, [lumaq+wq*(1+%2)]
1879    mov           r11mp, r12
1880    mov           r12mp, r13
1881    mov        lstrideq, r10mp
1882    neg              wq
1883
1884%%loop_x_v_overlap:
1885    ; we assume from the block above that bits 8-15 of r7d are zero'ed
1886    mov             r6d, seed
1887    or             seed, 0xeff4eff4
1888    test           seeb, seeh
1889    setp            r7b                     ; parity of top_seed
1890    shr            seed, 16
1891    shl             r7d, 16
1892    test           seeb, seeh
1893    setp            r7b                     ; parity of cur_seed
1894    or              r6d, 0x00010001
1895    xor             r7d, r6d
1896    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
1897
1898    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1899                offx, offy, see, overlap, top_offxy, unused, lstride
1900
1901    rorx          offyd, seed, 8
1902    rorx          offxd, seed, 12
1903    and           offyd, 0xf000f
1904    and           offxd, 0xf000f
1905    imul          offyd, 164>>%3
1906    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
1907    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
1908
1909    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
1910                h, offxy, see, overlap, top_offxy, unused, lstride
1911
1912    movzx    top_offxyd, offxyw
1913    shr          offxyd, 16
1914
1915    mov              hd, hm
1916    mov      grain_lutq, grain_lutmp
1917%if %2 == 0
1918    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
1919%endif
1920%%loop_y_v_overlap:
1921    ; src
1922%if %2
1923    mova            xm4, [lumaq+lstrideq*0+ 0]
1924    mova            xm6, [lumaq+lstrideq*0+16]
1925    mova            xm0, [srcq]
1926    vpbroadcastd     m7, [pb_1]
1927    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
1928    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
1929    vinserti128      m0, [srcq+strideq], 1
1930    pxor             m2, m2
1931    pmaddubsw        m4, m7
1932    pmaddubsw        m6, m7
1933    pavgw            m4, m2
1934    pavgw            m6, m2
1935%else
1936    mova             m4, [lumaq]
1937    mova             m0, [srcq]
1938    pxor             m2, m2
1939%endif
1940
1941%if %1
1942%if %2
1943    packuswb         m4, m6                 ; luma
1944%endif
1945    punpckhbw        m6, m4, m0
1946    punpcklbw        m4, m0                 ; { luma, chroma }
1947    pmaddubsw        m6, m14
1948    pmaddubsw        m4, m14
1949    psraw            m6, 6
1950    psraw            m4, 6
1951    paddw            m6, m15
1952    paddw            m4, m15
1953    packuswb         m4, m6                 ; pack+unpack = clip
1954    punpckhbw        m6, m4, m2
1955    punpcklbw        m4, m2
1956%elif %2 == 0
1957    punpckhbw        m6, m4, m2
1958    punpcklbw        m4, m2
1959%endif
1960
1961    punpckhwd        m5, m4, m2
1962    punpcklwd        m4, m2
1963    punpckhwd        m7, m6, m2
1964    punpcklwd        m6, m2                 ; m4-7: luma_src as dword
1965
1966    ; scaling[luma_src]
1967    pcmpeqw          m3, m3
1968    pcmpeqw          m9, m9
1969    vpgatherdd       m8, [scalingq+m4], m3
1970    vpgatherdd       m4, [scalingq+m5], m9
1971    pcmpeqw          m3, m3
1972    pcmpeqw          m9, m9
1973    vpgatherdd       m5, [scalingq+m6], m3
1974    vpgatherdd       m6, [scalingq+m7], m9
1975    pand             m8, m10
1976    pand             m4, m10
1977    pand             m5, m10
1978    pand             m6, m10
1979    packusdw         m8, m4
1980    packusdw         m5, m6
1981
1982%if %2
1983    ; unpack chroma_source
1984    punpckhbw        m1, m0, m2
1985    punpcklbw        m0, m2                 ; m0-1: src as word
1986%endif
1987
1988    ; grain = grain_lut[offy+y][offx+x]
1989%if %3 == 0
1990%if %2
1991    mova             m6, [pb_8x_27_17_8x_17_27]
1992    movu            xm3, [grain_lutq+offxyq]
1993    movu            xm4, [grain_lutq+top_offxyq]
1994    vinserti128      m3, [grain_lutq+offxyq+82], 1
1995    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
1996%else
1997    movu             m3, [grain_lutq+offxyq]
1998    movu             m4, [grain_lutq+top_offxyq]
1999%endif
2000    punpckhbw        m9, m4, m3
2001    punpcklbw        m4, m3
2002%if %2
2003    pmaddubsw        m9, m6, m9
2004    pmaddubsw        m4, m6, m4
2005%else
2006    pmaddubsw        m9, m1, m9
2007    pmaddubsw        m4, m1, m4
2008%endif
2009%if %1
2010    pmulhrsw         m9, [pw_1024]
2011    pmulhrsw         m4, [pw_1024]
2012%else
2013    pmulhrsw         m9, m14
2014    pmulhrsw         m4, m14
2015%endif
2016    packsswb         m3, m4, m9
2017%else
2018%if %1
2019    vpbroadcastd     m6, [pb_23_22]
2020%endif
2021    movq            xm3, [grain_lutq+offxyq]
2022    movq            xm4, [grain_lutq+top_offxyq]
2023    vinserti128      m3, [grain_lutq+offxyq+8], 1
2024    vinserti128      m4, [grain_lutq+top_offxyq+8], 1
2025    punpcklbw        m4, m3
2026%if %1
2027    pmaddubsw        m4, m6, m4
2028    pmulhrsw         m4, [pw_1024]
2029%else
2030    pmaddubsw        m4, m15, m4
2031    pmulhrsw         m4, m14
2032%endif
2033    packsswb         m4, m4
2034    vpermq           m4, m4, q3120
2035    ; only interpolate first line, insert second line unmodified
2036    vinserti128      m3, m4, [grain_lutq+offxyq+82], 1
2037%endif
2038    pcmpgtb          m7, m2, m3
2039    punpcklbw        m2, m3, m7
2040    punpckhbw        m3, m7
2041
2042    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
2043    pmullw           m2, m8
2044    pmullw           m3, m5
2045    pmulhrsw         m2, m11
2046    pmulhrsw         m3, m11
2047
2048    ; dst = clip_pixel(src, noise)
2049%if %2
2050    paddw            m0, m2
2051    paddw            m1, m3
2052    pmaxsw           m0, m13
2053    pmaxsw           m1, m13
2054    pminsw           m0, m12
2055    pminsw           m1, m12
2056    packuswb         m0, m1
2057    mova         [dstq], xm0
2058    vextracti128 [dstq+strideq], m0, 1
2059%else
2060    pxor             m6, m6
2061    punpckhbw        m9, m0, m6
2062    punpcklbw        m0, m6                 ; m0-1: src as word
2063
2064    paddw            m0, m2
2065    paddw            m9, m3
2066    pmaxsw           m0, m13
2067    pmaxsw           m9, m13
2068    pminsw           m0, m12
2069    pminsw           m9, m12
2070    packuswb         m0, m9
2071    mova         [dstq], m0
2072%endif
2073
2074    sub              hb, 1+%2
2075    jl %%end_y_v_overlap
2076%if %2
2077    lea            srcq, [srcq+strideq*2]
2078    lea            dstq, [dstq+strideq*2]
2079    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
2080%else
2081    add            srcq, strideq
2082    add            dstq, strideq
2083    add           lumaq, lstrideq
2084%endif
2085    add      grain_lutq, 82<<%2
2086%if %2 == 0
2087    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
2088    btc              hd, 16
2089    jnc %%loop_y_v_overlap
2090%endif
2091    jmp %%loop_y
2092
2093%%end_y_v_overlap:
2094    add              wq, 32>>%2
2095    jge %%end_hv
2096    mov            srcq, r11mp
2097    mov            dstq, r12mp
2098    lea           lumaq, [r14+wq*(1+%2)]
2099    add            srcq, wq
2100    add            dstq, wq
2101
2102    ; since fg_dataq.overlap is guaranteed to be set, we never jump
2103    ; back to .loop_x_v_overlap, and instead always fall-through to
2104    ; h+v overlap
2105
2106%%loop_x_hv_overlap:
2107    ; we assume from the block above that bits 8-15 of r7d are zero'ed
2108    mov             r6d, seed
2109    or             seed, 0xeff4eff4
2110    test           seeb, seeh
2111    setp            r7b                     ; parity of top_seed
2112    shr            seed, 16
2113    shl             r7d, 16
2114    test           seeb, seeh
2115    setp            r7b                     ; parity of cur_seed
2116    or              r6d, 0x00010001
2117    xor             r7d, r6d
2118    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
2119
2120    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2121                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
2122
2123    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
2124    lea     left_offxyq, [offyq+(32>>%2)]
2125    rorx          offyd, seed, 8
2126    rorx          offxd, seed, 12
2127    and           offyd, 0xf000f
2128    and           offxd, 0xf000f
2129    imul          offyd, 164>>%3
2130    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
2131    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
2132
2133    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
2134                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
2135
2136    movzx    top_offxyd, offxyw
2137    shr          offxyd, 16
2138
2139    mov              hd, hm
2140    mov      grain_lutq, grain_lutmp
2141%if %2 == 0
2142    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
2143%endif
2144%%loop_y_hv_overlap:
2145    ; src
2146%if %2
2147    mova            xm4, [lumaq+lstrideq*0+ 0]
2148    mova            xm6, [lumaq+lstrideq*0+16]
2149    mova            xm0, [srcq]
2150    vpbroadcastd     m7, [pb_1]
2151    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
2152    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
2153    vinserti128      m0, [srcq+strideq], 1
2154    pxor             m2, m2
2155    pmaddubsw        m4, m7
2156    pmaddubsw        m6, m7
2157    pavgw            m4, m2
2158    pavgw            m6, m2
2159%else
2160    mova             m4, [lumaq]
2161    mova             m0, [srcq]
2162    pxor             m2, m2
2163%endif
2164
2165%if %1
2166%if %2
2167    packuswb         m4, m6                 ; luma
2168%endif
2169    punpckhbw        m6, m4, m0
2170    punpcklbw        m4, m0                 ; { luma, chroma }
2171    pmaddubsw        m6, m14
2172    pmaddubsw        m4, m14
2173    psraw            m6, 6
2174    psraw            m4, 6
2175    paddw            m6, m15
2176    paddw            m4, m15
2177    packuswb         m4, m6                 ; pack+unpack = clip
2178    punpckhbw        m6, m4, m2
2179    punpcklbw        m4, m2
2180%elif %2 == 0
2181    punpckhbw        m6, m4, m2
2182    punpcklbw        m4, m2
2183%endif
2184
2185    punpckhwd        m5, m4, m2
2186    punpcklwd        m4, m2
2187    punpckhwd        m7, m6, m2
2188    punpcklwd        m6, m2                 ; m4-7: src as dword
2189
2190    ; scaling[src]
2191    pcmpeqw          m9, m9
2192    pcmpeqw          m3, m3
2193    vpgatherdd       m8, [scalingq+m4], m9
2194    vpgatherdd       m4, [scalingq+m5], m3
2195    pcmpeqw          m9, m9
2196    pcmpeqw          m3, m3
2197    vpgatherdd       m5, [scalingq+m6], m9
2198    vpgatherdd       m6, [scalingq+m7], m3
2199    pand             m8, m10
2200    pand             m4, m10
2201    pand             m5, m10
2202    pand             m6, m10
2203    packusdw         m8, m4
2204    packusdw         m5, m6
2205
2206%if %2
2207    ; unpack chroma source
2208    punpckhbw        m1, m0, m2
2209    punpcklbw        m0, m2                 ; m0-1: src as word
2210%endif
2211
2212    ; grain = grain_lut[offy+y][offx+x]
2213%if %1
2214%if %2
2215    vpbroadcastd     m9, [pb_23_22]
2216%else
2217    vpbroadcastd    xm9, [pb_27_17_17_27]
2218%endif
2219%endif
2220
2221%if %2
2222    movu            xm3, [grain_lutq+offxyq]
2223%if %3
2224    movq            xm6, [grain_lutq+top_offxyq]
2225%else
2226    movu            xm6, [grain_lutq+top_offxyq]
2227%endif
2228    vinserti128      m3, [grain_lutq+offxyq+82], 1
2229%if %3
2230    vinserti128      m6, [grain_lutq+top_offxyq+8], 1
2231%else
2232    vinserti128      m6, [grain_lutq+top_offxyq+82], 1
2233%endif
2234%else
2235    movu             m3, [grain_lutq+offxyq]
2236    movu             m6, [grain_lutq+top_offxyq]
2237%endif
2238    movd            xm4, [grain_lutq+left_offxyq]
2239    movd            xm7, [grain_lutq+topleft_offxyq]
2240%if %2
2241    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
2242%if %3 == 0
2243    vinserti128      m7, [grain_lutq+topleft_offxyq+82], 1
2244%endif
2245%endif
2246
2247    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
2248%if %2
2249    punpcklbw        m4, m3
2250%if %3
2251    punpcklbw       xm7, xm6
2252%else
2253    punpcklbw        m7, m6
2254%endif
2255    punpcklwd        m4, m7
2256%if %1
2257    pmaddubsw        m4, m9, m4
2258    pmulhrsw         m4, [pw_1024]
2259%else
2260    pmaddubsw        m4, m15, m4
2261    pmulhrsw         m4, m14
2262%endif
2263    packsswb         m4, m4
2264    pcmpeqw          m9, m9                 ; this is kind of ugly
2265    psrldq           m9, 15
2266    vpblendvb        m3, m3, m4, m9
2267    psrldq           m4, 1
2268%if %3
2269    shufpd           m9, m9, m9, 1110b      ; clear upper lane
2270%endif
2271    vpblendvb        m6, m6, m4, m9
2272%else
2273    punpcklbw       xm4, xm3
2274    punpcklbw       xm7, xm6
2275    punpckldq       xm4, xm7
2276%if %1
2277    pmaddubsw       xm4, xm9, xm4
2278    pmulhrsw        xm4, [pw_1024]
2279%else
2280    pmaddubsw       xm4, xm15, xm4
2281    pmulhrsw        xm4, xm14
2282%endif
2283    packsswb        xm4, xm4
2284    pcmpeqw         xm9, xm9                 ; this is kind of ugly
2285    psrldq          xm9, 14
2286    vpblendvb        m3, m3, m4, m9
2287    psrldq          xm4, 2
2288    vpblendvb        m6, m6, m4, m9
2289%endif
2290
2291    ; followed by v interpolation (top | cur -> cur)
2292%if %3
2293    vpermq           m9, m3, q3120
2294    punpcklbw        m6, m9
2295%if %1
2296    vpbroadcastd     m9, [pb_23_22]
2297    pmaddubsw        m6, m9, m6
2298    pmulhrsw         m6, [pw_1024]
2299%else
2300    pmaddubsw        m6, m15, m6
2301    pmulhrsw         m6, m14
2302%endif
2303    packsswb         m6, m6
2304    vpermq           m6, m6, q3120
2305    vpblendd         m3, m3, m6, 00001111b
2306%else
2307    punpckhbw        m9, m6, m3
2308    punpcklbw        m6, m3
2309%if %2
2310    mova             m3, [pb_8x_27_17_8x_17_27]
2311    pmaddubsw        m9, m3, m9
2312    pmaddubsw        m6, m3, m6
2313%else
2314    pmaddubsw        m9, m1, m9
2315    pmaddubsw        m6, m1, m6
2316%endif
2317%if %1
2318    pmulhrsw         m9, [pw_1024]
2319    pmulhrsw         m6, [pw_1024]
2320%else
2321    pmulhrsw         m9, m14
2322    pmulhrsw         m6, m14
2323%endif
2324    packsswb         m3, m6, m9
2325%endif
2326    pcmpgtb          m7, m2, m3
2327    punpcklbw        m2, m3, m7
2328    punpckhbw        m3, m7
2329
2330    ; noise = round2(scaling[src] * grain, scaling_shift)
2331    pmullw           m2, m8
2332    pmullw           m3, m5
2333    pmulhrsw         m2, m11
2334    pmulhrsw         m3, m11
2335
2336    ; dst = clip_pixel(src, noise)
2337%if %2
2338    paddw            m0, m2
2339    paddw            m1, m3
2340    pmaxsw           m0, m13
2341    pmaxsw           m1, m13
2342    pminsw           m0, m12
2343    pminsw           m1, m12
2344    packuswb         m0, m1
2345    mova         [dstq], xm0
2346    vextracti128 [dstq+strideq], m0, 1
2347%else
2348    pxor             m6, m6
2349    punpckhbw        m9, m0, m6
2350    punpcklbw        m0, m6                 ; m0-1: src as word
2351    paddw            m0, m2
2352    paddw            m9, m3
2353    pmaxsw           m0, m13
2354    pmaxsw           m9, m13
2355    pminsw           m0, m12
2356    pminsw           m9, m12
2357    packuswb         m0, m9
2358    mova         [dstq], m0
2359%endif
2360
2361%if %2
2362    lea            srcq, [srcq+strideq*2]
2363    lea            dstq, [dstq+strideq*2]
2364    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
2365%else
2366    add            srcq, strideq
2367    add            dstq, strideq
2368    add           lumaq, lstrideq
2369%endif
2370    add      grain_lutq, 82<<%2
2371    sub              hb, 1+%2
2372%if %2
2373    jg %%loop_y_h_overlap
2374%else
2375    je %%end_y_hv_overlap
2376    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
2377    btc              hd, 16
2378    jnc %%loop_y_hv_overlap
2379    jmp %%loop_y_h_overlap
2380%endif
2381
2382%%end_y_hv_overlap:
2383    add              wq, 32>>%2
2384    jge %%end_hv
2385    mov            srcq, r11mp
2386    mov            dstq, r12mp
2387    lea           lumaq, [r14+wq*(1+%2)]
2388    add            srcq, wq
2389    add            dstq, wq
2390    jmp %%loop_x_hv_overlap
2391
2392%%end_hv:
2393    RET
2394%endmacro
2395
2396    %%FGUV_32x32xN_LOOP 1, %2, %3
2397.csfl:
2398    %%FGUV_32x32xN_LOOP 0, %2, %3
2399%endmacro
2400
2401FGUV_FN 420, 1, 1
2402FGUV_FN 422, 1, 0
2403FGUV_FN 444, 0, 0
2404
2405%endif ; ARCH_X86_64
2406