; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
wiener_lshuf5: db  4,  5,  4,  5,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db  8,  9,  8,  9,  8,  9,  8,  9,  8,  9, 10, 11, 12, 13, 14, 15
pb_0to31:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

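; wiener_hshift/vshift/round are indexed by (pixel_max >> 11): the first
; entries are used for 10 bpc content (pixel_max 1023) and the second
; ones for 12 bpc (pixel_max 4095)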
wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

pb_m10_m9:     times 2 db -10, -9
pb_m6_m5:      times 2 db  -6, -5
pb_m2_m1:      times 2 db  -2, -1
pb_2_3:        times 2 db   2,  3
pb_6_7:        times 2 db   6,  7
pw_1023:       times 2 dw 1023
pd_8:          dd 8
pd_25:         dd 25
pd_4096:       dd 4096
pd_34816:      dd 34816
pd_m262128:    dd -262128
pd_0xf00800a4: dd 0xf00800a4
pd_0xf00801c7: dd 0xf00801c7

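; the first four bytes of sgr_lshuf5 (0, 1, 0, 1) form the dword 0x01000100,
; i.e. two packed words of 256, so the shuffle table can double as pw_256: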
%define pw_256 sgr_lshuf5

cextern sgr_x_by_x_avx2

SECTION .text

%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro
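; REPX applies the instruction template in %1 (with x as the placeholder)
; to each of the remaining arguments, e.g.
;   REPX {paddd x, m8}, m0, m1, m2
; expands to: paddd m0, m8 / paddd m1, m8 / paddd m2, m8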

DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers

INIT_YMM avx2
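; 7-tap Wiener filter. The horizontal pass (.h) writes intermediate rows
; (384*2 bytes each) into a stack ring buffer addressed through t1-t6;
; .hv produces one such row and runs the 7-tap vertical filter over the
; buffered rows, and .v drains the remaining rows at the bottom edge.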
cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
                                                     lpf_stride, w, edge, flt, h
%define base t4-wiener_hshift
    mov           fltq, fltmp
    mov          edged, r8m
    movifnidn       wd, wm
    mov             hd, r6m
    mov            t3d, r9m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd
    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vbroadcasti128  m8, [wiener_shufC]
    add           lpfq, wq
    vbroadcasti128  m9, [wiener_shufD]
    lea             t1, [rsp+wq+16]
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           dstq, wq
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd   m10, [base+wiener_round+t3*4]
    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
    pmullw         m12, m0 ; upshift filter coefs to make the
    pmullw         m13, m0 ; horizontal downshift constant
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, lpf_strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    mov      [rsp+8*1], lpf_strideq
    add             r7, lpf_strideq
    mov      [rsp+8*0], r7 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, [rsp+8*0]
    call .hv_bottom
    add           lpfq, [rsp+8*1]
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov      [rsp+8*1], lpf_strideq
    lea             r7, [r7+lpf_strideq*2]
    mov      [rsp+8*0], r7
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
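; right-edge extension: r10 holds the negative byte offset from the end of
; the row, so subtracting it from the per-position base indices and clamping
; against pb_0to31 with pminub produces pshufb indices that replicate the
; last valid pixel into every position past the edge of m3/m4/m5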
.extend_right:
    movd           xm1, r10d
    vpbroadcastd    m0, [pb_6_7]
    movu            m2, [pb_0to31]
    vpbroadcastb    m1, xm1
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    vpbroadcastd    m0, [pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    vpbroadcastd    m0, [pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    ret
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    shufpd          m3, m4, 0x05
    pshufb          m3, [wiener_lshuf7]
    jmp .h_main2
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, dst_strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-8]
    pshufb          m3, [wiener_lshuf7]
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128]
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10]
    mova      [t0+r10], m0
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 5
    psrad           m2, 5
    packusdw        m0, m2
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, dst_strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m1, [t4+r10]
    paddw           m1, [t2+r10]
    mova            m2, [t3+r10]
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, dst_strideq
    ret
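; 5-tap variant of the above, using a smaller ring buffer (t1-t4)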
cglobal wiener_filter5_16bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
                                                   lpf_stride, w, edge, flt, h
%define base t4-wiener_hshift
    mov           fltq, fltmp
    mov          edged, r8m
    movifnidn       wd, wm
    mov             hd, r6m
    mov            t3d, r9m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
    vpbroadcastw   m11, [fltq+ 2] ; x1
    vbroadcasti128  m6, [wiener_shufB]
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufD]
    add             wd, wd
    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    lea             t1, [rsp+wq+16]
    vpbroadcastw   m13, [fltq+18] ; y1
    add           dstq, wq
    vpbroadcastd   m14, [fltq+20] ; y2 y3
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd    m9, [base+wiener_round+t3*4]
    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
    movu          xm15, [wiener_lshuf5]
    pmullw         m11, m0
    vinserti128    m15, [pb_0to31], 1
    pmullw         m12, m0
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, lpf_strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    mov      [rsp+8*1], lpf_strideq
    add             r7, lpf_strideq
    mov      [rsp+8*0], r7 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, [rsp+8*0]
    call .hv_bottom
    add           lpfq, [rsp+8*1]
    call .hv_bottom
.end:
    RET
.no_top:
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov      [rsp+8*1], lpf_strideq
    lea             r7, [r7+lpf_strideq*2]
    mov      [rsp+8*0], r7
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, dst_strideq
.v1:
    call .v
    jmp .end
.extend_right:
    movd           xm2, r10d
    vpbroadcastd    m0, [pb_2_3]
    vpbroadcastd    m1, [pb_m6_m5]
    vpbroadcastb    m2, xm2
    psubb           m0, m2
    psubb           m1, m2
    movu            m2, [pb_0to31]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m4, [lpfq+r10] ; avoid accessing memory located
    mova            m3, [lpfq+r10] ; before the start of the buffer
    palignr         m3, m4, 12
    pshufb          m3, m15
    jmp .h_main
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-4]
.h_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, dst_strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-4]
    pshufb          m3, m15
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-4]
.hv_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    mova            m2, [t3+r10]
    paddw           m2, [t1+r10]
    paddd           m1, m3
    mova            m4, [t2+r10]
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
    mova            m4, [t4+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova      [t0+r10], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    add           dstq, dst_strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m0, [t1+r10]
    paddw           m2, m0, [t3+r10]
    mova            m1, [t2+r10]
    mova            m4, [t4+r10]
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    ret

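; self-guided filter, 5x5 box variant: .h/.hv accumulate box sums (sum and
; sum of squares) two rows at a time, the surface-fit coefficients a and b
; derived from them are stored to t4/t3, and .prep_n/.n0/.n1 apply the
; 5-6-5 neighbor weighting before blending the result with the source
; using the w0 weight.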
cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \
                                                    lpf_stride, w, edge, params, h
    movifnidn       wd, wm
    mov        paramsq, paramsmp
    lea            r13, [sgr_x_by_x_avx2+256*4]
    mov          edged, r8m
    mov             hd, r6m
    add             wd, wd
    vpbroadcastw    m7, [paramsq+8] ; w0
    add           lpfq, wq
    vpbroadcastd    m8, [pd_8]
    lea             t1, [rsp+wq+20]
    vpbroadcastd    m9, [pd_25]
    add           dstq, wq
    vpbroadcastd   m10, [paramsq+0] ; s0
    lea             t3, [rsp+wq*2+400*12+16]
    vpbroadcastd   m11, [pd_0xf00800a4]
    lea             t4, [rsp+wq+400*20+16]
    vpbroadcastd   m12, [pw_256]
    neg             wq
    vpbroadcastd   m13, [pd_34816]  ; (1 << 11) + (1 << 15)
    pxor            m6, m6
    vpbroadcastd   m14, [pw_1023]
    psllw           m7, 4
    mova          xm15, [sgr_lshuf5]
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, lpf_strideq
    mov             t2, t1
    call .top_fixup
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov      [rsp+8*1], lpf_strideq
    add            r10, lpf_strideq
    mov      [rsp+8*0], r10 ; below
    mov             t0, t2
    dec             hd
    jz .height1
    or           edged, 16
    call .h
.main:
    add           lpfq, dst_strideq
    call .hv
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, dst_strideq
    test            hd, hd
    jz .odd_height
    call .h
    add           lpfq, dst_strideq
    call .hv
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp+8*0]
    call .h_top
    add           lpfq, [rsp+8*1]
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea            r10, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov      [rsp+8*1], lpf_strideq
    lea            r10, [r10+lpf_strideq*2]
    mov      [rsp+8*0], r10
    call .h
    lea             t2, [t1+400*6]
    call .top_fixup
    dec             hd
    jz .no_top_height1
    or           edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 0]
    movu            m2, [r13+r10+16]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm15
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m2, m5, m4, 2
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121
    paddw           m0, m4             ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test         edgeb, 16             ; y > 0
    jz .h_loop_end
    paddw           m0, [t1+r10+400*0]
    paddd           m1, [t1+r10+400*2]
    paddd           m2, [t1+r10+400*4]
.h_loop_end:
    paddd           m1, m5             ; sumsq
    paddd           m2, m4
    mova [t1+r10+400*0], m0
    mova [t1+r10+400*2], m1
    mova [t1+r10+400*4], m2
    add            r10, 32
    jl .h_loop
    ret
.top_fixup:
    lea            r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    ret
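; the ab computation below evaluates, per pixel:
;   a = ((sumsq + 8) >> 4) * 25
;   p = max(a - bb*bb, 0), with bb = (sum + 2) >> 2
;   z = (p * s + (1 << 19)) >> 20, clamped to [0, 255] by the word-wise
;       paddusw saturation
;   x = sgr_x_by_x[z]; r13 points 256*4 past the table base, hence the
;       "min(z, 255) - 256" gather indices
; and stores 256 - x to t4 and (x * sum * 164 + (1 << 11) + (1 << 15)) >> 12
; to t3.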
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv_main
.hv_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm15
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10- 2]
.hv_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4            ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5            ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    test            hd, hd
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+400*0] ; hv sum
    paddd           m4, [t2+r10+400*2] ; hv sumsq
    paddd           m5, [t2+r10+400*4]
    mova [t0+r10+400*0], m0
    mova [t0+r10+400*2], m2
    mova [t0+r10+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m10            ; p * s
    pmulld          m5, m10
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    psubw           m2, m12, m2        ; a
    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova   [t1+r10+400*0], m1
    paddw              m1, m0
    mova   [t1+r10+400*2], m4
    paddd              m4, m2
    mova   [t1+r10+400*4], m5
    paddd              m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea            r10, [wq-4]
.v_loop:
    mova            m0, [t1+r10+400*0]
    mova            m2, [t1+r10+400*2]
    mova            m3, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0             ; hv sum
    paddd           m4, m2             ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m10            ; p * s
    pmulld          m5, m10
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    psubw           m2, m12, m2        ; a
    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .v_loop
    ret
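; the "565" weighting below: with t = left + center + right, the value kept
; per position is center + t*5, i.e. 5/6/5 weights for the three horizontal
; neighbors of a (resp. b); even output rows (.n0) use the sum of the
; current and previous such row, odd rows (.n1) reuse the stored one.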
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    add            r10, 32
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    paddw           m3, m0, [t4+r10*1+400*2+ 0]
    paddd           m4, m1, [t3+r10*2+400*4+ 0]
    paddd           m5, m2, [t3+r10*2+400*4+32]
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1              ; a * src + b + (1 << 8)
    paddd           m3, m4
    psrld           m2, 9
    psrld           m3, 9
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, dst_strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+400*2+ 0]
    mova            m4, [t3+r10*2+400*4+ 0]
    mova            m5, [t3+r10*2+400*4+32]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1              ; a * src + b + (1 << 7)
    paddd           m3, m4
    psrld           m2, 8
    psrld           m3, 8
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, dst_strideq
    ret

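; self-guided filter, 3x3 box variant: same structure as the 5x5 filter
; above, but with a = ((sumsq + 8) >> 4) * 9, a one_by_x factor of 455
; instead of 164, and the alternating 343/444 neighbor weighting in place
; of the 565 one.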
1150cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \
1151                                                   lpf_stride, w, edge, params, h
1152    movifnidn       wd, wm
1153    mov        paramsq, paramsmp
1154    lea            r13, [sgr_x_by_x_avx2+256*4]
1155    mov          edged, r8m
1156    add             wd, wd
1157    mov             hd, r6m
1158    add           lpfq, wq
1159    vpbroadcastw    m7, [paramsq+10] ; w1
1160    lea             t1, [rsp+wq+12]
1161    vpbroadcastd    m8, [pd_8]
1162    add           dstq, wq
1163    vpbroadcastd    m9, [paramsq+ 4] ; s1
1164    lea             t3, [rsp+wq*2+400*12+8]
1165    vpbroadcastd   m10, [pd_0xf00801c7]
1166    lea             t4, [rsp+wq+400*32+8]
1167    vpbroadcastd   m11, [pd_34816]
1168    neg             wq
1169    vpbroadcastd   m12, [pw_256]
1170    pxor            m6, m6
1171    vpbroadcastd   m13, [pw_1023]
1172    psllw           m7, 4
1173    mova          xm14, [sgr_lshuf3]
1174    test         edgeb, 4 ; LR_HAVE_TOP
1175    jz .no_top
1176    call .h_top
1177    add           lpfq, lpf_strideq
1178    mov             t2, t1
1179    add             t1, 400*6
1180    call .h_top
1181    lea            r10, [lpfq+lpf_strideq*4]
1182    mov           lpfq, dstq
1183    add            r10, lpf_strideq
1184    mov          [rsp], r10 ; below
1185    call .hv0
1186.main:
1187    dec             hd
1188    jz .height1
1189    add           lpfq, dst_strideq
1190    call .hv1
1191    call .prep_n
1192    sub             hd, 2
1193    jl .extend_bottom
1194.main_loop:
1195    add           lpfq, dst_strideq
1196    call .hv0
1197    test            hd, hd
1198    jz .odd_height
1199    add           lpfq, dst_strideq
1200    call .hv1
1201    call .n0
1202    call .n1
1203    sub             hd, 2
1204    jge .main_loop
1205    test         edgeb, 8 ; LR_HAVE_BOTTOM
1206    jz .extend_bottom
1207    mov           lpfq, [rsp]
1208    call .hv0_bottom
1209    add           lpfq, lpf_strideq
1210    call .hv1_bottom
1211.end:
1212    call .n0
1213    call .n1
1214.end2:
1215    RET
1216.height1:
1217    call .v1
1218    call .prep_n
1219    jmp .odd_height_end
1220.odd_height:
1221    call .v1
1222    call .n0
1223    call .n1
1224.odd_height_end:
1225    call .v0
1226    call .v1
1227    call .n0
1228    jmp .end2
1229.extend_bottom:
1230    call .v0
1231    call .v1
1232    jmp .end
1233.no_top:
1234    lea            r10, [lpfq+lpf_strideq*4]
1235    mov           lpfq, dstq
1236    lea            r10, [r10+lpf_strideq*2]
1237    mov          [rsp], r10
1238    call .h
1239    lea            r10, [wq-4]
1240    lea             t2, [t1+400*6]
1241.top_fixup_loop:
1242    mova            m0, [t1+r10+400*0]
1243    mova            m1, [t1+r10+400*2]
1244    mova            m2, [t1+r10+400*4]
1245    mova [t2+r10+400*0], m0
1246    mova [t2+r10+400*2], m1
1247    mova [t2+r10+400*4], m2
1248    add            r10, 32
1249    jl .top_fixup_loop
1250    call .v0
1251    jmp .main
1252.extend_right:
1253    vpbroadcastw    m0, [lpfq-2]
1254    movu            m1, [r13+r10+ 2]
1255    movu            m2, [r13+r10+18]
1256    vpblendvb       m4, m0, m1
1257    vpblendvb       m5, m0, m2
1258    ret
1259.h: ; horizontal boxsum
1260    lea            r10, [wq-4]
1261    test         edgeb, 1 ; LR_HAVE_LEFT
1262    jz .h_extend_left
1263    vpbroadcastq   xm5, [leftq]
1264    vinserti128     m5, [lpfq+wq], 1
1265    mova            m4, [lpfq+wq]
1266    add          leftq, 8
1267    palignr         m4, m5, 12
1268    jmp .h_main
1269.h_extend_left:
1270    mova           xm4, [lpfq+wq]
1271    pshufb         xm4, xm14
1272    vinserti128     m4, [lpfq+wq+12], 1
1273    jmp .h_main
1274.h_top:
1275    lea            r10, [wq-4]
1276    test         edgeb, 1 ; LR_HAVE_LEFT
1277    jz .h_extend_left
1278.h_loop:
1279    movu            m4, [lpfq+r10+ 0]
1280.h_main:
1281    movu            m5, [lpfq+r10+16]
1282    test         edgeb, 2 ; LR_HAVE_RIGHT
1283    jnz .h_have_right
1284    cmp           r10d, -34
1285    jl .h_have_right
1286    call .extend_right
1287.h_have_right:
1288    palignr         m0, m5, m4, 2
1289    paddw           m1, m4, m0
1290    punpcklwd       m2, m4, m0
1291    pmaddwd         m2, m2
1292    punpckhwd       m3, m4, m0
1293    pmaddwd         m3, m3
1294    palignr         m5, m4, 4
1295    paddw           m1, m5             ; sum
1296    punpcklwd       m4, m5, m6
1297    pmaddwd         m4, m4
1298    punpckhwd       m5, m6
1299    pmaddwd         m5, m5
1300    paddd           m2, m4             ; sumsq
1301    paddd           m3, m5
1302    mova [t1+r10+400*0], m1
1303    mova [t1+r10+400*2], m2
1304    mova [t1+r10+400*4], m3
1305    add            r10, 32
1306    jl .h_loop
1307    ret
1308ALIGN function_align
1309.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
1310    lea            r10, [wq-4]
1311    test         edgeb, 1 ; LR_HAVE_LEFT
1312    jz .hv0_extend_left
1313    vpbroadcastq   xm5, [leftq]
1314    vinserti128     m5, [lpfq+wq], 1
1315    mova            m4, [lpfq+wq]
1316    add          leftq, 8
1317    palignr         m4, m5, 12
1318    jmp .hv0_main
1319.hv0_extend_left:
1320    mova           xm4, [lpfq+wq]
1321    pshufb         xm4, xm14
1322    vinserti128     m4, [lpfq+wq+12], 1
1323    jmp .hv0_main
1324.hv0_bottom:
1325    lea            r10, [wq-4]
1326    test         edgeb, 1 ; LR_HAVE_LEFT
1327    jz .hv0_extend_left
1328.hv0_loop:
1329    movu            m4, [lpfq+r10+ 0]
1330.hv0_main:
1331    movu            m5, [lpfq+r10+16]
1332    test         edgeb, 2 ; LR_HAVE_RIGHT
1333    jnz .hv0_have_right
1334    cmp           r10d, -34
1335    jl .hv0_have_right
1336    call .extend_right
1337.hv0_have_right:
1338    palignr         m0, m5, m4, 2
1339    paddw           m1, m4, m0
1340    punpcklwd       m2, m4, m0
1341    pmaddwd         m2, m2
1342    punpckhwd       m3, m4, m0
1343    pmaddwd         m3, m3
1344    palignr         m5, m4, 4
1345    paddw           m1, m5             ; sum
1346    punpcklwd       m4, m5, m6
1347    pmaddwd         m4, m4
1348    punpckhwd       m5, m6
1349    pmaddwd         m5, m5
1350    paddd           m2, m4             ; sumsq
1351    paddd           m3, m5
1352    paddw           m0, m1, [t1+r10+400*0]
1353    paddd           m4, m2, [t1+r10+400*2]
1354    paddd           m5, m3, [t1+r10+400*4]
1355    mova [t1+r10+400*0], m1
1356    mova [t1+r10+400*2], m2
1357    mova [t1+r10+400*4], m3
1358    paddw           m1, m0, [t2+r10+400*0]
1359    paddd           m2, m4, [t2+r10+400*2]
1360    paddd           m3, m5, [t2+r10+400*4]
1361    mova [t2+r10+400*0], m0
1362    mova [t2+r10+400*2], m4
1363    mova [t2+r10+400*4], m5
1364    paddd           m2, m8
1365    paddd           m3, m8
1366    psrld           m2, 4              ; (a + 8) >> 4
1367    psrld           m3, 4
1368    pslld           m4, m2, 3
1369    pslld           m5, m3, 3
1370    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1371    paddd           m5, m3
1372    psrlw           m3, m1, 1
1373    pavgw           m3, m6             ; (b + 2) >> 2
1374    punpcklwd       m2, m3, m6
1375    pmaddwd         m2, m2
1376    punpckhwd       m3, m6
1377    pmaddwd         m3, m3
1378    punpcklwd       m0, m1, m6         ; b
1379    punpckhwd       m1, m6
1380    pmaxud          m4, m2
1381    psubd           m4, m2             ; p
1382    pmaxud          m5, m3
1383    psubd           m5, m3
1384    pmulld          m4, m9             ; p * s
1385    pmulld          m5, m9
1386    pmaddwd         m0, m10            ; b * 455
1387    pmaddwd         m1, m10
1388    paddusw         m4, m10
1389    paddusw         m5, m10
1390    psrad           m3, m4, 20         ; min(z, 255) - 256
1391    vpgatherdd      m2, [r13+m3*4], m4
1392    psrad           m4, m5, 20
1393    vpgatherdd      m3, [r13+m4*4], m5
1394    pmulld          m0, m2
1395    pmulld          m1, m3
1396    packssdw        m2, m3
1397    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
1398    paddd           m1, m11
1399    psubw           m2, m12, m2
1400    psrld           m0, 12
1401    psrld           m1, 12
1402    mova         [t4+r10*1+400*0+ 4], m2
1403    mova         [t3+r10*2+400*0+ 8], xm0
1404    vextracti128 [t3+r10*2+400*0+40], m0, 1
1405    mova         [t3+r10*2+400*0+24], xm1
1406    vextracti128 [t3+r10*2+400*0+56], m1, 1
1407    add            r10, 32
1408    jl .hv0_loop
1409    ret
1410ALIGN function_align
1411.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
1412    lea            r10, [wq-4]
1413    test         edgeb, 1 ; LR_HAVE_LEFT
1414    jz .hv1_extend_left
1415    vpbroadcastq   xm5, [leftq]
1416    vinserti128     m5, [lpfq+wq], 1
1417    mova            m4, [lpfq+wq]
1418    add          leftq, 8
1419    palignr         m4, m5, 12
1420    jmp .hv1_main
1421.hv1_extend_left:
1422    mova           xm4, [lpfq+wq]
1423    pshufb         xm4, xm14
1424    vinserti128     m4, [lpfq+wq+12], 1
1425    jmp .hv1_main
1426.hv1_bottom:
1427    lea            r10, [wq-4]
1428    test         edgeb, 1 ; LR_HAVE_LEFT
1429    jz .hv1_extend_left
1430.hv1_loop:
1431    movu            m4, [lpfq+r10+ 0]
1432.hv1_main:
1433    movu            m5, [lpfq+r10+16]
1434    test         edgeb, 2 ; LR_HAVE_RIGHT
1435    jnz .hv1_have_right
1436    cmp           r10d, -34
1437    jl .hv1_have_right
1438    call .extend_right
1439.hv1_have_right:
1440    palignr         m1, m5, m4, 2
1441    paddw           m0, m4, m1
1442    punpcklwd       m2, m4, m1
1443    pmaddwd         m2, m2
1444    punpckhwd       m3, m4, m1
1445    pmaddwd         m3, m3
1446    palignr         m5, m4, 4
1447    paddw           m0, m5             ; h sum
1448    punpcklwd       m1, m5, m6
1449    pmaddwd         m1, m1
1450    punpckhwd       m5, m6
1451    pmaddwd         m5, m5
1452    paddd           m2, m1             ; h sumsq
1453    paddd           m3, m5
1454    paddw           m1, m0, [t2+r10+400*0]
1455    paddd           m4, m2, [t2+r10+400*2]
1456    paddd           m5, m3, [t2+r10+400*4]
1457    mova [t2+r10+400*0], m0
1458    mova [t2+r10+400*2], m2
1459    mova [t2+r10+400*4], m3
1460    paddd           m4, m8
1461    paddd           m5, m8
1462    psrld           m4, 4              ; (a + 8) >> 4
1463    psrld           m5, 4
1464    pslld           m2, m4, 3
1465    pslld           m3, m5, 3
1466    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1467    paddd           m5, m3
1468    psrlw           m3, m1, 1
1469    pavgw           m3, m6             ; (b + 2) >> 2
1470    punpcklwd       m2, m3, m6
1471    pmaddwd         m2, m2
1472    punpckhwd       m3, m6
1473    pmaddwd         m3, m3
1474    punpcklwd       m0, m1, m6         ; b
1475    punpckhwd       m1, m6
1476    pmaxud          m4, m2
1477    psubd           m4, m2             ; p
1478    pmaxud          m5, m3
1479    psubd           m5, m3
1480    pmulld          m4, m9             ; p * s
1481    pmulld          m5, m9
1482    pmaddwd         m0, m10            ; b * 455
1483    pmaddwd         m1, m10
1484    paddusw         m4, m10
1485    paddusw         m5, m10
1486    psrad           m3, m4, 20         ; min(z, 255) - 256
1487    vpgatherdd      m2, [r13+m3*4], m4
1488    psrad           m4, m5, 20
1489    vpgatherdd      m3, [r13+m4*4], m5
1490    pmulld          m0, m2
1491    pmulld          m1, m3
1492    packssdw        m2, m3
1493    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
1494    paddd           m1, m11
1495    psubw           m2, m12, m2
1496    psrld           m0, 12
1497    psrld           m1, 12
1498    mova         [t4+r10*1+400*2 +4], m2
1499    mova         [t3+r10*2+400*4+ 8], xm0
1500    vextracti128 [t3+r10*2+400*4+40], m0, 1
1501    mova         [t3+r10*2+400*4+24], xm1
1502    vextracti128 [t3+r10*2+400*4+56], m1, 1
1503    add            r10, 32
1504    jl .hv1_loop
1505    mov            r10, t2
1506    mov             t2, t1
1507    mov             t1, r10
1508    ret
1509.v0: ; vertical boxsums + ab (even rows)
1510    lea            r10, [wq-4]
1511.v0_loop:
1512    mova            m0, [t1+r10+400*0]
1513    mova            m4, [t1+r10+400*2]
1514    mova            m5, [t1+r10+400*4]
1515    paddw           m0, m0
1516    paddd           m4, m4
1517    paddd           m5, m5
1518    paddw           m1, m0, [t2+r10+400*0]
1519    paddd           m2, m4, [t2+r10+400*2]
1520    paddd           m3, m5, [t2+r10+400*4]
1521    mova [t2+r10+400*0], m0
1522    mova [t2+r10+400*2], m4
1523    mova [t2+r10+400*4], m5
1524    paddd           m2, m8
1525    paddd           m3, m8
1526    psrld           m2, 4              ; (a + 8) >> 4
1527    psrld           m3, 4
1528    pslld           m4, m2, 3
1529    pslld           m5, m3, 3
1530    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1531    paddd           m5, m3
1532    psrlw           m3, m1, 1
1533    pavgw           m3, m6             ; (b + 2) >> 2
1534    punpcklwd       m2, m3, m6
1535    pmaddwd         m2, m2
1536    punpckhwd       m3, m6
1537    pmaddwd         m3, m3
1538    punpcklwd       m0, m1, m6         ; b
1539    punpckhwd       m1, m6
1540    pmaxud          m4, m2
1541    psubd           m4, m2             ; p
1542    pmaxud          m5, m3
1543    psubd           m5, m3
1544    pmulld          m4, m9             ; p * s
1545    pmulld          m5, m9
1546    pmaddwd         m0, m10            ; b * 455
1547    pmaddwd         m1, m10
1548    paddusw         m4, m10
1549    paddusw         m5, m10
1550    psrad           m3, m4, 20         ; min(z, 255) - 256
1551    vpgatherdd      m2, [r13+m3*4], m4
1552    psrad           m4, m5, 20
1553    vpgatherdd      m3, [r13+m4*4], m5
1554    pmulld          m0, m2
1555    pmulld          m1, m3
1556    packssdw        m2, m3
1557    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
1558    paddd           m1, m11
1559    psubw           m2, m12, m2
1560    psrld           m0, 12
1561    psrld           m1, 12
1562    mova         [t4+r10*1+400*0+ 4], m2
1563    mova         [t3+r10*2+400*0+ 8], xm0
1564    vextracti128 [t3+r10*2+400*0+40], m0, 1
1565    mova         [t3+r10*2+400*0+24], xm1
1566    vextracti128 [t3+r10*2+400*0+56], m1, 1
1567    add            r10, 32
1568    jl .v0_loop
1569    ret
1570.v1: ; vertical boxsums + ab (odd rows)
1571    lea            r10, [wq-4]
1572.v1_loop:
1573    mova            m0, [t1+r10+400*0]
1574    mova            m4, [t1+r10+400*2]
1575    mova            m5, [t1+r10+400*4]
1576    paddw           m1, m0, [t2+r10+400*0]
1577    paddd           m2, m4, [t2+r10+400*2]
1578    paddd           m3, m5, [t2+r10+400*4]
1579    mova [t2+r10+400*0], m0
1580    mova [t2+r10+400*2], m4
1581    mova [t2+r10+400*4], m5
1582    paddd           m2, m8
1583    paddd           m3, m8
1584    psrld           m2, 4              ; (a + 8) >> 4
1585    psrld           m3, 4
1586    pslld           m4, m2, 3
1587    pslld           m5, m3, 3
1588    paddd           m4, m2             ; ((a + 8) >> 4) * 9
1589    paddd           m5, m3
1590    psrlw           m3, m1, 1
1591    pavgw           m3, m6             ; (b + 2) >> 2
1592    punpcklwd       m2, m3, m6
1593    pmaddwd         m2, m2
1594    punpckhwd       m3, m6
1595    pmaddwd         m3, m3
1596    punpcklwd       m0, m1, m6         ; b
1597    punpckhwd       m1, m6
1598    pmaxud          m4, m2
1599    psubd           m4, m2             ; p
1600    pmaxud          m5, m3
1601    psubd           m5, m3
1602    pmulld          m4, m9             ; p * s
1603    pmulld          m5, m9
1604    pmaddwd         m0, m10            ; b * 455
1605    pmaddwd         m1, m10
1606    paddusw         m4, m10
1607    paddusw         m5, m10
1608    psrad           m3, m4, 20         ; min(z, 255) - 256
1609    vpgatherdd      m2, [r13+m3*4], m4
1610    psrad           m4, m5, 20
1611    vpgatherdd      m3, [r13+m4*4], m5
1612    pmulld          m0, m2
1613    pmulld          m1, m3
1614    packssdw        m2, m3
1615    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
1616    paddd           m1, m11
1617    psubw           m2, m12, m2
1618    psrld           m0, 12
1619    psrld           m1, 12
1620    mova         [t4+r10*1+400*2+ 4], m2
1621    mova         [t3+r10*2+400*4+ 8], xm0
1622    vextracti128 [t3+r10*2+400*4+40], m0, 1
1623    mova         [t3+r10*2+400*4+24], xm1
1624    vextracti128 [t3+r10*2+400*4+56], m1, 1
1625    add            r10, 32
1626    jl .v1_loop
1627    mov            r10, t2
1628    mov             t2, t1
1629    mov             t1, r10
1630    ret
1631.prep_n: ; initial neighbor setup
1632    mov            r10, wq
1633.prep_n_loop:
1634    mova           xm0, [t4+r10*1+400*0+0]
1635    paddw          xm0, [t4+r10*1+400*0+4]
1636    paddw          xm2, xm0, [t4+r10*1+400*0+2]
1637    mova            m1, [t3+r10*2+400*0+0]
1638    paddd           m1, [t3+r10*2+400*0+8]
1639    paddd           m3, m1, [t3+r10*2+400*0+4]
1640    psllw          xm2, 2                ; a[-1] 444
1641    pslld           m3, 2                ; b[-1] 444
1642    psubw          xm2, xm0              ; a[-1] 343
1643    psubd           m3, m1               ; b[-1] 343
1644    mova [t4+r10*1+400* 4], xm2
1645    mova [t3+r10*2+400* 8], m3
1646    mova           xm0, [t4+r10*1+400*2+0]
1647    paddw          xm0, [t4+r10*1+400*2+4]
1648    paddw          xm2, xm0, [t4+r10*1+400*2+2]
1649    mova            m1, [t3+r10*2+400*4+0]
1650    paddd           m1, [t3+r10*2+400*4+8]
1651    paddd           m3, m1, [t3+r10*2+400*4+4]
1652    psllw          xm2, 2                 ; a[ 0] 444
1653    pslld           m3, 2                 ; b[ 0] 444
1654    mova [t4+r10*1+400* 6], xm2
1655    mova [t3+r10*2+400*12], m3
1656    psubw          xm2, xm0               ; a[ 0] 343
1657    psubd           m3, m1                ; b[ 0] 343
1658    mova [t4+r10*1+400* 8], xm2
1659    mova [t3+r10*2+400*16], m3
1660    add            r10, 16
1661    jl .prep_n_loop
1662    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    mova            m3, [t4+r10*1+400*0+0]
    paddw           m3, [t4+r10*1+400*0+4]
    paddw           m1, m3, [t4+r10*1+400*0+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*4]
    paddw           m3, [t4+r10*1+400*6]
    mova [t4+r10*1+400*4], m2
    mova [t4+r10*1+400*6], m1
    mova            m4, [t3+r10*2+400*0+0]
    paddd           m4, [t3+r10*2+400*0+8]
    paddd           m1, m4, [t3+r10*2+400*0+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
    paddd           m4, [t3+r10*2+400*12+ 0]
    mova [t3+r10*2+400* 8+ 0], m2
    mova [t3+r10*2+400*12+ 0], m1
    mova            m5, [t3+r10*2+400*0+32]
    paddd           m5, [t3+r10*2+400*0+40]
    paddd           m1, m5, [t3+r10*2+400*0+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400* 8+32]
    paddd           m5, [t3+r10*2+400*12+32]
    mova [t3+r10*2+400* 8+32], m2
    mova [t3+r10*2+400*12+32], m1
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1               ; a * src + b + (1 << 8)
    paddd           m3, m4
    psrld           m2, 9
    psrld           m3, 9
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, dst_strideq
    ret
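; Final weighting, shared by .n0/.n1: the filtered value and the source are
; compared at 4 fractional bits and the difference is scaled by the sgr
; weight via pmulhrsw. Rough per-pixel sketch; it assumes m7 was loaded in
; the function prologue (outside this excerpt) with the weight w1 << 4:
;
;   t   = (a * src + b) >> 9;           // filtered value at src << 4 scale
;   v   = w1 * (t - (src << 4));
;   dst = clip(src + ((v + (1 << 10)) >> 11), 0, bitdepth_max);
;
; pmulhrsw computes (x * (w1 << 4) + (1 << 14)) >> 15, which is the same as
; (x * w1 + (1 << 10)) >> 11.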
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+400*2+0]
    paddw           m3, [t4+r10*1+400*2+4]
    paddw           m1, m3, [t4+r10*1+400*2+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*6]
    paddw           m3, [t4+r10*1+400*8]
    mova [t4+r10*1+400*6], m1
    mova [t4+r10*1+400*8], m2
    mova            m4, [t3+r10*2+400*4+0]
    paddd           m4, [t3+r10*2+400*4+8]
    paddd           m1, m4, [t3+r10*2+400*4+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400*12+ 0]
    paddd           m4, [t3+r10*2+400*16+ 0]
    mova [t3+r10*2+400*12+ 0], m1
    mova [t3+r10*2+400*16+ 0], m2
    mova            m5, [t3+r10*2+400*4+32]
    paddd           m5, [t3+r10*2+400*4+40]
    paddd           m1, m5, [t3+r10*2+400*4+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400*12+32]
    paddd           m5, [t3+r10*2+400*16+32]
    mova [t3+r10*2+400*12+32], m1
    mova [t3+r10*2+400*16+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    paddd           m2, m1               ; a * src + b + (1 << 8)
    paddd           m3, m4
    psrld           m2, 9
    psrld           m3, 9
    packssdw        m2, m3
    psllw           m1, m0, 4
    psubw           m2, m1
    pmulhrsw        m2, m7
    paddw           m0, m2
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, dst_strideq
    ret

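; sgr_filter_mix runs the 5x5 and 3x3 box filters in a single pass, sharing
; one set of horizontal sums, and blends the two outputs with the weight
; pair from the params struct. Rough scalar reference (f5/f3 denote the two
; filter outputs at src << 4 scale; names are illustrative):
;
;   u   = src << 4;
;   v   = w0 * (f5 - u) + w1 * (f3 - u);
;   dst = clip(src + ((v + (1 << 10)) >> 11), 0, bitdepth_max);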
cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \
                                                   lpf_stride, w, edge, params, h
    movifnidn       wd, wm
    mov        paramsq, paramsmp
    lea            r13, [sgr_x_by_x_avx2+256*4]
    mov          edged, r8m
    add             wd, wd
    mov             hd, r6m
    add           lpfq, wq
    vpbroadcastd    m9, [pd_8]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m10, [pd_34816]
    add           dstq, wq
    vpbroadcastd   m11, [pw_256]
    lea             t3, [rsp+wq*2+400*24+8]
    vpbroadcastd   m12, [pd_0xf00801c7]
    lea             t4, [rsp+wq+400*52+8]
    vpbroadcastd   m15, [paramsq+8] ; w0 w1
    neg             wq
    vpbroadcastd   m13, [paramsq+0] ; s0
    pxor            m7, m7
    vpbroadcastd   m14, [paramsq+4] ; s1
    psllw          m15, 2
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, lpf_strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add             t1, 400*12
    call .h_top
    lea            r10, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    add            r10, lpf_strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, dst_strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, dst_strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, dst_strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, lpf_strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+lpf_strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+r10+400* 0]
    mova            m1, [t1+r10+400* 2]
    mova            m2, [t1+r10+400* 4]
    paddw           m0, m0
    mova            m3, [t1+r10+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+400*10]
    mova [t2+r10+400* 0], m0
    mova [t2+r10+400* 2], m1
    mova [t2+r10+400* 4], m2
    mova [t2+r10+400* 6], m3
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
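; The horizontal pass computes the 3-tap sums first and derives the 5-tap
; sums from them by adding the two outer taps, so both box sizes share one
; pass over the source row. Per-pixel sketch (reference only):
;
;   sum3[x]   = px[x-1] + px[x] + px[x+1];
;   sumsq3[x] = px[x-1]^2 + px[x]^2 + px[x+1]^2;
;   sum5[x]   = sum3[x]   + px[x-2] + px[x+2];
;   sumsq5[x] = sumsq3[x] + px[x-2]^2 + px[x+2]^2;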
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m8, m1             ; sum5
    paddd           m5, m2             ; sumsq5
    paddd           m6, m3
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10- 2]
.hv0_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -36
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; h sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; h sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    paddw           m8, m1             ; h sum5
    paddd           m5, m2             ; h sumsq5
    paddd           m6, m3
    mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row
    mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd (TODO: t4?)
    mova [t3+r10*2+400*0+40], m6
    paddw           m8, [t1+r10+400* 0]
    paddd           m5, [t1+r10+400* 2]
    paddd           m6, [t1+r10+400* 4]
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    paddw           m0, m1, [t1+r10+400* 6]
    paddd           m4, m2, [t1+r10+400* 8]
    paddd           m5, m3, [t1+r10+400*10]
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20         ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m2, m11, m2
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
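; ab3 derivation used by .hv0/.hv1/.v0/.v1, following the sgr math (scalar
; sketch; s1 is the strength, sgr_x_by_x the shared 256-entry table):
;
;   a = ((sumsq3 + 8) >> 4) * 9;
;   b = (sum3 + 2) >> 2;
;   p = max(a - b * b, 0);
;   z = (p * s1 + (1 << 19)) >> 20;
;   x = sgr_x_by_x[min(z, 255)];
;   A = 256 - x;                                          // stored to t4
;   B = (x * sum3 * 455 + (1 << 11) + (1 << 15)) >> 12;   // stored to t3
;
; The paddusw with pd_0xf00801c7 adds 455 to the low word of each p * s1
; dword (harmless, word adds don't carry across) and 0xf008 to the high
; word: the 0x008 part is the 1 << 19 rounding and the 0xf000 part
; saturates the product so that psrad by 20 directly yields
; min(z, 255) - 256, the signed table offset (r13 points 256 entries past
; sgr_x_by_x_avx2). The extra 1 << 15 in B pre-biases it by 8, which the
; weight-32 neighbor sum turns into the 1 << 8 rounding noted in .n0/.n1.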
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10- 2]
.hv1_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -36
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    palignr         m6, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m6, m3
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    palignr         m3, m5, m4, 6
    paddw           m2, m3             ; h sum3
    punpcklwd       m1, m3, m7
    pmaddwd         m1, m1
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    paddd           m0, m1             ; h sumsq3
    shufpd          m1, m4, m5, 0x05
    punpckhwd       m5, m4, m1
    paddw           m8, m4, m1
    pmaddwd         m5, m5
    punpcklwd       m4, m1
    pmaddwd         m4, m4
    paddd           m6, m3
    paddw           m1, m2, [t2+r10+400* 6]
    mova [t2+r10+400* 6], m2
    paddw           m8, m2             ; h sum5
    paddd           m2, m0, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 8], m0
    mova [t2+r10+400*10], m6
    paddd           m4, m0             ; h sumsq5
    paddd           m5, m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m6, m3, 3
    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
    paddd           m3, m6
    psrlw           m6, m1, 1
    pavgw           m6, m7             ; (b3 + 2) >> 2
    punpcklwd       m0, m6, m7
    pmaddwd         m0, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m6
    pmaxud          m2, m0
    psubd           m2, m0             ; p3
    pmaxud          m3, m6
    psubd           m3, m6
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmulld          m2, m14            ; p3 * s1
    pmulld          m3, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrad           m7, m2, 20         ; min(z3, 255) - 256
    vpgatherdd      m6, [r13+m7*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m7, [r13+m2*4], m3
    pmulld          m0, m6
    packssdw        m6, m7
    pmulld          m7, m1
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m7, m10
    psubw           m6, m11, m6
    psrld           m0, 12
    psrld           m7, 12
    paddw           m1, m8, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m8
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    mova         [t4+r10*1+400*4+ 4], m6
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm7
    vextracti128 [t3+r10*2+400*8+56], m7, 1
    vpbroadcastd    m4, [pd_25]
    pxor            m7, m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    vpbroadcastd    m4, [pd_0xf00800a4]
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4             ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20         ; min(z5, 255) - 256
    vpgatherdd      m4, [r13+m5*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m5, [r13+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m4, m11, m4
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
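; ab5 follows the same scheme with the 5x5 box constants: area multiplier
; 25 instead of 9, reciprocal 164 instead of 455 (pd_0xf00800a4, low word
; 0xa4 = 164), and strength s0 instead of s1. Sketch:
;
;   a = ((sumsq5 + 8) >> 4) * 25;
;   b = (sum5 + 2) >> 2;
;   p = max(a - b * b, 0);
;   z = (p * s0 + (1 << 19)) >> 20;
;   x = sgr_x_by_x[min(z, 255)];
;   A = 256 - x;
;   B = (x * sum5 * 164 + (1 << 11) + (1 << 15)) >> 12;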
.v0: ; vertical boxsums + ab3 (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400* 6]
    mova            m4, [t1+r10+400* 8]
    mova            m5, [t1+r10+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20         ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m2, m11, m2
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    mova [t3+r10*2+400*8+ 8], m3
    mova [t3+r10*2+400*0+ 8], m4
    mova [t3+r10*2+400*0+40], m5
    paddw           m3, m3 ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+r10+400*0], m3
    mova [t1+r10+400*2], m4
    mova [t1+r10+400*4], m5
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m4, [t1+r10+400* 6]
    mova            m5, [t1+r10+400* 8]
    mova            m6, [t1+r10+400*10]
    paddw           m1, m4, [t2+r10+400* 6]
    paddd           m2, m5, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 6], m4
    mova [t2+r10+400* 8], m5
    mova [t2+r10+400*10], m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20         ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m2, m11, m2
    psrld           m0, 12
    psrld           m8, m1, 12
    mova [t4+r10*1+400*4+4], m2
    mova            m4, [t3+r10*2+400*8+ 8]
    mova            m5, [t3+r10*2+400*0+ 8]
    mova            m6, [t3+r10*2+400*0+40]
    paddw           m1, m4, [t2+r10+400*0]
    paddd           m2, m5, [t2+r10+400*2]
    paddd           m3, m6, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m4
    mova [t2+r10+400*2], m5
    mova [t2+r10+400*4], m6
    vpbroadcastd    m4, [pd_25]
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm8
    vextracti128 [t3+r10*2+400*8+56], m8, 1
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    vpbroadcastd    m4, [pd_0xf00800a4]
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4             ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20         ; min(z5, 255) - 256
    vpgatherdd      m4, [r13+m5*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m5, [r13+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psubw           m4, m11, m4
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu           xm0, [t4+r10*1+400*0+2]
    paddw          xm2, xm0, [t4+r10*1+400*0+0]
    paddw          xm2, [t4+r10*1+400*0+4]
    movu            m1, [t3+r10*2+400*0+4]
    paddd           m3, m1, [t3+r10*2+400*0+0]
    paddd           m3, [t3+r10*2+400*0+8]
    paddw          xm0, xm2
    paddd           m1, m3
    psllw          xm2, 2
    pslld           m3, 2
    paddw          xm0, xm2              ; a5 565
    paddd           m1, m3               ; b5 565
    mova [t4+r10*1+400* 6], xm0
    mova [t3+r10*2+400*12], m1
    mova           xm0, [t4+r10*1+400*2+0]
    paddw          xm0, [t4+r10*1+400*2+4]
    paddw          xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw          xm2, 2                ; a3[-1] 444
    pslld           m3, 2                ; b3[-1] 444
    psubw          xm2, xm0              ; a3[-1] 343
    psubd           m3, m1               ; b3[-1] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    mova           xm0, [t4+r10*1+400*4+0]
    paddw          xm0, [t4+r10*1+400*4+4]
    paddw          xm2, xm0, [t4+r10*1+400*4+2]
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m3, m1, [t3+r10*2+400*8+4]
    psllw          xm2, 2                 ; a3[ 0] 444
    pslld           m3, 2                 ; b3[ 0] 444
    mova [t4+r10*1+400*10], xm2
    mova [t3+r10*2+400*20], m3
    psubw          xm2, xm0               ; a3[ 0] 343
    psubd           m3, m1                ; b3[ 0] 343
    mova [t4+r10*1+400*12], xm2
    mova [t3+r10*2+400*24], m3
    add            r10, 16
    jl .prep_n_loop
    ret
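; For the 5x5 half, .prep_n stores one 5-6-5 weighted row per pair of
; vertically summed source rows: 565 = 4*(l+c+r) + (l+2c+r). Two such rows
; are added in .n0 to cover the full neighborhood, matching the C
; reference's (5, 6, 5) column weights. Per-pixel sketch on a pairwise
; summed row (reference only):
;
;   s565 = 5 * a[x-1] + 6 * a[x] + 5 * a[x+1];
;   out5 = s565_prev_pair + s565_next_pair;   // combined in .n0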
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu           xm2, [t4+r10*1+2]
    paddw          xm0, xm2, [t4+r10*1+0]
    paddw          xm0, [t4+r10*1+4]
    paddw          xm2, xm0
    psllw          xm0, 2
    paddw          xm0, xm2              ; a5
    movu            m1, [t3+r10*2+4]
    paddd           m4, m1, [t3+r10*2+0]
    paddd           m4, [t3+r10*2+8]
    paddd           m1, m4
    pslld           m4, 2
    paddd           m4, m1               ; b5
    paddw          xm2, xm0, [t4+r10*1+400* 6]
    mova [t4+r10*1+400* 6], xm0
    paddd           m0, m4, [t3+r10*2+400*12]
    mova [t3+r10*2+400*12], m4
    mova           xm3, [t4+r10*1+400*2+0]
    paddw          xm3, [t4+r10*1+400*2+4]
    paddw          xm5, xm3, [t4+r10*1+400*2+2]
    psllw          xm5, 2                ; a3[ 1] 444
    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    paddw          xm3, xm4, [t4+r10*1+400* 8]
    paddw          xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400* 8], xm4
    mova [t4+r10*1+400*10], xm5
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m5, m1, [t3+r10*2+400*4+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m1           ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*16]
    paddd           m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*16], m4
    mova [t3+r10*2+400*20], m5
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, xm2              ; a5
    pmovzxwd        m3, xm3              ; a3
    pmaddwd         m2, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    pslld           m4, 13
    psubd           m0, m4
    psubd           m1, m4
    paddd           m0, m2               ; a5 * src + b5 + (1 << 8) - (src << 13)
    paddd           m1, m3               ; a3 * src + b3 + (1 << 8) - (src << 13)
    psrld           m0, 9
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    vpbroadcastd    m1, [pd_4096]
    paddd           m4, m1
    paddd           m0, m4
    psrad           m0, 7
    vextracti128   xm1, m0, 1
    packusdw       xm0, xm1              ; clip
    psrlw          xm0, 6
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n0_loop
    add           dstq, dst_strideq
    ret
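; The blend above packs the 5x5 term (>> 9) into the even words and the
; 3x3 term (<< 7) into the odd words of each dword, so one pmaddwd against
; the packed, prescaled (w0, w1) pair in m15 evaluates both weighted terms
; at once. Up to the fixed-point scaling baked into m15, this matches the
; C reference sketch given above sgr_filter_mix:
;
;   u   = src << 4;
;   v   = w0 * (f5 - u) + w1 * (f3 - u);
;   dst = clip(src + ((v + (1 << 10)) >> 11), 0, bitdepth_max);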
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova           xm3, [t4+r10*1+400*4+0]
    paddw          xm3, [t4+r10*1+400*4+4]
    paddw          xm5, xm3, [t4+r10*1+400*4+2]
    psllw          xm5, 2                ; a3[ 1] 444
    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    paddw          xm3, xm4, [t4+r10*1+400*12]
    paddw          xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400*10], xm5
    mova [t4+r10*1+400*12], xm4
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m5, m1, [t3+r10*2+400*8+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m1           ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*24]
    paddd           m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*20], m5
    mova [t3+r10*2+400*24], m4
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m0, [t4+r10*1+400* 6]
    pmovzxwd        m3, xm3
    pmaddwd         m0, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    pslld           m4, 12
    psubd           m2, m4, [t3+r10*2+400*12]
    paddd           m4, m4
    psubd           m1, m4
    psubd           m0, m2               ; a5 * src + b5 + (1 << 8) - (src << 13)
    paddd           m1, m3               ; a3 * src + b3 + (1 << 8) - (src << 13)
    psrld           m0, 8
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    vpbroadcastd    m1, [pd_4096]
    paddd           m4, m1
    paddd           m0, m4
    psrad           m0, 7
    vextracti128   xm1, m0, 1
    packusdw       xm0, xm1              ; clip
    psrlw          xm0, 6
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n1_loop
    add           dstq, dst_strideq
    ret
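; Odd rows don't recompute the 5x5 half: they reuse the even row's a5
; (t4 + 400*6) and the single-pair b5 (t3 + 400*12). That b5 covers one row
; pair instead of two, so it sits one bit lower in scale, which is why this
; path shifts by 8 where .n0 shifts by 9. Conceptually (sketch):
;
;   f5_odd = (a5 * src + b5_pair - (src << 12)) >> 8;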

%endif ; ARCH_X86_64