; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
wiener_lshuf5: db  4,  5,  4,  5,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_lshuf7: db  8,  9,  8,  9,  8,  9,  8,  9,  8,  9, 10, 11, 12, 13, 14, 15
pb_0to31:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

wiener_hshift: dw 4, 4, 1, 1
wiener_vshift: dw 1024, 1024, 4096, 4096
wiener_round:  dd 1049600, 1048832

pb_m10_m9:     times 2 db -10, -9
pb_m6_m5:      times 2 db  -6, -5
pb_m2_m1:      times 2 db  -2, -1
pb_2_3:        times 2 db   2,  3
pb_6_7:        times 2 db   6,  7
pw_1023:       times 2 dw 1023
pd_8:          dd 8
pd_25:         dd 25
pd_4096:       dd 4096
pd_34816:      dd 34816
pd_m262128:    dd -262128
pd_0xf00800a4: dd 0xf00800a4
pd_0xf00801c7: dd 0xf00801c7

%define pw_256 sgr_lshuf5

cextern sgr_x_by_x_avx2

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers

INIT_YMM avx2
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m6, [wiener_shufA]
    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufB]
    add             wd, wd
    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd   m14, [fltq+16] ; y0 y1
    add           lpfq, wq
    vpbroadcastd   m15, [fltq+20] ; y2 y3
    add           dstq, wq
    vbroadcasti128  m8, [wiener_shufC]
    lea             t1, [rsp+wq+16]
    vbroadcasti128  m9, [wiener_shufD]
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd   m10, [base+wiener_round+t3*4]
    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
    pmullw         m12, m0 ; upshift filter coefs to make the
    pmullw         m13, m0 ; horizontal downshift constant
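; The t3 index (pixel_max >> 11: 0 for 10-bit, 1 for 12-bit input)
; selects per-bitdepth scaling constants so both depths share one code
; path: 10-bit scales the x coefs by 4 and finishes the vertical pass
; with a pmulhuw by 1024 (>> 6), 12-bit uses 1 and 4096 (>> 4). A rough
; scalar model of the two passes (illustrative pseudocode, not dav1d
; code; filter_x[] already includes the hshift factor):
;   tmp[x] = (sum(src[x+k]*filter_x[k]) + rnd_h) >> 5
;   dst[x] = ((sum(tmp[y+k]*filter_y[k]) + rnd_v) >> 5) * vshift >> 16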
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
.v2:
    call .v
    jmp .v1
.extend_right:
    movd           xm1, r10d
    vpbroadcastd    m0, [pb_6_7]
    movu            m2, [pb_0to31]
    vpbroadcastb    m1, xm1
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    vpbroadcastd    m0, [pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    vpbroadcastd    m0, [pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    ret
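; Right-edge extension note: r10 (the negative byte offset from the end
; of the row) is broadcast and subtracted from per-lane base indices,
; which are then clamped against pb_0to31, so pshufb lanes that would
; read past the last valid pixel replicate it instead; conceptually
; px[x] = px[min(x, w - 1)], applied to each of m3/m4/m5.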
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
    mova            m4, [lpfq+r10] ; before the start of the buffer
    shufpd          m3, m4, 0x05
    pshufb          m3, [wiener_lshuf7]
    jmp .h_main2
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-8]
.h_main:
    mova            m4, [lpfq+r10+0]
.h_main2:
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
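; Scalar model of the 7-tap horizontal pass above (hypothetical C for
; reference only; filter[] holds the hshift-scaled coefs and the
; pd_m262128 bias combines the rounding add with an offset that keeps
; the packed intermediates within signed 16-bit range):
;   int sum = (1 << 4) - (1 << 18);
;   for (int k = 0; k < 7; k++)
;       sum += src[x + k - 3] * filter[k];
;   tmp[x] = (sum >> 4) >> 1; // psrad 4, packssdw, psraw 1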
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq           xm3, [leftq]
    vpblendd        m3, [lpfq+r10-8], 0xfc
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-8]
    pshufb          m3, [wiener_lshuf7]
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-8]
.hv_main:
    mova            m4, [lpfq+r10+0]
    movu            m5, [lpfq+r10+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    vpbroadcastd    m2, [pd_m262128]
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    mova            m2, [t4+r10]
    paddw           m2, [t2+r10]
    mova            m5, [t3+r10]
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    mova            m4, [t5+r10]
    paddw           m4, [t1+r10]
    psraw           m0, 1
    paddw           m3, m0, [t6+r10]
    mova      [t0+r10], m0
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 5
    psrad           m2, 5
    packusdw        m0, m2
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
    add           dstq, strideq
    ret
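; Vertical combine in .hv/.v: the six buffered rows plus the current
; one are paired symmetrically, (t6 + cur) and (t5 + t1) against y0/y1,
; (t4 + t2) and t3 against y2/y3, rounded with wiener_round, shifted
; right by 5 and rescaled to pixels via pmulhuw with wiener_vshift;
; the t6..t0 ring buffer pointers then rotate down by one row.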
.v:
    mov            r10, wq
.v_loop:
    mova            m1, [t4+r10]
    paddw           m1, [t2+r10]
    mova            m2, [t3+r10]
    mova            m4, [t1+r10]
    paddw           m3, m4, [t6+r10]
    paddw           m4, [t5+r10]
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 5
    psrad           m1, 5
    packusdw        m0, m1
    pmulhuw         m0, m11
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
    ret

cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                   w, h, edge, flt
%define base t4-wiener_hshift
    mov           fltq, r6mp
    movifnidn       wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    mov            t3d, r8m ; pixel_max
    vbroadcasti128  m5, [wiener_shufE]
    vpbroadcastw   m11, [fltq+ 2] ; x1
    vbroadcasti128  m6, [wiener_shufB]
    lea             t4, [wiener_hshift]
    vbroadcasti128  m7, [wiener_shufD]
    add             wd, wd
    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
    shr            t3d, 11
    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    vpbroadcastw   m13, [fltq+18] ; y1
    add           dstq, wq
    vpbroadcastd   m14, [fltq+20] ; y2 y3
    lea             t1, [rsp+wq+16]
    neg             wq
    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    vpbroadcastd    m9, [base+wiener_round+t3*4]
    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
    movu          xm15, [wiener_lshuf5]
    pmullw         m11, m0
    vinserti128    m15, [pb_0to31], 1
    pmullw         m12, m0
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    add           dstq, strideq
.v1:
    call .v
    jmp .end
.extend_right:
    movd           xm2, r10d
    vpbroadcastd    m0, [pb_2_3]
    vpbroadcastd    m1, [pb_m6_m5]
    vpbroadcastb    m2, xm2
    psubb           m0, m2
    psubb           m1, m2
    movu            m2, [pb_0to31]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
.h:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    vbroadcasti128  m4, [lpfq+r10] ; avoid accessing memory located
    mova            m3, [lpfq+r10] ; before the start of the buffer
    palignr         m3, m4, 12
    pshufb          m3, m15
    jmp .h_main
.h_top:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+r10-4]
.h_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova      [t1+r10], m0
    add            r10, 32
    jl .h_loop
    ret
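; The 5-tap path mirrors the 7-tap one with one ring buffer row less
; and only x1..x3/y1..y3; wiener_shufE pairs px[x-2] with px[x+2] in
; each dword so the broadcast x1 word covers both outer taps in a
; single pmaddwd.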
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movd           xm3, [leftq+4]
    vpblendd        m3, [lpfq+r10-4], 0xfe
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    movu            m3, [lpfq+r10-4]
    pshufb          m3, m15
    jmp .hv_main
.hv_bottom:
    mov            r10, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+r10-4]
.hv_main:
    movu            m4, [lpfq+r10+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -34
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    mova            m2, [t3+r10]
    paddw           m2, [t1+r10]
    paddd           m1, m3
    mova            m4, [t2+r10]
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
    mova            m4, [t4+r10]
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova      [t0+r10], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .hv_loop
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    add           dstq, strideq
    ret
.v:
    mov            r10, wq
.v_loop:
    mova            m0, [t1+r10]
    paddw           m2, m0, [t3+r10]
    mova            m1, [t2+r10]
    mova            m4, [t4+r10]
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 5
    psrad           m0, 5
    packusdw        m0, m1
    pmulhuw         m0, m10
    mova    [dstq+r10], m0
    add            r10, 32
    jl .v_loop
    ret

cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \
                                                    w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x_avx2+256*4]
    movifnidn       hd, hm
    mov          edged, r7m
    add             wd, wd
    vpbroadcastw    m7, [paramsq+8] ; w0
    add           lpfq, wq
    vpbroadcastd    m8, [pd_8]
    add           dstq, wq
    vpbroadcastd    m9, [pd_25]
    lea             t3, [rsp+wq*2+400*12+16]
    vpbroadcastd   m10, [paramsq+0] ; s0
    lea             t4, [rsp+wq+400*20+16]
    vpbroadcastd   m11, [pd_0xf00800a4]
    lea             t1, [rsp+wq+20]
    mova          xm12, [sgr_lshuf5]
    neg             wq
    vpbroadcastd   m13, [pd_34816]  ; (1 << 11) + (1 << 15)
    pxor            m6, m6
    vpbroadcastd   m14, [pw_1023]
    psllw           m7, 4
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call .top_fixup
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    mov             t0, t2
    dec             hd
    jz .height1
    or           edged, 16
    call .h
.main:
    add           lpfq, strideq
    call .hv
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    test            hd, hd
    jz .odd_height
    call .h
    add           lpfq, strideq
    call .hv
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .h_top
    add           lpfq, strideq
    call .hv_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    call .n0
    call .n1
.odd_height_end:
    call .v
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea             t2, [t1+400*6]
    call .top_fixup
    dec             hd
    jz .no_top_height1
    or           edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp .main
.no_top_height1:
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 0]
    movu            m2, [r13+r10+16]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm12
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m2, m5, m4, 2
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121
    paddw           m0, m4             ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test         edgeb, 16             ; y > 0
    jz .h_loop_end
    paddw           m0, [t1+r10+400*0]
    paddd           m1, [t1+r10+400*2]
    paddd           m2, [t1+r10+400*4]
.h_loop_end:
    paddd           m1, m5             ; sumsq
    paddd           m2, m4
    mova [t1+r10+400*0], m0
    mova [t1+r10+400*2], m1
    mova [t1+r10+400*4], m2
    add            r10, 32
    jl .h_loop
    ret
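; Scalar model of the horizontal box sums above (illustrative only):
;   sum[x]   = px[x-2] + px[x-1] + px[x] + px[x+1] + px[x+2]
;   sumsq[x] = px[x-2]^2 + px[x-1]^2 + px[x]^2 + px[x+1]^2 + px[x+2]^2
; with the running totals in t1 accumulating vertically when y > 0.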
.top_fixup:
    lea            r10, [wq-4]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv_main
.hv_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm12
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10- 2]
.hv_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4            ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5            ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    test            hd, hd
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+400*0] ; hv sum
    paddd           m4, [t2+r10+400*2] ; hv sumsq
    paddd           m5, [t2+r10+400*4]
    mova [t0+r10+400*0], m0
    mova [t0+r10+400*2], m2
    mova [t0+r10+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m10            ; p * s
    pmulld          m5, m10
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
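; Per-pixel a/b model for the 5x5 box, roughly mirroring the C
; reference (n = 25; 164 == round((1 << 12) / 25); r13 is pre-offset
; by 256*4, hence the z - 256 gather index above):
;   a = (sumsq + 8) >> 4;  b = (sum + 2) >> 2;
;   p = max(a * 25 - b * b, 0);
;   z = min((p * s + (1 << 19)) >> 20, 255);
;   x = sgr_x_by_x[z];
;   b_out = (x * sum * 164 + (1 << 11) + (1 << 15)) >> 12;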
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+r10+400*0], m1
    paddw            m1, m0
    mova [t1+r10+400*2], m4
    paddd            m4, m2
    mova [t1+r10+400*4], m5
    paddd            m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
    lea            r10, [wq-4]
.v_loop:
    mova            m0, [t1+r10+400*0]
    mova            m2, [t1+r10+400*2]
    mova            m3, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0             ; hv sum
    paddd           m4, m2             ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m10            ; p * s
    pmulld          m5, m10
    pmaddwd         m0, m11            ; b * 164
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    add            r10, 32
    jl .prep_n_loop
    ret
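; Neighbour weighting note: each stored row holds the horizontally
; 5/6/5-weighted a/b (3-tap sum * 5 plus the centre element, built by
; the shift-by-2-and-add above); even output rows in .n0 then combine
; two such rows and odd rows in .n1 reuse one, which appears as the
; 565 filter in the C reference.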
ALIGN function_align
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    paddw           m3, m0, [t4+r10*1+400*2+ 0]
    paddd           m4, m1, [t3+r10*2+400*4+ 0]
    paddd           m5, m2, [t3+r10*2+400*4+32]
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2              ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+400*2+ 0]
    mova            m4, [t3+r10*2+400*4+ 0]
    mova            m5, [t3+r10*2+400*4+32]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2              ; b - a * src + (1 << 7)
    psubd           m4, m3
    psrad           m1, 8
    psrad           m4, 8
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x_avx2+256*4]
    add             wd, wd
    movifnidn       hd, hm
    mov          edged, r7m
    add           lpfq, wq
    vpbroadcastw    m7, [paramsq+10] ; w1
    add           dstq, wq
    vpbroadcastd    m9, [paramsq+ 4] ; s1
    lea             t3, [rsp+wq*2+400*12+8]
    vpbroadcastd    m8, [pd_8]
    lea             t4, [rsp+wq+400*32+8]
    vpbroadcastd   m10, [pd_0xf00801c7]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m11, [pd_34816]
    neg             wq
    mova          xm12, [sgr_lshuf3]
    pxor            m6, m6
    vpbroadcastd   m13, [pw_1023]
    psllw           m7, 4
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*6]
.top_fixup_loop:
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.extend_right:
    vpbroadcastw    m0, [lpfq-2]
    movu            m1, [r13+r10+ 2]
    movu            m2, [r13+r10+18]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm12
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10+ 0]
.h_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34
    jl .h_have_right
    call .extend_right
.h_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm12
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10+ 0]
.hv0_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -34
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    paddw           m0, m1, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m2
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
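; Same a/b scheme as the 5x5 path but with n = 9: a * 25 becomes
; ((a + 8) >> 4) * 9, computed as (a << 3) + a, and the b multiplier
; 455 == round((1 << 12) / 9) replaces 164.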
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm12
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10+ 0]
.hv1_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -34
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5             ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1             ; h sumsq
    paddd           m3, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m2
    mova [t2+r10+400*4], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    psrld           m5, 4
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m2
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrad           m3, m4, 20         ; min(z, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    mova           xm0, [t4+r10*1+400*0+0]
    paddw          xm0, [t4+r10*1+400*0+4]
    paddw          xm2, xm0, [t4+r10*1+400*0+2]
    mova            m1, [t3+r10*2+400*0+0]
    paddd           m1, [t3+r10*2+400*0+8]
    paddd           m3, m1, [t3+r10*2+400*0+4]
    psllw          xm2, 2                ; a[-1] 444
    pslld           m3, 2                ; b[-1] 444
    psubw          xm2, xm0              ; a[-1] 343
    psubd           m3, m1               ; b[-1] 343
    mova [t4+r10*1+400* 4], xm2
    mova [t3+r10*2+400* 8], m3
    mova           xm0, [t4+r10*1+400*2+0]
    paddw          xm0, [t4+r10*1+400*2+4]
    paddw          xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw          xm2, 2                 ; a[ 0] 444
    pslld           m3, 2                 ; b[ 0] 444
    mova [t4+r10*1+400* 6], xm2
    mova [t3+r10*2+400*12], m3
    psubw          xm2, xm0               ; a[ 0] 343
    psubd           m3, m1                ; b[ 0] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    add            r10, 16
    jl .prep_n_loop
    ret
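; 3x3 neighbour sketch: a "444" row is the horizontal 3-tap sum << 2
; (weights 4,4,4) and a "343" row is that minus the two outer elements
; (weights 3,4,3); each output pixel in .n0/.n1 then sums one 444 row
; and two 343 rows, i.e. the 3/4/3 + 4/4/4 + 3/4/3 kernel of the C
; reference.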
1638ALIGN function_align
1639.n0: ; neighbor + output (even rows)
1640    mov            r10, wq
1641.n0_loop:
1642    mova            m3, [t4+r10*1+400*0+0]
1643    paddw           m3, [t4+r10*1+400*0+4]
1644    paddw           m1, m3, [t4+r10*1+400*0+2]
1645    psllw           m1, 2                ; a[ 1] 444
1646    psubw           m2, m1, m3           ; a[ 1] 343
1647    paddw           m3, m2, [t4+r10*1+400*4]
1648    paddw           m3, [t4+r10*1+400*6]
1649    mova [t4+r10*1+400*4], m2
1650    mova [t4+r10*1+400*6], m1
1651    mova            m4, [t3+r10*2+400*0+0]
1652    paddd           m4, [t3+r10*2+400*0+8]
1653    paddd           m1, m4, [t3+r10*2+400*0+4]
1654    pslld           m1, 2                ; b[ 1] 444
1655    psubd           m2, m1, m4           ; b[ 1] 343
1656    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
1657    paddd           m4, [t3+r10*2+400*12+ 0]
1658    mova [t3+r10*2+400* 8+ 0], m2
1659    mova [t3+r10*2+400*12+ 0], m1
1660    mova            m5, [t3+r10*2+400*0+32]
1661    paddd           m5, [t3+r10*2+400*0+40]
1662    paddd           m1, m5, [t3+r10*2+400*0+36]
1663    pslld           m1, 2
1664    psubd           m2, m1, m5
1665    paddd           m5, m2, [t3+r10*2+400* 8+32]
1666    paddd           m5, [t3+r10*2+400*12+32]
1667    mova [t3+r10*2+400* 8+32], m2
1668    mova [t3+r10*2+400*12+32], m1
1669    mova            m0, [dstq+r10]
1670    punpcklwd       m1, m0, m6
1671    punpcklwd       m2, m3, m6
1672    pmaddwd         m2, m1               ; a * src
1673    punpckhwd       m1, m0, m6
1674    punpckhwd       m3, m6
1675    pmaddwd         m3, m1
1676    vinserti128     m1, m4, xm5, 1
1677    vperm2i128      m4, m5, 0x31
1678    psubd           m1, m2               ; b - a * src + (1 << 8)
1679    psubd           m4, m3
1680    psrad           m1, 9
1681    psrad           m4, 9
1682    packssdw        m1, m4
1683    pmulhrsw        m1, m7
1684    paddw           m0, m1
1685    pmaxsw          m0, m6
1686    pminsw          m0, m13
1687    mova    [dstq+r10], m0
1688    add            r10, 32
1689    jl .n0_loop
1690    add           dstq, strideq
1691    ret
1692ALIGN function_align
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+400*2+0]
    paddw           m3, [t4+r10*1+400*2+4]
    paddw           m1, m3, [t4+r10*1+400*2+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*6]
    paddw           m3, [t4+r10*1+400*8]
    mova [t4+r10*1+400*6], m1
    mova [t4+r10*1+400*8], m2
    mova            m4, [t3+r10*2+400*4+0]
    paddd           m4, [t3+r10*2+400*4+8]
    paddd           m1, m4, [t3+r10*2+400*4+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400*12+ 0]
    paddd           m4, [t3+r10*2+400*16+ 0]
    mova [t3+r10*2+400*12+ 0], m1
    mova [t3+r10*2+400*16+ 0], m2
    mova            m5, [t3+r10*2+400*4+32]
    paddd           m5, [t3+r10*2+400*4+40]
    paddd           m1, m5, [t3+r10*2+400*4+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400*12+32]
    paddd           m5, [t3+r10*2+400*16+32]
    mova [t3+r10*2+400*12+32], m1
    mova [t3+r10*2+400*16+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2               ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m13
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret

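; Mixed self-guided filter: a 5x5 (radius 2) and a 3x3 (radius 1) box pass
; run together, with the two correction terms blended by the w0/w1 weights
; loaded below. Per pass and pixel the scheme is, roughly, in scalar form
; (using the names from the comments in .hv0/.hv1):
;   a = (sumsq + 8) >> 4,  b = (sum + 2) >> 2
;   p = max(a*n - b*b, 0)            ; n = 25 (5x5) or 9 (3x3)
;   z = (p*s + (1 << 19)) >> 20      ; s = s0/s1 strength parameter
;   x = sgr_x_by_x[min(z, 255)]      ; r13 points 256 entries in, so the
;                                    ; psrad-by-20 result is a signed index
;   b' = (x * b * one_over_n + (1 << 11) + (1 << 15)) >> 12
; with one_over_n = 164 (n = 25) or 455 (n = 9). The pd_0xf00801c7 and
; pd_0xf00800a4 constants do double duty: as pmaddwd coefficients their
; low words supply 455/164, and as paddusw operands they add the rounding
; for the z shift while saturating the high word so that min(z, 255)
; falls out for free.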
cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x_avx2+256*4]
    add             wd, wd
    movifnidn       hd, hm
    mov          edged, r7m
    add           lpfq, wq
    vpbroadcastd   m15, [paramsq+8] ; w0 w1
    add           dstq, wq
    vpbroadcastd   m13, [paramsq+0] ; s0
    lea             t3, [rsp+wq*2+400*24+8]
    vpbroadcastd   m14, [paramsq+4] ; s1
    lea             t4, [rsp+wq+400*52+8]
    vpbroadcastd    m9, [pd_8]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m10, [pd_34816]
    neg             wq
    vpbroadcastd   m11, [pd_4096]
    pxor            m7, m7
    vpbroadcastd   m12, [pd_0xf00801c7]
    psllw          m15, 2
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add             t1, 400*12
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+r10+400* 0]
    mova            m1, [t1+r10+400* 2]
    mova            m2, [t1+r10+400* 4]
    paddw           m0, m0
    mova            m3, [t1+r10+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+400*10]
    mova [t2+r10+400* 0], m0
    mova [t2+r10+400* 2], m1
    mova [t2+r10+400* 4], m2
    mova [t2+r10+400* 6], m3
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
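; The horizontal pass below produces both box sums per row in one sweep,
; 16 pixels per iteration; the 5-tap results reuse the 3-tap ones. As a
; scalar sketch (px[] is the padded input row; names are illustrative):
;   sum3[x]   = px[x-1] + px[x] + px[x+1]
;   sum5[x]   = sum3[x] + px[x-2] + px[x+2]
;   sumsq3[x] = px[x-1]^2 + px[x]^2 + px[x+1]^2  ; likewise for sumsq5
; sum3/sumsq3 go to t1+400*6..400*10 and sum5/sumsq5 to t1+400*0..400*4,
; to be accumulated vertically by the .hv*/.v* routines.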
.h: ; horizontal boxsums
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m8, m1             ; sum5
    paddd           m5, m2             ; sumsq5
    paddd           m6, m3
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
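; Even rows: compute the horizontal sums for a new input row, accumulate
; them vertically (5-tap sums into t1 at two-row granularity, 3-tap sums
; rotated through t1/t2 so three consecutive rows are covered), then
; derive the 3x3 (a, b) pair for this row using the scalar recipe sketched
; above sgr_filter_mix. The raw row sums are also stashed in t3 so .v1 can
; reconstruct the 5-tap accumulation when the frame ends on an even row.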
.hv0: ; horizontal boxsums + vertical boxsums + ab3 (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10- 2]
.hv0_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -36
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; h sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; h sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    paddw           m8, m1             ; h sum5
    paddd           m5, m2             ; h sumsq5
    paddd           m6, m3
    mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row
    mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd (TODO: move to t4?)
    mova [t3+r10*2+400*0+40], m6
    paddw           m8, [t1+r10+400* 0]
    paddd           m5, [t1+r10+400* 2]
    paddd           m6, [t1+r10+400* 4]
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    paddw           m0, m1, [t1+r10+400* 6]
    paddd           m4, m2, [t1+r10+400* 8]
    paddd           m5, m3, [t1+r10+400*10]
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20         ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x3
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
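; Odd rows do the heavier lifting: besides the 3x3 (a3, b3) pair, the
; two-row 5-tap accumulators are combined with t2 to cover all five rows
; and the 5x5 (a5, b5) pair is derived, with n = 25 and one_over_n = 164
; (pd_25 / pd_0xf00800a4). The t1/t2 swap at the end rotates the two-row
; sum buffers for the next row pair.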
.hv1: ; horizontal boxsums + vertical boxsums + ab3 + ab5 (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10- 2]
.hv1_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -36
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    palignr         m6, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m6, m3
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    palignr         m3, m5, m4, 6
    paddw           m2, m3             ; h sum3
    punpcklwd       m1, m3, m7
    pmaddwd         m1, m1
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    paddd           m0, m1             ; h sumsq3
    shufpd          m1, m4, m5, 0x05
    punpckhwd       m5, m4, m1
    paddw           m8, m4, m1
    pmaddwd         m5, m5
    punpcklwd       m4, m1
    pmaddwd         m4, m4
    paddd           m6, m3
    paddw           m1, m2, [t2+r10+400* 6]
    mova [t2+r10+400* 6], m2
    paddw           m8, m2             ; h sum5
    paddd           m2, m0, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 8], m0
    mova [t2+r10+400*10], m6
    paddd           m4, m0             ; h sumsq5
    paddd           m5, m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m6, m3, 3
    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
    paddd           m3, m6
    psrlw           m6, m1, 1
    pavgw           m6, m7             ; (b3 + 2) >> 2
    punpcklwd       m0, m6, m7
    pmaddwd         m0, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m6
    pmaxud          m2, m0
    psubd           m2, m0             ; p3
    pmaxud          m3, m6
    psubd           m3, m6
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmulld          m2, m14            ; p3 * s1
    pmulld          m3, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrad           m7, m2, 20         ; min(z3, 255) - 256
    vpgatherdd      m6, [r13+m7*4], m2 ; x3
    psrad           m2, m3, 20
    vpgatherdd      m7, [r13+m2*4], m3
    pmulld          m0, m6
    packssdw        m6, m7
    pmulld          m7, m1
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m7, m10
    psrld           m0, 12
    psrld           m7, 12
    paddw           m1, m8, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m8
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    mova         [t4+r10*1+400*4+ 4], m6
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm7
    vextracti128 [t3+r10*2+400*8+56], m7, 1
    vpbroadcastd    m4, [pd_25]
    pxor            m7, m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    vpbroadcastd    m4, [pd_0xf00800a4]
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4             ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20         ; min(z5, 255) - 256
    vpgatherdd      m4, [r13+m5*4], m2 ; x5
    psrad           m2, m3, 20
    vpgatherdd      m5, [r13+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
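; .v0/.v1 are the no-new-input variants of .hv0/.hv1, used when the frame
; runs out of rows: instead of summing in a fresh row, the buffered sums
; are doubled so the edge row is effectively repeated, and the (a, b)
; pairs are recomputed from those.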
.v0: ; vertical boxsums + ab3 (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400* 6]
    mova            m4, [t1+r10+400* 8]
    mova            m5, [t1+r10+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20         ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x3
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    mova [t3+r10*2+400*8+ 8], m3
    mova [t3+r10*2+400*0+ 8], m4
    mova [t3+r10*2+400*0+40], m5
    paddw           m3, m3 ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+r10+400*0], m3
    mova [t1+r10+400*2], m4
    mova [t1+r10+400*4], m5
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
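; Odd-row counterpart: recomputes both (a3, b3) and (a5, b5) from the
; buffered sums (see .hv1), then swaps t1/t2 like .hv1 does.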
.v1: ; vertical boxsums + ab3 + ab5 (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m4, [t1+r10+400* 6]
    mova            m5, [t1+r10+400* 8]
    mova            m6, [t1+r10+400*10]
    paddw           m1, m4, [t2+r10+400* 6]
    paddd           m2, m5, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 6], m4
    mova [t2+r10+400* 8], m5
    mova [t2+r10+400*10], m6
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddusw         m4, m12
    paddusw         m5, m12
    psrad           m3, m4, 20         ; min(z3, 255) - 256
    vpgatherdd      m2, [r13+m3*4], m4 ; x3
    psrad           m4, m5, 20
    vpgatherdd      m3, [r13+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m8, m1, 12
    mova [t4+r10*1+400*4+ 4], m2
    mova            m4, [t3+r10*2+400*8+ 8]
    mova            m5, [t3+r10*2+400*0+ 8]
    mova            m6, [t3+r10*2+400*0+40]
    paddw           m1, m4, [t2+r10+400*0]
    paddd           m2, m5, [t2+r10+400*2]
    paddd           m3, m6, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m4
    mova [t2+r10+400*2], m5
    mova [t2+r10+400*4], m6
    vpbroadcastd    m4, [pd_25]
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm8
    vextracti128 [t3+r10*2+400*8+56], m8, 1
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    vpbroadcastd    m4, [pd_0xf00800a4]
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4             ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20         ; min(z5, 255) - 256
    vpgatherdd      m4, [r13+m5*4], m2 ; x5
    psrad           m2, m3, 20
    vpgatherdd      m5, [r13+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
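; The neighbor passes weight each (a, b) surface over a 3x3 window of
; cells. The code below builds those weightings incrementally; in scalar
; terms, with l/m/r denoting the left/middle/right cell of one row:
;   sum = l + m + r
;   444 = sum << 2               ; weights (4, 4, 4)
;   343 = (sum << 2) - (l + r)   ; weights (3, 4, 3)
;   565 = (sum << 2) + sum + m   ; weights (5, 6, 5)
; The 5x5 pass uses 565 rows (added pairwise across rows in .n0/.n1); the
; 3x3 pass rotates a 343/444/343 three-row ring through t4/t3.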
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu           xm0, [t4+r10*1+400*0+2]
    paddw          xm2, xm0, [t4+r10*1+400*0+0]
    paddw          xm2, [t4+r10*1+400*0+4]
    movu            m1, [t3+r10*2+400*0+4]
    paddd           m3, m1, [t3+r10*2+400*0+0]
    paddd           m3, [t3+r10*2+400*0+8]
    paddw          xm0, xm2
    paddd           m1, m3
    psllw          xm2, 2
    pslld           m3, 2
    paddw          xm0, xm2              ; a5 565
    paddd           m1, m3               ; b5 565
    mova [t4+r10*1+400* 6], xm0
    mova [t3+r10*2+400*12], m1
    mova           xm0, [t4+r10*1+400*2+0]
    paddw          xm0, [t4+r10*1+400*2+4]
    paddw          xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw          xm2, 2                ; a3[-1] 444
    pslld           m3, 2                ; b3[-1] 444
    psubw          xm2, xm0              ; a3[-1] 343
    psubd           m3, m1               ; b3[-1] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    mova           xm0, [t4+r10*1+400*4+0]
    paddw          xm0, [t4+r10*1+400*4+4]
    paddw          xm2, xm0, [t4+r10*1+400*4+2]
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m3, m1, [t3+r10*2+400*8+4]
    psllw          xm2, 2                 ; a3[ 0] 444
    pslld           m3, 2                 ; b3[ 0] 444
    mova [t4+r10*1+400*10], xm2
    mova [t3+r10*2+400*20], m3
    psubw          xm2, xm0               ; a3[ 0] 343
    psubd           m3, m1                ; b3[ 0] 343
    mova [t4+r10*1+400*12], xm2
    mova [t3+r10*2+400*24], m3
    add            r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
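; Even-row output: build this row's 565 sums, pair them with the previous
; row's (stored at 400*6/400*12), advance the 3x3 343/444 ring, then mix
; both corrections into the source pixel. A rough scalar model, given that
; m15 holds (w0, w1) pre-scaled by 4 in the prologue:
;   t5  = (b5 - a5*src + (1 << 8)) >> 9   ; 5x5 term, two 565 rows
;   t3  = (b3 - a3*src + (1 << 8)) >> 9   ; 3x3 term
;   dst = clip((src*8192 + 4096 + 4*(w0*t5 + w1*t3)) >> 13)
; i.e. dst = src + round((w0*t5 + w1*t3) / 2048), clipped to pixel range.
; The pblendw packs t5/t3 into word pairs so one pmaddwd applies both
; weights at once.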
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu           xm2, [t4+r10*1+2]
    paddw          xm0, xm2, [t4+r10*1+0]
    paddw          xm0, [t4+r10*1+4]
    paddw          xm2, xm0
    psllw          xm0, 2
    paddw          xm0, xm2              ; a5
    movu            m1, [t3+r10*2+4]
    paddd           m4, m1, [t3+r10*2+0]
    paddd           m4, [t3+r10*2+8]
    paddd           m1, m4
    pslld           m4, 2
    paddd           m4, m1               ; b5
    paddw          xm2, xm0, [t4+r10*1+400* 6]
    mova [t4+r10*1+400* 6], xm0
    paddd           m0, m4, [t3+r10*2+400*12]
    mova [t3+r10*2+400*12], m4
    mova           xm3, [t4+r10*1+400*2+0]
    paddw          xm3, [t4+r10*1+400*2+4]
    paddw          xm5, xm3, [t4+r10*1+400*2+2]
    psllw          xm5, 2                ; a3[ 1] 444
    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    paddw          xm3, xm4, [t4+r10*1+400* 8]
    paddw          xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400* 8], xm4
    mova [t4+r10*1+400*10], xm5
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m5, m1, [t3+r10*2+400*4+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m1           ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*16]
    paddd           m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*16], m4
    mova [t3+r10*2+400*20], m5
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, xm2              ; a5
    pmovzxwd        m3, xm3              ; a3
    pmaddwd         m2, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    pslld           m4, 13
    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
    psrld           m0, 9
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    paddd           m4, m11
    paddd           m0, m4
    psrad           m0, 7
    vextracti128   xm1, m0, 1
    packusdw       xm0, xm1              ; clip
    psrlw          xm0, 6
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
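; Odd-row output: no new 565 sums are built; the single 565 row stored by
; .n0 stands in for both, so the 5x5 term is shifted by 8 instead of 9
; (equivalent to counting that row twice). The 3x3 ring advances through
; the odd-row slots as usual.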
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova           xm3, [t4+r10*1+400*4+0]
    paddw          xm3, [t4+r10*1+400*4+4]
    paddw          xm5, xm3, [t4+r10*1+400*4+2]
    psllw          xm5, 2                ; a3[ 1] 444
    psubw          xm4, xm5, xm3         ; a3[ 1] 343
    paddw          xm3, xm4, [t4+r10*1+400*12]
    paddw          xm3, [t4+r10*1+400*10]
    mova [t4+r10*1+400*10], xm5
    mova [t4+r10*1+400*12], xm4
    mova            m1, [t3+r10*2+400*8+0]
    paddd           m1, [t3+r10*2+400*8+8]
    paddd           m5, m1, [t3+r10*2+400*8+4]
    pslld           m5, 2                ; b3[ 1] 444
    psubd           m4, m5, m1           ; b3[ 1] 343
    paddd           m1, m4, [t3+r10*2+400*24]
    paddd           m1, [t3+r10*2+400*20]
    mova [t3+r10*2+400*20], m5
    mova [t3+r10*2+400*24], m4
    pmovzxwd        m4, [dstq+r10]
    pmovzxwd        m2, [t4+r10*1+400* 6]
    pmovzxwd        m3, xm3
    mova            m0, [t3+r10*2+400*12]
    pmaddwd         m2, m4               ; a5 * src
    pmaddwd         m3, m4               ; a3 * src
    pslld           m4, 13
    psubd           m0, m2               ; b5 - a5 * src + (1 << 7)
2523    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
2524    psrld           m0, 8
2525    pslld           m1, 7
2526    pblendw         m0, m1, 0xaa
2527    pmaddwd         m0, m15
2528    paddd           m4, m11
2529    paddd           m0, m4
2530    psrad           m0, 7
2531    vextracti128   xm1, m0, 1
2532    packusdw       xm0, xm1              ; clip
2533    psrlw          xm0, 6
2534    mova    [dstq+r10], xm0
2535    add            r10, 16
2536    jl .n1_loop
2537    add           dstq, strideq
2538    ret
2539
2540%endif ; ARCH_X86_64
2541