1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 32
32
33wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
34pb_0to31:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
35               db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
36wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
37wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
38wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
39sgr_r_ext:     times 16 db 1
40               times 16 db 9
41
; dword version of dav1d_sgr_x_by_x[] for use with gathers; wastes a bit of
; cache but eliminates some shifts in the inner sgr loop, which is an overall win
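; roughly what each lookup computes, in C terms (a sketch of the reference
; logic only; below, the clamping and rounding are folded into the
; paddusw/psrad/vpgatherdd sequence rather than written out like this):
;   z = (p * s + (1 << 19)) >> 20;      /* p = max(a*n - b*b, 0) */
;   x = sgr_x_by_x[z < 255 ? z : 255];  /* one dword gather per pixel */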
44const sgr_x_by_x_avx2
45              dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
46              dd  15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  9,  9,  8,  8
47              dd   8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5
48              dd   5,  5,  5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4
49              dd   4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3
50              dd   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3
51              dd   3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
52              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
53              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
54              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
55              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1
56              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
57              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
58              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
59              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
60              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0
61
62               times 4 db -1 ; needed for 16-bit sgr
63pb_m5:         times 4 db -5
64pb_3:          times 4 db 3
65pw_5_6:        dw 5, 6
66
67sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
68sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
69               db  9, -1, 10, -1, 11, -1, 12, -1
70
71pw_256:        times 2 dw 256
72pw_2056:       times 2 dw 2056
73pw_m16380:     times 2 dw -16380
74pd_25:         dd 25
75pd_34816:      dd 34816
76pd_m4096:      dd -4096
77pd_0xf00801c7: dd 0xf00801c7
78pd_0xf00800a4: dd 0xf00800a4
79
80SECTION .text
81
82%macro REPX 2-*
83    %xdefine %%f(x) %1
84%rep %0 - 1
85    %rotate 1
86    %%f(%1)
87%endrep
88%endmacro
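; e.g. REPX {psrad x, 11}, m0, m2, m1, m3 substitutes each listed register
; for x in turn, expanding to four separate psrad-by-11 instructions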
89
90DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers
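; the t registers form a ring of pointers to per-row intermediate buffers on
; the stack; after each output row they are rotated (see the end of .hv) so
; that the storage of the oldest row is reused for the next incoming row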
91
92INIT_YMM avx2
93cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
94                                                    lpf_stride, w, edge, flt, h
95    mov           fltq, fltmp
96    mov          edged, r8m
97    mov             wd, wm
98    mov             hd, r6m
99    vbroadcasti128  m6, [wiener_shufA]
100    vpbroadcastb   m11, [fltq+ 0] ; x0 x0
101    vbroadcasti128  m7, [wiener_shufB]
102    vpbroadcastd   m12, [fltq+ 2]
103    vbroadcasti128  m8, [wiener_shufC]
104    packsswb       m12, m12       ; x1 x2
105    vpbroadcastw   m13, [fltq+ 6] ; x3
106    vbroadcasti128  m9, [sgr_shuf+6]
107    add           lpfq, wq
108    vpbroadcastd   m10, [pw_m16380]
109    lea             t1, [rsp+wq*2+16]
110    vpbroadcastd   m14, [fltq+16] ; y0 y1
111    add           dstq, wq
112    vpbroadcastd   m15, [fltq+20] ; y2 y3
113    neg             wq
114    test         edgeb, 4 ; LR_HAVE_TOP
115    jz .no_top
116    call .h_top
117    add           lpfq, lpf_strideq
118    mov             t6, t1
119    mov             t5, t1
120    add             t1, 384*2
121    call .h_top
122    lea             r7, [lpfq+lpf_strideq*4]
123    mov           lpfq, dstq
124    mov             t4, t1
125    add             t1, 384*2
126    mov      [rsp+8*1], lpf_strideq
127    add             r7, lpf_strideq
128    mov      [rsp+8*0], r7 ; below
129    call .h
130    mov             t3, t1
131    mov             t2, t1
132    dec             hd
133    jz .v1
134    add           lpfq, dst_strideq
135    add             t1, 384*2
136    call .h
137    mov             t2, t1
138    dec             hd
139    jz .v2
140    add           lpfq, dst_strideq
141    add             t1, 384*2
142    call .h
143    dec             hd
144    jz .v3
145.main:
146    lea             t0, [t1+384*2]
147.main_loop:
148    call .hv
149    dec             hd
150    jnz .main_loop
151    test         edgeb, 8 ; LR_HAVE_BOTTOM
152    jz .v3
153    mov           lpfq, [rsp+8*0]
154    call .hv_bottom
155    add           lpfq, [rsp+8*1]
156    call .hv_bottom
157.v1:
158    call .v
159    RET
160.no_top:
161    lea             r7, [lpfq+lpf_strideq*4]
162    mov           lpfq, dstq
163    mov      [rsp+8*1], lpf_strideq
164    lea             r7, [r7+lpf_strideq*2]
165    mov      [rsp+8*0], r7
166    call .h
167    mov             t6, t1
168    mov             t5, t1
169    mov             t4, t1
170    mov             t3, t1
171    mov             t2, t1
172    dec             hd
173    jz .v1
174    add           lpfq, dst_strideq
175    add             t1, 384*2
176    call .h
177    mov             t2, t1
178    dec             hd
179    jz .v2
180    add           lpfq, dst_strideq
181    add             t1, 384*2
182    call .h
183    dec             hd
184    jz .v3
185    lea             t0, [t1+384*2]
186    call .hv
187    dec             hd
188    jz .v3
189    add             t0, 384*8
190    call .hv
191    dec             hd
192    jnz .main
193.v3:
194    call .v
195.v2:
196    call .v
197    jmp .v1
198.extend_right:
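    ; right-edge handling: build per-byte shuffle indices from the remaining
    ; width (-r10), clamp them against 0..31, and pshufb m4/m5 so that lanes
    ; past the last valid pixel replicate that pixel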
199    movd           xm2, r10d
200    vpbroadcastd    m0, [pb_3]
201    vpbroadcastd    m1, [pb_m5]
202    vpbroadcastb    m2, xm2
203    movu            m3, [pb_0to31]
204    psubb           m0, m2
205    psubb           m1, m2
206    pminub          m0, m3
207    pminub          m1, m3
208    pshufb          m4, m0
209    pshufb          m5, m1
210    ret
211.h:
212    mov            r10, wq
213    test         edgeb, 1 ; LR_HAVE_LEFT
214    jz .h_extend_left
215    movd           xm4, [leftq]
216    vpblendd        m4, [lpfq+r10-4], 0xfe
217    add          leftq, 4
218    jmp .h_main
219.h_extend_left:
220    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
221    mova            m4, [lpfq+r10] ; before the start of the buffer
222    palignr         m4, m5, 12
223    pshufb          m4, [wiener_l_shuf]
224    jmp .h_main
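    ; (the broadcast + palignr above recreates the layout of movu [lpfq+r10-4]
    ; without reading below lpfq, and wiener_l_shuf then replicates the
    ; leftmost pixel into the four left-extension bytes)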
225.h_top:
226    mov            r10, wq
227    test         edgeb, 1 ; LR_HAVE_LEFT
228    jz .h_extend_left
229.h_loop:
230    movu            m4, [lpfq+r10-4]
231.h_main:
232    movu            m5, [lpfq+r10+4]
233    test         edgeb, 2 ; LR_HAVE_RIGHT
234    jnz .h_have_right
235    cmp           r10d, -34
236    jl .h_have_right
237    call .extend_right
238.h_have_right:
239    pshufb          m0, m4, m6
240    pmaddubsw       m0, m11
241    pshufb          m1, m5, m6
242    pmaddubsw       m1, m11
243    pshufb          m2, m4, m7
244    pmaddubsw       m2, m12
245    pshufb          m3, m5, m7
246    pmaddubsw       m3, m12
247    paddw           m0, m2
248    pshufb          m2, m4, m8
249    pmaddubsw       m2, m12
250    paddw           m1, m3
251    pshufb          m3, m5, m8
252    pmaddubsw       m3, m12
253    pshufb          m4, m9
254    paddw           m0, m2
255    pmullw          m2, m4, m13
256    pshufb          m5, m9
257    paddw           m1, m3
258    pmullw          m3, m5, m13
259    psllw           m4, 7
260    psllw           m5, 7
261    paddw           m4, m10
262    paddw           m5, m10
263    paddw           m0, m2
264    vpbroadcastd    m2, [pw_2056]
265    paddw           m1, m3
266    paddsw          m0, m4
267    paddsw          m1, m5
268    psraw           m0, 3
269    psraw           m1, 3
270    paddw           m0, m2
271    paddw           m1, m2
272    mova [t1+r10*2+ 0], m0
273    mova [t1+r10*2+32], m1
274    add            r10, 32
275    jl .h_loop
276    ret
277ALIGN function_align
278.hv:
279    add           lpfq, dst_strideq
280    mov            r10, wq
281    test         edgeb, 1 ; LR_HAVE_LEFT
282    jz .hv_extend_left
283    movd           xm4, [leftq]
284    vpblendd        m4, [lpfq+r10-4], 0xfe
285    add          leftq, 4
286    jmp .hv_main
287.hv_extend_left:
288    movu            m4, [lpfq+r10-4]
289    pshufb          m4, [wiener_l_shuf]
290    jmp .hv_main
291.hv_bottom:
292    mov            r10, wq
293    test         edgeb, 1 ; LR_HAVE_LEFT
294    jz .hv_extend_left
295.hv_loop:
296    movu            m4, [lpfq+r10-4]
297.hv_main:
298    movu            m5, [lpfq+r10+4]
299    test         edgeb, 2 ; LR_HAVE_RIGHT
300    jnz .hv_have_right
301    cmp           r10d, -34
302    jl .hv_have_right
303    call .extend_right
304.hv_have_right:
305    pshufb          m0, m4, m6
306    pmaddubsw       m0, m11
307    pshufb          m1, m5, m6
308    pmaddubsw       m1, m11
309    pshufb          m2, m4, m7
310    pmaddubsw       m2, m12
311    pshufb          m3, m5, m7
312    pmaddubsw       m3, m12
313    paddw           m0, m2
314    pshufb          m2, m4, m8
315    pmaddubsw       m2, m12
316    paddw           m1, m3
317    pshufb          m3, m5, m8
318    pmaddubsw       m3, m12
319    pshufb          m4, m9
320    paddw           m0, m2
321    pmullw          m2, m4, m13
322    pshufb          m5, m9
323    paddw           m1, m3
324    pmullw          m3, m5, m13
325    psllw           m4, 7
326    psllw           m5, 7
327    paddw           m4, m10
328    paddw           m5, m10
329    paddw           m0, m2
330    paddw           m1, m3
331    mova            m2, [t4+r10*2]
332    paddw           m2, [t2+r10*2]
333    mova            m3, [t3+r10*2]
334    paddsw          m0, m4
335    vpbroadcastd    m4, [pw_2056]
336    paddsw          m1, m5
337    mova            m5, [t5+r10*2]
338    paddw           m5, [t1+r10*2]
339    psraw           m0, 3
340    psraw           m1, 3
341    paddw           m0, m4
342    paddw           m1, m4
343    paddw           m4, m0, [t6+r10*2]
344    mova    [t0+r10*2], m0
345    punpcklwd       m0, m2, m3
346    pmaddwd         m0, m15
347    punpckhwd       m2, m3
348    pmaddwd         m2, m15
349    punpcklwd       m3, m4, m5
350    pmaddwd         m3, m14
351    punpckhwd       m4, m5
352    pmaddwd         m4, m14
353    paddd           m0, m3
354    paddd           m4, m2
355    mova            m2, [t4+r10*2+32]
356    paddw           m2, [t2+r10*2+32]
357    mova            m3, [t3+r10*2+32]
358    mova            m5, [t5+r10*2+32]
359    paddw           m5, [t1+r10*2+32]
360    psrad           m0, 11
361    psrad           m4, 11
362    packssdw        m0, m4
363    paddw           m4, m1, [t6+r10*2+32]
364    mova [t0+r10*2+32], m1
365    punpcklwd       m1, m2, m3
366    pmaddwd         m1, m15
367    punpckhwd       m2, m3
368    pmaddwd         m2, m15
369    punpcklwd       m3, m4, m5
370    pmaddwd         m3, m14
371    punpckhwd       m4, m5
372    pmaddwd         m4, m14
373    paddd           m1, m3
374    paddd           m2, m4
375    psrad           m1, 11
376    psrad           m2, 11
377    packssdw        m1, m2
378    packuswb        m0, m1
379    mova    [dstq+r10], m0
380    add            r10, 32
381    jl .hv_loop
382    mov             t6, t5
383    mov             t5, t4
384    mov             t4, t3
385    mov             t3, t2
386    mov             t2, t1
387    mov             t1, t0
388    mov             t0, t6
389    add           dstq, dst_strideq
390    ret
391.v:
392    mov            r10, wq
393.v_loop:
394    mova            m2, [t4+r10*2+ 0]
395    paddw           m2, [t2+r10*2+ 0]
396    mova            m4, [t3+r10*2+ 0]
397    mova            m6, [t1+r10*2+ 0]
398    paddw           m8, m6, [t6+r10*2+ 0]
399    paddw           m6, [t5+r10*2+ 0]
400    mova            m3, [t4+r10*2+32]
401    paddw           m3, [t2+r10*2+32]
402    mova            m5, [t3+r10*2+32]
403    mova            m7, [t1+r10*2+32]
404    paddw           m9, m7, [t6+r10*2+32]
405    paddw           m7, [t5+r10*2+32]
406    punpcklwd       m0, m2, m4
407    pmaddwd         m0, m15
408    punpckhwd       m2, m4
409    pmaddwd         m2, m15
410    punpcklwd       m4, m8, m6
411    pmaddwd         m4, m14
412    punpckhwd       m6, m8, m6
413    pmaddwd         m6, m14
414    punpcklwd       m1, m3, m5
415    pmaddwd         m1, m15
416    punpckhwd       m3, m5
417    pmaddwd         m3, m15
418    punpcklwd       m5, m9, m7
419    pmaddwd         m5, m14
420    punpckhwd       m7, m9, m7
421    pmaddwd         m7, m14
422    paddd           m0, m4
423    paddd           m2, m6
424    paddd           m1, m5
425    paddd           m3, m7
426    REPX {psrad x, 11}, m0, m2, m1, m3
427    packssdw        m0, m2
428    packssdw        m1, m3
429    packuswb        m0, m1
430    mova    [dstq+r10], m0
431    add            r10, 32
432    jl .v_loop
433    mov             t6, t5
434    mov             t5, t4
435    mov             t4, t3
436    mov             t3, t2
437    mov             t2, t1
438    add           dstq, dst_strideq
439    ret
440
441cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
442                                                  lpf_stride, w, edge, flt, h
443    mov           fltq, fltmp
444    mov          edged, r8m
445    mov             wd, wm
446    mov             hd, r6m
447    vbroadcasti128  m6, [wiener_shufB]
448    vpbroadcastd   m12, [fltq+ 2]
449    vbroadcasti128  m7, [wiener_shufC]
450    packsswb       m12, m12       ; x1 x2
451    vpbroadcastw   m13, [fltq+ 6] ; x3
452    vbroadcasti128  m8, [sgr_shuf+6]
453    add           lpfq, wq
454    vpbroadcastd    m9, [pw_m16380]
455    vpbroadcastd   m10, [pw_2056]
456    lea             t1, [rsp+wq*2+16]
457    mova           m11, [wiener_l_shuf]
458    vpbroadcastd   m14, [fltq+16] ; __ y1
459    add           dstq, wq
460    vpbroadcastd   m15, [fltq+20] ; y2 y3
461    neg             wq
462    test         edgeb, 4 ; LR_HAVE_TOP
463    jz .no_top
464    call .h_top
465    add           lpfq, lpf_strideq
466    mov             t4, t1
467    add             t1, 384*2
468    call .h_top
469    lea             r7, [lpfq+lpf_strideq*4]
470    mov           lpfq, dstq
471    mov             t3, t1
472    add             t1, 384*2
473    mov      [rsp+8*1], lpf_strideq
474    add             r7, lpf_strideq
475    mov      [rsp+8*0], r7 ; below
476    call .h
477    mov             t2, t1
478    dec             hd
479    jz .v1
480    add           lpfq, dst_strideq
481    add             t1, 384*2
482    call .h
483    dec             hd
484    jz .v2
485.main:
486    mov             t0, t4
487.main_loop:
488    call .hv
489    dec             hd
490    jnz .main_loop
491    test         edgeb, 8 ; LR_HAVE_BOTTOM
492    jz .v2
493    mov           lpfq, [rsp+8*0]
494    call .hv_bottom
495    add           lpfq, [rsp+8*1]
496    call .hv_bottom
497.end:
498    RET
499.no_top:
500    lea             r7, [lpfq+lpf_strideq*4]
501    mov           lpfq, dstq
502    mov      [rsp+8*1], lpf_strideq
503    lea             r7, [r7+lpf_strideq*2]
504    mov      [rsp+8*0], r7
505    call .h
506    mov             t4, t1
507    mov             t3, t1
508    mov             t2, t1
509    dec             hd
510    jz .v1
511    add           lpfq, dst_strideq
512    add             t1, 384*2
513    call .h
514    dec             hd
515    jz .v2
516    lea             t0, [t1+384*2]
517    call .hv
518    dec             hd
519    jz .v2
520    add             t0, 384*6
521    call .hv
522    dec             hd
523    jnz .main
524.v2:
525    call .v
526    mov             t4, t3
527    mov             t3, t2
528    mov             t2, t1
529    add           dstq, dst_strideq
530.v1:
531    call .v
532    jmp .end
533.h:
534    mov            r10, wq
535    test         edgeb, 1 ; LR_HAVE_LEFT
536    jz .h_extend_left
537    movd           xm4, [leftq]
538    vpblendd        m4, [lpfq+r10-4], 0xfe
539    add          leftq, 4
540    jmp .h_main
541.h_extend_left:
542    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
543    mova            m4, [lpfq+r10] ; before the start of the buffer
544    palignr         m4, m5, 12
545    pshufb          m4, m11
546    jmp .h_main
547.h_top:
548    mov            r10, wq
549    test         edgeb, 1 ; LR_HAVE_LEFT
550    jz .h_extend_left
551.h_loop:
552    movu            m4, [lpfq+r10-4]
553.h_main:
554    movu            m5, [lpfq+r10+4]
555    test         edgeb, 2 ; LR_HAVE_RIGHT
556    jnz .h_have_right
557    cmp           r10d, -33
558    jl .h_have_right
559    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
560.h_have_right:
561    pshufb          m0, m4, m6
562    pmaddubsw       m0, m12
563    pshufb          m1, m5, m6
564    pmaddubsw       m1, m12
565    pshufb          m2, m4, m7
566    pmaddubsw       m2, m12
567    pshufb          m3, m5, m7
568    pmaddubsw       m3, m12
569    pshufb          m4, m8
570    paddw           m0, m2
571    pmullw          m2, m4, m13
572    pshufb          m5, m8
573    paddw           m1, m3
574    pmullw          m3, m5, m13
575    psllw           m4, 7
576    psllw           m5, 7
577    paddw           m4, m9
578    paddw           m5, m9
579    paddw           m0, m2
580    paddw           m1, m3
581    paddsw          m0, m4
582    paddsw          m1, m5
583    psraw           m0, 3
584    psraw           m1, 3
585    paddw           m0, m10
586    paddw           m1, m10
587    mova [t1+r10*2+ 0], m0
588    mova [t1+r10*2+32], m1
589    add            r10, 32
590    jl .h_loop
591    ret
592ALIGN function_align
593.hv:
594    add           lpfq, dst_strideq
595    mov            r10, wq
596    test         edgeb, 1 ; LR_HAVE_LEFT
597    jz .hv_extend_left
598    movd           xm4, [leftq]
599    vpblendd        m4, [lpfq+r10-4], 0xfe
600    add          leftq, 4
601    jmp .hv_main
602.hv_extend_left:
603    movu            m4, [lpfq+r10-4]
604    pshufb          m4, m11
605    jmp .hv_main
606.hv_bottom:
607    mov            r10, wq
608    test         edgeb, 1 ; LR_HAVE_LEFT
609    jz .hv_extend_left
610.hv_loop:
611    movu            m4, [lpfq+r10-4]
612.hv_main:
613    movu            m5, [lpfq+r10+4]
614    test         edgeb, 2 ; LR_HAVE_RIGHT
615    jnz .hv_have_right
616    cmp           r10d, -33
617    jl .hv_have_right
618    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
619.hv_have_right:
620    pshufb          m0, m4, m6
621    pmaddubsw       m0, m12
622    pshufb          m1, m5, m6
623    pmaddubsw       m1, m12
624    pshufb          m2, m4, m7
625    pmaddubsw       m2, m12
626    pshufb          m3, m5, m7
627    pmaddubsw       m3, m12
628    pshufb          m4, m8
629    paddw           m0, m2
630    pmullw          m2, m4, m13
631    pshufb          m5, m8
632    paddw           m1, m3
633    pmullw          m3, m5, m13
634    psllw           m4, 7
635    psllw           m5, 7
636    paddw           m4, m9
637    paddw           m5, m9
638    paddw           m0, m2
639    paddw           m1, m3
640    mova            m2, [t3+r10*2]
641    paddw           m2, [t1+r10*2]
642    mova            m3, [t2+r10*2]
643    paddsw          m0, m4
644    paddsw          m1, m5
645    psraw           m0, 3
646    psraw           m1, 3
647    paddw           m0, m10
648    paddw           m1, m10
649    paddw           m4, m0, [t4+r10*2]
650    mova    [t0+r10*2], m0
651    punpcklwd       m0, m2, m3
652    pmaddwd         m0, m15
653    punpckhwd       m2, m3
654    pmaddwd         m2, m15
655    punpcklwd       m3, m4, m4
656    pmaddwd         m3, m14
657    punpckhwd       m4, m4
658    pmaddwd         m4, m14
659    paddd           m0, m3
660    paddd           m4, m2
661    mova            m2, [t3+r10*2+32]
662    paddw           m2, [t1+r10*2+32]
663    mova            m3, [t2+r10*2+32]
664    psrad           m0, 11
665    psrad           m4, 11
666    packssdw        m0, m4
667    paddw           m4, m1, [t4+r10*2+32]
668    mova [t0+r10*2+32], m1
669    punpcklwd       m1, m2, m3
670    pmaddwd         m1, m15
671    punpckhwd       m2, m3
672    pmaddwd         m2, m15
673    punpcklwd       m3, m4, m4
674    pmaddwd         m3, m14
675    punpckhwd       m4, m4
676    pmaddwd         m4, m14
677    paddd           m1, m3
678    paddd           m2, m4
679    psrad           m1, 11
680    psrad           m2, 11
681    packssdw        m1, m2
682    packuswb        m0, m1
683    mova    [dstq+r10], m0
684    add            r10, 32
685    jl .hv_loop
686    mov             t4, t3
687    mov             t3, t2
688    mov             t2, t1
689    mov             t1, t0
690    mov             t0, t4
691    add           dstq, dst_strideq
692    ret
693.v:
694    mov            r10, wq
695    psrld          m13, m14, 16 ; y1 __
696.v_loop:
697    mova            m6, [t1+r10*2+ 0]
698    paddw           m2, m6, [t3+r10*2+ 0]
699    mova            m4, [t2+r10*2+ 0]
700    mova            m7, [t1+r10*2+32]
701    paddw           m3, m7, [t3+r10*2+32]
702    mova            m5, [t2+r10*2+32]
703    paddw           m6, [t4+r10*2+ 0]
704    paddw           m7, [t4+r10*2+32]
705    punpcklwd       m0, m2, m4
706    pmaddwd         m0, m15
707    punpckhwd       m2, m4
708    pmaddwd         m2, m15
709    punpcklwd       m1, m3, m5
710    pmaddwd         m1, m15
711    punpckhwd       m3, m5
712    pmaddwd         m3, m15
713    punpcklwd       m5, m7, m6
714    pmaddwd         m4, m5, m14
715    punpckhwd       m7, m6
716    pmaddwd         m6, m7, m14
717    pmaddwd         m5, m13
718    pmaddwd         m7, m13
719    paddd           m0, m4
720    paddd           m2, m6
721    paddd           m1, m5
722    paddd           m3, m7
723    REPX {psrad x, 11}, m0, m2, m1, m3
724    packssdw        m0, m2
725    packssdw        m1, m3
726    packuswb        m0, m1
727    mova    [dstq+r10], m0
728    add            r10, 32
729    jl .v_loop
730    ret
731
732cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
733                                                   lpf_stride, w, edge, params, h
734%define base r12-sgr_x_by_x_avx2-256*4
735    lea            r12, [sgr_x_by_x_avx2+256*4]
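    ; r12 doubles as the base pointer for the other constants ([base+foo]
    ; addresses foo relative to r12), and the +256*4 bias lets the gathers
    ; use min(z, 255) - 256 directly as a signed index into sgr_x_by_x_avx2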
736    mov        paramsq, paramsmp
737    mov             wd, wm
738    mov          edged, r8m
739    mov             hd, r6m
740    vbroadcasti128  m8, [base+sgr_shuf+0]
741    add           lpfq, wq
742    vbroadcasti128  m9, [base+sgr_shuf+8]
743    lea             t1, [rsp+wq*2+20]
744    vbroadcasti128 m10, [base+sgr_shuf+2]
745    add           dstq, wq
746    vbroadcasti128 m11, [base+sgr_shuf+6]
747    lea             t3, [rsp+wq*4+16+400*12]
748    vpbroadcastd   m12, [paramsq+0] ; s0
749    neg             wq
750    vpbroadcastd   m13, [base+pd_0xf00800a4]
751    pxor            m6, m6
752    vpbroadcastw    m7, [paramsq+8] ; w0
753    vpbroadcastd   m14, [base+pd_34816]  ; (1 << 11) + (1 << 15)
754    psllw           m7, 4
755    vpbroadcastd   m15, [base+pd_m4096]
756    lea            r10, [lpfq+lpf_strideq*4]
757    mov      [rsp+8*1], lpf_strideq
758    add            r10, lpf_strideq
759    mov      [rsp+8*0], r10 ; below
760    test         edgeb, 4 ; LR_HAVE_TOP
761    jz .no_top
762    call .h_top
763    add           lpfq, lpf_strideq
764    mov             t2, t1
765    call .top_fixup
766    add             t1, 400*6
767    call .h_top
768    lea            r10, [lpfq+lpf_strideq*4]
769    mov           lpfq, dstq
770    mov      [rsp+8*1], lpf_strideq
771    add            r10, lpf_strideq
772    mov      [rsp+8*0], r10 ; below
773    mov             t0, t2
774    dec             hd
775    jz .height1
776    or           edged, 16
777    call .h
778.main:
779    add           lpfq, dst_strideq
780    call .hv
781    call .prep_n
782    sub             hd, 2
783    jl .extend_bottom
784.main_loop:
785    add           lpfq, dst_strideq
786    test            hd, hd
787    jz .odd_height
788    call .h
789    add           lpfq, dst_strideq
790    call .hv
791    call .n0
792    call .n1
793    sub             hd, 2
794    jge .main_loop
795    test         edgeb, 8 ; LR_HAVE_BOTTOM
796    jz .extend_bottom
797    mov           lpfq, [rsp+8*0]
798    call .h_top
799    add           lpfq, [rsp+8*1]
800    call .hv_bottom
801.end:
802    call .n0
803    call .n1
804.end2:
805    RET
806.height1:
807    call .hv
808    call .prep_n
809    jmp .odd_height_end
810.odd_height:
811    call .hv
812    call .n0
813    call .n1
814.odd_height_end:
815    call .v
816    call .n0
817    jmp .end2
818.extend_bottom:
819    call .v
820    jmp .end
821.no_top:
822    lea            r10, [lpfq+lpf_strideq*4]
823    mov           lpfq, dstq
824    mov      [rsp+8*1], lpf_strideq
825    lea            r10, [r10+lpf_strideq*2]
826    mov      [rsp+8*0], r10
827    call .h
828    lea             t2, [t1+400*6]
829    call .top_fixup
830    dec             hd
831    jz .no_top_height1
832    or           edged, 16
833    mov             t0, t1
834    mov             t1, t2
835    jmp .main
836.no_top_height1:
837    call .v
838    call .prep_n
839    jmp .odd_height_end
840.extend_right:
841    movd           xm2, r10d
842    mova            m0, [sgr_r_ext]
843    vpbroadcastb    m2, xm2
844    psubb           m0, m2
845    pminub          m0, [pb_0to31]
846    pshufb          m5, m0
847    ret
848.h: ; horizontal boxsum
849    lea            r10, [wq-2]
850    test         edgeb, 1 ; LR_HAVE_LEFT
851    jz .h_extend_left
852    vpbroadcastd   xm0, [leftq]
853    mova           xm5, [lpfq+wq]
854    palignr        xm5, xm0, 12
855    add          leftq, 4
856    jmp .h_main
857.h_extend_left:
858    mova           xm5, [lpfq+wq]
859    pshufb         xm5, [base+sgr_l_shuf]
860    jmp .h_main
861.h_top:
862    lea            r10, [wq-2]
863    test         edgeb, 1 ; LR_HAVE_LEFT
864    jz .h_extend_left
865.h_loop:
866    movu           xm5, [lpfq+r10-2]
867.h_main:
868    vinserti128     m5, [lpfq+r10+6], 1
869    test         edgeb, 2 ; LR_HAVE_RIGHT
870    jnz .h_have_right
871    cmp           r10d, -18
872    jl .h_have_right
873    call .extend_right
874.h_have_right:
875    pshufb          m3, m5, m8
876    pmullw          m4, m3, m3
877    pshufb          m2, m5, m9
878    paddw           m0, m3, m2
879    shufps          m3, m2, q2121
880    paddw           m0, m3
881    punpcklwd       m1, m2, m3
882    pmaddwd         m1, m1
883    punpckhwd       m2, m3
884    pmaddwd         m2, m2
885    punpcklwd       m3, m4, m6
886    paddd           m1, m3
887    punpckhwd       m4, m6
888    paddd           m2, m4
889    pshufb          m4, m5, m10
890    paddw           m0, m4
891    pshufb          m5, m11
892    paddw           m0, m5 ; sum
893    punpcklwd       m3, m4, m5
894    pmaddwd         m3, m3
895    punpckhwd       m4, m5
896    pmaddwd         m4, m4
897    test         edgeb, 16 ; y > 0
898    jz .h_loop_end
899    paddw           m0, [t1+r10*2+400*0]
900    paddd           m1, [t1+r10*2+400*2]
901    paddd           m2, [t1+r10*2+400*4]
902.h_loop_end:
903    paddd           m1, m3 ; sumsq
904    paddd           m2, m4
905    mova [t1+r10*2+400*0], m0
906    mova [t1+r10*2+400*2], m1
907    mova [t1+r10*2+400*4], m2
908    add            r10, 16
909    jl .h_loop
910    ret
911.top_fixup:
912    lea            r10, [wq-2]
.top_fixup_loop: ; the sums of the first row need to be doubled
914    mova            m0, [t1+r10*2+400*0]
915    mova            m1, [t1+r10*2+400*2]
916    mova            m2, [t1+r10*2+400*4]
917    paddw           m0, m0
918    paddd           m1, m1
919    paddd           m2, m2
920    mova [t2+r10*2+400*0], m0
921    mova [t2+r10*2+400*2], m1
922    mova [t2+r10*2+400*4], m2
923    add            r10, 16
924    jl .top_fixup_loop
925    ret
926ALIGN function_align
927.hv: ; horizontal boxsum + vertical boxsum + ab
928    lea            r10, [wq-2]
929    test         edgeb, 1 ; LR_HAVE_LEFT
930    jz .hv_extend_left
931    vpbroadcastd   xm0, [leftq]
932    mova           xm5, [lpfq+wq]
933    palignr        xm5, xm0, 12
934    add          leftq, 4
935    jmp .hv_main
936.hv_extend_left:
937    mova           xm5, [lpfq+wq]
938    pshufb         xm5, [base+sgr_l_shuf]
939    jmp .hv_main
940.hv_bottom:
941    lea            r10, [wq-2]
942    test         edgeb, 1 ; LR_HAVE_LEFT
943    jz .hv_extend_left
944.hv_loop:
945    movu           xm5, [lpfq+r10-2]
946.hv_main:
947    vinserti128     m5, [lpfq+r10+6], 1
948    test         edgeb, 2 ; LR_HAVE_RIGHT
949    jnz .hv_have_right
950    cmp           r10d, -18
951    jl .hv_have_right
952    call .extend_right
953.hv_have_right:
954    pshufb          m1, m5, m8
955    pmullw          m4, m1, m1
956    pshufb          m3, m5, m9
957    paddw           m0, m1, m3
958    shufps          m1, m3, q2121
959    paddw           m0, m1
960    punpcklwd       m2, m3, m1
961    pmaddwd         m2, m2
962    punpckhwd       m3, m1
963    pmaddwd         m3, m3
964    punpcklwd       m1, m4, m6
965    paddd           m2, m1
966    punpckhwd       m4, m6
967    paddd           m3, m4
968    pshufb          m1, m5, m10
969    paddw           m0, m1
970    pshufb          m5, m11
971    paddw           m0, m5               ; h sum
972    punpcklwd       m4, m5, m1
973    pmaddwd         m4, m4
974    punpckhwd       m5, m1
975    pmaddwd         m5, m5
976    paddw           m1, m0, [t1+r10*2+400*0]
977    paddd           m2, m4               ; h sumsq
978    paddd           m3, m5
979    paddd           m4, m2, [t1+r10*2+400*2]
980    paddd           m5, m3, [t1+r10*2+400*4]
981    test            hd, hd
982    jz .hv_last_row
983.hv_main2:
984    paddw           m1, [t2+r10*2+400*0] ; hv sum
985    paddd           m4, [t2+r10*2+400*2] ; hv sumsq
986    paddd           m5, [t2+r10*2+400*4]
987    mova [t0+r10*2+400*0], m0
988    mova [t0+r10*2+400*2], m2
989    mova [t0+r10*2+400*4], m3
990    vpbroadcastd    m2, [pd_25]
991    punpcklwd       m0, m1, m6           ; b
992    punpckhwd       m1, m6
993    pmulld          m4, m2               ; a * 25
994    pmulld          m5, m2
995    pmaddwd         m2, m0, m0           ; b * b
996    pmaddwd         m3, m1, m1
997    psubd           m4, m2               ; p
998    psubd           m5, m3
999    pmulld          m4, m12              ; p * s
1000    pmulld          m5, m12
1001    pmaddwd         m0, m13              ; b * 164
1002    pmaddwd         m1, m13
1003    paddusw         m4, m13
1004    paddusw         m5, m13
1005    psrad           m3, m4, 20           ; min(z, 255) - 256
1006    vpgatherdd      m2, [r12+m3*4], m4
1007    psrad           m4, m5, 20
1008    vpgatherdd      m3, [r12+m4*4], m5
1009    pmulld          m0, m2
1010    pmulld          m1, m3
1011    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
1012    paddd           m1, m14
1013    pand            m0, m15
1014    pand            m1, m15
1015    por             m0, m2               ; a | (b << 12)
1016    por             m1, m3
    mova         [t3+r10*4+ 8], xm0      ; The neighbor calculations require
    vextracti128 [t3+r10*4+40], m0, 1    ; 13 bits for a and 21 bits for b.
    mova         [t3+r10*4+24], xm1      ; Packing them allows for 12+20, but
    vextracti128 [t3+r10*4+56], m1, 1    ; that gets us most of the way.
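    ; resulting dword layout: bits 0-11 hold a (the gathered x value), bits
    ; 12-31 hold the rounded x*b*164 term, per the "a | (b << 12)" note above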
1021    add            r10, 16
1022    jl .hv_loop
1023    mov             t2, t1
1024    mov             t1, t0
1025    mov             t0, t2
1026    ret
1027.hv_last_row: ; esoteric edge case for odd heights
1028    mova [t1+r10*2+400*0], m1
1029    paddw              m1, m0
1030    mova [t1+r10*2+400*2], m4
1031    paddd              m4, m2
1032    mova [t1+r10*2+400*4], m5
1033    paddd              m5, m3
1034    jmp .hv_main2
1035.v: ; vertical boxsum + ab
1036    lea            r10, [wq-2]
1037.v_loop:
1038    mova            m0, [t1+r10*2+400*0]
1039    mova            m2, [t1+r10*2+400*2]
1040    mova            m3, [t1+r10*2+400*4]
1041    paddw           m1, m0, [t2+r10*2+400*0]
1042    paddd           m4, m2, [t2+r10*2+400*2]
1043    paddd           m5, m3, [t2+r10*2+400*4]
1044    paddw           m0, m0
1045    paddd           m2, m2
1046    paddd           m3, m3
1047    paddw           m1, m0               ; hv sum
1048    paddd           m4, m2               ; hv sumsq
1049    paddd           m5, m3
1050    vpbroadcastd    m2, [pd_25]
1051    punpcklwd       m0, m1, m6           ; b
1052    punpckhwd       m1, m6
1053    pmulld          m4, m2               ; a * 25
1054    pmulld          m5, m2
1055    pmaddwd         m2, m0, m0           ; b * b
1056    pmaddwd         m3, m1, m1
1057    psubd           m4, m2               ; p
1058    psubd           m5, m3
1059    pmulld          m4, m12              ; p * s
1060    pmulld          m5, m12
1061    pmaddwd         m0, m13              ; b * 164
1062    pmaddwd         m1, m13
1063    paddusw         m4, m13
1064    paddusw         m5, m13
1065    psrad           m3, m4, 20           ; min(z, 255) - 256
1066    vpgatherdd      m2, [r12+m3*4], m4
1067    psrad           m4, m5, 20
1068    vpgatherdd      m3, [r12+m4*4], m5
1069    pmulld          m0, m2
1070    pmulld          m1, m3
1071    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
1072    paddd           m1, m14
1073    pand            m0, m15
1074    pand            m1, m15
1075    por             m0, m2               ; a | (b << 12)
1076    por             m1, m3
1077    mova         [t3+r10*4+ 8], xm0
1078    vextracti128 [t3+r10*4+40], m0, 1
1079    mova         [t3+r10*4+24], xm1
1080    vextracti128 [t3+r10*4+56], m1, 1
1081    add            r10, 16
1082    jl .v_loop
1083    ret
1084.prep_n: ; initial neighbor setup
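    ; each iteration reduces three neighboring packed ab dwords (l, m, r)
    ; to a 5/6/5-weighted horizontal sum:
    ;   m2 = l + m + r
    ;   m0 = m2 + m        ; l + 2*m + r
    ;   m2 = 4*m2 + m0     ; 5*l + 6*m + 5*r -> "ab 565"
    ; which is then split into -a and b halves and saved for combination
    ; with the next ab row in .n0/.n1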
1085    mov            r10, wq
1086.prep_n_loop:
1087    movu            m0, [t3+r10*4+ 4]
1088    movu            m1, [t3+r10*4+36]
1089    paddd           m2, m0, [t3+r10*4+ 0]
1090    paddd           m3, m1, [t3+r10*4+32]
1091    paddd           m2, [t3+r10*4+ 8]
1092    paddd           m3, [t3+r10*4+40]
1093    paddd           m0, m2
1094    pslld           m2, 2
1095    paddd           m1, m3
1096    pslld           m3, 2
1097    paddd           m2, m0                ; ab 565
1098    paddd           m3, m1
1099    ; a = 4096 - (ab & 4095) = -(ab | ~4095), so by
1100    ; using OR instead of AND for the masking we get
1101    ; the subtraction for free (with a negated result)
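    ; why that works: ~4095 has its low 12 bits clear, so ab | ~4095 equals
    ; (ab & 4095) + ~4095 = (ab & 4095) - 4096, and negating that gives
    ; 4096 - (ab & 4095) = a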
1102    por             m0, m15, m2           ; -a
1103    psrld           m2, 12                ;  b
1104    por             m1, m15, m3
1105    psrld           m3, 12
1106    mova [t3+r10*4+400*4+ 0], m0
1107    mova [t3+r10*4+400*8+ 0], m2
1108    mova [t3+r10*4+400*4+32], m1
1109    mova [t3+r10*4+400*8+32], m3
1110    add            r10, 16
1111    jl .prep_n_loop
1112    ret
1113ALIGN function_align
1114.n0: ; neighbor + output (even rows)
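    ; even rows compute a fresh 565 sum (same arithmetic as .prep_n), add the
    ; previously stored one to get a two-row total (hence the 1 << 8 bias and
    ; the shift by 9), and store the new sums; odd rows (.n1) reuse the stored
    ; single-row sums with a 1 << 7 bias and a shift by 8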
1115    mov            r10, wq
1116.n0_loop:
1117    movu            m0, [t3+r10*4+ 4]
1118    movu            m1, [t3+r10*4+36]
1119    paddd           m2, m0, [t3+r10*4+ 0]
1120    paddd           m3, m1, [t3+r10*4+32]
1121    paddd           m2, [t3+r10*4+ 8]
1122    paddd           m3, [t3+r10*4+40]
1123    paddd           m0, m2
1124    pslld           m2, 2
1125    paddd           m1, m3
1126    pslld           m3, 2
1127    paddd           m2, m0
1128    paddd           m3, m1
1129    por             m0, m15, m2
1130    psrld           m2, 12
1131    por             m1, m15, m3
1132    psrld           m3, 12
1133    paddd           m4, m0, [t3+r10*4+400*4+ 0] ; -a
1134    paddd           m5, m1, [t3+r10*4+400*4+32]
1135    mova [t3+r10*4+400*4+ 0], m0
1136    mova [t3+r10*4+400*4+32], m1
1137    paddd           m0, m2, [t3+r10*4+400*8+ 0] ; b
1138    paddd           m1, m3, [t3+r10*4+400*8+32]
1139    mova [t3+r10*4+400*8+ 0], m2
1140    mova [t3+r10*4+400*8+32], m3
1141    pmovzxbd        m2, [dstq+r10+0]
1142    pmovzxbd        m3, [dstq+r10+8]
1143    pmaddwd         m4, m2 ; -a * src
1144    pmaddwd         m5, m3
1145    packssdw        m2, m3
1146    psubd           m0, m4 ; a * src + b + (1 << 8)
1147    psubd           m1, m5
1148    psrld           m0, 9
1149    psrld           m1, 9
1150    packssdw        m0, m1
1151    psllw           m1, m2, 4
1152    psubw           m0, m1
1153    pmulhrsw        m0, m7
1154    paddw           m0, m2
1155    vextracti128   xm1, m0, 1
1156    packuswb       xm0, xm1
1157    pshufd         xm0, xm0, q3120
1158    mova    [dstq+r10], xm0
1159    add            r10, 16
1160    jl .n0_loop
1161    add           dstq, dst_strideq
1162    ret
1163ALIGN function_align
1164.n1: ; neighbor + output (odd rows)
1165    mov            r10, wq
1166.n1_loop:
1167    pmovzxbd        m2, [dstq+r10+0]
1168    pmovzxbd        m3, [dstq+r10+8]
1169    pmaddwd         m4, m2, [t3+r10*4+400*4+ 0] ; -a * src
1170    pmaddwd         m5, m3, [t3+r10*4+400*4+32]
1171    mova            m0, [t3+r10*4+400*8+ 0]     ; b
1172    mova            m1, [t3+r10*4+400*8+32]
1173    packssdw        m2, m3
1174    psubd           m0, m4                      ; a * src + b + (1 << 7)
1175    psubd           m1, m5
1176    psrld           m0, 8
1177    psrld           m1, 8
1178    packssdw        m0, m1
1179    psllw           m1, m2, 4
1180    psubw           m0, m1
1181    pmulhrsw        m0, m7
1182    paddw           m0, m2
1183    vextracti128   xm1, m0, 1
1184    packuswb       xm0, xm1
1185    pshufd         xm0, xm0, q3120
1186    mova    [dstq+r10], xm0
1187    add            r10, 16
1188    jl .n1_loop
1189    add           dstq, dst_strideq
1190    ret
1191
1192cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
1193                                                    lpf_stride, w, edge, params, h
1194%define base r14-sgr_x_by_x_avx2-256*4
1195    mov        paramsq, paramsmp
1196    mov          edged, r8m
1197    mov             wd, wm
1198    mov             hd, r6m
1199    lea            r14, [sgr_x_by_x_avx2+256*4]
1200    vbroadcasti128  m8, [base+sgr_shuf+2]
1201    add           lpfq, wq
1202    vbroadcasti128  m9, [base+sgr_shuf+4]
1203    lea             t1, [rsp+wq*2+20]
1204    vbroadcasti128 m10, [base+sgr_shuf+6]
1205    add           dstq, wq
1206    vpbroadcastd   m11, [paramsq+ 4] ; s1
1207    lea             t3, [rsp+wq*4+16+400*12]
1208    vpbroadcastd   m12, [base+pd_0xf00801c7]
1209    neg             wq
1210    vpbroadcastw    m7, [paramsq+10] ; w1
1211    pxor            m6, m6
1212    vpbroadcastd   m13, [base+pd_34816] ; (1 << 11) + (1 << 15)
1213    psllw           m7, 4
1214    vpbroadcastd   m14, [base+pd_m4096]
1215    test         edgeb, 4 ; LR_HAVE_TOP
1216    jz .no_top
1217    call .h_top
1218    add           lpfq, lpf_strideq
1219    mov             t2, t1
1220    add             t1, 400*6
1221    call .h_top
1222    lea             t4, [lpfq+lpf_strideq*4]
1223    mov           lpfq, dstq
1224    mov      [rsp+8*1], lpf_strideq
1225    add             t4, lpf_strideq
1226    mov      [rsp+8*0], t4 ; below
1227    mov             t0, t2
1228    call .hv
1229.main:
1230    mov             t5, t3
1231    add             t3, 400*4
1232    dec             hd
1233    jz .height1
1234    add           lpfq, dst_strideq
1235    call .hv
1236    call .prep_n
1237    dec             hd
1238    jz .extend_bottom
1239.main_loop:
1240    add           lpfq, dst_strideq
1241    call .hv
1242    call .n
1243    dec             hd
1244    jnz .main_loop
1245    test         edgeb, 8 ; LR_HAVE_BOTTOM
1246    jz .extend_bottom
1247    mov           lpfq, [rsp+8*0]
1248    call .hv_bottom
1249    call .n
1250    add           lpfq, [rsp+8*1]
1251    call .hv_bottom
1252.end:
1253    call .n
1254    RET
1255.height1:
1256    call .v
1257    call .prep_n
1258    mov             t2, t1
1259    call .v
1260    jmp .end
1261.extend_bottom:
1262    call .v
1263    call .n
1264    mov             t2, t1
1265    call .v
1266    jmp .end
1267.no_top:
1268    lea             t4, [lpfq+lpf_strideq*4]
1269    mov           lpfq, dstq
1270    mov      [rsp+8*1], lpf_strideq
1271    lea             t4, [t4+lpf_strideq*2]
1272    mov      [rsp+8*0], t4
1273    call .h
1274    lea             t0, [t1+400*6]
1275    mov             t2, t1
1276    call .v
1277    jmp .main
1278.h: ; horizontal boxsum
1279    lea            r10, [wq-2]
1280    test         edgeb, 1 ; LR_HAVE_LEFT
1281    jz .h_extend_left
1282    vpbroadcastd   xm0, [leftq]
1283    mova           xm5, [lpfq+wq]
1284    palignr        xm5, xm0, 12
1285    add          leftq, 4
1286    jmp .h_main
1287.h_extend_left:
1288    mova           xm5, [lpfq+wq]
1289    pshufb         xm5, [base+sgr_l_shuf]
1290    jmp .h_main
1291.h_top:
1292    lea            r10, [wq-2]
1293    test         edgeb, 1 ; LR_HAVE_LEFT
1294    jz .h_extend_left
1295.h_loop:
1296    movu           xm5, [lpfq+r10-2]
1297.h_main:
1298    vinserti128     m5, [lpfq+r10+6], 1
1299    test         edgeb, 2 ; LR_HAVE_RIGHT
1300    jnz .h_have_right
1301    cmp           r10d, -17
1302    jl .h_have_right
1303    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1304.h_have_right:
1305    pshufb          m0, m5, m8
1306    pmullw          m2, m0, m0
1307    pshufb          m4, m5, m9
1308    paddw           m0, m4
1309    pshufb          m5, m10
1310    paddw           m0, m5 ; sum
1311    punpcklwd       m3, m4, m5
1312    pmaddwd         m3, m3
1313    punpckhwd       m4, m5
1314    pmaddwd         m4, m4
1315    punpcklwd       m1, m2, m6
1316    punpckhwd       m2, m6
1317    mova [t1+r10*2+400*0], m0
1318    paddd           m1, m3 ; sumsq
1319    paddd           m2, m4
1320    mova [t1+r10*2+400*2], m1
1321    mova [t1+r10*2+400*4], m2
1322    add            r10, 16
1323    jl .h_loop
1324    ret
1325ALIGN function_align
1326.hv: ; horizontal boxsum + vertical boxsum + ab
1327    lea            r10, [wq-2]
1328    test         edgeb, 1 ; LR_HAVE_LEFT
1329    jz .hv_extend_left
1330    vpbroadcastd   xm0, [leftq]
1331    mova           xm5, [lpfq+wq]
1332    palignr        xm5, xm0, 12
1333    add          leftq, 4
1334    jmp .hv_main
1335.hv_extend_left:
1336    mova           xm5, [lpfq+wq]
1337    pshufb         xm5, [base+sgr_l_shuf]
1338    jmp .hv_main
1339.hv_bottom:
1340    lea            r10, [wq-2]
1341    test         edgeb, 1 ; LR_HAVE_LEFT
1342    jz .hv_extend_left
1343.hv_loop:
1344    movu           xm5, [lpfq+r10-2]
1345.hv_main:
1346    vinserti128     m5, [lpfq+r10+6], 1
1347    test         edgeb, 2 ; LR_HAVE_RIGHT
1348    jnz .hv_have_right
1349    cmp           r10d, -17
1350    jl .hv_have_right
1351    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1352.hv_have_right:
1353    pshufb          m0, m5, m8
1354    pmullw          m3, m0, m0
1355    pshufb          m1, m5, m9
1356    paddw           m0, m1
1357    pshufb          m5, m10
1358    paddw           m0, m5               ; h sum
1359    punpcklwd       m4, m5, m1
1360    pmaddwd         m4, m4
1361    punpckhwd       m5, m1
1362    pmaddwd         m5, m5
1363    paddw           m1, m0, [t2+r10*2+400*0]
1364    paddw           m1, [t1+r10*2+400*0] ; hv sum
1365    punpcklwd       m2, m3, m6
1366    punpckhwd       m3, m6
1367    paddd           m4, m2               ; h sumsq
1368    paddd           m5, m3
1369    paddd           m2, m4, [t2+r10*2+400*2]
1370    paddd           m3, m5, [t2+r10*2+400*4]
1371    paddd           m2, [t1+r10*2+400*2] ; hv sumsq
1372    paddd           m3, [t1+r10*2+400*4]
1373    mova [t0+r10*2+400*0], m0
1374    punpcklwd       m0, m1, m6           ; b
1375    punpckhwd       m1, m6
1376    mova [t0+r10*2+400*2], m4
1377    pslld           m4, m2, 3
1378    mova [t0+r10*2+400*4], m5
1379    pslld           m5, m3, 3
1380    paddd           m4, m2               ; a * 9
1381    pmaddwd         m2, m0, m0           ; b * b
1382    paddd           m5, m3
1383    pmaddwd         m3, m1, m1
1384    psubd           m4, m2               ; p
1385    psubd           m5, m3
1386    pmulld          m4, m11              ; p * s
1387    pmulld          m5, m11
1388    pmaddwd         m0, m12              ; b * 455
1389    pmaddwd         m1, m12
1390    paddusw         m4, m12
1391    paddusw         m5, m12
1392    psrad           m3, m4, 20           ; min(z, 255) - 256
1393    vpgatherdd      m2, [r14+m3*4], m4
1394    psrad           m4, m5, 20
1395    vpgatherdd      m3, [r14+m4*4], m5
1396    pmulld          m0, m2
1397    pmulld          m1, m3
1398    paddd           m0, m13              ; x * b * 455 + (1 << 11) + (1 << 15)
1399    paddd           m1, m13
1400    pand            m0, m14
1401    pand            m1, m14
1402    por             m0, m2               ; a | (b << 12)
1403    por             m1, m3
1404    mova         [t3+r10*4+ 8], xm0
1405    vextracti128 [t3+r10*4+40], m0, 1
1406    mova         [t3+r10*4+24], xm1
1407    vextracti128 [t3+r10*4+56], m1, 1
1408    add            r10, 16
1409    jl .hv_loop
1410    mov             t2, t1
1411    mov             t1, t0
1412    mov             t0, t2
1413    ret
1414.v: ; vertical boxsum + ab
1415    lea            r10, [wq-2]
1416.v_loop:
1417    mova            m1, [t1+r10*2+400*0]
1418    paddw           m1, m1
1419    paddw           m1, [t2+r10*2+400*0] ; hv sum
1420    mova            m2, [t1+r10*2+400*2]
1421    mova            m3, [t1+r10*2+400*4]
1422    paddd           m2, m2
1423    paddd           m3, m3
1424    paddd           m2, [t2+r10*2+400*2] ; hv sumsq
1425    paddd           m3, [t2+r10*2+400*4]
1426    punpcklwd       m0, m1, m6           ; b
1427    punpckhwd       m1, m6
1428    pslld           m4, m2, 3
1429    pslld           m5, m3, 3
1430    paddd           m4, m2               ; a * 9
1431    pmaddwd         m2, m0, m0           ; b * b
1432    paddd           m5, m3
1433    pmaddwd         m3, m1, m1
1434    psubd           m4, m2               ; p
1435    psubd           m5, m3
1436    pmulld          m4, m11              ; p * s
1437    pmulld          m5, m11
1438    pmaddwd         m0, m12              ; b * 455
1439    pmaddwd         m1, m12
1440    paddusw         m4, m12
1441    paddusw         m5, m12
1442    psrad           m3, m4, 20           ; min(z, 255) - 256
1443    vpgatherdd      m2, [r14+m3*4], m4
1444    psrad           m4, m5, 20
1445    vpgatherdd      m3, [r14+m4*4], m5
1446    pmulld          m0, m2
1447    pmulld          m1, m3
1448    paddd           m0, m13              ; x * b * 455 + (1 << 11) + (1 << 15)
1449    paddd           m1, m13
1450    pand            m0, m14
1451    pand            m1, m14
1452    por             m0, m2               ; a | (b << 12)
1453    por             m1, m3
1454    mova         [t3+r10*4+ 8], xm0
1455    vextracti128 [t3+r10*4+40], m0, 1
1456    mova         [t3+r10*4+24], xm1
1457    vextracti128 [t3+r10*4+56], m1, 1
1458    add            r10, 16
1459    jl .v_loop
1460    ret
1461.prep_n: ; initial neighbor setup
1462    mov            r10, wq
1463    mov             t4, t3
1464    add             t3, 400*4
1465.prep_n_loop:
1466    mova            m2, [t5+r10*4+0]
1467    mova            m3, [t4+r10*4+0]
1468    paddd           m2, [t5+r10*4+8]
1469    paddd           m3, [t4+r10*4+8]
1470    paddd           m0, m2, [t5+r10*4+4]
1471    paddd           m1, m3, [t4+r10*4+4]
1472    pslld           m0, 2
1473    paddd           m1, m1                ; ab[ 0] 222
1474    psubd           m0, m2                ; ab[-1] 343
1475    mova [t3+r10*4+400*4], m1
1476    paddd           m1, m1
1477    mova    [t5+r10*4], m0
1478    psubd           m1, m3                ; ab[ 0] 343
1479    mova    [t4+r10*4], m1
1480    add            r10, 8
1481    jl .prep_n_loop
1482    ret
; a and b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
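; The 3x3 vertical weighting is assembled from two horizontal sums per ab
; row (see the 343/222 comments in .prep_n and .n):
;   343 sum: 3*l + 4*m + 3*r        222 sum: 2*l + 2*m + 2*r
; each output pixel adds 343(above) + 222(current) + 222(current) + 343(below),
; i.e. an effective 3,4,3 / 4,4,4 / 3,4,3 neighborhood with the same total
; weight of 32 as the two 565 rows of the 5x5 filter.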
1487ALIGN function_align
1488.n: ; neighbor + output
1489    mov            r10, wq
1490.n_loop:
1491    mova            m4, [t3+r10*4+ 0]
1492    paddd           m4, [t3+r10*4+ 8]
1493    paddd           m5, m4, [t3+r10*4+ 4]
1494    paddd           m5, m5                ; ab[+1] 222
1495    mova            m2, [t3+r10*4+400*4+ 0]
1496    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
1497    mova            m3, [t3+r10*4+400*4+32]
1498    paddd           m1, m3, [t5+r10*4+32]
1499    mova [t3+r10*4+400*4+ 0], m5
1500    paddd           m5, m5
1501    psubd           m5, m4                ; ab[+1] 343
1502    mova [t5+r10*4+ 0], m5
1503    paddd           m2, m5                ; ab[ 0] 222 + ab[+1] 343
1504    mova            m4, [t3+r10*4+32]
1505    paddd           m4, [t3+r10*4+40]
1506    paddd           m5, m4, [t3+r10*4+36]
1507    paddd           m5, m5
1508    mova [t3+r10*4+400*4+32], m5
1509    paddd           m5, m5
1510    psubd           m5, m4
1511    mova [t5+r10*4+32], m5
1512    por             m4, m14, m0
1513    psrld           m0, 12
1514    paddd           m3, m5
1515    por             m5, m14, m2
1516    psrld           m2, 12
1517    paddd           m4, m5                ; -a
1518    por             m5, m14, m1
1519    psrld           m1, 12
1520    paddd           m0, m2                ;  b + (1 << 8)
1521    por             m2, m14, m3
1522    psrld           m3, 12
1523    paddd           m5, m2
1524    pmovzxbd        m2, [dstq+r10+0]
1525    paddd           m1, m3
1526    pmovzxbd        m3, [dstq+r10+8]
1527    pmaddwd         m4, m2                ; -a * src
1528    pmaddwd         m5, m3
1529    packssdw        m2, m3
1530    psubd           m0, m4                ; a * src + b + (1 << 8)
1531    psubd           m1, m5
1532    psrld           m0, 9
1533    psrld           m1, 9
1534    packssdw        m0, m1
1535    psllw           m1, m2, 4
1536    psubw           m0, m1
1537    pmulhrsw        m0, m7
1538    paddw           m0, m2
1539    vextracti128   xm1, m0, 1
1540    packuswb       xm0, xm1
1541    pshufd         xm0, xm0, q3120
1542    mova    [dstq+r10], xm0
1543    add            r10, 16
1544    jl .n_loop
1545    mov            r10, t5
1546    mov             t5, t4
1547    mov             t4, r10
1548    add           dstq, dst_strideq
1549    ret
1550
1551cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
1552                                                  lpf_stride, w, edge, params, h
1553%define base r12-sgr_x_by_x_avx2-256*4
1554    lea            r12, [sgr_x_by_x_avx2+256*4]
1555    mov        paramsq, paramsmp
1556    mov             wd, wm
1557    mov          edged, r8m
1558    mov             hd, r6m
1559    vbroadcasti128  m9, [base+sgr_shuf+0]
1560    add           lpfq, wq
1561    vbroadcasti128 m10, [base+sgr_shuf+8]
1562    lea             t1, [rsp+wq*2+12]
1563    vbroadcasti128 m11, [base+sgr_shuf+2]
1564    add           dstq, wq
1565    vbroadcasti128 m12, [base+sgr_shuf+6]
1566    lea             t3, [rsp+wq*4+400*24+8]
1567    vpbroadcastd   m15, [paramsq+8] ; w0 w1
1568    neg             wq
1569    vpbroadcastd   m13, [paramsq+0] ; s0
1570    pxor            m7, m7
1571    vpbroadcastd   m14, [paramsq+4] ; s1
1572    psllw          m15, 2 ; to reuse existing pd_m4096 register for rounding
1573    test         edgeb, 4 ; LR_HAVE_TOP
1574    jz .no_top
1575    call .h_top
1576    add           lpfq, lpf_strideq
1577    mov             t2, t1
1578    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
1579    add             t1, 400*12
1580    call .h_top
1581    lea            r10, [lpfq+lpf_strideq*4]
1582    mov           lpfq, dstq
1583    add            r10, lpf_strideq
1584    mov          [rsp], r10 ; below
1585    call .hv0
1586.main:
1587    dec             hd
1588    jz .height1
1589    add           lpfq, dst_strideq
1590    call .hv1
1591    call .prep_n
1592    sub             hd, 2
1593    jl .extend_bottom
1594.main_loop:
1595    add           lpfq, dst_strideq
1596    call .hv0
1597    test            hd, hd
1598    jz .odd_height
1599    add           lpfq, dst_strideq
1600    call .hv1
1601    call .n0
1602    call .n1
1603    sub             hd, 2
1604    jge .main_loop
1605    test         edgeb, 8 ; LR_HAVE_BOTTOM
1606    jz .extend_bottom
1607    mov           lpfq, [rsp]
1608    call .hv0_bottom
1609    add           lpfq, lpf_strideq
1610    call .hv1_bottom
1611.end:
1612    call .n0
1613    call .n1
1614.end2:
1615    RET
1616.height1:
1617    call .v1
1618    call .prep_n
1619    jmp .odd_height_end
1620.odd_height:
1621    call .v1
1622    call .n0
1623    call .n1
1624.odd_height_end:
1625    call .v0
1626    call .v1
1627    call .n0
1628    jmp .end2
1629.extend_bottom:
1630    call .v0
1631    call .v1
1632    jmp .end
1633.no_top:
1634    lea            r10, [lpfq+lpf_strideq*4]
1635    mov           lpfq, dstq
1636    lea            r10, [r10+lpf_strideq*2]
1637    mov          [rsp], r10
1638    call .h
1639    lea             t2, [t1+400*12]
1640    lea            r10, [wq-2]
1641.top_fixup_loop:
1642    mova            m0, [t1+r10*2+400* 0]
1643    mova            m1, [t1+r10*2+400* 2]
1644    mova            m2, [t1+r10*2+400* 4]
1645    paddw           m0, m0
1646    mova            m3, [t1+r10*2+400* 6]
1647    paddd           m1, m1
1648    mova            m4, [t1+r10*2+400* 8]
1649    paddd           m2, m2
1650    mova            m5, [t1+r10*2+400*10]
1651    mova [t2+r10*2+400* 0], m0
1652    mova [t2+r10*2+400* 2], m1
1653    mova [t2+r10*2+400* 4], m2
1654    mova [t2+r10*2+400* 6], m3
1655    mova [t2+r10*2+400* 8], m4
1656    mova [t2+r10*2+400*10], m5
1657    add            r10, 16
1658    jl .top_fixup_loop
1659    call .v0
1660    jmp .main
.h: ; horizontal boxsums
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd   xm0, [leftq]
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .h_main
.h_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [base+sgr_l_shuf]
    jmp .h_main
.h_top:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu           xm5, [lpfq+r10-2]
.h_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -18
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
    pshufb          m6, m5, m9
    pshufb          m4, m5, m10
    paddw           m8, m6, m4
    shufps          m0, m6, m4, q2121
    pmullw          m3, m0, m0
    pshufb          m2, m5, m11
    paddw           m0, m2
    pshufb          m5, m12
    paddw           m0, m5 ; sum3
    punpcklwd       m1, m2, m5
    pmaddwd         m1, m1
    punpckhwd       m2, m5
    pmaddwd         m2, m2
    punpcklwd       m5, m6, m4
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    punpcklwd       m4, m3, m7
    paddd           m1, m4 ; sumsq3
    punpckhwd       m3, m7
    paddd           m2, m3
    mova [t1+r10*2+400* 6], m0
    mova [t1+r10*2+400* 8], m1
    mova [t1+r10*2+400*10], m2
    paddw           m8, m0 ; sum5
    paddd           m5, m1 ; sumsq5
    paddd           m6, m2
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    add            r10, 16
    jl .h_loop
    ret
ALIGN function_align
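; .hv0 folds the current row's horizontal sums into the vertical box sums and
; derives the 3x3 filter coefficients for even rows: p3 = a3*9 - b3*b3 is
; scaled by s1, a value is gathered from sgr_x_by_x_avx2 (via r12), and the
; result is packed as a3 | (b3 << 12) into t3, as the inline comments note.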
.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastd   xm0, [leftq]
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .hv0_main
.hv0_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [base+sgr_l_shuf]
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu           xm5, [lpfq+r10-2]
.hv0_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -18
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv0_have_right:
    pshufb          m6, m5, m9
    pshufb          m4, m5, m10
    paddw           m8, m6, m4
    shufps          m1, m6, m4, q2121
    pmullw          m0, m1, m1
    pshufb          m3, m5, m11
    paddw           m1, m3
    pshufb          m5, m12
    paddw           m1, m5 ; sum3
    punpcklwd       m2, m3, m5
    pmaddwd         m2, m2
    punpckhwd       m3, m5
    pmaddwd         m3, m3
    punpcklwd       m5, m6, m4
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    punpcklwd       m4, m0, m7
    paddd           m2, m4 ; sumsq3
    punpckhwd       m0, m7
    paddd           m3, m0
    paddw           m8, m1 ; sum5
    paddd           m5, m2 ; sumsq5
    paddd           m6, m3
    mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
    mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*4+400*0+40], m6
    paddw           m8, [t1+r10*2+400* 0]
    paddd           m5, [t1+r10*2+400* 2]
    paddd           m6, [t1+r10*2+400* 4]
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    paddw           m0, m1, [t1+r10*2+400* 6]
    paddd           m4, m2, [t1+r10*2+400* 8]
    paddd           m5, m3, [t1+r10*2+400*10]
    mova [t1+r10*2+400* 6], m1
    mova [t1+r10*2+400* 8], m2
    mova [t1+r10*2+400*10], m3
    paddw           m1, m0, [t2+r10*2+400* 6]
    paddd           m2, m4, [t2+r10*2+400* 8]
    paddd           m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m5, m3
    pmulld          m4, m14              ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m4, m2
    paddusw         m5, m2
    psrad           m3, m4, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r12+m4*4], m5
    vpbroadcastd    m4, [base+pd_34816]
    pmulld          m0, m2
    vpbroadcastd    m5, [base+pd_m4096]
    pmulld          m1, m3
    paddd           m0, m4               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m4
    pand            m0, m5
    pand            m1, m5
    por             m0, m2               ; a3 | (b3 << 12)
    por             m1, m3
    mova         [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova         [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add            r10, 16
    jl .hv0_loop
    ret
ALIGN function_align
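; .hv1 does the same for odd rows and additionally produces the 5x5 filter
; coefficients: p5 = a5*25 - b5*b5 is scaled by s0, gathered the same way, and
; stored as a5 | (b5 << 12) at t3+400*0, with the 3x3 result at t3+400*8.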
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastd   xm0, [leftq]
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .hv1_main
.hv1_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [base+sgr_l_shuf]
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu           xm5, [lpfq+r10-2]
.hv1_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -18
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv1_have_right:
    pshufb          m6, m5, m9
    pshufb          m3, m5, m10
    paddw           m8, m6, m3
    shufps          m2, m6, m3, q2121
    pmullw          m1, m2, m2
    pshufb          m0, m5, m11
    paddw           m2, m0
    pshufb          m5, m12
    paddw           m2, m5 ; sum3
    punpcklwd       m4, m5, m0
    pmaddwd         m4, m4
    punpckhwd       m5, m0
    pmaddwd         m5, m5
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    punpcklwd       m3, m1, m7
    paddd           m4, m3 ; sumsq3
    punpckhwd       m1, m7
    paddd           m5, m1
    paddw           m1, m2, [t2+r10*2+400* 6]
    mova [t2+r10*2+400* 6], m2
    paddw           m8, m2 ; sum5
    paddd           m2, m4, [t2+r10*2+400* 8]
    paddd           m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    paddd           m4, m0 ; sumsq5
    paddd           m5, m6
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m6, m2, 3
    pslld           m7, m3, 3
    paddd           m6, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m7, m3
    pmaddwd         m3, m1, m1
    psubd           m6, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m7, m3
    pmulld          m6, m14              ; p3 * s1
    pmulld          m7, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m6, m2
    paddusw         m7, m2
    psrad           m3, m6, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m6
    psrad           m6, m7, 20
    vpgatherdd      m3, [r12+m6*4], m7
    vpbroadcastd    m6, [base+pd_34816]
    pmulld          m0, m2
    vpbroadcastd    m7, [base+pd_m4096]
    pmulld          m1, m3
    paddd           m0, m6               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    pand            m0, m7
    pand            m7, m1
    por             m0, m2               ; a3 | (b3 << 12)
    por             m7, m3
    paddw          m1, m8, [t2+r10*2+400*0]
    paddd          m2, m4, [t2+r10*2+400*2]
    paddd          m3, m5, [t2+r10*2+400*4]
    paddw          m1, [t1+r10*2+400*0]
    paddd          m2, [t1+r10*2+400*2]
    paddd          m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m8
    mova [t2+r10*2+400*2], m4
    mova [t2+r10*2+400*4], m5
    mova         [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova         [t3+r10*4+400*8+24], xm7
    vextracti128 [t3+r10*4+400*8+56], m7, 1
    vpbroadcastd    m4, [base+pd_25]
    pxor            m7, m7
    punpcklwd       m0, m1, m7           ; b5
    punpckhwd       m1, m7
    pmulld          m2, m4               ; a5 * 25
    pmulld          m3, m4
    pmaddwd         m4, m0, m0           ; b5 * b5
    pmaddwd         m5, m1, m1
    psubd           m2, m4               ; p5
    vpbroadcastd    m4, [base+pd_0xf00800a4]
    psubd           m3, m5
    pmulld          m2, m13              ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4               ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20           ; min(z5, 255) - 256
    vpgatherdd      m4, [r12+m5*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m5, [r12+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    paddd           m0, m6               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    vpbroadcastd    m6, [base+pd_m4096]
    pand            m0, m6
    pand            m1, m6
    por             m0, m4               ; a5 | (b5 << 12)
    por             m1, m5
    mova         [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova         [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add            r10, 16
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
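; .v0/.v1 are the vertical-only variants used when no new input row is
; available (top/bottom edges, odd heights); they re-add the previous row's
; sums instead of reading a fresh row, then compute ab like .hv0/.hv1.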
.v0: ; vertical boxsums + ab3 (even rows)
    lea            r10, [wq-2]
    vpbroadcastd    m6, [base+pd_34816]
    vpbroadcastd    m8, [base+pd_m4096]
.v0_loop:
    mova            m0, [t1+r10*2+400* 6]
    mova            m4, [t1+r10*2+400* 8]
    mova            m5, [t1+r10*2+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10*2+400* 6]
    paddd           m2, m4, [t2+r10*2+400* 8]
    paddd           m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m5, m3
    pmulld          m4, m14              ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m4, m2
    paddusw         m5, m2
    psrad           m3, m4, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r12+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m6               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    pand            m0, m8
    pand            m1, m8
    por             m0, m2               ; a3 | (b3 << 12)
    por             m1, m3
    mova            m2, [t1+r10*2+400*0]
    mova            m3, [t1+r10*2+400*2]
    mova            m4, [t1+r10*2+400*4]
    mova [t3+r10*4+400*8+ 8], m2
    mova [t3+r10*4+400*0+ 8], m3
    mova [t3+r10*4+400*0+40], m4
    paddw           m2, m2 ; cc5
    paddd           m3, m3
    paddd           m4, m4
    mova [t1+r10*2+400*0], m2
    mova [t1+r10*2+400*2], m3
    mova [t1+r10*2+400*4], m4
    mova         [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova         [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add            r10, 16
    jl .v0_loop
    ret
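; .v1 additionally recomputes the 5x5 ab pair (like .hv1) and swaps the t1/t2
; row buffers on exit.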
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-2]
.v1_loop:
    mova            m4, [t1+r10*2+400* 6]
    mova            m5, [t1+r10*2+400* 8]
    mova            m6, [t1+r10*2+400*10]
    paddw           m1, m4, [t2+r10*2+400* 6]
    paddd           m2, m5, [t2+r10*2+400* 8]
    paddd           m3, m6, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m4
    mova [t2+r10*2+400* 8], m5
    mova [t2+r10*2+400*10], m6
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m5, m3
    pmulld          m4, m14              ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m4, m2
    paddusw         m5, m2
    psrad           m3, m4, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r12+m4*4], m5
    vpbroadcastd    m4, [base+pd_34816]
    pmulld          m0, m2
    vpbroadcastd    m8, [base+pd_m4096]
    pmulld          m1, m3
    paddd           m0, m4               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m4
    pand            m0, m8
    pand            m8, m1
    por             m0, m2               ; a3 | (b3 << 12)
    por             m8, m3
    mova            m4, [t3+r10*4+400*8+ 8]
    mova            m5, [t3+r10*4+400*0+ 8]
    mova            m6, [t3+r10*4+400*0+40]
    paddw           m1, m4, [t2+r10*2+400*0]
    paddd           m2, m5, [t2+r10*2+400*2]
    paddd           m3, m6, [t2+r10*2+400*4]
    paddw           m1, [t1+r10*2+400*0]
    paddd           m2, [t1+r10*2+400*2]
    paddd           m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m4
    mova [t2+r10*2+400*2], m5
    mova [t2+r10*2+400*4], m6
    vpbroadcastd    m4, [base+pd_25]
    mova         [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova         [t3+r10*4+400*8+24], xm8
    vextracti128 [t3+r10*4+400*8+56], m8, 1
    punpcklwd       m0, m1, m7           ; b5
    punpckhwd       m1, m7
    pmulld          m2, m4               ; a5 * 25
    pmulld          m3, m4
    pmaddwd         m4, m0, m0           ; b5 * b5
    pmaddwd         m5, m1, m1
    psubd           m2, m4               ; p5
    vpbroadcastd    m4, [base+pd_0xf00800a4]
    psubd           m3, m5
    pmulld          m2, m13              ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4               ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20           ; min(z5, 255) - 256
    vpgatherdd      m4, [r12+m5*4], m2
    psrad           m2, m3, 20
    vpgatherdd      m5, [r12+m2*4], m3
    pmulld          m0, m4
    vpbroadcastd    m6, [base+pd_34816]
    pmulld          m1, m5
    paddd           m0, m6               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    vpbroadcastd    m6, [base+pd_m4096]
    pand            m0, m6
    pand            m1, m6
    por             m0, m4               ; a5 | (b5 << 12)
    por             m1, m5
    mova         [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova         [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add            r10, 16
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
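; .prep_n primes the neighbor sums used for the final weighting: the 5x5 ab
; values get a 5-6-5 horizontal weighting ("565"), while the 3x3 ab values are
; kept as rotating 3-4-3 / 2-2-2 ("343"/"222") partial sums, matching the
; inline labels below.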
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t3+r10*4+400*0+4]
    paddd           m1, m0, [t3+r10*4+400*0+0]
    mova            m4, [t3+r10*4+400*4+0]
    paddd           m1, [t3+r10*4+400*0+8]
    mova            m5, [t3+r10*4+400*8+0]
    paddd           m4, [t3+r10*4+400*4+8]
    paddd           m5, [t3+r10*4+400*8+8]
    paddd           m2, m4, [t3+r10*4+400*4+4]
    paddd           m3, m5, [t3+r10*4+400*8+4]
    paddd           m0, m1
    pslld           m1, 2
    pslld           m2, 2
    paddd           m1, m0                ; ab5 565
    paddd           m3, m3                ; ab3[ 0] 222
    psubd           m2, m4                ; ab3[-1] 343
    mova [t3+r10*4+400*20], m3
    por             m0, m6, m1            ; a5 565
    mova [t3+r10*4+400*24], m2
    psrld           m1, 12                ; b5 565
    mova [t3+r10*4+400*12], m0
    paddd           m3, m3
    mova [t3+r10*4+400*16], m1
    psubd           m3, m5                ; ab3[ 0] 343
    mova [t3+r10*4+400*28], m3
    add            r10, 8
    jl .prep_n_loop
    ret
ALIGN function_align
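; .n0/.n1 finish the vertical neighbor sums for the current row pair, blend
; the 5x5 and 3x3 filter outputs with the weights held in m15 (pmaddwd after
; the pblendw interleave), and write one output row each.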
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m0, [t3+r10*4+4]
    paddd           m4, m0, [t3+r10*4+0]
    paddd           m4, [t3+r10*4+8]
    paddd           m0, m4
    pslld           m4, 2
    paddd           m4, m0
    por             m0, m6, m4
    psrld           m4, 12
    paddd           m2, m0, [t3+r10*4+400*12] ; -a5
    mova [t3+r10*4+400*12], m0
    paddd           m0, m4, [t3+r10*4+400*16] ;  b5 + (1 << 8)
    mova [t3+r10*4+400*16], m4
    mova            m3, [t3+r10*4+400*4+0]
    paddd           m3, [t3+r10*4+400*4+8]
    paddd           m5, m3, [t3+r10*4+400*4+4]
    paddd           m5, m5                    ; ab3[ 1] 222
    mova            m4, [t3+r10*4+400*20]
    paddd           m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd           m5, m5
    psubd           m5, m3                    ; ab3[ 1] 343
    mova [t3+r10*4+400*24], m5
    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
    por             m3, m6, m1
    psrld           m1, 12
    por             m5, m6, m4
    psrld           m4, 12
    paddd           m3, m5                    ; -a3
    paddd           m1, m4                    ;  b3 + (1 << 8)
    pmovzxbd        m4, [dstq+r10]
    pmaddwd         m2, m4                    ; -a5 * src
    pmaddwd         m3, m4                    ; -a3 * src
    pslld           m4, 13
    psubd           m0, m4
    psubd           m1, m4
    psubd           m0, m2                    ; a5 * src + b5 + (1 << 8)
    psubd           m1, m3                    ; a3 * src + b3 + (1 << 8)
    psrld           m0, 9
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    psubd           m4, m6
    paddd           m0, m4
    psrad           m0, 13
    vextracti128   xm1, m0, 1
    packssdw       xm0, xm1
    packuswb       xm0, xm0
    movq    [dstq+r10], xm0
    add            r10, 8
    jl .n0_loop
    add           dstq, dst_strideq
    ret
ALIGN function_align
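; .n1 reuses the 565-weighted 5x5 sums written by .n0 (they cover both rows of
; the pair) and only advances the rotating 3x3 neighbor sums before producing
; the odd output row.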
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m3, [t3+r10*4+400*8+0]
    paddd           m3, [t3+r10*4+400*8+8]
    paddd           m5, m3, [t3+r10*4+400*8+4]
    paddd           m5, m5                    ; ab3[ 1] 222
    mova            m4, [t3+r10*4+400*20]
    paddd           m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd           m5, m5
    psubd           m5, m3                    ; ab3[ 1] 343
    mova [t3+r10*4+400*28], m5
    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
    por             m3, m6, m1
    psrld           m1, 12
    por             m5, m6, m4
    psrld           m4, 12
    paddd           m3, m5                    ; -a3
    paddd           m1, m4                    ;  b3 + (1 << 8)
    pmovzxbd        m4, [dstq+r10]
    pmaddwd         m2, m4, [t3+r10*4+400*12] ; -a5 * src
    mova            m0, [t3+r10*4+400*16]     ;  b5 + (1 << 7)
    pmaddwd         m3, m4                    ; -a3 * src
    pslld           m4, 12
    psubd           m0, m4
    paddd           m4, m4
    psubd           m1, m4
    psubd           m0, m2                    ; a5 * src + b5 + (1 << 7)
    psubd           m1, m3                    ; a3 * src + b3 + (1 << 8)
    psrld           m0, 8
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    psubd           m4, m6
    paddd           m0, m4
    psrad           m0, 13
    vextracti128   xm1, m0, 1
    packssdw       xm0, xm1
    packuswb       xm0, xm0
    movq    [dstq+r10], xm0
    add            r10, 8
    jl .n1_loop
    add           dstq, dst_strideq
    ret
%endif ; ARCH_X86_64