; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf3:    db  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
sgr_lshuf5:    db  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

pb_right_ext_mask: times 24 db 0xff
                   times 8 db 0
pb_1:          times 16 db 1
pb_3:          times 16 db 3
pw_256:        times 8 dw 256
pw_2056:       times 8 dw 2056
pw_m16380:     times 8 dw -16380
pd_4096:       times 4 dd 4096
pd_34816:      times 4 dd 34816
pd_0xffff:     times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7

cextern sgr_x_by_x

SECTION .text

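; movif64/movif32 assemble to a mov only on the named architecture, so
; shared code paths can carry register moves that only one ABI needs.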
%macro movif64 2 ; dst, src
 %if ARCH_X86_64
    mov             %1, %2
 %endif
%endmacro

%macro movif32 2 ; dst, src
 %if ARCH_X86_32
    mov             %1, %2
 %endif
%endmacro

%if ARCH_X86_32
 %define PIC_base_offset $$

 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
  %assign pic_reg_stk_off 4
  %xdefine PIC_reg %1
  %if %2 == 1
    mov        [esp], %1
  %endif
    LEA      PIC_reg, PIC_base_offset
  %if %3 == 1
    XCHG_PIC_REG
  %endif
 %endmacro

 %macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
 %endmacro

 %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)

%else
 %macro XCHG_PIC_REG 0
 %endmacro

 %define PIC_sym(sym)   (sym)
%endif
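; A brief sketch of the x86-32 PIC scheme above: SETUP_PIC points PIC_reg
; at $$ (the section start), so PIC_sym(sym) rewrites an absolute
; reference as [PIC_reg+(sym-$$)], e.g. with PIC_reg == r6:
;   mova m0, [PIC_sym(pw_2056)] ; -> mova m0, [r6+pw_2056-$$]
; XCHG_PIC_REG spills PIC_reg and reloads it from the other of two 4-byte
; stack slots so two users can share the register. On x86-64 the macros
; are no-ops, since RIP-relative addressing is available.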

%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt, x
    %define tmpstrideq strideq
    %define base 0
    mov           fltq, r6mp
    mov             wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    movq           m14, [fltq]
    add           lpfq, wq
    movq            m7, [fltq+16]
    add           dstq, wq
    lea             t1, [rsp+wq*2+16]
    mova           m15, [pw_2056]
    neg             wq
%if cpuflag(ssse3)
    pshufb         m14, [wiener_init]
    mova            m8, [wiener_shufA]
    pshufd         m12, m14, q2222  ; x0 x0
    mova            m9, [wiener_shufB]
    pshufd         m13, m14, q3333  ; x1 x2
    mova           m10, [wiener_shufC]
    punpcklqdq     m14, m14         ; x3
    mova           m11, [wiener_shufD]
%else
    mova           m10, [pw_m16380]
    punpcklwd      m14, m14
    pshufd         m11, m14, q0000 ; x0
    pshufd         m12, m14, q1111 ; x1
    pshufd         m13, m14, q2222 ; x2
    pshufd         m14, m14, q3333 ; x3
%endif
%else
DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
    %define m10         [base+wiener_shufC]
    %define m11         [base+wiener_shufD]
    %define stk_off     96
%else
    %define m10         [base+pw_m16380]
    %define m11         [stk+96]
    %define stk_off     112
%endif
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
    %define base        r6-pb_right_ext_mask-21
    %define stk         esp
    %define dstq        leftq
    %define edgeb       byte edged
    %define edged       [stk+ 8]
    %define dstmp       [stk+12]
    %define hd    dword [stk+16]
    %define wq          [stk+20]
    %define strideq     [stk+24]
    %define leftmp      [stk+28]
    %define t2          [stk+32]
    %define t4          [stk+36]
    %define t5          [stk+40]
    %define t6          [stk+44]
    %define m8          [base+wiener_shufA]
    %define m9          [base+wiener_shufB]
    %define m12         [stk+48]
    %define m13         [stk+64]
    %define m14         [stk+80]
    %define m15         [base+pw_2056]
    mov             r1, r6m ; flt
    mov             r0, r0m ; dst
    mov             r4, r4m ; w
    mov           lpfq, lpfm
    mov             r2, r7m ; edge
    mov             r5, r5m ; h
    movq            m3, [r1+ 0]
    movq            m7, [r1+16]
    add             r0, r4
    mov             r1, r1m ; stride
    add           lpfq, r4
    mov          edged, r2
    mov             r2, r2m ; left
    mov          dstmp, r0
    lea             t1, [rsp+r4*2+stk_off]
    mov             hd, r5
    neg             r4
    LEA             r6, pb_right_ext_mask+21
    mov             wq, r4
    mov        strideq, r1
    mov         leftmp, r2
    mov             r4, r1
%if cpuflag(ssse3)
    pshufb          m3, [base+wiener_init]
    pshufd          m1, m3, q2222
    pshufd          m2, m3, q3333
    punpcklqdq      m3, m3
%else
    punpcklwd       m3, m3
    pshufd          m0, m3, q0000
    pshufd          m1, m3, q1111
    pshufd          m2, m3, q2222
    pshufd          m3, m3, q3333
    mova           m11, m0
%endif
    mova           m12, m1
    mova           m13, m2
    mova           m14, m3
%endif
    psllw           m7, 5
    pshufd          m6, m7, q0000 ; y0 y1
    pshufd          m7, m7, q1111 ; y2 y3
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
    lea             t3, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    add             t3, tmpstrideq
    mov          [rsp], t3 ; below
    mov             t4, t1
    add             t1, 384*2
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.v1:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    RET
.no_top:
    lea             t3, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    lea             t3, [t3+tmpstrideq*2]
    mov          [rsp], t3
    call .h
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    jmp .v1
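; pads a row beyond its last valid pixel: m2 is loaded with the last
; valid byte ([lpfq-1]) broadcast to all lanes, and pb_right_ext_mask,
; indexed by the negative x offset, blends the valid bytes of m4/m5
; with that replicated edge pixel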
.extend_right:
    movd            m2, [lpfq-4]
%if ARCH_X86_64
    push            r0
    lea             r0, [pb_right_ext_mask+21]
    movu            m0, [r0+xq+0]
    movu            m1, [r0+xq+8]
    pop             r0
%else
    movu            m0, [r6+xq+0]
    movu            m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
    pshufb          m2, [base+pb_3]
%else
    punpcklbw       m2, m2
    pshuflw         m2, m2, q3333
    punpcklqdq      m2, m2
%endif
    pand            m4, m0
    pand            m5, m1
    pandn           m0, m2
    pandn           m1, m2
    por             m4, m0
    por             m5, m1
    ret
.h:
    %define stk esp+4 ; offset due to call
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn    leftq, leftmp
    mova            m4, [lpfq+xq]
    movd            m5, [leftq]
    add          leftq, 4
    pslldq          m4, 4
    por             m4, m5
    movifnidn   leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova            m4, [lpfq+xq]
    pshufb          m4, [base+wiener_l_shuf]
%else
    mova            m5, [lpfq+xq]
    pshufd          m4, m5, q2103
    punpcklbw       m5, m5
    punpcklwd       m5, m5
    movss           m4, m5
%endif
    jmp .h_main
.h_top:
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+xq-4]
.h_main:
    movu            m5, [lpfq+xq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             xd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
%macro %%h7 0
%if cpuflag(ssse3)
    pshufb          m0, m4, m8
    pmaddubsw       m0, m12
    pshufb          m1, m5, m8
    pmaddubsw       m1, m12
    pshufb          m2, m4, m9
    pmaddubsw       m2, m13
    pshufb          m3, m5, m9
    pmaddubsw       m3, m13
    paddw           m0, m2
    pshufb          m2, m4, m10
    pmaddubsw       m2, m13
    paddw           m1, m3
    pshufb          m3, m5, m10
    pmaddubsw       m3, m13
    pshufb          m4, m11
    paddw           m0, m2
    pmullw          m2, m14, m4
    pshufb          m5, m11
    paddw           m1, m3
    pmullw          m3, m14, m5
    psllw           m4, 7
    psllw           m5, 7
    paddw           m0, m2
    mova            m2, [base+pw_m16380]
    paddw           m1, m3
    paddw           m4, m2
    paddw           m5, m2
    paddsw          m0, m4
    paddsw          m1, m5
%else
    psrldq          m0, m4, 1
    pslldq          m1, m4, 1
    pxor            m3, m3
    punpcklbw       m0, m3
    punpckhbw       m1, m3
    paddw           m0, m1
    pmullw          m0, m11
    psrldq          m1, m4, 2
    pslldq          m2, m4, 2
    punpcklbw       m1, m3
    punpckhbw       m2, m3
    paddw           m1, m2
    pmullw          m1, m12
    paddw           m0, m1
    pshufd          m2, m4, q0321
    punpcklbw       m2, m3
    pmullw          m1, m14, m2
    paddw           m0, m1
    psrldq          m1, m4, 3
    pslldq          m4, 3
    punpcklbw       m1, m3
    punpckhbw       m4, m3
    paddw           m1, m4
    pmullw          m1, m13
    paddw           m0, m1
    psllw           m2, 7
    paddw           m2, m10
    paddsw          m0, m2
    psrldq          m1, m5, 1
    pslldq          m2, m5, 1
    punpcklbw       m1, m3
    punpckhbw       m2, m3
    paddw           m1, m2
    pmullw          m1, m11
    psrldq          m2, m5, 2
    pslldq          m4, m5, 2
    punpcklbw       m2, m3
    punpckhbw       m4, m3
    paddw           m2, m4
    pmullw          m2, m12
    paddw           m1, m2
    pshufd          m4, m5, q0321
    punpcklbw       m4, m3
    pmullw          m2, m14, m4
    paddw           m1, m2
    psrldq          m2, m5, 3
    pslldq          m5, 3
    punpcklbw       m2, m3
    punpckhbw       m5, m3
    paddw           m2, m5
    pmullw          m2, m13
    paddw           m1, m2
    psllw           m4, 7
    paddw           m4, m10
    paddsw          m1, m4
%endif
%endmacro
    %%h7
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m15
    paddw           m1, m15
    mova  [t1+xq*2+ 0], m0
    mova  [t1+xq*2+16], m1
    add             xq, 16
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn    leftq, leftmp
    mova            m4, [lpfq+xq]
    movd            m5, [leftq]
    add          leftq, 4
    pslldq          m4, 4
    por             m4, m5
    movifnidn   leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova            m4, [lpfq+xq]
    pshufb          m4, [base+wiener_l_shuf]
%else
    mova            m5, [lpfq+xq]
    pshufd          m4, m5, q2103
    punpcklbw       m5, m5
    punpcklwd       m5, m5
    movss           m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+xq-4]
.hv_main:
    movu            m5, [lpfq+xq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             xd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    %%h7
%if ARCH_X86_64
    mova            m2, [t4+xq*2]
    paddw           m2, [t2+xq*2]
%else
    mov             r2, t4
    mova            m2, [r2+xq*2]
    mov             r2, t2
    paddw           m2, [r2+xq*2]
    mov             r2, t5
%endif
    mova            m3, [t3+xq*2]
%if ARCH_X86_64
    mova            m5, [t5+xq*2]
%else
    mova            m5, [r2+xq*2]
    mov             r2, t6
%endif
    paddw           m5, [t1+xq*2]
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m15
    paddw           m1, m15
%if ARCH_X86_64
    paddw           m4, m0, [t6+xq*2]
%else
    paddw           m4, m0, [r2+xq*2]
    mov             r2, t4
%endif
    mova     [t0+xq*2], m0
    punpcklwd       m0, m2, m3
    pmaddwd         m0, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m6
    punpckhwd       m4, m5
    pmaddwd         m4, m6
    paddd           m0, m3
    mova            m3, [t3+xq*2+16]
    paddd           m4, m2
%if ARCH_X86_64
    mova            m2, [t4+xq*2+16]
    paddw           m2, [t2+xq*2+16]
    mova            m5, [t5+xq*2+16]
%else
    mova            m2, [r2+xq*2+16]
    mov             r2, t2
    paddw           m2, [r2+xq*2+16]
    mov             r2, t5
    mova            m5, [r2+xq*2+16]
    mov             r2, t6
%endif
    paddw           m5, [t1+xq*2+16]
    packuswb        m0, m4
%if ARCH_X86_64
    paddw           m4, m1, [t6+xq*2+16]
%else
    paddw           m4, m1, [r2+xq*2+16]
    mov           dstq, dstmp
%endif
    mova  [t0+xq*2+16], m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m6
    punpckhwd       m4, m5
    pmaddwd         m4, m6
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova     [dstq+xq], m0
    add             xq, 16
    jl .hv_loop
    add           dstq, strideq
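; rotate the ring buffer of row pointers; t0 deliberately ends up
; aliasing the new t6, which is safe because each [t6] load in .hv
; happens before the [t0] store at the same offset, so the oldest
; row's buffer is recycled in place as the scratch row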
%if ARCH_X86_64
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
%else
    mov          dstmp, dstq
    mov             r1, t5
    mov             r2, t4
    mov             t6, r1
    mov             t5, r2
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, r1
%endif
    ret
%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
.v:
    mov             xq, wq
.v_loop:
%if ARCH_X86_64
    mova            m1, [t4+xq*2]
    paddw           m1, [t2+xq*2]
%else
    mov             r2, t4
    mova            m1, [r2+xq*2]
    mov             r2, t2
    paddw           m1, [r2+xq*2]
    mov             r2, t6
%endif
    mova            m2, [t3+xq*2]
    mova            m4, [t1+xq*2]
%if ARCH_X86_64
    paddw           m3, m4, [t6+xq*2]
    paddw           m4, [t5+xq*2]
%else
    paddw           m3, m4, [r2+xq*2]
    mov             r2, t5
    paddw           m4, [r2+xq*2]
    mov             r2, t4
%endif
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m7
    punpckhwd       m1, m2
    pmaddwd         m1, m7
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m6
    punpckhwd       m3, m4
    pmaddwd         m3, m6
    paddd           m0, m2
    paddd           m1, m3
%if ARCH_X86_64
    mova            m2, [t4+xq*2+16]
    paddw           m2, [t2+xq*2+16]
%else
    mova            m2, [r2+xq*2+16]
    mov             r2, t2
    paddw           m2, [r2+xq*2+16]
    mov             r2, t6
%endif
    mova            m3, [t3+xq*2+16]
    mova            m5, [t1+xq*2+16]
%if ARCH_X86_64
    paddw           m4, m5, [t6+xq*2+16]
    paddw           m5, [t5+xq*2+16]
%else
    paddw           m4, m5, [r2+xq*2+16]
    mov             r2, t5
    paddw           m5, [r2+xq*2+16]
    movifnidn     dstq, dstmp
%endif
    packuswb        m0, m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m6
    punpckhwd       m4, m5
    pmaddwd         m4, m6
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova     [dstq+xq], m0
    add             xq, 16
    jl .v_loop
    add           dstq, strideq
%if ARCH_X86_64
    mov             t6, t5
    mov             t5, t4
%else
    mov          dstmp, dstq
    mov             r1, t5
    mov             r2, t4
    mov             t6, r1
    mov             t5, r2
%endif
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    ret
%endif

%if ARCH_X86_64
cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                  w, h, edge, flt, x
    mov           fltq, r6mp
    mov             wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    movq           m14, [fltq]
    add           lpfq, wq
    movq            m7, [fltq+16]
    add           dstq, wq
    mova            m8, [pw_m16380]
    lea             t1, [rsp+wq*2+16]
    mova           m15, [pw_2056]
    neg             wq
%if cpuflag(ssse3)
    pshufb         m14, [wiener_init]
    mova            m9, [wiener_shufB]
    pshufd         m13, m14, q3333  ; x1 x2
    mova           m10, [wiener_shufC]
    punpcklqdq     m14, m14         ; x3
    mova           m11, [wiener_shufD]
    mova           m12, [wiener_l_shuf]
%else
    punpcklwd      m14, m14
    pshufd         m11, m14, q1111 ; x1
    pshufd         m13, m14, q2222 ; x2
    pshufd         m14, m14, q3333 ; x3
%endif
%else
%if cpuflag(ssse3)
    %define stk_off     80
%else
    %define m11         [stk+80]
    %define stk_off     96
%endif
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
    %define stk         esp
    %define leftmp      [stk+28]
    %define m8          [base+pw_m16380]
    %define m12         [base+wiener_l_shuf]
    %define m14         [stk+48]
    mov             r1, r6m ; flt
    mov             r0, r0m ; dst
    mov             r4, r4m ; w
    mov           lpfq, lpfm
    mov             r2, r7m ; edge
    mov             r5, r5m ; h
    movq            m2, [r1+ 0]
    movq            m7, [r1+16]
    add             r0, r4
    mov             r1, r1m ; stride
    add           lpfq, r4
    mov          edged, r2
    mov             r2, r2m ; left
    mov          dstmp, r0
    lea             t1, [rsp+r4*2+stk_off]
    mov             hd, r5
    neg             r4
    LEA             r6, pb_right_ext_mask+21
    mov             wq, r4
    mov        strideq, r1
    mov         leftmp, r2
    mov             r4, r1
%if cpuflag(ssse3)
    pshufb          m2, [base+wiener_init]
    pshufd          m1, m2, q3333
    punpcklqdq      m2, m2
%else
    punpcklwd       m2, m2
    pshufd          m0, m2, q1111
    pshufd          m1, m2, q2222
    pshufd          m2, m2, q3333
    mova           m11, m0
%endif
    mova           m13, m1
    mova           m14, m2
%endif
    psllw           m7, 5
    pshufd          m6, m7, q0000 ; __ y1
    pshufd          m7, m7, q1111 ; y2 y3
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t4, t1
    add             t1, 384*2
    call .h_top
    lea             xq, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    mov             t3, t1
    add             t1, 384*2
    add             xq, tmpstrideq
    mov          [rsp], xq ; below
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, [rsp]
    call .hv_bottom
    add           lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea             t3, [lpfq+tmpstrideq*4]
    mov           lpfq, dstmp
    lea             t3, [t3+tmpstrideq*2]
    mov          [rsp], t3
    call .h
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    add           dstq, strideq
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    movifnidn    dstmp, dstq
.v1:
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    jmp .end
.h:
    %define stk esp+4
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn    leftq, leftmp
    mova            m4, [lpfq+xq]
    movd            m5, [leftq]
    add          leftq, 4
    pslldq          m4, 4
    por             m4, m5
    movifnidn   leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova            m4, [lpfq+xq]
    pshufb          m4, m12
%else
    mova            m5, [lpfq+xq]
    pshufd          m4, m5, q2103
    punpcklbw       m5, m5
    punpcklwd       m5, m5
    movss           m4, m5
%endif
    jmp .h_main
.h_top:
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+xq-4]
.h_main:
    movu            m5, [lpfq+xq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             xd, -17
    jl .h_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
%macro %%h5 0
%if cpuflag(ssse3)
    pshufb          m0, m4, m9
    pmaddubsw       m0, m13
    pshufb          m1, m5, m9
    pmaddubsw       m1, m13
    pshufb          m2, m4, m10
    pmaddubsw       m2, m13
    pshufb          m3, m5, m10
    pmaddubsw       m3, m13
    pshufb          m4, m11
    paddw           m0, m2
    pmullw          m2, m14, m4
    pshufb          m5, m11
    paddw           m1, m3
    pmullw          m3, m14, m5
    psllw           m4, 7
    psllw           m5, 7
    paddw           m4, m8
    paddw           m5, m8
    paddw           m0, m2
    paddw           m1, m3
    paddsw          m0, m4
    paddsw          m1, m5
%else
    psrldq          m0, m4, 2
    pslldq          m1, m4, 2
    pxor            m3, m3
    punpcklbw       m0, m3
    punpckhbw       m1, m3
    paddw           m0, m1
    pmullw          m0, m11
    pshufd          m2, m4, q0321
    punpcklbw       m2, m3
    pmullw          m1, m14, m2
    paddw           m0, m1
    psrldq          m1, m4, 3
    pslldq          m4, 3
    punpcklbw       m1, m3
    punpckhbw       m4, m3
    paddw           m1, m4
    pmullw          m1, m13
    paddw           m0, m1
    psllw           m2, 7
    paddw           m2, m8
    paddsw          m0, m2
    psrldq          m1, m5, 2
    pslldq          m4, m5, 2
    punpcklbw       m1, m3
    punpckhbw       m4, m3
    paddw           m1, m4
    pmullw          m1, m11
    pshufd          m4, m5, q0321
    punpcklbw       m4, m3
    pmullw          m2, m14, m4
    paddw           m1, m2
    psrldq          m2, m5, 3
    pslldq          m5, 3
    punpcklbw       m2, m3
    punpckhbw       m5, m3
    paddw           m2, m5
    pmullw          m2, m13
    paddw           m1, m2
    psllw           m4, 7
    paddw           m4, m8
    paddsw          m1, m4
%endif
%endmacro
    %%h5
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m15
    paddw           m1, m15
    mova  [t1+xq*2+ 0], m0
    mova  [t1+xq*2+16], m1
    add             xq, 16
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add           lpfq, strideq
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn    leftq, leftmp
    mova            m4, [lpfq+xq]
    movd            m5, [leftq]
    add          leftq, 4
    pslldq          m4, 4
    por             m4, m5
    movifnidn   leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova            m4, [lpfq+xq]
    pshufb          m4, m12
%else
    mova            m5, [lpfq+xq]
    pshufd          m4, m5, q2103
    punpcklbw       m5, m5
    punpcklwd       m5, m5
    movss           m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov             xq, wq
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+xq-4]
.hv_main:
    movu            m5, [lpfq+xq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             xd, -17
    jl .hv_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
    %%h5
    mova            m2, [t3+xq*2]
    paddw           m2, [t1+xq*2]
    psraw           m0, 3
    psraw           m1, 3
    paddw           m0, m15
    paddw           m1, m15
%if ARCH_X86_64
    mova            m3, [t2+xq*2]
    paddw           m4, m0, [t4+xq*2]
%else
    mov             r2, t2
    mova            m3, [r2+xq*2]
    mov             r2, t4
    paddw           m4, m0, [r2+xq*2]
%endif
    mova     [t0+xq*2], m0
    punpcklwd       m0, m2, m3
    pmaddwd         m0, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4, m4
    pmaddwd         m3, m6
    punpckhwd       m4, m4
    pmaddwd         m4, m6
    paddd           m0, m3
    paddd           m4, m2
    mova            m2, [t3+xq*2+16]
    paddw           m2, [t1+xq*2+16]
    packuswb        m0, m4
%if ARCH_X86_64
    mova            m3, [t2+xq*2+16]
    paddw           m4, m1, [t4+xq*2+16]
%else
    paddw           m4, m1, [r2+xq*2+16]
    mov             r2, t2
    mova            m3, [r2+xq*2+16]
    mov           dstq, dstmp
%endif
    mova  [t0+xq*2+16], m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4, m4
    pmaddwd         m3, m6
    punpckhwd       m4, m4
    pmaddwd         m4, m6
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova     [dstq+xq], m0
    add             xq, 16
    jl .hv_loop
    add           dstq, strideq
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
    movifnidn    dstmp, dstq
    ret
%if cpuflag(ssse3)
.v:
    mov             xq, wq
.v_loop:
    mova            m3, [t1+xq*2]
    paddw           m1, m3, [t3+xq*2]
%if ARCH_X86_64
    mova            m2, [t2+xq*2]
    paddw           m3, [t4+xq*2]
%else
    mov             r2, t2
    mova            m2, [r2+xq*2]
    mov             r2, t4
    paddw           m3, [r2+xq*2]
%endif
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m7
    punpckhwd       m1, m2
    pmaddwd         m1, m7
    punpcklwd       m2, m3
    pmaddwd         m2, m6
    punpckhwd       m3, m3
    pmaddwd         m3, m6
    paddd           m0, m2
    paddd           m1, m3
    mova            m4, [t1+xq*2+16]
    paddw           m2, m4, [t3+xq*2+16]
%if ARCH_X86_64
    mova            m3, [t2+xq*2+16]
    paddw           m4, [t4+xq*2+16]
%else
    paddw           m4, [r2+xq*2+16]
    mov             r2, t2
    mova            m3, [r2+xq*2+16]
    mov           dstq, dstmp
%endif
    packuswb        m0, m1
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m7
    punpckhwd       m2, m3
    pmaddwd         m2, m7
    punpcklwd       m3, m4
    pmaddwd         m3, m6
    punpckhwd       m4, m4
    pmaddwd         m4, m6
    paddd           m1, m3
    paddd           m2, m4
    packuswb        m1, m2
    psrlw           m0, 8
    psrlw           m1, 8
    packuswb        m0, m1
    mova     [dstq+xq], m0
    add             xq, 16
    jl .v_loop
    ret
%endif
%endmacro

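; The WIENER body is assembled twice, once per SIMD level; cpuflag(ssse3)
; selects the pshufb/pmaddubsw paths at assembly time, so SSE2 and SSSE3
; share a single source.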
INIT_XMM sse2
WIENER

INIT_XMM ssse3
WIENER

;;;;;;;;;;;;;;;;;;;;;;;;;;
;;      self-guided     ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;

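; SSE has no gather instruction, so GATHERDD emulates a per-lane lookup
; into the sgr_x_by_x table: each dword index is extracted with
; movd/pextrw and the table entries are merged back with pinsrw.
; GATHER_X_BY_X then shifts every gathered dword right by 24 and packs
; the two results into one register of words.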
%macro GATHERDD 3 ; dst, src, tmp
    movd           %3d, %2
 %if ARCH_X86_64
    movd            %1, [r13+%3]
    pextrw         %3d, %2, 2
    pinsrw          %1, [r13+%3+2], 3
    pextrw         %3d, %2, 4
    pinsrw          %1, [r13+%3+2], 5
    pextrw         %3d, %2, 6
    pinsrw          %1, [r13+%3+2], 7
 %else
    movd            %1, [base+sgr_x_by_x-0xf03+%3]
    pextrw          %3, %2, 2
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 3
    pextrw          %3, %2, 4
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 5
    pextrw          %3, %2, 6
    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 7
 %endif
%endmacro

%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
 %if ARCH_X86_64
  %define tmp r14
 %else
  %define tmp %4
 %endif
    GATHERDD        %1, %2, tmp
    GATHERDD        %2, %3, tmp
    movif32         %4, %5
    psrld           %1, 24
    psrld           %2, 24
    packssdw        %1, %2
%endmacro

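; MULLD emulates a 32-bit multiply, since pmulld requires SSE4.1. It
; relies on the multiplier being a 16-bit value replicated into both
; halves of each dword lane of %2: pmullw produces the low partial
; products and pmulhuw, shifted left by 16, supplies the carries into
; the upper half, which is exact modulo 2^32:
;   dst = lo16(a_lo*s) + (lo16(a_hi*s) << 16) + (hi16(a_lo*s) << 16)
;       = a*s (mod 2^32)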
%macro MULLD 3 ; dst, src, tmp
    pmulhuw         %3, %1, %2
    pmullw          %1, %2
    pslld           %3, 16
    paddd           %1, %3
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 0, 1, 2, 3, 5
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 5*16
 %else
  %assign extra_stack 3*16
 %endif
cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
                             dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm         dword [esp+calloff+16*0+4*6]
  %define stridemp     dword [esp+calloff+16*0+4*7]
  %define leftm        dword [esp+calloff+16*3+4*0]
  %define lpfm         dword [esp+calloff+16*3+4*1]
  %define w0m          dword [esp+calloff+16*3+4*2]
  %define hd           dword [esp+calloff+16*3+4*3]
  %define edgeb         byte [esp+calloff+16*3+4*4]
  %define edged        dword [esp+calloff+16*3+4*4]
  %define leftmp leftm
 %else
  %define w0m wm
  %define hd dword r5m
  %define edgeb  byte r7m
  %define edged dword r7m
 %endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t0m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define  m8 [base+pb_1]
 %define  m9 [esp+calloff+16*2]
 %define m10 [base+pd_0xf00800a4]
 %define m11 [base+sgr_lshuf5]
 %define m12 [base+pd_34816]
 %define m13 [base+pb_0to15]
 %define r10 r4
 %define base r6-$$
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov        strideq, [rstk+stack_offset+ 8]
    mov          leftq, [rstk+stack_offset+12]
    mov           lpfq, [rstk+stack_offset+16]
    mov             wd, [rstk+stack_offset+20]
    mov           dstm, dstq
    mov       stridemp, strideq
    mov          leftm, leftq
    mov             r1, [rstk+stack_offset+24]
    mov             r2, [rstk+stack_offset+32]
    mov           lpfm, lpfq
    mov             hd, r1
    mov          edged, r2
 %endif
%else
DECLARE_REG_TMP 8, 7, 9, 11, 12
cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
                                                    w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov             wd, wm
%endif
%if ARCH_X86_64
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x-0xf03]
    movifnidn       hd, hm
    mov          edged, r7m
    movu            m9, [paramsq]
    add           lpfq, wq
    mova            m8, [pb_1]
    lea             t1, [rsp+wq*2+20]
    mova           m10, [pd_0xf00800a4]
    add           dstq, wq
    lea             t3, [rsp+wq*4+400*12+16]
    mova           m12, [pd_34816]  ; (1 << 11) + (1 << 15)
    lea             t4, [rsp+wq*2+400*20+16]
    pshufhw         m7, m9, q0000
    pshufb          m9, [pw_256]  ; s0
    punpckhqdq      m7, m7        ; w0
    neg             wq
    mova           m13, [pb_0to15]
    pxor            m6, m6
    mova           m11, [sgr_lshuf5]
    psllw           m7, 4
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov             r1, [rstk+stack_offset+28] ; params
    LEA             r6, $$
    movu            m1, [r1]
    add           lpfm, wq
    lea             t1, [rsp+extra_stack+wq*2+20]
    add           dstq, wq
    lea             t3, [rsp+extra_stack+wq*4+400*12+16]
    mov           dstm, dstq
    lea             t4, [rsp+extra_stack+wq*2+400*20+16]
    mov            t3m, t3
    pshufhw         m7, m1, q0000
    mov            t4m, t4
    pshufb          m1, [base+pw_256] ; s0
    punpckhqdq      m7, m7            ; w0
    psllw           m7, 4
    neg             wq
    mova            m9, m1
    pxor            m6, m6
    mov            w1m, wd
    sub             wd, 2
    mov           lpfq, lpfm
    mov            w0m, wd
 %define strideq r5
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, stridemp
    movif32        t2m, t1
    mov             t2, t1
    call .top_fixup
    add             t1, 400*6
    call .h_top
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov           lpfm, r10 ; below
    movif32        t0m, t2
    mov             t0, t2
    dec             hd
    jz .height1
    or           edged, 16
    call .h
.main:
    add           lpfq, stridemp
    movif32         t4, t4m
    call .hv
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
%if ARCH_X86_64
    test            hb, hb
%else
    mov             r4, hd
    test            r4, r4
%endif
    jz .odd_height
    call .h
    add           lpfq, stridemp
    call .hv
    movif32       dstq, dstm
    call .n0
    call .n1
    sub             hd, 2
    movif32         t0, t0m
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, lpfm
    call .h_top
    add           lpfq, stridemp
    call .hv_bottom
.end:
    movif32       dstq, dstm
    call .n0
    call .n1
.end2:
    RET
.height1:
    movif32         t4, t4m
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    movif32       dstq, dstm
    call .n0
    call .n1
.odd_height_end:
    call .v
    movif32       dstq, dstm
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
    lea             t2, [t1+400*6]
    movif32        t2m, t2
    call .top_fixup
    dec             hd
    jz .no_top_height1
    or           edged, 16
    mov             t0, t1
    mov             t1, t2
    movif32        t0m, t0
    jmp .main
.no_top_height1:
    movif32         t3, t3m
    movif32         t4, t4m
    call .v
    call .prep_n
    jmp .odd_height_end
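; replaces the bytes past the last valid pixel: a per-byte mask compares
; each lane index (pb_0to15) against the remaining width derived from wd,
; keeping the valid bytes of m5 and substituting the last valid pixel
; ([lpfq-1]) broadcast to all lanes elsewhere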
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movd            m1, wd
    movd            m3, [lpfq-1]
    pshufb          m1, m6
    pshufb          m3, m6
    psubb           m2, m8, m1
    pcmpgtb         m2, m13
    pand            m5, m2
    pandn           m2, m3
    por             m5, m2
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
 %define leftq r4
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 13
    jmp .h_main
.h_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, m11
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea             wq, [r4-2]
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32         wq, w0m
.h_loop:
    movu            m5, [lpfq+wq-1]
.h_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -10
    jl .h_have_right
    call .extend_right
.h_have_right:
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    palignr         m2, m5, m4, 2
    paddw           m0, m4, m2
    palignr         m3, m5, m4, 6
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    palignr         m5, m4, 8
    paddw           m0, m5
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    paddd           m1, m3
    punpckhwd       m3, m4, m5
    pmaddwd         m3, m3
    shufps          m4, m5, q2121
    paddw           m0, m4             ; sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m2, m3
    test         edgeb, 16             ; y > 0
    jz .h_loop_end
    paddw           m0, [t1+wq*2+400*0]
    paddd           m1, [t1+wq*2+400*2]
    paddd           m2, [t1+wq*2+400*4]
.h_loop_end:
    paddd           m1, m5             ; sumsq
    paddd           m2, m4
    mova [t1+wq*2+400*0], m0
    mova [t1+wq*2+400*2], m1
    mova [t1+wq*2+400*4], m2
    add             wq, 8
    jl .h_loop
    ret
.top_fixup:
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+wq*2+400*0]
    mova            m1, [t1+wq*2+400*2]
    mova            m2, [t1+wq*2+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m1
    mova [t2+wq*2+400*4], m2
    add             wq, 8
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 13
    jmp .hv_main
.hv_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, m11
    jmp .hv_main
.hv_bottom:
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv_loop_start
%endif
.hv_loop:
    movif32       lpfq, hvsrcm
.hv_loop_start:
    movu            m5, [lpfq+wq-1]
.hv_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             wd, -10
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    movif32         t3, hd
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 8
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4            ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5            ; h sumsq
    paddd           m3, m4
    paddw           m1, m0, [t1+wq*2+400*0]
    paddd           m4, m2, [t1+wq*2+400*2]
    paddd           m5, m3, [t1+wq*2+400*4]
%if ARCH_X86_64
    test            hd, hd
%else
    test            t3, t3
%endif
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+wq*2+400*0] ; hv sum
    paddd           m4, [t2+wq*2+400*2] ; hv sumsq
    paddd           m5, [t2+wq*2+400*4]
    mova [t0+wq*2+400*0], m0
    pslld           m0, m4, 4
    mova [t0+wq*2+400*2], m2
    mova [t0+wq*2+400*4], m3
    pslld           m2, m4, 3
    paddd           m4, m0
    pslld           m0, m5, 4
    paddd           m4, m2             ; a * 25
    pslld           m2, m5, 3
    paddd           m5, m0
    paddd           m5, m2
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaddwd         m2, m0, m0         ; b * b
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m2         ; p * s
    MULLD           m5, m9, m2
    pmaddwd         m0, m10            ; b * 164
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, t2, t2m
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m2
    MULLD           m1, m5, m2
    paddd           m0, m12            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m12
    mova   [t4+wq*2+4], m3
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova  [t3+wq*4+ 8], m0
    mova  [t3+wq*4+24], m1
    add             wq, 8
    jl .hv_loop
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    movif32        t2m, t2
    movif32        t0m, t0
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+wq*2+400*0], m1
    paddw             m1, m0
    mova [t1+wq*2+400*2], m4
    paddd             m4, m2
    mova [t1+wq*2+400*4], m5
    paddd             m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v_loop:
    mova            m0, [t1+wq*2+400*0]
    mova            m2, [t1+wq*2+400*2]
    mova            m3, [t1+wq*2+400*4]
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m4, m2, [t2+wq*2+400*2]
    paddd           m5, m3, [t2+wq*2+400*4]
    paddw           m0, m0
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0             ; hv sum
    paddd           m4, m2             ; hv sumsq
    pslld           m0, m4, 4
    paddd           m5, m3
    pslld           m2, m4, 3
    paddd           m4, m0
    pslld           m0, m5, 4
    paddd           m4, m2             ; a * 25
    pslld           m2, m5, 3
    paddd           m5, m0
    paddd           m5, m2
    punpcklwd       m0, m1, m6
    punpckhwd       m1, m6
    pmaddwd         m2, m0, m0         ; b * b
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m2         ; p * s
    MULLD           m5, m9, m2
    pmaddwd         m0, m10            ; b * 164
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, t2, t2m
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m2
    MULLD           m1, m5, m2
    paddd           m0, m12            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m12
    mova   [t4+wq*2+4], m3
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova  [t3+wq*4+ 8], m0
    mova  [t3+wq*4+24], m1
    add             wq, 8
    jl .v_loop
    ret
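; the "565" in the comments below: m3 gathers the 3-tap neighbour sum
; (left+center+right), and center + sum + 4*sum distributes to the
; weights 5, 6, 5 (i.e. 5*left + 6*center + 5*right), applied to both
; the box sums (a) and the weighted sums (b)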
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*2+ 2]
    movu            m3, [t4+wq*2+ 4]
    movu            m1, [t3+wq*4+ 4]
    movu            m4, [t3+wq*4+ 8]
    movu            m2, [t3+wq*4+20]
    movu            m5, [t3+wq*4+24]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    paddw           m3, [t4+wq*2+ 0]
    paddd           m4, [t3+wq*4+ 0]
    paddd           m5, [t3+wq*4+16]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    mova [t4+wq*2+400*2+ 0], m0
    mova [t3+wq*4+400*4+ 0], m1
    mova [t3+wq*4+400*4+16], m2
    add             wq, 8
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m0, [t4+wq*2+ 2]
    movu            m3, [t4+wq*2+ 4]
    movu            m1, [t3+wq*4+ 4]
    movu            m4, [t3+wq*4+ 8]
    movu            m2, [t3+wq*4+20]
    movu            m5, [t3+wq*4+24]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    paddw           m3, [t4+wq*2+ 0]
    paddd           m4, [t3+wq*4+ 0]
    paddd           m5, [t3+wq*4+16]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    paddw           m3, m0, [t4+wq*2+400*2+ 0]
    paddd           m4, m1, [t3+wq*4+400*4+ 0]
    paddd           m5, m2, [t3+wq*4+400*4+16]
    mova [t4+wq*2+400*2+ 0], m0
    mova [t3+wq*4+400*4+ 0], m1
    mova [t3+wq*4+400*4+16], m2
    movq            m0, [dstq+wq]
    punpcklbw       m0, m6
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2              ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    packuswb        m0, m0
    movq     [dstq+wq], m0
    add             wq, 8
    jl .n0_loop
    add           dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movq            m0, [dstq+wq]
    mova            m3, [t4+wq*2+400*2+ 0]
    mova            m4, [t3+wq*4+400*4+ 0]
    mova            m5, [t3+wq*4+400*4+16]
    punpcklbw       m0, m6
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2              ; b - a * src + (1 << 7)
    psubd           m5, m3
    psrad           m4, 8
    psrad           m5, 8
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    packuswb        m0, m0
    movq     [dstq+wq], m0
    add             wq, 8
    jl .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret

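; The 3x3 (radius 1) self-guided filter mirrors the 5x5 variant above,
; but with 3-pixel box sums, the s1/w1 parameter pair, and rows handled
; in hv0/hv1 pairs.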
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 4*16
 %else
  %assign extra_stack 2*16
 %endif
cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
                             dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm         dword [esp+calloff+16*2+4*0]
  %define stridemp     dword [esp+calloff+16*2+4*1]
  %define leftm        dword [esp+calloff+16*2+4*2]
  %define lpfm         dword [esp+calloff+16*2+4*3]
  %define w0m          dword [esp+calloff+16*2+4*4]
  %define hd           dword [esp+calloff+16*2+4*5]
  %define edgeb         byte [esp+calloff+16*2+4*6]
  %define edged        dword [esp+calloff+16*2+4*6]
  %define leftmp leftm
 %else
  %define w0m wm
  %define hd dword r5m
  %define edgeb  byte r7m
  %define edged dword r7m
 %endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t3m    dword [esp+calloff+4*2]
 %define t4m    dword [esp+calloff+4*3]
 %define  m8 [base+pb_0to15]
 %define  m9 [esp+calloff+16*1]
 %define m10 [base+pd_0xf00801c7]
 %define m11 [base+pd_34816]
 %define m12 m6
 %define m13 [base+sgr_lshuf3]
 %define base r6-$$
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov        strideq, [rstk+stack_offset+ 8]
    mov          leftq, [rstk+stack_offset+12]
    mov           lpfq, [rstk+stack_offset+16]
    mov             wd, [rstk+stack_offset+20]
    mov           dstm, dstq
    mov       stridemp, strideq
    mov          leftm, leftq
    mov             r1, [rstk+stack_offset+24]
    mov             r2, [rstk+stack_offset+32]
    mov           lpfm, lpfq
    mov             hd, r1
    mov          edged, r2
 %endif
%else
cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
                                                   w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov             wd, wm
%endif
%if ARCH_X86_64
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x-0xf03]
    mov             hd, hm
    mov          edged, r7m
    movq            m9, [paramsq+4]
    add           lpfq, wq
    lea             t1, [rsp+wq*2+12]
    mova            m8, [pb_0to15]
    add           dstq, wq
    lea             t3, [rsp+wq*4+400*12+8]
    mova           m10, [pd_0xf00801c7]
    lea             t4, [rsp+wq*2+400*32+8]
    mova           m11, [pd_34816]
    pshuflw         m7, m9, q3333
    pshufb          m9, [pw_256]  ; s1
    punpcklqdq      m7, m7        ; w1
    neg             wq
    pxor            m6, m6
    mova           m13, [sgr_lshuf3]
    psllw           m7, 4
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov             r1, [rstk+stack_offset+28] ; params
    LEA             r6, $$
    movq            m1, [r1+4]
    add           lpfm, wq
    lea             t1, [rsp+extra_stack+wq*2+20]
    add           dstq, wq
    lea             t3, [rsp+extra_stack+wq*4+400*12+16]
    mov           dstm, dstq
    lea             t4, [rsp+extra_stack+wq*2+400*32+16]
    mov            t3m, t3
    pshuflw         m7, m1, q3333
    mov            t4m, t4
    pshufb          m1, [base+pw_256] ; s1
    punpcklqdq      m7, m7            ; w1
    psllw           m7, 4
    neg             wq
    mova            m9, m1
    pxor            m6, m6
    mov            w1m, wd
    sub             wd, 2
    mov           lpfq, lpfm
    mov            w0m, wd
 %define strideq r5
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, stridemp
    mov             t2, t1
    add             t1, 400*6
    call .h_top
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov           lpfm, r10 ; below
    movif32         t4, t4m
    call .hv0
.main:
    dec             hd
    jz .height1
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1
    call .prep_n
    sub             hd, 2
1924    jl .extend_bottom
1925.main_loop:
1926    movif32       lpfq, hvsrcm
1927    add           lpfq, stridemp
1928    call .hv0
1929%if ARCH_X86_64
    test            hd, hd
%else
    mov             r4, hd
    test            r4, r4
%endif
    jz .odd_height
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, lpfm
    call .hv0_bottom
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wq, w0m
    mov         hvsrcm, lpfq
%endif
    lea             t2, [t1+400*6]
.top_fixup_loop:
    mova            m0, [t1+wq*2+400*0]
    mova            m1, [t1+wq*2+400*2]
    mova            m2, [t1+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m1
    mova [t2+wq*2+400*4], m2
    add             wq, 8
    jl .top_fixup_loop
    movif32         t3, t3m
    movif32         t4, t4m
    call .v0
    jmp .main
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movd            m0, [lpfq-1]
    movd            m1, wd
    mova            m3, m8
    pshufb          m0, m6
    pshufb          m1, m6
    mova            m2, m6
    psubb           m2, m1
    pcmpgtb         m2, m3
    pand            m5, m2
    pandn           m2, m0
    por             m5, m2
    ret
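; Right-edge padding sketch (illustrative names; rem = -wd is the number
; of valid pixels left in the row):
;   mask[i] = (rem > i) ? 0xff : 0x00    ; psubb + pcmpgtb vs. pb_0to15
;   m5[i]   = mask[i] ? m5[i] : lpf[-1]  ; pand/pandn/por blend
; i.e. lanes past the right edge are filled with the last valid pixel.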
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
 %define leftq r4
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 14
    jmp .h_main
.h_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, m13
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea             wq, [r4-2]
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32         wq, w0m
.h_loop:
    movu            m5, [lpfq+wq]
.h_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -9
    jl .h_have_right
    call .extend_right
.h_have_right:
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    mova [t1+wq*2+400*0], m1
    mova [t1+wq*2+400*2], m2
    mova [t1+wq*2+400*4], m3
    add             wq, 8
    jl .h_loop
    ret
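; Conceptually, .h computes per column x (names are illustrative):
;   sum[x]   = px[x-1] + px[x] + px[x+1]
;   sumsq[x] = px[x-1]^2 + px[x]^2 + px[x+1]^2
; the squares are formed two taps at a time: punpck*wd interleaves
; neighboring words and pmaddwd of the result with itself yields
; a^2 + b^2 in each dword lane.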
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 14
    jmp .hv0_main
.hv0_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, m13
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32       lpfq, hvsrcm
.hv0_loop_start:
    movu            m5, [lpfq+wq]
.hv0_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp             wd, -9
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    paddw           m0, m1, [t1+wq*2+400*0]
    paddd           m4, m2, [t1+wq*2+400*2]
    paddd           m5, m3, [t1+wq*2+400*4]
    mova [t1+wq*2+400*0], m1
    mova [t1+wq*2+400*2], m2
    mova [t1+wq*2+400*4], m3
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m2, m4, [t2+wq*2+400*2]
    paddd           m3, m5, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m4
    mova [t2+wq*2+400*4], m5
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m6         ; b
    pmaddwd         m2, m0, m0         ; b * b
    punpckhwd       m1, m6
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m12        ; p * s
    MULLD           m5, m9, m12
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova   [t4+wq*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova  [t3+wq*4+ 8], m0
    mova  [t3+wq*4+24], m1
    add             wq, 8
    jl .hv0_loop
    ret
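; The ab step above follows the reference C fairly closely; a rough
; sketch for the 3x3 box (n = 9, s = the per-plane sgr strength):
;   p = imax(sumsq * 9 - sum * sum, 0)
;   z = (p * s + (1 << 19)) >> 20            ; capped via paddusw
;   x = sgr_x_by_x[imin(z, 255)]             ; GATHER_X_BY_X
;   b = (x * sum * 455 + (1 << 11) + (1 << 15)) >> 12
; the biased table pointer (r13 = sgr_x_by_x-0xf03 on x86-64) together
; with the saturating add stands in for an explicit clamp.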
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 14
    jmp .hv1_main
.hv1_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, m13
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32       lpfq, hvsrcm
.hv1_loop_start:
    movu            m5, [lpfq+wq]
.hv1_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp             wd, -9
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5             ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1             ; h sumsq
    paddd           m3, m5
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m4, m2, [t2+wq*2+400*2]
    paddd           m5, m3, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m2
    mova [t2+wq*2+400*4], m3
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2             ; a * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m6         ; b
    pmaddwd         m2, m0, m0         ; b * b
    punpckhwd       m1, m6
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m12        ; p * s
    MULLD           m5, m9, m12
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    movif32         t3, t3m
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova [t4+wq*2+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add             wq, 8
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
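; t1/t2 form a two-row ring buffer for the vertical sums; the pointer
; swap after each odd row keeps t2 pointing at the older row sums when
; the next even row arrives.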
.v0: ; vertical boxsums + ab (even rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq*2+400*0]
    mova            m4, [t1+wq*2+400*2]
    mova            m5, [t1+wq*2+400*4]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m2, m4, [t2+wq*2+400*2]
    paddd           m3, m5, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m4
    mova [t2+wq*2+400*4], m5
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m6         ; b
    pmaddwd         m2, m0, m0         ; b * b
    punpckhwd       m1, m6
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m12        ; p * s
    MULLD           m5, m9, m12
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova   [t4+wq*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova  [t3+wq*4+ 8], m0
    mova  [t3+wq*4+24], m1
    add             wq, 8
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v1_loop:
    mova            m0, [t1+wq*2+400*0]
    mova            m4, [t1+wq*2+400*2]
    mova            m5, [t1+wq*2+400*4]
    paddw           m1, m0, [t2+wq*2+400*0]
    paddd           m2, m4, [t2+wq*2+400*2]
    paddd           m3, m5, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m4
    mova [t2+wq*2+400*4], m5
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m6         ; b
    pmaddwd         m2, m0, m0         ; b * b
    punpckhwd       m1, m6
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p
    psubd           m5, m3
    MULLD           m4, m9, m12        ; p * s
    MULLD           m5, m9, m12
    pmaddwd         m0, m10            ; b * 455
    pmaddwd         m1, m10
    paddusw         m4, m10
    paddusw         m5, m10
    psrld           m4, 20             ; min(z, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m12
    MULLD           m1, m5, m12
%if ARCH_X86_32
    pxor            m6, m6
%endif
    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m11
    mova [t4+wq*2+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add             wq, 8
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
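; .v0/.v1 mirror .hv0/.hv1 for the case where no new source row is read
; (bottom edge, odd heights): .v0 doubles the buffered row sums so the
; last available row is effectively counted twice.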
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*2+400*0+ 4]
    movu            m1, [t3+wq*4+400*0+ 8]
    movu            m2, [t3+wq*4+400*0+24]
    movu            m3, [t4+wq*2+400*0+ 2]
    movu            m4, [t3+wq*4+400*0+ 4]
    movu            m5, [t3+wq*4+400*0+20]
    paddw           m0, [t4+wq*2+400*0+ 0]
    paddd           m1, [t3+wq*4+400*0+ 0]
    paddd           m2, [t3+wq*4+400*0+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                ; a[-1] 444
    pslld           m4, 2                ; b[-1] 444
    pslld           m5, 2
    psubw           m3, m0               ; a[-1] 343
    psubd           m4, m1               ; b[-1] 343
    psubd           m5, m2
    mova [t4+wq*2+400*4], m3
    mova [t3+wq*4+400*8+ 0], m4
    mova [t3+wq*4+400*8+16], m5
    movu            m0, [t4+wq*2+400*2+ 4]
    movu            m1, [t3+wq*4+400*4+ 8]
    movu            m2, [t3+wq*4+400*4+24]
    movu            m3, [t4+wq*2+400*2+ 2]
    movu            m4, [t3+wq*4+400*4+ 4]
    movu            m5, [t3+wq*4+400*4+20]
    paddw           m0, [t4+wq*2+400*2+ 0]
    paddd           m1, [t3+wq*4+400*4+ 0]
    paddd           m2, [t3+wq*4+400*4+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                 ; a[ 0] 444
    pslld           m4, 2                 ; b[ 0] 444
    pslld           m5, 2
    mova [t4+wq*2+400* 6], m3
    mova [t3+wq*4+400*12+ 0], m4
    mova [t3+wq*4+400*12+16], m5
    psubw           m3, m0                ; a[ 0] 343
    psubd           m4, m1                ; b[ 0] 343
    psubd           m5, m2
    mova [t4+wq*2+400* 8], m3
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m5
    add             wq, 8
    jl .prep_n_loop
    ret
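; The 343/444 values are the row-weighting terms of the 3x3 filter,
; derived from the horizontal triple h = a[x-1] + a[x] + a[x+1]:
;   "444" = 4*h
;   "343" = 4*h - (a[x-1] + a[x+1]) = 3*a[x-1] + 4*a[x] + 3*a[x+1]
; neighboring rows are then combined in a 343/444/343 pattern by the
; .n0/.n1 passes below.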
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r4
    movif32         wd, w1m
.n0_loop:
    movu            m3, [t4+wq*2+400*0+4]
    movu            m1, [t4+wq*2+400*0+2]
    paddw           m3, [t4+wq*2+400*0+0]
    paddw           m1, m3
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+wq*2+400*4]
    paddw           m3, [t4+wq*2+400*6]
    mova [t4+wq*2+400*4], m2
    mova [t4+wq*2+400*6], m1
    movu            m4, [t3+wq*4+400*0+8]
    movu            m1, [t3+wq*4+400*0+4]
    paddd           m4, [t3+wq*4+400*0+0]
    paddd           m1, m4
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+wq*4+400* 8+ 0]
    paddd           m4, [t3+wq*4+400*12+ 0]
    mova [t3+wq*4+400* 8+ 0], m2
    mova [t3+wq*4+400*12+ 0], m1
    movu            m5, [t3+wq*4+400*0+24]
    movu            m1, [t3+wq*4+400*0+20]
    paddd           m5, [t3+wq*4+400*0+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*4+400* 8+16]
    paddd           m5, [t3+wq*4+400*12+16]
    mova [t3+wq*4+400* 8+16], m2
    mova [t3+wq*4+400*12+16], m1
    movq            m0, [dstq+wq]
    punpcklbw       m0, m6
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2               ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    packuswb        m0, m0
    movq     [dstq+wq], m0
    add             wq, 8
    jl .n0_loop
    add           dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*2+400*2+4]
    movu            m1, [t4+wq*2+400*2+2]
    paddw           m3, [t4+wq*2+400*2+0]
    paddw           m1, m3
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+wq*2+400*6]
    paddw           m3, [t4+wq*2+400*8]
    mova [t4+wq*2+400*6], m1
    mova [t4+wq*2+400*8], m2
    movu            m4, [t3+wq*4+400*4+8]
    movu            m1, [t3+wq*4+400*4+4]
    paddd           m4, [t3+wq*4+400*4+0]
    paddd           m1, m4
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+wq*4+400*12+ 0]
    paddd           m4, [t3+wq*4+400*16+ 0]
    mova [t3+wq*4+400*12+ 0], m1
    mova [t3+wq*4+400*16+ 0], m2
    movu            m5, [t3+wq*4+400*4+24]
    movu            m1, [t3+wq*4+400*4+20]
    paddd           m5, [t3+wq*4+400*4+16]
    paddd           m1, m5
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+wq*4+400*12+16]
    paddd           m5, [t3+wq*4+400*16+16]
    mova [t3+wq*4+400*12+16], m1
    mova [t3+wq*4+400*16+16], m2
    movq            m0, [dstq+wq]
    punpcklbw       m0, m6
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    psubd           m4, m2               ; b - a * src + (1 << 8)
    psubd           m5, m3
    psrad           m4, 9
    psrad           m5, 9
    packssdw        m4, m5
    pmulhrsw        m4, m7
    paddw           m0, m4
    packuswb        m0, m0
    movq     [dstq+wq], m0
    add             wq, 8
    jl .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret
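; Unlike the 5x5 filter, the 3x3 filter derives a fresh a/b pair on
; every row, so .n0 and .n1 share the same (1 << 8) bias and 9-bit
; shift; they differ only in which neighbor buffers they read and
; rotate.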

%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign extra_stack 10*16
 %else
  %assign extra_stack 8*16
 %endif
cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
                             dst, stride, left, lpf, w
 %if STACK_ALIGNMENT < 16
  %define dstm         dword [esp+calloff+16*8+4*0]
  %define stridemp     dword [esp+calloff+16*8+4*1]
  %define leftm        dword [esp+calloff+16*8+4*2]
  %define lpfm         dword [esp+calloff+16*8+4*3]
  %define w0m          dword [esp+calloff+16*8+4*4]
  %define hd           dword [esp+calloff+16*8+4*5]
  %define edgeb         byte [esp+calloff+16*8+4*6]
  %define edged        dword [esp+calloff+16*8+4*6]
  %define leftmp leftm
 %else
  %define w0m wm
  %define hd dword r5m
  %define edgeb  byte r7m
  %define edged dword r7m
 %endif
 %define hvsrcm dword [esp+calloff+4*0]
 %define w1m    dword [esp+calloff+4*1]
 %define t3m    dword [esp+calloff+4*2]
 %define t4m    dword [esp+calloff+4*3]
 %xdefine m8 m6
 %define  m9 [base+pd_0xffff]
 %define m10 [base+pd_34816]
 %define m11 [base+pd_0xf00801c7]
 %define m12 [base+pd_0xf00800a4]
 %define m13 [esp+calloff+16*4]
 %define m14 [esp+calloff+16*5]
 %define m15 [esp+calloff+16*6]
 %define  m6 [esp+calloff+16*7]
 %define base r6-$$
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov        strideq, [rstk+stack_offset+ 8]
    mov          leftq, [rstk+stack_offset+12]
    mov           lpfq, [rstk+stack_offset+16]
    mov             wd, [rstk+stack_offset+20]
    mov           dstm, dstq
    mov       stridemp, strideq
    mov          leftm, leftq
    mov             r1, [rstk+stack_offset+24]
    mov             r2, [rstk+stack_offset+32]
    mov           lpfm, lpfq
    mov             hd, r1
    mov          edged, r2
 %endif
%else
cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
                                                    w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov             wd, wm
%endif
%if ARCH_X86_64
    mov        paramsq, r6mp
    lea            r13, [sgr_x_by_x-0xf03]
    movifnidn       hd, hm
    mov          edged, r7m
    mova           m15, [paramsq]
    add           lpfq, wq
    mova            m9, [pd_0xffff]
    lea             t1, [rsp+wq*2+44]
    mova           m10, [pd_34816]
    add           dstq, wq
    lea             t3, [rsp+wq*4+400*24+40]
    mova           m11, [pd_0xf00801c7]
    lea             t4, [rsp+wq*2+400*52+40]
    mova           m12, [base+pd_0xf00800a4]
    neg             wq
    pshuflw        m13, m15, q0000
    pshuflw        m14, m15, q2222
    pshufhw        m15, m15, q1010
    punpcklqdq     m13, m13 ; s0
    punpcklqdq     m14, m14 ; s1
    punpckhqdq     m15, m15 ; w0 w1
    pxor            m6, m6
    psllw          m15, 2
 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
 %define lpfm [rsp]
%else
    mov             r1, [rstk+stack_offset+28] ; params
    LEA             r6, $$
    mova            m2, [r1]
    add           lpfm, wq
    lea             t1, [rsp+extra_stack+wq*2+52]
    add           dstq, wq
    lea             t3, [rsp+extra_stack+wq*4+400*24+48]
    mov           dstm, dstq
    lea             t4, [rsp+extra_stack+wq*2+400*52+48]
    mov            t3m, t3
    mov            t4m, t4
    neg             wq
    pshuflw         m0, m2, q0000
    pshuflw         m1, m2, q2222
    pshufhw         m2, m2, q1010
    punpcklqdq      m0, m0 ; s0
    punpcklqdq      m1, m1 ; s1
    punpckhqdq      m2, m2 ; w0 w1
    mov            w1m, wd
    pxor            m3, m3
    psllw           m2, 2
    mova           m13, m0
    mova           m14, m1
    sub             wd, 2
    mova           m15, m2
    mova            m6, m3
    mov           lpfq, lpfm
    mov            w0m, wd
 %define strideq r5
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, stridemp
    mov             t2, t1
%if ARCH_X86_64
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
%else
    mov             wq, w0m
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
%endif
    add             t1, 400*12
    call .h_top
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov           lpfm, r10 ; below
    movif32         t4, t4m
    call .hv0
.main:
    dec             hd
    jz .height1
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv0
%if ARCH_X86_64
    test            hd, hd
%else
    mov             r4, hd
    test            r4, r4
%endif
    jz .odd_height
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, lpfm
    call .hv0_bottom
    movif32       lpfq, hvsrcm
    add           lpfq, stridemp
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    movif32    strideq, stridemp
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov           lpfm, r10
    call .h
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wq, w0m
    mov         hvsrcm, lpfq
%endif
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+wq*2+400* 0]
    mova            m1, [t1+wq*2+400* 2]
    mova            m2, [t1+wq*2+400* 4]
    paddw           m0, m0
    mova            m3, [t1+wq*2+400* 6]
    paddd           m1, m1
    mova            m4, [t1+wq*2+400* 8]
    paddd           m2, m2
    mova            m5, [t1+wq*2+400*10]
    mova [t2+wq*2+400* 0], m0
    mova [t2+wq*2+400* 2], m1
    mova [t2+wq*2+400* 4], m2
    mova [t2+wq*2+400* 6], m3
    mova [t2+wq*2+400* 8], m4
    mova [t2+wq*2+400*10], m5
    add             wq, 8
    jl .top_fixup_loop
    movif32         t3, t3m
    movif32         t4, t4m
    call .v0
    jmp .main
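; Buffer layout in t1/t2 for the mix filter (both box sizes per row):
;   [t?+wq*2+400* 0 / * 2 / * 4]  sum5, sumsq5 (lo), sumsq5 (hi)
;   [t?+wq*2+400* 6 / * 8 / *10]  sum3, sumsq3 (lo), sumsq3 (hi)
; hence the top fixup copies six planes, doubling only the 5-tap ones.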
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
%if ARCH_X86_64
    SWAP            m8, m6
%endif
    movd            m1, wd
    movd            m3, [lpfq-1]
    pshufb          m1, m8
    pshufb          m3, m8
    psubb           m2, [base+pb_1], m1
    pcmpgtb         m2, [base+pb_0to15]
    pand            m5, m2
    pandn           m2, m3
    por             m5, m2
%if ARCH_X86_64
    SWAP            m6, m8
%endif
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
 %define leftq r4
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 13
    jmp .h_main
.h_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, [base+sgr_lshuf5]
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea             wq, [r4-2]
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32         wq, w0m
.h_loop:
    movu            m5, [lpfq+wq-1]
.h_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor            m8, m8
%else
    SWAP            m8, m6
%endif
    jnz .h_have_right
    cmp             wd, -10
    jl .h_have_right
    call .extend_right
.h_have_right:
    punpcklbw       m4, m5, m8
    punpckhbw       m5, m8
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; sum3
    punpcklwd       m7, m0, m8
    pmaddwd         m7, m7
    punpckhwd       m0, m8
    pmaddwd         m0, m0
%if ARCH_X86_64
    SWAP            m6, m8
%endif
    paddd           m2, m7             ; sumsq3
    palignr         m5, m4, 8
    punpcklwd       m7, m5, m4
    paddw           m8, m4, m5
    pmaddwd         m7, m7
    punpckhwd       m5, m4
    pmaddwd         m5, m5
    paddd           m3, m0
    mova [t1+wq*2+400* 6], m1
    mova [t1+wq*2+400* 8], m2
    mova [t1+wq*2+400*10], m3
    paddw           m8, m1             ; sum5
    paddd           m7, m2             ; sumsq5
    paddd           m5, m3
    mova [t1+wq*2+400* 0], m8
    mova [t1+wq*2+400* 2], m7
    mova [t1+wq*2+400* 4], m5
    add             wq, 8
    jl .h_loop
    ret
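; The 5-tap sums reuse the 3-tap ones instead of being recomputed
; (conceptual sketch, px = the source row):
;   sum5[x]   = sum3[x] + px[x-2] + px[x+2]
;   sumsq5[x] = sumsq3[x] + px[x-2]^2 + px[x+2]^2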
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 13
    jmp .hv0_main
.hv0_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, [base+sgr_lshuf5]
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32       lpfq, hvsrcm
.hv0_loop_start:
    movu            m5, [lpfq+wq-1]
.hv0_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor            m8, m8
%else
    SWAP            m8, m6
%endif
    jnz .hv0_have_right
    cmp             wd, -10
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    punpcklbw       m4, m5, m8
    punpckhbw       m5, m8
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    movif32         t3, t3m
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; h sum3
    punpcklwd       m7, m0, m8
    pmaddwd         m7, m7
    punpckhwd       m0, m8
%if ARCH_X86_64
    SWAP            m6, m8
%endif
    pmaddwd         m0, m0
    paddd           m2, m7             ; h sumsq3
    palignr         m5, m4, 8
    punpcklwd       m7, m5, m4
    paddw           m8, m4, m5
    pmaddwd         m7, m7
    punpckhwd       m5, m4
    pmaddwd         m5, m5
    paddd           m3, m0
    paddw           m8, m1             ; h sum5
    paddd           m7, m2             ; h sumsq5
    paddd           m5, m3
    mova [t3+wq*4+400*8+ 8], m8
    mova [t3+wq*4+400*0+ 8], m7
    mova [t3+wq*4+400*0+24], m5
    paddw           m8, [t1+wq*2+400* 0]
    paddd           m7, [t1+wq*2+400* 2]
    paddd           m5, [t1+wq*2+400* 4]
    mova [t1+wq*2+400* 0], m8
    mova [t1+wq*2+400* 2], m7
    mova [t1+wq*2+400* 4], m5
    paddw           m0, m1, [t1+wq*2+400* 6]
    paddd           m4, m2, [t1+wq*2+400* 8]
    paddd           m5, m3, [t1+wq*2+400*10]
    mova [t1+wq*2+400* 6], m1
    mova [t1+wq*2+400* 8], m2
    mova [t1+wq*2+400*10], m3
    paddw           m1, m0, [t2+wq*2+400* 6]
    paddd           m2, m4, [t2+wq*2+400* 8]
    paddd           m3, m5, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m0
    mova [t2+wq*2+400* 8], m4
    mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a3 * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m7         ; b3
    pmaddwd         m2, m0, m0
    punpckhwd       m1, m7
    pmaddwd         m3, m1, m1
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    psubd           m4, m2             ; p3
    psubd           m5, m3
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*2+400*2+ 4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add             wq, 8
    jl .hv0_loop
    ret
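; Even rows only emit the 3x3 a/b pair; the raw 5-tap row sums are both
; accumulated into t1 and parked in t3 scratch (the 400*0/400*8 stores
; above) so the odd-row (.hv1) and .v1 passes can complete the five-row
; 5x5 sums.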
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32      leftq, leftm
    movddup         m4, [leftq-4]
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    add         leftmp, 4
    palignr         m5, m4, 13
    jmp .hv1_main
.hv1_extend_left:
    movif32         wq, w0m
    mova            m5, [lpfq+wq+2]
    pshufb          m5, [base+sgr_lshuf5]
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov         hvsrcm, lpfq
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32         wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32       lpfq, hvsrcm
.hv1_loop_start:
    movu            m5, [lpfq+wq-1]
.hv1_main:
    test         edgeb, 2 ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor            m8, m8
%else
    SWAP            m8, m6
%endif
    jnz .hv1_have_right
    cmp             wd, -10
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    punpcklbw       m4, m5, m8
    punpckhbw       m5, m8
    palignr         m7, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m7, m3
    punpcklwd       m0, m7, m3
    pmaddwd         m0, m0
    punpckhwd       m7, m3
    pmaddwd         m7, m7
    palignr         m3, m5, m4, 6
    paddw           m2, m3             ; h sum3
    punpcklwd       m1, m3, m8
    pmaddwd         m1, m1
    punpckhwd       m3, m8
%if ARCH_X86_64
    SWAP            m6, m8
%endif
    pmaddwd         m3, m3
    paddd           m0, m1             ; h sumsq3
    palignr         m5, m4, 8
    punpckhwd       m1, m4, m5
    paddw           m8, m4, m5
    pmaddwd         m1, m1
    punpcklwd       m4, m5
    pmaddwd         m4, m4
    paddd           m7, m3
    paddw           m5, m2, [t2+wq*2+400* 6]
    mova [t2+wq*2+400* 6], m2
    paddw           m8, m2             ; h sum5
    paddd           m2, m0, [t2+wq*2+400* 8]
    paddd           m3, m7, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 8], m0
    mova [t2+wq*2+400*10], m7
    paddd           m4, m0             ; h sumsq5
    paddd           m1, m7
    pslld           m0, m2, 3
    pslld           m7, m3, 3
    paddd           m2, m0             ; a3 * 9
    paddd           m3, m7
%if ARCH_X86_32
    mova      [esp+20], m8
    pxor            m8, m8
%else
    SWAP            m8, m6
%endif
    punpcklwd       m0, m5, m8         ; b3
    pmaddwd         m7, m0, m0
    punpckhwd       m5, m8
    pmaddwd         m8, m5, m5
    psubd           m2, m7             ; p3
    psubd           m3, m8
    MULLD           m2, m14, m8        ; p3 * s1
    MULLD           m3, m14, m8
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m5, m11
    paddusw         m2, m11
    paddusw         m3, m11
    psrld           m2, 20             ; min(z3, 255)
    movif32         t3, t3m
    psrld           m3, 20
    GATHER_X_BY_X   m8, m2, m3, r0, dstm
    punpcklwd       m2, m8, m8
    punpckhwd       m3, m8, m8
    MULLD           m0, m2, m7
    MULLD           m5, m3, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m5, m10
    psrld           m0, 12
    psrld           m5, 12
    mova [t4+wq*2+400*4+ 4], m8
    mova [t3+wq*4+400*8+ 8], m0
    mova [t3+wq*4+400*8+24], m5
%if ARCH_X86_32
    mova            m8, [esp+20]
%else
    SWAP            m6, m8
    pxor            m6, m6
%endif
    paddw           m5, m8, [t2+wq*2+400*0]
    paddd           m2, m4, [t2+wq*2+400*2]
    paddd           m3, m1, [t2+wq*2+400*4]
    paddw           m5, [t1+wq*2+400*0]
    paddd           m2, [t1+wq*2+400*2]
    paddd           m3, [t1+wq*2+400*4]
    mova [t2+wq*2+400*0], m8
    pslld           m0, m2, 4
    mova [t2+wq*2+400*2], m4
    pslld           m8, m3, 4
    mova [t2+wq*2+400*4], m1
    pslld           m4, m2, 3
    paddd           m2, m0
    pslld           m7, m3, 3
    paddd           m3, m8
    paddd           m2, m4             ; a5 * 25
    paddd           m3, m7
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    punpcklwd       m0, m5, m7         ; b5
    pmaddwd         m4, m0, m0
    punpckhwd       m5, m7
    pmaddwd         m1, m5, m5
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    psubd           m2, m4             ; p5
    psubd           m3, m1
    MULLD           m2, m13, m7        ; p5 * s0
    MULLD           m3, m13, m7
    pmaddwd         m0, m12            ; b5 * 164
    pmaddwd         m5, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrld           m2, 20             ; min(z5, 255)
    psrld           m3, 20
    GATHER_X_BY_X   m1, m2, m3, r0, dstm
    punpcklwd       m2, m1, m1
    punpckhwd       m3, m1, m1
    MULLD           m0, m2, m7
    MULLD           m5, m3, m7
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m5, m10
    mova   [t4+wq*2+4], m1
    psrld           m0, 12
    psrld           m5, 12
    mova  [t3+wq*4+ 8], m0
    mova  [t3+wq*4+24], m5
    add             wq, 8
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
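; With no 32-bit lane multiply below SSE4.1, the box-area factors are
; built shift-and-add style: 25*x = (x << 4) + (x << 3) + x here, and
; 9*x = (x << 3) + x in the 3-tap paths. The matching one-over-area
; constants are 164 (~4096/25) for the 5-tap box and 455 (~4096/9) for
; the 3-tap box.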
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v0_loop:
    mova            m0, [t1+wq*2+400* 6]
    mova            m4, [t1+wq*2+400* 8]
    mova            m5, [t1+wq*2+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+wq*2+400* 6]
    paddd           m2, m4, [t2+wq*2+400* 8]
    paddd           m3, m5, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m0
    mova [t2+wq*2+400* 8], m4
    mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a3 * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m7         ; b3
    pmaddwd         m2, m0, m0
    punpckhwd       m1, m7
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p3
    psubd           m5, m3
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*2+400*2+4], m3
    psrld           m0, 12
    psrld           m1, 12
    mova            m3, [t1+wq*2+400*0]
    mova            m4, [t1+wq*2+400*2]
    mova            m5, [t1+wq*2+400*4]
    mova [t3+wq*4+400*8+ 8], m3
    mova [t3+wq*4+400*0+ 8], m4
    mova [t3+wq*4+400*0+24], m5
    paddw           m3, m3 ; sum5 * 2
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+wq*2+400*0], m3
    mova [t1+wq*2+400*2], m4
    mova [t1+wq*2+400*4], m5
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add             wq, 8
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea             wq, [r4-2]
%else
    mov             wd, w0m
%endif
.v1_loop:
    mova            m4, [t1+wq*2+400* 6]
    mova            m5, [t1+wq*2+400* 8]
    mova            m7, [t1+wq*2+400*10]
    paddw           m1, m4, [t2+wq*2+400* 6]
    paddd           m2, m5, [t2+wq*2+400* 8]
    paddd           m3, m7, [t2+wq*2+400*10]
    mova [t2+wq*2+400* 6], m4
    mova [t2+wq*2+400* 8], m5
    mova [t2+wq*2+400*10], m7
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; a3 * 9
    paddd           m5, m3
    punpcklwd       m0, m1, m7         ; b3
    pmaddwd         m2, m0, m0
    punpckhwd       m1, m7
    pmaddwd         m3, m1, m1
    psubd           m4, m2             ; p3
    psubd           m5, m3
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m4, 20             ; min(z3, 255)
    psrld           m5, 20
    GATHER_X_BY_X   m3, m4, m5, r0, dstm
    punpcklwd       m4, m3, m3
    punpckhwd       m5, m3, m3
    MULLD           m0, m4, m7
    MULLD           m1, m5, m7
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*2+400*4+4], m3
    psrld           m0, 12
    psrld           m8, m1, 12
    mova            m4, [t3+wq*4+400*8+ 8]
    mova            m5, [t3+wq*4+400*0+ 8]
    mova            m7, [t3+wq*4+400*0+24]
    paddw           m1, m4, [t2+wq*2+400*0]
    paddd           m2, m5, [t2+wq*2+400*2]
    paddd           m3, m7, [t2+wq*2+400*4]
    paddw           m1, [t1+wq*2+400*0]
    paddd           m2, [t1+wq*2+400*2]
    paddd           m3, [t1+wq*2+400*4]
    mova [t2+wq*2+400*0], m4
    mova [t2+wq*2+400*2], m5
    mova [t2+wq*2+400*4], m7
    pslld           m4, m2, 4
    mova [t3+wq*4+400*8+ 8], m0
    pslld           m5, m3, 4
    mova [t3+wq*4+400*8+24], m8
    pslld           m7, m2, 3
    paddd           m2, m4
    pslld           m8, m3, 3
    paddd           m3, m5
    paddd           m2, m7             ; a5 * 25
    paddd           m3, m8
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    punpcklwd       m0, m1, m7         ; b5
    pmaddwd         m4, m0, m0
    punpckhwd       m1, m7
    pmaddwd         m5, m1, m1
    psubd           m2, m4             ; p5
    psubd           m3, m5
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MULLD           m2, m13, m7        ; p5 * s0
    MULLD           m3, m13, m7
    pmaddwd         m0, m12            ; b5 * 164
    pmaddwd         m1, m12
    paddusw         m2, m12
    paddusw         m3, m12
    psrld           m2, 20             ; min(z5, 255)
    psrld           m3, 20
    GATHER_X_BY_X   m4, m2, m3, r0, dstm
    punpcklwd       m2, m4, m4
    punpckhwd       m3, m4, m4
    MULLD           m0, m2, m7
    MULLD           m1, m3, m7
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova   [t4+wq*2+4], m4
    psrld           m0, 12
    psrld           m1, 12
    mova  [t3+wq*4+ 8], m0
    mova  [t3+wq*4+24], m1
    add             wq, 8
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
    movif64         wq, r4
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*2+400*0+ 2]
    movu            m1, [t3+wq*4+400*0+ 4]
    movu            m2, [t3+wq*4+400*0+20]
    movu            m7, [t4+wq*2+400*0+ 4]
    movu            m8, [t3+wq*4+400*0+ 8]
    paddw           m3, m0, [t4+wq*2+400*0+ 0]
    paddd           m4, m1, [t3+wq*4+400*0+ 0]
    paddd           m5, m2, [t3+wq*4+400*0+16]
    paddw           m3, m7
    paddd           m4, m8
    movu            m7, [t3+wq*4+400*0+24]
    paddw           m0, m3
    paddd           m1, m4
    psllw           m3, 2
    pslld           m4, 2
    paddd           m5, m7
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3               ; a5 565
    paddd           m1, m4               ; b5 565
    paddd           m2, m5
    mova [t4+wq*2+400* 6+ 0], m0
    mova [t3+wq*4+400*12+ 0], m1
    mova [t3+wq*4+400*12+16], m2
    movu            m0, [t4+wq*2+400*2+ 4]
    movu            m1, [t3+wq*4+400*4+ 8]
    movu            m2, [t3+wq*4+400*4+24]
    movu            m3, [t4+wq*2+400*2+ 2]
    movu            m4, [t3+wq*4+400*4+ 4]
    movu            m5, [t3+wq*4+400*4+20]
    paddw           m0, [t4+wq*2+400*2+ 0]
    paddd           m1, [t3+wq*4+400*4+ 0]
    paddd           m2, [t3+wq*4+400*4+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                ; a3[-1] 444
    pslld           m4, 2                ; b3[-1] 444
    pslld           m5, 2
    psubw           m3, m0               ; a3[-1] 343
    psubd           m4, m1               ; b3[-1] 343
    psubd           m5, m2
    mova [t4+wq*2+400* 8+ 0], m3
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m5
    movu            m0, [t4+wq*2+400*4+ 4]
    movu            m1, [t3+wq*4+400*8+ 8]
    movu            m2, [t3+wq*4+400*8+24]
    movu            m3, [t4+wq*2+400*4+ 2]
    movu            m4, [t3+wq*4+400*8+ 4]
    movu            m5, [t3+wq*4+400*8+20]
    paddw           m0, [t4+wq*2+400*4+ 0]
    paddd           m1, [t3+wq*4+400*8+ 0]
    paddd           m2, [t3+wq*4+400*8+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                 ; a3[ 0] 444
    pslld           m4, 2                 ; b3[ 0] 444
    pslld           m5, 2
    mova [t4+wq*2+400*10+ 0], m3
    mova [t3+wq*4+400*20+ 0], m4
    mova [t3+wq*4+400*20+16], m5
    psubw           m3, m0                ; a3[ 0] 343
    psubd           m4, m1                ; b3[ 0] 343
    psubd           m5, m2
    mova [t4+wq*2+400*12+ 0], m3
    mova [t3+wq*4+400*24+ 0], m4
    mova [t3+wq*4+400*24+16], m5
    add             wq, 8
    jl .prep_n_loop
    ret
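; .prep_n primes both neighbor pipelines at once: the 5-tap side gets
; its 565-weighted row pair (400*6 / 400*12), and the 3-tap side gets
; the two 343/444 row generations it rotates through, as in the
; standalone 3x3 filter above.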
3481ALIGN function_align
3482.n0: ; neighbor + output (even rows)
3483    movif64         wq, r4
3484    movif32         wd, w1m
3485.n0_loop:
3486    movu            m0, [t4+wq*2+ 4]
3487    movu            m2, [t4+wq*2+ 2]
3488    paddw           m0, [t4+wq*2+ 0]
3489    paddw           m0, m2
3490    paddw           m2, m0
3491    psllw           m0, 2
3492    paddw           m0, m2               ; a5
3493    movu            m4, [t3+wq*4+ 8]
3494    movu            m5, [t3+wq*4+24]
3495    movu            m1, [t3+wq*4+ 4]
3496    movu            m3, [t3+wq*4+20]
3497    paddd           m4, [t3+wq*4+ 0]
3498    paddd           m5, [t3+wq*4+16]
3499    paddd           m4, m1
3500    paddd           m5, m3
3501    paddd           m1, m4
3502    paddd           m3, m5
3503    pslld           m4, 2
3504    pslld           m5, 2
3505    paddd           m4, m1               ; b5
3506    paddd           m5, m3
    movu            m2, [t4+wq*2+400* 6]
    paddw           m2, m0
    mova [t4+wq*2+400* 6], m0
    paddd           m0, m4, [t3+wq*4+400*12+ 0]
    paddd           m1, m5, [t3+wq*4+400*12+16]
    mova [t3+wq*4+400*12+ 0], m4
    mova [t3+wq*4+400*12+16], m5
    mova [rsp+16+ARCH_X86_32*4], m1
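    ; 3-tap sums for the next row; the vertical sum is
    ; 343[-1]+444[0]+343[+1], with the history buffers rotated afterwards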
    movu            m3, [t4+wq*2+400*2+4]
    movu            m5, [t4+wq*2+400*2+2]
    paddw           m3, [t4+wq*2+400*2+0]
    paddw           m5, m3
    psllw           m5, 2                ; a3[ 1] 444
    psubw           m4, m5, m3           ; a3[ 1] 343
    movu            m3, [t4+wq*2+400* 8]
    paddw           m3, [t4+wq*2+400*10]
    paddw           m3, m4
    mova [t4+wq*2+400* 8], m4
    mova [t4+wq*2+400*10], m5
    movu            m1, [t3+wq*4+400*4+ 8]
    movu            m5, [t3+wq*4+400*4+ 4]
    movu            m7, [t3+wq*4+400*4+24]
    movu            m8, [t3+wq*4+400*4+20]
    paddd           m1, [t3+wq*4+400*4+ 0]
    paddd           m7, [t3+wq*4+400*4+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1           ; b3[ 1] 343
%if ARCH_X86_32
    mova      [esp+52], m8
    psubd           m8, m7
%else
    psubd           m6, m8, m7
    SWAP            m8, m6
%endif
    paddd           m1, m4, [t3+wq*4+400*16+ 0]
    paddd           m7, m8, [t3+wq*4+400*16+16]
    paddd           m1, [t3+wq*4+400*20+ 0]
    paddd           m7, [t3+wq*4+400*20+16]
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m8
    mova [t3+wq*4+400*20+ 0], m5
%if ARCH_X86_32
    mova            m8, [esp+52]
%else
    SWAP            m8, m6
    pxor            m6, m6
%endif
    mova [t3+wq*4+400*20+16], m8
    mova [rsp+32+ARCH_X86_32*4], m7
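    ; blend the two filter outputs: the 5-tap term (>> 9) is packed into
    ; the low word and the 3-tap term (<< 7) into the high word of each
    ; dword via the 0xffff mask in m9, so a single pmaddwd with the
    ; (w5, w3) weight pairs in m15 yields w5*t5 + w3*t3, which is rounded,
    ; shifted down by 13 and added back to the source pixels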
    movq            m4, [dstq+wq]
    punpcklbw       m4, m6
    punpcklwd       m5, m4, m6
    punpcklwd       m7, m2, m6
    pmaddwd         m7, m5               ; a5 * src
    punpcklwd       m8, m3, m6
    pmaddwd         m8, m5               ; a3 * src
    punpckhwd       m5, m4, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m5
    punpckhwd       m3, m6
    pmaddwd         m3, m5
    psubd           m0, m7               ; b5 - a5 * src + (1 << 8) - (src << 13)
    psubd           m1, m8               ; b3 - a3 * src + (1 << 8) - (src << 13)
    psrld           m0, 9
    pslld           m1, 7
    pand            m0, m9
    pandn           m8, m9, m1
    por             m0, m8
    mova            m1, [rsp+16+ARCH_X86_32*4]
    psubd           m1, m2
    mova            m2, [rsp+32+ARCH_X86_32*4]
    psubd           m2, m3
    mova            m3, [base+pd_4096]
    psrld           m1, 9
    pslld           m2, 7
    pand            m1, m9
    pandn           m5, m9, m2
    por             m1, m5
    pmaddwd         m0, m15
    pmaddwd         m1, m15
    paddd           m0, m3
    paddd           m1, m3
    psrad           m0, 13
    psrad           m1, 13
    packssdw        m0, m1
    paddw           m0, m4
    packuswb        m0, m0
    movq     [dstq+wq], m0
    add             wq, 8
    jl .n0_loop
    add           dstq, stridemp
    ret
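; odd rows only refresh the 3-tap sums; the 5-tap terms are reused from
; the 565 sums stored by .n0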
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r4
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*2+400*4+4]
    movu            m5, [t4+wq*2+400*4+2]
    paddw           m3, [t4+wq*2+400*4+0]
    paddw           m5, m3
    psllw           m5, 2                ; a3[ 1] 444
    psubw           m4, m5, m3           ; a3[ 1] 343
    paddw           m3, m4, [t4+wq*2+400*12]
    paddw           m3, [t4+wq*2+400*10]
    mova [t4+wq*2+400*10], m5
    mova [t4+wq*2+400*12], m4
    movu            m1, [t3+wq*4+400*8+ 8]
    movu            m5, [t3+wq*4+400*8+ 4]
    movu            m7, [t3+wq*4+400*8+24]
    movu            m8, [t3+wq*4+400*8+20]
    paddd           m1, [t3+wq*4+400*8+ 0]
    paddd           m7, [t3+wq*4+400*8+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1           ; b3[ 1] 343
    psubd           m0, m8, m7
    paddd           m1, m4, [t3+wq*4+400*24+ 0]
    paddd           m7, m0, [t3+wq*4+400*24+16]
    paddd           m1, [t3+wq*4+400*20+ 0]
    paddd           m7, [t3+wq*4+400*20+16]
    mova [t3+wq*4+400*20+ 0], m5
    mova [t3+wq*4+400*20+16], m8
    mova [t3+wq*4+400*24+ 0], m4
    mova [t3+wq*4+400*24+16], m0
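    ; same blend as in .n0, but the stored b5 sums are shifted down by 8
    ; rather than 9 since only a single row pair's 565 sum contributes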
    movq            m5, [dstq+wq]
    mova            m2, [t4+wq*2+400* 6]
    punpcklbw       m5, m6
    punpcklwd       m4, m5, m6
    punpcklwd       m8, m2, m6
    pmaddwd         m8, m4               ; a5 * src
    punpcklwd       m0, m3, m6
    pmaddwd         m0, m4               ; a3 * src
    punpckhwd       m4, m5, m6
    punpckhwd       m2, m6
    pmaddwd         m2, m4
    punpckhwd       m3, m6
    pmaddwd         m3, m4
    psubd           m1, m0               ; b3 - a3 * src + (1 << 8) - (src << 13)
    mova            m0, [t3+wq*4+400*12+ 0]
    psubd           m0, m8               ; b5 - a5 * src + (1 << 8) - (src << 13)
    mova            m4, [t3+wq*4+400*12+16]
    psubd           m4, m2
    psubd           m7, m3
    pslld           m1, 7
    psrld           m0, 8
    psrld           m4, 8
    pslld           m7, 7
    pandn           m3, m9, m1
    pand            m0, m9
    por             m0, m3
    pand            m4, m9
    pandn           m2, m9, m7
    por             m2, m4
    mova            m1, [base+pd_4096]
    pmaddwd         m0, m15
    pmaddwd         m2, m15
    paddd           m0, m1
    paddd           m2, m1
    psrad           m0, 13
    psrad           m2, 13
    packssdw        m0, m2
    paddw           m0, m5
    packuswb        m0, m0
    movq     [dstq+wq], m0
    add             wq, 8
    jl .n1_loop
    add           dstq, stridemp
    movif32       dstm, dstq
    ret
