1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; Copyright © 2018, VideoLabs
4; All rights reserved.
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions are met:
8;
9; 1. Redistributions of source code must retain the above copyright notice, this
10;    list of conditions and the following disclaimer.
11;
12; 2. Redistributions in binary form must reproduce the above copyright notice,
13;    this list of conditions and the following disclaimer in the documentation
14;    and/or other materials provided with the distribution.
15;
16; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27%include "config.asm"
28%include "ext/x86/x86inc.asm"
29
30SECTION_RODATA 16
31
32wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4
33wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
34wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
35wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
36wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
37wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
38pb_unpcklwdw:  db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
39
40pb_right_ext_mask: times 24 db 0xff
41                   times 8 db 0
42pb_0:          times 16 db 0
43pb_3:          times 16 db 3
44pb_15:         times 16 db 15
45pb_0_1:        times 8 db 0, 1
46pb_14_15:      times 8 db 14, 15
47pw_1:          times 8 dw 1
48pw_16:         times 8 dw 16
49pw_128:        times 8 dw 128
50pw_256:        times 8 dw 256
51pw_2048:       times 8 dw 2048
52pw_2056:       times 8 dw 2056
53pw_m16380:     times 8 dw -16380
54pw_5_6:        times 4 dw 5, 6
55pd_1024:       times 4 dd 1024
56%if ARCH_X86_32
57pd_512:        times 4 dd 512
58pd_2048:       times 4 dd 2048
59%endif
60pd_0xF0080029: times 4 dd 0xF0080029
61pd_0xF00801C7: times 4 dd 0XF00801C7
62
63cextern sgr_x_by_x
64
65SECTION .text
66
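; x86-32 PIC helpers: SETUP_PIC points PIC_reg at the section base ($$)
; with a LEA, optionally saving the register's previous contents, and
; PIC_sym() then addresses the RODATA constants above relative to that
; base. XCHG_PIC_REG swaps the PIC register through two alternating stack
; slots so the register can temporarily be used for something else. On
; x86-64 the constants are addressable directly, so PIC_sym() is a no-op.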
67%if ARCH_X86_32
68 %define PIC_base_offset $$
69
70 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
71  %assign pic_reg_stk_off 4
72  %xdefine PIC_reg %1
73  %if %2 == 1
74    mov        [esp], %1
75  %endif
76    LEA      PIC_reg, PIC_base_offset
77  %if %3 == 1
78    XCHG_PIC_REG
79  %endif
80 %endmacro
81
82 %macro XCHG_PIC_REG 0
83    mov [esp+pic_reg_stk_off], PIC_reg
84    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
85    mov PIC_reg, [esp+pic_reg_stk_off]
86 %endmacro
87
88 %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)
89
90%else
91 %macro XCHG_PIC_REG 0
92 %endmacro
93
94 %define PIC_sym(sym)   (sym)
95%endif
96
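; Wiener filter, 8bpc. The WIENER macro below is expanded twice, for SSE2
; and for SSSE3 (see the INIT_XMM/WIENER pairs after the macro). The filter
; is separable: the horizontal pass (.h/.hv) filters one row of source
; pixels with the x taps and stores (sum >> 3) + 2056 as 16-bit
; intermediates in a ring of 384*2-byte rows on the stack, addressed
; through t0-t6; the vertical pass (.v and the second half of .hv) then
; combines seven such rows with the y taps, shifts right by 11 and packs
; the result back to 8-bit pixels.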
97%macro WIENER 0
98%if ARCH_X86_64
99DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
100cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
101                                                    lpf_stride, w, edge, flt, h, x
102    %define base 0
103    mov           fltq, fltmp
104    mov          edged, r8m
105    mov             wd, wm
106    mov             hd, r6m
107    movq           m14, [fltq]
108    add           lpfq, wq
109    lea             t1, [rsp+wq*2+16]
110    mova           m15, [pw_2056]
111    add           dstq, wq
112    movq            m7, [fltq+16]
113    neg             wq
114%if cpuflag(ssse3)
115    pshufb         m14, [wiener_init]
116    mova            m8, [wiener_shufA]
117    pshufd         m12, m14, q2222  ; x0 x0
118    mova            m9, [wiener_shufB]
119    pshufd         m13, m14, q3333  ; x1 x2
120    mova           m10, [wiener_shufC]
121    punpcklqdq     m14, m14         ; x3
122    mova           m11, [wiener_shufD]
123%else
124    mova           m10, [pw_m16380]
125    punpcklwd      m14, m14
126    pshufd         m11, m14, q0000 ; x0
127    pshufd         m12, m14, q1111 ; x1
128    pshufd         m13, m14, q2222 ; x2
129    pshufd         m14, m14, q3333 ; x3
130%endif
131%else
132DECLARE_REG_TMP 4, 0, _, 5
133%if cpuflag(ssse3)
134    %define m10         [base+wiener_shufC]
135    %define m11         [base+wiener_shufD]
136    %define stk_off     96
137%else
138    %define m10         [base+pw_m16380]
139    %define m11         [stk+96]
140    %define stk_off     112
141%endif
142cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
143    %define base        r6-pb_right_ext_mask-21
144    %define stk         esp
145    %define dstq        leftq
146    %define edgeb       byte edged
147    %define edged       [stk+ 8]
148    %define dstmp       [stk+12]
149    %define hd    dword [stk+16]
150    %define wq          [stk+20]
151    %define dst_strideq [stk+24]
152    %define leftmp      [stk+28]
153    %define t2          [stk+32]
154    %define t4          [stk+36]
155    %define t5          [stk+40]
156    %define t6          [stk+44]
157    %define m8          [base+wiener_shufA]
158    %define m9          [base+wiener_shufB]
159    %define m12         [stk+48]
160    %define m13         [stk+64]
161    %define m14         [stk+80]
162    %define m15         [base+pw_2056]
163    mov             r1, r7m ; flt
164    mov             r0, r0m ; dst
165    mov             r5, r5m ; w
166    mov           lpfq, lpfm
167    mov             r2, r8m ; edge
168    mov             r4, r6m ; h
169    movq            m3, [r1+ 0]
170    movq            m7, [r1+16]
171    add             r0, r5
172    mov             r1, r1m ; dst_stride
173    add           lpfq, r5
174    mov          edged, r2
175    mov             r2, r2m ; left
176    mov          dstmp, r0
177    lea             t1, [rsp+r5*2+stk_off]
178    mov             hd, r4
179    neg             r5
180    mov    lpf_strideq, lpf_stridem
181    LEA             r6, pb_right_ext_mask+21
182    mov             wq, r5
183    mov    dst_strideq, r1
184    mov         leftmp, r2
185%if cpuflag(ssse3)
186    pshufb          m3, [base+wiener_init]
187    pshufd          m1, m3, q2222
188    pshufd          m2, m3, q3333
189    punpcklqdq      m3, m3
190%else
191    punpcklwd       m3, m3
192    pshufd          m0, m3, q0000
193    pshufd          m1, m3, q1111
194    pshufd          m2, m3, q2222
195    pshufd          m3, m3, q3333
196    mova           m11, m0
197%endif
198    mova           m12, m1
199    mova           m13, m2
200    mova           m14, m3
201%endif
202    pshufd          m6, m7, q0000 ; y0 y1
203    pshufd          m7, m7, q1111 ; y2 y3
204    test         edgeb, 4 ; LR_HAVE_TOP
205    jz .no_top
206    call .h_top
207    add           lpfq, lpf_strideq
208    mov             t6, t1
209    mov             t5, t1
210    add             t1, 384*2
211    call .h_top
212    lea             t3, [lpfq+lpf_strideq*4]
213    mov           lpfq, dstmp
214    mov [rsp+gprsize*1], lpf_strideq
215    add             t3, lpf_strideq
216    mov [rsp+gprsize*0], t3 ; below
217    mov             t4, t1
218    add             t1, 384*2
219    call .h
220    mov             t3, t1
221    mov             t2, t1
222    dec             hd
223    jz .v1
224    add           lpfq, dst_strideq
225    add             t1, 384*2
226    call .h
227    mov             t2, t1
228    dec             hd
229    jz .v2
230    add           lpfq, dst_strideq
231    add             t1, 384*2
232    call .h
233    dec             hd
234    jz .v3
235.main:
236    lea             t0, [t1+384*2]
237.main_loop:
238    call .hv
239    dec             hd
240    jnz .main_loop
241    test         edgeb, 8 ; LR_HAVE_BOTTOM
242    jz .v3
243    mov           lpfq, [rsp+gprsize*0]
244    call .hv_bottom
245    add           lpfq, [rsp+gprsize*1]
246    call .hv_bottom
247.v1:
248    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
249    RET
250.no_top:
251    lea             t3, [lpfq+lpf_strideq*4]
252    mov           lpfq, dstmp
253    mov [rsp+gprsize*1], lpf_strideq
254    lea             t3, [t3+lpf_strideq*2]
255    mov [rsp+gprsize*0], t3
256    call .h
257    mov             t6, t1
258    mov             t5, t1
259    mov             t4, t1
260    mov             t3, t1
261    mov             t2, t1
262    dec             hd
263    jz .v1
264    add           lpfq, dst_strideq
265    add             t1, 384*2
266    call .h
267    mov             t2, t1
268    dec             hd
269    jz .v2
270    add           lpfq, dst_strideq
271    add             t1, 384*2
272    call .h
273    dec             hd
274    jz .v3
275    lea             t0, [t1+384*2]
276    call .hv
277    dec             hd
278    jz .v3
279    add             t0, 384*8
280    call .hv
281    dec             hd
282    jnz .main
283.v3:
284    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
285.v2:
286    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
287    jmp .v1
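; .extend_right: pad the current 16-pixel load past the right edge of the
; row. The last valid pixel (byte 3 of [lpfq-4]) is broadcast and blended
; into m4/m5 using a mask loaded from pb_right_ext_mask at the (negative)
; x offset, so out-of-bounds lanes end up holding a copy of the rightmost
; pixel.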
288.extend_right:
289    movd            m2, [lpfq-4]
290%if ARCH_X86_64
291    push            r0
292    lea             r0, [pb_right_ext_mask+21]
293    movu            m0, [r0+xq+0]
294    movu            m1, [r0+xq+8]
295    pop             r0
296%else
297    movu            m0, [r6+xq+0]
298    movu            m1, [r6+xq+8]
299%endif
300%if cpuflag(ssse3)
301    pshufb          m2, [base+pb_3]
302%else
303    punpcklbw       m2, m2
304    pshuflw         m2, m2, q3333
305    punpcklqdq      m2, m2
306%endif
307    pand            m4, m0
308    pand            m5, m1
309    pandn           m0, m2
310    pandn           m1, m2
311    por             m4, m0
312    por             m5, m1
313    ret
314.h:
315    %define stk esp+4 ; offset due to call
316    mov             xq, wq
317    test         edgeb, 1 ; LR_HAVE_LEFT
318    jz .h_extend_left
319    movifnidn    leftq, leftmp
320    mova            m4, [lpfq+xq]
321    movd            m5, [leftq]
322    add          leftq, 4
323    pslldq          m4, 4
324    por             m4, m5
325    movifnidn   leftmp, leftq
326    jmp .h_main
327.h_extend_left:
328%if cpuflag(ssse3)
329    mova            m4, [lpfq+xq]
330    pshufb          m4, [base+wiener_l_shuf]
331%else
332    mova            m5, [lpfq+xq]
333    pshufd          m4, m5, q2103
334    punpcklbw       m5, m5
335    punpcklwd       m5, m5
336    movss           m4, m5
337%endif
338    jmp .h_main
339.h_top:
340    mov             xq, wq
341    test         edgeb, 1 ; LR_HAVE_LEFT
342    jz .h_extend_left
343.h_loop:
344    movu            m4, [lpfq+xq-4]
345.h_main:
346    movu            m5, [lpfq+xq+4]
347    test         edgeb, 2 ; LR_HAVE_RIGHT
348    jnz .h_have_right
349    cmp             xd, -18
350    jl .h_have_right
351    call .extend_right
352.h_have_right:
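; %%h7: 7-tap horizontal filter for 16 pixels (two 8-pixel halves in m4/m5).
; The SSSE3 path pairs up neighbouring pixels with pshufb (wiener_shufA-C)
; and applies the packed x taps with pmaddubsw; the centre pixels (selected
; by wiener_shufD) are multiplied by x3 and also added in shifted left by 7
; together with the pw_m16380 bias, with paddsw providing saturation. The
; SSE2 path computes the same sums with unpack + pmullw. The caller then
; shifts the result right by 3 and adds the pw_2056 bias before storing the
; 16-bit intermediates.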
353%macro %%h7 0
354%if cpuflag(ssse3)
355    pshufb          m0, m4, m8
356    pmaddubsw       m0, m12
357    pshufb          m1, m5, m8
358    pmaddubsw       m1, m12
359    pshufb          m2, m4, m9
360    pmaddubsw       m2, m13
361    pshufb          m3, m5, m9
362    pmaddubsw       m3, m13
363    paddw           m0, m2
364    pshufb          m2, m4, m10
365    pmaddubsw       m2, m13
366    paddw           m1, m3
367    pshufb          m3, m5, m10
368    pmaddubsw       m3, m13
369    pshufb          m4, m11
370    paddw           m0, m2
371    pmullw          m2, m14, m4
372    pshufb          m5, m11
373    paddw           m1, m3
374    pmullw          m3, m14, m5
375    psllw           m4, 7
376    psllw           m5, 7
377    paddw           m0, m2
378    mova            m2, [base+pw_m16380]
379    paddw           m1, m3
380    paddw           m4, m2
381    paddw           m5, m2
382    paddsw          m0, m4
383    paddsw          m1, m5
384%else
385    psrldq          m0, m4, 1
386    pslldq          m1, m4, 1
387    pxor            m3, m3
388    punpcklbw       m0, m3
389    punpckhbw       m1, m3
390    paddw           m0, m1
391    pmullw          m0, m11
392    psrldq          m1, m4, 2
393    pslldq          m2, m4, 2
394    punpcklbw       m1, m3
395    punpckhbw       m2, m3
396    paddw           m1, m2
397    pmullw          m1, m12
398    paddw           m0, m1
399    pshufd          m2, m4, q0321
400    punpcklbw       m2, m3
401    pmullw          m1, m14, m2
402    paddw           m0, m1
403    psrldq          m1, m4, 3
404    pslldq          m4, 3
405    punpcklbw       m1, m3
406    punpckhbw       m4, m3
407    paddw           m1, m4
408    pmullw          m1, m13
409    paddw           m0, m1
410    psllw           m2, 7
411    paddw           m2, m10
412    paddsw          m0, m2
413    psrldq          m1, m5, 1
414    pslldq          m2, m5, 1
415    punpcklbw       m1, m3
416    punpckhbw       m2, m3
417    paddw           m1, m2
418    pmullw          m1, m11
419    psrldq          m2, m5, 2
420    pslldq          m4, m5, 2
421    punpcklbw       m2, m3
422    punpckhbw       m4, m3
423    paddw           m2, m4
424    pmullw          m2, m12
425    paddw           m1, m2
426    pshufd          m4, m5, q0321
427    punpcklbw       m4, m3
428    pmullw          m2, m14, m4
429    paddw           m1, m2
430    psrldq          m2, m5, 3
431    pslldq          m5, 3
432    punpcklbw       m2, m3
433    punpckhbw       m5, m3
434    paddw           m2, m5
435    pmullw          m2, m13
436    paddw           m1, m2
437    psllw           m4, 7
438    paddw           m4, m10
439    paddsw          m1, m4
440%endif
441%endmacro
442    %%h7
443    psraw           m0, 3
444    psraw           m1, 3
445    paddw           m0, m15
446    paddw           m1, m15
447    mova  [t1+xq*2+ 0], m0
448    mova  [t1+xq*2+16], m1
449    add             xq, 16
450    jl .h_loop
451    ret
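; .hv: horizontally filter one new source row (same %%h7 as .h) and, in the
; same pass, apply the 7-tap vertical filter to the rows held in t1-t6 plus
; the freshly computed row, using the y taps in m6/m7 with pmaddwd, a right
; shift by 11 and a pack down to 8-bit output pixels. The row pointers are
; rotated at the end so the just-written row becomes t1 for the next
; iteration.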
452ALIGN function_align
453.hv:
454    add           lpfq, dst_strideq
455    mov             xq, wq
456    test         edgeb, 1 ; LR_HAVE_LEFT
457    jz .hv_extend_left
458    movifnidn    leftq, leftmp
459    mova            m4, [lpfq+xq]
460    movd            m5, [leftq]
461    add          leftq, 4
462    pslldq          m4, 4
463    por             m4, m5
464    movifnidn   leftmp, leftq
465    jmp .hv_main
466.hv_extend_left:
467%if cpuflag(ssse3)
468    mova            m4, [lpfq+xq]
469    pshufb          m4, [base+wiener_l_shuf]
470%else
471    mova            m5, [lpfq+xq]
472    pshufd          m4, m5, q2103
473    punpcklbw       m5, m5
474    punpcklwd       m5, m5
475    movss           m4, m5
476%endif
477    jmp .hv_main
478.hv_bottom:
479    mov             xq, wq
480    test         edgeb, 1 ; LR_HAVE_LEFT
481    jz .hv_extend_left
482.hv_loop:
483    movu            m4, [lpfq+xq-4]
484.hv_main:
485    movu            m5, [lpfq+xq+4]
486    test         edgeb, 2 ; LR_HAVE_RIGHT
487    jnz .hv_have_right
488    cmp             xd, -18
489    jl .hv_have_right
490    call .extend_right
491.hv_have_right:
492    %%h7
493%if ARCH_X86_64
494    mova            m2, [t4+xq*2]
495    paddw           m2, [t2+xq*2]
496%else
497    mov             r2, t4
498    mova            m2, [r2+xq*2]
499    mov             r2, t2
500    paddw           m2, [r2+xq*2]
501    mov             r2, t5
502%endif
503    mova            m3, [t3+xq*2]
504%if ARCH_X86_64
505    mova            m5, [t5+xq*2]
506%else
507    mova            m5, [r2+xq*2]
508    mov             r2, t6
509%endif
510    paddw           m5, [t1+xq*2]
511    psraw           m0, 3
512    psraw           m1, 3
513    paddw           m0, m15
514    paddw           m1, m15
515%if ARCH_X86_64
516    paddw           m4, m0, [t6+xq*2]
517%else
518    paddw           m4, m0, [r2+xq*2]
519    mov             r2, t4
520%endif
521    mova     [t0+xq*2], m0
522    punpcklwd       m0, m2, m3
523    pmaddwd         m0, m7
524    punpckhwd       m2, m3
525    pmaddwd         m2, m7
526    punpcklwd       m3, m4, m5
527    pmaddwd         m3, m6
528    punpckhwd       m4, m5
529    pmaddwd         m4, m6
530    paddd           m0, m3
531    mova            m3, [t3+xq*2+16]
532    paddd           m4, m2
533%if ARCH_X86_64
534    mova            m2, [t4+xq*2+16]
535    paddw           m2, [t2+xq*2+16]
536    mova            m5, [t5+xq*2+16]
537%else
538    mova            m2, [r2+xq*2+16]
539    mov             r2, t2
540    paddw           m2, [r2+xq*2+16]
541    mov             r2, t5
542    mova            m5, [r2+xq*2+16]
543    mov             r2, t6
544%endif
545    paddw           m5, [t1+xq*2+16]
546    psrad           m0, 11
547    psrad           m4, 11
548    packssdw        m0, m4
549%if ARCH_X86_64
550    paddw           m4, m1, [t6+xq*2+16]
551%else
552    paddw           m4, m1, [r2+xq*2+16]
553    mov           dstq, dstmp
554%endif
555    mova  [t0+xq*2+16], m1
556    punpcklwd       m1, m2, m3
557    pmaddwd         m1, m7
558    punpckhwd       m2, m3
559    pmaddwd         m2, m7
560    punpcklwd       m3, m4, m5
561    pmaddwd         m3, m6
562    punpckhwd       m4, m5
563    pmaddwd         m4, m6
564    paddd           m1, m3
565    paddd           m2, m4
566    psrad           m1, 11
567    psrad           m2, 11
568    packssdw        m1, m2
569    packuswb        m0, m1
570    mova     [dstq+xq], m0
571    add             xq, 16
572    jl .hv_loop
573    add           dstq, dst_strideq
574%if ARCH_X86_64
575    mov             t6, t5
576    mov             t5, t4
577    mov             t4, t3
578    mov             t3, t2
579    mov             t2, t1
580    mov             t1, t0
581    mov             t0, t6
582%else
583    mov          dstmp, dstq
584    mov             r1, t5
585    mov             r2, t4
586    mov             t6, r1
587    mov             t5, r2
588    mov             t4, t3
589    mov             t3, t2
590    mov             t2, t1
591    mov             t1, t0
592    mov             t0, r1
593%endif
594    ret
595%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
596.v:
597    mov             xq, wq
598.v_loop:
599%if ARCH_X86_64
600    mova            m1, [t4+xq*2]
601    paddw           m1, [t2+xq*2]
602%else
603    mov             r2, t4
604    mova            m1, [r2+xq*2]
605    mov             r2, t2
606    paddw           m1, [r2+xq*2]
607    mov             r2, t6
608%endif
609    mova            m2, [t3+xq*2]
610    mova            m4, [t1+xq*2]
611%if ARCH_X86_64
612    paddw           m3, m4, [t6+xq*2]
613    paddw           m4, [t5+xq*2]
614%else
615    paddw           m3, m4, [r2+xq*2]
616    mov             r2, t5
617    paddw           m4, [r2+xq*2]
618    mov             r2, t4
619%endif
620    punpcklwd       m0, m1, m2
621    pmaddwd         m0, m7
622    punpckhwd       m1, m2
623    pmaddwd         m1, m7
624    punpcklwd       m2, m3, m4
625    pmaddwd         m2, m6
626    punpckhwd       m3, m4
627    pmaddwd         m3, m6
628    paddd           m0, m2
629    paddd           m1, m3
630%if ARCH_X86_64
631    mova            m2, [t4+xq*2+16]
632    paddw           m2, [t2+xq*2+16]
633%else
634    mova            m2, [r2+xq*2+16]
635    mov             r2, t2
636    paddw           m2, [r2+xq*2+16]
637    mov             r2, t6
638%endif
639    mova            m3, [t3+xq*2+16]
640    mova            m5, [t1+xq*2+16]
641%if ARCH_X86_64
642    paddw           m4, m5, [t6+xq*2+16]
643    paddw           m5, [t5+xq*2+16]
644%else
645    paddw           m4, m5, [r2+xq*2+16]
646    mov             r2, t5
647    paddw           m5, [r2+xq*2+16]
648    movifnidn     dstq, dstmp
649%endif
650    psrad           m0, 11
651    psrad           m1, 11
652    packssdw        m0, m1
653    punpcklwd       m1, m2, m3
654    pmaddwd         m1, m7
655    punpckhwd       m2, m3
656    pmaddwd         m2, m7
657    punpcklwd       m3, m4, m5
658    pmaddwd         m3, m6
659    punpckhwd       m4, m5
660    pmaddwd         m4, m6
661    paddd           m1, m3
662    paddd           m2, m4
663    psrad           m1, 11
664    psrad           m2, 11
665    packssdw        m1, m2
666    packuswb        m0, m1
667    mova     [dstq+xq], m0
668    add             xq, 16
669    jl .v_loop
670    add           dstq, dst_strideq
671%if ARCH_X86_64
672    mov             t6, t5
673    mov             t5, t4
674%else
675    mov          dstmp, dstq
676    mov             r1, t5
677    mov             r2, t4
678    mov             t6, r1
679    mov             t5, r2
680%endif
681    mov             t4, t3
682    mov             t3, t2
683    mov             t2, t1
684    ret
685%endif
686
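; wiener_filter5: 5-tap variant of the filter above. Same horizontal/
; vertical structure with a shorter ring buffer (t0-t4), and it reuses
; .extend_right from wiener_filter7 for right-edge padding.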
687%if ARCH_X86_64
688cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
689                                                  lpf_stride, w, edge, flt, h, x
690    mov           fltq, fltmp
691    mov          edged, r8m
692    mov             wd, wm
693    mov             hd, r6m
694    movq           m14, [fltq]
695    add           lpfq, wq
696    mova            m8, [pw_m16380]
697    lea             t1, [rsp+wq*2+16]
698    mova           m15, [pw_2056]
699    add           dstq, wq
700    movq            m7, [fltq+16]
701    neg             wq
702%if cpuflag(ssse3)
703    pshufb         m14, [wiener_init]
704    mova            m9, [wiener_shufB]
705    pshufd         m13, m14, q3333  ; x1 x2
706    mova           m10, [wiener_shufC]
707    punpcklqdq     m14, m14         ; x3
708    mova           m11, [wiener_shufD]
709    mova           m12, [wiener_l_shuf]
710%else
711    punpcklwd      m14, m14
712    pshufd         m11, m14, q1111 ; x1
713    pshufd         m13, m14, q2222 ; x2
714    pshufd         m14, m14, q3333 ; x3
715%endif
716%else
717%if cpuflag(ssse3)
718    %define stk_off     80
719%else
720    %define m11         [stk+80]
721    %define stk_off     96
722%endif
723cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
724    %define stk         esp
725    %define leftmp      [stk+28]
726    %define m8          [base+pw_m16380]
727    %define m12         [base+wiener_l_shuf]
728    %define m14         [stk+48]
729    mov             r1, r7m ; flt
730    mov             r0, r0m ; dst
731    mov             r5, r5m ; w
732    mov           lpfq, lpfm
733    mov             r2, r8m ; edge
734    mov             r4, r6m ; h
735    movq            m2, [r1+ 0]
736    movq            m7, [r1+16]
737    add             r0, r5
738    mov             r1, r1m ; dst_stride
739    add           lpfq, r5
740    mov          edged, r2
741    mov             r2, r2m ; left
742    mov          dstmp, r0
743    lea             t1, [rsp+r5*2+stk_off]
744    mov             hd, r4
745    neg             r5
746    mov    lpf_strideq, lpf_stridem
747    LEA             r6, pb_right_ext_mask+21
748    mov             wq, r5
749    mov    dst_strideq, r1
750    mov         leftmp, r2
751%if cpuflag(ssse3)
752    pshufb          m2, [base+wiener_init]
753    pshufd          m1, m2, q3333
754    punpcklqdq      m2, m2
755%else
756    punpcklwd       m2, m2
757    pshufd          m0, m2, q1111
758    pshufd          m1, m2, q2222
759    pshufd          m2, m2, q3333
760    mova           m11, m0
761%endif
762    mova           m13, m1
763    mova           m14, m2
764%endif
765    pshufd          m6, m7, q0000 ; __ y1
766    pshufd          m7, m7, q1111 ; y2 y3
767    test         edgeb, 4 ; LR_HAVE_TOP
768    jz .no_top
769    call .h_top
770    add           lpfq, lpf_strideq
771    mov             t4, t1
772    add             t1, 384*2
773    call .h_top
774    lea             xq, [lpfq+lpf_strideq*4]
775    mov           lpfq, dstmp
776    mov             t3, t1
777    add             t1, 384*2
778    mov [rsp+gprsize*1], lpf_strideq
779    add             xq, lpf_strideq
780    mov [rsp+gprsize*0], xq ; below
781    call .h
782    mov             t2, t1
783    dec             hd
784    jz .v1
785    add           lpfq, dst_strideq
786    add             t1, 384*2
787    call .h
788    dec             hd
789    jz .v2
790.main:
791    mov             t0, t4
792.main_loop:
793    call .hv
794    dec             hd
795    jnz .main_loop
796    test         edgeb, 8 ; LR_HAVE_BOTTOM
797    jz .v2
798    mov           lpfq, [rsp+gprsize*0]
799    call .hv_bottom
800    add           lpfq, [rsp+gprsize*1]
801    call .hv_bottom
802.end:
803    RET
804.no_top:
805    lea             t3, [lpfq+lpf_strideq*4]
806    mov           lpfq, dstmp
807    mov [rsp+gprsize*1], lpf_strideq
808    lea             t3, [t3+lpf_strideq*2]
809    mov [rsp+gprsize*0], t3
810    call .h
811    mov             t4, t1
812    mov             t3, t1
813    mov             t2, t1
814    dec             hd
815    jz .v1
816    add           lpfq, dst_strideq
817    add             t1, 384*2
818    call .h
819    dec             hd
820    jz .v2
821    lea             t0, [t1+384*2]
822    call .hv
823    dec             hd
824    jz .v2
825    add             t0, 384*6
826    call .hv
827    dec             hd
828    jnz .main
829.v2:
830    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
831    add           dstq, dst_strideq
832    mov             t4, t3
833    mov             t3, t2
834    mov             t2, t1
835    movifnidn    dstmp, dstq
836.v1:
837    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
838    jmp .end
839.h:
840    %define stk esp+4
841    mov             xq, wq
842    test         edgeb, 1 ; LR_HAVE_LEFT
843    jz .h_extend_left
844    movifnidn    leftq, leftmp
845    mova            m4, [lpfq+xq]
846    movd            m5, [leftq]
847    add          leftq, 4
848    pslldq          m4, 4
849    por             m4, m5
850    movifnidn   leftmp, leftq
851    jmp .h_main
852.h_extend_left:
853%if cpuflag(ssse3)
854    mova            m4, [lpfq+xq]
855    pshufb          m4, m12
856%else
857    mova            m5, [lpfq+xq]
858    pshufd          m4, m5, q2103
859    punpcklbw       m5, m5
860    punpcklwd       m5, m5
861    movss           m4, m5
862%endif
863    jmp .h_main
864.h_top:
865    mov             xq, wq
866    test         edgeb, 1 ; LR_HAVE_LEFT
867    jz .h_extend_left
868.h_loop:
869    movu            m4, [lpfq+xq-4]
870.h_main:
871    movu            m5, [lpfq+xq+4]
872    test         edgeb, 2 ; LR_HAVE_RIGHT
873    jnz .h_have_right
874    cmp             xd, -17
875    jl .h_have_right
876    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
877.h_have_right:
878%macro %%h5 0
879%if cpuflag(ssse3)
880    pshufb          m0, m4, m9
881    pmaddubsw       m0, m13
882    pshufb          m1, m5, m9
883    pmaddubsw       m1, m13
884    pshufb          m2, m4, m10
885    pmaddubsw       m2, m13
886    pshufb          m3, m5, m10
887    pmaddubsw       m3, m13
888    pshufb          m4, m11
889    paddw           m0, m2
890    pmullw          m2, m14, m4
891    pshufb          m5, m11
892    paddw           m1, m3
893    pmullw          m3, m14, m5
894    psllw           m4, 7
895    psllw           m5, 7
896    paddw           m4, m8
897    paddw           m5, m8
898    paddw           m0, m2
899    paddw           m1, m3
900    paddsw          m0, m4
901    paddsw          m1, m5
902%else
903    psrldq          m0, m4, 2
904    pslldq          m1, m4, 2
905    pxor            m3, m3
906    punpcklbw       m0, m3
907    punpckhbw       m1, m3
908    paddw           m0, m1
909    pmullw          m0, m11
910    pshufd          m2, m4, q0321
911    punpcklbw       m2, m3
912    pmullw          m1, m14, m2
913    paddw           m0, m1
914    psrldq          m1, m4, 3
915    pslldq          m4, 3
916    punpcklbw       m1, m3
917    punpckhbw       m4, m3
918    paddw           m1, m4
919    pmullw          m1, m13
920    paddw           m0, m1
921    psllw           m2, 7
922    paddw           m2, m8
923    paddsw          m0, m2
924    psrldq          m1, m5, 2
925    pslldq          m4, m5, 2
926    punpcklbw       m1, m3
927    punpckhbw       m4, m3
928    paddw           m1, m4
929    pmullw          m1, m11
930    pshufd          m4, m5, q0321
931    punpcklbw       m4, m3
932    pmullw          m2, m14, m4
933    paddw           m1, m2
934    psrldq          m2, m5, 3
935    pslldq          m5, 3
936    punpcklbw       m2, m3
937    punpckhbw       m5, m3
938    paddw           m2, m5
939    pmullw          m2, m13
940    paddw           m1, m2
941    psllw           m4, 7
942    paddw           m4, m8
943    paddsw          m1, m4
944%endif
945%endmacro
946    %%h5
947    psraw           m0, 3
948    psraw           m1, 3
949    paddw           m0, m15
950    paddw           m1, m15
951    mova  [t1+xq*2+ 0], m0
952    mova  [t1+xq*2+16], m1
953    add             xq, 16
954    jl .h_loop
955    ret
956ALIGN function_align
957.hv:
958    add           lpfq, dst_strideq
959    mov             xq, wq
960    test         edgeb, 1 ; LR_HAVE_LEFT
961    jz .hv_extend_left
962    movifnidn    leftq, leftmp
963    mova            m4, [lpfq+xq]
964    movd            m5, [leftq]
965    add          leftq, 4
966    pslldq          m4, 4
967    por             m4, m5
968    movifnidn   leftmp, leftq
969    jmp .hv_main
970.hv_extend_left:
971%if cpuflag(ssse3)
972    mova            m4, [lpfq+xq]
973    pshufb          m4, m12
974%else
975    mova            m5, [lpfq+xq]
976    pshufd          m4, m5, q2103
977    punpcklbw       m5, m5
978    punpcklwd       m5, m5
979    movss           m4, m5
980%endif
981    jmp .hv_main
982.hv_bottom:
983    mov             xq, wq
984    test         edgeb, 1 ; LR_HAVE_LEFT
985    jz .hv_extend_left
986.hv_loop:
987    movu            m4, [lpfq+xq-4]
988.hv_main:
989    movu            m5, [lpfq+xq+4]
990    test         edgeb, 2 ; LR_HAVE_RIGHT
991    jnz .hv_have_right
992    cmp             xd, -17
993    jl .hv_have_right
994    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
995.hv_have_right:
996    %%h5
997    mova            m2, [t3+xq*2]
998    paddw           m2, [t1+xq*2]
999    psraw           m0, 3
1000    psraw           m1, 3
1001    paddw           m0, m15
1002    paddw           m1, m15
1003%if ARCH_X86_64
1004    mova            m3, [t2+xq*2]
1005    paddw           m4, m0, [t4+xq*2]
1006%else
1007    mov             r2, t2
1008    mova            m3, [r2+xq*2]
1009    mov             r2, t4
1010    paddw           m4, m0, [r2+xq*2]
1011%endif
1012    mova     [t0+xq*2], m0
1013    punpcklwd       m0, m2, m3
1014    pmaddwd         m0, m7
1015    punpckhwd       m2, m3
1016    pmaddwd         m2, m7
1017    punpcklwd       m3, m4, m4
1018    pmaddwd         m3, m6
1019    punpckhwd       m4, m4
1020    pmaddwd         m4, m6
1021    paddd           m0, m3
1022    paddd           m4, m2
1023    mova            m2, [t3+xq*2+16]
1024    paddw           m2, [t1+xq*2+16]
1025    psrad           m0, 11
1026    psrad           m4, 11
1027    packssdw        m0, m4
1028%if ARCH_X86_64
1029    mova            m3, [t2+xq*2+16]
1030    paddw           m4, m1, [t4+xq*2+16]
1031%else
1032    paddw           m4, m1, [r2+xq*2+16]
1033    mov             r2, t2
1034    mova            m3, [r2+xq*2+16]
1035    mov           dstq, dstmp
1036%endif
1037    mova  [t0+xq*2+16], m1
1038    punpcklwd       m1, m2, m3
1039    pmaddwd         m1, m7
1040    punpckhwd       m2, m3
1041    pmaddwd         m2, m7
1042    punpcklwd       m3, m4, m4
1043    pmaddwd         m3, m6
1044    punpckhwd       m4, m4
1045    pmaddwd         m4, m6
1046    paddd           m1, m3
1047    paddd           m2, m4
1048    psrad           m1, 11
1049    psrad           m2, 11
1050    packssdw        m1, m2
1051    packuswb        m0, m1
1052    mova     [dstq+xq], m0
1053    add             xq, 16
1054    jl .hv_loop
1055    add           dstq, dst_strideq
1056    mov             t4, t3
1057    mov             t3, t2
1058    mov             t2, t1
1059    mov             t1, t0
1060    mov             t0, t4
1061    movifnidn    dstmp, dstq
1062    ret
1063%if cpuflag(ssse3)
1064.v:
1065    mov             xq, wq
1066.v_loop:
1067    mova            m3, [t1+xq*2]
1068    paddw           m1, m3, [t3+xq*2]
1069%if ARCH_X86_64
1070    mova            m2, [t2+xq*2]
1071    paddw           m3, [t4+xq*2]
1072%else
1073    mov             r2, t2
1074    mova            m2, [r2+xq*2]
1075    mov             r2, t4
1076    paddw           m3, [r2+xq*2]
1077%endif
1078    punpcklwd       m0, m1, m2
1079    pmaddwd         m0, m7
1080    punpckhwd       m1, m2
1081    pmaddwd         m1, m7
1082    punpcklwd       m2, m3
1083    pmaddwd         m2, m6
1084    punpckhwd       m3, m3
1085    pmaddwd         m3, m6
1086    paddd           m0, m2
1087    paddd           m1, m3
1088    mova            m4, [t1+xq*2+16]
1089    paddw           m2, m4, [t3+xq*2+16]
1090%if ARCH_X86_64
1091    mova            m3, [t2+xq*2+16]
1092    paddw           m4, [t4+xq*2+16]
1093%else
1094    paddw           m4, [r2+xq*2+16]
1095    mov             r2, t2
1096    mova            m3, [r2+xq*2+16]
1097    mov           dstq, dstmp
1098%endif
1099    psrad           m0, 11
1100    psrad           m1, 11
1101    packssdw        m0, m1
1102    punpcklwd       m1, m2, m3
1103    pmaddwd         m1, m7
1104    punpckhwd       m2, m3
1105    pmaddwd         m2, m7
1106    punpcklwd       m3, m4
1107    pmaddwd         m3, m6
1108    punpckhwd       m4, m4
1109    pmaddwd         m4, m6
1110    paddd           m1, m3
1111    paddd           m2, m4
1112    psrad           m1, 11
1113    psrad           m2, 11
1114    packssdw        m1, m2
1115    packuswb        m0, m1
1116    mova     [dstq+xq], m0
1117    add             xq, 16
1118    jl .v_loop
1119    ret
1120%endif
1121%endmacro
1122
1123INIT_XMM sse2
1124WIENER
1125
1126INIT_XMM ssse3
1127WIENER
1128
1129;;;;;;;;;;;;;;;;;;;;;;;;;;
1130;;      self-guided     ;;
1131;;;;;;;;;;;;;;;;;;;;;;;;;;
1132
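; MULLD: per-dword 32-bit multiply of %1 by %2 emulated with pmullw/pmulhuw
; (pmulld is SSE4.1). The result is the correct low 32 bits provided the
; multiplier has the same 16-bit value duplicated in both halves of every
; dword, which is how it is used below (the broadcast strength and the
; pb_unpcklwdw-shuffled x values).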
1133%macro MULLD 2
1134    pmulhuw       m5, %1, %2
1135    pmullw        %1, %2
1136    pslld         m5, 16
1137    paddd         %1, m5
1138%endmacro
1139
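; GATHERDD: gather four 32-bit loads from the sgr_x_by_x table using the
; low word of each dword of %2 as the index (no SIMD gather exists before
; AVX2). Two loads go into %1 and two into m5 (pre-initialized from m7) via
; movd/pextrw/pinsrw, and the halves are merged with por; on x86-32 the
; table is addressed through PIC_sym.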
1140%macro GATHERDD 2
1141    mova          m5, m7
1142    movd         r6d, %2
1143 %if ARCH_X86_64
1144    movd          %1, [r5+r6]
1145    pextrw       r6d, %2, 2
1146    pinsrw        m5, [r5+r6+2], 3
1147    pextrw       r6d, %2, 4
1148    pinsrw        %1, [r5+r6+2], 5
1149    pextrw       r6d, %2, 6
1150    pinsrw        m5, [r5+r6+2], 7
1151 %else
1152    movd          %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
1153    pextrw       r6d, %2, 2
1154    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
1155    pextrw       r6d, %2, 4
1156    pinsrw        %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
1157    pextrw       r6d, %2, 6
1158    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
1159 %endif
1160    por           %1, m5
1161%endmacro
1162
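; sgr_box3_h: horizontal pass of the 3x3 box sums used by the self-guided
; filter. For each position it writes the sum of three horizontally
; adjacent pixels (16-bit, to sum) and the sum of their squares (32-bit,
; to sumsq), with the left edge taken from the left buffer and the right
; edge padded by replicating the last pixel.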
1163%if ARCH_X86_64
1164cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
1165    mov        xlimd, edgem
1166    movifnidn     xd, xm
1167    mov           hd, hm
1168    mov        edged, xlimd
1169    and        xlimd, 2                             ; have_right
1170    add           xd, xlimd
1171    xor        xlimd, 2                             ; 2*!have_right
1172%else
1173cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
1174 %define wq     r0m
1175 %define xlimd  r1m
1176 %define hd     hmp
1177 %define edgeb  byte edgem
1178
1179    mov           r6, edgem
1180    and           r6, 2                             ; have_right
1181    add           xd, r6
1182    xor           r6, 2                             ; 2*!have_right
1183    mov        xlimd, r6
1184    SETUP_PIC     r6, 0
1185%endif
1186
1187    jnz .no_right
1188    add           xd, 7
1189    and           xd, ~7
1190.no_right:
1191    pxor          m1, m1
1192    lea         srcq, [srcq+xq]
1193    lea         sumq, [sumq+xq*2-2]
1194    lea       sumsqq, [sumsqq+xq*4-4]
1195    neg           xq
1196    mov           wq, xq
1197%if ARCH_X86_64
1198    lea          r10, [pb_right_ext_mask+24]
1199%endif
1200.loop_y:
1201    mov           xq, wq
1202
1203    ; load left
1204    test       edgeb, 1                             ; have_left
1205    jz .no_left
1206    test       leftq, leftq
1207    jz .load_left_from_main
1208    movd          m0, [leftq]
1209    pslldq        m0, 12
1210    add        leftq, 4
1211    jmp .expand_x
1212.no_left:
1213    movd          m0, [srcq+xq]
1214    pshufb        m0, [PIC_sym(pb_0)]
1215    jmp .expand_x
1216.load_left_from_main:
1217    movd          m0, [srcq+xq-2]
1218    pslldq        m0, 14
1219.expand_x:
1220    punpckhbw    xm0, xm1
1221
1222    ; when we reach this, m0 contains left two px in highest words
1223    cmp           xd, -8
1224    jle .loop_x
1225.partial_load_and_extend:
1226    movd          m3, [srcq-4]
1227    pshufb        m3, [PIC_sym(pb_3)]
1228    movq          m2, [srcq+xq]
1229    punpcklbw     m2, m1
1230    punpcklbw     m3, m1
1231%if ARCH_X86_64
1232    movu          m4, [r10+xq*2]
1233%else
1234    movu          m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
1235%endif
1236    pand          m2, m4
1237    pandn         m4, m3
1238    por           m2, m4
1239    jmp .loop_x_noload
1240.right_extend:
1241    pshufb        m2, m0, [PIC_sym(pb_14_15)]
1242    jmp .loop_x_noload
1243
1244.loop_x:
1245    movq          m2, [srcq+xq]
1246    punpcklbw     m2, m1
1247.loop_x_noload:
1248    palignr       m3, m2, m0, 12
1249    palignr       m4, m2, m0, 14
1250
1251    punpcklwd     m5, m3, m2
1252    punpckhwd     m6, m3, m2
1253    paddw         m3, m4
1254    punpcklwd     m7, m4, m1
1255    punpckhwd     m4, m1
1256    pmaddwd       m5, m5
1257    pmaddwd       m6, m6
1258    pmaddwd       m7, m7
1259    pmaddwd       m4, m4
1260    paddd         m5, m7
1261    paddd         m6, m4
1262    paddw         m3, m2
1263    movu [sumq+xq*2], m3
1264    movu [sumsqq+xq*4+ 0], m5
1265    movu [sumsqq+xq*4+16], m6
1266
1267    mova          m0, m2
1268    add           xq, 8
1269
1270    ; if x <= -8 we can reload more pixels
1271    ; else if x < 0 we reload and extend (this implies have_right=0)
1272    ; else if x < xlimd we extend from previous load (this implies have_right=0)
1273    ; else we are done
1274
1275    cmp           xd, -8
1276    jle .loop_x
1277    test          xd, xd
1278    jl .partial_load_and_extend
1279    cmp           xd, xlimd
1280    jl .right_extend
1281
1282    add       sumsqq, (384+16)*4
1283    add         sumq, (384+16)*2
1284    add         srcq, strideq
1285    dec           hd
1286    jg .loop_y
1287    RET
1288
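; sgr_box3_v: vertical pass of the 3x3 box sums. For each 8-wide column
; strip it walks down the image adding three vertically adjacent rows of
; the horizontal sums produced above (both sum and sumsq), with the edge
; rows reused in place of missing neighbours when have_top/have_bottom
; are unset.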
1289%if ARCH_X86_64
1290cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
1291    movifnidn  edged, edgem
1292%else
1293cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
1294 %define sumsq_baseq dword [esp+0]
1295 %define sum_baseq   dword [esp+4]
1296 %define ylimd       dword [esp+8]
1297 %define m8          [esp+12]
1298    mov        edged, r4m
1299    mov           hd, r3m
1300%endif
1301    mov           xq, -2
1302%if ARCH_X86_64
1303    mov        ylimd, edged
1304    and        ylimd, 8                             ; have_bottom
1305    shr        ylimd, 2
1306    sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
1307    mov  sumsq_baseq, sumsqq
1308    mov    sum_baseq, sumq
1309.loop_x:
1310    mov       sumsqq, sumsq_baseq
1311    mov         sumq, sum_baseq
1312    lea           yd, [hq+ylimq+2]
1313%else
1314    mov           yd, edged
1315    and           yd, 8                             ; have_bottom
1316    shr           yd, 2
1317    sub           yd, 2                             ; -2 if have_bottom=0, else 0
1318    mov  sumsq_baseq, sumsqq
1319    mov    sum_baseq, sumq
1320    mov        ylimd, yd
1321.loop_x:
1322    mov       sumsqd, sumsq_baseq
1323    mov         sumd, sum_baseq
1324    lea           yd, [hq+2]
1325    add           yd, ylimd
1326%endif
1327    lea       sumsqq, [sumsqq+xq*4+4-(384+16)*4]
1328    lea         sumq, [sumq+xq*2+2-(384+16)*2]
1329    test       edgeb, 4                             ; have_top
1330    jnz .load_top
1331    movu          m0, [sumsqq+(384+16)*4*1]
1332    movu          m1, [sumsqq+(384+16)*4*1+16]
1333    mova          m2, m0
1334    mova          m3, m1
1335    mova          m4, m0
1336    mova          m5, m1
1337    movu          m6, [sumq+(384+16)*2*1]
1338    mova          m7, m6
1339    mova          m8, m6
1340    jmp .loop_y_noload
1341.load_top:
1342    movu          m0, [sumsqq-(384+16)*4*1]      ; l2sq [left]
1343    movu          m1, [sumsqq-(384+16)*4*1+16]   ; l2sq [right]
1344    movu          m2, [sumsqq-(384+16)*4*0]      ; l1sq [left]
1345    movu          m3, [sumsqq-(384+16)*4*0+16]   ; l1sq [right]
1346    movu          m6, [sumq-(384+16)*2*1]        ; l2
1347    movu          m7, [sumq-(384+16)*2*0]        ; l1
1348.loop_y:
1349%if ARCH_X86_64
1350    movu          m8, [sumq+(384+16)*2*1]        ; l0
1351%else
1352    movu          m4, [sumq+(384+16)*2*1]        ; l0
1353    mova          m8, m4
1354%endif
1355    movu          m4, [sumsqq+(384+16)*4*1]      ; l0sq [left]
1356    movu          m5, [sumsqq+(384+16)*4*1+16]   ; l0sq [right]
1357.loop_y_noload:
1358    paddd         m0, m2
1359    paddd         m1, m3
1360    paddw         m6, m7
1361    paddd         m0, m4
1362    paddd         m1, m5
1363    paddw         m6, m8
1364    movu [sumsqq+ 0], m0
1365    movu [sumsqq+16], m1
1366    movu      [sumq], m6
1367
1368    ; shift position down by one
1369    mova          m0, m2
1370    mova          m1, m3
1371    mova          m2, m4
1372    mova          m3, m5
1373    mova          m6, m7
1374    mova          m7, m8
1375    add       sumsqq, (384+16)*4
1376    add         sumq, (384+16)*2
1377    dec           yd
1378    jg .loop_y
1379    cmp           yd, ylimd
1380    jg .loop_y_noload
1381    add           xd, 8
1382    cmp           xd, wd
1383    jl .loop_x
1384    RET
1385
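; sgr_calc_ab1: turn the box sums (a = sum of squares, b = sum) into the
; radius-1 surface parameters. Per pixel this computes, roughly,
;   p = a*9 - b*b,  z = (p*s + (1 << 19)) >> 20 (clamped),
;   x = sgr_x_by_x[z],
; then stores 256 - x back into b and (x*b*455 + 2048) >> 12 back into a.
; The paddusw with pd_0xF00801C7 folds the rounding and clamping of z into
; the biased table address used by GATHERDD, and pb_unpcklwdw + MULLD
; emulate the 32-bit multiplies.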
1386cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
1387    movifnidn     sd, sm
1388    sub           aq, (384+16-1)*4
1389    sub           bq, (384+16-1)*2
1390    add           hd, 2
1391%if ARCH_X86_64
1392    LEA           r5, sgr_x_by_x-0xF03
1393%else
1394    SETUP_PIC r5, 0
1395%endif
1396    movd          m6, sd
1397    pshuflw       m6, m6, q0000
1398    punpcklqdq    m6, m6
1399    pxor          m7, m7
1400    DEFINE_ARGS a, b, w, h, x
1401%if ARCH_X86_64
1402    mova          m8, [pd_0xF00801C7]
1403    mova          m9, [pw_256]
1404    psrld        m10, m9, 13                        ; pd_2048
1405    mova         m11, [pb_unpcklwdw]
1406%else
1407 %define m8     [PIC_sym(pd_0xF00801C7)]
1408 %define m9     [PIC_sym(pw_256)]
1409 %define m10    [PIC_sym(pd_2048)]
1410 %define m11    [PIC_sym(pb_unpcklwdw)]
1411%endif
1412.loop_y:
1413    mov           xq, -2
1414.loop_x:
1415    movq          m0, [bq+xq*2]
1416    movq          m1, [bq+xq*2+(384+16)*2]
1417    punpcklwd     m0, m7
1418    punpcklwd     m1, m7
1419    movu          m2, [aq+xq*4]
1420    movu          m3, [aq+xq*4+(384+16)*4]
1421    pslld         m4, m2, 3
1422    pslld         m5, m3, 3
1423    paddd         m2, m4                            ; aa * 9
1424    paddd         m3, m5
1425    pmaddwd       m4, m0, m0
1426    pmaddwd       m5, m1, m1
1427    pmaddwd       m0, m8
1428    pmaddwd       m1, m8
1429    psubd         m2, m4                            ; p = aa * 9 - bb * bb
1430    psubd         m3, m5
1431    MULLD         m2, m6
1432    MULLD         m3, m6
1433    paddusw       m2, m8
1434    paddusw       m3, m8
1435    psrld         m2, 20                            ; z
1436    psrld         m3, 20
1437    GATHERDD      m4, m2                            ; xx
1438    GATHERDD      m2, m3
1439    psrld         m4, 24
1440    psrld         m2, 24
1441    packssdw      m3, m4, m2
1442    pshufb        m4, m11
1443    MULLD         m0, m4
1444    pshufb        m2, m11
1445    MULLD         m1, m2
1446    psubw         m5, m9, m3
1447    paddd         m0, m10
1448    paddd         m1, m10
1449    psrld         m0, 12
1450    psrld         m1, 12
1451    movq   [bq+xq*2], m5
1452    psrldq        m5, 8
1453    movq [bq+xq*2+(384+16)*2], m5
1454    movu   [aq+xq*4], m0
1455    movu [aq+xq*4+(384+16)*4], m1
1456    add           xd, 4
1457    cmp           xd, wd
1458    jl .loop_x
1459    add           aq, (384+16)*4*2
1460    add           bq, (384+16)*2*2
1461    sub           hd, 2
1462    jg .loop_y
1463    RET
1464
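; sgr_finish_filter1: combine the a/b planes from sgr_calc_ab1 with the
; source pixels. Each output is built from 3x3 weighted sums of a and b
; (weight 4 for the centre cross, 3 for the corners, assembled from the
; top/ctr/bottom and corner partial sums below) and is stored to t as
; roughly (b_sum*src + a_sum + 256) >> 9 in 16-bit form.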
1465%if ARCH_X86_64
1466cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
1467                                            tmp_base, src_base, a_base, b_base, x, y
1468    movifnidn     wd, wm
1469    mov           hd, hm
1470    mova         m15, [pw_16]
1471    mov    tmp_baseq, tq
1472    mov    src_baseq, srcq
1473    mov      a_baseq, aq
1474    mov      b_baseq, bq
1475    xor           xd, xd
1476%else
1477cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y
1478 %define tmp_baseq  [esp+8]
1479 %define src_baseq  [esp+12]
1480 %define a_baseq    [esp+16]
1481 %define b_baseq    [esp+20]
1482 %define wd         [esp+24]
1483 %define hd         [esp+28]
1484    mov    tmp_baseq, tq
1485    mov    src_baseq, srcq
1486    mov      a_baseq, aq
1487    mov      b_baseq, bq
1488    mov           wd, xd
1489    mov           hd, yd
1490    xor           xd, xd
1491    SETUP_PIC yd, 1, 1
1492    jmp .loop_start
1493%endif
1494
1495.loop_x:
1496    mov           tq, tmp_baseq
1497    mov         srcq, src_baseq
1498    mov           aq, a_baseq
1499    mov           bq, b_baseq
1500%if ARCH_X86_32
1501.loop_start:
1502    movu          m0, [bq+xq*2-(384+16)*2-2]
1503    movu          m2, [bq+xq*2-(384+16)*2+2]
1504    mova          m1, [bq+xq*2-(384+16)*2]          ; b:top
1505    paddw         m0, m2                            ; b:tl+tr
1506    movu          m2, [bq+xq*2-2]
1507    movu          m3, [bq+xq*2+2]
1508    paddw         m1, [bq+xq*2]                     ; b:top+ctr
1509    paddw         m2, m3                            ; b:l+r
1510    mova  [esp+0x80], m0
1511    mova  [esp+0x70], m1
1512    mova  [esp+0x60], m2
1513%endif
1514    movu          m0, [aq+xq*4-(384+16)*4-4]
1515    movu          m2, [aq+xq*4-(384+16)*4+4]
1516    mova          m1, [aq+xq*4-(384+16)*4]          ; a:top [first half]
1517    paddd         m0, m2                            ; a:tl+tr [first half]
1518    movu          m2, [aq+xq*4-(384+16)*4-4+16]
1519    movu          m4, [aq+xq*4-(384+16)*4+4+16]
1520    mova          m3, [aq+xq*4-(384+16)*4+16]       ; a:top [second half]
1521    paddd         m2, m4                            ; a:tl+tr [second half]
1522    movu          m4, [aq+xq*4-4]
1523    movu          m5, [aq+xq*4+4]
1524    paddd         m1, [aq+xq*4]                     ; a:top+ctr [first half]
1525    paddd         m4, m5                            ; a:l+r [first half]
1526    movu          m5, [aq+xq*4+16-4]
1527    movu          m6, [aq+xq*4+16+4]
1528    paddd         m3, [aq+xq*4+16]                  ; a:top+ctr [second half]
1529    paddd         m5, m6                            ; a:l+r [second half]
1530%if ARCH_X86_64
1531    movu          m6, [bq+xq*2-(384+16)*2-2]
1532    movu          m8, [bq+xq*2-(384+16)*2+2]
1533    mova          m7, [bq+xq*2-(384+16)*2]          ; b:top
1534    paddw         m6, m8                            ; b:tl+tr
1535    movu          m8, [bq+xq*2-2]
1536    movu          m9, [bq+xq*2+2]
1537    paddw         m7, [bq+xq*2]                     ; b:top+ctr
1538    paddw         m8, m9                            ; b:l+r
1539%endif
1540
1541    lea           tq, [tq+xq*2]
1542    lea         srcq, [srcq+xq*1]
1543    lea           aq, [aq+xq*4+(384+16)*4]
1544    lea           bq, [bq+xq*2+(384+16)*2]
1545    mov           yd, hd
1546.loop_y:
1547%if ARCH_X86_64
1548    movu          m9, [bq-2]
1549    movu         m10, [bq+2]
1550    paddw         m7, [bq]                          ; b:top+ctr+bottom
1551    paddw         m9, m10                           ; b:bl+br
1552    paddw        m10, m7, m8                        ; b:top+ctr+bottom+l+r
1553    paddw         m6, m9                            ; b:tl+tr+bl+br
1554    psubw         m7, [bq-(384+16)*2*2]             ; b:ctr+bottom
1555    paddw        m10, m6
1556    psllw        m10, 2
1557    psubw        m10, m6                            ; aa
1558    pxor         m14, m14
1559    movq         m12, [srcq]
1560    punpcklbw    m12, m14
1561    punpcklwd     m6, m10, m15
1562    punpckhwd    m10, m15
1563    punpcklwd    m13, m12, m15
1564    punpckhwd    m12, m15
1565    pmaddwd       m6, m13                           ; aa*src[x]+256 [first half]
1566    pmaddwd      m10, m12                           ; aa*src[x]+256 [second half]
1567%else
1568    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
1569    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
1570    mova  [esp+0x50], m1
1571    mova  [esp+0x40], m3
1572    mova  [esp+0x30], m4
1573    movu          m6, [aq-4]
1574    movu          m7, [aq+4]
1575    paddd         m1, m4                            ; a:top+ctr+bottom+l+r [first half]
1576    paddd         m3, m5                            ; a:top+ctr+bottom+l+r [second half]
1577    paddd         m6, m7                            ; a:bl+br [first half]
1578    movu          m7, [aq+16-4]
1579    movu          m4, [aq+16+4]
1580    paddd         m7, m4                            ; a:bl+br [second half]
1581    paddd         m0, m6                            ; a:tl+tr+bl+br [first half]
1582    paddd         m2, m7                            ; a:tl+tr+bl+br [second half]
1583    paddd         m1, m0
1584    paddd         m3, m2
1585    pslld         m1, 2
1586    pslld         m3, 2
1587    psubd         m1, m0                            ; bb [first half]
1588    psubd         m3, m2                            ; bb [second half]
1589%endif
1590
1591%if ARCH_X86_64
1592    movu         m11, [aq-4]
1593    movu         m12, [aq+4]
1594    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
1595    paddd        m11, m12                           ; a:bl+br [first half]
1596    movu         m12, [aq+16-4]
1597    movu         m13, [aq+16+4]
1598    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
1599    paddd        m12, m13                           ; a:bl+br [second half]
1600    paddd        m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
1601    paddd        m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
1602    paddd         m0, m11                           ; a:tl+tr+bl+br [first half]
1603    paddd         m2, m12                           ; a:tl+tr+bl+br [second half]
1604    paddd        m13, m0
1605    paddd        m14, m2
1606    pslld        m13, 2
1607    pslld        m14, 2
1608    psubd        m13, m0                            ; bb [first half]
1609    psubd        m14, m2                            ; bb [second half]
1610    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
1611    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
1612%else
1613    mova          m4, [esp+0x80]
1614    mova  [esp+0x80], m5
1615    mova          m5, [esp+0x70]
1616    mova  [esp+0x70], m6
1617    mova          m6, [esp+0x60]
1618    mova  [esp+0x60], m7
1619    mova  [esp+0x20], m1
1620    movu          m7, [bq-2]
1621    movu          m1, [bq+2]
1622    paddw         m5, [bq]                          ; b:top+ctr+bottom
1623    paddw         m7, m1
1624    paddw         m1, m5, m6                        ; b:top+ctr+bottom+l+r
1625    paddw         m4, m7                            ; b:tl+tr+bl+br
1626    psubw         m5, [bq-(384+16)*2*2]             ; b:ctr+bottom
1627    paddw         m1, m4
1628    psllw         m1, 2
1629    psubw         m1, m4                            ; aa
1630    movq          m0, [srcq]
1631    XCHG_PIC_REG
1632    punpcklbw     m0, [PIC_sym(pb_0)]
1633    punpcklwd     m4, m1, [PIC_sym(pw_16)]
1634    punpckhwd     m1, [PIC_sym(pw_16)]
1635    punpcklwd     m2, m0, [PIC_sym(pw_16)]
1636    punpckhwd     m0, [PIC_sym(pw_16)]
1637    XCHG_PIC_REG
1638    pmaddwd       m4, m2                            ; aa*src[x]+256 [first half]
1639    pmaddwd       m1, m0                            ; aa*src[x]+256 [second half]
1640%endif
1641
1642%if ARCH_X86_64
1643    paddd         m6, m13
1644    paddd        m10, m14
1645    psrad         m6, 9
1646    psrad        m10, 9
1647    packssdw      m6, m10
1648    mova        [tq], m6
1649%else
1650    paddd         m4, [esp+0x20]
1651    paddd         m1, m3
1652    psrad         m4, 9
1653    psrad         m1, 9
1654    packssdw      m4, m1
1655    mova        [tq], m4
1656%endif
1657
1658    ; shift to next row
1659%if ARCH_X86_64
1660    mova          m0, m4
1661    mova          m2, m5
1662    mova          m4, m11
1663    mova          m5, m12
1664    mova          m6, m8
1665    mova          m8, m9
1666%else
1667    mova          m1, [esp+0x50]
1668    mova          m3, [esp+0x40]
1669    mova          m0, [esp+0x30]
1670    mova          m2, [esp+0x80]
1671    mova          m4, [esp+0x70]
1672    mova  [esp+0x70], m5
1673    mova          m5, [esp+0x60]
1674    mova  [esp+0x80], m6
1675    mova  [esp+0x60], m7
1676    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
1677    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
1678%endif
1679
1680    add         srcq, strideq
1681    add           aq, (384+16)*4
1682    add           bq, (384+16)*2
1683    add           tq, 384*2
1684    dec           yd
1685    jg .loop_y
1686    add           xd, 8
1687    cmp           xd, wd
1688    jl .loop_x
1689    RET
1690
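; sgr_weighted1: blend the filtered plane t back into dst using the sgr
; weight wt. t effectively carries the filtered pixel scaled by 16, so per
; pixel this computes roughly dst + ((t - dst*16)*wt + 1024) >> 11, done
; with pmulhrsw against wt pre-shifted left by 4.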
1691cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt
1692    movifnidn     hd, hm
1693%if ARCH_X86_32
1694    SETUP_PIC r6, 0
1695%endif
1696    movd          m0, wtm
1697    pshufb        m0, [PIC_sym(pb_0_1)]
1698    psllw         m0, 4
1699    pxor          m7, m7
1700    DEFINE_ARGS dst, stride, t, w, h, idx
1701.loop_y:
1702    xor         idxd, idxd
1703.loop_x:
1704    mova          m1, [tq+idxq*2+ 0]
1705    mova          m4, [tq+idxq*2+16]
1706    mova          m5, [dstq+idxq]
1707    punpcklbw     m2, m5, m7
1708    punpckhbw     m5, m7
1709    psllw         m3, m2, 4
1710    psllw         m6, m5, 4
1711    psubw         m1, m3
1712    psubw         m4, m6
1713    pmulhrsw      m1, m0
1714    pmulhrsw      m4, m0
1715    paddw         m1, m2
1716    paddw         m4, m5
1717    packuswb      m1, m4
1718    mova [dstq+idxq], m1
1719    add         idxd, 16
1720    cmp         idxd, wd
1721    jl .loop_x
1722    add         dstq, strideq
1723    add           tq, 384 * 2
1724    dec           hd
1725    jg .loop_y
1726    RET
1727
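; sgr_box5_h: horizontal pass of the 5x5 box sums (the radius-2 self-guided
; pass); same idea as sgr_box3_h, but summing five horizontally adjacent
; pixels and their squares per position.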
1728%if ARCH_X86_64
1729cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
1730    mov        edged, edgem
1731    movifnidn     wd, wm
1732    mov           hd, hm
1733    mova         m10, [pb_0]
1734    mova         m11, [pb_0_1]
1735%else
1736cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
1737 %define edgeb      byte edgem
1738 %define wd         xd
1739 %define wq         wd
1740 %define wm         r5m
1741 %define strideq    r4m
1742    SUB          esp, 8
1743    SETUP_PIC sumsqd, 1, 1
1744
1745 %define m10    [PIC_sym(pb_0)]
1746 %define m11    [PIC_sym(pb_0_1)]
1747%endif
1748
1749    test       edgeb, 2                             ; have_right
1750    jz .no_right
1751    xor        xlimd, xlimd
1752    add           wd, 2
1753    add           wd, 15
1754    and           wd, ~15
1755    jmp .right_done
1756.no_right:
1757    mov        xlimd, 3
1758    dec           wd
1759.right_done:
1760    pxor          m1, m1
1761    lea         srcq, [srcq+wq+1]
1762    lea         sumq, [sumq+wq*2-2]
1763    lea       sumsqq, [sumsqq+wq*4-4]
1764    neg           wq
1765%if ARCH_X86_64
1766    lea          r10, [pb_right_ext_mask+24]
1767%else
1768    mov           wm, xd
1769 %define wq wm
1770%endif
1771
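; outer loop over rows; within each row, x counts up from -w towards xlim
; (the src/sum/sumsq pointers were pre-advanced by w), 8 pixels per step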
1772.loop_y:
1773    mov           xq, wq
1774    ; load left
1775    test       edgeb, 1                             ; have_left
1776    jz .no_left
1777    test       leftq, leftq
1778    jz .load_left_from_main
1779    movd          m0, [leftq]
1780    movd          m2, [srcq+xq-1]
1781    pslldq        m2, 4
1782    por           m0, m2
1783    pslldq        m0, 11
1784    add        leftq, 4
1785    jmp .expand_x
1786.no_left:
1787    movd          m0, [srcq+xq-1]
1788    XCHG_PIC_REG
1789    pshufb        m0, m10
1790    XCHG_PIC_REG
1791    jmp .expand_x
1792.load_left_from_main:
1793    movd          m0, [srcq+xq-4]
1794    pslldq        m0, 12
1795.expand_x:
1796    punpckhbw     m0, m1
1797
    ; when we reach this point, m0 contains the left two px in its highest words
1799    cmp           xd, -8
1800    jle .loop_x
1801    test          xd, xd
1802    jge .right_extend
1803.partial_load_and_extend:
1804    XCHG_PIC_REG
1805    movd          m3, [srcq-1]
1806    movq          m2, [srcq+xq]
1807    pshufb        m3, m10
1808    punpcklbw     m3, m1
1809    punpcklbw     m2, m1
1810%if ARCH_X86_64
1811    movu          m4, [r10+xq*2]
1812%else
1813    movu          m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
1814    XCHG_PIC_REG
1815%endif
1816    pand          m2, m4
1817    pandn         m4, m3
1818    por           m2, m4
1819    jmp .loop_x_noload
1820.right_extend:
1821    psrldq        m2, m0, 14
1822    XCHG_PIC_REG
1823    pshufb        m2, m11
1824    XCHG_PIC_REG
1825    jmp .loop_x_noload
1826
1827.loop_x:
1828    movq          m2, [srcq+xq]
1829    punpcklbw     m2, m1
1830.loop_x_noload:
1831    palignr       m3, m2, m0, 8
1832    palignr       m4, m2, m0, 10
1833    palignr       m5, m2, m0, 12
1834    palignr       m6, m2, m0, 14
1835
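    ; m3-m6 and m2 now hold the five shifted views of the row; adding them
    ; gives the 5-tap box sum, and pmaddwd on the word-interleaved pairs
    ; accumulates the matching sum of squares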
1836%if ARCH_X86_64
1837    paddw         m0, m3, m2
1838    punpcklwd     m7, m3, m2
1839    punpckhwd     m3, m2
1840    paddw         m0, m4
1841    punpcklwd     m8, m4, m5
1842    punpckhwd     m4, m5
1843    paddw         m0, m5
1844    punpcklwd     m9, m6, m1
1845    punpckhwd     m5, m6, m1
1846    paddw         m0, m6
1847    pmaddwd       m7, m7
1848    pmaddwd       m3, m3
1849    pmaddwd       m8, m8
1850    pmaddwd       m4, m4
1851    pmaddwd       m9, m9
1852    pmaddwd       m5, m5
1853    paddd         m7, m8
1854    paddd         m3, m4
1855    paddd         m7, m9
1856    paddd         m3, m5
1857    movu [sumq+xq*2], m0
1858    movu [sumsqq+xq*4+ 0], m7
1859    movu [sumsqq+xq*4+16], m3
1860%else
1861    paddw         m0, m3, m2
1862    paddw         m0, m4
1863    paddw         m0, m5
1864    paddw         m0, m6
1865    movu [sumq+xq*2], m0
1866    punpcklwd     m7, m3, m2
1867    punpckhwd     m3, m2
1868    punpcklwd     m0, m4, m5
1869    punpckhwd     m4, m5
1870    punpckhwd     m5, m6, m1
1871    pmaddwd       m7, m7
1872    pmaddwd       m3, m3
1873    pmaddwd       m0, m0
1874    pmaddwd       m4, m4
1875    pmaddwd       m5, m5
1876    paddd         m7, m0
1877    paddd         m3, m4
1878    paddd         m3, m5
1879    punpcklwd     m0, m6, m1
1880    pmaddwd       m0, m0
1881    paddd         m7, m0
1882    movu [sumsqq+xq*4+ 0], m7
1883    movu [sumsqq+xq*4+16], m3
1884%endif
1885
1886    mova          m0, m2
1887    add           xq, 8
1888
1889    ; if x <= -8 we can reload more pixels
1890    ; else if x < 0 we reload and extend (this implies have_right=0)
1891    ; else if x < xlimd we extend from previous load (this implies have_right=0)
1892    ; else we are done
1893
1894    cmp           xd, -8
1895    jle .loop_x
1896    test          xd, xd
1897    jl .partial_load_and_extend
1898    cmp           xd, xlimd
1899    jl .right_extend
1900
1901    add         srcq, strideq
1902    add       sumsqq, (384+16)*4
1903    add         sumq, (384+16)*2
1904    dec           hd
1905    jg .loop_y
1906%if ARCH_X86_32
1907    ADD          esp, 8
1908%endif
1909    RET
1910
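; Vertical pass of the 5x5 SGR box sums: each iteration adds five rows of
; the horizontal sums/sums-of-squares and writes the result back in place,
; advancing two rows at a time (coefficients are only needed on every other
; row for the 5x5 filter). have_top/have_bottom control how the edge rows
; are replicated.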
1911%if ARCH_X86_64
1912cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
1913    movifnidn  edged, edgem
1914    mov        ylimd, edged
1915%else
1916cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
1917 %define wm     [esp+0]
1918 %define hm     [esp+4]
1919 %define edgem  [esp+8]
1920    mov           wm, xd
1921    mov           hm, yd
1922    mov        edgem, ylimd
1923%endif
1924
1925    and        ylimd, 8                             ; have_bottom
1926    shr        ylimd, 2
1927    sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
1928    mov           xq, -2
1929%if ARCH_X86_64
1930.loop_x:
1931    lea           yd, [hd+ylimd+2]
1932    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
1933    lea     sum_ptrq, [  sumq+xq*2+2-(384+16)*2]
1934    test       edgeb, 4                             ; have_top
1935    jnz .load_top
1936    movu          m0, [sumsq_ptrq+(384+16)*4*1]
1937    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
1938    mova          m2, m0
1939    mova          m3, m1
1940    mova          m4, m0
1941    mova          m5, m1
1942    mova          m6, m0
1943    mova          m7, m1
1944    movu         m10, [sum_ptrq+(384+16)*2*1]
1945    mova         m11, m10
1946    mova         m12, m10
1947    mova         m13, m10
1948    jmp .loop_y_second_load
1949.load_top:
1950    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
1951    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
1952    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
1953    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
1954    mova          m2, m0
1955    mova          m3, m1
1956    movu         m10, [sum_ptrq-(384+16)*2*1]        ; l3/4
1957    movu         m12, [sum_ptrq-(384+16)*2*0]        ; l2
1958    mova         m11, m10
1959.loop_y:
1960    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
1961    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
1962    movu         m13, [sum_ptrq+(384+16)*2*1]        ; l1
1963.loop_y_second_load:
1964    test          yd, yd
1965    jle .emulate_second_load
1966    movu          m8, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
1967    movu          m9, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
1968    movu         m14, [sum_ptrq+(384+16)*2*2]        ; l0
1969.loop_y_noload:
1970    paddd         m0, m2
1971    paddd         m1, m3
1972    paddw        m10, m11
1973    paddd         m0, m4
1974    paddd         m1, m5
1975    paddw        m10, m12
1976    paddd         m0, m6
1977    paddd         m1, m7
1978    paddw        m10, m13
1979    paddd         m0, m8
1980    paddd         m1, m9
1981    paddw        m10, m14
1982    movu [sumsq_ptrq+ 0], m0
1983    movu [sumsq_ptrq+16], m1
1984    movu  [sum_ptrq], m10
1985
1986    ; shift position down by one
1987    mova          m0, m4
1988    mova          m1, m5
1989    mova          m2, m6
1990    mova          m3, m7
1991    mova          m4, m8
1992    mova          m5, m9
1993    mova         m10, m12
1994    mova         m11, m13
1995    mova         m12, m14
1996    add   sumsq_ptrq, (384+16)*4*2
1997    add     sum_ptrq, (384+16)*2*2
1998    sub           yd, 2
1999    jge .loop_y
2000    ; l1 = l0
2001    mova          m6, m8
2002    mova          m7, m9
2003    mova         m13, m14
2004    cmp           yd, ylimd
2005    jg .loop_y_noload
2006    add           xd, 8
2007    cmp           xd, wd
2008    jl .loop_x
2009    RET
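; no more rows to load below: duplicate the last loaded row (l1 -> l0)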
2010.emulate_second_load:
2011    mova          m8, m6
2012    mova          m9, m7
2013    mova         m14, m13
2014    jmp .loop_y_noload
2015%else
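; On x86-32 the vertical pass is split into two separate sweeps (sumsq
; first, then sum) so that the accumulators fit in the eight XMM registers.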
2016.sumsq_loop_x:
2017    lea           yd, [ylimd+2]
2018    add           yd, hm
2019    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
2020    test  byte edgem, 4                             ; have_top
2021    jnz .sumsq_load_top
2022    movu          m0, [sumsq_ptrq+(384+16)*4*1]
2023    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
2024    mova          m4, m0
2025    mova          m5, m1
2026    mova          m6, m0
2027    mova          m7, m1
2028    mova  [esp+0x1c], m0
2029    mova  [esp+0x0c], m1
2030    jmp .sumsq_loop_y_second_load
2031.sumsq_load_top:
2032    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
2033    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
2034    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
2035    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
2036    mova  [esp+0x1c], m0
2037    mova  [esp+0x0c], m1
2038.sumsq_loop_y:
2039    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
2040    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
2041.sumsq_loop_y_second_load:
2042    test          yd, yd
2043    jle .sumsq_emulate_second_load
2044    movu          m2, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
2045    movu          m3, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
2046.sumsq_loop_y_noload:
2047    paddd         m0, [esp+0x1c]
2048    paddd         m1, [esp+0x0c]
2049    paddd         m0, m4
2050    paddd         m1, m5
2051    paddd         m0, m6
2052    paddd         m1, m7
2053    paddd         m0, m2
2054    paddd         m1, m3
2055    movu [sumsq_ptrq+ 0], m0
2056    movu [sumsq_ptrq+16], m1
2057
2058    ; shift position down by one
2059    mova          m0, m4
2060    mova          m1, m5
2061    mova          m4, m2
2062    mova          m5, m3
2063    mova  [esp+0x1c], m6
2064    mova  [esp+0x0c], m7
2065    add   sumsq_ptrq, (384+16)*4*2
2066    sub           yd, 2
2067    jge .sumsq_loop_y
2068    ; l1 = l0
2069    mova          m6, m2
2070    mova          m7, m3
2071    cmp           yd, ylimd
2072    jg .sumsq_loop_y_noload
2073    add           xd, 8
2074    cmp           xd, wm
2075    jl .sumsq_loop_x
2076
2077    mov           xd, -2
2078.sum_loop_x:
2079    lea           yd, [ylimd+2]
2080    add           yd, hm
2081    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
2082    test  byte edgem, 4                             ; have_top
2083    jnz .sum_load_top
2084    movu          m0, [sum_ptrq+(384+16)*2*1]
2085    mova          m1, m0
2086    mova          m2, m0
2087    mova          m3, m0
2088    jmp .sum_loop_y_second_load
2089.sum_load_top:
2090    movu          m0, [sum_ptrq-(384+16)*2*1]        ; l3/4
2091    movu          m2, [sum_ptrq-(384+16)*2*0]        ; l2
2092    mova          m1, m0
2093.sum_loop_y:
2094    movu          m3, [sum_ptrq+(384+16)*2*1]        ; l1
2095.sum_loop_y_second_load:
2096    test          yd, yd
2097    jle .sum_emulate_second_load
2098    movu          m4, [sum_ptrq+(384+16)*2*2]        ; l0
2099.sum_loop_y_noload:
2100    paddw         m0, m1
2101    paddw         m0, m2
2102    paddw         m0, m3
2103    paddw         m0, m4
2104    movu  [sum_ptrq], m0
2105
2106    ; shift position down by one
2107    mova          m0, m2
2108    mova          m1, m3
2109    mova          m2, m4
2110    add     sum_ptrq, (384+16)*2*2
2111    sub           yd, 2
2112    jge .sum_loop_y
2113    ; l1 = l0
2114    mova          m3, m4
2115    cmp           yd, ylimd
2116    jg .sum_loop_y_noload
2117    add           xd, 8
2118    cmp           xd, wm
2119    jl .sum_loop_x
2120    RET
2121.sumsq_emulate_second_load:
2122    mova          m2, m6
2123    mova          m3, m7
2124    jmp .sumsq_loop_y_noload
2125.sum_emulate_second_load:
2126    mova          m4, m3
2127    jmp .sum_loop_y_noload
2128%endif
2129
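; Turn the box sums into the SGR coefficients for the 5x5 filter: with a
; holding the sums of squares and b the sums, p = 25*a - b*b, a saturating
; multiply/shift of p*s gives the table index z, x = sgr_x_by_x[z], and the
; a/b planes are rewritten with the final coefficients (see the comments in
; the loop). Only every other row is processed, since coefficients are not
; needed on odd rows for the 5x5 filter.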
2130cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s
2131    movifnidn     sd, sm
2132    sub           aq, (384+16-1)*4
2133    sub           bq, (384+16-1)*2
2134    add           hd, 2
2135%if ARCH_X86_64
2136    LEA           r5, sgr_x_by_x-0xF03
2137%else
2138    SETUP_PIC r5, 0
2139%endif
2140    movd          m6, sd
2141    pshuflw       m6, m6, q0000
2142    punpcklqdq    m6, m6
2143    pxor          m7, m7
2144    DEFINE_ARGS a, b, w, h, x
2145%if ARCH_X86_64
2146    mova          m8, [pd_0xF0080029]
2147    mova          m9, [pw_256]
2148    psrld        m10, m9, 15                        ; pd_512
2149%else
2150 %define m8     [PIC_sym(pd_0xF0080029)]
2151 %define m9     [PIC_sym(pw_256)]
2152 %define m10    [PIC_sym(pd_512)]
2153%endif
2154.loop_y:
2155    mov           xq, -2
2156.loop_x:
2157    movq          m0, [bq+xq*2+0]
2158    movq          m1, [bq+xq*2+8]
2159    punpcklwd     m0, m7
2160    punpcklwd     m1, m7
2161    movu          m2, [aq+xq*4+ 0]
2162    movu          m3, [aq+xq*4+16]
2163    pslld         m4, m2, 3                         ; aa * 8
2164    pslld         m5, m3, 3
2165    paddd         m2, m4                            ; aa * 9
2166    paddd         m3, m5
2167    paddd         m4, m4                            ; aa * 16
2168    paddd         m5, m5
2169    paddd         m2, m4                            ; aa * 25
2170    paddd         m3, m5
2171    pmaddwd       m4, m0, m0
2172    pmaddwd       m5, m1, m1
2173    psubd         m2, m4                            ; p = aa * 25 - bb * bb
2174    psubd         m3, m5
2175    MULLD         m2, m6
2176    MULLD         m3, m6
2177    paddusw       m2, m8
2178    paddusw       m3, m8
2179    psrld         m2, 20                            ; z
2180    psrld         m3, 20
2181    GATHERDD      m4, m2                            ; xx
2182    GATHERDD      m2, m3
2183    psrld         m4, 24
2184    psrld         m2, 24
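    ; from here: m3 = x packed to words, the stored b' = 256 - x, and the
    ; stored a' = (b * x * 0x29 + 512) >> 10, where 0x29 is the low word of
    ; the 0xF0080029 constant doing double duty as the multiplier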
2185    packssdw      m3, m4, m2
2186    pmullw        m4, m8
2187    pmullw        m2, m8
2188    psubw         m5, m9, m3
2189    pmaddwd       m0, m4
2190    pmaddwd       m1, m2
2191    paddd         m0, m10
2192    paddd         m1, m10
2193    psrld         m0, 10
2194    psrld         m1, 10
2195    movu   [bq+xq*2], m5
2196    movu [aq+xq*4+ 0], m0
2197    movu [aq+xq*4+16], m1
2198    add           xd, 8
2199    cmp           xd, wd
2200    jl .loop_x
2201    add           aq, (384+16)*4*2
2202    add           bq, (384+16)*2*2
2203    sub           hd, 2
2204    jg .loop_y
2205    RET
2206
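; Combine the a/b coefficient planes with the source to produce the
; intermediate buffer t for the 5x5 filter. Coefficient rows are filtered
; horizontally with weights 5,6,5 (pw_5_6); of each output row pair, the
; first uses the sum of the two surrounding coefficient rows, roughly
;   t = (a * src + b + rounding) >> 9,
; and the second uses a single coefficient row with a >> 8 instead.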
2207%if ARCH_X86_64
2208cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \
2209                                       tmp_base, src_base, a_base, b_base, x, y
2210    movifnidn     wd, wm
2211    mov           hd, hm
2212    mov    tmp_baseq, tq
2213    mov    src_baseq, srcq
2214    mov      a_baseq, aq
2215    mov      b_baseq, bq
2216    mova          m9, [pw_5_6]
2217    mova         m12, [pw_256]
2218    psrlw        m10, m12, 8                    ; pw_1
2219    psrlw        m11, m12, 1                    ; pw_128
2220    pxor         m13, m13
2221%else
2222cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y
2223 %define tmp_baseq  r0m
2224 %define src_baseq  r1m
2225 %define a_baseq    r3m
2226 %define b_baseq    r4m
2227 %define wd         r5m
2228 %define hd         r6m
2229
2230    SUB          esp, 8
2231    SETUP_PIC yd
2232
2233 %define m8     m5
2234 %define m9     [PIC_sym(pw_5_6)]
2235 %define m10    [PIC_sym(pw_1)]
2236 %define m11    [PIC_sym(pw_128)]
2237 %define m12    [PIC_sym(pw_256)]
2238 %define m13    m0
2239%endif
2240    xor           xd, xd
2241.loop_x:
2242    mov           tq, tmp_baseq
2243    mov         srcq, src_baseq
2244    mov           aq, a_baseq
2245    mov           bq, b_baseq
2246    movu          m0, [aq+xq*4-(384+16)*4-4]
2247    mova          m1, [aq+xq*4-(384+16)*4]
2248    movu          m2, [aq+xq*4-(384+16)*4+4]
2249    movu          m3, [aq+xq*4-(384+16)*4-4+16]
2250    mova          m4, [aq+xq*4-(384+16)*4+16]
2251    movu          m5, [aq+xq*4-(384+16)*4+4+16]
2252    paddd         m0, m2
2253    paddd         m3, m5
2254    paddd         m0, m1
2255    paddd         m3, m4
2256    pslld         m2, m0, 2
2257    pslld         m5, m3, 2
2258    paddd         m2, m0
2259    paddd         m5, m3
2260    paddd         m0, m2, m1                    ; prev_odd_b [first half]
2261    paddd         m1, m5, m4                    ; prev_odd_b [second half]
2262    movu          m3, [bq+xq*2-(384+16)*2-2]
2263    mova          m4, [bq+xq*2-(384+16)*2]
2264    movu          m5, [bq+xq*2-(384+16)*2+2]
2265    paddw         m3, m5
2266    punpcklwd     m5, m3, m4
2267    punpckhwd     m3, m4
2268    pmaddwd       m5, m9
2269    pmaddwd       m3, m9
2270    mova          m2, m5
2271    packssdw      m2, m3                        ; prev_odd_a
2272    lea           tq, [tq+xq*2]
2273    lea         srcq, [srcq+xq*1]
2274    lea           aq, [aq+xq*4+(384+16)*4]
2275    lea           bq, [bq+xq*2+(384+16)*2]
2276%if ARCH_X86_32
2277    mov        [esp], PIC_reg
2278%endif
2279    mov           yd, hd
2280    XCHG_PIC_REG
2281.loop_y:
2282    movu          m3, [aq-4]
2283    mova          m4, [aq]
2284    movu          m5, [aq+4]
2285    paddd         m3, m5
2286    paddd         m3, m4
2287    pslld         m5, m3, 2
2288    paddd         m5, m3
2289    paddd         m5, m4                        ; cur_odd_b [first half]
2290    movu          m3, [aq+16-4]
2291    mova          m6, [aq+16]
2292    movu          m7, [aq+16+4]
2293    paddd         m3, m7
2294    paddd         m3, m6
2295    pslld         m7, m3, 2
2296    paddd         m7, m3
2297    paddd         m4, m7, m6                    ; cur_odd_b [second half]
2298    movu          m3, [bq-2]
2299    mova          m6, [bq]
2300    movu          m7, [bq+2]
2301    paddw         m3, m7
2302    punpcklwd     m7, m3, m6
2303    punpckhwd     m3, m6
2304    pmaddwd       m7, m9
2305    pmaddwd       m3, m9
2306    packssdw      m6, m7, m3                    ; cur_odd_a
2307
2308    paddd         m0, m5                        ; cur_even_b [first half]
2309    paddd         m1, m4                        ; cur_even_b [second half]
2310    paddw         m2, m6                        ; cur_even_a
2311
2312    movq          m3, [srcq]
2313%if ARCH_X86_64
2314    punpcklbw     m3, m13
2315%else
    mova        [td], m5                      ; spill cur_odd_b [first half];
                                              ; reloaded below before this row
                                              ; of t is overwritten
2317    pxor          m7, m7
2318    punpcklbw     m3, m7
2319%endif
2320    punpcklwd     m7, m3, m10
2321    punpckhwd     m3, m10
2322    punpcklwd     m8, m2, m12
2323    punpckhwd     m2, m12
2324    pmaddwd       m7, m8
2325    pmaddwd       m3, m2
2326    paddd         m7, m0
2327    paddd         m3, m1
2328    psrad         m7, 9
2329    psrad         m3, 9
2330
2331%if ARCH_X86_32
2332    pxor         m13, m13
2333%endif
2334    movq          m8, [srcq+strideq]
2335    punpcklbw     m8, m13
2336    punpcklwd     m0, m8, m10
2337    punpckhwd     m8, m10
2338    punpcklwd     m1, m6, m11
2339    punpckhwd     m2, m6, m11
2340    pmaddwd       m0, m1
2341    pmaddwd       m8, m2
2342%if ARCH_X86_64
2343    paddd         m0, m5
2344%else
2345    paddd         m0, [td]
2346%endif
2347    paddd         m8, m4
2348    psrad         m0, 8
2349    psrad         m8, 8
2350
2351    packssdw      m7, m3
2352    packssdw      m0, m8
2353%if ARCH_X86_32
2354    mova          m5, [td]
2355%endif
2356    mova [tq+384*2*0], m7
2357    mova [tq+384*2*1], m0
2358
2359    mova          m0, m5
2360    mova          m1, m4
2361    mova          m2, m6
2362    add           aq, (384+16)*4*2
2363    add           bq, (384+16)*2*2
2364    add           tq, 384*2*2
2365    lea         srcq, [srcq+strideq*2]
2366%if ARCH_X86_64
2367    sub           yd, 2
2368%else
2369    sub dword [esp+4], 2
2370%endif
2371    jg .loop_y
2372    add           xd, 8
2373    cmp           xd, wd
2374    jl .loop_x
2375%if ARCH_X86_32
2376    ADD          esp, 8
2377%endif
2378    RET
2379
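; Blend the two SGR-filtered buffers t1/t2 into dst with a pair of weights.
; A rough summary of the arithmetic below:
;   dst' = clip(dst + ((t1 - (dst << 4)) * wt0
;                    + (t2 - (dst << 4)) * wt1 + 1024) >> 11)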
2380%undef t2
2381cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt
2382    movifnidn     wd, wm
2383    movd          m0, wtm
2384%if ARCH_X86_64
2385    movifnidn     hd, hm
2386    mova         m10, [pd_1024]
2387    pxor         m11, m11
2388%else
2389    SETUP_PIC     hd, 0
2390 %define m10    [PIC_sym(pd_1024)]
2391 %define m11    m7
2392%endif
2393    pshufd        m0, m0, 0
2394    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
2395%if ARCH_X86_32
2396 %define hd     hmp
2397%endif
2398
2399.loop_y:
2400    xor         idxd, idxd
2401.loop_x:
2402    mova          m1, [t1q+idxq*2+ 0]
2403    mova          m2, [t1q+idxq*2+16]
2404    mova          m3, [t2q+idxq*2+ 0]
2405    mova          m4, [t2q+idxq*2+16]
2406    mova          m6, [dstq+idxq]
2407%if ARCH_X86_32
2408    pxor          m11, m11
2409%endif
2410    punpcklbw     m5, m6, m11
2411    punpckhbw     m6, m11
2412    psllw         m7, m5, 4
2413    psubw         m1, m7
2414    psubw         m3, m7
2415    psllw         m7, m6, 4
2416    psubw         m2, m7
2417    psubw         m4, m7
2418    punpcklwd     m7, m1, m3
2419    punpckhwd     m1, m3
2420    punpcklwd     m3, m2, m4
2421    punpckhwd     m2, m4
2422    pmaddwd       m7, m0
2423    pmaddwd       m1, m0
2424    pmaddwd       m3, m0
2425    pmaddwd       m2, m0
2426    paddd         m7, m10
2427    paddd         m1, m10
2428    paddd         m3, m10
2429    paddd         m2, m10
2430    psrad         m7, 11
2431    psrad         m1, 11
2432    psrad         m3, 11
2433    psrad         m2, 11
2434    packssdw      m7, m1
2435    packssdw      m3, m2
2436    paddw         m7, m5
2437    paddw         m3, m6
2438    packuswb      m7, m3
2439    mova [dstq+idxq], m7
2440    add         idxd, 16
2441    cmp         idxd, wd
2442    jl .loop_x
2443    add         dstq, strideq
2444    add          t1q, 384 * 2
2445    add          t2q, 384 * 2
2446    dec           hd
2447    jg .loop_y
2448    RET
2449