1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
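; SSSE3 implementations of the 16 bpc loop-restoration filters: the 7- and
; 5-tap Wiener filters and the 5x5/3x3 box self-guided (SGR) filters, with
; shared code paths for x86-32 and x86-64.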
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29SECTION_RODATA
30
31wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
32wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
33wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
34wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
35wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
36wiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
37wiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
38sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
39sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
40pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
41
42pb_m14_m13:    times 8 db -14,-13
43pb_m10_m9:     times 8 db -10, -9
44pb_m6_m5:      times 8 db  -6, -5
45pb_m2_m1:      times 8 db  -2, -1
46pb_2_3:        times 8 db   2,  3
47pb_6_7:        times 8 db   6,  7
48pw_25:         times 8 dw 25
49pw_256:        times 8 dw 256
50pw_1023:       times 8 dw 1023
51pd_8:          times 4 dd 8
52pd_4096:       times 4 dd 4096
53pd_34816:      times 4 dd 34816
54pd_m262128:    times 4 dd -262128
55pd_0xffff:     times 4 dd 0xffff
56pd_0xf00800a4: times 4 dd 0xf00800a4
57pd_0xf00801c7: times 4 dd 0xf00801c7
58
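; per-bitdepth scaling constants, indexed below by pixel_max >> 11
; (0 = 10-bit, 1 = 12-bit): 2048 = 1 << 11, 8192 = 1 << 13,
; 1049600 = (1 << 20) + (1 << 10), 1048832 = (1 << 20) + (1 << 8)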
59wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
60wiener_round:  dd 1049600, 1048832
61
62cextern sgr_x_by_x
63
64SECTION .text
65
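; movif64/movif32 emit their mov only when assembling for the named
; architecture, so code shared between x86-32 and x86-64 can set up
; registers conditionally without an %if block at every call site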
66%macro movif64 2 ; dst, src
67 %if ARCH_X86_64
68    mov             %1, %2
69 %endif
70%endmacro
71
72%macro movif32 2 ; dst, src
73 %if ARCH_X86_32
74    mov             %1, %2
75 %endif
76%endmacro
77
78INIT_XMM ssse3
79%if ARCH_X86_32
80DECLARE_REG_TMP 4, 6
81 %if STACK_ALIGNMENT < 16
82  %assign extra_stack 14*16
83 %else
84  %assign extra_stack 12*16
85 %endif
86cglobal wiener_filter7_16bpc, 5, 7, 8, -384*12-16-extra_stack, \
87                              dst, dst_stride, left, lpf, lpf_stride, w, flt
88 %if STACK_ALIGNMENT < 16
89  %define lpfm        dword [esp+calloff+16*12+ 0]
90  %define lpf_stridem dword [esp+calloff+16*12+ 4]
91  %define wm          dword [esp+calloff+16*12+ 8]
92  %define hd          dword [esp+calloff+16*12+12]
93  %define edgeb        byte [esp+calloff+16*12+16]
94  %define edged       dword [esp+calloff+16*12+16]
95 %else
96  %define hd dword r6m
97  %define edgeb byte r8m
98 %endif
99 %define PICmem dword [esp+calloff+4*0]
100 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
101 %define t1m    dword [esp+calloff+4*2]
102 %define t2m    dword [esp+calloff+4*3]
103 %define t3m    dword [esp+calloff+4*4]
104 %define t4m    dword [esp+calloff+4*5]
105 %define t5m    dword [esp+calloff+4*6]
106 %define t6m    dword [esp+calloff+4*7]
107 %define t2 t2m
108 %define t3 t3m
109 %define t4 t4m
110 %define t5 t5m
111 %define t6 t6m
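 ; x86-32 has only 8 XMM registers, so m8-m15 are backed by stack slots here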
112 %define  m8 [esp+calloff+16*2]
113 %define  m9 [esp+calloff+16*3]
114 %define m10 [esp+calloff+16*4]
115 %define m11 [esp+calloff+16*5]
116 %define m12 [esp+calloff+16*6]
117 %define m13 [esp+calloff+16*7]
118 %define m14 [esp+calloff+16*8]
119 %define m15 [esp+calloff+16*9]
120 %define r10 r5
121 %define base t0-wiener_shifts
122 %assign calloff 0
123 %if STACK_ALIGNMENT < 16
124    mov             wd, [rstk+stack_offset+24]
125    mov    lpf_stridem, lpf_strideq
126    mov             wm, wd
127    mov             r4, [rstk+stack_offset+28]
128    mov             hd, r4
129    mov             r4, [rstk+stack_offset+36]
130    mov          edged, r4 ; edge
131 %endif
132%else
133DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
134cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
135                                                     lpf_stride, w, edge, flt, h
136 %define base
137%endif
138%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
139    movifnidn       wd, wm
140%endif
141%if ARCH_X86_64
142    mov           fltq, fltmp
143    mov          edged, r8m
144    mov             hd, r6m
145    mov            t3d, r9m ; pixel_max
146    movq           m13, [fltq]
147    movq           m15, [fltq+16]
148%else
149 %if STACK_ALIGNMENT < 16
150    mov             t0, [rstk+stack_offset+32]
151    mov             t1, [rstk+stack_offset+40] ; pixel_max
152    movq            m1, [t0]    ; fx
153    movq            m3, [t0+16] ; fy
154    LEA             t0, wiener_shifts
155 %else
156    LEA             t0, wiener_shifts
157    mov           fltq, r7m
158    movq            m1, [fltq]
159    movq            m3, [fltq+16]
160    mov             t1, r9m ; pixel_max
161 %endif
162    mov         PICmem, t0
163%endif
164    mova            m6, [base+wiener_shufA]
165    mova            m7, [base+wiener_shufB]
166%if ARCH_X86_64
167    lea             t4, [wiener_shifts]
168    add             wd, wd
169    pshufd         m12, m13, q0000 ; x0 x1
170    pshufd         m13, m13, q1111 ; x2 x3
171    pshufd         m14, m15, q0000 ; y0 y1
172    pshufd         m15, m15, q1111 ; y2 y3
173    mova            m8, [wiener_shufC]
174    mova            m9, [wiener_shufD]
175    add           lpfq, wq
176    lea             t1, [rsp+wq+16]
177    add           dstq, wq
178    neg             wq
179    shr            t3d, 11
180 %define base t4-wiener_shifts
181    movd           m10, [base+wiener_round+t3*4]
182    movq           m11, [base+wiener_shifts+t3*8]
183    pshufd         m10, m10, q0000
184    pshufd          m0, m11, q0000
185    pshufd         m11, m11, q1111
186    pmullw         m12, m0 ; upshift filter coefs to make the
187    pmullw         m13, m0 ; horizontal downshift constant
188 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
189 %define lpfm        [rsp+0]
190 %define lpf_stridem [rsp+8]
191 %define base
192 %define wiener_lshuf7_mem [wiener_lshuf7]
193 %define pd_m262128_mem [pd_m262128]
194%else
195    add             wd, wd
196    mova            m4, [base+wiener_shufC]
197    mova            m5, [base+wiener_shufD]
198    pshufd          m0, m1, q0000
199    pshufd          m1, m1, q1111
200    pshufd          m2, m3, q0000
201    pshufd          m3, m3, q1111
202    mova            m8, m4
203    mova            m9, m5
204    mova           m14, m2
205    mova           m15, m3
206    shr             t1, 11
207    add           lpfq, wq
208    mova            m3, [base+pd_m262128]
209    movd            m4, [base+wiener_round+t1*4]
210    movq            m5, [base+wiener_shifts+t1*8]
211    lea             t1, [esp+extra_stack+wq+16]
212    add           dstq, wq
213    neg             wq
214    pshufd          m4, m4, q0000
215    pshufd          m2, m5, q0000
216    pshufd          m5, m5, q1111
217    mov             wm, wq
218    pmullw          m0, m2
219    pmullw          m1, m2
220    mova            m2, [base+wiener_lshuf7]
221 %define pd_m262128_mem [esp+calloff+16*10]
222    mova pd_m262128_mem, m3
223    mova           m10, m4
224    mova           m11, m5
225    mova           m12, m0
226    mova           m13, m1
227 %define wiener_lshuf7_mem [esp+calloff+16*11]
228    mova wiener_lshuf7_mem, m2
229%endif
230    test         edgeb, 4 ; LR_HAVE_TOP
231    jz .no_top
232    call .h_top
233%if ARCH_X86_64
234    add           lpfq, lpf_strideq
235%else
236    add           lpfq, lpf_stridem
237%endif
238    mov             t6, t1
239    mov             t5, t1
240    add             t1, 384*2
241    call .h_top
242    movif32 lpf_strideq, lpf_stridem
243    lea            r10, [lpfq+lpf_strideq*4]
244    mov           lpfq, dstq
245    mov             t4, t1
246    add             t1, 384*2
247    movif64 lpf_stridem, lpf_strideq
248    add            r10, lpf_strideq
249    mov           lpfm, r10 ; below
250    call .h
251    mov             t3, t1
252    mov             t2, t1
253    dec             hd
254    jz .v1
255    add           lpfq, dst_strideq
256    add             t1, 384*2
257    call .h
258    mov             t2, t1
259    dec             hd
260    jz .v2
261    add           lpfq, dst_strideq
262    add             t1, 384*2
263    call .h
264    dec             hd
265    jz .v3
266.main:
267    lea             t0, [t1+384*2]
268.main_loop:
269    call .hv
270    dec             hd
271    jnz .main_loop
272    test         edgeb, 8 ; LR_HAVE_BOTTOM
273    jz .v3
274    mov           lpfq, lpfm
275    call .hv_bottom
276    add           lpfq, lpf_stridem
277    call .hv_bottom
278.v1:
279    call .v
280    RET
281.no_top:
282    movif32 lpf_strideq, lpf_stridem
283    lea            r10, [lpfq+lpf_strideq*4]
284    mov           lpfq, dstq
285    movif64 lpf_stridem, lpf_strideq
286    lea            r10, [r10+lpf_strideq*2]
287    mov           lpfm, r10
288    call .h
289    mov             t6, t1
290    mov             t5, t1
291    mov             t4, t1
292    mov             t3, t1
293    mov             t2, t1
294    dec             hd
295    jz .v1
296    add           lpfq, dst_strideq
297    add             t1, 384*2
298    call .h
299    mov             t2, t1
300    dec             hd
301    jz .v2
302    add           lpfq, dst_strideq
303    add             t1, 384*2
304    call .h
305    dec             hd
306    jz .v3
307    lea             t0, [t1+384*2]
308    call .hv
309    dec             hd
310    jz .v3
311    add             t0, 384*8
312    call .hv
313    dec             hd
314    jnz .main
315.v3:
316    call .v
317    movif32         wq, wm
318.v2:
319    call .v
320    movif32         wq, wm
321    jmp .v1
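; pad the rightmost block when LR_HAVE_RIGHT is unset: broadcast the negative
; remaining width, turn it into pshufb indices clamped against pb_0to15, and
; shuffle m3/m4/m5 so lanes past the frame edge repeat the last valid pixel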
322.extend_right:
323%assign stack_offset stack_offset+8
324%assign calloff 8
325    movif32         t0, PICmem
326    pxor            m0, m0
327    movd            m1, wd
328    mova            m2, [base+pb_0to15]
329    pshufb          m1, m0
330    mova            m0, [base+pb_6_7]
331    psubb           m0, m1
332    pminub          m0, m2
333    pshufb          m3, m0
334    mova            m0, [base+pb_m2_m1]
335    psubb           m0, m1
336    pminub          m0, m2
337    pshufb          m4, m0
338    mova            m0, [base+pb_m10_m9]
339    psubb           m0, m1
340    pminub          m0, m2
341    pshufb          m5, m0
342    movif32         t0, t0m
343    ret
344%assign stack_offset stack_offset-4
345%assign calloff 4
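; horizontal 7-tap pass: the shuffles pair up neighbouring pixels so each
; pmaddwd applies two filter taps at once; the sums are biased by
; (1 << 4) - (1 << 18), shifted down and packed to signed words for the
; ring buffer rows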
346.h:
347    movif64         wq, r5
348    movif32         wq, wm
349    test         edgeb, 1 ; LR_HAVE_LEFT
350    jz .h_extend_left
351    movq            m3, [leftq]
352    movhps          m3, [lpfq+wq]
353    add          leftq, 8
354    jmp .h_main
355.h_extend_left:
356    mova            m3, [lpfq+wq]         ; avoid accessing memory located
357    pshufb          m3, wiener_lshuf7_mem ; before the start of the buffer
358    jmp .h_main
359.h_top:
360    movif64         wq, r5
361    test         edgeb, 1 ; LR_HAVE_LEFT
362    jz .h_extend_left
363.h_loop:
364    movu            m3, [lpfq+wq-8]
365.h_main:
366    mova            m4, [lpfq+wq+0]
367    movu            m5, [lpfq+wq+8]
368    test         edgeb, 2 ; LR_HAVE_RIGHT
369    jnz .h_have_right
370    cmp             wd, -18
371    jl .h_have_right
372    call .extend_right
373.h_have_right:
374    pshufb          m0, m3, m6
375    pshufb          m1, m4, m7
376    paddw           m0, m1
377    pshufb          m3, m8
378    pmaddwd         m0, m12
379    pshufb          m1, m4, m9
380    paddw           m3, m1
381    pshufb          m1, m4, m6
382    pmaddwd         m3, m13
383    pshufb          m2, m5, m7
384    paddw           m1, m2
385    mova            m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
386    pshufb          m4, m8
387    pmaddwd         m1, m12
388    pshufb          m5, m9
389    paddw           m4, m5
390    pmaddwd         m4, m13
391    paddd           m0, m2
392    paddd           m1, m2
393    paddd           m0, m3
394    paddd           m1, m4
395    psrad           m0, 4
396    psrad           m1, 4
397    packssdw        m0, m1
398    psraw           m0, 1
399    mova       [t1+wq], m0
400    add             wq, 16
401    jl .h_loop
402    movif32         wq, wm
403    ret
404ALIGN function_align
405.hv:
406    add           lpfq, dst_strideq
407    movif64         wq, r5
408    movif32        t0m, t0
409    movif32        t1m, t1
410    test         edgeb, 1 ; LR_HAVE_LEFT
411    jz .hv_extend_left
412    movq            m3, [leftq]
413    movhps          m3, [lpfq+wq]
414    add          leftq, 8
415    jmp .hv_main
416.hv_extend_left:
417    mova            m3, [lpfq+wq]
418    pshufb          m3, wiener_lshuf7_mem
419    jmp .hv_main
420.hv_bottom:
421    movif64         wq, r5
422    movif32        t0m, t0
423    movif32        t1m, t1
424    test         edgeb, 1 ; LR_HAVE_LEFT
425    jz .hv_extend_left
426.hv_loop:
427    movu            m3, [lpfq+wq-8]
428.hv_main:
429    mova            m4, [lpfq+wq+0]
430    movu            m5, [lpfq+wq+8]
431    test         edgeb, 2 ; LR_HAVE_RIGHT
432    jnz .hv_have_right
433    cmp             wd, -18
434    jl .hv_have_right
435    call .extend_right
436.hv_have_right:
437    movif32         t1, t4m
438    movif32         t0, t2m
439    pshufb          m0, m3, m6
440    pshufb          m1, m4, m7
441    paddw           m0, m1
442    pshufb          m3, m8
443    pmaddwd         m0, m12
444    pshufb          m1, m4, m9
445    paddw           m3, m1
446    pshufb          m1, m4, m6
447    pmaddwd         m3, m13
448    pshufb          m2, m5, m7
449    paddw           m1, m2
450    mova            m2, pd_m262128_mem
451    pshufb          m4, m8
452    pmaddwd         m1, m12
453    pshufb          m5, m9
454    paddw           m4, m5
455    pmaddwd         m4, m13
456    paddd           m0, m2
457    paddd           m1, m2
458%if ARCH_X86_64
459    mova            m2, [t4+wq]
460    paddw           m2, [t2+wq]
461    mova            m5, [t3+wq]
462%else
463    mova            m2, [t1+wq]
464    paddw           m2, [t0+wq]
465    mov             t1, t3m
466    mov             t0, t5m
467    mova            m5, [t1+wq]
468    mov             t1, t1m
469%endif
470    paddd           m0, m3
471    paddd           m1, m4
472    psrad           m0, 4
473    psrad           m1, 4
474    packssdw        m0, m1
475%if ARCH_X86_64
476    mova            m4, [t5+wq]
477    paddw           m4, [t1+wq]
478    psraw           m0, 1
479    paddw           m3, m0, [t6+wq]
480%else
481    mova            m4, [t0+wq]
482    paddw           m4, [t1+wq]
483    mov             t0, t0m
484    mov             t1, t6m
485    psraw           m0, 1
486    paddw           m3, m0, [t1+wq]
487%endif
488    mova       [t0+wq], m0
489    punpcklwd       m0, m2, m5
490    pmaddwd         m0, m15
491    punpckhwd       m2, m5
492    pmaddwd         m2, m15
493    punpcklwd       m1, m3, m4
494    pmaddwd         m1, m14
495    punpckhwd       m3, m4
496    pmaddwd         m3, m14
497    paddd           m0, m10
498    paddd           m2, m10
499    paddd           m0, m1
500    paddd           m2, m3
501    psrad           m0, 6
502    psrad           m2, 6
503    packssdw        m0, m2
504    pmulhw          m0, m11
505    pxor            m1, m1
506    pmaxsw          m0, m1
507    mova     [dstq+wq], m0
508    add             wq, 16
509    jl .hv_loop
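    ; rotate the ring buffer of row pointers down one row for the next
    ; scanline (t0, the next output row, receives the storage now backing t6)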
510%if ARCH_X86_64
511    mov             t6, t5
512    mov             t5, t4
513    mov             t4, t3
514    mov             t3, t2
515    mov             t2, t1
516    mov             t1, t0
517    mov             t0, t6
518%else
519    mov             r5, t5m
520    mov             t1, t4m
521    mov            t6m, r5
522    mov            t5m, t1
523    mov             r5, t3m
524    mov             t1, t2m
525    mov            t4m, r5
526    mov            t3m, t1
527    mov             r5, t1m
528    mov             t1, t0
529    mov            t2m, r5
530    mov             t0, t6m
531    mov             wq, wm
532%endif
533    add           dstq, dst_strideq
534    ret
535.v:
536    movif64         wq, r5
537    movif32        t0m, t0
538    movif32        t1m, t1
539.v_loop:
540%if ARCH_X86_64
541    mova            m1, [t4+wq]
542    paddw           m1, [t2+wq]
543    mova            m2, [t3+wq]
544    mova            m4, [t1+wq]
545    paddw           m3, m4, [t6+wq]
546    paddw           m4, [t5+wq]
547%else
548    mov             t0, t4m
549    mov             t1, t2m
550    mova            m1, [t0+wq]
551    paddw           m1, [t1+wq]
552    mov             t0, t3m
553    mov             t1, t1m
554    mova            m2, [t0+wq]
555    mova            m4, [t1+wq]
556    mov             t0, t6m
557    mov             t1, t5m
558    paddw           m3, m4, [t0+wq]
559    paddw           m4, [t1+wq]
560%endif
561    punpcklwd       m0, m1, m2
562    pmaddwd         m0, m15
563    punpckhwd       m1, m2
564    pmaddwd         m1, m15
565    punpcklwd       m2, m3, m4
566    pmaddwd         m2, m14
567    punpckhwd       m3, m4
568    pmaddwd         m3, m14
569    paddd           m0, m10
570    paddd           m1, m10
571    paddd           m0, m2
572    paddd           m1, m3
573    psrad           m0, 6
574    psrad           m1, 6
575    packssdw        m0, m1
576    pmulhw          m0, m11
577    pxor            m1, m1
578    pmaxsw          m0, m1
579    mova     [dstq+wq], m0
580    add             wq, 16
581    jl .v_loop
582%if ARCH_X86_64
583    mov             t6, t5
584    mov             t5, t4
585    mov             t4, t3
586    mov             t3, t2
587    mov             t2, t1
588%else
589    mov             t0, t5m
590    mov             t1, t4m
591    mov             r5, t3m
592    mov            t6m, t0
593    mov            t5m, t1
594    mov            t4m, r5
595    mov             r5, t2m
596    mov             t1, t1m
597    mov             t0, t0m
598    mov            t3m, r5
599    mov            t2m, t1
600%endif
601    add           dstq, dst_strideq
602    ret
603
604%if ARCH_X86_32
605 %if STACK_ALIGNMENT < 16
606  %assign stack_size 12*16+384*8
607 %else
608  %assign stack_size 11*16+384*8
609 %endif
610cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \
611                                                    lpf, lpf_stride, w, flt
612 %if STACK_ALIGNMENT < 16
613  %define lpfm        dword [esp+calloff+4*6]
614  %define lpf_stridem dword [esp+calloff+4*7]
615  %define wm          dword [esp+calloff+16*10+0]
616  %define hd          dword [esp+calloff+16*10+4]
617  %define edgeb        byte [esp+calloff+16*10+8]
618  %define edged       dword [esp+calloff+16*10+8]
619 %else
620  %define hd dword r6m
621  %define edgeb byte r8m
622 %endif
623 %define PICmem dword [esp+calloff+4*0]
624 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
625 %define t1m    dword [esp+calloff+4*2]
626 %define t2m    dword [esp+calloff+4*3]
627 %define t3m    dword [esp+calloff+4*4]
628 %define t4m    dword [esp+calloff+4*5]
629 %define t2 t2m
630 %define t3 t3m
631 %define t4 t4m
632 %define  m8 [esp+calloff+16*2]
633 %define  m9 [esp+calloff+16*3]
634 %define m10 [esp+calloff+16*4]
635 %define m11 [esp+calloff+16*5]
636 %define m12 [esp+calloff+16*6]
637 %define m13 [esp+calloff+16*7]
638 %define m14 [esp+calloff+16*8]
639 %define m15 [esp+calloff+16*9]
640 %define base t0-wiener_shifts
641 %assign calloff 0
642 %if STACK_ALIGNMENT < 16
643    mov             wd, [rstk+stack_offset+24]
644    mov    lpf_stridem, lpf_strideq
645    mov             wm, wd
646    mov             r4, [rstk+stack_offset+28]
647    mov             hd, r4
648    mov             r4, [rstk+stack_offset+36]
649    mov          edged, r4 ; edge
650 %endif
651%else
652cglobal wiener_filter5_16bpc, 5, 14, 16, 384*8+16, dst, dst_stride, left, lpf, \
653                                                   lpf_stride, w, edge, flt, h
654 %define base
655%endif
656%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
657    movifnidn       wd, wm
658%endif
659%if ARCH_X86_64
660    mov           fltq, fltmp
661    mov          edged, r8m
662    mov             hd, r6m
663    mov            t3d, r9m ; pixel_max
664    movq           m12, [fltq]
665    movq           m14, [fltq+16]
666%else
667 %if STACK_ALIGNMENT < 16
668    mov             t0, [rstk+stack_offset+32]
669    mov             t1, [rstk+stack_offset+40] ; pixel_max
670    movq            m1, [t0]    ; fx
671    movq            m3, [t0+16] ; fy
672    LEA             t0, wiener_shifts
673 %else
674    LEA             t0, wiener_shifts
675    mov           fltq, r7m
676    movq            m1, [fltq]
677    movq            m3, [fltq+16]
678    mov             t1, r9m ; pixel_max
679 %endif
680    mov         PICmem, t0
681%endif
682    mova            m5, [base+wiener_shufE]
683    mova            m6, [base+wiener_shufB]
684    mova            m7, [base+wiener_shufD]
685%if ARCH_X86_64
686    lea             t4, [wiener_shifts]
687    add             wd, wd
688    punpcklwd      m11, m12, m12
689    pshufd         m11, m11, q1111 ; x1
690    pshufd         m12, m12, q1111 ; x2 x3
691    punpcklwd      m13, m14, m14
692    pshufd         m13, m13, q1111 ; y1
693    pshufd         m14, m14, q1111 ; y2 y3
694    shr            t3d, 11
695    mova            m8, [pd_m262128] ; (1 << 4) - (1 << 18)
696    add           lpfq, wq
697    lea             t1, [rsp+wq+16]
698    add           dstq, wq
699    neg             wq
700 %define base t4-wiener_shifts
701    movd            m9, [base+wiener_round+t3*4]
702    movq           m10, [base+wiener_shifts+t3*8]
703    pshufd          m9, m9, q0000
704    pshufd          m0, m10, q0000
705    pshufd         m10, m10, q1111
706    mova           m15, [wiener_lshuf5]
707    pmullw         m11, m0
708    pmullw         m12, m0
709 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
710 %define lpfm        [rsp+0]
711 %define lpf_stridem [rsp+8]
712 %define base
713%else
714    add             wd, wd
715    punpcklwd       m0, m1, m1
716    pshufd          m0, m0, q1111 ; x1
717    pshufd          m1, m1, q1111 ; x2 x3
718    punpcklwd       m2, m3, m3
719    pshufd          m2, m2, q1111 ; y1
720    pshufd          m3, m3, q1111 ; y2 y3
721    mova            m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
722    mova           m13, m2
723    mova           m14, m3
724    mova            m8, m4
725    shr             t1, 11
726    add           lpfq, wq
727    movd            m2, [base+wiener_round+t1*4]
728    movq            m3, [base+wiener_shifts+t1*8]
729 %if STACK_ALIGNMENT < 16
730    lea             t1, [esp+16*11+wq+16]
731 %else
732    lea             t1, [esp+16*10+wq+16]
733 %endif
734    add           dstq, wq
735    neg             wq
736    pshufd          m2, m2, q0000
737    pshufd          m4, m3, q0000
738    pshufd          m3, m3, q1111
739    mov             wm, wq
740    pmullw          m0, m4
741    pmullw          m1, m4
742    mova            m4, [base+wiener_lshuf5]
743    mova            m9, m2
744    mova           m10, m3
745    mova           m11, m0
746    mova           m12, m1
747    mova           m15, m4
748%endif
749    test         edgeb, 4 ; LR_HAVE_TOP
750    jz .no_top
751    call .h_top
752%if ARCH_X86_64
753    add           lpfq, lpf_strideq
754%else
755    add           lpfq, lpf_stridem
756%endif
757    mov             t4, t1
758    add             t1, 384*2
759    call .h_top
760    movif32 lpf_strideq, lpf_stridem
761    lea            r10, [lpfq+lpf_strideq*4]
762    mov           lpfq, dstq
763    mov             t3, t1
764    add             t1, 384*2
765    movif64 lpf_stridem, lpf_strideq
766    add            r10, lpf_strideq
767    mov           lpfm, r10 ; below
768    call .h
769    mov             t2, t1
770    dec             hd
771    jz .v1
772    add           lpfq, dst_strideq
773    add             t1, 384*2
774    call .h
775    dec             hd
776    jz .v2
777.main:
778    mov             t0, t4
779.main_loop:
780    call .hv
781    dec             hd
782    jnz .main_loop
783    test         edgeb, 8 ; LR_HAVE_BOTTOM
784    jz .v2
785    mov           lpfq, lpfm
786    call .hv_bottom
787    add           lpfq, lpf_stridem
788    call .hv_bottom
789.end:
790    RET
791.no_top:
792    movif32 lpf_strideq, lpf_stridem
793    lea            r10, [lpfq+lpf_strideq*4]
794    mov           lpfq, dstq
795    movif64 lpf_stridem, lpf_strideq
796    lea            r10, [r10+lpf_strideq*2]
797    mov           lpfm, r10
798    call .h
799    mov             t4, t1
800    mov             t3, t1
801    mov             t2, t1
802    dec             hd
803    jz .v1
804    add           lpfq, dst_strideq
805    add             t1, 384*2
806    call .h
807    dec             hd
808    jz .v2
809    lea             t0, [t1+384*2]
810    call .hv
811    dec             hd
812    jz .v2
813    add             t0, 384*6
814    call .hv
815    dec             hd
816    jnz .main
817.v2:
818    call .v
819%if ARCH_X86_64
820    mov             t4, t3
821    mov             t3, t2
822    mov             t2, t1
823%else
824    mov             t0, t3m
825    mov             r5, t2m
826    mov             t1, t1m
827    mov            t4m, t0
828    mov            t3m, r5
829    mov            t2m, t1
830    mov             wq, wm
831%endif
832    add           dstq, dst_strideq
833.v1:
834    call .v
835    jmp .end
836.extend_right:
837%assign stack_offset stack_offset+8
838%assign calloff 8
839    movif32         t0, PICmem
840    pxor            m1, m1
841    movd            m2, wd
842    mova            m0, [base+pb_2_3]
843    pshufb          m2, m1
844    mova            m1, [base+pb_m6_m5]
845    psubb           m0, m2
846    psubb           m1, m2
847    mova            m2, [base+pb_0to15]
848    pminub          m0, m2
849    pminub          m1, m2
850    pshufb          m3, m0
851    pshufb          m4, m1
852    ret
853%assign stack_offset stack_offset-4
854%assign calloff 4
855.h:
856    movif64         wq, r5
857    movif32         wq, wm
858    test         edgeb, 1 ; LR_HAVE_LEFT
859    jz .h_extend_left
860    mova            m4, [lpfq+wq]
861    movd            m3, [leftq+4]
862    pslldq          m4, 4
863    por             m3, m4
864    add          leftq, 8
865    jmp .h_main
866.h_extend_left:
867    mova            m3, [lpfq+wq] ; avoid accessing memory located
868    pshufb          m3, m15       ; before the start of the buffer
869    jmp .h_main
870.h_top:
871    movif64         wq, r5
872    movif32         wq, wm
873    test         edgeb, 1 ; LR_HAVE_LEFT
874    jz .h_extend_left
875.h_loop:
876    movu            m3, [lpfq+wq-4]
877.h_main:
878    movu            m4, [lpfq+wq+4]
879    test         edgeb, 2 ; LR_HAVE_RIGHT
880    jnz .h_have_right
881    cmp             wd, -18
882    jl .h_have_right
883    call .extend_right
884.h_have_right:
885    pshufb          m0, m3, m5
886    pmaddwd         m0, m11
887    pshufb          m1, m4, m5
888    pmaddwd         m1, m11
889    pshufb          m2, m3, m6
890    pshufb          m3, m7
891    paddw           m2, m3
892    pshufb          m3, m4, m6
893    pmaddwd         m2, m12
894    pshufb          m4, m7
895    paddw           m3, m4
896    pmaddwd         m3, m12
897    paddd           m0, m8
898    paddd           m1, m8
899    paddd           m0, m2
900    paddd           m1, m3
901    psrad           m0, 4
902    psrad           m1, 4
903    packssdw        m0, m1
904    psraw           m0, 1
905    mova       [t1+wq], m0
906    add             wq, 16
907    jl .h_loop
908    movif32         wq, wm
909    ret
910ALIGN function_align
911.hv:
912    add           lpfq, dst_strideq
913    movif64         wq, r5
914    movif32        t0m, t0
915    movif32        t1m, t1
916    test         edgeb, 1 ; LR_HAVE_LEFT
917    jz .hv_extend_left
918    mova            m4, [lpfq+wq]
919    movd            m3, [leftq+4]
920    pslldq          m4, 4
921    por             m3, m4
922    add          leftq, 8
923    jmp .hv_main
924.hv_extend_left:
925    mova            m3, [lpfq+wq]
926    pshufb          m3, m15
927    jmp .hv_main
928.hv_bottom:
929    movif64         wq, r5
930    movif32        t0m, t0
931    movif32        t1m, t1
932    test         edgeb, 1 ; LR_HAVE_LEFT
933    jz .hv_extend_left
934.hv_loop:
935    movu            m3, [lpfq+wq-4]
936.hv_main:
937    movu            m4, [lpfq+wq+4]
938    test         edgeb, 2 ; LR_HAVE_RIGHT
939    jnz .hv_have_right
940    cmp             wd, -18
941    jl .hv_have_right
942    call .extend_right
943.hv_have_right:
944    movif32         t1, t1m
945    movif32         t0, t3m
946    pshufb          m0, m3, m5
947    pmaddwd         m0, m11
948    pshufb          m1, m4, m5
949    pmaddwd         m1, m11
950    pshufb          m2, m3, m6
951    pshufb          m3, m7
952    paddw           m2, m3
953    pshufb          m3, m4, m6
954    pmaddwd         m2, m12
955    pshufb          m4, m7
956    paddw           m3, m4
957    pmaddwd         m3, m12
958    paddd           m0, m8
959    paddd           m1, m8
960    paddd           m0, m2
961%if ARCH_X86_64
962    mova            m2, [t3+wq]
963    paddw           m2, [t1+wq]
964    paddd           m1, m3
965    mova            m4, [t2+wq]
966%else
967    mova            m2, [t0+wq]
968    mov             t0, t2m
969    paddw           m2, [t1+wq]
970    mov             t1, t4m
971    paddd           m1, m3
972    mova            m4, [t0+wq]
973    mov             t0, t0m
974%endif
975    punpckhwd       m3, m2, m4
976    pmaddwd         m3, m14
977    punpcklwd       m2, m4
978%if ARCH_X86_64
979    mova            m4, [t4+wq]
980%else
981    mova            m4, [t1+wq]
982%endif
983    psrad           m0, 4
984    psrad           m1, 4
985    packssdw        m0, m1
986    pmaddwd         m2, m14
987    psraw           m0, 1
988    mova       [t0+wq], m0
989    punpckhwd       m1, m0, m4
990    pmaddwd         m1, m13
991    punpcklwd       m0, m4
992    pmaddwd         m0, m13
993    paddd           m3, m9
994    paddd           m2, m9
995    paddd           m1, m3
996    paddd           m0, m2
997    psrad           m1, 6
998    psrad           m0, 6
999    packssdw        m0, m1
1000    pmulhw          m0, m10
1001    pxor            m1, m1
1002    pmaxsw          m0, m1
1003    mova     [dstq+wq], m0
1004    add             wq, 16
1005    jl .hv_loop
1006%if ARCH_X86_64
1007    mov             t4, t3
1008    mov             t3, t2
1009    mov             t2, t1
1010    mov             t1, t0
1011    mov             t0, t4
1012%else
1013    mov             r5, t3m
1014    mov             t1, t2m
1015    mov            t4m, r5
1016    mov            t3m, t1
1017    mov             r5, t1m
1018    mov             t1, t0
1019    mov            t2m, r5
1020    mov             t0, t4m
1021    mov             wq, wm
1022%endif
1023    add           dstq, dst_strideq
1024    ret
1025.v:
1026    movif64         wq, r5
1027    movif32        t1m, t1
1028.v_loop:
1029%if ARCH_X86_64
1030    mova            m0, [t1+wq]
1031    paddw           m2, m0, [t3+wq]
1032    mova            m1, [t2+wq]
1033    mova            m4, [t4+wq]
1034%else
1035    mov             t0, t3m
1036    mova            m0, [t1+wq]
1037    mov             t1, t2m
1038    paddw           m2, m0, [t0+wq]
1039    mov             t0, t4m
1040    mova            m1, [t1+wq]
1041    mova            m4, [t0+wq]
1042%endif
1043    punpckhwd       m3, m2, m1
1044    pmaddwd         m3, m14
1045    punpcklwd       m2, m1
1046    pmaddwd         m2, m14
1047    punpckhwd       m1, m0, m4
1048    pmaddwd         m1, m13
1049    punpcklwd       m0, m4
1050    pmaddwd         m0, m13
1051    paddd           m3, m9
1052    paddd           m2, m9
1053    paddd           m1, m3
1054    paddd           m0, m2
1055    psrad           m1, 6
1056    psrad           m0, 6
1057    packssdw        m0, m1
1058    pmulhw          m0, m10
1059    pxor            m1, m1
1060    pmaxsw          m0, m1
1061    mova     [dstq+wq], m0
1062    add             wq, 16
1063%if ARCH_X86_64
1064    jl .v_loop
1065%else
1066    jge .v_end
1067    mov             t1, t1m
1068    jmp .v_loop
1069.v_end:
1070%endif
1071    ret
1072
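; GATHERDD emulates a per-lane gather from the sgr_x_by_x table (addressed
; via r13 on x86-64, PIC-relative on x86-32): each dword lane of src supplies
; a byte offset, and the corresponding table entries are inserted into dst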
1073%macro GATHERDD 3 ; dst, src, tmp
1074    movd           %3d, %2
1075 %if ARCH_X86_64
1076    movd            %1, [r13+%3]
1077    pextrw         %3d, %2, 2
1078    pinsrw          %1, [r13+%3+2], 3
1079    pextrw         %3d, %2, 4
1080    pinsrw          %1, [r13+%3+2], 5
1081    pextrw         %3d, %2, 6
1082    pinsrw          %1, [r13+%3+2], 7
1083 %else
1084    movd            %1, [base+sgr_x_by_x-0xf03+%3]
1085    pextrw          %3, %2, 2
1086    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 3
1087    pextrw          %3, %2, 4
1088    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 5
1089    pextrw          %3, %2, 6
1090    pinsrw          %1, [base+sgr_x_by_x-0xf03+%3+2], 7
1091 %endif
1092%endmacro
1093
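; GATHER_X_BY_X performs two such gathers, keeps only the high byte of each
; gathered dword (psrld 24) and packs the results to words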
1094%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
1095 %if ARCH_X86_64
1096  %define tmp r14
1097 %else
1098  %define tmp %4
1099 %endif
1100    GATHERDD        %1, %2, tmp
1101    GATHERDD        %2, %3, tmp
1102    movif32         %4, %5
1103    psrld           %1, 24
1104    psrld           %2, 24
1105    packssdw        %1, %2
1106%endmacro
1107
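; MAXSD is a pmaxsd substitute for pre-SSE4.1 targets: a signed dword compare
; selects the larger of dst and src, with an option to rezero tmp afterwards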
%macro MAXSD 3-4 0 ; dst, src, tmp, [restore_tmp]
1109    pcmpgtd         %3, %1, %2
1110    pand            %1, %3
1111    pandn           %3, %2
1112    por             %1, %3
1113 %if %4 == 1
1114    pxor            %3, %3
1115 %endif
1116%endmacro
1117
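; MULLD multiplies each dword of dst by a 16-bit value broadcast across src,
; combining pmullw/pmulhuw partial products since SSSE3 lacks pmulld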
1118%macro MULLD 3 ; dst, src, tmp
1119    pmulhuw         %3, %1, %2
1120    pmullw          %1, %2
1121    pslld           %3, 16
1122    paddd           %1, %3
1123%endmacro
1124
1125%if ARCH_X86_32
1126DECLARE_REG_TMP 0, 1, 2, 3, 4
1127 %if STACK_ALIGNMENT < 16
1128  %assign extra_stack 5*16
1129 %else
1130  %assign extra_stack 3*16
1131 %endif
1132cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
1133                              dst, dst_stride, left, lpf, lpf_stride, w, params, h
1134 %if STACK_ALIGNMENT < 16
1135  %define dstm         dword [esp+calloff+16*0+4*6]
1136  %define dst_stridemp dword [esp+calloff+16*3+4*7]
1137  %define leftm        dword [esp+calloff+16*3+4*0]
1138  %define lpfm         dword [esp+calloff+16*3+4*1]
1139  %define lpf_stridem  dword [esp+calloff+16*3+4*2]
1140  %define w0m          dword [esp+calloff+16*3+4*3]
1141  %define hd           dword [esp+calloff+16*3+4*4]
1142  %define edgeb         byte [esp+calloff+16*3+4*5]
1143  %define edged        dword [esp+calloff+16*3+4*5]
1144  %define leftmp leftm
1145 %else
1146  %define w0m wm
1147  %define hd dword r6m
1148  %define edgeb  byte r8m
1149  %define edged dword r8m
1150 %endif
1151 %define hvsrcm dword [esp+calloff+4*0]
1152 %define w1m    dword [esp+calloff+4*1]
1153 %define t0m    dword [esp+calloff+4*2]
1154 %define t2m    dword [esp+calloff+4*3]
1155 %define t3m    dword [esp+calloff+4*4]
1156 %define t4m    dword [esp+calloff+4*5]
1157 %define  m8 [base+pd_8]
1158 %define  m9 [base+pw_25]
1159 %define m10 [esp+calloff+16*2]
1160 %define m11 [base+pd_0xf00800a4]
1161 %define m12 [base+pw_256]
1162 %define m13 [base+pd_34816]
1163 %define m14 [base+pw_1023]
1164 %define m15 [base+sgr_lshuf5]
1165 %define r10 r5
1166 %define base r6-$$
1167 %assign calloff 0
1168 %if STACK_ALIGNMENT < 16
1169    mov    dst_strideq, [rstk+stack_offset+ 8]
1170    mov          leftq, [rstk+stack_offset+12]
1171    mov           lpfq, [rstk+stack_offset+16]
1172    mov    lpf_strideq, [rstk+stack_offset+20]
1173    mov             wd, [rstk+stack_offset+24]
1174    mov           dstm, dstq
1175    mov   dst_stridemp, dst_strideq
1176    mov          leftm, leftq
1177    mov             r1, [rstk+stack_offset+28]
1178    mov             r2, [rstk+stack_offset+36]
1179    mov           lpfm, lpfq
1180    mov    lpf_stridem, lpf_strideq
1181    mov             hd, r1
1182    mov          edged, r2
1183 %endif
1184%else
1185cglobal sgr_filter_5x5_16bpc, 5, 15, 16, -400*24-16, dst, dst_stride, left, lpf, \
1186                                                     lpf_stride, w, edge, params, h
1187%endif
1188%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
1189    movifnidn       wd, wm
1190%endif
1191%if ARCH_X86_64
1192    mov        paramsq, paramsmp
1193    lea            r13, [sgr_x_by_x-0xf03]
1194    mov          edged, r8m
1195    add             wd, wd
1196    mov             hd, r6m
1197    movu           m10, [paramsq]
1198    mova           m12, [pw_256]
1199    add           lpfq, wq
1200    mova            m8, [pd_8]
1201    lea             t1, [rsp+wq+20]
1202    mova            m9, [pw_25]
1203    add           dstq, wq
1204    lea             t3, [rsp+wq*2+400*12+16]
1205    mova           m11, [pd_0xf00800a4]
1206    lea             t4, [rsp+wq+400*20+16]
1207    pshufhw         m7, m10, q0000
1208    pshufb         m10, m12       ; s0
1209    punpckhqdq      m7, m7        ; w0
1210    neg             wq
1211    mova           m13, [pd_34816]  ; (1 << 11) + (1 << 15)
1212    pxor            m6, m6
1213    mova           m14, [pw_1023]
1214    psllw           m7, 4
1215    mova           m15, [sgr_lshuf5]
1216 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
1217 %define lpfm        [rsp+0]
1218 %define lpf_stridem [rsp+8]
1219%else
1220    mov             r1, [rstk+stack_offset+32] ; params
1221    LEA             r6, $$
1222    add             wd, wd
1223    movu            m1, [r1]
1224    add           lpfm, wq
1225    lea             t1, [rsp+extra_stack+wq+20]
1226    add           dstq, wq
1227    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
1228    mov           dstm, dstq
1229    lea             t4, [rsp+extra_stack+wq+400*20+16]
1230    mov            t3m, t3
1231    pshufhw         m7, m1, q0000
1232    mov            t4m, t4
1233    pshufb          m1, m12       ; s0
1234    punpckhqdq      m7, m7        ; w0
1235    psllw           m7, 4
1236    neg             wq
1237    mova           m10, m1
1238    pxor            m6, m6
1239    mov            w1m, wd
1240    sub             wd, 4
1241    mov           lpfq, lpfm
1242    mov    lpf_strideq, lpf_stridem
1243    mov            w0m, wd
1244%endif
1245    test         edgeb, 4 ; LR_HAVE_TOP
1246    jz .no_top
1247    call .h_top
1248    add           lpfq, lpf_strideq
1249    movif32        t2m, t1
1250    mov             t2, t1
1251    call .top_fixup
1252    add             t1, 400*6
1253    call .h_top
1254    lea            r10, [lpfq+lpf_strideq*4]
1255    mov           lpfq, dstq
1256    movif64 lpf_stridem, lpf_strideq
1257    add            r10, lpf_strideq
1258    mov           lpfm, r10 ; below
1259    movif32        t0m, t2
1260    mov             t0, t2
1261    dec             hd
1262    jz .height1
1263    or           edged, 16
1264    call .h
1265.main:
1266    add           lpfq, dst_stridemp
1267    movif32         t4, t4m
1268    call .hv
1269    call .prep_n
1270    sub             hd, 2
1271    jl .extend_bottom
1272.main_loop:
1273    movif32       lpfq, hvsrcm
1274    add           lpfq, dst_stridemp
1275%if ARCH_X86_64
1276    test            hb, hb
1277%else
1278    mov             r5, hd
1279    test            r5, r5
1280%endif
1281    jz .odd_height
1282    call .h
1283    add           lpfq, dst_stridemp
1284    call .hv
1285    movif32       dstq, dstm
1286    call .n0
1287    call .n1
1288    sub             hd, 2
1289    movif32         t0, t0m
1290    jge .main_loop
1291    test         edgeb, 8 ; LR_HAVE_BOTTOM
1292    jz .extend_bottom
1293    mov           lpfq, lpfm
1294    call .h_top
1295    add           lpfq, lpf_stridem
1296    call .hv_bottom
1297.end:
1298    movif32       dstq, dstm
1299    call .n0
1300    call .n1
1301.end2:
1302    RET
1303.height1:
1304    movif32         t4, t4m
1305    call .hv
1306    call .prep_n
1307    jmp .odd_height_end
1308.odd_height:
1309    call .hv
1310    movif32       dstq, dstm
1311    call .n0
1312    call .n1
1313.odd_height_end:
1314    call .v
1315    movif32       dstq, dstm
1316    call .n0
1317    jmp .end2
1318.extend_bottom:
1319    call .v
1320    jmp .end
1321.no_top:
1322    lea            r10, [lpfq+lpf_strideq*4]
1323    mov           lpfq, dstq
1324    movif64 lpf_stridem, lpf_strideq
1325    lea            r10, [r10+lpf_strideq*2]
1326    mov           lpfm, r10
1327    call .h
1328    lea             t2, [t1+400*6]
1329    movif32        t2m, t2
1330    call .top_fixup
1331    dec             hd
1332    jz .no_top_height1
1333    or           edged, 16
1334    mov             t0, t1
1335    mov             t1, t2
1336    movif32        t0m, t0
1337    jmp .main
1338.no_top_height1:
1339    movif32         t3, t3m
1340    movif32         t4, t4m
1341    call .v
1342    call .prep_n
1343    jmp .odd_height_end
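; right-edge padding for the box sums: build byte masks from the remaining
; width and blend the broadcast last pixel of the row (at lpfq-2) into the
; lanes of m4/m5 that lie past the frame edge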
1344.extend_right:
1345%assign stack_offset stack_offset+8
1346%assign calloff 8
1347    movd            m1, wd
1348    mova            m3, [base+pb_m14_m13]
1349    mova            m0, [base+pb_0to15]
1350    pshufb          m1, m6
1351    psubb           m2, m12, m1
1352    psubb           m3, m1
1353    movd            m1, [lpfq-2]
1354    pcmpgtb         m2, m0
1355    pcmpgtb         m3, m0
1356    pshufb          m1, m12
1357    pand            m4, m2
1358    pand            m5, m3
1359    pandn           m2, m1
1360    pandn           m3, m1
1361    por             m4, m2
1362    por             m5, m3
1363    ret
1364%assign stack_offset stack_offset-4
1365%assign calloff 4
1366.h: ; horizontal boxsum
1367%if ARCH_X86_64
1368    lea             wq, [r5-4]
1369%else
1370 %define leftq r5
1371%endif
1372    test         edgeb, 1 ; LR_HAVE_LEFT
1373    jz .h_extend_left
1374    movif32      leftq, leftm
1375    movddup         m5, [leftq]
1376    movif32         wq, w0m
1377    mova            m4, [lpfq+wq+4]
1378    add         leftmp, 8
1379    palignr         m4, m5, 10
1380    jmp .h_main
1381.h_extend_left:
1382    movif32         wq, w0m
1383    mova            m4, [lpfq+wq+4]
1384    pshufb          m4, m15
1385    jmp .h_main
1386.h_top:
1387%if ARCH_X86_64
1388    lea             wq, [r5-4]
1389%endif
1390    test         edgeb, 1 ; LR_HAVE_LEFT
1391    jz .h_extend_left
1392    movif32         wq, w0m
1393.h_loop:
1394    movu            m4, [lpfq+wq- 2]
1395.h_main:
1396    movu            m5, [lpfq+wq+14]
1397    test         edgeb, 2 ; LR_HAVE_RIGHT
1398    jnz .h_have_right
1399    cmp             wd, -20
1400    jl .h_have_right
1401    call .extend_right
1402.h_have_right:
1403    palignr         m2, m5, m4, 2
1404    paddw           m0, m4, m2
1405    palignr         m3, m5, m4, 6
1406    paddw           m0, m3
1407    punpcklwd       m1, m2, m3
1408    pmaddwd         m1, m1
1409    punpckhwd       m2, m3
1410    pmaddwd         m2, m2
1411    palignr         m5, m4, 8
1412    paddw           m0, m5
1413    punpcklwd       m3, m4, m5
1414    pmaddwd         m3, m3
1415    paddd           m1, m3
1416    punpckhwd       m3, m4, m5
1417    pmaddwd         m3, m3
1418    shufps          m4, m5, q2121
1419    paddw           m0, m4             ; sum
1420    punpcklwd       m5, m4, m6
1421    pmaddwd         m5, m5
1422    punpckhwd       m4, m6
1423    pmaddwd         m4, m4
1424    paddd           m2, m3
1425    test         edgeb, 16             ; y > 0
1426    jz .h_loop_end
1427    paddw           m0, [t1+wq+400*0]
1428    paddd           m1, [t1+wq+400*2]
1429    paddd           m2, [t1+wq+400*4]
1430.h_loop_end:
1431    paddd           m1, m5             ; sumsq
1432    paddd           m2, m4
1433    mova [t1+wq+400*0], m0
1434    mova [t1+wq+400*2], m1
1435    mova [t1+wq+400*4], m2
1436    add             wq, 16
1437    jl .h_loop
1438    ret
1439.top_fixup:
1440%if ARCH_X86_64
1441    lea             wq, [r5-4]
1442%else
1443    mov             wd, w0m
1444%endif
.top_fixup_loop: ; the sums of the first row need to be doubled
1446    mova            m0, [t1+wq+400*0]
1447    mova            m1, [t1+wq+400*2]
1448    mova            m2, [t1+wq+400*4]
1449    paddw           m0, m0
1450    paddd           m1, m1
1451    paddd           m2, m2
1452    mova [t2+wq+400*0], m0
1453    mova [t2+wq+400*2], m1
1454    mova [t2+wq+400*4], m2
1455    add             wq, 16
1456    jl .top_fixup_loop
1457    ret
1458ALIGN function_align
1459.hv: ; horizontal boxsum + vertical boxsum + ab
1460%if ARCH_X86_64
1461    lea             wq, [r5-4]
1462%else
1463    mov         hvsrcm, lpfq
1464%endif
1465    test         edgeb, 1 ; LR_HAVE_LEFT
1466    jz .hv_extend_left
1467    movif32      leftq, leftm
1468    movddup         m5, [leftq]
1469    movif32         wq, w0m
1470    mova            m4, [lpfq+wq+4]
1471    add         leftmp, 8
1472    palignr         m4, m5, 10
1473    jmp .hv_main
1474.hv_extend_left:
1475    movif32         wq, w0m
1476    mova            m4, [lpfq+wq+4]
1477    pshufb          m4, m15
1478    jmp .hv_main
1479.hv_bottom:
1480%if ARCH_X86_64
1481    lea             wq, [r5-4]
1482%else
1483    mov         hvsrcm, lpfq
1484%endif
1485    test         edgeb, 1 ; LR_HAVE_LEFT
1486    jz .hv_extend_left
1487    movif32         wq, w0m
1488%if ARCH_X86_32
1489    jmp .hv_loop_start
1490%endif
1491.hv_loop:
1492    movif32       lpfq, hvsrcm
1493.hv_loop_start:
1494    movu            m4, [lpfq+wq- 2]
1495.hv_main:
1496    movu            m5, [lpfq+wq+14]
1497    test         edgeb, 2 ; LR_HAVE_RIGHT
1498    jnz .hv_have_right
1499    cmp             wd, -20
1500    jl .hv_have_right
1501    call .extend_right
1502.hv_have_right:
1503    movif32         t3, hd
1504    palignr         m3, m5, m4, 2
1505    paddw           m0, m4, m3
1506    palignr         m1, m5, m4, 6
1507    paddw           m0, m1
1508    punpcklwd       m2, m3, m1
1509    pmaddwd         m2, m2
1510    punpckhwd       m3, m1
1511    pmaddwd         m3, m3
1512    palignr         m5, m4, 8
1513    paddw           m0, m5
1514    punpcklwd       m1, m4, m5
1515    pmaddwd         m1, m1
1516    paddd           m2, m1
1517    punpckhwd       m1, m4, m5
1518    pmaddwd         m1, m1
1519    shufps          m4, m5, q2121
1520    paddw           m0, m4            ; h sum
1521    punpcklwd       m5, m4, m6
1522    pmaddwd         m5, m5
1523    punpckhwd       m4, m6
1524    pmaddwd         m4, m4
1525    paddd           m3, m1
1526    paddd           m2, m5            ; h sumsq
1527    paddd           m3, m4
1528    paddw           m1, m0, [t1+wq+400*0]
1529    paddd           m4, m2, [t1+wq+400*2]
1530    paddd           m5, m3, [t1+wq+400*4]
1531%if ARCH_X86_64
1532    test            hd, hd
1533%else
1534    test            t3, t3
1535%endif
1536    jz .hv_last_row
1537.hv_main2:
1538    paddw           m1, [t2+wq+400*0] ; hv sum
1539    paddd           m4, [t2+wq+400*2] ; hv sumsq
1540    paddd           m5, [t2+wq+400*4]
1541    mova [t0+wq+400*0], m0
1542    mova [t0+wq+400*2], m2
1543    mova [t0+wq+400*4], m3
1544    psrlw           m3, m1, 1
1545    paddd           m4, m8
1546    pavgw           m3, m6             ; (b + 2) >> 2
1547    paddd           m5, m8
1548    psrld           m4, 4              ; (a + 8) >> 4
1549    punpcklwd       m2, m3, m6
1550    psrld           m5, 4
1551    punpckhwd       m3, m6
1552    MULLD           m4, m9, m0         ; a * 25
1553    MULLD           m5, m9, m0
1554    pmaddwd         m2, m2             ; b * b
1555    pmaddwd         m3, m3
1556    punpcklwd       m0, m1, m6         ; b
1557    punpckhwd       m1, m6
1558    MAXSD           m4, m2, m6
1559    MAXSD           m5, m3, m6, 1
1560    psubd           m4, m2             ; p
1561    psubd           m5, m3
1562    MULLD           m4, m10, m2        ; p * s
1563    MULLD           m5, m10, m2
1564    pmaddwd         m0, m11            ; b * 164
1565    pmaddwd         m1, m11
1566    paddusw         m4, m11
1567    paddusw         m5, m11
1568    psrld           m3, m4, 20         ; min(z, 255)
1569    movif32         t3, t3m
1570    psrld           m4, m5, 20
1571    GATHER_X_BY_X   m2, m3, m4, t2, t2m
1572    punpcklwd       m3, m2, m2
1573    punpckhwd       m4, m2, m2
1574    MULLD           m0, m3, m5
1575    MULLD           m1, m4, m5
1576    psubw           m5, m12, m2        ; a
1577    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
1578    paddd           m1, m13
1579    mova     [t4+wq+4], m5
1580    psrld           m0, 12             ; b
1581    psrld           m1, 12
1582    mova  [t3+wq*2+ 8], m0
1583    mova  [t3+wq*2+24], m1
1584    add             wq, 16
1585    jl .hv_loop
1586    mov             t2, t1
1587    mov             t1, t0
1588    mov             t0, t2
1589    movif32        t2m, t2
1590    movif32        t0m, t0
1591    ret
1592.hv_last_row: ; esoteric edge case for odd heights
1593    mova    [t1+wq+400*0], m1
1594    paddw              m1, m0
1595    mova    [t1+wq+400*2], m4
1596    paddd              m4, m2
1597    mova    [t1+wq+400*4], m5
1598    paddd              m5, m3
1599    jmp .hv_main2
1600.v: ; vertical boxsum + ab
1601%if ARCH_X86_64
1602    lea             wq, [r5-4]
1603%else
1604    mov             wd, w0m
1605%endif
1606.v_loop:
1607    mova            m0, [t1+wq+400*0]
1608    mova            m2, [t1+wq+400*2]
1609    mova            m3, [t1+wq+400*4]
1610    paddw           m1, m0, [t2+wq+400*0]
1611    paddd           m4, m2, [t2+wq+400*2]
1612    paddd           m5, m3, [t2+wq+400*4]
1613    paddw           m0, m0
1614    paddd           m2, m2
1615    paddd           m3, m3
1616    paddw           m1, m0             ; hv sum
1617    paddd           m4, m2             ; hv sumsq
1618    paddd           m5, m3
1619    psrlw           m3, m1, 1
1620    paddd           m4, m8
1621    pavgw           m3, m6             ; (b + 2) >> 2
1622    paddd           m5, m8
1623    psrld           m4, 4              ; (a + 8) >> 4
1624    punpcklwd       m2, m3, m6
1625    psrld           m5, 4
1626    punpckhwd       m3, m6
1627    MULLD           m4, m9, m0         ; a * 25
1628    MULLD           m5, m9, m0
1629    pmaddwd         m2, m2             ; b * b
1630    pmaddwd         m3, m3
1631    punpcklwd       m0, m1, m6         ; b
1632    punpckhwd       m1, m6
1633    MAXSD           m4, m2, m6
1634    MAXSD           m5, m3, m6, 1
1635    psubd           m4, m2             ; p
1636    psubd           m5, m3
1637    MULLD           m4, m10, m2        ; p * s
1638    MULLD           m5, m10, m2
1639    pmaddwd         m0, m11            ; b * 164
1640    pmaddwd         m1, m11
1641    paddusw         m4, m11
1642    paddusw         m5, m11
1643    psrld           m3, m4, 20         ; min(z, 255)
1644    psrld           m4, m5, 20
1645    GATHER_X_BY_X   m2, m3, m4, t2, t2m
1646    punpcklwd       m3, m2, m2
1647    punpckhwd       m4, m2, m2
1648    MULLD           m0, m3, m5
1649    MULLD           m1, m4, m5
1650    psubw           m5, m12, m2        ; a
1651    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
1652    paddd           m1, m13
1653    mova     [t4+wq+4], m5
1654    psrld           m0, 12             ; b
1655    psrld           m1, 12
1656    mova  [t3+wq*2+ 8], m0
1657    mova  [t3+wq*2+24], m1
1658    add             wq, 16
1659    jl .v_loop
1660    ret
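; the neighbour passes apply the 5,6,5 column weighting to the a/b planes
; (m0 = x1 + 5*(x0+x1+x2) = 5*x0 + 6*x1 + 5*x2, hence "a 565"/"b 565");
; .n0 then sums the weighted rows of two adjacent scanlines, while .n1
; reuses the single stored row with a smaller downshift for the odd scanline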
1661.prep_n: ; initial neighbor setup
1662    movif64         wq, r5
1663    movif32         wd, w1m
1664.prep_n_loop:
1665    movu            m0, [t4+wq*1+ 2]
1666    movu            m3, [t4+wq*1+ 4]
1667    movu            m1, [t3+wq*2+ 4]
1668    movu            m4, [t3+wq*2+ 8]
1669    movu            m2, [t3+wq*2+20]
1670    movu            m5, [t3+wq*2+24]
1671    paddw           m3, m0
1672    paddd           m4, m1
1673    paddd           m5, m2
1674    paddw           m3, [t4+wq*1+ 0]
1675    paddd           m4, [t3+wq*2+ 0]
1676    paddd           m5, [t3+wq*2+16]
1677    paddw           m0, m3
1678    psllw           m3, 2
1679    paddd           m1, m4
1680    pslld           m4, 2
1681    paddd           m2, m5
1682    pslld           m5, 2
1683    paddw           m0, m3             ; a 565
1684    paddd           m1, m4             ; b 565
1685    paddd           m2, m5
1686    mova [t4+wq*1+400*2+ 0], m0
1687    mova [t3+wq*2+400*4+ 0], m1
1688    mova [t3+wq*2+400*4+16], m2
1689    add             wq, 16
1690    jl .prep_n_loop
1691    ret
1692ALIGN function_align
1693.n0: ; neighbor + output (even rows)
1694    movif64         wq, r5
1695    movif32         wd, w1m
1696.n0_loop:
1697    movu            m0, [t4+wq*1+ 2]
1698    movu            m3, [t4+wq*1+ 4]
1699    movu            m1, [t3+wq*2+ 4]
1700    movu            m4, [t3+wq*2+ 8]
1701    movu            m2, [t3+wq*2+20]
1702    movu            m5, [t3+wq*2+24]
1703    paddw           m3, m0
1704    paddd           m4, m1
1705    paddd           m5, m2
1706    paddw           m3, [t4+wq*1+ 0]
1707    paddd           m4, [t3+wq*2+ 0]
1708    paddd           m5, [t3+wq*2+16]
1709    paddw           m0, m3
1710    psllw           m3, 2
1711    paddd           m1, m4
1712    pslld           m4, 2
1713    paddd           m2, m5
1714    pslld           m5, 2
1715    paddw           m0, m3             ; a 565
1716    paddd           m1, m4             ; b 565
1717    paddd           m2, m5
1718    paddw           m3, m0, [t4+wq*1+400*2+ 0]
1719    paddd           m4, m1, [t3+wq*2+400*4+ 0]
1720    paddd           m5, m2, [t3+wq*2+400*4+16]
1721    mova [t4+wq*1+400*2+ 0], m0
1722    mova [t3+wq*2+400*4+ 0], m1
1723    mova [t3+wq*2+400*4+16], m2
1724    mova            m0, [dstq+wq]
1725    punpcklwd       m1, m0, m6          ; src
1726    punpcklwd       m2, m3, m6          ; a
1727    pmaddwd         m2, m1              ; a * src
1728    punpckhwd       m1, m0, m6
1729    punpckhwd       m3, m6
1730    pmaddwd         m3, m1
1731    paddd           m2, m4              ; a * src + b + (1 << 8)
1732    paddd           m3, m5
1733    psrld           m2, 9
1734    psrld           m3, 9
1735    packssdw        m2, m3
1736    psllw           m1, m0, 4
1737    psubw           m2, m1
1738    pmulhrsw        m2, m7
1739    paddw           m0, m2
1740    pmaxsw          m0, m6
1741    pminsw          m0, m14
1742    mova     [dstq+wq], m0
1743    add             wq, 16
1744    jl .n0_loop
1745    add           dstq, dst_stridemp
1746    ret
1747ALIGN function_align
1748.n1: ; neighbor + output (odd rows)
1749    movif64         wq, r5
1750    movif32         wd, w1m
1751.n1_loop:
1752    mova            m0, [dstq+wq]
1753    mova            m3, [t4+wq*1+400*2+ 0]
1754    mova            m4, [t3+wq*2+400*4+ 0]
1755    mova            m5, [t3+wq*2+400*4+16]
1756    punpcklwd       m1, m0, m6          ; src
1757    punpcklwd       m2, m3, m6          ; a
1758    pmaddwd         m2, m1
1759    punpckhwd       m1, m0, m6
1760    punpckhwd       m3, m6
1761    pmaddwd         m3, m1
1762    paddd           m2, m4              ; a * src + b + (1 << 7)
1763    paddd           m3, m5
1764    psrld           m2, 8
1765    psrld           m3, 8
1766    packssdw        m2, m3
1767    psllw           m1, m0, 4
1768    psubw           m2, m1
1769    pmulhrsw        m2, m7
1770    paddw           m0, m2
1771    pmaxsw          m0, m6
1772    pminsw          m0, m14
1773    mova     [dstq+wq], m0
1774    add             wq, 16
1775    jl .n1_loop
1776    add           dstq, dst_stridemp
1777    movif32       dstm, dstq
1778    ret
1779
1780%if ARCH_X86_32
1781 %if STACK_ALIGNMENT < 16
1782  %assign extra_stack 4*16
1783 %else
1784  %assign extra_stack 2*16
1785 %endif
1786cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
1787                              dst, dst_stride, left, lpf, lpf_stride, w, params, h
1788 %if STACK_ALIGNMENT < 16
1789  %define dstm         dword [esp+calloff+16*2+4*0]
1790  %define dst_stridemp dword [esp+calloff+16*2+4*1]
1791  %define leftm        dword [esp+calloff+16*2+4*2]
1792  %define lpfm         dword [esp+calloff+16*2+4*3]
1793  %define lpf_stridem  dword [esp+calloff+16*2+4*4]
1794  %define w0m          dword [esp+calloff+16*2+4*5]
1795  %define hd           dword [esp+calloff+16*2+4*6]
1796  %define edgeb         byte [esp+calloff+16*2+4*7]
1797  %define edged        dword [esp+calloff+16*2+4*7]
1798  %define leftmp leftm
1799 %else
1800  %define w0m wm
1801  %define hd dword r6m
1802  %define edgeb  byte r8m
1803  %define edged dword r8m
1804 %endif
1805 %define hvsrcm dword [esp+calloff+4*0]
1806 %define w1m    dword [esp+calloff+4*1]
1807 %define t3m    dword [esp+calloff+4*2]
1808 %define t4m    dword [esp+calloff+4*3]
1809 %define  m8 [base+pd_8]
1810 %define  m9 [esp+calloff+16*1]
1811 %define m10 [base+pd_0xf00801c7]
1812 %define m11 [base+pd_34816]
1813 %define m12 [base+pw_256]
1814 %define m13 [base+pw_1023]
1815 %define m14 [base+sgr_lshuf3]
1816 %define m15 m6
1817 %define base r6-$$
1818 %assign calloff 0
1819 %if STACK_ALIGNMENT < 16
1820    mov    dst_strideq, [rstk+stack_offset+ 8]
1821    mov          leftq, [rstk+stack_offset+12]
1822    mov           lpfq, [rstk+stack_offset+16]
1823    mov    lpf_strideq, [rstk+stack_offset+20]
1824    mov             wd, [rstk+stack_offset+24]
1825    mov           dstm, dstq
1826    mov   dst_stridemp, dst_strideq
1827    mov          leftm, leftq
1828    mov             r1, [rstk+stack_offset+28]
1829    mov             r2, [rstk+stack_offset+36]
1830    mov           lpfm, lpfq
1831    mov    lpf_stridem, lpf_strideq
1832    mov             hd, r1
1833    mov          edged, r2
1834 %endif
1835%else
1836cglobal sgr_filter_3x3_16bpc, 5, 15, 16, 400*42+8, dst, dst_stride, left, lpf, \
1837                                                   lpf_stride, w, edge, params, h
1838%endif
1839%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
1840    movifnidn       wd, wm
1841%endif
1842%if ARCH_X86_64
1843    mov        paramsq, paramsmp
1844    lea            r13, [sgr_x_by_x-0xf03]
1845    mov          edged, r8m
1846    add             wd, wd
1847    mov             hd, r6m
1848    movq            m9, [paramsq+4]
1849    mova           m12, [pw_256]
1850    add           lpfq, wq
1851    lea             t1, [rsp+wq+12]
1852    mova            m8, [pd_8]
1853    add           dstq, wq
1854    lea             t3, [rsp+wq*2+400*12+8]
1855    mova           m10, [pd_0xf00801c7]
1856    lea             t4, [rsp+wq+400*32+8]
1857    mova           m11, [pd_34816]
1858    pshuflw         m7, m9, q3333
1859    pshufb          m9, m12       ; s1
1860    punpcklqdq      m7, m7        ; w1
1861    neg             wq
1862    pxor            m6, m6
1863    mova           m13, [pw_1023]
1864    psllw           m7, 4
1865    mova           m14, [sgr_lshuf3]
1866 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
1867 %define lpfm [rsp]
1868%else
1869    mov             r1, [rstk+stack_offset+32] ; params
1870    LEA             r6, $$
1871    add             wd, wd
1872    movq            m1, [r1+4]
1873    add           lpfm, wq
1874    lea             t1, [rsp+extra_stack+wq+20]
1875    add           dstq, wq
1876    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
1877    mov           dstm, dstq
1878    lea             t4, [rsp+extra_stack+wq+400*32+16]
1879    mov            t3m, t3
1880    pshuflw         m7, m1, q3333
1881    mov            t4m, t4
1882    pshufb          m1, m12       ; s1
1883    punpcklqdq      m7, m7        ; w1
1884    psllw           m7, 4
1885    neg             wq
1886    mova            m9, m1
1887    pxor            m6, m6
1888    mov            w1m, wd
1889    sub             wd, 4
1890    mov           lpfq, lpfm
1891    mov    lpf_strideq, lpf_stridem
1892    mov            w0m, wd
1893%endif
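; Register/constant layout used for the rest of this function (on x86-32 the
; high registers are the memory operands %defined above):
;   m6  = 0                    m7  = w1 << 4
;   m8  = pd_8                 m9  = s1 (broadcast)
;   m10 = pd_0xf00801c7        m11 = pd_34816 = (1 << 11) + (1 << 15)
;   m12 = pw_256               m13 = pw_1023
;   m14 = sgr_lshuf3           m15 = scratch for MULLD/MAXSD
; t1/t2 hold per-row box sums, t3/t4 the per-row b/a planes, all on the stack.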
1894    test         edgeb, 4 ; LR_HAVE_TOP
1895    jz .no_top
1896    call .h_top
1897    add           lpfq, lpf_strideq
1898    mov             t2, t1
1899    add             t1, 400*6
1900    call .h_top
1901    lea            r10, [lpfq+lpf_strideq*4]
1902    mov           lpfq, dstq
1903    add            r10, lpf_strideq
1904    mov           lpfm, r10 ; below
1905    movif32         t4, t4m
1906    call .hv0
1907.main:
1908    dec             hd
1909    jz .height1
1910    movif32       lpfq, hvsrcm
1911    add           lpfq, dst_stridemp
1912    call .hv1
1913    call .prep_n
1914    sub             hd, 2
1915    jl .extend_bottom
1916.main_loop:
1917    movif32       lpfq, hvsrcm
1918    add           lpfq, dst_stridemp
1919    call .hv0
1920%if ARCH_X86_64
1921    test            hb, hb
1922%else
1923    mov             r5, hd
1924    test            r5, r5
1925%endif
1926    jz .odd_height
1927    movif32       lpfq, hvsrcm
1928    add           lpfq, dst_stridemp
1929    call .hv1
1930    call .n0
1931    call .n1
1932    sub             hd, 2
1933    jge .main_loop
1934    test         edgeb, 8 ; LR_HAVE_BOTTOM
1935    jz .extend_bottom
1936    mov           lpfq, lpfm
1937    call .hv0_bottom
1938%if ARCH_X86_64
1939    add           lpfq, lpf_strideq
1940%else
1941    mov           lpfq, hvsrcm
1942    add           lpfq, lpf_stridem
1943%endif
1944    call .hv1_bottom
1945.end:
1946    call .n0
1947    call .n1
1948.end2:
1949    RET
1950.height1:
1951    call .v1
1952    call .prep_n
1953    jmp .odd_height_end
1954.odd_height:
1955    call .v1
1956    call .n0
1957    call .n1
1958.odd_height_end:
1959    call .v0
1960    call .v1
1961    call .n0
1962    jmp .end2
1963.extend_bottom:
1964    call .v0
1965    call .v1
1966    jmp .end
1967.no_top:
1968    lea            r10, [lpfq+lpf_strideq*4]
1969    mov           lpfq, dstq
1970    lea            r10, [r10+lpf_strideq*2]
1971    mov           lpfm, r10
1972    call .h
1973%if ARCH_X86_64
1974    lea             wq, [r5-4]
1975%else
1976    mov             wq, w0m
1977    mov         hvsrcm, lpfq
1978%endif
1979    lea             t2, [t1+400*6]
1980.top_fixup_loop:
1981    mova            m0, [t1+wq+400*0]
1982    mova            m1, [t1+wq+400*2]
1983    mova            m2, [t1+wq+400*4]
1984    mova [t2+wq+400*0], m0
1985    mova [t2+wq+400*2], m1
1986    mova [t2+wq+400*4], m2
1987    add             wq, 16
1988    jl .top_fixup_loop
1989    movif32         t3, t3m
1990    movif32         t4, t4m
1991    call .v0
1992    jmp .main
1993.extend_right:
1994%assign stack_offset stack_offset+8
1995%assign calloff 8
1996    movd            m1, wd
1997    mova            m2, [base+pb_m2_m1]
1998    mova            m3, [base+pb_0to15]
1999    movd            m5, [lpfq-2]
2000    pshufb          m1, m6
2001    pshufb          m5, m12
2002    psubb           m2, m1
2003    pcmpgtb         m2, m3
2004    pand            m4, m2
2005    pandn           m2, m5
2006    por             m4, m2
2007    ret
2008%assign stack_offset stack_offset-4
2009%assign calloff 4
2010.h: ; horizontal boxsum
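; Per 16-pixel chunk this produces, for every column x (sketch):
;   sum[x]   = s[x] + s[x+1] + s[x+2]
;   sumsq[x] = s[x]*s[x] + s[x+1]*s[x+1] + s[x+2]*s[x+2]
; sum goes to t1+400*0, sumsq (32-bit, low/high 4 columns) to t1+400*2/400*4.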
2011%if ARCH_X86_64
2012    lea             wq, [r5-4]
2013%else
2014 %define leftq r5
2015%endif
2016    test         edgeb, 1 ; LR_HAVE_LEFT
2017    jz .h_extend_left
2018    movif32      leftq, leftm
2019    movddup         m5, [leftq]
2020    movif32         wq, w0m
2021    mova            m4, [lpfq+wq+4]
2022    add         leftmp, 8
2023    palignr         m4, m5, 12
2024    jmp .h_main
2025.h_extend_left:
2026    movif32         wq, w0m
2027    mova            m4, [lpfq+wq+4]
2028    pshufb          m4, m14
2029    jmp .h_main
2030.h_top:
2031%if ARCH_X86_64
2032    lea             wq, [r5-4]
2033%endif
2034    test         edgeb, 1 ; LR_HAVE_LEFT
2035    jz .h_extend_left
2036    movif32         wq, w0m
2037.h_loop:
2038    movu            m4, [lpfq+wq+ 0]
2039.h_main:
2040    movu            m5, [lpfq+wq+16]
2041    test         edgeb, 2 ; LR_HAVE_RIGHT
2042    jnz .h_have_right
2043    cmp             wd, -18
2044    jl .h_have_right
2045    call .extend_right
2046.h_have_right:
2047    palignr         m0, m5, m4, 2
2048    paddw           m1, m4, m0
2049    punpcklwd       m2, m4, m0
2050    pmaddwd         m2, m2
2051    punpckhwd       m3, m4, m0
2052    pmaddwd         m3, m3
2053    palignr         m5, m4, 4
2054    paddw           m1, m5             ; sum
2055    punpcklwd       m4, m5, m6
2056    pmaddwd         m4, m4
2057    punpckhwd       m5, m6
2058    pmaddwd         m5, m5
2059    paddd           m2, m4             ; sumsq
2060    paddd           m3, m5
2061    mova [t1+wq+400*0], m1
2062    mova [t1+wq+400*2], m2
2063    mova [t1+wq+400*4], m3
2064    add             wq, 16
2065    jl .h_loop
2066    ret
2067ALIGN function_align
2068.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
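; .hv0/.hv1/.v0/.v1 share the same per-column surrogate computation once the
; 3x3 box sums are complete. Rough sketch, following the inline comments
; (names are descriptive only):
;   a  = ((sumsq + 8) >> 4) * 9
;   bb = (sum + 2) >> 2
;   p  = max(a - bb*bb, 0)
;   z  = min((p * s1 + rounding) >> 20, 255)     ; rounding via pd_0xf00801c7
;   x  = sgr_x_by_x[z]                           ; GATHER_X_BY_X
;   t4 <- 256 - x
;   t3 <- (x * sum * 455 + (1 << 11) + (1 << 15)) >> 12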
2069%if ARCH_X86_64
2070    lea             wq, [r5-4]
2071%else
2072    mov         hvsrcm, lpfq
2073%endif
2074    test         edgeb, 1 ; LR_HAVE_LEFT
2075    jz .hv0_extend_left
2076    movif32      leftq, leftm
2077    movddup         m5, [leftq]
2078    movif32         wq, w0m
2079    mova            m4, [lpfq+wq+4]
2080    add         leftmp, 8
2081    palignr         m4, m5, 12
2082    jmp .hv0_main
2083.hv0_extend_left:
2084    movif32         wq, w0m
2085    mova            m4, [lpfq+wq+4]
2086    pshufb          m4, m14
2087    jmp .hv0_main
2088.hv0_bottom:
2089%if ARCH_X86_64
2090    lea             wq, [r5-4]
2091%else
2092    mov         hvsrcm, lpfq
2093%endif
2094    test         edgeb, 1 ; LR_HAVE_LEFT
2095    jz .hv0_extend_left
2096    movif32         wq, w0m
2097%if ARCH_X86_32
2098    jmp .hv0_loop_start
2099%endif
2100.hv0_loop:
2101    movif32       lpfq, hvsrcm
2102.hv0_loop_start:
2103    movu            m4, [lpfq+wq+ 0]
2104.hv0_main:
2105    movu            m5, [lpfq+wq+16]
2106    test         edgeb, 2 ; LR_HAVE_RIGHT
2107    jnz .hv0_have_right
2108    cmp             wd, -18
2109    jl .hv0_have_right
2110    call .extend_right
2111.hv0_have_right:
2112    palignr         m0, m5, m4, 2
2113    paddw           m1, m4, m0
2114    punpcklwd       m2, m4, m0
2115    pmaddwd         m2, m2
2116    punpckhwd       m3, m4, m0
2117    pmaddwd         m3, m3
2118    palignr         m5, m4, 4
2119    paddw           m1, m5             ; sum
2120    punpcklwd       m4, m5, m6
2121    pmaddwd         m4, m4
2122    punpckhwd       m5, m6
2123    pmaddwd         m5, m5
2124    paddd           m2, m4             ; sumsq
2125    paddd           m3, m5
2126    paddw           m0, m1, [t1+wq+400*0]
2127    paddd           m4, m2, [t1+wq+400*2]
2128    paddd           m5, m3, [t1+wq+400*4]
2129    mova [t1+wq+400*0], m1
2130    mova [t1+wq+400*2], m2
2131    mova [t1+wq+400*4], m3
2132    paddw           m1, m0, [t2+wq+400*0]
2133    paddd           m2, m4, [t2+wq+400*2]
2134    paddd           m3, m5, [t2+wq+400*4]
2135    mova [t2+wq+400*0], m0
2136    mova [t2+wq+400*2], m4
2137    mova [t2+wq+400*4], m5
2138    paddd           m2, m8
2139    paddd           m3, m8
2140    psrld           m2, 4              ; (a + 8) >> 4
2141    psrld           m3, 4
2142    pslld           m4, m2, 3
2143    pslld           m5, m3, 3
2144    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2145    paddd           m5, m3
2146    psrlw           m3, m1, 1
2147    pavgw           m3, m6             ; (b + 2) >> 2
2148    punpcklwd       m2, m3, m6
2149    pmaddwd         m2, m2
2150    punpckhwd       m3, m6
2151    pmaddwd         m3, m3
2152    punpcklwd       m0, m1, m6         ; b
2153    punpckhwd       m1, m6
2154    MAXSD           m4, m2, m15
2155    MAXSD           m5, m3, m15
2156    psubd           m4, m2             ; p
2157    psubd           m5, m3
2158    MULLD           m4, m9, m15        ; p * s
2159    MULLD           m5, m9, m15
2160    pmaddwd         m0, m10            ; b * 455
2161    pmaddwd         m1, m10
2162    paddusw         m4, m10
2163    paddusw         m5, m10
2164    psrld           m3, m4, 20         ; min(z, 255)
2165    movif32         t3, t3m
2166    psrld           m4, m5, 20
2167    GATHER_X_BY_X   m2, m3, m4, r0, dstm
2168    punpcklwd       m3, m2, m2
2169    punpckhwd       m4, m2, m2
2170    MULLD           m0, m3, m15
2171    MULLD           m1, m4, m15
2172    psubw           m5, m12, m2
2173%if ARCH_X86_32
2174    pxor            m6, m6
2175%endif
2176    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2177    paddd           m1, m11
2178    mova     [t4+wq+4], m5
2179    psrld           m0, 12
2180    psrld           m1, 12
2181    mova  [t3+wq*2+ 8], m0
2182    mova  [t3+wq*2+24], m1
2183    add             wq, 16
2184    jl .hv0_loop
2185    ret
2186ALIGN function_align
2187.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
2188%if ARCH_X86_64
2189    lea             wq, [r5-4]
2190%else
2191    mov         hvsrcm, lpfq
2192%endif
2193    test         edgeb, 1 ; LR_HAVE_LEFT
2194    jz .hv1_extend_left
2195    movif32      leftq, leftm
2196    movddup         m5, [leftq]
2197    movif32         wq, w0m
2198    mova            m4, [lpfq+wq+4]
2199    add         leftmp, 8
2200    palignr         m4, m5, 12
2201    jmp .hv1_main
2202.hv1_extend_left:
2203    movif32         wq, w0m
2204    mova            m4, [lpfq+wq+4]
2205    pshufb          m4, m14
2206    jmp .hv1_main
2207.hv1_bottom:
2208%if ARCH_X86_64
2209    lea             wq, [r5-4]
2210%else
2211    mov         hvsrcm, lpfq
2212%endif
2213    test         edgeb, 1 ; LR_HAVE_LEFT
2214    jz .hv1_extend_left
2215    movif32         wq, w0m
2216%if ARCH_X86_32
2217    jmp .hv1_loop_start
2218%endif
2219.hv1_loop:
2220    movif32       lpfq, hvsrcm
2221.hv1_loop_start:
2222    movu            m4, [lpfq+wq+ 0]
2223.hv1_main:
2224    movu            m5, [lpfq+wq+16]
2225    test         edgeb, 2 ; LR_HAVE_RIGHT
2226    jnz .hv1_have_right
2227    cmp             wd, -18
2228    jl .hv1_have_right
2229    call .extend_right
2230.hv1_have_right:
2231    palignr         m1, m5, m4, 2
2232    paddw           m0, m4, m1
2233    punpcklwd       m2, m4, m1
2234    pmaddwd         m2, m2
2235    punpckhwd       m3, m4, m1
2236    pmaddwd         m3, m3
2237    palignr         m5, m4, 4
2238    paddw           m0, m5             ; h sum
2239    punpcklwd       m1, m5, m6
2240    pmaddwd         m1, m1
2241    punpckhwd       m5, m6
2242    pmaddwd         m5, m5
2243    paddd           m2, m1             ; h sumsq
2244    paddd           m3, m5
2245    paddw           m1, m0, [t2+wq+400*0]
2246    paddd           m4, m2, [t2+wq+400*2]
2247    paddd           m5, m3, [t2+wq+400*4]
2248    mova [t2+wq+400*0], m0
2249    mova [t2+wq+400*2], m2
2250    mova [t2+wq+400*4], m3
2251    paddd           m4, m8
2252    paddd           m5, m8
2253    psrld           m4, 4              ; (a + 8) >> 4
2254    psrld           m5, 4
2255    pslld           m2, m4, 3
2256    pslld           m3, m5, 3
2257    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2258    paddd           m5, m3
2259    psrlw           m3, m1, 1
2260    pavgw           m3, m6             ; (b + 2) >> 2
2261    punpcklwd       m2, m3, m6
2262    pmaddwd         m2, m2
2263    punpckhwd       m3, m6
2264    pmaddwd         m3, m3
2265    punpcklwd       m0, m1, m6         ; b
2266    punpckhwd       m1, m6
2267    MAXSD           m4, m2, m15
2268    MAXSD           m5, m3, m15
2269    psubd           m4, m2             ; p
2270    psubd           m5, m3
2271    MULLD           m4, m9, m15        ; p * s
2272    MULLD           m5, m9, m15
2273    pmaddwd         m0, m10            ; b * 455
2274    pmaddwd         m1, m10
2275    paddusw         m4, m10
2276    paddusw         m5, m10
2277    psrld           m3, m4, 20         ; min(z, 255)
2278    movif32         t3, t3m
2279    psrld           m4, m5, 20
2280    GATHER_X_BY_X   m2, m3, m4, r0, dstm
2281    punpcklwd       m3, m2, m2
2282    punpckhwd       m4, m2, m2
2283    MULLD           m0, m3, m15
2284    MULLD           m1, m4, m15
2285    psubw           m5, m12, m2
2286%if ARCH_X86_32
2287    pxor            m6, m6
2288%endif
2289    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2290    paddd           m1, m11
2291    mova [t4+wq*1+400*2 +4], m5
2292    psrld           m0, 12
2293    psrld           m1, 12
2294    mova [t3+wq*2+400*4+ 8], m0
2295    mova [t3+wq*2+400*4+24], m1
2296    add             wq, 16
2297    jl .hv1_loop
2298    mov            r10, t2
2299    mov             t2, t1
2300    mov             t1, r10
2301    ret
2302.v0: ; vertical boxsums + ab (even rows)
2303%if ARCH_X86_64
2304    lea             wq, [r5-4]
2305%else
2306    mov             wd, w0m
2307%endif
2308.v0_loop:
2309    mova            m0, [t1+wq+400*0]
2310    mova            m4, [t1+wq+400*2]
2311    mova            m5, [t1+wq+400*4]
2312    paddw           m0, m0
2313    paddd           m4, m4
2314    paddd           m5, m5
2315    paddw           m1, m0, [t2+wq+400*0]
2316    paddd           m2, m4, [t2+wq+400*2]
2317    paddd           m3, m5, [t2+wq+400*4]
2318    mova [t2+wq+400*0], m0
2319    mova [t2+wq+400*2], m4
2320    mova [t2+wq+400*4], m5
2321    paddd           m2, m8
2322    paddd           m3, m8
2323    psrld           m2, 4              ; (a + 8) >> 4
2324    psrld           m3, 4
2325    pslld           m4, m2, 3
2326    pslld           m5, m3, 3
2327    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2328    paddd           m5, m3
2329    psrlw           m3, m1, 1
2330    pavgw           m3, m6             ; (b + 2) >> 2
2331    punpcklwd       m2, m3, m6
2332    pmaddwd         m2, m2
2333    punpckhwd       m3, m6
2334    pmaddwd         m3, m3
2335    punpcklwd       m0, m1, m6         ; b
2336    punpckhwd       m1, m6
2337    MAXSD           m4, m2, m15
2338    MAXSD           m5, m3, m15
2339    psubd           m4, m2             ; p
2340    psubd           m5, m3
2341    MULLD           m4, m9, m15        ; p * s
2342    MULLD           m5, m9, m15
2343    pmaddwd         m0, m10            ; b * 455
2344    pmaddwd         m1, m10
2345    paddusw         m4, m10
2346    paddusw         m5, m10
2347    psrld           m3, m4, 20         ; min(z, 255)
2348    psrld           m4, m5, 20
2349    GATHER_X_BY_X   m2, m3, m4, r0, dstm
2350    punpcklwd       m3, m2, m2
2351    punpckhwd       m4, m2, m2
2352    MULLD           m0, m3, m15
2353    MULLD           m1, m4, m15
2354    psubw           m5, m12, m2
2355%if ARCH_X86_32
2356    pxor            m6, m6
2357%endif
2358    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2359    paddd           m1, m11
2360    mova [t4+wq*1+400*0+ 4], m5
2361    psrld           m0, 12
2362    psrld           m1, 12
2363    mova [t3+wq*2+400*0+ 8], m0
2364    mova [t3+wq*2+400*0+24], m1
2365    add             wq, 16
2366    jl .v0_loop
2367    ret
2368.v1: ; vertical boxsums + ab (odd rows)
2369%if ARCH_X86_64
2370    lea             wq, [r5-4]
2371%else
2372    mov             wd, w0m
2373%endif
2374.v1_loop:
2375    mova            m0, [t1+wq+400*0]
2376    mova            m4, [t1+wq+400*2]
2377    mova            m5, [t1+wq+400*4]
2378    paddw           m1, m0, [t2+wq+400*0]
2379    paddd           m2, m4, [t2+wq+400*2]
2380    paddd           m3, m5, [t2+wq+400*4]
2381    mova [t2+wq+400*0], m0
2382    mova [t2+wq+400*2], m4
2383    mova [t2+wq+400*4], m5
2384    paddd           m2, m8
2385    paddd           m3, m8
2386    psrld           m2, 4              ; (a + 8) >> 4
2387    psrld           m3, 4
2388    pslld           m4, m2, 3
2389    pslld           m5, m3, 3
2390    paddd           m4, m2             ; ((a + 8) >> 4) * 9
2391    paddd           m5, m3
2392    psrlw           m3, m1, 1
2393    pavgw           m3, m6             ; (b + 2) >> 2
2394    punpcklwd       m2, m3, m6
2395    pmaddwd         m2, m2
2396    punpckhwd       m3, m6
2397    pmaddwd         m3, m3
2398    punpcklwd       m0, m1, m6         ; b
2399    punpckhwd       m1, m6
2400    MAXSD           m4, m2, m15
2401    MAXSD           m5, m3, m15
2402    psubd           m4, m2             ; p
2403    psubd           m5, m3
2404    MULLD           m4, m9, m15        ; p * s
2405    MULLD           m5, m9, m15
2406    pmaddwd         m0, m10            ; b * 455
2407    pmaddwd         m1, m10
2408    paddusw         m4, m10
2409    paddusw         m5, m10
2410    psrld           m3, m4, 20         ; min(z, 255)
2411    psrld           m4, m5, 20
2412    GATHER_X_BY_X   m2, m3, m4, r0, dstm
2413    punpcklwd       m3, m2, m2
2414    punpckhwd       m4, m2, m2
2415    MULLD           m0, m3, m15
2416    MULLD           m1, m4, m15
2417    psubw           m5, m12, m2
2418%if ARCH_X86_32
2419    pxor            m6, m6
2420%endif
2421    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
2422    paddd           m1, m11
2423    mova [t4+wq*1+400*2+ 4], m5
2424    psrld           m0, 12
2425    psrld           m1, 12
2426    mova [t3+wq*2+400*4+ 8], m0
2427    mova [t3+wq*2+400*4+24], m1
2428    add             wq, 16
2429    jl .v1_loop
2430    mov            r10, t2
2431    mov             t2, t1
2432    mov             t1, r10
2433    ret
2434.prep_n: ; initial neighbor setup
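; Neighbour weighting for the 3x3 filter: each row of a/b gets a horizontal
; 4,4,4 ("444") and 3,4,3 ("343") pass, and the output later sums
; 343(above) + 444(current) + 343(below). For one row (sketch):
;   t444[x] = 4 * (a[x] + a[x+1] + a[x+2])
;   t343[x] = t444[x] - (a[x] + a[x+2]) = 3*a[x] + 4*a[x+1] + 3*a[x+2]
; .prep_n seeds the ring buffers with the 343/444 values of the first rows.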
2435    movif64         wq, r5
2436    movif32         wd, w1m
2437.prep_n_loop:
2438    movu            m0, [t4+wq*1+400*0+ 4]
2439    movu            m1, [t3+wq*2+400*0+ 8]
2440    movu            m2, [t3+wq*2+400*0+24]
2441    movu            m3, [t4+wq*1+400*0+ 2]
2442    movu            m4, [t3+wq*2+400*0+ 4]
2443    movu            m5, [t3+wq*2+400*0+20]
2444    paddw           m0, [t4+wq*1+400*0+ 0]
2445    paddd           m1, [t3+wq*2+400*0+ 0]
2446    paddd           m2, [t3+wq*2+400*0+16]
2447    paddw           m3, m0
2448    paddd           m4, m1
2449    paddd           m5, m2
2450    psllw           m3, 2                ; a[-1] 444
2451    pslld           m4, 2                ; b[-1] 444
2452    pslld           m5, 2
2453    psubw           m3, m0               ; a[-1] 343
2454    psubd           m4, m1               ; b[-1] 343
2455    psubd           m5, m2
2456    mova [t4+wq*1+400*4], m3
2457    mova [t3+wq*2+400*8+ 0], m4
2458    mova [t3+wq*2+400*8+16], m5
2459    movu            m0, [t4+wq*1+400*2+ 4]
2460    movu            m1, [t3+wq*2+400*4+ 8]
2461    movu            m2, [t3+wq*2+400*4+24]
2462    movu            m3, [t4+wq*1+400*2+ 2]
2463    movu            m4, [t3+wq*2+400*4+ 4]
2464    movu            m5, [t3+wq*2+400*4+20]
2465    paddw           m0, [t4+wq*1+400*2+ 0]
2466    paddd           m1, [t3+wq*2+400*4+ 0]
2467    paddd           m2, [t3+wq*2+400*4+16]
2468    paddw           m3, m0
2469    paddd           m4, m1
2470    paddd           m5, m2
2471    psllw           m3, 2                 ; a[ 0] 444
2472    pslld           m4, 2                 ; b[ 0] 444
2473    pslld           m5, 2
2474    mova [t4+wq*1+400* 6], m3
2475    mova [t3+wq*2+400*12+ 0], m4
2476    mova [t3+wq*2+400*12+16], m5
2477    psubw           m3, m0                ; a[ 0] 343
2478    psubd           m4, m1                ; b[ 0] 343
2479    psubd           m5, m2
2480    mova [t4+wq*1+400* 8], m3
2481    mova [t3+wq*2+400*16+ 0], m4
2482    mova [t3+wq*2+400*16+16], m5
2483    add             wq, 16
2484    jl .prep_n_loop
2485    ret
2486ALIGN function_align
2487.n0: ; neighbor + output (even rows)
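; .n0/.n1 finish the 343+444+343 combination and write output pixels.
; Per-column sketch (A/B are the combined a/b values; w1 is the weight held
; pre-shifted in m7):
;   t   = (A * src + B + (1 << 8)) >> 9
;   dst = clamp(src + (((t - (src << 4)) * w1 + 1024) >> 11), 0, 1023)
; the second line being what the pmulhrsw with w1 << 4 evaluates.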
2488    movif64         wq, r5
2489    movif32         wd, w1m
2490.n0_loop:
2491    movu            m3, [t4+wq*1+400*0+4]
2492    movu            m1, [t4+wq*1+400*0+2]
2493    paddw           m3, [t4+wq*1+400*0+0]
2494    paddw           m1, m3
2495    psllw           m1, 2                ; a[ 1] 444
2496    psubw           m2, m1, m3           ; a[ 1] 343
2497    paddw           m3, m2, [t4+wq*1+400*4]
2498    paddw           m3, [t4+wq*1+400*6]
2499    mova [t4+wq*1+400*4], m2
2500    mova [t4+wq*1+400*6], m1
2501    movu            m4, [t3+wq*2+400*0+8]
2502    movu            m1, [t3+wq*2+400*0+4]
2503    paddd           m4, [t3+wq*2+400*0+0]
2504    paddd           m1, m4
2505    pslld           m1, 2                ; b[ 1] 444
2506    psubd           m2, m1, m4           ; b[ 1] 343
2507    paddd           m4, m2, [t3+wq*2+400* 8+ 0]
2508    paddd           m4, [t3+wq*2+400*12+ 0]
2509    mova [t3+wq*2+400* 8+ 0], m2
2510    mova [t3+wq*2+400*12+ 0], m1
2511    movu            m5, [t3+wq*2+400*0+24]
2512    movu            m1, [t3+wq*2+400*0+20]
2513    paddd           m5, [t3+wq*2+400*0+16]
2514    paddd           m1, m5
2515    pslld           m1, 2
2516    psubd           m2, m1, m5
2517    paddd           m5, m2, [t3+wq*2+400* 8+16]
2518    paddd           m5, [t3+wq*2+400*12+16]
2519    mova [t3+wq*2+400* 8+16], m2
2520    mova [t3+wq*2+400*12+16], m1
2521    mova            m0, [dstq+wq]
2522    punpcklwd       m1, m0, m6
2523    punpcklwd       m2, m3, m6
2524    pmaddwd         m2, m1               ; a * src
2525    punpckhwd       m1, m0, m6
2526    punpckhwd       m3, m6
2527    pmaddwd         m3, m1
2528    paddd           m2, m4               ; a * src + b + (1 << 8)
2529    paddd           m3, m5
2530    psrld           m2, 9
2531    psrld           m3, 9
2532    packssdw        m2, m3
2533    psllw           m1, m0, 4
2534    psubw           m2, m1
2535    pmulhrsw        m2, m7
2536    paddw           m0, m2
2537    pmaxsw          m0, m6
2538    pminsw          m0, m13
2539    mova     [dstq+wq], m0
2540    add             wq, 16
2541    jl .n0_loop
2542    add           dstq, dst_stridemp
2543    ret
2544ALIGN function_align
2545.n1: ; neighbor + output (odd rows)
2546    movif64         wq, r5
2547    movif32         wd, w1m
2548.n1_loop:
2549    movu            m3, [t4+wq*1+400*2+4]
2550    movu            m1, [t4+wq*1+400*2+2]
2551    paddw           m3, [t4+wq*1+400*2+0]
2552    paddw           m1, m3
2553    psllw           m1, 2                ; a[ 1] 444
2554    psubw           m2, m1, m3           ; a[ 1] 343
2555    paddw           m3, m2, [t4+wq*1+400*6]
2556    paddw           m3, [t4+wq*1+400*8]
2557    mova [t4+wq*1+400*6], m1
2558    mova [t4+wq*1+400*8], m2
2559    movu            m4, [t3+wq*2+400*4+8]
2560    movu            m1, [t3+wq*2+400*4+4]
2561    paddd           m4, [t3+wq*2+400*4+0]
2562    paddd           m1, m4
2563    pslld           m1, 2                ; b[ 1] 444
2564    psubd           m2, m1, m4           ; b[ 1] 343
2565    paddd           m4, m2, [t3+wq*2+400*12+ 0]
2566    paddd           m4, [t3+wq*2+400*16+ 0]
2567    mova [t3+wq*2+400*12+ 0], m1
2568    mova [t3+wq*2+400*16+ 0], m2
2569    movu            m5, [t3+wq*2+400*4+24]
2570    movu            m1, [t3+wq*2+400*4+20]
2571    paddd           m5, [t3+wq*2+400*4+16]
2572    paddd           m1, m5
2573    pslld           m1, 2
2574    psubd           m2, m1, m5
2575    paddd           m5, m2, [t3+wq*2+400*12+16]
2576    paddd           m5, [t3+wq*2+400*16+16]
2577    mova [t3+wq*2+400*12+16], m1
2578    mova [t3+wq*2+400*16+16], m2
2579    mova            m0, [dstq+wq]
2580    punpcklwd       m1, m0, m6
2581    punpcklwd       m2, m3, m6
2582    pmaddwd         m2, m1               ; a * src
2583    punpckhwd       m1, m0, m6
2584    punpckhwd       m3, m6
2585    pmaddwd         m3, m1
2586    paddd           m2, m4               ; a * src + b + (1 << 8)
2587    paddd           m3, m5
2588    psrld           m2, 9
2589    psrld           m3, 9
2590    packssdw        m2, m3
2591    psllw           m1, m0, 4
2592    psubw           m2, m1
2593    pmulhrsw        m2, m7
2594    paddw           m0, m2
2595    pmaxsw          m0, m6
2596    pminsw          m0, m13
2597    mova     [dstq+wq], m0
2598    add             wq, 16
2599    jl .n1_loop
2600    add           dstq, dst_stridemp
2601    movif32       dstm, dstq
2602    ret
2603
2604%if ARCH_X86_32
2605 %if STACK_ALIGNMENT < 16
2606  %assign extra_stack 10*16
2607 %else
2608  %assign extra_stack 8*16
2609 %endif
2610cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
2611                              dst, dst_stride, left, lpf, lpf_stride, w, params, h
2612 %if STACK_ALIGNMENT < 16
2613  %define dstm         dword [esp+calloff+16*8+4*0]
2614  %define dst_stridemp dword [esp+calloff+16*8+4*1]
2615  %define leftm        dword [esp+calloff+16*8+4*2]
2616  %define lpfm         dword [esp+calloff+16*8+4*3]
2617  %define lpf_stridem  dword [esp+calloff+16*8+4*4]
2618  %define w0m          dword [esp+calloff+16*8+4*5]
2619  %define hd           dword [esp+calloff+16*8+4*6]
2620  %define edgeb         byte [esp+calloff+16*8+4*7]
2621  %define edged        dword [esp+calloff+16*8+4*7]
2622  %define leftmp leftm
2623 %else
2624  %define w0m wm
2625  %define hd dword r6m
2626  %define edgeb  byte r8m
2627  %define edged dword r8m
2628 %endif
2629 %define hvsrcm dword [esp+calloff+4*0]
2630 %define w1m    dword [esp+calloff+4*1]
2631 %define t3m    dword [esp+calloff+4*2]
2632 %define t4m    dword [esp+calloff+4*3]
2633 %xdefine m8 m6
2634 %define  m9 [base+pd_8]
2635 %define m10 [base+pd_34816]
2636 %define m11 [base+pd_0xf00801c7]
2637 %define m12 [base+pw_256]
2638 %define m13 [esp+calloff+16*4]
2639 %define m14 [esp+calloff+16*5]
2640 %define m15 [esp+calloff+16*6]
2641 %define  m6 [esp+calloff+16*7]
2642 %define base r6-$$
2643 %assign calloff 0
2644 %if STACK_ALIGNMENT < 16
2645    mov    dst_strideq, [rstk+stack_offset+ 8]
2646    mov          leftq, [rstk+stack_offset+12]
2647    mov           lpfq, [rstk+stack_offset+16]
2648    mov    lpf_strideq, [rstk+stack_offset+20]
2649    mov             wd, [rstk+stack_offset+24]
2650    mov           dstm, dstq
2651    mov   dst_stridemp, dst_strideq
2652    mov          leftm, leftq
2653    mov             r1, [rstk+stack_offset+28]
2654    mov             r2, [rstk+stack_offset+36]
2655    mov           lpfm, lpfq
2656    mov    lpf_stridem, lpf_strideq
2657    mov             hd, r1
2658    mov          edged, r2
2659 %endif
2660%else
2661cglobal sgr_filter_mix_16bpc, 5, 15, 16, -400*66-40, dst, dst_stride, left, \
2662                                                     lpf, lpf_stride, w, edge, \
2663                                                     params, h
2664%endif
2665%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
2666    movifnidn       wd, wm
2667%endif
2668%if ARCH_X86_64
2669    mov        paramsq, paramsmp
2670    lea            r13, [sgr_x_by_x-0xf03]
2671    mov          edged, r8m
2672    add             wd, wd
2673    mov             hd, r6m
2674    mova           m15, [paramsq]
2675    add           lpfq, wq
2676    mova            m9, [pd_8]
2677    lea             t1, [rsp+wq+44]
2678    mova           m10, [pd_34816]
2679    add           dstq, wq
2680    mova           m12, [pw_256]
2681    lea             t3, [rsp+wq*2+400*24+40]
2682    mova           m11, [pd_0xf00801c7]
2683    lea             t4, [rsp+wq+400*52+40]
2684    neg             wq
2685    pshuflw        m13, m15, q0000
2686    pshuflw        m14, m15, q2222
2687    pshufhw        m15, m15, q1010
2688    punpcklqdq     m13, m13 ; s0
2689    punpcklqdq     m14, m14 ; s1
2690    punpckhqdq     m15, m15 ; w0 w1
2691    pxor            m6, m6
2692    psllw          m15, 2
2693 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
2694 %define lpfm [rsp]
2695%else
2696    mov             r1, [rstk+stack_offset+32] ; params
2697    LEA             r6, $$
2698    add             wd, wd
2699    mova            m2, [r1]
2700    add           lpfm, wq
2701    lea             t1, [rsp+extra_stack+wq+52]
2702    add           dstq, wq
2703    lea             t3, [rsp+extra_stack+wq*2+400*24+48]
2704    mov           dstm, dstq
2705    lea             t4, [rsp+extra_stack+wq+400*52+48]
2706    mov            t3m, t3
2707    mov            t4m, t4
2708    neg             wq
2709    pshuflw         m0, m2, q0000
2710    pshuflw         m1, m2, q2222
2711    pshufhw         m2, m2, q1010
2712    punpcklqdq      m0, m0 ; s0
2713    punpcklqdq      m1, m1 ; s1
2714    punpckhqdq      m2, m2 ; w0 w1
2715    mov            w1m, wd
2716    pxor            m3, m3
2717    psllw           m2, 2
2718    mova           m13, m0
2719    mova           m14, m1
2720    sub             wd, 4
2721    mova           m15, m2
2722    mova            m6, m3
2723    mov           lpfq, lpfm
2724    mov    lpf_strideq, lpf_stridem
2725    mov            w0m, wd
2726%endif
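; Constant layout for the mix filter (x86-32 keeps several of these in the
; memory slots %defined above):
;   m6  = 0                    m9  = pd_8
;   m10 = pd_34816             m11 = pd_0xf00801c7
;   m12 = pw_256               m13 = s0 (broadcast)
;   m14 = s1 (broadcast)       m15 = w0/w1 pair, << 2
; t1/t2 hold the 3-tap and 5-tap row sums, t3/t4 the b/a planes of both
; passes.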
2727    test         edgeb, 4 ; LR_HAVE_TOP
2728    jz .no_top
2729    call .h_top
2730    add           lpfq, lpf_strideq
2731    mov             t2, t1
2732%if ARCH_X86_64
2733    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
2734%else
2735    mov             wq, w0m
2736    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
2737%endif
2738    add             t1, 400*12
2739    call .h_top
2740    lea            r10, [lpfq+lpf_strideq*4]
2741    mov           lpfq, dstq
2742    add            r10, lpf_strideq
2743    mov           lpfm, r10 ; below
2744    movif32         t4, t4m
2745    call .hv0
2746.main:
2747    dec             hd
2748    jz .height1
2749    movif32       lpfq, hvsrcm
2750    add           lpfq, dst_stridemp
2751    call .hv1
2752    call .prep_n
2753    sub             hd, 2
2754    jl .extend_bottom
2755.main_loop:
2756    movif32       lpfq, hvsrcm
2757    add           lpfq, dst_stridemp
2758    call .hv0
2759%if ARCH_X86_64
2760    test            hd, hd
2761%else
2762    mov             r5, hd
2763    test            r5, r5
2764%endif
2765    jz .odd_height
2766    movif32       lpfq, hvsrcm
2767    add           lpfq, dst_stridemp
2768    call .hv1
2769    call .n0
2770    call .n1
2771    sub             hd, 2
2772    jge .main_loop
2773    test         edgeb, 8 ; LR_HAVE_BOTTOM
2774    jz .extend_bottom
2775    mov           lpfq, lpfm
2776    call .hv0_bottom
2777%if ARCH_X86_64
2778    add           lpfq, lpf_strideq
2779%else
2780    mov           lpfq, hvsrcm
2781    add           lpfq, lpf_stridem
2782%endif
2783    call .hv1_bottom
2784.end:
2785    call .n0
2786    call .n1
2787.end2:
2788    RET
2789.height1:
2790    call .v1
2791    call .prep_n
2792    jmp .odd_height_end
2793.odd_height:
2794    call .v1
2795    call .n0
2796    call .n1
2797.odd_height_end:
2798    call .v0
2799    call .v1
2800    call .n0
2801    jmp .end2
2802.extend_bottom:
2803    call .v0
2804    call .v1
2805    jmp .end
2806.no_top:
2807    lea            r10, [lpfq+lpf_strideq*4]
2808    mov           lpfq, dstq
2809    lea            r10, [r10+lpf_strideq*2]
2810    mov           lpfm, r10
2811    call .h
2812%if ARCH_X86_64
2813    lea             wq, [r5-4]
2814%else
2815    mov             wq, w0m
2816    mov         hvsrcm, lpfq
2817%endif
2818    lea             t2, [t1+400*12]
2819.top_fixup_loop:
2820    mova            m0, [t1+wq+400* 0]
2821    mova            m1, [t1+wq+400* 2]
2822    mova            m2, [t1+wq+400* 4]
2823    paddw           m0, m0
2824    mova            m3, [t1+wq+400* 6]
2825    paddd           m1, m1
2826    mova            m4, [t1+wq+400* 8]
2827    paddd           m2, m2
2828    mova            m5, [t1+wq+400*10]
2829    mova [t2+wq+400* 0], m0
2830    mova [t2+wq+400* 2], m1
2831    mova [t2+wq+400* 4], m2
2832    mova [t2+wq+400* 6], m3
2833    mova [t2+wq+400* 8], m4
2834    mova [t2+wq+400*10], m5
2835    add             wq, 16
2836    jl .top_fixup_loop
2837    movif32         t3, t3m
2838    movif32         t4, t4m
2839    call .v0
2840    jmp .main
.h: ; horizontal boxsums (3-tap and 5-tap)
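; One pass produces both box sizes per 16-pixel chunk (sketch):
;   sum3[x] = s[x] + s[x+1] + s[x+2]
;   sum5[x] = s[x-1] + sum3[x] + s[x+3]
; plus the matching sums of squares; sum3/sumsq3 are stored at
; t1+400*{6,8,10}, sum5/sumsq5 at t1+400*{0,2,4}.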
2842%assign stack_offset stack_offset+4
2843%assign calloff 4
2844%if ARCH_X86_64
2845    lea             wq, [r5-4]
2846%else
2847 %define leftq r5
2848%endif
2849    test         edgeb, 1 ; LR_HAVE_LEFT
2850    jz .h_extend_left
2851    movif32      leftq, leftm
2852    movddup         m5, [leftq]
2853    movif32         wq, w0m
2854    mova            m4, [lpfq+wq+4]
2855    add         leftmp, 8
2856    palignr         m4, m5, 10
2857    jmp .h_main
2858.h_extend_left:
2859    movif32         wq, w0m
2860    mova            m4, [lpfq+wq+4]
2861    pshufb          m4, [base+sgr_lshuf5]
2862    jmp .h_main
2863.h_top:
2864%if ARCH_X86_64
2865    lea             wq, [r5-4]
2866%endif
2867    test         edgeb, 1 ; LR_HAVE_LEFT
2868    jz .h_extend_left
2869    movif32         wq, w0m
2870.h_loop:
2871    movu            m4, [lpfq+wq- 2]
2872.h_main:
2873    movu            m5, [lpfq+wq+14]
2874    test         edgeb, 2 ; LR_HAVE_RIGHT
2875    jnz .h_have_right
2876    cmp             wd, -20
2877    jl .h_have_right
2878%if ARCH_X86_32
2879    pxor            m8, m8
2880%endif
2881    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
2882.h_have_right:
2883    palignr         m3, m5, m4, 2
2884    palignr         m0, m5, m4, 4
2885    paddw           m1, m3, m0
2886    punpcklwd       m2, m3, m0
2887    pmaddwd         m2, m2
2888    punpckhwd       m3, m0
2889    pmaddwd         m3, m3
2890    palignr         m0, m5, m4, 6
2891    paddw           m1, m0             ; sum3
2892    punpcklwd       m7, m0, m6
2893    pmaddwd         m7, m7
2894    punpckhwd       m0, m6
2895    pmaddwd         m0, m0
2896    paddd           m2, m7             ; sumsq3
2897    palignr         m5, m4, 8
2898    punpcklwd       m7, m5, m4
2899    paddw           m8, m4, m5
2900    pmaddwd         m7, m7
2901    punpckhwd       m5, m4
2902    pmaddwd         m5, m5
2903    paddd           m3, m0
2904    mova [t1+wq+400* 6], m1
2905    mova [t1+wq+400* 8], m2
2906    mova [t1+wq+400*10], m3
2907    paddw           m8, m1             ; sum5
2908    paddd           m7, m2             ; sumsq5
2909    paddd           m5, m3
2910    mova [t1+wq+400* 0], m8
2911    mova [t1+wq+400* 2], m7
2912    mova [t1+wq+400* 4], m5
2913    add             wq, 16
2914    jl .h_loop
2915    ret
2916ALIGN function_align
.hv0: ; horizontal boxsums + vertical boxsums + ab3 (even rows)
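; Even rows of the mix filter: only the 3x3 surrogate (with s1, *9, *455) is
; produced here, essentially the same math as in sgr_filter_3x3 above; the
; 5-tap row sums are merely stashed in t3 and accumulated into t1, and the
; 5x5 surrogate is produced once per row pair in .hv1/.v1.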
2918%if ARCH_X86_64
2919    lea             wq, [r5-4]
2920%else
2921    mov         hvsrcm, lpfq
2922%endif
2923    test         edgeb, 1 ; LR_HAVE_LEFT
2924    jz .hv0_extend_left
2925    movif32      leftq, leftm
2926    movddup         m5, [leftq]
2927    movif32         wq, w0m
2928    mova            m4, [lpfq+wq+4]
2929    add         leftmp, 8
2930    palignr         m4, m5, 10
2931    jmp .hv0_main
2932.hv0_extend_left:
2933    movif32         wq, w0m
2934    mova            m4, [lpfq+wq+4]
2935    pshufb          m4, [base+sgr_lshuf5]
2936    jmp .hv0_main
2937.hv0_bottom:
2938%if ARCH_X86_64
2939    lea             wq, [r5-4]
2940%else
2941    mov         hvsrcm, lpfq
2942%endif
2943    test         edgeb, 1 ; LR_HAVE_LEFT
2944    jz .hv0_extend_left
2945    movif32         wq, w0m
2946%if ARCH_X86_32
2947    jmp .hv0_loop_start
2948%endif
2949.hv0_loop:
2950    movif32       lpfq, hvsrcm
2951.hv0_loop_start:
2952    movu            m4, [lpfq+wq- 2]
2953.hv0_main:
2954    movu            m5, [lpfq+wq+14]
2955    test         edgeb, 2 ; LR_HAVE_RIGHT
2956    jnz .hv0_have_right
2957    cmp             wd, -20
2958    jl .hv0_have_right
2959%if ARCH_X86_32
2960    pxor            m8, m8
2961%endif
2962    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
2963.hv0_have_right:
2964    palignr         m3, m5, m4, 2
2965    palignr         m0, m5, m4, 4
2966    movif32         t3, t3m
2967    paddw           m1, m3, m0
2968    punpcklwd       m2, m3, m0
2969    pmaddwd         m2, m2
2970    punpckhwd       m3, m0
2971    pmaddwd         m3, m3
2972    palignr         m0, m5, m4, 6
2973    paddw           m1, m0             ; h sum3
2974    punpcklwd       m7, m0, m6
2975    pmaddwd         m7, m7
2976    punpckhwd       m0, m6
2977    pmaddwd         m0, m0
2978    paddd           m2, m7             ; h sumsq3
2979    palignr         m5, m4, 8
2980    punpcklwd       m7, m5, m4
2981    paddw           m8, m4, m5
2982    pmaddwd         m7, m7
2983    punpckhwd       m5, m4
2984    pmaddwd         m5, m5
2985    paddd           m3, m0
2986    paddw           m8, m1             ; h sum5
2987    paddd           m7, m2             ; h sumsq5
2988    paddd           m5, m3
2989    mova [t3+wq*2+400*8+ 8], m8
2990    mova [t3+wq*2+400*0+ 8], m7
2991    mova [t3+wq*2+400*0+24], m5
2992    paddw           m8, [t1+wq+400* 0]
2993    paddd           m7, [t1+wq+400* 2]
2994    paddd           m5, [t1+wq+400* 4]
2995    mova [t1+wq+400* 0], m8
2996    mova [t1+wq+400* 2], m7
2997    mova [t1+wq+400* 4], m5
2998    paddw           m0, m1, [t1+wq+400* 6]
2999    paddd           m4, m2, [t1+wq+400* 8]
3000    paddd           m5, m3, [t1+wq+400*10]
3001    mova [t1+wq+400* 6], m1
3002    mova [t1+wq+400* 8], m2
3003    mova [t1+wq+400*10], m3
3004    paddw           m1, m0, [t2+wq+400* 6]
3005    paddd           m2, m4, [t2+wq+400* 8]
3006    paddd           m3, m5, [t2+wq+400*10]
3007    mova [t2+wq+400* 6], m0
3008    mova [t2+wq+400* 8], m4
3009    mova [t2+wq+400*10], m5
3010    paddd           m2, m9
3011    paddd           m3, m9
3012    psrld           m2, 4              ; (a3 + 8) >> 4
3013    psrld           m3, 4
3014%if ARCH_X86_32
3015    pxor            m7, m7
3016%else
3017    SWAP            m7, m6
3018%endif
3019    pslld           m4, m2, 3
3020    pslld           m5, m3, 3
3021    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
3022    paddd           m5, m3
3023    psrlw           m3, m1, 1
3024    pavgw           m3, m7             ; (b3 + 2) >> 2
3025    punpcklwd       m2, m3, m7
3026    pmaddwd         m2, m2
3027    punpckhwd       m3, m7
3028    pmaddwd         m3, m3
3029    punpcklwd       m0, m1, m7         ; b3
3030    punpckhwd       m1, m7
3031%if ARCH_X86_64
3032    SWAP            m7, m6
3033%endif
3034    MAXSD           m4, m2, m7
3035    MAXSD           m5, m3, m7
3036    psubd           m4, m2             ; p3
3037    psubd           m5, m3
3038    MULLD           m4, m14, m7        ; p3 * s1
3039    MULLD           m5, m14, m7
3040    pmaddwd         m0, m11            ; b3 * 455
3041    pmaddwd         m1, m11
3042    paddusw         m4, m11
3043    paddusw         m5, m11
3044    psrld           m3, m4, 20         ; min(z3, 255)
3045    psrld           m4, m5, 20
3046    GATHER_X_BY_X   m2, m3, m4, r0, dstm
3047    punpcklwd       m3, m2, m2
3048    punpckhwd       m4, m2, m2
3049    MULLD           m0, m3, m7
3050    MULLD           m1, m4, m7
3051    psubw           m5, m12, m2
3052    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3053    paddd           m1, m10
3054    mova [t4+wq*1+400*2+ 4], m5
3055    psrld           m0, 12
3056    psrld           m1, 12
3057    mova [t3+wq*2+400*4+ 8], m0
3058    mova [t3+wq*2+400*4+24], m1
3059    add             wq, 16
3060    jl .hv0_loop
3061    ret
3062ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab3 + ab5 (odd rows)
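; Odd rows produce both surrogates. The 3x3 part repeats the math above; the
; 5x5 part differs only in its constants (sketch, per the inline comments):
;   a5 = ((sumsq5 + 8) >> 4) * 25
;   b5 = (sum5 + 2) >> 2
;   p5 = max(a5 - b5*b5, 0)
;   z5 = min((p5 * s0 + rounding) >> 20, 255)
;   x5 = sgr_x_by_x[z5]
;   t4 <- 256 - x5
;   t3 <- (x5 * sum5 * 164 + (1 << 11) + (1 << 15)) >> 12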
3064%if ARCH_X86_64
3065    lea             wq, [r5-4]
3066%else
3067    mov         hvsrcm, lpfq
3068%endif
3069    test         edgeb, 1 ; LR_HAVE_LEFT
3070    jz .hv1_extend_left
3071    movif32      leftq, leftm
3072    movddup         m5, [leftq]
3073    movif32         wq, w0m
3074    mova            m4, [lpfq+wq+4]
3075    add         leftmp, 8
3076    palignr         m4, m5, 10
3077    jmp .hv1_main
3078.hv1_extend_left:
3079    movif32         wq, w0m
3080    mova            m4, [lpfq+wq+4]
3081    pshufb          m4, [base+sgr_lshuf5]
3082    jmp .hv1_main
3083.hv1_bottom:
3084%if ARCH_X86_64
3085    lea             wq, [r5-4]
3086%else
3087    mov         hvsrcm, lpfq
3088%endif
3089    test         edgeb, 1 ; LR_HAVE_LEFT
3090    jz .hv1_extend_left
3091    movif32         wq, w0m
3092%if ARCH_X86_32
3093    jmp .hv1_loop_start
3094%endif
3095.hv1_loop:
3096    movif32       lpfq, hvsrcm
3097.hv1_loop_start:
3098    movu            m4, [lpfq+wq- 2]
3099.hv1_main:
3100    movu            m5, [lpfq+wq+14]
3101    test         edgeb, 2 ; LR_HAVE_RIGHT
3102    jnz .hv1_have_right
3103    cmp             wd, -20
3104    jl .hv1_have_right
3105%if ARCH_X86_32
3106    pxor            m8, m8
3107%endif
3108    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
3109.hv1_have_right:
3110    palignr         m7, m5, m4, 2
3111    palignr         m3, m5, m4, 4
3112    paddw           m2, m7, m3
3113    punpcklwd       m0, m7, m3
3114    pmaddwd         m0, m0
3115    punpckhwd       m7, m3
3116    pmaddwd         m7, m7
3117    palignr         m3, m5, m4, 6
3118    paddw           m2, m3             ; h sum3
3119    punpcklwd       m1, m3, m6
3120    pmaddwd         m1, m1
3121    punpckhwd       m3, m6
3122    pmaddwd         m3, m3
3123    paddd           m0, m1             ; h sumsq3
3124    palignr         m5, m4, 8
3125    punpckhwd       m1, m4, m5
3126    paddw           m8, m4, m5
3127    pmaddwd         m1, m1
3128    punpcklwd       m4, m5
3129    pmaddwd         m4, m4
3130    paddd           m7, m3
3131    paddw           m5, m2, [t2+wq+400* 6]
3132    mova [t2+wq+400* 6], m2
3133    paddw           m8, m2             ; h sum5
3134    paddd           m2, m0, [t2+wq+400* 8]
3135    paddd           m3, m7, [t2+wq+400*10]
3136    mova [t2+wq+400* 8], m0
3137    mova [t2+wq+400*10], m7
3138    paddd           m4, m0             ; h sumsq5
3139    paddd           m1, m7
3140    paddd           m2, m9
3141    paddd           m3, m9
3142    psrld           m2, 4              ; (a3 + 8) >> 4
3143    psrld           m3, 4
3144    pslld           m0, m2, 3
3145    pslld           m7, m3, 3
3146    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
3147    paddd           m3, m7
3148    psrlw           m7, m5, 1
3149    pavgw           m7, m6             ; (b3 + 2) >> 2
3150    punpcklwd       m0, m7, m6
3151    pmaddwd         m0, m0
3152    punpckhwd       m7, m6
3153    pmaddwd         m7, m7
3154%if ARCH_X86_32
3155    mova      [esp+20], m8
3156%else
3157    SWAP            m8, m6
3158%endif
3159    MAXSD           m2, m0, m8
3160    MAXSD           m3, m7, m8
3161    pxor            m8, m8
3162    psubd           m2, m0             ; p3
3163    psubd           m3, m7
3164    punpcklwd       m0, m5, m8         ; b3
3165    punpckhwd       m5, m8
3166    MULLD           m2, m14, m8        ; p3 * s1
3167    MULLD           m3, m14, m8
3168    pmaddwd         m0, m11            ; b3 * 455
3169    pmaddwd         m5, m11
3170    paddusw         m2, m11
3171    paddusw         m3, m11
3172    psrld           m8, m2, 20         ; min(z3, 255)
3173    movif32         t3, t3m
3174    psrld           m2, m3, 20
3175    GATHER_X_BY_X   m7, m8, m2, r0, dstm
3176    punpcklwd       m2, m7, m7
3177    punpckhwd       m8, m7, m7
3178    MULLD           m0, m2, m3
3179    MULLD           m5, m8, m3
3180    psubw           m3, m12, m7
3181%if ARCH_X86_32
3182    mova            m8, [esp+20]
3183%endif
3184    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3185    paddd           m5, m10
3186    psrld           m0, 12
3187    psrld           m5, 12
3188    mova [t4+wq*1+400*4+4], m3
3189    mova [t3+wq*2+400*8+ 8], m0
3190    mova [t3+wq*2+400*8+24], m5
3191%if ARCH_X86_64
3192    SWAP            m6, m8
3193    pxor            m6, m6
3194%endif
3195    paddw           m5, m8, [t2+wq+400*0]
3196    paddd           m2, m4, [t2+wq+400*2]
3197    paddd           m3, m1, [t2+wq+400*4]
3198    paddw           m5, [t1+wq+400*0]
3199    paddd           m2, [t1+wq+400*2]
3200    paddd           m3, [t1+wq+400*4]
3201    mova [t2+wq+400*0], m8
3202    mova [t2+wq+400*2], m4
3203    mova [t2+wq+400*4], m1
3204    mova            m4, [base+pw_25]
3205    paddd           m2, m9
3206    paddd           m3, m9
3207    psrld           m2, 4              ; (a5 + 8) >> 4
3208    psrld           m3, 4
3209    MULLD           m2, m4, m7         ; ((a5 + 8) >> 4) * 25
3210    MULLD           m3, m4, m7
3211%if ARCH_X86_32
3212    pxor            m7, m7
3213%else
3214    SWAP            m7, m6
3215%endif
3216    psrlw           m1, m5, 1
3217    pavgw           m1, m7             ; (b5 + 2) >> 2
3218    punpcklwd       m4, m1, m7
3219    pmaddwd         m4, m4
3220    punpckhwd       m1, m7
3221    pmaddwd         m1, m1
3222    punpcklwd       m0, m5, m7         ; b5
3223    punpckhwd       m5, m7
3224%if ARCH_X86_64
3225    SWAP            m7, m6
3226%endif
3227    MAXSD           m2, m4, m7
3228    psubd           m2, m4             ; p5
3229    mova            m4, [base+pd_0xf00800a4]
3230    MAXSD           m3, m1, m7
3231    psubd           m3, m1
3232    MULLD           m2, m13, m7        ; p5 * s0
3233    MULLD           m3, m13, m7
3234    pmaddwd         m0, m4             ; b5 * 164
3235    pmaddwd         m5, m4
3236    paddusw         m2, m4
3237    paddusw         m3, m4
3238    psrld           m1, m2, 20         ; min(z5, 255)
3239    psrld           m2, m3, 20
3240    GATHER_X_BY_X   m4, m1, m2, r0, dstm
3241    punpcklwd       m2, m4, m4
3242    punpckhwd       m3, m4, m4
3243    MULLD           m0, m2, m7
3244    MULLD           m5, m3, m7
3245    psubw           m1, m12, m4
3246    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
3247    paddd           m5, m10
3248    mova [t4+wq*1+400*0+ 4], m1
3249    psrld           m0, 12
3250    psrld           m5, 12
3251    mova [t3+wq*2+400*0+ 8], m0
3252    mova [t3+wq*2+400*0+24], m5
3253    add             wq, 16
3254    jl .hv1_loop
3255    mov            r10, t2
3256    mov             t2, t1
3257    mov             t1, r10
3258    ret
3259.v0: ; vertical boxsums + ab3 (even rows)
3260%if ARCH_X86_64
3261    lea             wq, [r5-4]
3262%else
3263    mov             wd, w0m
3264%endif
3265.v0_loop:
3266    mova            m0, [t1+wq+400* 6]
3267    mova            m4, [t1+wq+400* 8]
3268    mova            m5, [t1+wq+400*10]
3269    paddw           m0, m0
3270    paddd           m4, m4
3271    paddd           m5, m5
3272    paddw           m1, m0, [t2+wq+400* 6]
3273    paddd           m2, m4, [t2+wq+400* 8]
3274    paddd           m3, m5, [t2+wq+400*10]
3275    mova [t2+wq+400* 6], m0
3276    mova [t2+wq+400* 8], m4
3277    mova [t2+wq+400*10], m5
3278    paddd           m2, m9
3279    paddd           m3, m9
3280    psrld           m2, 4              ; (a3 + 8) >> 4
3281    psrld           m3, 4
3282%if ARCH_X86_32
3283    pxor            m7, m7
3284%else
3285    SWAP            m7, m6
3286%endif
3287    pslld           m4, m2, 3
3288    pslld           m5, m3, 3
3289    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
3290    paddd           m5, m3
3291    psrlw           m3, m1, 1
3292    pavgw           m3, m7             ; (b3 + 2) >> 2
3293    punpcklwd       m2, m3, m7
3294    pmaddwd         m2, m2
3295    punpckhwd       m3, m7
3296    pmaddwd         m3, m3
3297    punpcklwd       m0, m1, m7         ; b3
3298    punpckhwd       m1, m7
3299%if ARCH_X86_64
3300    SWAP            m7, m6
3301%endif
3302    MAXSD           m4, m2, m7
3303    MAXSD           m5, m3, m7
3304    psubd           m4, m2             ; p3
3305    psubd           m5, m3
3306    MULLD           m4, m14, m7        ; p3 * s1
3307    MULLD           m5, m14, m7
3308    pmaddwd         m0, m11            ; b3 * 455
3309    pmaddwd         m1, m11
3310    paddusw         m4, m11
3311    paddusw         m5, m11
3312    psrld           m3, m4, 20         ; min(z3, 255)
3313    psrld           m4, m5, 20
3314    GATHER_X_BY_X   m2, m3, m4, r0, dstm
3315    punpcklwd       m3, m2, m2
3316    punpckhwd       m4, m2, m2
3317    MULLD           m0, m3, m7
3318    MULLD           m1, m4, m7
3319    psubw           m5, m12, m2
3320    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
3321    paddd           m1, m10
3322    mova [t4+wq*1+400*2+4], m5
3323    psrld           m0, 12
3324    psrld           m1, 12
3325    mova            m3, [t1+wq+400*0]
3326    mova            m4, [t1+wq+400*2]
3327    mova            m5, [t1+wq+400*4]
3328    mova [t3+wq*2+400*8+ 8], m3
3329    mova [t3+wq*2+400*0+ 8], m4
3330    mova [t3+wq*2+400*0+24], m5
3331    paddw           m3, m3 ; cc5
3332    paddd           m4, m4
3333    paddd           m5, m5
3334    mova [t1+wq+400*0], m3
3335    mova [t1+wq+400*2], m4
3336    mova [t1+wq+400*4], m5
3337    mova [t3+wq*2+400*4+ 8], m0
3338    mova [t3+wq*2+400*4+24], m1
3339    add             wq, 16
3340    jl .v0_loop
3341    ret
.v1: ; vertical boxsums + ab3 + ab5 (odd rows)
3343%if ARCH_X86_64
3344    lea             wq, [r5-4]
3345%else
3346    mov             wd, w0m
3347%endif
3348.v1_loop:
3349    mova            m4, [t1+wq+400* 6]
3350    mova            m5, [t1+wq+400* 8]
3351    mova            m7, [t1+wq+400*10]
    paddw           m1, m4, [t2+wq+400* 6]
    paddd           m2, m5, [t2+wq+400* 8]
    paddd           m3, m7, [t2+wq+400*10]
    mova [t2+wq+400* 6], m4
    mova [t2+wq+400* 8], m5
    mova [t2+wq+400*10], m7
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MAXSD           m4, m2, m7
    MAXSD           m5, m3, m7
    psubd           m4, m2             ; p3
    psubd           m5, m3
    MULLD           m4, m14, m7        ; p3 * s1
    MULLD           m5, m14, m7
    pmaddwd         m0, m11            ; b3 * 455
    pmaddwd         m1, m11
    paddusw         m4, m11
    paddusw         m5, m11
    psrld           m3, m4, 20         ; min(z3, 255)
    psrld           m4, m5, 20
    GATHER_X_BY_X   m2, m3, m4, r0, dstm
    punpcklwd       m3, m2, m2
    punpckhwd       m4, m2, m2
    MULLD           m0, m3, m7
    MULLD           m1, m4, m7
    psubw           m5, m12, m2
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*1+400*4+4], m5
    psrld           m0, 12
    psrld           m8, m1, 12
    mova            m4, [t3+wq*2+400*8+ 8]
    mova            m5, [t3+wq*2+400*0+ 8]
    mova            m7, [t3+wq*2+400*0+24]
    paddw           m1, m4, [t2+wq+400*0]
    paddd           m2, m5, [t2+wq+400*2]
    paddd           m3, m7, [t2+wq+400*4]
    paddw           m1, [t1+wq+400*0]
    paddd           m2, [t1+wq+400*2]
    paddd           m3, [t1+wq+400*4]
    mova [t2+wq+400*0], m4
    mova [t2+wq+400*2], m5
    mova [t2+wq+400*4], m7
    mova            m4, [base+pw_25]
    mova         [t3+wq*2+400*8+ 8], m0
    mova         [t3+wq*2+400*8+24], m8
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    MULLD           m2, m4, m7         ; ((a5 + 8) >> 4) * 25
    MULLD           m3, m4, m7
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
%if ARCH_X86_64
    SWAP            m7, m6
%endif
    MAXSD           m2, m4, m7
    psubd           m2, m4             ; p5
    mova            m4, [base+pd_0xf00800a4]
    MAXSD           m3, m5, m7
    psubd           m3, m5
    MULLD           m2, m13, m7        ; p5 * s0
    MULLD           m3, m13, m7
    pmaddwd         m0, m4             ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrld           m5, m2, 20         ; min(z5, 255)
    psrld           m2, m3, 20
    GATHER_X_BY_X   m4, m5, m2, r0, dstm
    punpcklwd       m2, m4, m4
    punpckhwd       m3, m4, m4
    psubw           m5, m12, m4
    MULLD           m0, m2, m7
    MULLD           m1, m3, m7
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    mova [t4+wq*1+400*0+ 4], m5
    psrld           m0, 12
    psrld           m1, 12
    mova [t3+wq*2+400*0+ 8], m0
    mova [t3+wq*2+400*0+24], m1
    add             wq, 16
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
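; Editor's note (commentary added here, not from the original authors): the
; .v1 loop above converts the vertically accumulated box sums into the SGR
; coefficients, as its inline comments spell out. Roughly:
;   p  = max(((a + 8) >> 4) * n - ((b + 2) >> 2)^2, 0), n = 9 (3x3) or 25 (5x5)
;   z  = min(p * s >> 20, 255)  (the saturating paddusw with the 0xf008xxxx
;                                constants folds the clamp into the shift)
;   x  = sgr_x_by_x[z], fetched via GATHER_X_BY_X
;   b' = (x * b * 455 + (1 << 11) + (1 << 15)) >> 12  (164 instead of 455 for
;                                                      the 5x5 pass)
; 256 - x is written to the t4 rows and b' to the t3 rows consumed by the
; neighbor passes below.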
.prep_n: ; initial neighbor setup
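; Editor's note (added commentary): the 565/444/343 tags below appear to name
; the column weights applied to the a/b rows: 5:6:5 for the 5x5 pass and
; alternating 4:4:4 / 3:4:3 for the 3x3 pass. Each is derived from a plain
; 3-tap sum, e.g. 565 = 5 * sum3 + center, 444 = sum3 << 2 and
; 343 = (sum3 << 2) - outer taps, which is what the shifts and subtractions
; in this loop compute.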
    movif64         wq, r5
    movif32         wd, w1m
.prep_n_loop:
    movu            m0, [t4+wq*1+400*0+ 2]
    movu            m1, [t3+wq*2+400*0+ 4]
    movu            m2, [t3+wq*2+400*0+20]
    movu            m7, [t4+wq*1+400*0+ 4]
    movu            m8, [t3+wq*2+400*0+ 8]
    paddw           m3, m0, [t4+wq*1+400*0+ 0]
    paddd           m4, m1, [t3+wq*2+400*0+ 0]
    paddd           m5, m2, [t3+wq*2+400*0+16]
    paddw           m3, m7
    paddd           m4, m8
    movu            m7, [t3+wq*2+400*0+24]
    paddw           m0, m3
    paddd           m1, m4
    psllw           m3, 2
    pslld           m4, 2
    paddd           m5, m7
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3               ; a5 565
    paddd           m1, m4               ; b5 565
    paddd           m2, m5
    mova [t4+wq*1+400* 6+ 0], m0
    mova [t3+wq*2+400*12+ 0], m1
    mova [t3+wq*2+400*12+16], m2
    movu            m0, [t4+wq*1+400*2+ 4]
    movu            m3, [t4+wq*1+400*2+ 2]
    paddw           m0, [t4+wq*1+400*2+ 0]
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m4, [t3+wq*2+400*4+ 4]
    paddd           m1, [t3+wq*2+400*4+ 0]
    movu            m2, [t3+wq*2+400*4+24]
    movu            m5, [t3+wq*2+400*4+20]
    paddd           m2, [t3+wq*2+400*4+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                ; a3[-1] 444
    pslld           m4, 2                ; b3[-1] 444
    pslld           m5, 2
    psubw           m3, m0               ; a3[-1] 343
    psubd           m4, m1               ; b3[-1] 343
    psubd           m5, m2
    mova [t4+wq*1+400* 8+ 0], m3
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m5
    movu            m0, [t4+wq*1+400*4+ 4]
    movu            m3, [t4+wq*1+400*4+ 2]
    paddw           m0, [t4+wq*1+400*4+ 0]
    movu            m1, [t3+wq*2+400*8+ 8]
    movu            m4, [t3+wq*2+400*8+ 4]
    paddd           m1, [t3+wq*2+400*8+ 0]
    movu            m2, [t3+wq*2+400*8+24]
    movu            m5, [t3+wq*2+400*8+20]
    paddd           m2, [t3+wq*2+400*8+16]
    paddw           m3, m0
    paddd           m4, m1
    paddd           m5, m2
    psllw           m3, 2                ; a3[ 0] 444
    pslld           m4, 2                ; b3[ 0] 444
    pslld           m5, 2
    mova [t4+wq*1+400*10+ 0], m3
    mova [t3+wq*2+400*20+ 0], m4
    mova [t3+wq*2+400*20+16], m5
    psubw           m3, m0               ; a3[ 0] 343
    psubd           m4, m1               ; b3[ 0] 343
    psubd           m5, m2
    mova [t4+wq*1+400*12+ 0], m3
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m5
    add             wq, 16
    jl .prep_n_loop
    ret
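; Editor's note (added commentary): .n0 handles the even output rows. It forms
; the current row's 565-weighted a5/b5, adds the values saved by .prep_n (or
; the previous pass) so each output row sees two weighted rows, and rotates
; the 343/444 ring buffers of the 3x3 pass. The two filter outputs are then
; packed into the low/high words of each dword so a single pmaddwd against
; m15 (which presumably holds the two sgrproj weights w0/w1) blends them with
; the source; pd_4096 = 1 << 12 supplies the rounding term for the combined
; 13-bit downshift (psrad 8 followed by psrlw 5), and pmaxsw clamps negative
; results to zero before the store.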
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64         wq, r5
    movif32         wd, w1m
.n0_loop:
    movu            m0, [t4+wq*1+ 4]
    movu            m2, [t4+wq*1+ 2]
    paddw           m0, [t4+wq*1+ 0]
    paddw           m0, m2
    paddw           m2, m0
    psllw           m0, 2
    paddw           m0, m2               ; a5
    movu            m4, [t3+wq*2+ 8]
    movu            m5, [t3+wq*2+24]
    movu            m1, [t3+wq*2+ 4]
    movu            m3, [t3+wq*2+20]
    paddd           m4, [t3+wq*2+ 0]
    paddd           m5, [t3+wq*2+16]
    paddd           m4, m1
    paddd           m5, m3
    paddd           m1, m4
    paddd           m3, m5
    pslld           m4, 2
    pslld           m5, 2
    paddd           m4, m1               ; b5
    paddd           m5, m3
    movu            m2, [t4+wq*1+400* 6]
    paddw           m2, m0
    mova [t4+wq*1+400* 6], m0
    paddd           m0, m4, [t3+wq*2+400*12+ 0]
    paddd           m1, m5, [t3+wq*2+400*12+16]
    mova [t3+wq*2+400*12+ 0], m4
    mova [t3+wq*2+400*12+16], m5
    mova [rsp+16+ARCH_X86_32*4], m1
    movu            m3, [t4+wq*1+400*2+4]
    movu            m5, [t4+wq*1+400*2+2]
    paddw           m3, [t4+wq*1+400*2+0]
    paddw           m5, m3
    psllw           m5, 2                ; a3[ 1] 444
    psubw           m4, m5, m3           ; a3[ 1] 343
    movu            m3, [t4+wq*1+400* 8]
    paddw           m3, [t4+wq*1+400*10]
    paddw           m3, m4
    mova [t4+wq*1+400* 8], m4
    mova [t4+wq*1+400*10], m5
    movu            m1, [t3+wq*2+400*4+ 8]
    movu            m5, [t3+wq*2+400*4+ 4]
    movu            m7, [t3+wq*2+400*4+24]
    movu            m8, [t3+wq*2+400*4+20]
    paddd           m1, [t3+wq*2+400*4+ 0]
    paddd           m7, [t3+wq*2+400*4+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1           ; b3[ 1] 343
%if ARCH_X86_32
    mova      [esp+52], m8
    psubd           m8, m7
%else
    psubd           m6, m8, m7
    SWAP            m8, m6
%endif
    paddd           m1, m4, [t3+wq*2+400*16+ 0]
    paddd           m7, m8, [t3+wq*2+400*16+16]
    paddd           m1, [t3+wq*2+400*20+ 0]
    paddd           m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m8
    mova [t3+wq*2+400*20+ 0], m5
%if ARCH_X86_32
    mova            m8, [esp+52]
%else
    SWAP            m8, m6
    pxor            m6, m6
%endif
    mova [t3+wq*2+400*20+16], m8
    mova [rsp+32+ARCH_X86_32*4], m7
    movu            m4, [dstq+wq]
    punpcklwd       m7, m2, m6
    punpckhwd       m2, m6
    punpcklwd       m8, m3, m6
    punpckhwd       m3, m6
    punpcklwd       m5, m4, m6
    punpckhwd       m4, m6
    pmaddwd         m7, m5               ; a5 * src
    pmaddwd         m8, m5               ; a3 * src
    pmaddwd         m2, m4
    pmaddwd         m3, m4
    pslld           m5, 13
    pslld           m4, 13
    psubd           m0, m5
    psubd           m1, m5
    paddd           m0, m7               ; a5 * src + b5 + (1 << 8) - (src << 13)
    paddd           m1, m8               ; a3 * src + b3 + (1 << 8) - (src << 13)
    mova            m7, [base+pd_0xffff]
    psrld           m0, 9
    pslld           m1, 7
    pand            m0, m7
    pandn           m8, m7, m1
    por             m0, m8
    psubd           m1, m4, [rsp+16+ARCH_X86_32*4]
    psubd           m8, m4, [rsp+32+ARCH_X86_32*4]
    psubd           m2, m1
    psubd           m3, m8
    mova            m1, [base+pd_4096]
    psrld           m2, 9
    pslld           m3, 7
    pand            m2, m7
    pandn           m7, m3
    por             m2, m7
    pmaddwd         m0, m15
    pmaddwd         m2, m15
%if ARCH_X86_32
    pxor            m7, m7
%else
    SWAP            m7, m6
%endif
    paddd           m5, m1
    paddd           m4, m1
    paddd           m0, m5
    paddd           m2, m4
    psrad           m0, 8
    psrad           m2, 8
    packssdw        m0, m2               ; clip
    pmaxsw          m0, m7
    psrlw           m0, 5
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n0_loop
    add           dstq, dst_stridemp
    ret
%if ARCH_X86_64
    SWAP            m6, m7
%endif
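; Editor's note (added commentary): .n1 handles the odd output rows. As I read
; it, the 5x5 pass only computes a/b on every second row, so the 565 sums
; stored by .n0 are reused here as-is (hence the >> 8 shift versus >> 9 in
; .n0, since only one weighted row contributes), while the 3x3 pass still
; advances its 343/444 ring buffer every row. The final weighting, rounding
; and clamping mirror the .n0 epilogue.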
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64         wq, r5
    movif32         wd, w1m
.n1_loop:
    movu            m3, [t4+wq*1+400*4+4]
    movu            m5, [t4+wq*1+400*4+2]
    paddw           m3, [t4+wq*1+400*4+0]
    paddw           m5, m3
    psllw           m5, 2                ; a3[ 1] 444
    psubw           m4, m5, m3           ; a3[ 1] 343
    paddw           m3, m4, [t4+wq*1+400*12]
    paddw           m3, [t4+wq*1+400*10]
    mova [t4+wq*1+400*10], m5
    mova [t4+wq*1+400*12], m4
    movu            m1, [t3+wq*2+400*8+ 8]
    movu            m5, [t3+wq*2+400*8+ 4]
    movu            m7, [t3+wq*2+400*8+24]
    movu            m8, [t3+wq*2+400*8+20]
    paddd           m1, [t3+wq*2+400*8+ 0]
    paddd           m7, [t3+wq*2+400*8+16]
    paddd           m5, m1
    paddd           m8, m7
    pslld           m5, 2                ; b3[ 1] 444
    pslld           m8, 2
    psubd           m4, m5, m1           ; b3[ 1] 343
    psubd           m0, m8, m7
    paddd           m1, m4, [t3+wq*2+400*24+ 0]
    paddd           m7, m0, [t3+wq*2+400*24+16]
    paddd           m1, [t3+wq*2+400*20+ 0]
    paddd           m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*20+ 0], m5
    mova [t3+wq*2+400*20+16], m8
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m0
    mova            m5, [dstq+wq]
    mova            m8, [t4+wq*1+400* 6]
    punpcklwd       m4, m5, m6
    punpckhwd       m5, m6
    punpcklwd       m0, m8, m6
    punpckhwd       m8, m6
    punpcklwd       m2, m3, m6
    punpckhwd       m3, m6
    pmaddwd         m0, m4               ; a5 * src
    pmaddwd         m2, m4               ; a3 * src
    pmaddwd         m8, m5
    pmaddwd         m3, m5
    paddd           m1, m2               ; a3 * src + b3 + (1 << 8) - (src << 13)
    pslld           m4, 12
    pslld           m5, 12
    psubd           m2, m4, [t3+wq*2+400*12+ 0]
    psubd           m0, m2               ; a5 * src + b5 + (1 << 8) - (src << 13)
    psubd           m2, m5, [t3+wq*2+400*12+16]
    psubd           m8, m2
    paddd           m4, m4
    paddd           m5, m5
    paddd           m7, m3
    mova            m2, [base+pd_0xffff]
    psubd           m1, m4
    psubd           m7, m5
    psrld           m0, 8
    psrld           m8, 8
    pslld           m1, 7
    pslld           m7, 7
    pand            m0, m2
    pand            m8, m2
    pandn           m3, m2, m1
    pandn           m2, m7
    por             m0, m3
    por             m8, m2
    mova            m1, [base+pd_4096]
    pmaddwd         m0, m15
    pmaddwd         m8, m15
%if ARCH_X86_64
    pxor            m6, m6
    SWAP            m7, m6
%else
    pxor            m7, m7
%endif
    paddd           m4, m1
    paddd           m5, m1
    paddd           m0, m4
    paddd           m8, m5
    psrad           m0, 8
    psrad           m8, 8
    packssdw        m0, m8              ; clip
    pmaxsw          m0, m7
    psrlw           m0, 5
    mova     [dstq+wq], m0
    add             wq, 16
    jl .n1_loop
    add           dstq, dst_stridemp
    movif32       dstm, dstq
    ret