; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
wiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
wiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

pb_m10_m9:     times 8 db -10, -9
pb_m6_m5:      times 8 db  -6, -5
pb_m2_m1:      times 8 db  -2, -1
pb_2_3:        times 8 db   2,  3
pb_6_7:        times 8 db   6,  7
pd_m262128:    times 4 dd -262128

wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
wiener_round:  dd 1049600, 1048832
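; Added note: the shift/round constants are selected per bit depth at run
; time. pixel_max >> 11 is 0 for 10-bit (1023 >> 11) and 1 for 12-bit
; (4095 >> 11), and that index picks the wiener_shifts/wiener_round entries
; above. As plain arithmetic:
;   -262128 = (1 << 4) - (1 << 18)   ; pd_m262128, horizontal rounding bias
;   1049600 = (1 << 20) + (1 << 10)  ; wiener_round entry used for 10-bit
;   1048832 = (1 << 20) + (1 << 8)   ; wiener_round entry used for 12-bit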

SECTION .text

INIT_XMM ssse3
%if ARCH_X86_32
DECLARE_REG_TMP 4, 6
 %if STACK_ALIGNMENT < 16
  %assign stack_size 13*16+384*12
 %else
  %assign stack_size 11*16+384*12
 %endif
cglobal wiener_filter7_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \
                                                    lpf, lpf_stride, w, flt
 %if STACK_ALIGNMENT < 16
  %define lpfm        dword [esp+calloff+16*10+0]
  %define lpf_stridem dword [esp+calloff+16*10+4]
  %define wm          dword [esp+calloff+16*10+8]
  %define hd          dword [esp+calloff+16*10+12]
  %define edgeb        byte [esp+calloff+16*10+16]
 %else
  %define hd dword r6m
  %define edgeb byte r8m
 %endif
 %define PICmem dword [esp+calloff+4*0]
 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
 %define t1m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define t5m    dword [esp+calloff+4*6]
 %define t6m    dword [esp+calloff+4*7]
 %define t2 t2m
 %define t3 t3m
 %define t4 t4m
 %define t5 t5m
 %define t6 t6m
 %define  m8 [esp+calloff+16*2]
 %define  m9 [esp+calloff+16*3]
 %define m10 [esp+calloff+16*4]
 %define m11 [esp+calloff+16*5]
 %define m12 [esp+calloff+16*6]
 %define m13 [esp+calloff+16*7]
 %define m14 [esp+calloff+16*8]
 %define m15 [esp+calloff+16*9]
 %define base t0-wiener_shifts
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov             wd, [rstk+stack_offset+24]
    mov    lpf_stridem, lpf_strideq
    mov             wm, wd
    mov             r4, [rstk+stack_offset+28]
    mov             hd, r4
    mov             r4, [rstk+stack_offset+36]
    mov    [esp+16*11], r4 ; edge
 %endif
%else
DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
                                                     lpf_stride, w, edge, flt, h
 %define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov           fltq, fltmp
    mov          edged, r8m
    mov             hd, r6m
    mov            t3d, r9m ; pixel_max
    movq           m13, [fltq]
    movq           m15, [fltq+16]
%else
 %if STACK_ALIGNMENT < 16
    mov             t0, [rstk+stack_offset+32]
    mov             t1, [rstk+stack_offset+40] ; pixel_max
    movq            m1, [t0]    ; fx
    movq            m3, [t0+16] ; fy
    LEA             t0, wiener_shifts
    mov         PICmem, t0
 %else
    LEA             t0, wiener_shifts
    mov           fltq, r7m
    movq            m1, [fltq]
    movq            m3, [fltq+16]
    mov             t1, r9m ; pixel_max
    mov         PICmem, t0
 %endif
%endif
    mova            m6, [base+wiener_shufA]
    mova            m7, [base+wiener_shufB]
%if ARCH_X86_64
    lea             t4, [wiener_shifts]
    add             wd, wd
    pshufd         m12, m13, q0000 ; x0 x1
    pshufd         m13, m13, q1111 ; x2 x3
    pshufd         m14, m15, q0000 ; y0 y1
    pshufd         m15, m15, q1111 ; y2 y3
    mova            m8, [wiener_shufC]
    mova            m9, [wiener_shufD]
    add           lpfq, wq
    lea             t1, [rsp+wq+16]
    add           dstq, wq
    neg             wq
    shr            t3d, 11
 %define base t4-wiener_shifts
    movd           m10, [base+wiener_round+t3*4]
    movq           m11, [base+wiener_shifts+t3*8]
    pshufd         m10, m10, q0000
    pshufd          m0, m11, q0000
    pshufd         m11, m11, q1111
    pmullw         m12, m0 ; upshift filter coefs to make the
    pmullw         m13, m0 ; horizontal downshift constant
 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
 %define lpfm        [rsp+0]
 %define lpf_stridem [rsp+8]
 %define base
%else
    add             wd, wd
    mova            m4, [base+wiener_shufC]
    mova            m5, [base+wiener_shufD]
    pshufd          m0, m1, q0000
    pshufd          m1, m1, q1111
    pshufd          m2, m3, q0000
    pshufd          m3, m3, q1111
    mova            m8, m4
    mova            m9, m5
    mova           m14, m2
    mova           m15, m3
    shr             t1, 11
    add           lpfq, wq
    movd            m4, [base+wiener_round+t1*4]
    movq            m5, [base+wiener_shifts+t1*8]
 %if STACK_ALIGNMENT < 16
    lea             t1, [esp+16*12+wq+16]
 %else
    lea             t1, [esp+16*10+wq+16]
 %endif
    add           dstq, wq
    neg             wq
    pshufd          m4, m4, q0000
    pshufd          m2, m5, q0000
    pshufd          m5, m5, q1111
    mov             wm, wq
    pmullw          m0, m2
    pmullw          m1, m2
    mova           m10, m4
    mova           m11, m5
    mova           m12, m0
    mova           m13, m1
%endif
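; Added note: t1..t6 (t1..t4 in the 5-tap filter below) are ring buffer row
; pointers into the stack area reserved above; each row holds 384*2 bytes of
; horizontally filtered 16-bit intermediates, and the vertical pass combines
; the rows before the pointers are rotated one step per output row.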
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
%if ARCH_X86_64
    add           lpfq, lpf_strideq
%else
    add           lpfq, lpf_stridem
%endif
    mov             t6, t1
    mov             t5, t1
    add             t1, 384*2
    call .h_top
%if ARCH_X86_64
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov             t4, t1
    add             t1, 384*2
    mov    lpf_stridem, lpf_strideq
    add             r7, lpf_strideq
    mov           lpfm, r7 ; below
%else
    mov            t4m, t1
    mov             t0, lpf_stridem
    lea             t1, [lpfq+t0*4]
    mov           lpfq, dstq
    add             t1, t0
    mov           lpfm, t1 ; below
    mov             t1, t4m
    mov             t0, PICmem
    add             t1, 384*2
%endif
    call .h
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
.main:
    lea             t0, [t1+384*2]
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov           lpfq, lpfm
    call .hv_bottom
    add           lpfq, lpf_stridem
    call .hv_bottom
.v1:
    call .v
    RET
.no_top:
%if ARCH_X86_64
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov    lpf_stridem, lpf_strideq
    lea             r7, [r7+lpf_strideq*2]
    mov           lpfm, r7
    call .h
%else
    mov            t1m, t1
    mov             t0, lpf_stridem
    lea             t1, [lpfq+t0*4]
    mov           lpfq, dstq
    lea             t1, [t1+t0*2]
    mov           lpfm, t1
    mov             t0, PICmem
    mov             t1, t1m
    call .h
%endif
    mov             t6, t1
    mov             t5, t1
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    mov             t2, t1
    dec             hd
    jz .v2
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v3
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v3
    add             t0, 384*8
    call .hv
    dec             hd
    jnz .main
.v3:
    call .v
%if ARCH_X86_32
    mov             wq, wm
%endif
.v2:
    call .v
%if ARCH_X86_32
    mov             wq, wm
%endif
    jmp .v1
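; Added note: when LR_HAVE_RIGHT is unset, .extend_right builds pshufb masks
; from the remaining width (the low byte of wd is broadcast, offset and
; clamped against pb_0to15) so that lanes past the right edge replicate the
; last valid pixel before the horizontal filter runs.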
.extend_right:
%assign stack_offset_tmp stack_offset
%assign stack_offset stack_offset+8
%assign calloff 8
    pxor            m0, m0
    movd            m1, wd
    mova            m2, [base+pb_0to15]
    pshufb          m1, m0
    mova            m0, [base+pb_6_7]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m3, m0
    mova            m0, [base+pb_m2_m1]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m4, m0
    mova            m0, [base+pb_m10_m9]
    psubb           m0, m1
    pminub          m0, m2
    pshufb          m5, m0
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h:
%if ARCH_X86_64
    mov             wq, r5
%else
    mov             wq, wm
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movq            m3, [leftq]
    movhps          m3, [lpfq+wq]
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    mova            m3, [lpfq+wq]            ; avoid accessing memory located
    pshufb          m3, [base+wiener_lshuf7] ; before the start of the buffer
    jmp .h_main
.h_top:
%if ARCH_X86_64
    mov             wq, r5
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+wq-8]
.h_main:
    mova            m4, [lpfq+wq+0]
    movu            m5, [lpfq+wq+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    mova            m2, [base+pd_m262128] ; (1 << 4) - (1 << 18)
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova       [t1+wq], m0
    add             wq, 16
    jl .h_loop
%if ARCH_X86_32
    mov             wq, wm
%endif
    ret
ALIGN function_align
.hv:
    add           lpfq, dst_strideq
%if ARCH_X86_64
    mov             wq, r5
%else
    mov            t0m, t0
    mov            t1m, t1
    mov             t0, PICmem
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movq            m3, [leftq]
    movhps          m3, [lpfq+wq]
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m3, [lpfq+wq]
    pshufb          m3, [base+wiener_lshuf7]
    jmp .hv_main
.hv_bottom:
%if ARCH_X86_64
    mov             wq, r5
%else
    mov            t0m, t0
    mov            t1m, t1
    mov             t0, PICmem
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+wq-8]
.hv_main:
    mova            m4, [lpfq+wq+0]
    movu            m5, [lpfq+wq+8]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             wd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
%if ARCH_X86_32
    mov             t1, t4m
%endif
    pshufb          m0, m3, m6
    pshufb          m1, m4, m7
    paddw           m0, m1
    pshufb          m3, m8
    pmaddwd         m0, m12
    pshufb          m1, m4, m9
    paddw           m3, m1
    pshufb          m1, m4, m6
    pmaddwd         m3, m13
    pshufb          m2, m5, m7
    paddw           m1, m2
    mova            m2, [base+pd_m262128]
    pshufb          m4, m8
    pmaddwd         m1, m12
    pshufb          m5, m9
    paddw           m4, m5
    pmaddwd         m4, m13
    paddd           m0, m2
    paddd           m1, m2
%if ARCH_X86_64
    mova            m2, [t4+wq]
    paddw           m2, [t2+wq]
    mova            m5, [t3+wq]
%else
    mov             t0, t0m
    mova            m2, [t1+wq]
    mov             t1, t2m
    paddw           m2, [t1+wq]
    mov             t1, t3m
    mova            m5, [t1+wq]
    mov             t1, t5m
%endif
    paddd           m0, m3
    paddd           m1, m4
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
%if ARCH_X86_64
    mova            m4, [t5+wq]
    paddw           m4, [t1+wq]
    psraw           m0, 1
    paddw           m3, m0, [t6+wq]
%else
    mova            m4, [t1+wq]
    mov             t1, t1m
    paddw           m4, [t1+wq]
    psraw           m0, 1
    mov             t1, t6m
    paddw           m3, m0, [t1+wq]
%endif
    mova       [t0+wq], m0
    punpcklwd       m0, m2, m5
    pmaddwd         m0, m15
    punpckhwd       m2, m5
    pmaddwd         m2, m15
    punpcklwd       m1, m3, m4
    pmaddwd         m1, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m2, m10
    paddd           m0, m1
    paddd           m2, m3
    psrad           m0, 6
    psrad           m2, 6
    packssdw        m0, m2
    pmulhw          m0, m11
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
%if ARCH_X86_64
    jl .hv_loop
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t6
%else
    jge .hv_end
    mov             t0, PICmem
    jmp .hv_loop
.hv_end:
    mov             r5, t5m
    mov             t1, t4m
    mov            t6m, r5
    mov            t5m, t1
    mov             r5, t3m
    mov             t1, t2m
    mov            t4m, r5
    mov            t3m, t1
    mov             r5, t1m
    mov             t1, t0
    mov            t2m, r5
    mov             t0, t6m
    mov             wq, wm
%endif
    add           dstq, dst_strideq
    ret
.v:
%if ARCH_X86_64
    mov             wq, r5
.v_loop:
    mova            m1, [t4+wq]
    paddw           m1, [t2+wq]
    mova            m2, [t3+wq]
    mova            m4, [t1+wq]
    paddw           m3, m4, [t6+wq]
    paddw           m4, [t5+wq]
%else
    mov            t1m, t1
.v_loop:
    mov             t1, t4m
    mova            m1, [t1+wq]
    mov             t1, t2m
    paddw           m1, [t1+wq]
    mov             t1, t3m
    mova            m2, [t1+wq]
    mov             t1, t1m
    mova            m4, [t1+wq]
    mov             t1, t6m
    paddw           m3, m4, [t1+wq]
    mov             t1, t5m
    paddw           m4, [t1+wq]
%endif
    punpcklwd       m0, m1, m2
    pmaddwd         m0, m15
    punpckhwd       m1, m2
    pmaddwd         m1, m15
    punpcklwd       m2, m3, m4
    pmaddwd         m2, m14
    punpckhwd       m3, m4
    pmaddwd         m3, m14
    paddd           m0, m10
    paddd           m1, m10
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 6
    psrad           m1, 6
    packssdw        m0, m1
    pmulhw          m0, m11
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
    jl .v_loop
%if ARCH_X86_64
    mov             t6, t5
    mov             t5, t4
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
%else
    mov             t1, t5m
    mov             r5, t4m
    mov            t6m, t1
    mov            t5m, r5
    mov             t1, t3m
    mov             r5, t2m
    mov            t4m, t1
    mov            t3m, r5
    mov             t1, t1m
    mov            t2m, t1
%endif
    add           dstq, dst_strideq
    ret

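; Added note: wiener_filter5_16bpc below mirrors the 7-tap routine above with
; a 5-tap kernel; it only needs four ring buffer rows (t1..t4, 384*8 bytes of
; stack) and uses wiener_lshuf5 for left-edge padding.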
%if ARCH_X86_32
 %if STACK_ALIGNMENT < 16
  %assign stack_size 12*16+384*8
 %else
  %assign stack_size 11*16+384*8
 %endif
cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \
                                                    lpf, lpf_stride, w, flt
 %if STACK_ALIGNMENT < 16
  %define lpfm        dword [esp+calloff+4*6]
  %define lpf_stridem dword [esp+calloff+4*7]
  %define wm          dword [esp+calloff+16*10+0]
  %define hd          dword [esp+calloff+16*10+4]
  %define edgeb        byte [esp+calloff+16*10+8]
 %else
  %define hd dword r6m
  %define edgeb byte r8m
 %endif
 %define PICmem dword [esp+calloff+4*0]
 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
 %define t1m    dword [esp+calloff+4*2]
 %define t2m    dword [esp+calloff+4*3]
 %define t3m    dword [esp+calloff+4*4]
 %define t4m    dword [esp+calloff+4*5]
 %define t2 t2m
 %define t3 t3m
 %define t4 t4m
 %define  m8 [esp+calloff+16*2]
 %define  m9 [esp+calloff+16*3]
 %define m10 [esp+calloff+16*4]
 %define m11 [esp+calloff+16*5]
 %define m12 [esp+calloff+16*6]
 %define m13 [esp+calloff+16*7]
 %define m14 [esp+calloff+16*8]
 %define m15 [esp+calloff+16*9]
 %define base t0-wiener_shifts
 %assign calloff 0
 %if STACK_ALIGNMENT < 16
    mov             wd, [rstk+stack_offset+24]
    mov    lpf_stridem, lpf_strideq
    mov             wm, wd
    mov             r4, [rstk+stack_offset+28]
    mov             hd, r4
    mov             r4, [rstk+stack_offset+36]
    mov  [esp+16*10+8], r4 ; edge
 %endif
%else
cglobal wiener_filter5_16bpc, 5, 15, 16, 384*8+16, dst, dst_stride, left, lpf, \
                                                   lpf_stride, w, edge, flt, h
 %define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn       wd, wm
%endif
%if ARCH_X86_64
    mov           fltq, fltmp
    mov          edged, r8m
    mov             hd, r6m
    mov            t3d, r9m ; pixel_max
    movq           m12, [fltq]
    movq           m14, [fltq+16]
%else
 %if STACK_ALIGNMENT < 16
    mov             t0, [rstk+stack_offset+32]
    mov             t1, [rstk+stack_offset+40] ; pixel_max
    movq            m1, [t0]    ; fx
    movq            m3, [t0+16] ; fy
    LEA             t0, wiener_shifts
    mov         PICmem, t0
 %else
    LEA             t0, wiener_shifts
    mov           fltq, r7m
    movq            m1, [fltq]
    movq            m3, [fltq+16]
    mov             t1, r9m ; pixel_max
    mov         PICmem, t0
 %endif
%endif
    mova            m5, [base+wiener_shufE]
    mova            m6, [base+wiener_shufB]
    mova            m7, [base+wiener_shufD]
%if ARCH_X86_64
    lea             t4, [wiener_shifts]
    add             wd, wd
    punpcklwd      m11, m12, m12
    pshufd         m11, m11, q1111 ; x1
    pshufd         m12, m12, q1111 ; x2 x3
    punpcklwd      m13, m14, m14
    pshufd         m13, m13, q1111 ; y1
    pshufd         m14, m14, q1111 ; y2 y3
    shr            t3d, 11
    mova            m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    add           lpfq, wq
    lea             t1, [rsp+wq+16]
    add           dstq, wq
    neg             wq
 %define base t4-wiener_shifts
    movd            m9, [base+wiener_round+t3*4]
    movq           m10, [base+wiener_shifts+t3*8]
    pshufd          m9, m9, q0000
    pshufd          m0, m10, q0000
    pshufd         m10, m10, q1111
    mova           m15, [wiener_lshuf5]
    pmullw         m11, m0
    pmullw         m12, m0
 DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w
 %define lpfm        [rsp+0]
 %define lpf_stridem [rsp+8]
 %define base
%else
    add             wd, wd
    punpcklwd       m0, m1, m1
    pshufd          m0, m0, q1111 ; x1
    pshufd          m1, m1, q1111 ; x2 x3
    punpcklwd       m2, m3, m3
    pshufd          m2, m2, q1111 ; y1
    pshufd          m3, m3, q1111 ; y2 y3
    mova            m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
    mova           m13, m2
    mova           m14, m3
    mova            m8, m4
    shr             t1, 11
    add           lpfq, wq
    movd            m2, [base+wiener_round+t1*4]
    movq            m3, [base+wiener_shifts+t1*8]
 %if STACK_ALIGNMENT < 16
    lea             t1, [esp+16*11+wq+16]
 %else
    lea             t1, [esp+16*10+wq+16]
 %endif
    add           dstq, wq
    neg             wq
    pshufd          m2, m2, q0000
    pshufd          m4, m3, q0000
    pshufd          m3, m3, q1111
    mov             wm, wq
    pmullw          m0, m4
    pmullw          m1, m4
    mova            m4, [base+wiener_lshuf5]
    mova            m9, m2
    mova           m10, m3
    mova           m11, m0
    mova           m12, m1
    mova           m15, m4
%endif
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
%if ARCH_X86_64
    add           lpfq, lpf_strideq
%else
    add           lpfq, lpf_stridem
%endif
    mov             t4, t1
    add             t1, 384*2
    call .h_top
%if ARCH_X86_64
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov             t3, t1
    add             t1, 384*2
    mov    lpf_stridem, lpf_strideq
    add             r7, lpf_strideq
    mov           lpfm, r7 ; below
%else
    mov            t3m, t1
    mov             t0, lpf_stridem
    lea             t1, [lpfq+t0*4]
    mov           lpfq, dstq
    add             t1, t0
    mov           lpfm, t1 ; below
    mov             t1, t3m
    add             t1, 384*2
%endif
    call .h
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
.main:
    mov             t0, t4
.main_loop:
    call .hv
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov           lpfq, lpfm
    call .hv_bottom
    add           lpfq, lpf_stridem
    call .hv_bottom
.end:
    RET
.no_top:
%if ARCH_X86_64
    lea             r7, [lpfq+lpf_strideq*4]
    mov           lpfq, dstq
    mov    lpf_stridem, lpf_strideq
    lea             r7, [r7+lpf_strideq*2]
    mov           lpfm, r7
    call .h
%else
    mov            t1m, t1
    mov             t0, lpf_stridem
    lea             t1, [lpfq+t0*4]
    mov           lpfq, dstq
    lea             t1, [t1+t0*2]
    mov           lpfm, t1
    mov             t1, t1m
    call .h
%endif
    mov             t4, t1
    mov             t3, t1
    mov             t2, t1
    dec             hd
    jz .v1
    add           lpfq, dst_strideq
    add             t1, 384*2
    call .h
    dec             hd
    jz .v2
    lea             t0, [t1+384*2]
    call .hv
    dec             hd
    jz .v2
    add             t0, 384*6
    call .hv
    dec             hd
    jnz .main
.v2:
    call .v
%if ARCH_X86_64
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
%else
    mov             t0, t3m
    mov             r5, t2m
    mov             t1, t1m
    mov            t4m, t0
    mov            t3m, r5
    mov            t2m, t1
    mov             wq, wm
%endif
    add           dstq, dst_strideq
.v1:
    call .v
    jmp .end
.extend_right:
%assign stack_offset_tmp stack_offset
%assign stack_offset stack_offset+8
%assign calloff 8
%if ARCH_X86_32
    mov             t0, PICmem
%endif
    pxor            m1, m1
    movd            m2, wd
    mova            m0, [base+pb_2_3]
    pshufb          m2, m1
    mova            m1, [base+pb_m6_m5]
    psubb           m0, m2
    psubb           m1, m2
    mova            m2, [base+pb_0to15]
    pminub          m0, m2
    pminub          m1, m2
    pshufb          m3, m0
    pshufb          m4, m1
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h:
%if ARCH_X86_64
    mov             wq, r5
%else
    mov             wq, wm
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    mova            m4, [lpfq+wq]
    movd            m3, [leftq+4]
    pslldq          m4, 4
    por             m3, m4
    add          leftq, 8
    jmp .h_main
.h_extend_left:
    mova            m3, [lpfq+wq] ; avoid accessing memory located
    pshufb          m3, m15       ; before the start of the buffer
    jmp .h_main
.h_top:
%if ARCH_X86_64
    mov             wq, r5
%else
    mov             wq, wm
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m3, [lpfq+wq-4]
.h_main:
    movu            m4, [lpfq+wq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp             wd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
    paddd           m1, m3
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    psraw           m0, 1
    mova       [t1+wq], m0
    add             wq, 16
    jl .h_loop
%if ARCH_X86_32
    mov             wq, wm
%endif
    ret
ALIGN function_align
.hv:
    add           lpfq, dst_strideq
%if ARCH_X86_64
    mov             wq, r5
%else
    mov            t0m, t0
    mov            t1m, t1
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    mova            m4, [lpfq+wq]
    movd            m3, [leftq+4]
    pslldq          m4, 4
    por             m3, m4
    add          leftq, 8
    jmp .hv_main
.hv_extend_left:
    mova            m3, [lpfq+wq]
    pshufb          m3, m15
    jmp .hv_main
.hv_bottom:
%if ARCH_X86_64
    mov             wq, r5
%else
    mov            t0m, t0
    mov            t1m, t1
%endif
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m3, [lpfq+wq-4]
.hv_main:
    movu            m4, [lpfq+wq+4]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp             wd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
%if ARCH_X86_32
    mov             t1, t1m
    mov             t0, t3m
%endif
    pshufb          m0, m3, m5
    pmaddwd         m0, m11
    pshufb          m1, m4, m5
    pmaddwd         m1, m11
    pshufb          m2, m3, m6
    pshufb          m3, m7
    paddw           m2, m3
    pshufb          m3, m4, m6
    pmaddwd         m2, m12
    pshufb          m4, m7
    paddw           m3, m4
    pmaddwd         m3, m12
    paddd           m0, m8
    paddd           m1, m8
    paddd           m0, m2
%if ARCH_X86_64
    mova            m2, [t3+wq]
    paddw           m2, [t1+wq]
    paddd           m1, m3
    mova            m4, [t2+wq]
%else
    mova            m2, [t0+wq]
    mov             t0, t2m
    paddw           m2, [t1+wq]
    mov             t1, t4m
    paddd           m1, m3
    mova            m4, [t0+wq]
    mov             t0, t0m
%endif
    punpckhwd       m3, m2, m4
    pmaddwd         m3, m14
    punpcklwd       m2, m4
%if ARCH_X86_64
    mova            m4, [t4+wq]
%else
    mova            m4, [t1+wq]
%endif
    psrad           m0, 4
    psrad           m1, 4
    packssdw        m0, m1
    pmaddwd         m2, m14
    psraw           m0, 1
    mova       [t0+wq], m0
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 6
    psrad           m0, 6
    packssdw        m0, m1
    pmulhw          m0, m10
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
    jl .hv_loop
%if ARCH_X86_64
    mov             t4, t3
    mov             t3, t2
    mov             t2, t1
    mov             t1, t0
    mov             t0, t4
%else
    mov             r5, t3m
    mov             t1, t2m
    mov            t4m, r5
    mov            t3m, t1
    mov             r5, t1m
    mov             t1, t0
    mov            t2m, r5
    mov             t0, t4m
    mov             wq, wm
%endif
    add           dstq, dst_strideq
    ret
.v:
%if ARCH_X86_64
    mov             wq, r5
.v_loop:
    mova            m0, [t1+wq]
    paddw           m2, m0, [t3+wq]
    mova            m1, [t2+wq]
    mova            m4, [t4+wq]
%else
    mov            t1m, t1
.v_loop:
    mov             t0, t3m
    mova            m0, [t1+wq]
    mov             t1, t2m
    paddw           m2, m0, [t0+wq]
    mov             t0, t4m
    mova            m1, [t1+wq]
    mova            m4, [t0+wq]
%endif
    punpckhwd       m3, m2, m1
    pmaddwd         m3, m14
    punpcklwd       m2, m1
    pmaddwd         m2, m14
    punpckhwd       m1, m0, m4
    pmaddwd         m1, m13
    punpcklwd       m0, m4
    pmaddwd         m0, m13
    paddd           m3, m9
    paddd           m2, m9
    paddd           m1, m3
    paddd           m0, m2
    psrad           m1, 6
    psrad           m0, 6
    packssdw        m0, m1
    pmulhw          m0, m10
    pxor            m1, m1
    pmaxsw          m0, m1
    mova     [dstq+wq], m0
    add             wq, 16
%if ARCH_X86_64
    jl .v_loop
%else
    jge .v_end
    mov             t1, t1m
    jmp .v_loop
.v_end:
%endif
    ret
