1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29SECTION_RODATA 16
30
31%if ARCH_X86_64
32%define PIC_sym(a) a
33%else
34%define PIC_base $$
35%define PIC_sym(a) pic_regq+a-PIC_base
36%endif
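; On x86-64, PIC_sym(a) expands to a plain symbol reference.  On x86-32 there
; is no RIP-relative addressing, so pic_regq is loaded with the address of the
; section start ($$) and constants are addressed relative to it, e.g.
; (illustrative):
;   mova m0, [PIC_sym(pw_4)]   ; x86-32: [pic_regq + (pw_4 - PIC_base)]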
37
38pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
39                     times 4 db 8, 9
40
41pw_1: times 8 dw 1
42pw_2: times 8 dw 2
43pw_3: times 8 dw 3
44; 4 and 16 need to be next to each other since they are used as alternates
45; depending on whether bitdepth is 10 or 12
46pw_4: times 8 dw 4
47pw_16: times 8 dw 16
48pw_8: times 8 dw 8
49pw_4096: times 8 dw 4096
50
51pb_mask: dd 1, 1, 2, 2
52
53SECTION .text
54
55%if ARCH_X86_32
56%if STACK_ALIGNMENT < 16
57%define extra_stack 2
58%else
59%define extra_stack 0
60%endif
61%endif
62
63%macro RELOC_ARGS 2 ; h/v, off
64ASSERT ARCH_X86_32
65%if STACK_ALIGNMENT < 16
66    mov          r5d, [rstk + stack_offset + 4*4 + 4]
67%define lstridem [esp+%2+0*gprsize]
68    mov     lstridem, r5d
69    mov          r5d, [rstk + stack_offset + 4*5 + 4]
70%define lutm [esp+%2+1*gprsize]
71    mov         lutm, r5d
72    mov          r5d, [rstk + stack_offset + 4*6 + 4]
73%ifidn %1, v
74%define wm [esp+%2+2*gprsize]
75    mov           wm, r5d
76    mov          r5d, [rstk + stack_offset + 4*3 + 4]
77%define lm [esp+%2+3*gprsize]
78    mov           lm, r5d
79%else ; %1 == h
80%define hm [esp+%2+2*gprsize]
81    mov           hm, r5d
82%endif ; %1==v
    mov          r5d, r7m
84%define bdmulm [esp+%2+4*gprsize]
85    mov       bdmulm, r5d
86%else
87%define lstridem r4m
88%define lutm r5m
89%ifidn %1, v
90%define wm r6m
91%define lm r3m
92%else
93%define hm r6m
94%endif
95%define bdmulm r7m
96%endif ; STACK_ALIGNMENT
97%endmacro
98
99%macro UNRELOC_ARGS 0
100%if ARCH_X86_32
101%undef lm
102%undef lstridem
103%undef wm
104%undef hm
105%undef lutm
106%endif
107%endmacro
108
109%macro SPLATD 2
110    movd %1, %2
111    pshufd %1, %1, q0000
112%endmacro
113
114%macro SPLATW 2
115    movd %1, %2
116    pshuflw %1, %1, q0000
117    punpcklqdq %1, %1
118%endmacro
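; SPLATD/SPLATW broadcast a single dword/word to every lane of an XMM register
; (movd, then pshufd / pshuflw+punpcklqdq).  For example (illustrative),
; SPLATW m1, [lutq+136] replicates one 16-bit limit value into all 8 word
; lanes so it can be compared against a whole row of pixels at once.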
119
120;        in:            out:
121; mm%1   a b c d        a e i m
122; mm%2   e f g h        b f j n
123; mm%3   i j k l   ->   c g k o
124; mm%4   m n o p        d h l p
125%macro TRANSPOSE4X4W 5
126    punpcklwd        m%5, m%1, m%2
127    punpckhwd        m%1, m%2
128    punpcklwd        m%2, m%3, m%4
129    punpckhwd        m%3, m%4
130    punpckldq        m%4, m%5, m%2
131    punpckhdq        m%5, m%2
132    punpckldq        m%2, m%1, m%3
133    punpckhdq        m%1, m%3
134
135    SWAP              %1, %4
136    SWAP              %2, %5, %3
137%endmacro
138
139;         in:                  out:
140; m%1   a b c d e f g h      a i q y 6 E M U
141; m%2   i j k l m n o p      b j r z 7 F N V
142; m%3   q r s t u v w x      c k s 0 8 G O W
143; m%4   y z 0 1 2 3 4 5      d l t 1 9 H P X
144; m%5   6 7 8 9 A B C D  ->  e m u 2 A I Q Y
145; m%6   E F G H I J K L      f n v 3 B J R Z
146; m%7   M N O P Q R S T      g o w 4 C K S +
147; m%8   U V W X Y Z + =      h p x 5 D L T =
148%if ARCH_X86_64
149%macro TRANSPOSE8X8W 9
150    ; m%1   a b c d e f g h      a i q y b j r z
151    ; m%2   i j k l m n o p      c k s 0 d l t 1
152    ; m%3   q r s t u v w x  ->  e m u 2 f n v 3
153    ; m%4   y z 0 1 2 3 4 5      g o w 4 h p x 5
154    TRANSPOSE4X4W     %1, %2, %3, %4, %9
155
156    ; m%5   6 7 8 9 A B C D      6 E M U 7 F N V
157    ; m%6   E F G H I J K L      8 G O W 9 H P X
158    ; m%7   M N O P Q R S T  ->  A I Q Y B J R Z
159    ; m%8   U V W X Y Z + =      C K S + D L T =
160    TRANSPOSE4X4W     %5, %6, %7, %8, %9
161
162    ; m%1   a i q y b j r z      a i q y 6 E M U
163    ; m%2   c k s 0 d l t 1      b j r z 7 F N V
164    ; m%3   e m u 2 f n v 3      c k s 0 8 G O W
165    ; m%4   g o w 4 h p x 5      d l t 1 9 H P X
166    ; m%5   6 E M U 7 F N V  ->  e m u 2 A I Q Y
167    ; m%6   8 G O W 9 H P X      f n v 3 B J R Z
168    ; m%7   A I Q Y B J R Z      g o w 4 C K S +
169    ; m%8   C K S + D L T =      h p x 5 D L T =
170    punpckhqdq       m%9, m%1, m%5
171    punpcklqdq       m%1, m%5
172    punpckhqdq       m%5, m%2, m%6
173    punpcklqdq       m%2, m%6
174    punpckhqdq       m%6, m%3, m%7
175    punpcklqdq       m%3, m%7
176    punpckhqdq       m%7, m%4, m%8
177    punpcklqdq       m%4, m%8
178
179    SWAP %8, %7, %4, %5, %3, %2, %9
180%endmacro
181%else ; x86-32
182; input: 1-7 in registers, 8 in first memory [read-only]
183; second memory is scratch, and may overlap with first or third memory
184; output: 1-5,7-8 in registers, 6 in third memory [write-only]
%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u in/out alignment [2x]
186    TRANSPOSE4X4W     %1, %2, %3, %4, %8
187%ifnidn %9, ""
188    mov%12           m%8, %9
189%else
190    mova             m%8, %10
191%endif
192    mova             %10, m%4
193    TRANSPOSE4X4W     %5, %6, %7, %8, %4
194    punpckhqdq       m%4, m%1, m%5
195    punpcklqdq       m%1, m%5
196    punpckhqdq       m%5, m%2, m%6
197    punpcklqdq       m%2, m%6
198    punpckhqdq       m%6, m%3, m%7
199    punpcklqdq       m%3, m%7
200    mova             m%7, %10
201%ifnidn %11, ""
202    mov%13           %11, m%6
203%else
204    mova             %10, m%6
205%endif
206    punpckhqdq       m%6, m%7, m%8
207    punpcklqdq       m%7, m%8
208
209    ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
210    SWAP              %2, %4, %5, %3
211    SWAP              %6, %8
212%endmacro
213%endif ; x86-32/64
214
; transpose and write out the 4 given rows (p1/p0/q0/q1; m8-11 on x86-64),
; everything else is scratch
216%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
217    ; transpose 8x4
218    punpcklwd     %5, %1, %2
219    punpckhwd     %1, %2
220    punpcklwd     %2, %3, %4
221    punpckhwd     %3, %4
222    punpckldq     %4, %5, %2
223    punpckhdq     %5, %2
224    punpckldq     %2, %1, %3
225    punpckhdq     %1, %3
226
227    ; write out
228    movq   [dstq+strideq*0-4], %4
229    movhps [dstq+strideq*1-4], %4
230    movq   [dstq+strideq*2-4], %5
231    movhps [dstq+stride3q -4], %5
232    lea         dstq, [dstq+strideq*4]
233    movq   [dstq+strideq*0-4], %2
234    movhps [dstq+strideq*1-4], %2
235    movq   [dstq+strideq*2-4], %1
236    movhps [dstq+stride3q -4], %1
237    lea         dstq, [dstq+strideq*4]
238%endmacro
239
240%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
241    ; load data
242%ifidn %2, v
243%if %1 == 4
244%if ARCH_X86_64
245%define P1 m8
246%define P0 m9
247%define Q0 m10
248%define Q1 m11
249    mova          P1, [dstq+mstrideq*2]         ; p1
250    mova          P0, [dstq+mstrideq*1]         ; p0
251    mova          Q0, [dstq+strideq*0]          ; q0
252    mova          Q1, [dstq+strideq*1]          ; q1
253%else ; x86-32
254%define P1 [dstq+mstrideq*2]
255%define P0 [dstq+mstrideq*1]
256%define Q0 [dstq+strideq*0]
257%define Q1 [dstq+strideq*1]
258%endif ; x86-32/64
259%else ; %1 != 4
260    ; load 6-8 pixels, remainder (for wd=16) will be read inline
261    lea         tmpq, [dstq+mstrideq*4]
262%if ARCH_X86_64
263    ; we load p3 later
264%define P2 m13
265%define P1 m8
266%define P0 m9
267%define Q0 m10
268%define Q1 m11
269%define Q2 m14
270    mova          P2, [tmpq+strideq*1]
271    mova          P1, [tmpq+strideq*2]
272    mova          P0, [tmpq+stride3q]
273    mova          Q0, [dstq+strideq*0]
274    mova          Q1, [dstq+strideq*1]
275    mova          Q2, [dstq+strideq*2]
276%if %1 != 6
277%define P3 [tmpq+strideq*0]
278%define Q3 m15
279    mova          Q3, [dstq+stride3q]
280%endif ; %1 != 6
281%else ; x86-32
282%define P2 [tmpq+strideq*1]
283%define P1 [dstq+mstrideq*2]
284%define P0 [dstq+mstrideq*1]
285%define Q0 [dstq+strideq*0]
286%define Q1 [dstq+strideq*1]
287%define Q2 [dstq+strideq*2]
288%if %1 != 6
289%define P3 [dstq+mstrideq*4]
290%define Q3 [dstq+stride3q]
291%endif ; %1 != 6
292%endif ; x86-32/64
293%endif ; %1 ==/!= 4
294%else ; %2 != v
295    ; load lines
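    ; For the horizontal (%2 == h) variant the pixels to filter lie along each
    ; row, so the rows are loaded starting a few pixels to the left of dstq,
    ; transposed so that each tap (p3..q3) ends up in its own register covering
    ; the 8 rows being filtered, run through the same filter code as the
    ; vertical variant, and transposed back before the final stores.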
296%if %1 == 4
297    movq          m0, [dstq+strideq*0-4]
298    movq          m2, [dstq+strideq*1-4]
299    movq          m4, [dstq+strideq*2-4]
300    movq          m5, [dstq+stride3q -4]
301    lea         tmpq, [dstq+strideq*4]
302    movq          m3, [tmpq+strideq*0-4]
303    movq          m6, [tmpq+strideq*1-4]
304    movq          m1, [tmpq+strideq*2-4]
305    movq          m7, [tmpq+stride3q -4]
306
307    ; transpose 4x8
308    ; m0: A-D0
309    ; m2: A-D1
310    ; m4: A-D2
311    ; m5: A-D3
312    ; m3: A-D4
313    ; m6: A-D5
314    ; m1: A-D6
315    ; m7: A-D7
316    punpcklwd     m0, m2
317    punpcklwd     m4, m5
318    punpcklwd     m3, m6
319    punpcklwd     m1, m7
320    ; m0: A0-1,B0-1,C0-1,D0-1
321    ; m4: A2-3,B2-3,C2-3,D2-3
322    ; m3: A4-5,B4-5,C4-5,D4-5
323    ; m1: A6-7,B6-7,C6-7,D6-7
324    punpckhdq     m2, m0, m4
325    punpckldq     m0, m4
326    punpckhdq     m4, m3, m1
327    punpckldq     m3, m1
328    ; m0: A0-3,B0-3
329    ; m2: C0-3,D0-3
330    ; m3: A4-7,B4-7
331    ; m4: C4-7,D4-7
332    punpckhqdq    m1, m0, m3
333    punpcklqdq    m0, m3
334    punpckhqdq    m3, m2, m4
335    punpcklqdq    m2, m4
336    ; m0: A0-7
337    ; m1: B0-7
338    ; m2: C0-7
339    ; m3: D0-7
340%if ARCH_X86_64
341    SWAP           0, 8
342    SWAP           1, 9
343    SWAP           2, 10
344    SWAP           3, 11
345%define P1 m8
346%define P0 m9
347%define Q0 m10
348%define Q1 m11
349%else
350%define P1 [esp+3*mmsize]
351%define P0 [esp+4*mmsize]
352%define Q0 [esp+5*mmsize]
353%define Q1 [esp+6*mmsize]
354    mova          P1, m0
355    mova          P0, m1
356    mova          Q0, m2
357    mova          Q1, m3
358%endif
359%elif %1 == 6 || %1 == 8
360    movu          m0, [dstq+strideq*0-8]
361    movu          m1, [dstq+strideq*1-8]
362    movu          m2, [dstq+strideq*2-8]
363    movu          m3, [dstq+stride3q -8]
364    lea         tmpq, [dstq+strideq*4]
365    movu          m4, [tmpq+strideq*0-8]
366    movu          m5, [tmpq+strideq*1-8]
367    movu          m6, [tmpq+strideq*2-8]
368%if ARCH_X86_64
369    movu          m7, [tmpq+stride3q -8]
370%endif
371
372    ; transpose 8x16
373    ; m0: A-H0,A-H8
374    ; m1: A-H1,A-H9
375    ; m2: A-H2,A-H10
376    ; m3: A-H3,A-H11
377    ; m4: A-H4,A-H12
378    ; m5: A-H5,A-H13
379    ; m6: A-H6,A-H14
380    ; m7: A-H7,A-H15
381%if ARCH_X86_64
382    punpcklwd     m8, m0, m1
383%else
384    punpcklwd     m7, m0, m1
385%endif
386    punpckhwd     m0, m1
387    punpcklwd     m1, m2, m3
388    punpckhwd     m2, m3
389    punpcklwd     m3, m4, m5
390    punpckhwd     m4, m5
391%if ARCH_X86_64
392    punpcklwd     m5, m6, m7
393    punpckhwd     m6, m7
394%else
395    mova  [rsp+3*16], m4
396    movu          m4, [tmpq+stride3q -8]
397    punpcklwd     m5, m6, m4
398    punpckhwd     m6, m4
399%endif
400    ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32]
401    ; m0: E0-1,F0-1,G0-1,H0-1
402    ; m1: A2-3,B2-3,C2-3,D2-3
403    ; m2: E2-3,F2-3,G2-3,H2-3
404    ; m3: A4-5,B4-5,C4-5,D4-5
405    ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32]
406    ; m5: A6-7,B6-7,C6-7,D6-7
407    ; m6: E6-7,F6-7,G6-7,H6-7
408%if ARCH_X86_64
409    punpckldq     m7, m8, m1
410    punpckhdq     m8, m1
411%else
412    punpckldq     m4, m7, m1
413    punpckhdq     m7, m1
414%endif
415    punpckldq     m1, m0, m2
416    punpckhdq     m0, m2
417    punpckldq     m2, m3, m5
418    punpckhdq     m3, m5
419%if ARCH_X86_64
420    punpckldq     m5, m4, m6
421    punpckhdq     m4, m6
422%else
423    mova  [rsp+4*16], m3
424    mova          m3, [rsp+3*16]
425    punpckldq     m5, m3, m6
426    punpckhdq     m3, m6
427%endif
428    ; m7: A0-3,B0-3 [m4 on x86-32]
429    ; m8: C0-3,D0-3 [m7 on x86-32]
430    ; m1: E0-3,F0-3
431    ; m0: G0-3,H0-3
432    ; m2: A4-7,B4-7
433    ; m3: C4-7,D4-7 [r4 on x86-32]
434    ; m5: E4-7,F4-7
435    ; m4: G4-7,H4-7 [m3 on x86-32]
436%if ARCH_X86_64
437%if %1 != 6
438    punpcklqdq    m6, m7, m2
439%endif
440    punpckhqdq    m7, m2
441    punpcklqdq    m2, m8, m3
442    punpckhqdq    m8, m3
443    punpcklqdq    m3, m1, m5
444    punpckhqdq    m1, m5
445%if %1 != 6
446    punpckhqdq    m5, m0, m4
447%endif
448    punpcklqdq    m0, m4
449%if %1 == 8
450    mova  [rsp+1*16], m6
451%define P3 [rsp+1*16]
452%endif
453    ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15
454    SWAP           7, 13
455    SWAP           8, 2, 9
456    SWAP           3, 10
457    SWAP           1, 11
458    SWAP           0, 14
459    SWAP           5, 15
460%define P2 m13
461%define P1 m8
462%define P0 m9
463%define Q0 m10
464%define Q1 m11
465%define Q2 m14
466%if %1 == 8
467%define Q3 m15
468%endif
469%else ; x86-32
470%if %1 == 8
471%define P3 [rsp+ 6*16]
472    punpcklqdq    m6, m4, m2
473    mova          P3, m6
474%endif
475    mova          m6, [rsp+4*16]
476    punpckhqdq    m4, m2
477    punpcklqdq    m2, m7, m6
478    punpckhqdq    m7, m6
479    punpcklqdq    m6, m1, m5
480    punpckhqdq    m1, m5
481%if %1 == 8
482%define Q3 [rsp+24*16]
483    punpckhqdq    m5, m0, m3
484    mova          Q3, m5
485%endif
486    punpcklqdq    m0, m3
487%if %1 == 8
488%define P2 [rsp+18*16]
489%define P1 [rsp+19*16]
490%define P0 [rsp+20*16]
491%define Q0 [rsp+21*16]
492%define Q1 [rsp+22*16]
493%define Q2 [rsp+23*16]
494%else
495%define P2 [rsp+3*16]
496%define P1 [rsp+4*16]
497%define P0 [rsp+5*16]
498%define Q0 [rsp+6*16]
499%define Q1 [rsp+7*16]
500%define Q2 [rsp+8*16]
501%endif
502    mova          P2, m4
503    mova          P1, m2
504    mova          P0, m7
505    mova          Q0, m6
506    mova          Q1, m1
507    mova          Q2, m0
508%endif ; x86-32/64
509%else ; %1 == 16
510    ; We only use 14 pixels but we'll need the remainder at the end for
511    ; the second transpose
512    mova          m0, [dstq+strideq*0-16]
513    mova          m1, [dstq+strideq*1-16]
514    mova          m2, [dstq+strideq*2-16]
515    mova          m3, [dstq+stride3q -16]
516    lea         tmpq, [dstq+strideq*4]
517    mova          m4, [tmpq+strideq*0-16]
518    mova          m5, [tmpq+strideq*1-16]
519    mova          m6, [tmpq+strideq*2-16]
520%if ARCH_X86_64
521    mova          m7, [tmpq+stride3q -16]
522
523    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
524    SWAP           5, 13
525    SWAP           6, 8
526    SWAP           7, 9
527%define P2 m13
528%define P1 m8
529%define P0 m9
530%else ; x86-32
531%define P2 [esp+18*16]
532%define P1 [esp+19*16]
533%define P0 [esp+20*16]
534    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
535                     [tmpq+stride3q -16], P2, "", a, a
536    mova          P1, m6
537    mova          P0, m7
538%endif ; x86-32/64
539    mova [rsp+ 7*16], m0
540    mova [rsp+ 8*16], m1
541    mova [rsp+ 9*16], m2
542    mova [rsp+10*16], m3
543%define P3 [rsp+6*16]
544    mova          P3, m4
545
546    mova          m0, [dstq+strideq*0]
547    mova          m1, [dstq+strideq*1]
548    mova          m2, [dstq+strideq*2]
549    mova          m3, [dstq+stride3q ]
550    lea         tmpq, [dstq+strideq*4]
551    mova          m4, [tmpq+strideq*0]
552    mova          m5, [tmpq+strideq*1]
553    mova          m6, [tmpq+strideq*2]
554%if ARCH_X86_64
555    mova          m7, [tmpq+stride3q ]
556
557    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10
558    SWAP          0, 10
559    SWAP          1, 11
560    SWAP          2, 14
561    SWAP          3, 15
562%define Q0 m10
563%define Q1 m11
564%define Q2 m14
565%define Q3 m15
566%else ; x86-32
567    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
568                     [tmpq+stride3q ], [rsp+12*16], "", a, a
569%define Q0 [esp+21*16]
570%define Q1 [esp+22*16]
571%define Q2 [esp+23*16]
572%define Q3 [esp+24*16]
573    mova         Q0, m0
574    mova         Q1, m1
575    mova         Q2, m2
576    mova         Q3, m3
577%endif ; x86-32/64
578
579    mova [rsp+11*16], m4
580%if ARCH_X86_64
581    mova [rsp+12*16], m5
582%endif
583    mova [rsp+13*16], m6
584    mova [rsp+14*16], m7
585%endif ; %1 == 4/6/8/16
586%endif ; %2 ==/!= v
587
588    ; load L/E/I/H
589%if ARCH_X86_32
590%define l_strideq r5
591    mov    l_strideq, dword lstridem
592%ifidn %2, v
593%define lq r3
594    mov           lq, dword lm
595%endif
596%endif
597%ifidn %2, v
598%if cpuflag(sse4)
599    pmovzxbw      m1, [lq]
600    pmovzxbw      m0, [lq+l_strideq]
601    pxor          m2, m2
602%else ; ssse3
603    movq          m1, [lq]
604    movq          m0, [lq+l_strideq]
605    pxor          m2, m2
606    REPX {punpcklbw x, m2}, m1, m0
607%endif ; ssse3/sse4
608%else ; %2 != v
609    movq          m0, [lq]                      ; l0, l1
610    movq          m1, [lq+l_strideq]            ; l2, l3
611    punpckldq     m0, m1                        ; l0, l2, l1, l3
612    pxor          m2, m2
613    punpcklbw     m1, m0, m2                    ; l0, l2
614    punpckhbw     m0, m2                        ; l1, l3
615%endif ; %2==/!=v
616%if ARCH_X86_32
617%ifidn %2, v
618%undef lq
619    mov     mstrideq, mstridem
620%endif
621%endif
622    pcmpeqw       m5, m2, m0
623    pand          m1, m5
624    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
625    pshufb        m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1]
626    pcmpeqw       m5, m2, m0                    ; !L
627    psrlw         m5, 1
628%if ARCH_X86_64
629    psrlw         m2, m0, [lutq+128]
630    SPLATW        m1, [lutq+136]
631%else ; x86-32
632    mov           r5, lutm
633    psrlw         m2, m0, [r5+128]
634    SPLATW        m1, [r5+136]
635%endif ; x86-32/64
636    pminsw        m2, m1
637    pmaxsw        m2, [PIC_sym(pw_1)]           ; I
638    psrlw         m1, m0, 4                     ; H
639    paddw         m0, [PIC_sym(pw_2)]
640    paddw         m0, m0
641    paddw         m0, m2                        ; E
642    REPX {pmullw x, [bdmulq]}, m0, m1, m2
643%if ARCH_X86_32
644%undef l_strideq
645    lea    stride3q, [strideq*3]
646%endif
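    ; Roughly, in C terms, the block above derives the per-column limits
    ; (illustrative sketch, not the dav1d C reference):
    ;   L = l[x] ? l[x] : l[x-stride];          // filter level, per 4-px column
    ;   I = iclip(L >> sharp[0], 1, sharp[1]);
    ;   H = L >> 4;
    ;   E = 2 * L + 4 + I;
    ;   E *= bdmul;  I *= bdmul;  H *= bdmul;   // bdmul = 4 (10bpc) or 16 (12bpc)
    ; Columns with L == 0 additionally get m5 = 0x7fff, which forces the fm
    ; test below to fail so they are left untouched.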
647
648    psubw         m3, P1, P0                    ; p1-p0
649    psubw         m4, Q0, Q1                    ; q0-q1
650    REPX {pabsw x, x}, m3, m4
651    pmaxsw        m3, m5
652    pmaxsw        m3, m4
653    pcmpgtw       m7, m3, m1                    ; hev
654%if %1 != 4
655    psubw         m4, P2, P0                    ; p2-p0
656    pabsw         m4, m4
657    pmaxsw        m4, m3
658%if %1 != 6
659    mova          m6, P3                        ; p3
660    psubw         m5, m6, P0                    ; p3-p0
661    pabsw         m5, m5
662    pmaxsw        m4, m5
663%endif ; %1 != 6
664    psubw         m5, Q0, Q2                    ; q0-q2
665    pabsw         m5, m5
666    pmaxsw        m4, m5
667%if %1 != 6
668    psubw         m5, Q0, Q3                    ; q0-q3
669    pabsw         m5, m5
670    pmaxsw        m4, m5
671%endif ; %1 != 6
672    pcmpgtw       m4, [bdmulq]                     ; !flat8in
673
674    psubw         m5, P2, P1                    ; p2-p1
675    pabsw         m5, m5
676%if %1 != 6
677    psubw         m6, P2                        ; p3-p2
678    pabsw         m6, m6
679    pmaxsw        m5, m6
680    psubw         m6, Q2, Q3                    ; q2-q3
681    pabsw         m6, m6
682    pmaxsw        m5, m6
683%endif ; %1 != 6
684    psubw         m6, Q2, Q1                    ; q2-q1
685    pabsw         m6, m6
686    pmaxsw        m5, m6
687
688%if %1 == 16
689    SPLATD        m6, [maskq+8]
690    SPLATD        m1, [maskq+4]
691    por           m6, m1
692    pand          m6, m12
693    pcmpeqd       m6, m12
694    pand          m5, m6
695%else ; %1 != 16
696    SPLATD        m6, [maskq+4]
697    pand          m6, m12
698    pcmpeqd       m6, m12
699    pand          m5, m6                        ; only apply fm-wide to wd>4 blocks
700%endif ; %1==/!=16
701    pmaxsw        m3, m5
702%endif ; %1 != 4
703    pcmpgtw       m3, m2
704
705    psubw         m5, P1, Q1                    ; p1-q1
706    psubw         m6, P0, Q0                    ; p0-q0
707    REPX {pabsw x, x}, m5, m6
708    paddw         m6, m6
709    psrlw         m5, 1
710    paddw         m5, m6                        ; abs(p0-q0)*2+(abs(p1-q1)>>1)
711    pcmpgtw       m5, m0                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
712    por           m3, m5
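    ; At this point m7 = hev and m3 = !fm (still inverted; combined with the
    ; vmask bits below), and for wd > 4 also m4 = !flat8in.  Roughly, in C
    ; terms (illustrative sketch, with F = 1 << (bitdepth - 8)):
    ;   hev      = MAX(|p1-p0|, |q1-q0|) > H
    ;   !fm      = MAX(|p1-p0|, |q1-q0|
    ;                  [, |p2-p1|, |q2-q1| and, for wd >= 8, |p3-p2|, |q3-q2|]) > I
    ;              || |p0-q0|*2 + (|p1-q1|>>1) > E
    ;   !flat8in = MAX(|p2-p0|, |q2-q0|, |p1-p0|, |q1-q0|
    ;                  [, |p3-p0|, |q3-q0| for wd >= 8]) > F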
713
714%if %1 == 16
715
716%ifidn %2, v
717    lea         tmpq, [dstq+mstrideq*8]
718    mova          m0, [tmpq+strideq*1]
719    mova          m1, [tmpq+strideq*2]
720    mova          m2, [tmpq+stride3q]
721%else ; %2 != v
722    mova          m0, [rsp+ 8*16]
723    mova          m1, [rsp+ 9*16]
724    mova          m2, [rsp+10*16]
725%endif ; %2==/!=v
726    REPX {psubw x, P0}, m0, m1, m2
727    REPX {pabsw x, x}, m0, m1, m2
728    pmaxsw        m1, m0
729    pmaxsw        m1, m2
730%ifidn %2, v
731    lea         tmpq, [dstq+strideq*4]
732    mova          m0, [tmpq+strideq*0]
733    mova          m2, [tmpq+strideq*1]
734    mova          m5, [tmpq+strideq*2]
735%else ; %2 != v
736    mova          m0, [rsp+11*16]
737    mova          m2, [rsp+12*16]
738    mova          m5, [rsp+13*16]
739%endif ; %2==/!=v
740    REPX {psubw x, Q0}, m0, m2, m5
741    REPX {pabsw x, x}, m0, m2, m5
742    pmaxsw        m0, m2
743    pmaxsw        m1, m5
744    pmaxsw        m1, m0
745    pcmpgtw       m1, [bdmulq]                  ; !flat8out
746    por           m1, m4                        ; !flat8in | !flat8out
747    SPLATD        m2, [maskq+8]
748    pand          m5, m2, m12
749    pcmpeqd       m5, m12
750    pandn         m1, m5                        ; flat16
751    pandn         m5, m3, m1                    ; flat16 & fm
752    SWAP           1, 5
753
754    SPLATD        m5, [maskq+4]
755    por           m5, m2
756    pand          m2, m5, m12
757    pcmpeqd       m2, m12
758    pandn         m4, m2                        ; flat8in
759    pandn         m2, m3, m4
760    SWAP           2, 4
761    SPLATD        m2, [maskq+0]
762    por           m2, m5
763    pand          m2, m12
764    pcmpeqd       m2, m12
765    pandn         m3, m2
766    pandn         m0, m4, m3                    ; fm & !flat8 & !flat16
767    SWAP           0, 3
768    pandn         m0, m1, m4                    ; flat8 & !flat16
769    SWAP           0, 4
770%elif %1 != 4
771    SPLATD        m0, [maskq+4]
772    pand          m2, m0, m12
773    pcmpeqd       m2, m12
774    pandn         m4, m2
775    pandn         m2, m3, m4                    ; flat8 & fm
776    SWAP           2, 4
777    SPLATD        m2, [maskq+0]
778    por           m0, m2
779    pand          m0, m12
780    pcmpeqd       m0, m12
781    pandn         m3, m0
782    pandn         m0, m4, m3                    ; fm & !flat8
783    SWAP           0, 3
784%else ; %1 == 4
785    SPLATD        m0, [maskq+0]
786    pand          m0, m12
787    pcmpeqd       m0, m12
788    pandn         m3, m0                        ; fm
789%endif ; %1==/!=4
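    ; The SPLATD/pcmpeqd/pandn chain above folds the per-column vmask bits into
    ; the masks: m12 (pb_mask) holds the dwords {1, 1, 2, 2}, shifted left by 2
    ; each outer-loop iteration, so (SPLATD(vmask[n]) & m12) == m12 is all-ones
    ; exactly for the 4-pixel columns whose bit is set.  After this block,
    ; m3 selects the columns that only get the short filter, m4 the flat8
    ; columns and, for wd == 16, m1 the flat16 columns, all already ANDed
    ; with fm.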
790
791    ; short filter
792%if ARCH_X86_64
793    SPLATW        m0, r7m
794%else
795    SPLATW        m0, bdmulm
796%endif
797    pcmpeqw       m2, m2
798    psrlw         m0, 1                         ; 511 or 2047
799    pxor          m2, m0                        ; -512 or -2048
800
801    psubw         m5, Q0, P0                    ; q0-p0
802    paddw         m6, m5, m5
803    paddw         m6, m5                        ; 3*(q0-p0)
804    psubw         m5, P1, Q1                    ; iclip_diff(p1-q1)
805    pminsw        m5, m0
806    pmaxsw        m5, m2
807    pand          m5, m7                        ; f=iclip_diff(p1-q1)&hev
808    paddw         m5, m6                        ; f=iclip_diff(3*(q0-p0)+f)
809    pminsw        m5, m0
810    pmaxsw        m5, m2
811    pand          m3, m5                        ; f&=fm
812    paddw         m5, m3, [PIC_sym(pw_3)]
813    paddw         m3, [PIC_sym(pw_4)]
814    REPX {pminsw x, m0}, m5, m3
815    psraw         m5, 3                         ; f2
816    psraw         m3, 3                         ; f1
817    psubw         m0, m2                        ; 1023 or 4095
818    pxor          m2, m2
819%if ARCH_X86_64
820    paddw         P0, m5
821    psubw         Q0, m3
822%else
823    paddw          m5, P0
824    psubw          m6, Q0, m3
825    REPX {pminsw x, m0}, m5, m6
826    REPX {pmaxsw x, m2}, m5, m6
827%endif
828
829    paddw         m3, [PIC_sym(pw_1)]
830    psraw         m3, 1                         ; f=(f1+1)>>1
831    pandn         m7, m3                        ; f&=!hev
832    SWAP           7, 3
833%if ARCH_X86_64
834    paddw         P1, m3
835    psubw         Q1, m3
836    REPX {pminsw x, m0}, P1, P0, Q0, Q1
837    REPX {pmaxsw x, m2}, P1, P0, Q0, Q1
838%else
839    psubw         m7, Q1, m3
840    paddw         m3, P1
841    REPX {pminsw x, m0}, m7, m3
842    REPX {pmaxsw x, m2}, m7, m3
843%if %1 > 4
844    mova          P1, m3
845    mova          P0, m5
846    mova          Q0, m6
847    mova          Q1, m7
848%endif
849%endif
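    ; Roughly, in C terms, the short (4-tap) filter above is (illustrative
    ; sketch; M = 511 for 10bpc, 2047 for 12bpc, hev/fm are 0/-1 masks):
    ;   dif = iclip(p1 - q1, -(M+1), M) & hev;
    ;   dif = iclip(3 * (q0 - p0) + dif, -(M+1), M) & fm;
    ;   f1  = imin(dif + 4, M) >> 3;
    ;   f2  = imin(dif + 3, M) >> 3;
    ;   p0 += f2;  q0 -= f1;
    ;   f3  = ((f1 + 1) >> 1) & ~hev;
    ;   p1 += f3;  q1 -= f3;
    ; with all four outputs clamped to [0, 2*M+1] afterwards.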
850
851%if %1 == 16
852
853; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16
854; m12=filter bits mask
855; m13-15=p2/q2/q3
856; m0,2-3,5-7 = free
857
858    ; flat16 filter
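    ; The flat16 filter below is computed as a rolling 16-tap sum: the
    ; accumulator is seeded with the taps for the first output (p5), and for
    ; each further output the two oldest taps are dropped and two new ones
    ; added, with one (sum + 8) >> 4 per output.  Roughly, in C terms
    ; (illustrative sketch):
    ;   sum = p6*7 + p5*2 + p4*2 + p3 + p2 + p1 + p0 + q0 + 8;
    ;   p5' = sum >> 4;
    ;   sum += p3 + q1 - p6*2;        p4' = sum >> 4;
    ;   sum += p2 + q2 - p6 - p5;     p3' = sum >> 4;
    ;   // ...and so on down to q5'
    ; Each output is only kept where the flat16 mask (m1) is set; otherwise the
    ; original pixel is reinserted via the pandn/por pairs.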
859%ifidn %2, v
860    lea         tmpq, [dstq+mstrideq*8]
861    mova          m0, [tmpq+strideq*1]          ; p6
862    mova          m2, [tmpq+strideq*2]          ; p5
863    mova          m7, [tmpq+stride3q]           ; p4
864    mova          m6, [tmpq+strideq*4]          ; p3
865    lea         tmpq, [dstq+mstrideq*4]
866%else ; %2 != v
867    mova          m0, [rsp+ 8*16]
868    mova          m2, [rsp+ 9*16]
869    mova          m7, [rsp+10*16]
870    mova          m6, [rsp+ 6*16]
871%endif ; %2==/!=v
872
873    mova [rsp+ 0*16], m4
874
875    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
876    psllw         m3, m0, 3                     ; p6*8
877    paddw         m3, [PIC_sym(pw_8)]
878    paddw         m5, m2, m7                    ; p5+p4
879    psubw         m3, m0
880    paddw         m5, m5                        ; (p5+p4)*2
881    paddw         m3, m6                        ; p6*7+p3
882    paddw         m5, P2                        ; (p5+p4)*2+p2
883    paddw         m3, P1                        ; p6*7+p3+p1
884    paddw         m5, P0                        ; (p5+p4)*2+p2+p0
885    paddw         m3, Q0                        ; p6*7+p3+p1+q0
886    paddw         m3, m5                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
887    psrlw         m5, m3, 4
888    pand          m5, m1
889    pandn         m4, m1, m2
890    por           m5, m4
891%ifidn %2, v
892    mova [tmpq+mstrideq*2], m5                   ; p5
893%else ; %2 != v
894    mova  [rsp+9*16], m5
895%endif ; %2==/!=v
896
897    ; sub p6*2, add p3/q1
898    paddw         m3, m6
899    paddw         m5, m0, m0
900    paddw         m3, Q1
901    psubw         m3, m5
902    psrlw         m5, m3, 4
903    pand          m5, m1
904    pandn         m4, m1, m7
905    por           m5, m4
906%ifidn %2, v
907    mova [tmpq+mstrideq*1], m5                   ; p4
908%else ; %2 != v
909    mova [rsp+10*16], m5
910%endif ; %2==/!=v
911
912    ; sub p6/p5, add p2/q2
913    psubw         m3, m0
914    paddw         m5, P2, Q2
915    psubw         m3, m2
916    paddw         m3, m5
917    psrlw         m5, m3, 4
918    pand          m5, m1
919    pandn         m4, m1, m6
920    por           m5, m4
921%ifidn %2, v
922    mova [tmpq+strideq*0], m5                  ; p3
923%else ; %2 != v
924    mova  [rsp+6*16], m5
925%endif ; %2==/!=v
926
927%define WRITE_IN_PLACE 0
928%ifidn %2, v
929%if ARCH_X86_64
930%define WRITE_IN_PLACE 1
931%endif
932%endif
933
934    ; sub p6/p4, add p1/q3
935    paddw         m3, P1
936    paddw         m5, m0, m7
937    paddw         m3, Q3
938    psubw         m3, m5
939    psrlw         m5, m3, 4
940    pand          m5, m1
941    pandn         m4, m1, P2
942    por           m5, m4
943%if WRITE_IN_PLACE
944    mova [tmpq+strideq*1], m5
945%else
946    mova  [rsp+1*16], m5                        ; don't clobber p2/m13
947%endif
948
949    ; sub p6/p3, add p0/q4
950    paddw         m3, P0
951    paddw         m5, m0, m6
952%ifidn %2, v
953    paddw         m3, [dstq+strideq*4]
954%else ; %2 != v
955    paddw         m3, [rsp+11*16]
956%endif ; %2==/!=v
957    psubw         m3, m5
958    psrlw         m5, m3, 4
959    pand          m5, m1
960    pandn         m4, m1, P1
961    por           m5, m4
962%if WRITE_IN_PLACE
963    mova [dstq+mstrideq*2], m5
964%else
965    mova  [rsp+2*16], m5                        ; don't clobber p1/m3
966%endif
967
968    ; sub p6/p2, add q0/q5
969    paddw         m3, Q0
970    paddw         m5, m0, P2
971%ifidn %2, v
972%if ARCH_X86_32
973    lea           r4, P2
974%endif
975    lea         tmpq, [dstq+strideq*4]
976    paddw         m3, [tmpq+strideq*1]
977%else ; %2 != v
978    paddw         m3, [rsp+12*16]
979%endif ; %2==/!=v
980    psubw         m3, m5
981    psrlw         m5, m3, 4
982    pand          m5, m1
983    pandn         m4, m1, P0
984    por           m5, m4
985%if WRITE_IN_PLACE
986    mova [dstq+mstrideq*1], m5
987%else
988    mova  [rsp+3*16], m5                        ; don't clobber p0/m4
989%endif
990
991    ; sub p6/p1, add q1/q6
992    paddw         m3, Q1
993    paddw         m5, m0, P1
994%ifidn %2, v
995    mova          m0, [tmpq+strideq*2]          ; q6
996%else ; %2 != v
997    mova          m0, [rsp+13*16]               ; q6
998%endif ; %2==/!=v
999    paddw         m3, m0
1000    psubw         m3, m5
1001    psrlw         m5, m3, 4
1002    pand          m5, m1
1003    pandn         m4, m1, Q0
1004    por           m5, m4
1005%if WRITE_IN_PLACE
1006    mova      [dstq], m5
1007%else
1008    mova  [rsp+4*16], m5                        ; don't clobber q0/m5
1009%endif
1010
1011    ; sub p5/p0, add q2/q6
1012    paddw         m3, Q2
1013    paddw         m5, m2, P0
1014    paddw         m3, m0
1015    psubw         m3, m5
1016    psrlw         m5, m3, 4
1017    pand          m5, m1
1018    pandn         m4, m1, Q1
1019    por           m2, m5, m4                    ; don't clobber q1/m6
1020
1021    ; sub p4/q0, add q3/q6
1022    paddw         m3, Q3
1023    paddw         m7, Q0
1024    paddw         m3, m0
1025    psubw         m3, m7
1026    psrlw         m7, m3, 4
1027    pand          m7, m1
1028    pandn         m4, m1, Q2
1029    por           m7, m4                        ; don't clobber q2/m14
1030
1031    ; sub p3/q1, add q4/q6
1032%ifidn %2, v
1033    paddw         m3, [tmpq+strideq*0]
1034%else ; %2 != v
1035    paddw         m3, [rsp+11*16]
1036%endif ; %2==/!=v
1037    paddw         m6, Q1
1038    paddw         m3, m0
1039    psubw         m3, m6
1040    psrlw         m6, m3, 4
1041    pand          m6, m1
1042    pandn         m4, m1, Q3
1043    por           m6, m4
1044%if WRITE_IN_PLACE
1045    mova [tmpq+mstrideq], m6                    ; q3
%else ; !WRITE_IN_PLACE
    mova  [rsp+5*16], m6
%endif ; WRITE_IN_PLACE
1049
1050    ; sub p2/q2, add q5/q6
1051%ifidn %2, v
1052    paddw         m3, [tmpq+strideq*1]
1053%if ARCH_X86_64
1054    paddw         m5, P2, Q2
1055%else
    ; tmpq is clobbered, so we use a backup pointer for P2 instead
1057    paddw         m5, [r4], Q2
1058    mov     pic_regq, pic_regm
1059%endif
1060%else ; %2 != v
1061    paddw         m3, [rsp+12*16]
1062    paddw         m5, P2, Q2
1063%endif ; %2==/!=v
1064    paddw         m3, m0
1065    psubw         m3, m5
1066    psrlw         m5, m3, 4
1067    pand          m5, m1
1068%ifidn %2, v
1069    pandn         m4, m1, [tmpq+strideq*0]
1070%else ; %2 != v
1071    pandn         m4, m1, [rsp+11*16]
1072%endif ; %2==/!=v
1073    por           m5, m4
1074%ifidn %2, v
1075    mova [tmpq+strideq*0], m5                   ; q4
1076%else ; %2 != v
1077    mova [rsp+11*16], m5
1078%endif ; %2==/!=v
1079
1080    ; sub p1/q3, add q6*2
1081    psubw         m3, P1
1082    paddw         m0, m0
1083    psubw         m3, Q3
1084    paddw         m3, m0
1085    psrlw         m5, m3, 4
1086    pand          m5, m1
1087%ifidn %2, v
1088    pandn         m4, m1, [tmpq+strideq*1]
1089%else ; %2 != v
1090    pandn         m4, m1, [rsp+12*16]
1091%endif ; %2==/!=v
1092    por           m5, m4
1093%ifidn %2, v
1094    mova [tmpq+strideq*1], m5                   ; q5
1095%else ; %2 != v
1096    mova [rsp+12*16], m5
1097%endif ; %2==/!=v
1098
1099    mova          m4, [rsp+0*16]
1100%ifidn %2, v
1101    lea         tmpq, [dstq+mstrideq*4]
1102%endif
1103%if ARCH_X86_64
1104    SWAP           2, 11
1105    SWAP           7, 14
1106    SWAP           6, 15
1107%else ; x86-32
1108    mova          Q1, m2
1109    mova          Q2, m7
1110%endif ; x86-32/64
1111%if WRITE_IN_PLACE
1112    mova          P2, [tmpq+strideq*1]
1113    mova          P1, [tmpq+strideq*2]
1114    mova          P0, [tmpq+stride3q]
1115    mova          Q0, [dstq]
1116%elif ARCH_X86_64
1117    mova          P2, [rsp+1*16]
1118    mova          P1, [rsp+2*16]
1119    mova          P0, [rsp+3*16]
1120    mova          Q0, [rsp+4*16]
1121%else ; !WRITE_IN_PLACE & x86-32
1122    mova          m0, [rsp+1*16]
1123    mova          m1, [rsp+2*16]
1124    mova          m2, [rsp+3*16]
1125    mova          m3, [rsp+4*16]
1126    mova          m7, [rsp+5*16]
1127    mova          P2, m0
1128    mova          P1, m1
1129    mova          P0, m2
1130    mova          Q0, m3
1131    mova          Q3, m7
1132%endif ; WRITE_IN_PLACE / x86-32/64
1133%undef WRITE_IN_PLACE
1134%endif ; %1 == 16
1135
1136%if %1 >= 8
1137
1138    ; flat8 filter
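    ; The flat8 outputs are 8-tap sums rounded with pmulhrsw by pw_4096:
    ; pmulhrsw(x, 4096) = (x*4096 + (1 << 14)) >> 15 = (x + 4) >> 3, and an
    ; 8-tap sum of 12-bit pixels still fits in int16 (8 * 4095 = 32760), so the
    ; same code serves 10bpc and 12bpc.  Roughly, in C terms (illustrative
    ; sketch of the first two outputs; the rest slide the window towards q3):
    ;   p2' = (p3*3 + p2*2 + p1   + p0 + q0      + 4) >> 3;
    ;   p1' = (p3*2 + p2   + p1*2 + p0 + q0 + q1 + 4) >> 3;
    ; The code computes out - orig, masks it with the flat8 mask (m4), and adds
    ; orig back, so non-flat columns keep their original pixels.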
1139    mova          m0, P3                        ; p3
1140    paddw         m1, m0, P2                    ; p3+p2
1141    paddw         m2, P1, P0                    ; p1+p0
1142    paddw         m3, m1, m1                    ; 2*(p3+p2)
1143    paddw         m2, m0                        ; p1+p0+p3
1144    paddw         m3, Q0                        ; 2*(p3+p2)+q0
1145    paddw         m2, m3                        ; 3*p3+2*p2+p1+p0+q0
1146    pmulhrsw      m7, m2, [PIC_sym(pw_4096)]
1147    psubw         m7, P2
1148    pand          m7, m4
1149
1150    paddw         m3, P1, Q1                    ; p1+q1
1151    psubw         m2, m1                        ; 2*p3+p2+p1+p0+q0
1152    paddw         m2, m3                        ; 2*p3+p2+2*p1+p0+q0+q1
1153    pmulhrsw      m3, m2, [PIC_sym(pw_4096)]
1154    psubw         m3, P1
1155    pand          m3, m4
1156
1157    paddw         m5, m0, P1                    ; p3+p1
1158    paddw         m6, P0, Q2                    ; p0+q2
1159    psubw         m2, m5                        ; p3+p2+p1+p0+q0+q1
1160    paddw         m2, m6                        ; p3+p2+p1+2*p0+q0+q1+q2
1161    pmulhrsw      m5, m2, [PIC_sym(pw_4096)]
1162    psubw         m5, P0
1163    pand          m5, m4
1164
1165    paddw         m6, m0, P0                    ; p3+p0
1166    paddw         m1, Q0, Q3                    ; q0+q3
1167    psubw         m2, m6                        ; p2+p1+p0+q0+q1+q2
1168    paddw         m2, m1                        ; p2+p1+p0+2*q0+q1+q2+q3
1169    pmulhrsw      m6, m2, [PIC_sym(pw_4096)]
1170    psubw         m6, Q0
1171    pand          m6, m4
1172
1173    paddw         m2, Q1                        ; p2+p1+p0+2*q0+2*q1+q2+q3
1174    paddw         m2, Q3                        ; p2+p1+p0+2*q0+2*q1+q2+2*q3
1175    paddw         m1, P2, Q0                    ; p2+q0
1176    psubw         m2, m1                        ; p1+p0+q0+2*q1+q2+2*q3
1177    pmulhrsw      m1, m2, [PIC_sym(pw_4096)]
1178    psubw         m1, Q1
1179    pand          m1, m4
1180
1181    psubw         m2, P1                        ; p0+q0+2*q1+q2+2*q3
1182    psubw         m2, Q1                        ; p0+q0+q1+q2+2*q3
1183    paddw         m0, Q3, Q2                    ; q3+q2
1184    paddw         m2, m0                        ; p0+q0+q1+2*q2+3*q3
1185    pmulhrsw      m2, [PIC_sym(pw_4096)]
1186    psubw         m2, Q2
1187    pand          m2, m4
1188
1189    paddw         m7, P2
1190    paddw         m3, P1
1191    paddw         m5, P0
1192    paddw         m6, Q0
1193    paddw         m1, Q1
1194    paddw         m2, Q2
1195
1196%ifidn %2, v
1197    mova [tmpq+strideq*1], m7                   ; p2
1198    mova [tmpq+strideq*2], m3                   ; p1
1199    mova [tmpq+stride3q ], m5                   ; p0
1200    mova [dstq+strideq*0], m6                   ; q0
1201    mova [dstq+strideq*1], m1                   ; q1
1202    mova [dstq+strideq*2], m2                   ; q2
1203%else ; %2 != v
1204    mova          m0, P3
1205
1206%if %1 == 8
1207    lea         tmpq, [dstq+strideq*4]
1208%if ARCH_X86_64
1209    SWAP           4, 15
1210    TRANSPOSE8X8W  0, 7, 3, 5, 6, 1, 2, 4, 8
1211%else
1212    TRANSPOSE8X8W  0, 7, 3, 5, 6, 1, 2, 4, "", \
1213                      Q3, [tmpq+strideq*1-8], a, u
1214%endif
1215
1216    ; write 8x8
1217    movu   [dstq+strideq*0-8], m0
1218    movu   [dstq+strideq*1-8], m7
1219    movu   [dstq+strideq*2-8], m3
1220    movu   [dstq+stride3q -8], m5
1221    movu   [tmpq+strideq*0-8], m6
1222%if ARCH_X86_64
1223    movu   [tmpq+strideq*1-8], m1
1224%endif
1225    movu   [tmpq+strideq*2-8], m2
1226    movu   [tmpq+stride3q -8], m4
1227    lea         dstq, [dstq+strideq*8]
1228%else ; %1 != 8
1229%if ARCH_X86_64
1230    SWAP           6, 8
1231    SWAP           1, 9
1232    SWAP           2, 10
1233%else
1234    mova  [rsp+1*16], m6
1235    mova  [rsp+2*16], m1
1236    mova  [rsp+3*16], m2
1237%endif
1238
1239    mova          m1, [rsp+ 7*16]
1240    mova          m2, [rsp+ 8*16]
1241    mova          m4, [rsp+ 9*16]
1242    mova          m6, [rsp+10*16]
1243    lea         tmpq, [dstq+strideq*4]
1244%if ARCH_X86_64
1245    TRANSPOSE8X8W  1, 2, 4, 6, 0, 7, 3, 5, 11
1246%else
1247    mova  [rsp+7*16],  m5
1248    TRANSPOSE8X8W  1, 2, 4, 6, 0, 7, 3, 5, "", \
1249                      [rsp+7*16], [tmpq+strideq*1-16], a, a
1250%endif
1251
1252    mova [dstq+strideq*0-16], m1
1253    mova [dstq+strideq*1-16], m2
1254    mova [dstq+strideq*2-16], m4
1255    mova [dstq+stride3q -16], m6
1256    mova [tmpq+strideq*0-16], m0
1257%if ARCH_X86_64
1258    mova [tmpq+strideq*1-16], m7
1259%endif
1260    mova [tmpq+strideq*2-16], m3
1261    mova [tmpq+stride3q -16], m5
1262
1263%if ARCH_X86_64
1264    SWAP           6, 8
1265    SWAP           1, 9
1266    SWAP           2, 10
1267    SWAP           4, 15
1268%else
1269    mova          m6, [rsp+1*16]
1270    mova          m1, [rsp+2*16]
1271    mova          m2, [rsp+3*16]
1272    mova          m4, Q3
1273%endif
1274    mova          m0, [rsp+11*16]
1275    mova          m3, [rsp+12*16]
1276    mova          m5, [rsp+13*16]
1277%if ARCH_X86_64
1278    mova          m7, [rsp+14*16]
1279    TRANSPOSE8X8W  6, 1, 2, 4, 0, 3, 5, 7, 8
1280%else
1281    TRANSPOSE8X8W  6, 1, 2, 4, 0, 3, 5, 7, "", \
1282                      [rsp+14*16], [tmpq+strideq*1], a, a
1283%endif
1284    mova [dstq+strideq*0], m6
1285    mova [dstq+strideq*1], m1
1286    mova [dstq+strideq*2], m2
1287    mova [dstq+stride3q ], m4
1288    mova [tmpq+strideq*0], m0
1289%if ARCH_X86_64
1290    mova [tmpq+strideq*1], m3
1291%endif
1292    mova [tmpq+strideq*2], m5
1293    mova [tmpq+stride3q ], m7
1294    lea         dstq, [dstq+strideq*8]
1295%endif ; %1==/!=8
1296%endif ; %2==/!=v
1297%elif %1 == 6
1298    ; flat6 filter
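    ; Same rolling-sum scheme as the flat8 filter, but over the narrower p2..q2
    ; support: each of the four outputs (p1..q1) is an 8-tap sum rounded with
    ; pmulhrsw by pw_4096, i.e. (sum + 4) >> 3, e.g. (illustrative)
    ;   p1' = (p2*3 + p1*2 + p0*2 + q0 + 4) >> 3;
    ; masked by m4 so only flat columns are changed.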
1299    paddw         m3, P1, P0                    ; p1+p0
1300    paddw         m3, P2                        ; p2+p1+p0
1301    paddw         m6, P2, Q0                    ; p2+q0
1302    paddw         m3, m3                        ; 2*(p2+p1+p0)
1303    paddw         m3, m6                        ; p2+2*(p2+p1+p0)+q0
1304    pmulhrsw      m2, m3, [PIC_sym(pw_4096)]
1305    psubw         m2, P1
1306    pand          m2, m4
1307
1308    paddw         m3, Q0                        ; p2+2*(p2+p1+p0+q0)
1309    paddw         m6, P2, P2                    ; 2*p2
1310    paddw         m3, Q1                        ; p2+2*(p2+p1+p0+q0)+q1
1311    psubw         m3, m6                        ; p2+2*(p1+p0+q0)+q1
1312    pmulhrsw      m5, m3, [PIC_sym(pw_4096)]
1313    psubw         m5, P0
1314    pand          m5, m4
1315
1316    paddw         m3, Q1                        ; p2+2*(p1+p0+q0+q1)
1317    paddw         m6, P2, P1                    ; p2+p1
1318    paddw         m3, Q2                        ; p2+2*(p1+p0+q0+q1)+q2
1319    psubw         m3, m6                        ; p1+2*(p0+q0+q1)+q2
1320    pmulhrsw      m6, m3, [PIC_sym(pw_4096)]
1321    psubw         m6, Q0
1322    pand          m6, m4
1323
1324    psubw         m3, P1                        ; 2*(p0+q0+q1)+q2
1325%if ARCH_X86_64
1326    paddw         Q2, Q2                        ; q2*2
1327%else
1328    mova          m0, Q2
1329    paddw         m0, m0
1330%endif
1331    psubw         m3, P0                        ; p0+2*(q0+q1)+q2
1332%if ARCH_X86_64
    paddw         m3, Q2                        ; p0+2*(q0+q1+q2)+q2
1334%else
1335    paddw         m3, m0
1336%endif
1337    pmulhrsw      m3, [PIC_sym(pw_4096)]
1338    psubw         m3, Q1
1339    pand          m3, m4
1340
1341    paddw         m2, P1
1342    paddw         m5, P0
1343    paddw         m6, Q0
1344    paddw         m3, Q1
1345
1346%ifidn %2, v
1347    mova [dstq+mstrideq*2], m2                   ; p1
1348    mova [dstq+mstrideq*1], m5                   ; p0
1349    mova [dstq+strideq*0], m6                   ; q0
1350    mova [dstq+strideq*1], m3                   ; q1
1351%else ; %2 != v
1352    TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
1353%endif ; %2==/!=v
1354%else ; %1 == 4
1355%if ARCH_X86_64
1356%ifidn %2, v
1357    mova [dstq+mstrideq*2], P1                   ; p1
1358    mova [dstq+mstrideq*1], P0                   ; p0
1359    mova [dstq+strideq*0], Q0                   ; q0
1360    mova [dstq+strideq*1], Q1                   ; q1
1361%else ; %2 != v
1362    TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
1363%endif ; %2==/!=v
1364%else ; x86-32
1365%ifidn %2, v
1366    mova [dstq+mstrideq*2], m3
1367    mova [dstq+mstrideq*1], m5
1368    mova [dstq+strideq*0], m6
1369    mova [dstq+strideq*1], m7
1370%else ; %2 != v
1371    TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
1372%endif ; %2==/!=v
1373%endif ; x86-32/64
1374%endif ; %1
1375%undef P3
1376%undef P2
1377%undef P1
1378%undef P0
1379%undef Q0
1380%undef Q1
1381%undef Q2
1382%undef Q3
1383%endmacro
1384
1385INIT_XMM ssse3
1386; stack layout:
1387; r0 - flat8 backup inside flat16 code
1388%if ARCH_X86_64
1389cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
1390                          dst, stride, mask, l, l_stride, lut, \
1391                          w, stride3, mstride, tmp, mask_bits, bdmul
1392    mov          r6d, r7m
1393    sar          r6d, 7
1394    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
1395    lea       bdmulq, [pw_4]
1396    add       bdmulq, r6
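    ; r7m holds bitdepth_max (0x3ff for 10bpc, 0xfff for 12bpc); >>7 gives 7 or
    ; 31, and &16 gives 0 or 16, so bdmulq ends up pointing at pw_4 or pw_16
    ; (which is why those two constants are adjacent in the rodata above).
    ; bdmul = 1 << (bitdepth - 8) scales the 8-bit-domain E/I/H limits up to
    ; the coded bitdepth.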
1397    mov           wd, wm
1398    shl    l_strideq, 2
1399    sub           lq, l_strideq
1400%else
1401; stack layout [32bit only]:
1402; r1-4 - p2-q0 post-filter16
1403; r5 - p3
1404; r6 - q3 post-filter16
; r7 - GPRs [mask_bitsm, mstridem, pic_regm]
1406; r8 - m12/pb_mask
1407; r9 - bdmulq
1408cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
1409                          dst, stride, mask, mstride, pic_reg, stride3, tmp
1410    RELOC_ARGS     v, 10*16
1411%if STACK_ALIGNMENT >= 16
1412    mov          r5d, r7m
1413%endif
1414    sar          r5d, 7
1415    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
1416    LEA     pic_regq, PIC_base
1417%define pic_regm dword [esp+7*16+2*gprsize]
1418    mov     pic_regm, pic_regq
1419    mova          m0, [PIC_sym(pw_4)+r5]
1420%define bdmulq esp+9*16
1421    mova    [bdmulq], m0
1422    shl dword lstridem, 2
1423    sub           r3, dword lstridem
1424    mov     dword lm, r3
1425%endif
1426    mov     mstrideq, strideq
1427    neg     mstrideq
1428    lea     stride3q, [strideq*3]
1429%if ARCH_X86_64
1430    mov   mask_bitsd, 0x3
1431    mova         m12, [pb_mask]
1432%else
1433%define mstridem dword [esp+7*16+1*gprsize]
1434    mov     mstridem, mstrideq
1435%define mask_bitsm dword [esp+7*16+0*gprsize]
1436    mov   mask_bitsm, 0x3
1437    mova          m0, [PIC_sym(pb_mask)]
1438%define m12 [esp+8*16]
1439    mova         m12, m0
1440%endif
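    ; Each .loop iteration below handles two 4-pixel columns (16 bytes of
    ; 16-bit pixels); mask_bits covers their two vmask bits, the widest filter
    ; wanted by either column is run, and the per-column selection inside
    ; FILTER (vmask & m12/pb_mask) disables it where not needed.  Roughly, in C
    ; terms (illustrative sketch):
    ;   for (x = 0; x < w; x += 2, mask_bits <<= 2, dst += 16 /* bytes */) {
    ;       if      (vmask[2] & mask_bits) FILTER(16, v);
    ;       else if (vmask[1] & mask_bits) FILTER(8, v);
    ;       else if (vmask[0] & mask_bits) FILTER(4, v);
    ;   }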
1441
1442.loop:
1443%if ARCH_X86_64
1444    test   [maskq+8], mask_bitsd              ; vmask[2]
1445%else
1446    mov          r6d, mask_bitsm
1447    test   [maskq+8], r6d
1448%endif
1449    jz .no_flat16
1450
1451    FILTER        16, v
1452    jmp .end
1453
1454.no_flat16:
1455%if ARCH_X86_64
1456    test   [maskq+4], mask_bitsd              ; vmask[1]
1457%else
1458    test   [maskq+4], r6d
1459%endif
1460    jz .no_flat
1461
1462    FILTER         8, v
1463    jmp .end
1464
1465.no_flat:
1466%if ARCH_X86_64
1467    test   [maskq+0], mask_bitsd              ; vmask[0]
1468%else
1469    test   [maskq+0], r6d
1470%endif
1471    jz .end
1472
1473    FILTER         4, v
1474
1475.end:
1476%if ARCH_X86_64
1477    pslld        m12, 2
1478    add           lq, 8
1479%else
1480    mova          m0, m12
1481    pslld         m0, 2
1482    mova         m12, m0
1483    add     dword lm, 8
1484%endif
1485    add         dstq, 16
1486%if ARCH_X86_64
1487    shl   mask_bitsd, 2
1488    sub           wd, 2
1489%else
1490    shl   mask_bitsm, 2
1491    sub     dword wm, 2
1492%endif
1493    jg .loop
1494%undef mask_bitsm
1495%undef bdmulq
1496    UNRELOC_ARGS
1497    RET
1498
1499INIT_XMM ssse3
1500; stack layout:
1501; r0 - flat8 backup inside flat16
1502; r1-4 - p2-q0 post-filter16 backup
1503; r5 - q3 post-filter16 backup
1504; r6 - p3
1505; r7-10 - p7-4
1506; r11-14 - q4-7
1507%if ARCH_X86_64
1508cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
1509                          dst, stride, mask, l, l_stride, lut, \
1510                          h, stride3, tmp, mask_bits, bdmul
1511    mov          r6d, r7m
1512    sar          r6d, 7
1513    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
1514    lea       bdmulq, [pw_4]
1515    add       bdmulq, r6
1516    mov           hd, hm
1517    shl    l_strideq, 2
1518%else
1519; stack layout [32bit only]:
1520; r15 - GPRs [mask_bitsm]
1521; r16 - m12/pb_mask
1522; r17 - bdmulq
1523; r18-24 - p2-q3
1524cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
1525                          dst, stride, mask, l, pic_reg, stride3, tmp
1526    RELOC_ARGS     h, 25*16
1527%if STACK_ALIGNMENT >= 16
1528    mov          r5d, r7m
1529%endif
1530    sar          r5d, 7
1531    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
1532    LEA     pic_regq, PIC_base
1533    mova          m0, [PIC_sym(pw_4)+r5]
1534%define bdmulq esp+17*16
1535    mova    [bdmulq], m0
1536    shl dword lstridem, 2
1537%endif
1538    sub           lq, 4
1539    lea     stride3q, [strideq*3]
1540%if ARCH_X86_64
1541    mov   mask_bitsd, 0x3
1542    mova         m12, [pb_mask]
1543%else
1544%define mask_bitsm dword [esp+15*16+0*gprsize]
1545    mov   mask_bitsm, 0x3
1546    mova          m0, [PIC_sym(pb_mask)]
1547%define m12 [esp+16*16]
1548    mova         m12, m0
1549%endif
1550
1551.loop:
1552%if ARCH_X86_64
1553    test   [maskq+8], mask_bitsd            ; vmask[2]
1554%else
1555    mov         r6d, mask_bitsm
1556    test   [maskq+8], r6d
1557%endif
1558    jz .no_flat16
1559
1560    FILTER        16, h
1561    jmp .end
1562
1563.no_flat16:
1564%if ARCH_X86_64
1565    test   [maskq+4], mask_bitsd            ; vmask[1]
1566%else
1567    test   [maskq+4], r6d
1568%endif
1569    jz .no_flat
1570
1571    FILTER         8, h
1572    jmp .end
1573
1574.no_flat:
1575%if ARCH_X86_64
1576    test   [maskq+0], mask_bitsd            ; vmask[0]
1577%else
1578    test   [maskq+0], r6d
1579%endif
1580    jz .no_filter
1581
1582    FILTER         4, h
1583    jmp .end
1584
1585.no_filter:
1586    lea         dstq, [dstq+strideq*8]
1587.end:
1588%if ARCH_X86_64
1589    pslld        m12, 2
1590    lea           lq, [lq+l_strideq*2]
1591    shl   mask_bitsd, 2
1592    sub           hd, 2
1593%else
1594    mova          m0, m12
1595    pslld         m0, 2
1596    mova         m12, m0
1597    add           lq, dword lstridem
1598    add           lq, dword lstridem
1599    shl   mask_bitsm, 2
1600    sub     dword hm, 2
1601%endif
1602    jg .loop
1603%undef mask_bitsm
1604%undef bdmulq
1605    UNRELOC_ARGS
1606    RET
1607
1608INIT_XMM ssse3
1609%if ARCH_X86_64
1610cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
1611                           dst, stride, mask, l, l_stride, lut, \
1612                           w, stride3, mstride, tmp, mask_bits, bdmul
1613    mov          r6d, r7m
1614    sar          r6d, 7
1615    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
1616    lea       bdmulq, [pw_4]
1617    add       bdmulq, r6
1618    mov           wd, wm
1619    shl    l_strideq, 2
1620    sub           lq, l_strideq
1621%else
1622; stack layout [32bit only]:
1623; r0 - GPRs [mask_bitsm, mstridem]
1624; r1 - m12/pb_mask
1625; r2 - bdmulq
1626cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
1627                           dst, stride, mask, mstride, pic_reg, stride3, tmp
1628    RELOC_ARGS     v, 3*16
1629%if STACK_ALIGNMENT >= 16
1630    mov          r5d, r7m
1631%endif
1632    sar          r5d, 7
1633    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
1634    LEA     pic_regq, PIC_base
1635    mova          m0, [PIC_sym(pw_4)+r5]
1636%define bdmulq esp+2*16
1637    mova    [bdmulq], m0
1638    shl dword lstridem, 2
1639    sub           r3, dword lstridem
1640    mov     dword lm, r3
1641%endif
1642    mov     mstrideq, strideq
1643    neg     mstrideq
1644    lea     stride3q, [strideq*3]
1645%if ARCH_X86_64
1646    mov   mask_bitsd, 0x3
1647    mova         m12, [pb_mask]
1648%else
1649%define mask_bitsm dword [esp+0*gprsize]
1650%define mstridem dword [esp+1*gprsize]
1651    mov   mask_bitsm, 0x3
1652    mov     mstridem, mstrideq
1653    mova          m0, [PIC_sym(pb_mask)]
1654%define m12 [esp+1*16]
1655    mova         m12, m0
1656%endif
1657
1658.loop:
1659%if ARCH_X86_64
1660    test   [maskq+4], mask_bitsd            ; vmask[1]
1661%else
1662    mov          r6d, mask_bitsm
1663    test   [maskq+4], r6d
1664%endif
1665    jz .no_flat
1666
1667    FILTER         6, v
1668    jmp .end
1669
1670.no_flat:
1671%if ARCH_X86_64
1672    test   [maskq+0], mask_bitsd            ; vmask[0]
1673%else
1674    test   [maskq+0], r6d
1675%endif
1676    jz .end
1677
1678    FILTER         4, v
1679
1680.end:
1681%if ARCH_X86_64
1682    pslld        m12, 2
1683    add           lq, 8
1684%else
1685    mova          m0, m12
1686    pslld         m0, 2
1687    mova         m12, m0
1688    add     dword lm, 8
1689%endif
1690    add         dstq, 16
1691%if ARCH_X86_64
1692    shl   mask_bitsd, 2
1693    sub           wd, 2
1694%else
1695    shl   mask_bitsm, 2
1696    sub     dword wm, 2
1697%endif
1698    jg .loop
1699%undef mask_bitsm
1700%undef bdmulq
1701    UNRELOC_ARGS
1702    RET
1703
1704INIT_XMM ssse3
1705%if ARCH_X86_64
1706cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
1707                           dst, stride, mask, l, l_stride, lut, \
1708                           h, stride3, tmp, mask_bits, bdmul
1709    mov          r6d, r7m
1710    sar          r6d, 7
1711    and          r6d, 16                      ; 0 for 10bpc, 16 for 12bpc
1712    lea       bdmulq, [pw_4]
1713    add       bdmulq, r6
1714    mov           hd, hm
1715    shl    l_strideq, 2
1716%else
1717; stack layout [32bit only]:
1718; r0 - GPRs [mask_bitsm]
1719; r1 - m12/pb_mask
1720; r2 - bdmulq
1721; r3-8 - p2-q2
1722cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
1723                           dst, stride, mask, l, pic_reg, stride3, tmp
1724    RELOC_ARGS     h, 9*16
1725%if STACK_ALIGNMENT >= 16
1726    mov          r5d, r7m
1727%endif
1728    sar          r5d, 7
1729    and          r5d, 16                      ; 0 for 10bpc, 16 for 12bpc
1730    LEA     pic_regq, PIC_base
1731    mova          m0, [PIC_sym(pw_4)+r5]
1732%define bdmulq esp+2*16
1733    mova    [bdmulq], m0
1734    shl dword lstridem, 2
1735%endif
1736    sub           lq, 4
1737    lea     stride3q, [strideq*3]
1738%if ARCH_X86_64
1739    mov   mask_bitsd, 0x3
1740    mova         m12, [pb_mask]
1741%else
1742%define mask_bitsm dword [esp+0*gprsize]
1743    mov   mask_bitsm, 0x3
1744    mova          m0, [PIC_sym(pb_mask)]
1745%define m12 [esp+1*16]
1746    mova         m12, m0
1747%endif
1748
1749.loop:
1750%if ARCH_X86_64
1751    test   [maskq+4], mask_bitsd            ; vmask[1]
1752%else
1753    mov          r6d, mask_bitsm
1754    test   [maskq+4], r6d
1755%endif
1756    jz .no_flat
1757
1758    FILTER         6, h
1759    jmp .end
1760
1761.no_flat:
1762%if ARCH_X86_64
1763    test   [maskq+0], mask_bitsd            ; vmask[0]
1764%else
1765    test   [maskq+0], r6d
1766%endif
1767    jz .no_filter
1768
1769    FILTER         4, h
1770    jmp .end
1771
1772.no_filter:
1773    lea         dstq, [dstq+strideq*8]
1774.end:
1775%if ARCH_X86_64
1776    pslld        m12, 2
1777    lea           lq, [lq+l_strideq*2]
1778    shl   mask_bitsd, 2
1779    sub           hd, 2
1780%else
1781    mova          m0, m12
1782    pslld         m0, 2
1783    mova         m12, m0
1784    add           lq, dword lstridem
1785    add           lq, dword lstridem
1786    shl   mask_bitsm, 2
1787    sub     dword hm, 2
1788%endif
1789    jg .loop
1790%undef mask_bitsm
1791%undef bdmulq
1792    UNRELOC_ARGS
1793    RET
1794